xref: /openbmc/linux/drivers/block/rbd.c (revision b155e86c)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG selects the checking variant of rbd_assert() */
#define RBD_DEBUG

/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) each interpret the term, but the default size is 512 bytes
 * everywhere.  These names are only slightly more readable than the
 * raw numbers they stand for.
 */
#define SECTOR_SHIFT		9
#define SECTOR_SIZE		(1ULL << SECTOR_SHIFT)

/* Short and long names of this driver */
#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

/* max minors per blkdev */
#define RBD_MINORS_PER_MAJOR	256
59602adf40SYehuda Sadeh 
/*
 * Snapshot names are limited to NAME_MAX less the length of the
 * "snap_" prefix (the prefix's terminating NUL is not counted).
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* allows max snapc to fit in 4KB */
#define RBD_MAX_SNAP_COUNT		510

#define RBD_SNAP_HEAD_NAME		"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX		(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX		64

#define RBD_OBJ_PREFIX_LEN_MAX		64
73589d30e0SAlex Elder 
/* Feature bits */

#define RBD_FEATURE_LAYERING	(1 << 0)
#define RBD_FEATURE_STRIPINGV2	(1 << 1)
#define RBD_FEATURES_ALL \
	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device is named "rbd#": the "rbd" part is RBD_DRV_NAME and
 * # is a unique integer identifier.  MAX_INT_FORMAT_WIDTH is used
 * in ensuring DEV_NAME_LEN is big enough to hold every possible
 * device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof(int)) / 2 + 1)
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh /*
95602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
96602adf40SYehuda Sadeh  */
97602adf40SYehuda Sadeh struct rbd_image_header {
98f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
99849b4260SAlex Elder 	char *object_prefix;
10034b13184SAlex Elder 	u64 features;
101602adf40SYehuda Sadeh 	__u8 obj_order;
102602adf40SYehuda Sadeh 	__u8 crypt_type;
103602adf40SYehuda Sadeh 	__u8 comp_type;
104602adf40SYehuda Sadeh 
105f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
106f84344f3SAlex Elder 	u64 image_size;
107f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
108602adf40SYehuda Sadeh 	char *snap_names;
109602adf40SYehuda Sadeh 	u64 *snap_sizes;
11059c2be1eSYehuda Sadeh 
11159c2be1eSYehuda Sadeh 	u64 obj_version;
11259c2be1eSYehuda Sadeh };
11359c2be1eSYehuda Sadeh 
1140d7dbfceSAlex Elder /*
1150d7dbfceSAlex Elder  * An rbd image specification.
1160d7dbfceSAlex Elder  *
1170d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
118c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
119c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
120c66c6e0cSAlex Elder  *
121c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
122c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
123c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
124c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
127c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
128c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
129c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
130c66c6e0cSAlex Elder  * is shared between the parent and child).
131c66c6e0cSAlex Elder  *
132c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
133c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
134c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
137c66c6e0cSAlex Elder  * could be a null pointer).
1380d7dbfceSAlex Elder  */
1390d7dbfceSAlex Elder struct rbd_spec {
1400d7dbfceSAlex Elder 	u64		pool_id;
1410d7dbfceSAlex Elder 	char		*pool_name;
1420d7dbfceSAlex Elder 
1430d7dbfceSAlex Elder 	char		*image_id;
1440d7dbfceSAlex Elder 	char		*image_name;
1450d7dbfceSAlex Elder 
1460d7dbfceSAlex Elder 	u64		snap_id;
1470d7dbfceSAlex Elder 	char		*snap_name;
1480d7dbfceSAlex Elder 
1490d7dbfceSAlex Elder 	struct kref	kref;
1500d7dbfceSAlex Elder };
1510d7dbfceSAlex Elder 
152602adf40SYehuda Sadeh /*
153f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
154602adf40SYehuda Sadeh  */
155602adf40SYehuda Sadeh struct rbd_client {
156602adf40SYehuda Sadeh 	struct ceph_client	*client;
157602adf40SYehuda Sadeh 	struct kref		kref;
158602adf40SYehuda Sadeh 	struct list_head	node;
159602adf40SYehuda Sadeh };
160602adf40SYehuda Sadeh 
struct rbd_img_request;
struct rbd_obj_request;

/*
 * Function types held in the "callback" member of image requests
 * and object requests respectively.
 */
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/*
 * Largest u32 value.  NOTE(review): presumably a sentinel for an
 * object request's "which" position -- confirm against its users.
 */
#define BAD_WHICH	U32_MAX
168bf0d5f50SAlex Elder 
/* Selects which kind of data, if any, an object request carries */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 0,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
172bf0d5f50SAlex Elder 
/* Flags kept in the "flags" member of struct rbd_obj_request */
enum obj_req_flags {
	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_DONE = 0,
	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_IMG_DATA,
};
177926f9b3fSAlex Elder 
178bf0d5f50SAlex Elder struct rbd_obj_request {
179bf0d5f50SAlex Elder 	const char		*object_name;
180bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
181bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
182926f9b3fSAlex Elder 	unsigned long		flags;
183bf0d5f50SAlex Elder 
184bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
1857da22d29SAlex Elder 	u64			img_offset;	/* image relative offset */
186bf0d5f50SAlex Elder 	struct list_head	links;		/* img_request->obj_requests */
187bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
188bf0d5f50SAlex Elder 
189bf0d5f50SAlex Elder 	enum obj_request_type	type;
190788e2df3SAlex Elder 	union {
191bf0d5f50SAlex Elder 		struct bio	*bio_list;
192788e2df3SAlex Elder 		struct {
193788e2df3SAlex Elder 			struct page	**pages;
194788e2df3SAlex Elder 			u32		page_count;
195788e2df3SAlex Elder 		};
196788e2df3SAlex Elder 	};
197bf0d5f50SAlex Elder 
198bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
199bf0d5f50SAlex Elder 
200bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
201bf0d5f50SAlex Elder 	u64			version;
2021b83bef2SSage Weil 	int			result;
203bf0d5f50SAlex Elder 
204bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
205788e2df3SAlex Elder 	struct completion	completion;
206bf0d5f50SAlex Elder 
207bf0d5f50SAlex Elder 	struct kref		kref;
208bf0d5f50SAlex Elder };
209bf0d5f50SAlex Elder 
/* Flags kept in the "flags" member of struct rbd_img_request */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE = 0,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
2150c425248SAlex Elder 
216bf0d5f50SAlex Elder struct rbd_img_request {
217bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
218bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
219bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2200c425248SAlex Elder 	unsigned long		flags;
221bf0d5f50SAlex Elder 	union {
222bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2239849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2249849e986SAlex Elder 	};
2259849e986SAlex Elder 	union {
2269849e986SAlex Elder 		struct request		*rq;		/* block request */
2279849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
228bf0d5f50SAlex Elder 	};
229bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
230bf0d5f50SAlex Elder 	u32			next_completion;
231bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
23255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
233a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
234bf0d5f50SAlex Elder 
235bf0d5f50SAlex Elder 	u32			obj_request_count;
236bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
237bf0d5f50SAlex Elder 
238bf0d5f50SAlex Elder 	struct kref		kref;
239bf0d5f50SAlex Elder };
240bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request.
 * The _from variant continues from the current value of oreq.  The
 * _safe variant walks the list in reverse, using n as the cursor
 * that list_for_each_entry_safe_reverse() requires.
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
247bf0d5f50SAlex Elder 
248dfc5606dSYehuda Sadeh struct rbd_snap {
249dfc5606dSYehuda Sadeh 	struct	device		dev;
250dfc5606dSYehuda Sadeh 	const char		*name;
2513591538fSJosh Durgin 	u64			size;
252dfc5606dSYehuda Sadeh 	struct list_head	node;
253dfc5606dSYehuda Sadeh 	u64			id;
25434b13184SAlex Elder 	u64			features;
255dfc5606dSYehuda Sadeh };
256dfc5606dSYehuda Sadeh 
257f84344f3SAlex Elder struct rbd_mapping {
25899c1f08fSAlex Elder 	u64                     size;
25934b13184SAlex Elder 	u64                     features;
260f84344f3SAlex Elder 	bool			read_only;
261f84344f3SAlex Elder };
262f84344f3SAlex Elder 
263602adf40SYehuda Sadeh /*
264602adf40SYehuda Sadeh  * a single device
265602adf40SYehuda Sadeh  */
266602adf40SYehuda Sadeh struct rbd_device {
267de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
268602adf40SYehuda Sadeh 
269602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
270602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
271602adf40SYehuda Sadeh 
272a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
273602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
274602adf40SYehuda Sadeh 
275602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
276602adf40SYehuda Sadeh 
277b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
278602adf40SYehuda Sadeh 
279602adf40SYehuda Sadeh 	struct rbd_image_header	header;
280b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
2810d7dbfceSAlex Elder 	struct rbd_spec		*spec;
282602adf40SYehuda Sadeh 
2830d7dbfceSAlex Elder 	char			*header_name;
284971f839aSAlex Elder 
2850903e875SAlex Elder 	struct ceph_file_layout	layout;
2860903e875SAlex Elder 
28759c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
288975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
28959c2be1eSYehuda Sadeh 
29086b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
29186b00e0dSAlex Elder 	u64			parent_overlap;
2922f82ee54SAlex Elder 	struct rbd_device	*parent;
29386b00e0dSAlex Elder 
294c666601aSJosh Durgin 	/* protects updating the header */
295c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
296f84344f3SAlex Elder 
297f84344f3SAlex Elder 	struct rbd_mapping	mapping;
298602adf40SYehuda Sadeh 
299602adf40SYehuda Sadeh 	struct list_head	node;
300dfc5606dSYehuda Sadeh 
301dfc5606dSYehuda Sadeh 	/* list of snapshots */
302dfc5606dSYehuda Sadeh 	struct list_head	snaps;
303dfc5606dSYehuda Sadeh 
304dfc5606dSYehuda Sadeh 	/* sysfs related */
305dfc5606dSYehuda Sadeh 	struct device		dev;
306b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
307dfc5606dSYehuda Sadeh };
308dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomicity is required,
 * rbd_dev->lock protects the access.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, needs atomic access.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS = 0,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3206d292906SAlex Elder 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* clients */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
328602adf40SYehuda Sadeh 
329304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
330304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
331304f6808SAlex Elder 
332dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
33341f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
334dfc5606dSYehuda Sadeh 
335f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
336f0f8cef5SAlex Elder 		       size_t count);
337f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
338f0f8cef5SAlex Elder 			  size_t count);
3392f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev);
340f0f8cef5SAlex Elder 
341f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
342f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
343f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
344f0f8cef5SAlex Elder 	__ATTR_NULL
345f0f8cef5SAlex Elder };
346f0f8cef5SAlex Elder 
347f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
348f0f8cef5SAlex Elder 	.name		= "rbd",
349f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
350f0f8cef5SAlex Elder };
351f0f8cef5SAlex Elder 
/*
 * Release callback installed on rbd_root_dev.  It intentionally
 * does nothing; rbd_root_dev is a statically defined object.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to do */
}
355f0f8cef5SAlex Elder 
356f0f8cef5SAlex Elder static struct device rbd_root_dev = {
357f0f8cef5SAlex Elder 	.init_name =    "rbd",
358f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
359f0f8cef5SAlex Elder };
360f0f8cef5SAlex Elder 
36106ecc6cbSAlex Elder static __printf(2, 3)
36206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
36306ecc6cbSAlex Elder {
36406ecc6cbSAlex Elder 	struct va_format vaf;
36506ecc6cbSAlex Elder 	va_list args;
36606ecc6cbSAlex Elder 
36706ecc6cbSAlex Elder 	va_start(args, fmt);
36806ecc6cbSAlex Elder 	vaf.fmt = fmt;
36906ecc6cbSAlex Elder 	vaf.va = &args;
37006ecc6cbSAlex Elder 
37106ecc6cbSAlex Elder 	if (!rbd_dev)
37206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
37306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
37406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
37506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
37606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
37706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
37806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
37906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
38006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
38106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
38206ecc6cbSAlex Elder 	else	/* punt */
38306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
38406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
38506ecc6cbSAlex Elder 	va_end(args);
38606ecc6cbSAlex Elder }
38706ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - verify a condition that must always hold.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, the
 * line number and the text of the expression at KERN_ERR level and
 * then calls BUG().  Without RBD_DEBUG the macro expands to a no-op
 * and expr is not evaluated.
 *
 * The checking variant is wrapped in do { } while (0) so that it
 * acts as exactly one statement: it is safe as the body of an
 * unbraced if/else, and, like the no-op variant, it requires the
 * trailing semicolon.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
400dfc5606dSYehuda Sadeh 
4018b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
4028b3e1a56SAlex Elder 
403117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
404117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
40559c2be1eSYehuda Sadeh 
406602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
407602adf40SYehuda Sadeh {
408f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
409b82d167bSAlex Elder 	bool removing = false;
410602adf40SYehuda Sadeh 
411f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
412602adf40SYehuda Sadeh 		return -EROFS;
413602adf40SYehuda Sadeh 
414a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
415b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
416b82d167bSAlex Elder 		removing = true;
417b82d167bSAlex Elder 	else
418b82d167bSAlex Elder 		rbd_dev->open_count++;
419a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
420b82d167bSAlex Elder 	if (removing)
421b82d167bSAlex Elder 		return -ENOENT;
422b82d167bSAlex Elder 
42342382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
424c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
425f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
42642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
427340c7a2bSAlex Elder 
428602adf40SYehuda Sadeh 	return 0;
429602adf40SYehuda Sadeh }
430602adf40SYehuda Sadeh 
431dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
432dfc5606dSYehuda Sadeh {
433dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
434b82d167bSAlex Elder 	unsigned long open_count_before;
435b82d167bSAlex Elder 
436a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
437b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
438a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
439b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
440dfc5606dSYehuda Sadeh 
44142382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
442c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
44342382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
444dfc5606dSYehuda Sadeh 
445dfc5606dSYehuda Sadeh 	return 0;
446dfc5606dSYehuda Sadeh }
447dfc5606dSYehuda Sadeh 
448602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
449602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
450602adf40SYehuda Sadeh 	.open			= rbd_open,
451dfc5606dSYehuda Sadeh 	.release		= rbd_release,
452602adf40SYehuda Sadeh };
453602adf40SYehuda Sadeh 
454602adf40SYehuda Sadeh /*
455602adf40SYehuda Sadeh  * Initialize an rbd client instance.
45643ae4701SAlex Elder  * We own *ceph_opts.
457602adf40SYehuda Sadeh  */
458f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
459602adf40SYehuda Sadeh {
460602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
461602adf40SYehuda Sadeh 	int ret = -ENOMEM;
462602adf40SYehuda Sadeh 
46337206ee5SAlex Elder 	dout("%s:\n", __func__);
464602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
465602adf40SYehuda Sadeh 	if (!rbdc)
466602adf40SYehuda Sadeh 		goto out_opt;
467602adf40SYehuda Sadeh 
468602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
469602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
470602adf40SYehuda Sadeh 
471bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
472bc534d86SAlex Elder 
47343ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
474602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
475bc534d86SAlex Elder 		goto out_mutex;
47643ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
477602adf40SYehuda Sadeh 
478602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
479602adf40SYehuda Sadeh 	if (ret < 0)
480602adf40SYehuda Sadeh 		goto out_err;
481602adf40SYehuda Sadeh 
482432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
483602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
484432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
485602adf40SYehuda Sadeh 
486bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
48737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
488bc534d86SAlex Elder 
489602adf40SYehuda Sadeh 	return rbdc;
490602adf40SYehuda Sadeh 
491602adf40SYehuda Sadeh out_err:
492602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
493bc534d86SAlex Elder out_mutex:
494bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
495602adf40SYehuda Sadeh 	kfree(rbdc);
496602adf40SYehuda Sadeh out_opt:
49743ae4701SAlex Elder 	if (ceph_opts)
49843ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
49937206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
50037206ee5SAlex Elder 
50128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
502602adf40SYehuda Sadeh }
503602adf40SYehuda Sadeh 
5042f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5052f82ee54SAlex Elder {
5062f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5072f82ee54SAlex Elder 
5082f82ee54SAlex Elder 	return rbdc;
5092f82ee54SAlex Elder }
5102f82ee54SAlex Elder 
511602adf40SYehuda Sadeh /*
5121f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5131f7ba331SAlex Elder  * found, bump its reference count.
514602adf40SYehuda Sadeh  */
5151f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
516602adf40SYehuda Sadeh {
517602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5181f7ba331SAlex Elder 	bool found = false;
519602adf40SYehuda Sadeh 
52043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
521602adf40SYehuda Sadeh 		return NULL;
522602adf40SYehuda Sadeh 
5231f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5241f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5251f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5262f82ee54SAlex Elder 			__rbd_get_client(client_node);
5272f82ee54SAlex Elder 
5281f7ba331SAlex Elder 			found = true;
5291f7ba331SAlex Elder 			break;
5301f7ba331SAlex Elder 		}
5311f7ba331SAlex Elder 	}
5321f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5331f7ba331SAlex Elder 
5341f7ba331SAlex Elder 	return found ? client_node : NULL;
535602adf40SYehuda Sadeh }
536602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* values are not options
 * themselves; they delimit the ranges that parse_rbd_opts_token()
 * treats as int, string and Boolean arguments.
 */
enum {
	/* int args come before this marker */
	Opt_last_int = 0,
	/* string args come before this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args come before this marker */
	Opt_last_bool,
};
55059c2be1eSYehuda Sadeh 
55143ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
55259c2be1eSYehuda Sadeh 	/* int args above */
55359c2be1eSYehuda Sadeh 	/* string args above */
554be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
555cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
556cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
557cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
558cc0538b6SAlex Elder 	/* Boolean args above */
55959c2be1eSYehuda Sadeh 	{-1, NULL}
56059c2be1eSYehuda Sadeh };
56159c2be1eSYehuda Sadeh 
/* Options specific to rbd, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;
};

/* Value of read_only when no option says otherwise */
#define RBD_READ_ONLY_DEFAULT	false
56798571b5aSAlex Elder 
56859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
56959c2be1eSYehuda Sadeh {
57043ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
57159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
57259c2be1eSYehuda Sadeh 	int token, intval, ret;
57359c2be1eSYehuda Sadeh 
57443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
57559c2be1eSYehuda Sadeh 	if (token < 0)
57659c2be1eSYehuda Sadeh 		return -EINVAL;
57759c2be1eSYehuda Sadeh 
57859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
57959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
58059c2be1eSYehuda Sadeh 		if (ret < 0) {
58159c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
58259c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
58359c2be1eSYehuda Sadeh 			return ret;
58459c2be1eSYehuda Sadeh 		}
58559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
58659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
58759c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
58859c2be1eSYehuda Sadeh 		     argstr[0].from);
589cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
590cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
59159c2be1eSYehuda Sadeh 	} else {
59259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
59359c2be1eSYehuda Sadeh 	}
59459c2be1eSYehuda Sadeh 
59559c2be1eSYehuda Sadeh 	switch (token) {
596cc0538b6SAlex Elder 	case Opt_read_only:
597cc0538b6SAlex Elder 		rbd_opts->read_only = true;
598cc0538b6SAlex Elder 		break;
599cc0538b6SAlex Elder 	case Opt_read_write:
600cc0538b6SAlex Elder 		rbd_opts->read_only = false;
601cc0538b6SAlex Elder 		break;
60259c2be1eSYehuda Sadeh 	default:
603aafb230eSAlex Elder 		rbd_assert(false);
604aafb230eSAlex Elder 		break;
60559c2be1eSYehuda Sadeh 	}
60659c2be1eSYehuda Sadeh 	return 0;
60759c2be1eSYehuda Sadeh }
60859c2be1eSYehuda Sadeh 
/*
 * Get an rbd client matching the given ceph options, creating one
 * if no existing client can be shared.  The options are consumed
 * either way: destroyed here when an existing client is reused,
 * handed to rbd_client_create() otherwise.
 *
 * Returns whatever rbd_client_find() or rbd_client_create() gave
 * back; the latter may be an ERR_PTR() value.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
625602adf40SYehuda Sadeh 
626602adf40SYehuda Sadeh /*
627602adf40SYehuda Sadeh  * Destroy ceph client
628d23a4b3fSAlex Elder  *
629432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
630602adf40SYehuda Sadeh  */
631602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
632602adf40SYehuda Sadeh {
633602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
634602adf40SYehuda Sadeh 
63537206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
636cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
637602adf40SYehuda Sadeh 	list_del(&rbdc->node);
638cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
639602adf40SYehuda Sadeh 
640602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
641602adf40SYehuda Sadeh 	kfree(rbdc);
642602adf40SYehuda Sadeh }
643602adf40SYehuda Sadeh 
644602adf40SYehuda Sadeh /*
645602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
646602adf40SYehuda Sadeh  * it.
647602adf40SYehuda Sadeh  */
6489d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
649602adf40SYehuda Sadeh {
650c53d5893SAlex Elder 	if (rbdc)
6519d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
652602adf40SYehuda Sadeh }
653602adf40SYehuda Sadeh 
654a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
655a30b71b9SAlex Elder {
656a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
657a30b71b9SAlex Elder }
658a30b71b9SAlex Elder 
6598e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6608e94af8eSAlex Elder {
661103a150fSAlex Elder 	size_t size;
662103a150fSAlex Elder 	u32 snap_count;
663103a150fSAlex Elder 
664103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
665103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
666103a150fSAlex Elder 		return false;
667103a150fSAlex Elder 
668db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
669db2388b6SAlex Elder 
670db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
671db2388b6SAlex Elder 		return false;
672db2388b6SAlex Elder 
673db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
674db2388b6SAlex Elder 
675db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
676db2388b6SAlex Elder 		return false;
677db2388b6SAlex Elder 
678103a150fSAlex Elder 	/*
679103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
680103a150fSAlex Elder 	 * that limits the number of snapshots.
681103a150fSAlex Elder 	 */
682103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
683103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
684103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
685103a150fSAlex Elder 		return false;
686103a150fSAlex Elder 
687103a150fSAlex Elder 	/*
688103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
689103a150fSAlex Elder 	 * header must also be representable in a size_t.
690103a150fSAlex Elder 	 */
691103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
692103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
693103a150fSAlex Elder 		return false;
694103a150fSAlex Elder 
695103a150fSAlex Elder 	return true;
6968e94af8eSAlex Elder }
6978e94af8eSAlex Elder 
698602adf40SYehuda Sadeh /*
699602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
700602adf40SYehuda Sadeh  * header.
701602adf40SYehuda Sadeh  */
702602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7034156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
704602adf40SYehuda Sadeh {
705ccece235SAlex Elder 	u32 snap_count;
70658c17b0eSAlex Elder 	size_t len;
707d2bb24e5SAlex Elder 	size_t size;
708621901d6SAlex Elder 	u32 i;
709602adf40SYehuda Sadeh 
7106a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7116a52325fSAlex Elder 
712103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
713103a150fSAlex Elder 
71458c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
71558c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7166a52325fSAlex Elder 	if (!header->object_prefix)
717602adf40SYehuda Sadeh 		return -ENOMEM;
71858c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
71958c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
72000f1f36fSAlex Elder 
721602adf40SYehuda Sadeh 	if (snap_count) {
722f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
723f785cc1dSAlex Elder 
724621901d6SAlex Elder 		/* Save a copy of the snapshot names */
725621901d6SAlex Elder 
726f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
727f785cc1dSAlex Elder 			return -EIO;
728f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
729602adf40SYehuda Sadeh 		if (!header->snap_names)
7306a52325fSAlex Elder 			goto out_err;
731f785cc1dSAlex Elder 		/*
732f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
733f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
734f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
735f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
736f785cc1dSAlex Elder 		 */
737f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
738f785cc1dSAlex Elder 			snap_names_len);
7396a52325fSAlex Elder 
740621901d6SAlex Elder 		/* Record each snapshot's size */
741621901d6SAlex Elder 
742d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
743d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
744602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7456a52325fSAlex Elder 			goto out_err;
746621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
747621901d6SAlex Elder 			header->snap_sizes[i] =
748621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
749602adf40SYehuda Sadeh 	} else {
750ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
751602adf40SYehuda Sadeh 		header->snap_names = NULL;
752602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
753602adf40SYehuda Sadeh 	}
754849b4260SAlex Elder 
75534b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
756602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
757602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
758602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7596a52325fSAlex Elder 
760621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
761621901d6SAlex Elder 
762f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7636a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7646a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7656a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7666a52325fSAlex Elder 	if (!header->snapc)
7676a52325fSAlex Elder 		goto out_err;
768602adf40SYehuda Sadeh 
769602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
770505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
771602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
772621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
773602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
774602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
775602adf40SYehuda Sadeh 
776602adf40SYehuda Sadeh 	return 0;
777602adf40SYehuda Sadeh 
7786a52325fSAlex Elder out_err:
779849b4260SAlex Elder 	kfree(header->snap_sizes);
780ccece235SAlex Elder 	header->snap_sizes = NULL;
781602adf40SYehuda Sadeh 	kfree(header->snap_names);
782ccece235SAlex Elder 	header->snap_names = NULL;
7836a52325fSAlex Elder 	kfree(header->object_prefix);
7846a52325fSAlex Elder 	header->object_prefix = NULL;
785ccece235SAlex Elder 
78600f1f36fSAlex Elder 	return -ENOMEM;
787602adf40SYehuda Sadeh }
788602adf40SYehuda Sadeh 
7899e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7909e15b77dSAlex Elder {
7919e15b77dSAlex Elder 	struct rbd_snap *snap;
7929e15b77dSAlex Elder 
7939e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7949e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7959e15b77dSAlex Elder 
7969e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7979e15b77dSAlex Elder 		if (snap_id == snap->id)
7989e15b77dSAlex Elder 			return snap->name;
7999e15b77dSAlex Elder 
8009e15b77dSAlex Elder 	return NULL;
8019e15b77dSAlex Elder }
8029e15b77dSAlex Elder 
8038836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
804602adf40SYehuda Sadeh {
805602adf40SYehuda Sadeh 
806e86924a8SAlex Elder 	struct rbd_snap *snap;
80700f1f36fSAlex Elder 
808e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
809e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
8100d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
811e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
81234b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
81300f1f36fSAlex Elder 
814e86924a8SAlex Elder 			return 0;
815602adf40SYehuda Sadeh 		}
81600f1f36fSAlex Elder 	}
817e86924a8SAlex Elder 
81800f1f36fSAlex Elder 	return -ENOENT;
81900f1f36fSAlex Elder }
820602adf40SYehuda Sadeh 
821819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
822602adf40SYehuda Sadeh {
82378dc447dSAlex Elder 	int ret;
824602adf40SYehuda Sadeh 
8250d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
826cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8270d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
82899c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
82934b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
830e86924a8SAlex Elder 		ret = 0;
831602adf40SYehuda Sadeh 	} else {
8320d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
833602adf40SYehuda Sadeh 		if (ret < 0)
834602adf40SYehuda Sadeh 			goto done;
835f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
836602adf40SYehuda Sadeh 	}
8376d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8386d292906SAlex Elder 
839602adf40SYehuda Sadeh done:
840602adf40SYehuda Sadeh 	return ret;
841602adf40SYehuda Sadeh }
842602adf40SYehuda Sadeh 
843602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
844602adf40SYehuda Sadeh {
845849b4260SAlex Elder 	kfree(header->object_prefix);
846d78fd7aeSAlex Elder 	header->object_prefix = NULL;
847602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
848d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
849849b4260SAlex Elder 	kfree(header->snap_names);
850d78fd7aeSAlex Elder 	header->snap_names = NULL;
851d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
852d78fd7aeSAlex Elder 	header->snapc = NULL;
853602adf40SYehuda Sadeh }
854602adf40SYehuda Sadeh 
85598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
856602adf40SYehuda Sadeh {
85765ccfe21SAlex Elder 	char *name;
85865ccfe21SAlex Elder 	u64 segment;
85965ccfe21SAlex Elder 	int ret;
860602adf40SYehuda Sadeh 
8612fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
86265ccfe21SAlex Elder 	if (!name)
86365ccfe21SAlex Elder 		return NULL;
86465ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8652fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
86665ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8672fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
86865ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
86965ccfe21SAlex Elder 			segment, ret);
87065ccfe21SAlex Elder 		kfree(name);
87165ccfe21SAlex Elder 		name = NULL;
87265ccfe21SAlex Elder 	}
873602adf40SYehuda Sadeh 
87465ccfe21SAlex Elder 	return name;
87565ccfe21SAlex Elder }
876602adf40SYehuda Sadeh 
87765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
87865ccfe21SAlex Elder {
87965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
880602adf40SYehuda Sadeh 
88165ccfe21SAlex Elder 	return offset & (segment_size - 1);
88265ccfe21SAlex Elder }
88365ccfe21SAlex Elder 
88465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
88565ccfe21SAlex Elder 				u64 offset, u64 length)
88665ccfe21SAlex Elder {
88765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
88865ccfe21SAlex Elder 
88965ccfe21SAlex Elder 	offset &= segment_size - 1;
89065ccfe21SAlex Elder 
891aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
89265ccfe21SAlex Elder 	if (offset + length > segment_size)
89365ccfe21SAlex Elder 		length = segment_size - offset;
89465ccfe21SAlex Elder 
89565ccfe21SAlex Elder 	return length;
896602adf40SYehuda Sadeh }
897602adf40SYehuda Sadeh 
898602adf40SYehuda Sadeh /*
899029bcbd8SJosh Durgin  * returns the size of an object in the image
900029bcbd8SJosh Durgin  */
901029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
902029bcbd8SJosh Durgin {
903029bcbd8SJosh Durgin 	return 1 << header->obj_order;
904029bcbd8SJosh Durgin }
905029bcbd8SJosh Durgin 
906029bcbd8SJosh Durgin /*
907602adf40SYehuda Sadeh  * bio helpers
908602adf40SYehuda Sadeh  */
909602adf40SYehuda Sadeh 
910602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
911602adf40SYehuda Sadeh {
912602adf40SYehuda Sadeh 	struct bio *tmp;
913602adf40SYehuda Sadeh 
914602adf40SYehuda Sadeh 	while (chain) {
915602adf40SYehuda Sadeh 		tmp = chain;
916602adf40SYehuda Sadeh 		chain = chain->bi_next;
917602adf40SYehuda Sadeh 		bio_put(tmp);
918602adf40SYehuda Sadeh 	}
919602adf40SYehuda Sadeh }
920602adf40SYehuda Sadeh 
921602adf40SYehuda Sadeh /*
922602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
923602adf40SYehuda Sadeh  */
924602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
925602adf40SYehuda Sadeh {
926602adf40SYehuda Sadeh 	struct bio_vec *bv;
927602adf40SYehuda Sadeh 	unsigned long flags;
928602adf40SYehuda Sadeh 	void *buf;
929602adf40SYehuda Sadeh 	int i;
930602adf40SYehuda Sadeh 	int pos = 0;
931602adf40SYehuda Sadeh 
932602adf40SYehuda Sadeh 	while (chain) {
933602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
934602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
935602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
936602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
937602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
938602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
93985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
940602adf40SYehuda Sadeh 			}
941602adf40SYehuda Sadeh 			pos += bv->bv_len;
942602adf40SYehuda Sadeh 		}
943602adf40SYehuda Sadeh 
944602adf40SYehuda Sadeh 		chain = chain->bi_next;
945602adf40SYehuda Sadeh 	}
946602adf40SYehuda Sadeh }
947602adf40SYehuda Sadeh 
948602adf40SYehuda Sadeh /*
949f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
950f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
951602adf40SYehuda Sadeh  */
952f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
953f7760dadSAlex Elder 					unsigned int offset,
954f7760dadSAlex Elder 					unsigned int len,
955f7760dadSAlex Elder 					gfp_t gfpmask)
956602adf40SYehuda Sadeh {
957f7760dadSAlex Elder 	struct bio_vec *bv;
958f7760dadSAlex Elder 	unsigned int resid;
959f7760dadSAlex Elder 	unsigned short idx;
960f7760dadSAlex Elder 	unsigned int voff;
961f7760dadSAlex Elder 	unsigned short end_idx;
962f7760dadSAlex Elder 	unsigned short vcnt;
963f7760dadSAlex Elder 	struct bio *bio;
964602adf40SYehuda Sadeh 
965f7760dadSAlex Elder 	/* Handle the easy case for the caller */
966f7760dadSAlex Elder 
967f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
968f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
969f7760dadSAlex Elder 
970f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
971f7760dadSAlex Elder 		return NULL;
972f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
973f7760dadSAlex Elder 		return NULL;
974f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
975f7760dadSAlex Elder 		return NULL;
976f7760dadSAlex Elder 
977f7760dadSAlex Elder 	/* Find first affected segment... */
978f7760dadSAlex Elder 
979f7760dadSAlex Elder 	resid = offset;
980f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
981f7760dadSAlex Elder 		if (resid < bv->bv_len)
982f7760dadSAlex Elder 			break;
983f7760dadSAlex Elder 		resid -= bv->bv_len;
984602adf40SYehuda Sadeh 	}
985f7760dadSAlex Elder 	voff = resid;
986602adf40SYehuda Sadeh 
987f7760dadSAlex Elder 	/* ...and the last affected segment */
988542582fcSAlex Elder 
989f7760dadSAlex Elder 	resid += len;
990f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
991f7760dadSAlex Elder 		if (resid <= bv->bv_len)
992f7760dadSAlex Elder 			break;
993f7760dadSAlex Elder 		resid -= bv->bv_len;
994f7760dadSAlex Elder 	}
995f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
996602adf40SYehuda Sadeh 
997f7760dadSAlex Elder 	/* Build the clone */
998f7760dadSAlex Elder 
999f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1000f7760dadSAlex Elder 	if (!bio)
1001f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1002f7760dadSAlex Elder 
1003f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1004f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1005f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1006f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1007602adf40SYehuda Sadeh 
1008602adf40SYehuda Sadeh 	/*
1009f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1010f7760dadSAlex Elder 	 * and last (or only) entries.
1011602adf40SYehuda Sadeh 	 */
1012f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1013f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1014f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1015f7760dadSAlex Elder 	if (vcnt > 1) {
1016f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1017f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1018602adf40SYehuda Sadeh 	} else {
1019f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1020602adf40SYehuda Sadeh 	}
1021602adf40SYehuda Sadeh 
1022f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1023f7760dadSAlex Elder 	bio->bi_size = len;
1024f7760dadSAlex Elder 	bio->bi_idx = 0;
1025602adf40SYehuda Sadeh 
1026f7760dadSAlex Elder 	return bio;
1027602adf40SYehuda Sadeh }
1028602adf40SYehuda Sadeh 
1029f7760dadSAlex Elder /*
1030f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1031f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1032f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1033f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1034f7760dadSAlex Elder  *
1035f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1036f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1037f7760dadSAlex Elder  * the start of data to be cloned is located.
1038f7760dadSAlex Elder  *
1039f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1040f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1041f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1042f7760dadSAlex Elder  */
1043f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1044f7760dadSAlex Elder 					unsigned int *offset,
1045f7760dadSAlex Elder 					unsigned int len,
1046f7760dadSAlex Elder 					gfp_t gfpmask)
1047f7760dadSAlex Elder {
1048f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1049f7760dadSAlex Elder 	unsigned int off = *offset;
1050f7760dadSAlex Elder 	struct bio *chain = NULL;
1051f7760dadSAlex Elder 	struct bio **end;
1052602adf40SYehuda Sadeh 
1053f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1054602adf40SYehuda Sadeh 
1055f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1056f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1057602adf40SYehuda Sadeh 
1058f7760dadSAlex Elder 	end = &chain;
1059f7760dadSAlex Elder 	while (len) {
1060f7760dadSAlex Elder 		unsigned int bi_size;
1061f7760dadSAlex Elder 		struct bio *bio;
1062f7760dadSAlex Elder 
1063f5400b7aSAlex Elder 		if (!bi) {
1064f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1065f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1066f5400b7aSAlex Elder 		}
1067f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1068f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1069f7760dadSAlex Elder 		if (!bio)
1070f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1071f7760dadSAlex Elder 
1072f7760dadSAlex Elder 		*end = bio;
1073f7760dadSAlex Elder 		end = &bio->bi_next;
1074f7760dadSAlex Elder 
1075f7760dadSAlex Elder 		off += bi_size;
1076f7760dadSAlex Elder 		if (off == bi->bi_size) {
1077f7760dadSAlex Elder 			bi = bi->bi_next;
1078f7760dadSAlex Elder 			off = 0;
1079f7760dadSAlex Elder 		}
1080f7760dadSAlex Elder 		len -= bi_size;
1081f7760dadSAlex Elder 	}
1082f7760dadSAlex Elder 	*bio_src = bi;
1083f7760dadSAlex Elder 	*offset = off;
1084f7760dadSAlex Elder 
1085f7760dadSAlex Elder 	return chain;
1086f7760dadSAlex Elder out_err:
1087f7760dadSAlex Elder 	bio_chain_put(chain);
1088f7760dadSAlex Elder 
1089602adf40SYehuda Sadeh 	return NULL;
1090602adf40SYehuda Sadeh }
1091602adf40SYehuda Sadeh 
1092926f9b3fSAlex Elder /*
1093926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1094926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1095926f9b3fSAlex Elder  * again.
1096926f9b3fSAlex Elder  */
1097926f9b3fSAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
1098926f9b3fSAlex Elder {
1099926f9b3fSAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1100926f9b3fSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
1101926f9b3fSAlex Elder 		struct rbd_device *rbd_dev;
1102926f9b3fSAlex Elder 
1103926f9b3fSAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
1104926f9b3fSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1105926f9b3fSAlex Elder 			obj_request);
1106926f9b3fSAlex Elder 	}
1107926f9b3fSAlex Elder }
1108926f9b3fSAlex Elder 
1109926f9b3fSAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1110926f9b3fSAlex Elder {
1111926f9b3fSAlex Elder 	smp_mb();
1112926f9b3fSAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1113926f9b3fSAlex Elder }
1114926f9b3fSAlex Elder 
11156365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
11166365d33aSAlex Elder {
11176365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
11186365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
11196365d33aSAlex Elder 		struct rbd_device *rbd_dev;
11206365d33aSAlex Elder 
11216365d33aSAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
11226365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
11236365d33aSAlex Elder 			obj_request);
11246365d33aSAlex Elder 	}
11256365d33aSAlex Elder }
11266365d33aSAlex Elder 
11276365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
11286365d33aSAlex Elder {
11296365d33aSAlex Elder 	smp_mb();
11306365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
11316365d33aSAlex Elder }
11326365d33aSAlex Elder 
1133bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1134bf0d5f50SAlex Elder {
113537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
113637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1137bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1138bf0d5f50SAlex Elder }
1139bf0d5f50SAlex Elder 
1140bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1141bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1142bf0d5f50SAlex Elder {
1143bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
114437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
114537206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1146bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1147bf0d5f50SAlex Elder }
1148bf0d5f50SAlex Elder 
1149bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1150bf0d5f50SAlex Elder {
115137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
115237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1153bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1154bf0d5f50SAlex Elder }
1155bf0d5f50SAlex Elder 
1156bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1157bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1158bf0d5f50SAlex Elder {
1159bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
116037206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
116137206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1162bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1163bf0d5f50SAlex Elder }
1164bf0d5f50SAlex Elder 
1165bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1166bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1167bf0d5f50SAlex Elder {
116825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
116925dcf954SAlex Elder 
1170b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1171bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
117225dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
11736365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
11746365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1175bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
117625dcf954SAlex Elder 	img_request->obj_request_count++;
117725dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
117837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
117937206ee5SAlex Elder 		obj_request->which);
1180bf0d5f50SAlex Elder }
1181bf0d5f50SAlex Elder 
1182bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1183bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1184bf0d5f50SAlex Elder {
1185bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
118625dcf954SAlex Elder 
118737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
118837206ee5SAlex Elder 		obj_request->which);
1189bf0d5f50SAlex Elder 	list_del(&obj_request->links);
119025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
119125dcf954SAlex Elder 	img_request->obj_request_count--;
119225dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
119325dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
11946365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1195bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1196bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
119725dcf954SAlex Elder 	obj_request->callback = NULL;
1198bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1199bf0d5f50SAlex Elder }
1200bf0d5f50SAlex Elder 
1201bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1202bf0d5f50SAlex Elder {
1203bf0d5f50SAlex Elder 	switch (type) {
12049969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1205bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1206788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1207bf0d5f50SAlex Elder 		return true;
1208bf0d5f50SAlex Elder 	default:
1209bf0d5f50SAlex Elder 		return false;
1210bf0d5f50SAlex Elder 	}
1211bf0d5f50SAlex Elder }
1212bf0d5f50SAlex Elder 
1213bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1214bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1215bf0d5f50SAlex Elder {
121637206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
121737206ee5SAlex Elder 
1218bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1219bf0d5f50SAlex Elder }
1220bf0d5f50SAlex Elder 
1221bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1222bf0d5f50SAlex Elder {
122355f27e09SAlex Elder 
122437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
122555f27e09SAlex Elder 
122655f27e09SAlex Elder 	/*
122755f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
122855f27e09SAlex Elder 	 * count for the image request.  We could instead use
122955f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
123055f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
123155f27e09SAlex Elder 	 */
123255f27e09SAlex Elder 	if (!img_request->result) {
123355f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
123455f27e09SAlex Elder 		u64 xferred = 0;
123555f27e09SAlex Elder 
123655f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
123755f27e09SAlex Elder 			xferred += obj_request->xferred;
123855f27e09SAlex Elder 		img_request->xferred = xferred;
123955f27e09SAlex Elder 	}
124055f27e09SAlex Elder 
1241bf0d5f50SAlex Elder 	if (img_request->callback)
1242bf0d5f50SAlex Elder 		img_request->callback(img_request);
1243bf0d5f50SAlex Elder 	else
1244bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1245bf0d5f50SAlex Elder }
1246bf0d5f50SAlex Elder 
1247788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1248788e2df3SAlex Elder 
1249788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1250788e2df3SAlex Elder {
125137206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
125237206ee5SAlex Elder 
1253788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1254788e2df3SAlex Elder }
1255788e2df3SAlex Elder 
12560c425248SAlex Elder /*
12570c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
12580c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
12590c425248SAlex Elder  * and currently never change thereafter.
12600c425248SAlex Elder  */
12610c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
12620c425248SAlex Elder {
12630c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
12640c425248SAlex Elder 	smp_mb();
12650c425248SAlex Elder }
12660c425248SAlex Elder 
12670c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
12680c425248SAlex Elder {
12690c425248SAlex Elder 	smp_mb();
12700c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
12710c425248SAlex Elder }
12720c425248SAlex Elder 
12739849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
12749849e986SAlex Elder {
12759849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
12769849e986SAlex Elder 	smp_mb();
12779849e986SAlex Elder }
12789849e986SAlex Elder 
12799849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
12809849e986SAlex Elder {
12819849e986SAlex Elder 	smp_mb();
12829849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
12839849e986SAlex Elder }
12849849e986SAlex Elder 
1285d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1286d0b2e944SAlex Elder {
1287d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1288d0b2e944SAlex Elder 	smp_mb();
1289d0b2e944SAlex Elder }
1290d0b2e944SAlex Elder 
1291d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1292d0b2e944SAlex Elder {
1293d0b2e944SAlex Elder 	smp_mb();
1294d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1295d0b2e944SAlex Elder }
1296d0b2e944SAlex Elder 
12976e2a4505SAlex Elder static void
12986e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
12996e2a4505SAlex Elder {
13006e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
13016e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
13026e2a4505SAlex Elder 		obj_request->xferred, obj_request->length);
13036e2a4505SAlex Elder 	/*
13046e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
13056e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
13066e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
13076e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
13086e2a4505SAlex Elder 	 * was satisfied.
13096e2a4505SAlex Elder 	 */
13106e2a4505SAlex Elder 	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
13116e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
13126e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, 0);
13136e2a4505SAlex Elder 		obj_request->result = 0;
13146e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
13156e2a4505SAlex Elder 	} else if (obj_request->xferred < obj_request->length &&
13166e2a4505SAlex Elder 			!obj_request->result) {
13176e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
13186e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
13196e2a4505SAlex Elder 	}
13206e2a4505SAlex Elder 	obj_request_done_set(obj_request);
13216e2a4505SAlex Elder }
13226e2a4505SAlex Elder 
1323bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1324bf0d5f50SAlex Elder {
132537206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
132637206ee5SAlex Elder 		obj_request->callback);
1327bf0d5f50SAlex Elder 	if (obj_request->callback)
1328bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1329788e2df3SAlex Elder 	else
1330788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1331bf0d5f50SAlex Elder }
1332bf0d5f50SAlex Elder 
/*
 * Handler for OSD ops that need no post-processing: the object
 * request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
133839bf2c5dSAlex Elder 
1339c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1340bf0d5f50SAlex Elder {
13418b3e1a56SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
13428b3e1a56SAlex Elder 	bool layered = img_request && img_request_layered_test(img_request);
13438b3e1a56SAlex Elder 
13448b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
13458b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
13468b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
13478b3e1a56SAlex Elder 	if (layered && obj_request->result == -ENOENT)
13488b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
13498b3e1a56SAlex Elder 	else if (img_request)
13506e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
13516e2a4505SAlex Elder 	else
135207741308SAlex Elder 		obj_request_done_set(obj_request);
1353bf0d5f50SAlex Elder }
1354bf0d5f50SAlex Elder 
1355c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1356bf0d5f50SAlex Elder {
13571b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
13581b83bef2SSage Weil 		obj_request->result, obj_request->length);
13591b83bef2SSage Weil 	/*
13608b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
13618b3e1a56SAlex Elder 	 * it to our originally-requested length.
13621b83bef2SSage Weil 	 */
13631b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
136407741308SAlex Elder 	obj_request_done_set(obj_request);
1365bf0d5f50SAlex Elder }
1366bf0d5f50SAlex Elder 
/*
 * Handler for a completed OSD stat.  A plain stat needs no further
 * work, so the object request is just marked done.  (More will be
 * needed once stat is used as a step of a layered-image write.)
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1376fbfab539SAlex Elder 
1377bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1378bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1379bf0d5f50SAlex Elder {
1380bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1381bf0d5f50SAlex Elder 	u16 opcode;
1382bf0d5f50SAlex Elder 
138337206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1384bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
13856365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request) ^
13866365d33aSAlex Elder 				!obj_request->img_request);
13876365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request) ^
1388bf0d5f50SAlex Elder 				(obj_request->which == BAD_WHICH));
1389bf0d5f50SAlex Elder 
13901b83bef2SSage Weil 	if (osd_req->r_result < 0)
13911b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1392bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1393bf0d5f50SAlex Elder 
13941b83bef2SSage Weil 	WARN_ON(osd_req->r_num_ops != 1);	/* For now */
1395bf0d5f50SAlex Elder 
1396c47f9371SAlex Elder 	/*
1397c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1398c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1399c47f9371SAlex Elder 	 */
14001b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1401c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
140279528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1403bf0d5f50SAlex Elder 	switch (opcode) {
1404bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1405c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1406bf0d5f50SAlex Elder 		break;
1407bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1408c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1409bf0d5f50SAlex Elder 		break;
1410fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1411c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1412fbfab539SAlex Elder 		break;
141336be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1414b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
14159969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1416c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
14179969ebc5SAlex Elder 		break;
1418bf0d5f50SAlex Elder 	default:
1419bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1420bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1421bf0d5f50SAlex Elder 		break;
1422bf0d5f50SAlex Elder 	}
1423bf0d5f50SAlex Elder 
142407741308SAlex Elder 	if (obj_request_done_test(obj_request))
1425bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1426bf0d5f50SAlex Elder }
1427bf0d5f50SAlex Elder 
14282fa12320SAlex Elder static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
142979528734SAlex Elder 					bool write_request)
1430430c28c3SAlex Elder {
1431430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
14328c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1433430c28c3SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1434430c28c3SAlex Elder 	u64 snap_id = CEPH_NOSNAP;
1435430c28c3SAlex Elder 	struct timespec *mtime = NULL;
1436430c28c3SAlex Elder 	struct timespec now;
1437430c28c3SAlex Elder 
14388c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1439430c28c3SAlex Elder 
1440430c28c3SAlex Elder 	if (write_request) {
1441430c28c3SAlex Elder 		now = CURRENT_TIME;
1442430c28c3SAlex Elder 		mtime = &now;
1443430c28c3SAlex Elder 		if (img_request)
1444430c28c3SAlex Elder 			snapc = img_request->snapc;
14452fa12320SAlex Elder 	} else if (img_request) {
1446430c28c3SAlex Elder 		snap_id = img_request->snap_id;
1447430c28c3SAlex Elder 	}
14488c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
144979528734SAlex Elder 			snapc, snap_id, mtime);
1450430c28c3SAlex Elder }
1451430c28c3SAlex Elder 
1452bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1453bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1454bf0d5f50SAlex Elder 					bool write_request,
1455430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1456bf0d5f50SAlex Elder {
1457bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1458bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1459bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1460bf0d5f50SAlex Elder 
14616365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
14626365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
14636365d33aSAlex Elder 
14640c425248SAlex Elder 		rbd_assert(write_request ==
14650c425248SAlex Elder 				img_request_write_test(img_request));
14660c425248SAlex Elder 		if (write_request)
1467bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1468bf0d5f50SAlex Elder 	}
1469bf0d5f50SAlex Elder 
1470bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1471bf0d5f50SAlex Elder 
1472bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1473bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1474bf0d5f50SAlex Elder 	if (!osd_req)
1475bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1476bf0d5f50SAlex Elder 
1477430c28c3SAlex Elder 	if (write_request)
1478bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1479430c28c3SAlex Elder 	else
1480bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1481bf0d5f50SAlex Elder 
1482bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1483bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1484bf0d5f50SAlex Elder 
1485bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1486bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1487bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1488bf0d5f50SAlex Elder 
1489bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1490bf0d5f50SAlex Elder 
1491bf0d5f50SAlex Elder 	return osd_req;
1492bf0d5f50SAlex Elder }
1493bf0d5f50SAlex Elder 
/* Release an OSD request by dropping a reference to it. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1498bf0d5f50SAlex Elder 
1499bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1500bf0d5f50SAlex Elder 
1501bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1502bf0d5f50SAlex Elder 						u64 offset, u64 length,
1503bf0d5f50SAlex Elder 						enum obj_request_type type)
1504bf0d5f50SAlex Elder {
1505bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1506bf0d5f50SAlex Elder 	size_t size;
1507bf0d5f50SAlex Elder 	char *name;
1508bf0d5f50SAlex Elder 
1509bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1510bf0d5f50SAlex Elder 
1511bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1512bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1513bf0d5f50SAlex Elder 	if (!obj_request)
1514bf0d5f50SAlex Elder 		return NULL;
1515bf0d5f50SAlex Elder 
1516bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1517bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1518bf0d5f50SAlex Elder 	obj_request->offset = offset;
1519bf0d5f50SAlex Elder 	obj_request->length = length;
1520926f9b3fSAlex Elder 	obj_request->flags = 0;
1521bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1522bf0d5f50SAlex Elder 	obj_request->type = type;
1523bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1524788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1525bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1526bf0d5f50SAlex Elder 
152737206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
152837206ee5SAlex Elder 		offset, length, (int)type, obj_request);
152937206ee5SAlex Elder 
1530bf0d5f50SAlex Elder 	return obj_request;
1531bf0d5f50SAlex Elder }
1532bf0d5f50SAlex Elder 
1533bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1534bf0d5f50SAlex Elder {
1535bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1536bf0d5f50SAlex Elder 
1537bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1538bf0d5f50SAlex Elder 
153937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
154037206ee5SAlex Elder 
1541bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1542bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1543bf0d5f50SAlex Elder 
1544bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1545bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1546bf0d5f50SAlex Elder 
1547bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1548bf0d5f50SAlex Elder 	switch (obj_request->type) {
15499969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
15509969ebc5SAlex Elder 		break;		/* Nothing to do */
1551bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1552bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1553bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1554bf0d5f50SAlex Elder 		break;
1555788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1556788e2df3SAlex Elder 		if (obj_request->pages)
1557788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1558788e2df3SAlex Elder 						obj_request->page_count);
1559788e2df3SAlex Elder 		break;
1560bf0d5f50SAlex Elder 	}
1561bf0d5f50SAlex Elder 
1562bf0d5f50SAlex Elder 	kfree(obj_request);
1563bf0d5f50SAlex Elder }
1564bf0d5f50SAlex Elder 
1565bf0d5f50SAlex Elder /*
1566bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1567bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1568bf0d5f50SAlex Elder  * (if there is one).
1569bf0d5f50SAlex Elder  */
1570cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1571cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1572bf0d5f50SAlex Elder 					u64 offset, u64 length,
15739849e986SAlex Elder 					bool write_request,
15749849e986SAlex Elder 					bool child_request)
1575bf0d5f50SAlex Elder {
1576bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1577bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1578bf0d5f50SAlex Elder 
1579bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1580bf0d5f50SAlex Elder 	if (!img_request)
1581bf0d5f50SAlex Elder 		return NULL;
1582bf0d5f50SAlex Elder 
1583bf0d5f50SAlex Elder 	if (write_request) {
1584bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1585bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1586bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1587bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1588bf0d5f50SAlex Elder 			kfree(img_request);
1589bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1590bf0d5f50SAlex Elder 		}
15910c425248SAlex Elder 
1592bf0d5f50SAlex Elder 	}
1593bf0d5f50SAlex Elder 
1594bf0d5f50SAlex Elder 	img_request->rq = NULL;
1595bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1596bf0d5f50SAlex Elder 	img_request->offset = offset;
1597bf0d5f50SAlex Elder 	img_request->length = length;
15980c425248SAlex Elder 	img_request->flags = 0;
15990c425248SAlex Elder 	if (write_request) {
16000c425248SAlex Elder 		img_request_write_set(img_request);
1601bf0d5f50SAlex Elder 		img_request->snapc = snapc;
16020c425248SAlex Elder 	} else {
1603bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
16040c425248SAlex Elder 	}
16059849e986SAlex Elder 	if (child_request)
16069849e986SAlex Elder 		img_request_child_set(img_request);
1607d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1608d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1609bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1610bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1611bf0d5f50SAlex Elder 	img_request->callback = NULL;
1612a5a337d4SAlex Elder 	img_request->result = 0;
1613bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1614bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1615bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1616bf0d5f50SAlex Elder 
1617bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1618bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1619bf0d5f50SAlex Elder 
162037206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
162137206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
162237206ee5SAlex Elder 		img_request);
162337206ee5SAlex Elder 
1624bf0d5f50SAlex Elder 	return img_request;
1625bf0d5f50SAlex Elder }
1626bf0d5f50SAlex Elder 
1627bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1628bf0d5f50SAlex Elder {
1629bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1630bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1631bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1632bf0d5f50SAlex Elder 
1633bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1634bf0d5f50SAlex Elder 
163537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
163637206ee5SAlex Elder 
1637bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1638bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
163925dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1640bf0d5f50SAlex Elder 
16410c425248SAlex Elder 	if (img_request_write_test(img_request))
1642bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1643bf0d5f50SAlex Elder 
16448b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
16458b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
16468b3e1a56SAlex Elder 
1647bf0d5f50SAlex Elder 	kfree(img_request);
1648bf0d5f50SAlex Elder }
1649bf0d5f50SAlex Elder 
16501217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
16511217857fSAlex Elder {
16526365d33aSAlex Elder 	struct rbd_img_request *img_request;
16531217857fSAlex Elder 	unsigned int xferred;
16541217857fSAlex Elder 	int result;
16558b3e1a56SAlex Elder 	bool more;
16561217857fSAlex Elder 
16576365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16586365d33aSAlex Elder 	img_request = obj_request->img_request;
16596365d33aSAlex Elder 
16601217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
16611217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
16621217857fSAlex Elder 	result = obj_request->result;
16631217857fSAlex Elder 	if (result) {
16641217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
16651217857fSAlex Elder 
16661217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
16671217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
16681217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
16691217857fSAlex Elder 			obj_request->offset);
16701217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
16711217857fSAlex Elder 			result, xferred);
16721217857fSAlex Elder 		if (!img_request->result)
16731217857fSAlex Elder 			img_request->result = result;
16741217857fSAlex Elder 	}
16751217857fSAlex Elder 
16768b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
16778b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
16788b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
16798b3e1a56SAlex Elder 	} else {
16808b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
16818b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
16828b3e1a56SAlex Elder 	}
16838b3e1a56SAlex Elder 
16848b3e1a56SAlex Elder 	return more;
16851217857fSAlex Elder }
16861217857fSAlex Elder 
16872169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
16882169238dSAlex Elder {
16892169238dSAlex Elder 	struct rbd_img_request *img_request;
16902169238dSAlex Elder 	u32 which = obj_request->which;
16912169238dSAlex Elder 	bool more = true;
16922169238dSAlex Elder 
16936365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16942169238dSAlex Elder 	img_request = obj_request->img_request;
16952169238dSAlex Elder 
16962169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
16972169238dSAlex Elder 	rbd_assert(img_request != NULL);
16982169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
16992169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
17002169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
17012169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
17022169238dSAlex Elder 
17032169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
17042169238dSAlex Elder 	if (which != img_request->next_completion)
17052169238dSAlex Elder 		goto out;
17062169238dSAlex Elder 
17072169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
17082169238dSAlex Elder 		rbd_assert(more);
17092169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
17102169238dSAlex Elder 
17112169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
17122169238dSAlex Elder 			break;
17131217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
17142169238dSAlex Elder 		which++;
17152169238dSAlex Elder 	}
17162169238dSAlex Elder 
17172169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
17182169238dSAlex Elder 	img_request->next_completion = which;
17192169238dSAlex Elder out:
17202169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
17212169238dSAlex Elder 
17222169238dSAlex Elder 	if (!more)
17232169238dSAlex Elder 		rbd_img_request_complete(img_request);
17242169238dSAlex Elder }
17252169238dSAlex Elder 
1726bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1727bf0d5f50SAlex Elder 					struct bio *bio_list)
1728bf0d5f50SAlex Elder {
1729bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1730bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1731bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
17320c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1733bf0d5f50SAlex Elder 	unsigned int bio_offset;
17347da22d29SAlex Elder 	u64 img_offset;
1735bf0d5f50SAlex Elder 	u64 resid;
1736bf0d5f50SAlex Elder 	u16 opcode;
1737bf0d5f50SAlex Elder 
173837206ee5SAlex Elder 	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
173937206ee5SAlex Elder 
1740430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1741bf0d5f50SAlex Elder 	bio_offset = 0;
17427da22d29SAlex Elder 	img_offset = img_request->offset;
17437da22d29SAlex Elder 	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1744bf0d5f50SAlex Elder 	resid = img_request->length;
17454dda41d3SAlex Elder 	rbd_assert(resid > 0);
1746bf0d5f50SAlex Elder 	while (resid) {
17472fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1748bf0d5f50SAlex Elder 		const char *object_name;
1749bf0d5f50SAlex Elder 		unsigned int clone_size;
1750bf0d5f50SAlex Elder 		u64 offset;
1751bf0d5f50SAlex Elder 		u64 length;
1752bf0d5f50SAlex Elder 
17537da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1754bf0d5f50SAlex Elder 		if (!object_name)
1755bf0d5f50SAlex Elder 			goto out_unwind;
17567da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
17577da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1758bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1759bf0d5f50SAlex Elder 						offset, length,
1760bf0d5f50SAlex Elder 						OBJ_REQUEST_BIO);
1761bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1762bf0d5f50SAlex Elder 		if (!obj_request)
1763bf0d5f50SAlex Elder 			goto out_unwind;
1764bf0d5f50SAlex Elder 
1765bf0d5f50SAlex Elder 		rbd_assert(length <= (u64) UINT_MAX);
1766bf0d5f50SAlex Elder 		clone_size = (unsigned int) length;
1767bf0d5f50SAlex Elder 		obj_request->bio_list = bio_chain_clone_range(&bio_list,
1768bf0d5f50SAlex Elder 						&bio_offset, clone_size,
1769bf0d5f50SAlex Elder 						GFP_ATOMIC);
1770bf0d5f50SAlex Elder 		if (!obj_request->bio_list)
1771bf0d5f50SAlex Elder 			goto out_partial;
1772bf0d5f50SAlex Elder 
17732fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
17742fa12320SAlex Elder 						obj_request);
17752fa12320SAlex Elder 		if (!osd_req)
1776bf0d5f50SAlex Elder 			goto out_partial;
17772fa12320SAlex Elder 		obj_request->osd_req = osd_req;
17782169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1779430c28c3SAlex Elder 
17802fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
17812fa12320SAlex Elder 						0, 0);
1782406e2c9fSAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 0,
1783a4ce40a9SAlex Elder 				obj_request->bio_list, obj_request->length);
17842fa12320SAlex Elder 		rbd_osd_req_format(obj_request, write_request);
1785430c28c3SAlex Elder 
17867da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1787bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1788bf0d5f50SAlex Elder 
17897da22d29SAlex Elder 		img_offset += length;
1790bf0d5f50SAlex Elder 		resid -= length;
1791bf0d5f50SAlex Elder 	}
1792bf0d5f50SAlex Elder 
1793bf0d5f50SAlex Elder 	return 0;
1794bf0d5f50SAlex Elder 
1795bf0d5f50SAlex Elder out_partial:
1796bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1797bf0d5f50SAlex Elder out_unwind:
1798bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1799bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1800bf0d5f50SAlex Elder 
1801bf0d5f50SAlex Elder 	return -ENOMEM;
1802bf0d5f50SAlex Elder }
1803bf0d5f50SAlex Elder 
1804bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
1805bf0d5f50SAlex Elder {
1806bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1807bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1808bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
180946faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
1810bf0d5f50SAlex Elder 
181137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
181246faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1813bf0d5f50SAlex Elder 		int ret;
1814bf0d5f50SAlex Elder 
1815bf0d5f50SAlex Elder 		ret = rbd_obj_request_submit(osdc, obj_request);
1816bf0d5f50SAlex Elder 		if (ret)
1817bf0d5f50SAlex Elder 			return ret;
1818bf0d5f50SAlex Elder 	}
1819bf0d5f50SAlex Elder 
1820bf0d5f50SAlex Elder 	return 0;
1821bf0d5f50SAlex Elder }
1822bf0d5f50SAlex Elder 
18238b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
18248b3e1a56SAlex Elder {
18258b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
18268b3e1a56SAlex Elder 
18278b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
18288b3e1a56SAlex Elder 
18298b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
18308b3e1a56SAlex Elder 	rbd_assert(obj_request != NULL);
18318b3e1a56SAlex Elder 	obj_request->result = img_request->result;
18328b3e1a56SAlex Elder 	obj_request->xferred = img_request->xferred;
18338b3e1a56SAlex Elder 
18348b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
18358b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
18368b3e1a56SAlex Elder }
18378b3e1a56SAlex Elder 
18388b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
18398b3e1a56SAlex Elder {
18408b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
18418b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
18428b3e1a56SAlex Elder 	int result;
18438b3e1a56SAlex Elder 
18448b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18458b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
18468b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
18478b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
18488b3e1a56SAlex Elder 
18498b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
18508b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
18518b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
18528b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
18538b3e1a56SAlex Elder 						obj_request->img_offset,
18548b3e1a56SAlex Elder 						obj_request->length,
18558b3e1a56SAlex Elder 						false, true);
18568b3e1a56SAlex Elder 	result = -ENOMEM;
18578b3e1a56SAlex Elder 	if (!img_request)
18588b3e1a56SAlex Elder 		goto out_err;
18598b3e1a56SAlex Elder 
18608b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
18618b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
18628b3e1a56SAlex Elder 
18638b3e1a56SAlex Elder 	result = rbd_img_request_fill_bio(img_request, obj_request->bio_list);
18648b3e1a56SAlex Elder 	if (result)
18658b3e1a56SAlex Elder 		goto out_err;
18668b3e1a56SAlex Elder 
18678b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
18688b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
18698b3e1a56SAlex Elder 	if (result)
18708b3e1a56SAlex Elder 		goto out_err;
18718b3e1a56SAlex Elder 
18728b3e1a56SAlex Elder 	return;
18738b3e1a56SAlex Elder out_err:
18748b3e1a56SAlex Elder 	if (img_request)
18758b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
18768b3e1a56SAlex Elder 	obj_request->result = result;
18778b3e1a56SAlex Elder 	obj_request->xferred = 0;
18788b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
18798b3e1a56SAlex Elder }
18808b3e1a56SAlex Elder 
1881cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1882b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
1883b8d70035SAlex Elder {
1884b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
18852169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1886b8d70035SAlex Elder 	int ret;
1887b8d70035SAlex Elder 
1888b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1889b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
1890b8d70035SAlex Elder 	if (!obj_request)
1891b8d70035SAlex Elder 		return -ENOMEM;
1892b8d70035SAlex Elder 
1893b8d70035SAlex Elder 	ret = -ENOMEM;
1894430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1895b8d70035SAlex Elder 	if (!obj_request->osd_req)
1896b8d70035SAlex Elder 		goto out;
18972169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
1898b8d70035SAlex Elder 
1899c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1900c99d2d4aSAlex Elder 					notify_id, ver, 0);
19012fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
1902430c28c3SAlex Elder 
1903b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
1904b8d70035SAlex Elder out:
1905cf81b60eSAlex Elder 	if (ret)
1906b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
1907b8d70035SAlex Elder 
1908b8d70035SAlex Elder 	return ret;
1909b8d70035SAlex Elder }
1910b8d70035SAlex Elder 
1911b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1912b8d70035SAlex Elder {
1913b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1914b8d70035SAlex Elder 	u64 hver;
1915b8d70035SAlex Elder 	int rc;
1916b8d70035SAlex Elder 
1917b8d70035SAlex Elder 	if (!rbd_dev)
1918b8d70035SAlex Elder 		return;
1919b8d70035SAlex Elder 
192037206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1921b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1922b8d70035SAlex Elder 		(unsigned int) opcode);
1923b8d70035SAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
1924b8d70035SAlex Elder 	if (rc)
1925b8d70035SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
1926b8d70035SAlex Elder 			   " update snaps: %d\n", rc);
1927b8d70035SAlex Elder 
1928cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1929b8d70035SAlex Elder }
1930b8d70035SAlex Elder 
19319969ebc5SAlex Elder /*
19329969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
19339969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
19349969ebc5SAlex Elder  */
19359969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
19369969ebc5SAlex Elder {
19379969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
19389969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
19399969ebc5SAlex Elder 	int ret;
19409969ebc5SAlex Elder 
19419969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
19429969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
19439969ebc5SAlex Elder 
19449969ebc5SAlex Elder 	if (start) {
19453c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
19469969ebc5SAlex Elder 						&rbd_dev->watch_event);
19479969ebc5SAlex Elder 		if (ret < 0)
19489969ebc5SAlex Elder 			return ret;
19498eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
19509969ebc5SAlex Elder 	}
19519969ebc5SAlex Elder 
19529969ebc5SAlex Elder 	ret = -ENOMEM;
19539969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
19549969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
19559969ebc5SAlex Elder 	if (!obj_request)
19569969ebc5SAlex Elder 		goto out_cancel;
19579969ebc5SAlex Elder 
1958430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1959430c28c3SAlex Elder 	if (!obj_request->osd_req)
1960430c28c3SAlex Elder 		goto out_cancel;
1961430c28c3SAlex Elder 
19628eb87565SAlex Elder 	if (start)
1963975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
19648eb87565SAlex Elder 	else
19656977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
1966975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
19672169238dSAlex Elder 
19682169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
19692169238dSAlex Elder 				rbd_dev->watch_event->cookie,
19702169238dSAlex Elder 				rbd_dev->header.obj_version, start);
19712169238dSAlex Elder 	rbd_osd_req_format(obj_request, true);
19722169238dSAlex Elder 
19739969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
19749969ebc5SAlex Elder 	if (ret)
19759969ebc5SAlex Elder 		goto out_cancel;
19769969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
19779969ebc5SAlex Elder 	if (ret)
19789969ebc5SAlex Elder 		goto out_cancel;
19799969ebc5SAlex Elder 	ret = obj_request->result;
19809969ebc5SAlex Elder 	if (ret)
19819969ebc5SAlex Elder 		goto out_cancel;
19829969ebc5SAlex Elder 
19838eb87565SAlex Elder 	/*
19848eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
19858eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
19868eb87565SAlex Elder 	 * a pointer to the object request during that time (in
19878eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
19888eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
19898eb87565SAlex Elder 	 * unregistered it.
19908eb87565SAlex Elder 	 */
19918eb87565SAlex Elder 	if (start) {
19928eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
19938eb87565SAlex Elder 
19948eb87565SAlex Elder 		return 0;
19958eb87565SAlex Elder 	}
19968eb87565SAlex Elder 
19978eb87565SAlex Elder 	/* We have successfully torn down the watch request */
19988eb87565SAlex Elder 
19998eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
20008eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
20019969ebc5SAlex Elder out_cancel:
20029969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
20039969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
20049969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
20059969ebc5SAlex Elder 	if (obj_request)
20069969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
20079969ebc5SAlex Elder 
20089969ebc5SAlex Elder 	return ret;
20099969ebc5SAlex Elder }
20109969ebc5SAlex Elder 
201136be9a76SAlex Elder /*
201236be9a76SAlex Elder  * Synchronous osd object method call
201336be9a76SAlex Elder  */
201436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
201536be9a76SAlex Elder 			     const char *object_name,
201636be9a76SAlex Elder 			     const char *class_name,
201736be9a76SAlex Elder 			     const char *method_name,
201836be9a76SAlex Elder 			     const char *outbound,
201936be9a76SAlex Elder 			     size_t outbound_size,
202036be9a76SAlex Elder 			     char *inbound,
202136be9a76SAlex Elder 			     size_t inbound_size,
202236be9a76SAlex Elder 			     u64 *version)
202336be9a76SAlex Elder {
20242169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
202536be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
202636be9a76SAlex Elder 	struct page **pages;
202736be9a76SAlex Elder 	u32 page_count;
202836be9a76SAlex Elder 	int ret;
202936be9a76SAlex Elder 
203036be9a76SAlex Elder 	/*
20316010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
20326010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
20336010a451SAlex Elder 	 * also supply outbound data--parameters for the object
20346010a451SAlex Elder 	 * method.  Currently if this is present it will be a
20356010a451SAlex Elder 	 * snapshot id.
203636be9a76SAlex Elder 	 */
203736be9a76SAlex Elder 	page_count = (u32) calc_pages_for(0, inbound_size);
203836be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
203936be9a76SAlex Elder 	if (IS_ERR(pages))
204036be9a76SAlex Elder 		return PTR_ERR(pages);
204136be9a76SAlex Elder 
204236be9a76SAlex Elder 	ret = -ENOMEM;
20436010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
204436be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
204536be9a76SAlex Elder 	if (!obj_request)
204636be9a76SAlex Elder 		goto out;
204736be9a76SAlex Elder 
204836be9a76SAlex Elder 	obj_request->pages = pages;
204936be9a76SAlex Elder 	obj_request->page_count = page_count;
205036be9a76SAlex Elder 
2051430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
205236be9a76SAlex Elder 	if (!obj_request->osd_req)
205336be9a76SAlex Elder 		goto out;
205436be9a76SAlex Elder 
2055c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
205604017e29SAlex Elder 					class_name, method_name);
205704017e29SAlex Elder 	if (outbound_size) {
205804017e29SAlex Elder 		struct ceph_pagelist *pagelist;
205904017e29SAlex Elder 
206004017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
206104017e29SAlex Elder 		if (!pagelist)
206204017e29SAlex Elder 			goto out;
206304017e29SAlex Elder 
206404017e29SAlex Elder 		ceph_pagelist_init(pagelist);
206504017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
206604017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
206704017e29SAlex Elder 						pagelist);
206804017e29SAlex Elder 	}
2069a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2070a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
207144cd188dSAlex Elder 					0, false, false);
20722fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
2073430c28c3SAlex Elder 
207436be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
207536be9a76SAlex Elder 	if (ret)
207636be9a76SAlex Elder 		goto out;
207736be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
207836be9a76SAlex Elder 	if (ret)
207936be9a76SAlex Elder 		goto out;
208036be9a76SAlex Elder 
208136be9a76SAlex Elder 	ret = obj_request->result;
208236be9a76SAlex Elder 	if (ret < 0)
208336be9a76SAlex Elder 		goto out;
208423ed6e13SAlex Elder 	ret = 0;
2085903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
208636be9a76SAlex Elder 	if (version)
208736be9a76SAlex Elder 		*version = obj_request->version;
208836be9a76SAlex Elder out:
208936be9a76SAlex Elder 	if (obj_request)
209036be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
209136be9a76SAlex Elder 	else
209236be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
209336be9a76SAlex Elder 
209436be9a76SAlex Elder 	return ret;
209536be9a76SAlex Elder }
209636be9a76SAlex Elder 
2097bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2098cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2099bf0d5f50SAlex Elder {
2100bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2101bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2102bf0d5f50SAlex Elder 	struct request *rq;
2103bf0d5f50SAlex Elder 	int result;
2104bf0d5f50SAlex Elder 
2105bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2106bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2107bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2108bf0d5f50SAlex Elder 		u64 offset;
2109bf0d5f50SAlex Elder 		u64 length;
2110bf0d5f50SAlex Elder 
2111bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2112bf0d5f50SAlex Elder 
2113bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
21144dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
21154dda41d3SAlex Elder 				(int) rq->cmd_type);
21164dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
21174dda41d3SAlex Elder 			continue;
21184dda41d3SAlex Elder 		}
21194dda41d3SAlex Elder 
21204dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
21214dda41d3SAlex Elder 
21224dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
21234dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
21244dda41d3SAlex Elder 
21254dda41d3SAlex Elder 		if (!length) {
21264dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2127bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2128bf0d5f50SAlex Elder 			continue;
2129bf0d5f50SAlex Elder 		}
2130bf0d5f50SAlex Elder 
2131bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2132bf0d5f50SAlex Elder 
2133bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2134bf0d5f50SAlex Elder 
2135bf0d5f50SAlex Elder 		if (write_request) {
2136bf0d5f50SAlex Elder 			result = -EROFS;
2137bf0d5f50SAlex Elder 			if (read_only)
2138bf0d5f50SAlex Elder 				goto end_request;
2139bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2140bf0d5f50SAlex Elder 		}
2141bf0d5f50SAlex Elder 
21426d292906SAlex Elder 		/*
21436d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
21446d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
21456d292906SAlex Elder 		 * have disappeared by the time our request arrives
21466d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
21476d292906SAlex Elder 		 * we already know.
21486d292906SAlex Elder 		 */
21496d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2150bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2151bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2152bf0d5f50SAlex Elder 			result = -ENXIO;
2153bf0d5f50SAlex Elder 			goto end_request;
2154bf0d5f50SAlex Elder 		}
2155bf0d5f50SAlex Elder 
2156bf0d5f50SAlex Elder 		result = -EINVAL;
2157bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
2158bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2159bf0d5f50SAlex Elder 
2160bf0d5f50SAlex Elder 		result = -ENOMEM;
2161bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
21629849e986SAlex Elder 							write_request, false);
2163bf0d5f50SAlex Elder 		if (!img_request)
2164bf0d5f50SAlex Elder 			goto end_request;
2165bf0d5f50SAlex Elder 
2166bf0d5f50SAlex Elder 		img_request->rq = rq;
2167bf0d5f50SAlex Elder 
2168bf0d5f50SAlex Elder 		result = rbd_img_request_fill_bio(img_request, rq->bio);
2169bf0d5f50SAlex Elder 		if (!result)
2170bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2171bf0d5f50SAlex Elder 		if (result)
2172bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2173bf0d5f50SAlex Elder end_request:
2174bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2175bf0d5f50SAlex Elder 		if (result < 0) {
21767da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
21777da22d29SAlex Elder 				write_request ? "write" : "read",
21787da22d29SAlex Elder 				length, offset, result);
21797da22d29SAlex Elder 
2180bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2181bf0d5f50SAlex Elder 		}
2182bf0d5f50SAlex Elder 	}
2183bf0d5f50SAlex Elder }
2184bf0d5f50SAlex Elder 
2185602adf40SYehuda Sadeh /*
2186602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2187602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2188f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2189602adf40SYehuda Sadeh  */
2190602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2191602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2192602adf40SYehuda Sadeh {
2193602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2194e5cfeed2SAlex Elder 	sector_t sector_offset;
2195e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2196e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2197e5cfeed2SAlex Elder 	int ret;
2198602adf40SYehuda Sadeh 
2199e5cfeed2SAlex Elder 	/*
2200e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2201e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2202e5cfeed2SAlex Elder 	 * device.
2203e5cfeed2SAlex Elder 	 */
2204e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2205e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2206e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2207593a9e7bSAlex Elder 
2208e5cfeed2SAlex Elder 	/*
2209e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2210e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2211e5cfeed2SAlex Elder 	 */
2212e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2213e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2214e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2215e5cfeed2SAlex Elder 	else
2216e5cfeed2SAlex Elder 		ret = 0;
2217e5cfeed2SAlex Elder 
2218e5cfeed2SAlex Elder 	/*
2219e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2220e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2221e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2222e5cfeed2SAlex Elder 	 * added to an empty bio."
2223e5cfeed2SAlex Elder 	 */
2224e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2225e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2226e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2227e5cfeed2SAlex Elder 
2228e5cfeed2SAlex Elder 	return ret;
2229602adf40SYehuda Sadeh }
2230602adf40SYehuda Sadeh 
2231602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2232602adf40SYehuda Sadeh {
2233602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2234602adf40SYehuda Sadeh 
2235602adf40SYehuda Sadeh 	if (!disk)
2236602adf40SYehuda Sadeh 		return;
2237602adf40SYehuda Sadeh 
2238602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2239602adf40SYehuda Sadeh 		del_gendisk(disk);
2240602adf40SYehuda Sadeh 	if (disk->queue)
2241602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2242602adf40SYehuda Sadeh 	put_disk(disk);
2243602adf40SYehuda Sadeh }
2244602adf40SYehuda Sadeh 
2245788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2246788e2df3SAlex Elder 				const char *object_name,
2247788e2df3SAlex Elder 				u64 offset, u64 length,
2248788e2df3SAlex Elder 				char *buf, u64 *version)
2249788e2df3SAlex Elder 
2250788e2df3SAlex Elder {
22512169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2252788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2253788e2df3SAlex Elder 	struct page **pages = NULL;
2254788e2df3SAlex Elder 	u32 page_count;
22551ceae7efSAlex Elder 	size_t size;
2256788e2df3SAlex Elder 	int ret;
2257788e2df3SAlex Elder 
2258788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2259788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2260788e2df3SAlex Elder 	if (IS_ERR(pages))
2261788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2262788e2df3SAlex Elder 
2263788e2df3SAlex Elder 	ret = -ENOMEM;
2264788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2265788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2266788e2df3SAlex Elder 	if (!obj_request)
2267788e2df3SAlex Elder 		goto out;
2268788e2df3SAlex Elder 
2269788e2df3SAlex Elder 	obj_request->pages = pages;
2270788e2df3SAlex Elder 	obj_request->page_count = page_count;
2271788e2df3SAlex Elder 
2272430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2273788e2df3SAlex Elder 	if (!obj_request->osd_req)
2274788e2df3SAlex Elder 		goto out;
2275788e2df3SAlex Elder 
2276c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2277c99d2d4aSAlex Elder 					offset, length, 0, 0);
2278406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2279a4ce40a9SAlex Elder 					obj_request->pages,
228044cd188dSAlex Elder 					obj_request->length,
228144cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
228244cd188dSAlex Elder 					false, false);
22832fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
2284430c28c3SAlex Elder 
2285788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2286788e2df3SAlex Elder 	if (ret)
2287788e2df3SAlex Elder 		goto out;
2288788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2289788e2df3SAlex Elder 	if (ret)
2290788e2df3SAlex Elder 		goto out;
2291788e2df3SAlex Elder 
2292788e2df3SAlex Elder 	ret = obj_request->result;
2293788e2df3SAlex Elder 	if (ret < 0)
2294788e2df3SAlex Elder 		goto out;
22951ceae7efSAlex Elder 
22961ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
22971ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2298903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
229923ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
230023ed6e13SAlex Elder 	ret = (int) size;
2301788e2df3SAlex Elder 	if (version)
2302788e2df3SAlex Elder 		*version = obj_request->version;
2303788e2df3SAlex Elder out:
2304788e2df3SAlex Elder 	if (obj_request)
2305788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2306788e2df3SAlex Elder 	else
2307788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2308788e2df3SAlex Elder 
2309788e2df3SAlex Elder 	return ret;
2310788e2df3SAlex Elder }
2311788e2df3SAlex Elder 
2312602adf40SYehuda Sadeh /*
23134156d998SAlex Elder  * Read the complete header for the given rbd device.
23144156d998SAlex Elder  *
23154156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
23164156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
23174156d998SAlex Elder  * of a variable that will be filled in with the version of the
23184156d998SAlex Elder  * header object at the time it was read.
23194156d998SAlex Elder  *
23204156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
23214156d998SAlex Elder  */
23224156d998SAlex Elder static struct rbd_image_header_ondisk *
23234156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
23244156d998SAlex Elder {
23254156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
23264156d998SAlex Elder 	u32 snap_count = 0;
23274156d998SAlex Elder 	u64 names_size = 0;
23284156d998SAlex Elder 	u32 want_count;
23294156d998SAlex Elder 	int ret;
23304156d998SAlex Elder 
23314156d998SAlex Elder 	/*
23324156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
23334156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
23344156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
23354156d998SAlex Elder 	 * the number of snapshots could change by the time we read
23364156d998SAlex Elder 	 * it in, in which case we re-read it.
23374156d998SAlex Elder 	 */
23384156d998SAlex Elder 	do {
23394156d998SAlex Elder 		size_t size;
23404156d998SAlex Elder 
23414156d998SAlex Elder 		kfree(ondisk);
23424156d998SAlex Elder 
23434156d998SAlex Elder 		size = sizeof (*ondisk);
23444156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
23454156d998SAlex Elder 		size += names_size;
23464156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
23474156d998SAlex Elder 		if (!ondisk)
23484156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
23494156d998SAlex Elder 
2350788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
23514156d998SAlex Elder 				       0, size,
23524156d998SAlex Elder 				       (char *) ondisk, version);
23534156d998SAlex Elder 		if (ret < 0)
23544156d998SAlex Elder 			goto out_err;
23554156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
23564156d998SAlex Elder 			ret = -ENXIO;
235706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
235806ecc6cbSAlex Elder 				size, ret);
23594156d998SAlex Elder 			goto out_err;
23604156d998SAlex Elder 		}
23614156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
23624156d998SAlex Elder 			ret = -ENXIO;
236306ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
23644156d998SAlex Elder 			goto out_err;
23654156d998SAlex Elder 		}
23664156d998SAlex Elder 
23674156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
23684156d998SAlex Elder 		want_count = snap_count;
23694156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
23704156d998SAlex Elder 	} while (snap_count != want_count);
23714156d998SAlex Elder 
23724156d998SAlex Elder 	return ondisk;
23734156d998SAlex Elder 
23744156d998SAlex Elder out_err:
23754156d998SAlex Elder 	kfree(ondisk);
23764156d998SAlex Elder 
23774156d998SAlex Elder 	return ERR_PTR(ret);
23784156d998SAlex Elder }
23794156d998SAlex Elder 
23804156d998SAlex Elder /*
2381602adf40SYehuda Sadeh  * reload the ondisk the header
2382602adf40SYehuda Sadeh  */
2383602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2384602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2385602adf40SYehuda Sadeh {
23864156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
23874156d998SAlex Elder 	u64 ver = 0;
23884156d998SAlex Elder 	int ret;
2389602adf40SYehuda Sadeh 
23904156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
23914156d998SAlex Elder 	if (IS_ERR(ondisk))
23924156d998SAlex Elder 		return PTR_ERR(ondisk);
23934156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
23944156d998SAlex Elder 	if (ret >= 0)
239559c2be1eSYehuda Sadeh 		header->obj_version = ver;
23964156d998SAlex Elder 	kfree(ondisk);
2397602adf40SYehuda Sadeh 
23984156d998SAlex Elder 	return ret;
2399602adf40SYehuda Sadeh }
2400602adf40SYehuda Sadeh 
240141f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2402dfc5606dSYehuda Sadeh {
2403dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2404a0593290SAlex Elder 	struct rbd_snap *next;
2405dfc5606dSYehuda Sadeh 
2406a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
240741f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
2408dfc5606dSYehuda Sadeh }
2409dfc5606dSYehuda Sadeh 
24109478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
24119478554aSAlex Elder {
24129478554aSAlex Elder 	sector_t size;
24139478554aSAlex Elder 
24140d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
24159478554aSAlex Elder 		return;
24169478554aSAlex Elder 
24179478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
24189478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
24199478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
24209478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
24219478554aSAlex Elder }
24229478554aSAlex Elder 
2423602adf40SYehuda Sadeh /*
2424602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
2425602adf40SYehuda Sadeh  */
2426117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2427602adf40SYehuda Sadeh {
2428602adf40SYehuda Sadeh 	int ret;
2429602adf40SYehuda Sadeh 	struct rbd_image_header h;
2430602adf40SYehuda Sadeh 
2431602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
2432602adf40SYehuda Sadeh 	if (ret < 0)
2433602adf40SYehuda Sadeh 		return ret;
2434602adf40SYehuda Sadeh 
2435a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
2436a51aa0c0SJosh Durgin 
24379478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
24389478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
24399478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
24409db4b3e3SSage Weil 
2441849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
2442602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
2443849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
2444d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
2445d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
2446602adf40SYehuda Sadeh 
2447b813623aSAlex Elder 	if (hver)
2448b813623aSAlex Elder 		*hver = h.obj_version;
2449a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
245093a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
2451602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
2452602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
2453602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
2454849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
2455849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2456849b4260SAlex Elder 	kfree(h.object_prefix);
2457849b4260SAlex Elder 
2458304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2459304f6808SAlex Elder 	if (!ret)
2460304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
2461dfc5606dSYehuda Sadeh 
2462c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
2463602adf40SYehuda Sadeh 
2464dfc5606dSYehuda Sadeh 	return ret;
2465602adf40SYehuda Sadeh }
2466602adf40SYehuda Sadeh 
2467117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
24681fe5e993SAlex Elder {
24691fe5e993SAlex Elder 	int ret;
24701fe5e993SAlex Elder 
2471117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
24721fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2473117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
2474117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
2475117973fbSAlex Elder 	else
2476117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
24771fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
24781fe5e993SAlex Elder 
24791fe5e993SAlex Elder 	return ret;
24801fe5e993SAlex Elder }
24811fe5e993SAlex Elder 
2482602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
2483602adf40SYehuda Sadeh {
2484602adf40SYehuda Sadeh 	struct gendisk *disk;
2485602adf40SYehuda Sadeh 	struct request_queue *q;
2486593a9e7bSAlex Elder 	u64 segment_size;
2487602adf40SYehuda Sadeh 
2488602adf40SYehuda Sadeh 	/* create gendisk info */
2489602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2490602adf40SYehuda Sadeh 	if (!disk)
24911fcdb8aaSAlex Elder 		return -ENOMEM;
2492602adf40SYehuda Sadeh 
2493f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2494de71a297SAlex Elder 		 rbd_dev->dev_id);
2495602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
2496602adf40SYehuda Sadeh 	disk->first_minor = 0;
2497602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
2498602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
2499602adf40SYehuda Sadeh 
2500bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2501602adf40SYehuda Sadeh 	if (!q)
2502602adf40SYehuda Sadeh 		goto out_disk;
2503029bcbd8SJosh Durgin 
2504593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
2505593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
2506593a9e7bSAlex Elder 
2507029bcbd8SJosh Durgin 	/* set io sizes to object size */
2508593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
2509593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2510593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
2511593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
2512593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
2513029bcbd8SJosh Durgin 
2514602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
2515602adf40SYehuda Sadeh 	disk->queue = q;
2516602adf40SYehuda Sadeh 
2517602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
2518602adf40SYehuda Sadeh 
2519602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
2520602adf40SYehuda Sadeh 
252112f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
252212f02944SAlex Elder 
2523602adf40SYehuda Sadeh 	return 0;
2524602adf40SYehuda Sadeh out_disk:
2525602adf40SYehuda Sadeh 	put_disk(disk);
25261fcdb8aaSAlex Elder 
25271fcdb8aaSAlex Elder 	return -ENOMEM;
2528602adf40SYehuda Sadeh }
2529602adf40SYehuda Sadeh 
2530dfc5606dSYehuda Sadeh /*
2531dfc5606dSYehuda Sadeh   sysfs
2532dfc5606dSYehuda Sadeh */
2533602adf40SYehuda Sadeh 
2534593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2535593a9e7bSAlex Elder {
2536593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
2537593a9e7bSAlex Elder }
2538593a9e7bSAlex Elder 
2539dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
2540dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2541602adf40SYehuda Sadeh {
2542593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2543a51aa0c0SJosh Durgin 	sector_t size;
2544dfc5606dSYehuda Sadeh 
2545a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2546a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2547a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2548a51aa0c0SJosh Durgin 
2549a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2550602adf40SYehuda Sadeh }
2551602adf40SYehuda Sadeh 
255234b13184SAlex Elder /*
255334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
255434b13184SAlex Elder  * necessarily the base image.
255534b13184SAlex Elder  */
255634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
255734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
255834b13184SAlex Elder {
255934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
256034b13184SAlex Elder 
256134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
256234b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
256334b13184SAlex Elder }
256434b13184SAlex Elder 
2565dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2566dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2567602adf40SYehuda Sadeh {
2568593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2569dfc5606dSYehuda Sadeh 
2570dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2571dfc5606dSYehuda Sadeh }
2572dfc5606dSYehuda Sadeh 
2573dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2574dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2575dfc5606dSYehuda Sadeh {
2576593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2577dfc5606dSYehuda Sadeh 
25781dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
25791dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2580dfc5606dSYehuda Sadeh }
2581dfc5606dSYehuda Sadeh 
2582dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2583dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2584dfc5606dSYehuda Sadeh {
2585593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2586dfc5606dSYehuda Sadeh 
25870d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2588dfc5606dSYehuda Sadeh }
2589dfc5606dSYehuda Sadeh 
25909bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
25919bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
25929bb2f334SAlex Elder {
25939bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
25949bb2f334SAlex Elder 
25950d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
25960d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
25979bb2f334SAlex Elder }
25989bb2f334SAlex Elder 
2599dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2600dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2601dfc5606dSYehuda Sadeh {
2602593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2603dfc5606dSYehuda Sadeh 
2604a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
26050d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2606a92ffdf8SAlex Elder 
2607a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2608dfc5606dSYehuda Sadeh }
2609dfc5606dSYehuda Sadeh 
2610589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2611589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2612589d30e0SAlex Elder {
2613589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2614589d30e0SAlex Elder 
26150d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2616589d30e0SAlex Elder }
2617589d30e0SAlex Elder 
261834b13184SAlex Elder /*
261934b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
262034b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
262134b13184SAlex Elder  */
2622dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2623dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2624dfc5606dSYehuda Sadeh 			     char *buf)
2625dfc5606dSYehuda Sadeh {
2626593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2627dfc5606dSYehuda Sadeh 
26280d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2629dfc5606dSYehuda Sadeh }
2630dfc5606dSYehuda Sadeh 
263186b00e0dSAlex Elder /*
263286b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
263386b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
263486b00e0dSAlex Elder  * "(no parent image)".
263586b00e0dSAlex Elder  */
263686b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
263786b00e0dSAlex Elder 			     struct device_attribute *attr,
263886b00e0dSAlex Elder 			     char *buf)
263986b00e0dSAlex Elder {
264086b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
264186b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
264286b00e0dSAlex Elder 	int count;
264386b00e0dSAlex Elder 	char *bufp = buf;
264486b00e0dSAlex Elder 
264586b00e0dSAlex Elder 	if (!spec)
264686b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
264786b00e0dSAlex Elder 
264886b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
264986b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
265086b00e0dSAlex Elder 	if (count < 0)
265186b00e0dSAlex Elder 		return count;
265286b00e0dSAlex Elder 	bufp += count;
265386b00e0dSAlex Elder 
265486b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
265586b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
265686b00e0dSAlex Elder 	if (count < 0)
265786b00e0dSAlex Elder 		return count;
265886b00e0dSAlex Elder 	bufp += count;
265986b00e0dSAlex Elder 
266086b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
266186b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
266286b00e0dSAlex Elder 	if (count < 0)
266386b00e0dSAlex Elder 		return count;
266486b00e0dSAlex Elder 	bufp += count;
266586b00e0dSAlex Elder 
266686b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
266786b00e0dSAlex Elder 	if (count < 0)
266886b00e0dSAlex Elder 		return count;
266986b00e0dSAlex Elder 	bufp += count;
267086b00e0dSAlex Elder 
267186b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
267286b00e0dSAlex Elder }
267386b00e0dSAlex Elder 
2674dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2675dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2676dfc5606dSYehuda Sadeh 				 const char *buf,
2677dfc5606dSYehuda Sadeh 				 size_t size)
2678dfc5606dSYehuda Sadeh {
2679593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2680b813623aSAlex Elder 	int ret;
2681602adf40SYehuda Sadeh 
2682117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2683b813623aSAlex Elder 
2684b813623aSAlex Elder 	return ret < 0 ? ret : size;
2685dfc5606dSYehuda Sadeh }
2686602adf40SYehuda Sadeh 
/* Read-only attributes of a mapped rbd device */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only (owner) attribute: writing to it refreshes the image */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2698dfc5606dSYehuda Sadeh 
2699dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2700dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
270134b13184SAlex Elder 	&dev_attr_features.attr,
2702dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2703dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2704dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
27059bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2706dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2707589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2708dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
270986b00e0dSAlex Elder 	&dev_attr_parent.attr,
2710dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2711dfc5606dSYehuda Sadeh 	NULL
2712dfc5606dSYehuda Sadeh };
2713dfc5606dSYehuda Sadeh 
2714dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2715dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2716dfc5606dSYehuda Sadeh };
2717dfc5606dSYehuda Sadeh 
2718dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2719dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2720dfc5606dSYehuda Sadeh 	NULL
2721dfc5606dSYehuda Sadeh };
2722dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  Intentionally empty: it
 * frees nothing.
 * NOTE(review): presumably the rbd_device is torn down elsewhere
 * (see rbd_dev_destroy()); confirm against the callers.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Nothing to do */
}
2726dfc5606dSYehuda Sadeh 
2727dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2728dfc5606dSYehuda Sadeh 	.name		= "rbd",
2729dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2730dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2731dfc5606dSYehuda Sadeh };
2732dfc5606dSYehuda Sadeh 
2733dfc5606dSYehuda Sadeh 
2734dfc5606dSYehuda Sadeh /*
2735dfc5606dSYehuda Sadeh   sysfs - snapshots
2736dfc5606dSYehuda Sadeh */
2737dfc5606dSYehuda Sadeh 
2738dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2739dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2740dfc5606dSYehuda Sadeh 				  char *buf)
2741dfc5606dSYehuda Sadeh {
2742dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2743dfc5606dSYehuda Sadeh 
27443591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2745dfc5606dSYehuda Sadeh }
2746dfc5606dSYehuda Sadeh 
2747dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2748dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2749dfc5606dSYehuda Sadeh 				char *buf)
2750dfc5606dSYehuda Sadeh {
2751dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2752dfc5606dSYehuda Sadeh 
2753593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2754dfc5606dSYehuda Sadeh }
2755dfc5606dSYehuda Sadeh 
275634b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
275734b13184SAlex Elder 				struct device_attribute *attr,
275834b13184SAlex Elder 				char *buf)
275934b13184SAlex Elder {
276034b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
276134b13184SAlex Elder 
276234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
276334b13184SAlex Elder 			(unsigned long long) snap->features);
276434b13184SAlex Elder }
276534b13184SAlex Elder 
/* Read-only attributes of a snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2769dfc5606dSYehuda Sadeh 
2770dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2771dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2772dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
277334b13184SAlex Elder 	&dev_attr_snap_features.attr,
2774dfc5606dSYehuda Sadeh 	NULL,
2775dfc5606dSYehuda Sadeh };
2776dfc5606dSYehuda Sadeh 
2777dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2778dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2779dfc5606dSYehuda Sadeh };
2780dfc5606dSYehuda Sadeh 
2781dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2782dfc5606dSYehuda Sadeh {
2783dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2784dfc5606dSYehuda Sadeh 	kfree(snap->name);
2785dfc5606dSYehuda Sadeh 	kfree(snap);
2786dfc5606dSYehuda Sadeh }
2787dfc5606dSYehuda Sadeh 
2788dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2789dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2790dfc5606dSYehuda Sadeh 	NULL
2791dfc5606dSYehuda Sadeh };
2792dfc5606dSYehuda Sadeh 
2793dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2794dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2795dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2796dfc5606dSYehuda Sadeh };
2797dfc5606dSYehuda Sadeh 
27988b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
27998b8fb99cSAlex Elder {
28008b8fb99cSAlex Elder 	kref_get(&spec->kref);
28018b8fb99cSAlex Elder 
28028b8fb99cSAlex Elder 	return spec;
28038b8fb99cSAlex Elder }
28048b8fb99cSAlex Elder 
28058b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
28068b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
28078b8fb99cSAlex Elder {
28088b8fb99cSAlex Elder 	if (spec)
28098b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
28108b8fb99cSAlex Elder }
28118b8fb99cSAlex Elder 
28128b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
28138b8fb99cSAlex Elder {
28148b8fb99cSAlex Elder 	struct rbd_spec *spec;
28158b8fb99cSAlex Elder 
28168b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
28178b8fb99cSAlex Elder 	if (!spec)
28188b8fb99cSAlex Elder 		return NULL;
28198b8fb99cSAlex Elder 	kref_init(&spec->kref);
28208b8fb99cSAlex Elder 
28218b8fb99cSAlex Elder 	return spec;
28228b8fb99cSAlex Elder }
28238b8fb99cSAlex Elder 
28248b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
28258b8fb99cSAlex Elder {
28268b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
28278b8fb99cSAlex Elder 
28288b8fb99cSAlex Elder 	kfree(spec->pool_name);
28298b8fb99cSAlex Elder 	kfree(spec->image_id);
28308b8fb99cSAlex Elder 	kfree(spec->image_name);
28318b8fb99cSAlex Elder 	kfree(spec->snap_name);
28328b8fb99cSAlex Elder 	kfree(spec);
28338b8fb99cSAlex Elder }
28348b8fb99cSAlex Elder 
2835cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2836c53d5893SAlex Elder 				struct rbd_spec *spec)
2837c53d5893SAlex Elder {
2838c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2839c53d5893SAlex Elder 
2840c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2841c53d5893SAlex Elder 	if (!rbd_dev)
2842c53d5893SAlex Elder 		return NULL;
2843c53d5893SAlex Elder 
2844c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
28456d292906SAlex Elder 	rbd_dev->flags = 0;
2846c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2847c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2848c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2849c53d5893SAlex Elder 
2850c53d5893SAlex Elder 	rbd_dev->spec = spec;
2851c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2852c53d5893SAlex Elder 
28530903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
28540903e875SAlex Elder 
28550903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
28560903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
28570903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
28580903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
28590903e875SAlex Elder 
2860c53d5893SAlex Elder 	return rbd_dev;
2861c53d5893SAlex Elder }
2862c53d5893SAlex Elder 
2863c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2864c53d5893SAlex Elder {
286586b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2866c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2867c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2868c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2869c53d5893SAlex Elder 	kfree(rbd_dev);
2870c53d5893SAlex Elder }
2871c53d5893SAlex Elder 
2872304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2873304f6808SAlex Elder {
2874304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2875304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2876304f6808SAlex Elder 
2877304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2878304f6808SAlex Elder 
2879304f6808SAlex Elder 	return ret;
2880304f6808SAlex Elder }
2881304f6808SAlex Elder 
288241f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2883dfc5606dSYehuda Sadeh {
2884dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2885304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2886dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2887dfc5606dSYehuda Sadeh }
2888dfc5606dSYehuda Sadeh 
288914e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2890dfc5606dSYehuda Sadeh 				  struct device *parent)
2891dfc5606dSYehuda Sadeh {
2892dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2893dfc5606dSYehuda Sadeh 	int ret;
2894dfc5606dSYehuda Sadeh 
2895dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2896dfc5606dSYehuda Sadeh 	dev->parent = parent;
2897dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2898d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2899304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2900304f6808SAlex Elder 
2901dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2902dfc5606dSYehuda Sadeh 
2903dfc5606dSYehuda Sadeh 	return ret;
2904dfc5606dSYehuda Sadeh }
2905dfc5606dSYehuda Sadeh 
29064e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2907c8d18425SAlex Elder 						const char *snap_name,
290834b13184SAlex Elder 						u64 snap_id, u64 snap_size,
290934b13184SAlex Elder 						u64 snap_features)
2910dfc5606dSYehuda Sadeh {
29114e891e0aSAlex Elder 	struct rbd_snap *snap;
2912dfc5606dSYehuda Sadeh 	int ret;
29134e891e0aSAlex Elder 
29144e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2915dfc5606dSYehuda Sadeh 	if (!snap)
29164e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
29174e891e0aSAlex Elder 
29184e891e0aSAlex Elder 	ret = -ENOMEM;
2919c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
29204e891e0aSAlex Elder 	if (!snap->name)
29214e891e0aSAlex Elder 		goto err;
29224e891e0aSAlex Elder 
2923c8d18425SAlex Elder 	snap->id = snap_id;
2924c8d18425SAlex Elder 	snap->size = snap_size;
292534b13184SAlex Elder 	snap->features = snap_features;
29264e891e0aSAlex Elder 
29274e891e0aSAlex Elder 	return snap;
29284e891e0aSAlex Elder 
2929dfc5606dSYehuda Sadeh err:
2930dfc5606dSYehuda Sadeh 	kfree(snap->name);
2931dfc5606dSYehuda Sadeh 	kfree(snap);
29324e891e0aSAlex Elder 
29334e891e0aSAlex Elder 	return ERR_PTR(ret);
2934dfc5606dSYehuda Sadeh }
2935dfc5606dSYehuda Sadeh 
2936cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2937cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2938cd892126SAlex Elder {
2939cd892126SAlex Elder 	char *snap_name;
2940cd892126SAlex Elder 
2941cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2942cd892126SAlex Elder 
2943cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2944cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2945cd892126SAlex Elder 
2946cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2947cd892126SAlex Elder 
2948cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2949cd892126SAlex Elder 	while (which--)
2950cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2951cd892126SAlex Elder 
2952cd892126SAlex Elder 	return snap_name;
2953cd892126SAlex Elder }
2954cd892126SAlex Elder 
2955dfc5606dSYehuda Sadeh /*
29569d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
29579d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
29589d475de5SAlex Elder  * image.
29599d475de5SAlex Elder  */
29609d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
29619d475de5SAlex Elder 				u8 *order, u64 *snap_size)
29629d475de5SAlex Elder {
29639d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
29649d475de5SAlex Elder 	int ret;
29659d475de5SAlex Elder 	struct {
29669d475de5SAlex Elder 		u8 order;
29679d475de5SAlex Elder 		__le64 size;
29689d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
29699d475de5SAlex Elder 
297036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
29719d475de5SAlex Elder 				"rbd", "get_size",
29729d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
297307b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
297436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
29759d475de5SAlex Elder 	if (ret < 0)
29769d475de5SAlex Elder 		return ret;
29779d475de5SAlex Elder 
29789d475de5SAlex Elder 	*order = size_buf.order;
29799d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
29809d475de5SAlex Elder 
29819d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
29829d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
29839d475de5SAlex Elder 		(unsigned long long) *snap_size);
29849d475de5SAlex Elder 
29859d475de5SAlex Elder 	return 0;
29869d475de5SAlex Elder }
29879d475de5SAlex Elder 
29889d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
29899d475de5SAlex Elder {
29909d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
29919d475de5SAlex Elder 					&rbd_dev->header.obj_order,
29929d475de5SAlex Elder 					&rbd_dev->header.image_size);
29939d475de5SAlex Elder }
29949d475de5SAlex Elder 
29951e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
29961e130199SAlex Elder {
29971e130199SAlex Elder 	void *reply_buf;
29981e130199SAlex Elder 	int ret;
29991e130199SAlex Elder 	void *p;
30001e130199SAlex Elder 
30011e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
30021e130199SAlex Elder 	if (!reply_buf)
30031e130199SAlex Elder 		return -ENOMEM;
30041e130199SAlex Elder 
300536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
30061e130199SAlex Elder 				"rbd", "get_object_prefix",
30071e130199SAlex Elder 				NULL, 0,
300807b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
300936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
30101e130199SAlex Elder 	if (ret < 0)
30111e130199SAlex Elder 		goto out;
30121e130199SAlex Elder 
30131e130199SAlex Elder 	p = reply_buf;
30141e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
30151e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
30161e130199SAlex Elder 						NULL, GFP_NOIO);
30171e130199SAlex Elder 
30181e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
30191e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
30201e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
30211e130199SAlex Elder 	} else {
30221e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
30231e130199SAlex Elder 	}
30241e130199SAlex Elder 
30251e130199SAlex Elder out:
30261e130199SAlex Elder 	kfree(reply_buf);
30271e130199SAlex Elder 
30281e130199SAlex Elder 	return ret;
30291e130199SAlex Elder }
30301e130199SAlex Elder 
3031b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3032b1b5402aSAlex Elder 		u64 *snap_features)
3033b1b5402aSAlex Elder {
3034b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3035b1b5402aSAlex Elder 	struct {
3036b1b5402aSAlex Elder 		__le64 features;
3037b1b5402aSAlex Elder 		__le64 incompat;
3038b1b5402aSAlex Elder 	} features_buf = { 0 };
3039d889140cSAlex Elder 	u64 incompat;
3040b1b5402aSAlex Elder 	int ret;
3041b1b5402aSAlex Elder 
304236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3043b1b5402aSAlex Elder 				"rbd", "get_features",
3044b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
3045b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
304607b2391fSAlex Elder 				NULL);
304736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3048b1b5402aSAlex Elder 	if (ret < 0)
3049b1b5402aSAlex Elder 		return ret;
3050d889140cSAlex Elder 
3051d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
30525cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3053b8f5c6edSAlex Elder 		return -ENXIO;
3054d889140cSAlex Elder 
3055b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3056b1b5402aSAlex Elder 
3057b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3058b1b5402aSAlex Elder 		(unsigned long long) snap_id,
3059b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
3060b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
3061b1b5402aSAlex Elder 
3062b1b5402aSAlex Elder 	return 0;
3063b1b5402aSAlex Elder }
3064b1b5402aSAlex Elder 
3065b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3066b1b5402aSAlex Elder {
3067b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3068b1b5402aSAlex Elder 						&rbd_dev->header.features);
3069b1b5402aSAlex Elder }
3070b1b5402aSAlex Elder 
307186b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
307286b00e0dSAlex Elder {
307386b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
307486b00e0dSAlex Elder 	size_t size;
307586b00e0dSAlex Elder 	void *reply_buf = NULL;
307686b00e0dSAlex Elder 	__le64 snapid;
307786b00e0dSAlex Elder 	void *p;
307886b00e0dSAlex Elder 	void *end;
307986b00e0dSAlex Elder 	char *image_id;
308086b00e0dSAlex Elder 	u64 overlap;
308186b00e0dSAlex Elder 	int ret;
308286b00e0dSAlex Elder 
308386b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
308486b00e0dSAlex Elder 	if (!parent_spec)
308586b00e0dSAlex Elder 		return -ENOMEM;
308686b00e0dSAlex Elder 
308786b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
308886b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
308986b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
309086b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
309186b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
309286b00e0dSAlex Elder 	if (!reply_buf) {
309386b00e0dSAlex Elder 		ret = -ENOMEM;
309486b00e0dSAlex Elder 		goto out_err;
309586b00e0dSAlex Elder 	}
309686b00e0dSAlex Elder 
309786b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
309836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
309986b00e0dSAlex Elder 				"rbd", "get_parent",
310086b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
310107b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
310236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
310386b00e0dSAlex Elder 	if (ret < 0)
310486b00e0dSAlex Elder 		goto out_err;
310586b00e0dSAlex Elder 
310686b00e0dSAlex Elder 	ret = -ERANGE;
310786b00e0dSAlex Elder 	p = reply_buf;
310886b00e0dSAlex Elder 	end = (char *) reply_buf + size;
310986b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
311086b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
311186b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
311286b00e0dSAlex Elder 
31130903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
31140903e875SAlex Elder 
31150903e875SAlex Elder 	ret = -EIO;
31160903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
31170903e875SAlex Elder 		goto out;
31180903e875SAlex Elder 
3119979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
312086b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
312186b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
312286b00e0dSAlex Elder 		goto out_err;
312386b00e0dSAlex Elder 	}
312486b00e0dSAlex Elder 	parent_spec->image_id = image_id;
312586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
312686b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
312786b00e0dSAlex Elder 
312886b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
312986b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
313086b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
313186b00e0dSAlex Elder out:
313286b00e0dSAlex Elder 	ret = 0;
313386b00e0dSAlex Elder out_err:
313486b00e0dSAlex Elder 	kfree(reply_buf);
313586b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
313686b00e0dSAlex Elder 
313786b00e0dSAlex Elder 	return ret;
313886b00e0dSAlex Elder }
313986b00e0dSAlex Elder 
31409e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
31419e15b77dSAlex Elder {
31429e15b77dSAlex Elder 	size_t image_id_size;
31439e15b77dSAlex Elder 	char *image_id;
31449e15b77dSAlex Elder 	void *p;
31459e15b77dSAlex Elder 	void *end;
31469e15b77dSAlex Elder 	size_t size;
31479e15b77dSAlex Elder 	void *reply_buf = NULL;
31489e15b77dSAlex Elder 	size_t len = 0;
31499e15b77dSAlex Elder 	char *image_name = NULL;
31509e15b77dSAlex Elder 	int ret;
31519e15b77dSAlex Elder 
31529e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
31539e15b77dSAlex Elder 
315469e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
315569e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
31569e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
31579e15b77dSAlex Elder 	if (!image_id)
31589e15b77dSAlex Elder 		return NULL;
31599e15b77dSAlex Elder 
31609e15b77dSAlex Elder 	p = image_id;
31619e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
316269e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
31639e15b77dSAlex Elder 
31649e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
31659e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
31669e15b77dSAlex Elder 	if (!reply_buf)
31679e15b77dSAlex Elder 		goto out;
31689e15b77dSAlex Elder 
316936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
31709e15b77dSAlex Elder 				"rbd", "dir_get_name",
31719e15b77dSAlex Elder 				image_id, image_id_size,
317207b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
31739e15b77dSAlex Elder 	if (ret < 0)
31749e15b77dSAlex Elder 		goto out;
31759e15b77dSAlex Elder 	p = reply_buf;
31769e15b77dSAlex Elder 	end = (char *) reply_buf + size;
31779e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
31789e15b77dSAlex Elder 	if (IS_ERR(image_name))
31799e15b77dSAlex Elder 		image_name = NULL;
31809e15b77dSAlex Elder 	else
31819e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
31829e15b77dSAlex Elder out:
31839e15b77dSAlex Elder 	kfree(reply_buf);
31849e15b77dSAlex Elder 	kfree(image_id);
31859e15b77dSAlex Elder 
31869e15b77dSAlex Elder 	return image_name;
31879e15b77dSAlex Elder }
31889e15b77dSAlex Elder 
31899e15b77dSAlex Elder /*
31909e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
31919e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
31929e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
31939e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
31949e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
31959e15b77dSAlex Elder  * until then.
31969e15b77dSAlex Elder  */
31979e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
31989e15b77dSAlex Elder {
31999e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
32009e15b77dSAlex Elder 	const char *name;
32019e15b77dSAlex Elder 	void *reply_buf = NULL;
32029e15b77dSAlex Elder 	int ret;
32039e15b77dSAlex Elder 
32049e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
32059e15b77dSAlex Elder 		return 0;	/* Already have the names */
32069e15b77dSAlex Elder 
32079e15b77dSAlex Elder 	/* Look up the pool name */
32089e15b77dSAlex Elder 
32099e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
32109e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3211935dc89fSAlex Elder 	if (!name) {
3212935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3213935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3214935dc89fSAlex Elder 		return -EIO;
3215935dc89fSAlex Elder 	}
32169e15b77dSAlex Elder 
32179e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
32189e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
32199e15b77dSAlex Elder 		return -ENOMEM;
32209e15b77dSAlex Elder 
32219e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
32229e15b77dSAlex Elder 
32239e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
322469e7a02fSAlex Elder 	if (name)
32259e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
322669e7a02fSAlex Elder 	else
322706ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
32289e15b77dSAlex Elder 
32299e15b77dSAlex Elder 	/* Look up the snapshot name. */
32309e15b77dSAlex Elder 
32319e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
32329e15b77dSAlex Elder 	if (!name) {
3233935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3234935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
32359e15b77dSAlex Elder 		ret = -EIO;
32369e15b77dSAlex Elder 		goto out_err;
32379e15b77dSAlex Elder 	}
32389e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
32399e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
32409e15b77dSAlex Elder 		goto out_err;
32419e15b77dSAlex Elder 
32429e15b77dSAlex Elder 	return 0;
32439e15b77dSAlex Elder out_err:
32449e15b77dSAlex Elder 	kfree(reply_buf);
32459e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
32469e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
32479e15b77dSAlex Elder 
32489e15b77dSAlex Elder 	return ret;
32499e15b77dSAlex Elder }
32509e15b77dSAlex Elder 
32516e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
325235d489f9SAlex Elder {
325335d489f9SAlex Elder 	size_t size;
325435d489f9SAlex Elder 	int ret;
325535d489f9SAlex Elder 	void *reply_buf;
325635d489f9SAlex Elder 	void *p;
325735d489f9SAlex Elder 	void *end;
325835d489f9SAlex Elder 	u64 seq;
325935d489f9SAlex Elder 	u32 snap_count;
326035d489f9SAlex Elder 	struct ceph_snap_context *snapc;
326135d489f9SAlex Elder 	u32 i;
326235d489f9SAlex Elder 
326335d489f9SAlex Elder 	/*
326435d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
326535d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
326635d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
326735d489f9SAlex Elder 	 * prepared to receive.
326835d489f9SAlex Elder 	 */
326935d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
327035d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
327135d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
327235d489f9SAlex Elder 	if (!reply_buf)
327335d489f9SAlex Elder 		return -ENOMEM;
327435d489f9SAlex Elder 
327536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
327635d489f9SAlex Elder 				"rbd", "get_snapcontext",
327735d489f9SAlex Elder 				NULL, 0,
327807b2391fSAlex Elder 				reply_buf, size, ver);
327936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
328035d489f9SAlex Elder 	if (ret < 0)
328135d489f9SAlex Elder 		goto out;
328235d489f9SAlex Elder 
328335d489f9SAlex Elder 	ret = -ERANGE;
328435d489f9SAlex Elder 	p = reply_buf;
328535d489f9SAlex Elder 	end = (char *) reply_buf + size;
328635d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
328735d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
328835d489f9SAlex Elder 
328935d489f9SAlex Elder 	/*
329035d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
329135d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
329235d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
329335d489f9SAlex Elder 	 * allocate is representable in a size_t.
329435d489f9SAlex Elder 	 */
329535d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
329635d489f9SAlex Elder 				 / sizeof (u64)) {
329735d489f9SAlex Elder 		ret = -EINVAL;
329835d489f9SAlex Elder 		goto out;
329935d489f9SAlex Elder 	}
330035d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
330135d489f9SAlex Elder 		goto out;
330235d489f9SAlex Elder 
330335d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
330435d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
330535d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
330635d489f9SAlex Elder 	if (!snapc) {
330735d489f9SAlex Elder 		ret = -ENOMEM;
330835d489f9SAlex Elder 		goto out;
330935d489f9SAlex Elder 	}
331035d489f9SAlex Elder 
331135d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
331235d489f9SAlex Elder 	snapc->seq = seq;
331335d489f9SAlex Elder 	snapc->num_snaps = snap_count;
331435d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
331535d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
331635d489f9SAlex Elder 
331735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
331835d489f9SAlex Elder 
331935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
332035d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
332135d489f9SAlex Elder 
332235d489f9SAlex Elder out:
332335d489f9SAlex Elder 	kfree(reply_buf);
332435d489f9SAlex Elder 
332535d489f9SAlex Elder 	return 0;
332635d489f9SAlex Elder }
332735d489f9SAlex Elder 
3328b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3329b8b1e2dbSAlex Elder {
3330b8b1e2dbSAlex Elder 	size_t size;
3331b8b1e2dbSAlex Elder 	void *reply_buf;
3332b8b1e2dbSAlex Elder 	__le64 snap_id;
3333b8b1e2dbSAlex Elder 	int ret;
3334b8b1e2dbSAlex Elder 	void *p;
3335b8b1e2dbSAlex Elder 	void *end;
3336b8b1e2dbSAlex Elder 	char *snap_name;
3337b8b1e2dbSAlex Elder 
3338b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3339b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3340b8b1e2dbSAlex Elder 	if (!reply_buf)
3341b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3342b8b1e2dbSAlex Elder 
3343b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
334436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3345b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
3346b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
334707b2391fSAlex Elder 				reply_buf, size, NULL);
334836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3349b8b1e2dbSAlex Elder 	if (ret < 0)
3350b8b1e2dbSAlex Elder 		goto out;
3351b8b1e2dbSAlex Elder 
3352b8b1e2dbSAlex Elder 	p = reply_buf;
3353b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
3354e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3355b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3356b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3357b8b1e2dbSAlex Elder 		goto out;
3358b8b1e2dbSAlex Elder 	} else {
3359b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3360b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
3361b8b1e2dbSAlex Elder 	}
3362b8b1e2dbSAlex Elder 	kfree(reply_buf);
3363b8b1e2dbSAlex Elder 
3364b8b1e2dbSAlex Elder 	return snap_name;
3365b8b1e2dbSAlex Elder out:
3366b8b1e2dbSAlex Elder 	kfree(reply_buf);
3367b8b1e2dbSAlex Elder 
3368b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3369b8b1e2dbSAlex Elder }
3370b8b1e2dbSAlex Elder 
3371b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3372b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3373b8b1e2dbSAlex Elder {
3374e0b49868SAlex Elder 	u64 snap_id;
3375b8b1e2dbSAlex Elder 	u8 order;
3376b8b1e2dbSAlex Elder 	int ret;
3377b8b1e2dbSAlex Elder 
3378b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3379b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3380b8b1e2dbSAlex Elder 	if (ret)
3381b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3382b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3383b8b1e2dbSAlex Elder 	if (ret)
3384b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3385b8b1e2dbSAlex Elder 
3386b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
3387b8b1e2dbSAlex Elder }
3388b8b1e2dbSAlex Elder 
3389b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3390b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3391b8b1e2dbSAlex Elder {
3392b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3393b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3394b8b1e2dbSAlex Elder 					snap_size, snap_features);
3395b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3396b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3397b8b1e2dbSAlex Elder 					snap_size, snap_features);
3398b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3399b8b1e2dbSAlex Elder }
3400b8b1e2dbSAlex Elder 
3401117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3402117973fbSAlex Elder {
3403117973fbSAlex Elder 	int ret;
3404117973fbSAlex Elder 	__u8 obj_order;
3405117973fbSAlex Elder 
3406117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3407117973fbSAlex Elder 
3408117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3409117973fbSAlex Elder 
3410117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3411117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3412117973fbSAlex Elder 	if (ret)
3413117973fbSAlex Elder 		goto out;
3414117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3415117973fbSAlex Elder 		ret = -EIO;
3416117973fbSAlex Elder 		goto out;
3417117973fbSAlex Elder 	}
3418117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3419117973fbSAlex Elder 
3420117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3421117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
3422117973fbSAlex Elder 	if (ret)
3423117973fbSAlex Elder 		goto out;
3424117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3425117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
3426117973fbSAlex Elder 	if (ret)
3427117973fbSAlex Elder 		goto out;
3428117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
3429117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
3430117973fbSAlex Elder out:
3431117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
3432117973fbSAlex Elder 
3433117973fbSAlex Elder 	return ret;
3434117973fbSAlex Elder }
3435117973fbSAlex Elder 
34369d475de5SAlex Elder /*
343735938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
343835938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
343935938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
344035938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
344135938150SAlex Elder  * And verify there are no changes to snapshots we already know
344235938150SAlex Elder  * about.
344335938150SAlex Elder  *
344435938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
344535938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
344635938150SAlex Elder  * are also maintained in that order.)
3447dfc5606dSYehuda Sadeh  */
3448304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3449dfc5606dSYehuda Sadeh {
345035938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
345135938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
345235938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
345335938150SAlex Elder 	struct list_head *links = head->next;
345435938150SAlex Elder 	u32 index = 0;
3455dfc5606dSYehuda Sadeh 
34569fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
345735938150SAlex Elder 	while (index < snap_count || links != head) {
345835938150SAlex Elder 		u64 snap_id;
345935938150SAlex Elder 		struct rbd_snap *snap;
3460cd892126SAlex Elder 		char *snap_name;
3461cd892126SAlex Elder 		u64 snap_size = 0;
3462cd892126SAlex Elder 		u64 snap_features = 0;
3463dfc5606dSYehuda Sadeh 
346435938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
346535938150SAlex Elder 					     : CEPH_NOSNAP;
346635938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
346735938150SAlex Elder 				     : NULL;
3468aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3469dfc5606dSYehuda Sadeh 
347035938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
347135938150SAlex Elder 			struct list_head *next = links->next;
3472dfc5606dSYehuda Sadeh 
34736d292906SAlex Elder 			/*
34746d292906SAlex Elder 			 * A previously-existing snapshot is not in
34756d292906SAlex Elder 			 * the new snap context.
34766d292906SAlex Elder 			 *
34776d292906SAlex Elder 			 * If the now missing snapshot is the one the
34786d292906SAlex Elder 			 * image is mapped to, clear its exists flag
34796d292906SAlex Elder 			 * so we can avoid sending any more requests
34806d292906SAlex Elder 			 * to it.
34816d292906SAlex Elder 			 */
34820d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
34836d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
348441f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
34859fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
34860d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
34870d7dbfceSAlex Elder 							"mapped " : "",
34889fcbb800SAlex Elder 				(unsigned long long) snap->id);
3489dfc5606dSYehuda Sadeh 
349035938150SAlex Elder 			/* Done with this list entry; advance */
349135938150SAlex Elder 
349235938150SAlex Elder 			links = next;
349335938150SAlex Elder 			continue;
3494dfc5606dSYehuda Sadeh 		}
349535938150SAlex Elder 
3496b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
3497cd892126SAlex Elder 					&snap_size, &snap_features);
3498cd892126SAlex Elder 		if (IS_ERR(snap_name))
3499cd892126SAlex Elder 			return PTR_ERR(snap_name);
3500cd892126SAlex Elder 
35019fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
35029fcbb800SAlex Elder 			(unsigned long long) snap_id);
350335938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
350435938150SAlex Elder 			struct rbd_snap *new_snap;
350535938150SAlex Elder 
350635938150SAlex Elder 			/* We haven't seen this snapshot before */
350735938150SAlex Elder 
3508c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3509cd892126SAlex Elder 					snap_id, snap_size, snap_features);
35109fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
35119fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
35129fcbb800SAlex Elder 
35139fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
35149fcbb800SAlex Elder 
35159fcbb800SAlex Elder 				return err;
35169fcbb800SAlex Elder 			}
351735938150SAlex Elder 
351835938150SAlex Elder 			/* New goes before existing, or at end of list */
351935938150SAlex Elder 
35209fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
352135938150SAlex Elder 			if (snap)
352235938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
352335938150SAlex Elder 			else
3524523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
352535938150SAlex Elder 		} else {
352635938150SAlex Elder 			/* Already have this one */
352735938150SAlex Elder 
35289fcbb800SAlex Elder 			dout("  already present\n");
35299fcbb800SAlex Elder 
3530cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
3531aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
3532cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
353335938150SAlex Elder 
353435938150SAlex Elder 			/* Done with this list entry; advance */
353535938150SAlex Elder 
353635938150SAlex Elder 			links = links->next;
3537dfc5606dSYehuda Sadeh 		}
353835938150SAlex Elder 
353935938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
354035938150SAlex Elder 
354135938150SAlex Elder 		index++;
3542dfc5606dSYehuda Sadeh 	}
35439fcbb800SAlex Elder 	dout("%s: done\n", __func__);
3544dfc5606dSYehuda Sadeh 
3545dfc5606dSYehuda Sadeh 	return 0;
3546dfc5606dSYehuda Sadeh }
3547dfc5606dSYehuda Sadeh 
3548304f6808SAlex Elder /*
3549304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
3550304f6808SAlex Elder  * have not already been registered.
3551304f6808SAlex Elder  */
3552304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3553304f6808SAlex Elder {
3554304f6808SAlex Elder 	struct rbd_snap *snap;
3555304f6808SAlex Elder 	int ret = 0;
3556304f6808SAlex Elder 
355737206ee5SAlex Elder 	dout("%s:\n", __func__);
355886ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
355986ff77bbSAlex Elder 		return -EIO;
3560304f6808SAlex Elder 
3561304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3562304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3563304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3564304f6808SAlex Elder 			if (ret < 0)
3565304f6808SAlex Elder 				break;
3566304f6808SAlex Elder 		}
3567304f6808SAlex Elder 	}
3568304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3569304f6808SAlex Elder 
3570304f6808SAlex Elder 	return ret;
3571304f6808SAlex Elder }
3572304f6808SAlex Elder 
3573dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3574dfc5606dSYehuda Sadeh {
3575dfc5606dSYehuda Sadeh 	struct device *dev;
3576cd789ab9SAlex Elder 	int ret;
3577dfc5606dSYehuda Sadeh 
3578dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3579dfc5606dSYehuda Sadeh 
3580cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3581dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3582dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3583dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3584dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3585de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3586dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3587dfc5606dSYehuda Sadeh 
3588dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3589cd789ab9SAlex Elder 
3590dfc5606dSYehuda Sadeh 	return ret;
3591602adf40SYehuda Sadeh }
3592602adf40SYehuda Sadeh 
3593dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3594dfc5606dSYehuda Sadeh {
3595dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3596dfc5606dSYehuda Sadeh }
3597dfc5606dSYehuda Sadeh 
3598e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
35991ddbe94eSAlex Elder 
36001ddbe94eSAlex Elder /*
3601499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3602499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
36031ddbe94eSAlex Elder  */
3604e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3605b7f23c36SAlex Elder {
3606e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3607499afd5bSAlex Elder 
3608499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3609499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3610499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3611e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3612e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3613b7f23c36SAlex Elder }
3614b7f23c36SAlex Elder 
36151ddbe94eSAlex Elder /*
3616499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3617499afd5bSAlex Elder  * identifier is no longer in use.
36181ddbe94eSAlex Elder  */
3619e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
36201ddbe94eSAlex Elder {
3621d184f6bfSAlex Elder 	struct list_head *tmp;
3622de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3623d184f6bfSAlex Elder 	int max_id;
3624d184f6bfSAlex Elder 
3625aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3626499afd5bSAlex Elder 
3627e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3628e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3629499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3630499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3631d184f6bfSAlex Elder 
3632d184f6bfSAlex Elder 	/*
3633d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3634d184f6bfSAlex Elder 	 * is nothing special we need to do.
3635d184f6bfSAlex Elder 	 */
3636e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3637d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3638d184f6bfSAlex Elder 		return;
3639d184f6bfSAlex Elder 	}
3640d184f6bfSAlex Elder 
3641d184f6bfSAlex Elder 	/*
3642d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3643d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3644d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3645d184f6bfSAlex Elder 	 */
3646d184f6bfSAlex Elder 	max_id = 0;
3647d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3648d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3649d184f6bfSAlex Elder 
3650d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3651b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3652b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3653d184f6bfSAlex Elder 	}
3654499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
36551ddbe94eSAlex Elder 
36561ddbe94eSAlex Elder 	/*
3657e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3658d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3659d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3660d184f6bfSAlex Elder 	 * case.
36611ddbe94eSAlex Elder 	 */
3662e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3663e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3664b7f23c36SAlex Elder }
3665b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the number of non-white-space
 * characters that follow, i.e. the length of the token found.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, spaces);		/* Skip to start of token */
	*buf = start;

	return strcspn(start, spaces);		/* Token length */
}
3684e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits (with its
 * terminating '\0') in the supplied token buffer, copy it there.
 * A copied token is always '\0'-terminated.  The string at *buf
 * must be '\0'-terminated on entry.
 *
 * Returns the token's length, not counting the '\0'.  A return
 * value of 0 means no token was found; a value >= token_size means
 * the token did not fit and nothing was copied.
 *
 * In every case, including the "did not fit" case, *buf is moved
 * to just past the end of the token that was found.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* Too big; leave token untouched */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3714e28fff26SAlex Elder 
3715e28fff26SAlex Elder /*
3716ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3717ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3718ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3719ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3720ea3352f4SAlex Elder  *
3721ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3722ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3723ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3724ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3725ea3352f4SAlex Elder  *
3726ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3727ea3352f4SAlex Elder  * the end of the found token.
3728ea3352f4SAlex Elder  *
3729ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3730ea3352f4SAlex Elder  */
3731ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3732ea3352f4SAlex Elder {
3733ea3352f4SAlex Elder 	char *dup;
3734ea3352f4SAlex Elder 	size_t len;
3735ea3352f4SAlex Elder 
3736ea3352f4SAlex Elder 	len = next_token(buf);
37374caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3738ea3352f4SAlex Elder 	if (!dup)
3739ea3352f4SAlex Elder 		return NULL;
3740ea3352f4SAlex Elder 	*(dup + len) = '\0';
3741ea3352f4SAlex Elder 	*buf += len;
3742ea3352f4SAlex Elder 
3743ea3352f4SAlex Elder 	if (lenp)
3744ea3352f4SAlex Elder 		*lenp = len;
3745ea3352f4SAlex Elder 
3746ea3352f4SAlex Elder 	return dup;
3747ea3352f4SAlex Elder }
3748ea3352f4SAlex Elder 
3749ea3352f4SAlex Elder /*
3750859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3751859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3752859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3753859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3754d22f76e7SAlex Elder  *
3755859c31dfSAlex Elder  * The information extracted from these options is recorded in
3756859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3757859c31dfSAlex Elder  * structures:
3758859c31dfSAlex Elder  *  ceph_opts
3759859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3760859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3761859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  rbd_spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
3769859c31dfSAlex Elder  *
3770859c31dfSAlex Elder  * The options passed take this form:
3771859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3772859c31dfSAlex Elder  * where:
3773859c31dfSAlex Elder  *  <mon_addrs>
3774859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3775859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3776859c31dfSAlex Elder  *      by a port number (separated by a colon).
3777859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3778859c31dfSAlex Elder  *  <options>
3779859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3780859c31dfSAlex Elder  *  <pool_name>
3781859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3782859c31dfSAlex Elder  *  <image_name>
3783859c31dfSAlex Elder  *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot name (despite the placeholder, this
 *      token is recorded as the snapshot name, not a numeric id).
 *      If provided, the mapping will present data from the image
 *      at the time that snapshot was created.  The image head is
 *      used if no snapshot name is provided.  Snapshot mappings
 *      are always read-only.
3789a725f65eSAlex Elder  */
3790859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3791dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3792859c31dfSAlex Elder 				struct rbd_options **opts,
3793859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3794a725f65eSAlex Elder {
3795e28fff26SAlex Elder 	size_t len;
3796859c31dfSAlex Elder 	char *options;
37970ddebc0cSAlex Elder 	const char *mon_addrs;
37980ddebc0cSAlex Elder 	size_t mon_addrs_size;
3799859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
38004e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3801859c31dfSAlex Elder 	struct ceph_options *copts;
3802dc79b113SAlex Elder 	int ret;
3803e28fff26SAlex Elder 
3804e28fff26SAlex Elder 	/* The first four tokens are required */
3805e28fff26SAlex Elder 
38067ef3214aSAlex Elder 	len = next_token(&buf);
38074fb5d671SAlex Elder 	if (!len) {
38084fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
38094fb5d671SAlex Elder 		return -EINVAL;
38104fb5d671SAlex Elder 	}
38110ddebc0cSAlex Elder 	mon_addrs = buf;
3812f28e565aSAlex Elder 	mon_addrs_size = len + 1;
38137ef3214aSAlex Elder 	buf += len;
3814a725f65eSAlex Elder 
3815dc79b113SAlex Elder 	ret = -EINVAL;
3816f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3817f28e565aSAlex Elder 	if (!options)
3818dc79b113SAlex Elder 		return -ENOMEM;
38194fb5d671SAlex Elder 	if (!*options) {
38204fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
38214fb5d671SAlex Elder 		goto out_err;
38224fb5d671SAlex Elder 	}
3823a725f65eSAlex Elder 
3824859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3825859c31dfSAlex Elder 	if (!spec)
3826f28e565aSAlex Elder 		goto out_mem;
3827859c31dfSAlex Elder 
3828859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3829859c31dfSAlex Elder 	if (!spec->pool_name)
3830859c31dfSAlex Elder 		goto out_mem;
38314fb5d671SAlex Elder 	if (!*spec->pool_name) {
38324fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
38334fb5d671SAlex Elder 		goto out_err;
38344fb5d671SAlex Elder 	}
3835e28fff26SAlex Elder 
383669e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3837859c31dfSAlex Elder 	if (!spec->image_name)
3838f28e565aSAlex Elder 		goto out_mem;
38394fb5d671SAlex Elder 	if (!*spec->image_name) {
38404fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
38414fb5d671SAlex Elder 		goto out_err;
38424fb5d671SAlex Elder 	}
3843e28fff26SAlex Elder 
3844f28e565aSAlex Elder 	/*
3845f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3846f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3847f28e565aSAlex Elder 	 */
38483feeb894SAlex Elder 	len = next_token(&buf);
3849820a5f3eSAlex Elder 	if (!len) {
38503feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
38513feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3852f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3853dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3854f28e565aSAlex Elder 		goto out_err;
3855849b4260SAlex Elder 	}
38564caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3857859c31dfSAlex Elder 	if (!spec->snap_name)
3858f28e565aSAlex Elder 		goto out_mem;
3859859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3860e5c35534SAlex Elder 
38610ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3862e28fff26SAlex Elder 
38634e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
38644e9afebaSAlex Elder 	if (!rbd_opts)
38654e9afebaSAlex Elder 		goto out_mem;
38664e9afebaSAlex Elder 
38674e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3868d22f76e7SAlex Elder 
3869859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
38700ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
38714e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3872859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3873859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3874dc79b113SAlex Elder 		goto out_err;
3875dc79b113SAlex Elder 	}
3876859c31dfSAlex Elder 	kfree(options);
3877859c31dfSAlex Elder 
3878859c31dfSAlex Elder 	*ceph_opts = copts;
38794e9afebaSAlex Elder 	*opts = rbd_opts;
3880859c31dfSAlex Elder 	*rbd_spec = spec;
38810ddebc0cSAlex Elder 
3882dc79b113SAlex Elder 	return 0;
3883f28e565aSAlex Elder out_mem:
3884dc79b113SAlex Elder 	ret = -ENOMEM;
3885d22f76e7SAlex Elder out_err:
3886859c31dfSAlex Elder 	kfree(rbd_opts);
3887859c31dfSAlex Elder 	rbd_spec_put(spec);
3888f28e565aSAlex Elder 	kfree(options);
3889d22f76e7SAlex Elder 
3890dc79b113SAlex Elder 	return ret;
3891a725f65eSAlex Elder }
3892a725f65eSAlex Elder 
3893589d30e0SAlex Elder /*
3894589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3895589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3896589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3897589d30e0SAlex Elder  *
3898589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3899589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3900589d30e0SAlex Elder  * with the supplied name.
3901589d30e0SAlex Elder  *
3902589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3903589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3904589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3905589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3906589d30e0SAlex Elder  */
3907589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3908589d30e0SAlex Elder {
3909589d30e0SAlex Elder 	int ret;
3910589d30e0SAlex Elder 	size_t size;
3911589d30e0SAlex Elder 	char *object_name;
3912589d30e0SAlex Elder 	void *response;
3913589d30e0SAlex Elder 	void *p;
3914589d30e0SAlex Elder 
39152f82ee54SAlex Elder 	/* If we already have it we don't need to look it up */
39162f82ee54SAlex Elder 
39172f82ee54SAlex Elder 	if (rbd_dev->spec->image_id)
39182f82ee54SAlex Elder 		return 0;
39192f82ee54SAlex Elder 
3920589d30e0SAlex Elder 	/*
39212c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
39222c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
39232c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
39242c0d0a10SAlex Elder 	 */
39252c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
39262c0d0a10SAlex Elder 		return 0;
39272c0d0a10SAlex Elder 
39282c0d0a10SAlex Elder 	/*
3929589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3930589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3931589d30e0SAlex Elder 	 */
393269e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3933589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3934589d30e0SAlex Elder 	if (!object_name)
3935589d30e0SAlex Elder 		return -ENOMEM;
39360d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3937589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3938589d30e0SAlex Elder 
3939589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3940589d30e0SAlex Elder 
3941589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3942589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3943589d30e0SAlex Elder 	if (!response) {
3944589d30e0SAlex Elder 		ret = -ENOMEM;
3945589d30e0SAlex Elder 		goto out;
3946589d30e0SAlex Elder 	}
3947589d30e0SAlex Elder 
394836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
3949589d30e0SAlex Elder 				"rbd", "get_id",
3950589d30e0SAlex Elder 				NULL, 0,
395107b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
395236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3953589d30e0SAlex Elder 	if (ret < 0)
3954589d30e0SAlex Elder 		goto out;
3955589d30e0SAlex Elder 
3956589d30e0SAlex Elder 	p = response;
39570d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3958589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3959979ed480SAlex Elder 						NULL, GFP_NOIO);
39600d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
39610d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
39620d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3963589d30e0SAlex Elder 	} else {
39640d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3965589d30e0SAlex Elder 	}
3966589d30e0SAlex Elder out:
3967589d30e0SAlex Elder 	kfree(response);
3968589d30e0SAlex Elder 	kfree(object_name);
3969589d30e0SAlex Elder 
3970589d30e0SAlex Elder 	return ret;
3971589d30e0SAlex Elder }
3972589d30e0SAlex Elder 
3973a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3974a30b71b9SAlex Elder {
3975a30b71b9SAlex Elder 	int ret;
3976a30b71b9SAlex Elder 	size_t size;
3977a30b71b9SAlex Elder 
3978a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3979a30b71b9SAlex Elder 
39800d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
39810d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3982a30b71b9SAlex Elder 		return -ENOMEM;
3983a30b71b9SAlex Elder 
3984a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3985a30b71b9SAlex Elder 
398669e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3987a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3988a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3989a30b71b9SAlex Elder 		ret = -ENOMEM;
3990a30b71b9SAlex Elder 		goto out_err;
3991a30b71b9SAlex Elder 	}
39920d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
39930d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3994a30b71b9SAlex Elder 
3995a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3996a30b71b9SAlex Elder 
3997a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3998a30b71b9SAlex Elder 	if (ret < 0)
3999a30b71b9SAlex Elder 		goto out_err;
400086b00e0dSAlex Elder 
400186b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
400286b00e0dSAlex Elder 
400386b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
400486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
400586b00e0dSAlex Elder 
4006a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
4007a30b71b9SAlex Elder 
4008a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4009a30b71b9SAlex Elder 		rbd_dev->header_name);
4010a30b71b9SAlex Elder 
4011a30b71b9SAlex Elder 	return 0;
4012a30b71b9SAlex Elder 
4013a30b71b9SAlex Elder out_err:
4014a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4015a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
40160d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
40170d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4018a30b71b9SAlex Elder 
4019a30b71b9SAlex Elder 	return ret;
4020a30b71b9SAlex Elder }
4021a30b71b9SAlex Elder 
4022a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4023a30b71b9SAlex Elder {
4024a30b71b9SAlex Elder 	size_t size;
40259d475de5SAlex Elder 	int ret;
40266e14b1a6SAlex Elder 	u64 ver = 0;
4027a30b71b9SAlex Elder 
4028a30b71b9SAlex Elder 	/*
4029a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
4030a30b71b9SAlex Elder 	 * object name for this rbd image.
4031a30b71b9SAlex Elder 	 */
4032979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
4033a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4034a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
4035a30b71b9SAlex Elder 		return -ENOMEM;
4036a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
40370d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
40389d475de5SAlex Elder 
40399d475de5SAlex Elder 	/* Get the size and object order for the image */
40409d475de5SAlex Elder 
40419d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
40429d475de5SAlex Elder 	if (ret < 0)
40439d475de5SAlex Elder 		goto out_err;
40441e130199SAlex Elder 
40451e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
40461e130199SAlex Elder 
40471e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
40481e130199SAlex Elder 	if (ret < 0)
40491e130199SAlex Elder 		goto out_err;
4050b1b5402aSAlex Elder 
4051d889140cSAlex Elder 	/* Get the and check features for the image */
4052b1b5402aSAlex Elder 
4053b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
4054b1b5402aSAlex Elder 	if (ret < 0)
4055b1b5402aSAlex Elder 		goto out_err;
405635d489f9SAlex Elder 
405786b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
405886b00e0dSAlex Elder 
405986b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
406086b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
406186b00e0dSAlex Elder 		if (ret < 0)
406286b00e0dSAlex Elder 			goto out_err;
406386b00e0dSAlex Elder 	}
406486b00e0dSAlex Elder 
40656e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
406635d489f9SAlex Elder 
40676e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
40686e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
40696e14b1a6SAlex Elder 
40706e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
40716e14b1a6SAlex Elder 
40726e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
407335d489f9SAlex Elder 	if (ret)
407435d489f9SAlex Elder 		goto out_err;
40756e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
40766e14b1a6SAlex Elder 
4077a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
4078a30b71b9SAlex Elder 
4079a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4080a30b71b9SAlex Elder 		rbd_dev->header_name);
4081a30b71b9SAlex Elder 
408235152979SAlex Elder 	return 0;
40839d475de5SAlex Elder out_err:
408486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
408586b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
408686b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
40879d475de5SAlex Elder 	kfree(rbd_dev->header_name);
40889d475de5SAlex Elder 	rbd_dev->header_name = NULL;
40891e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
40901e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
40919d475de5SAlex Elder 
40929d475de5SAlex Elder 	return ret;
4093a30b71b9SAlex Elder }
4094a30b71b9SAlex Elder 
409583a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
409683a06263SAlex Elder {
40972f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
40982f82ee54SAlex Elder 	struct rbd_spec *parent_spec = NULL;
40992f82ee54SAlex Elder 	struct rbd_client *rbdc = NULL;
410083a06263SAlex Elder 	int ret;
410183a06263SAlex Elder 
410283a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
410383a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
410483a06263SAlex Elder 	if (ret)
410583a06263SAlex Elder 		return ret;
410683a06263SAlex Elder 
41079e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
41089e15b77dSAlex Elder 	if (ret)
41099e15b77dSAlex Elder 		goto err_out_snaps;
41109e15b77dSAlex Elder 
411183a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
411283a06263SAlex Elder 	if (ret)
411383a06263SAlex Elder 		goto err_out_snaps;
411483a06263SAlex Elder 
411583a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
411683a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
411783a06263SAlex Elder 
411883a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
411983a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
412083a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
412183a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
412283a06263SAlex Elder 
412383a06263SAlex Elder 	/* Get our block major device number. */
412483a06263SAlex Elder 
412583a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
412683a06263SAlex Elder 	if (ret < 0)
412783a06263SAlex Elder 		goto err_out_id;
412883a06263SAlex Elder 	rbd_dev->major = ret;
412983a06263SAlex Elder 
413083a06263SAlex Elder 	/* Set up the blkdev mapping. */
413183a06263SAlex Elder 
413283a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
413383a06263SAlex Elder 	if (ret)
413483a06263SAlex Elder 		goto err_out_blkdev;
413583a06263SAlex Elder 
413683a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
413783a06263SAlex Elder 	if (ret)
413883a06263SAlex Elder 		goto err_out_disk;
413983a06263SAlex Elder 
414083a06263SAlex Elder 	/*
414183a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
414283a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
414383a06263SAlex Elder 	 */
41442f82ee54SAlex Elder 	/* Probe the parent if there is one */
41452f82ee54SAlex Elder 
41462f82ee54SAlex Elder 	if (rbd_dev->parent_spec) {
41472f82ee54SAlex Elder 		/*
41482f82ee54SAlex Elder 		 * We need to pass a reference to the client and the
41492f82ee54SAlex Elder 		 * parent spec when creating the parent rbd_dev.
41502f82ee54SAlex Elder 		 * Images related by parent/child relationships
41512f82ee54SAlex Elder 		 * always share both.
41522f82ee54SAlex Elder 		 */
41532f82ee54SAlex Elder 		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
41542f82ee54SAlex Elder 		rbdc = __rbd_get_client(rbd_dev->rbd_client);
41552f82ee54SAlex Elder 
41562f82ee54SAlex Elder 		parent = rbd_dev_create(rbdc, parent_spec);
41572f82ee54SAlex Elder 		if (!parent) {
41582f82ee54SAlex Elder 			ret = -ENOMEM;
41592f82ee54SAlex Elder 			goto err_out_spec;
41602f82ee54SAlex Elder 		}
41612f82ee54SAlex Elder 		rbdc = NULL;		/* parent now owns reference */
41622f82ee54SAlex Elder 		parent_spec = NULL;	/* parent now owns reference */
41632f82ee54SAlex Elder 		ret = rbd_dev_probe(parent);
41642f82ee54SAlex Elder 		if (ret < 0)
41652f82ee54SAlex Elder 			goto err_out_parent;
41662f82ee54SAlex Elder 		rbd_dev->parent = parent;
41672f82ee54SAlex Elder 	}
41682f82ee54SAlex Elder 
416983a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
417083a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
417183a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
417283a06263SAlex Elder 	if (ret)
417383a06263SAlex Elder 		goto err_out_bus;
417483a06263SAlex Elder 
41759969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
417683a06263SAlex Elder 	if (ret)
417783a06263SAlex Elder 		goto err_out_bus;
417883a06263SAlex Elder 
417983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
418083a06263SAlex Elder 
418183a06263SAlex Elder 	add_disk(rbd_dev->disk);
418283a06263SAlex Elder 
418383a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
418483a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
418583a06263SAlex Elder 
418683a06263SAlex Elder 	return ret;
41872f82ee54SAlex Elder 
41882f82ee54SAlex Elder err_out_parent:
41892f82ee54SAlex Elder 	rbd_dev_destroy(parent);
41902f82ee54SAlex Elder err_out_spec:
41912f82ee54SAlex Elder 	rbd_spec_put(parent_spec);
41922f82ee54SAlex Elder 	rbd_put_client(rbdc);
419383a06263SAlex Elder err_out_bus:
419483a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
419583a06263SAlex Elder 
419683a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
419783a06263SAlex Elder 
419883a06263SAlex Elder 	return ret;
419983a06263SAlex Elder err_out_disk:
420083a06263SAlex Elder 	rbd_free_disk(rbd_dev);
420183a06263SAlex Elder err_out_blkdev:
420283a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
420383a06263SAlex Elder err_out_id:
420483a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
420583a06263SAlex Elder err_out_snaps:
420683a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
420783a06263SAlex Elder 
420883a06263SAlex Elder 	return ret;
420983a06263SAlex Elder }
421083a06263SAlex Elder 
4211a30b71b9SAlex Elder /*
4212a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4213a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4214a30b71b9SAlex Elder  * id.
4215a30b71b9SAlex Elder  */
4216a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4217a30b71b9SAlex Elder {
4218a30b71b9SAlex Elder 	int ret;
4219a30b71b9SAlex Elder 
4220a30b71b9SAlex Elder 	/*
4221a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4222a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4223a30b71b9SAlex Elder 	 * it's a format 1 image.
4224a30b71b9SAlex Elder 	 */
4225a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4226a30b71b9SAlex Elder 	if (ret)
4227a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4228a30b71b9SAlex Elder 	else
4229a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
423083a06263SAlex Elder 	if (ret) {
4231a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4232a30b71b9SAlex Elder 
4233a30b71b9SAlex Elder 		return ret;
4234a30b71b9SAlex Elder 	}
4235a30b71b9SAlex Elder 
423683a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
423783a06263SAlex Elder 	if (ret)
423883a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
423983a06263SAlex Elder 
424083a06263SAlex Elder 	return ret;
424183a06263SAlex Elder }
424283a06263SAlex Elder 
424359c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
424459c2be1eSYehuda Sadeh 		       const char *buf,
424559c2be1eSYehuda Sadeh 		       size_t count)
4246602adf40SYehuda Sadeh {
4247cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4248dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
42494e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4250859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
42519d3997fdSAlex Elder 	struct rbd_client *rbdc;
425227cc2594SAlex Elder 	struct ceph_osd_client *osdc;
425327cc2594SAlex Elder 	int rc = -ENOMEM;
4254602adf40SYehuda Sadeh 
4255602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4256602adf40SYehuda Sadeh 		return -ENODEV;
4257602adf40SYehuda Sadeh 
4258a725f65eSAlex Elder 	/* parse add command */
4259859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4260dc79b113SAlex Elder 	if (rc < 0)
4261bd4ba655SAlex Elder 		goto err_out_module;
4262a725f65eSAlex Elder 
42639d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
42649d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
42659d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
42660ddebc0cSAlex Elder 		goto err_out_args;
42679d3997fdSAlex Elder 	}
4268c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4269602adf40SYehuda Sadeh 
4270602adf40SYehuda Sadeh 	/* pick the pool */
42719d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4272859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4273602adf40SYehuda Sadeh 	if (rc < 0)
4274602adf40SYehuda Sadeh 		goto err_out_client;
4275859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4276859c31dfSAlex Elder 
42770903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
42780903e875SAlex Elder 
42790903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
42800903e875SAlex Elder 		rc = -EIO;
42810903e875SAlex Elder 		goto err_out_client;
42820903e875SAlex Elder 	}
42830903e875SAlex Elder 
4284c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4285bd4ba655SAlex Elder 	if (!rbd_dev)
4286bd4ba655SAlex Elder 		goto err_out_client;
4287c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4288c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4289602adf40SYehuda Sadeh 
4290bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4291c53d5893SAlex Elder 	kfree(rbd_opts);
4292c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4293bd4ba655SAlex Elder 
4294a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4295a30b71b9SAlex Elder 	if (rc < 0)
4296c53d5893SAlex Elder 		goto err_out_rbd_dev;
429705fd6f6fSAlex Elder 
4298602adf40SYehuda Sadeh 	return count;
4299c53d5893SAlex Elder err_out_rbd_dev:
4300c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4301bd4ba655SAlex Elder err_out_client:
43029d3997fdSAlex Elder 	rbd_put_client(rbdc);
43030ddebc0cSAlex Elder err_out_args:
430478cea76eSAlex Elder 	if (ceph_opts)
430578cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
43064e9afebaSAlex Elder 	kfree(rbd_opts);
4307859c31dfSAlex Elder 	rbd_spec_put(spec);
4308bd4ba655SAlex Elder err_out_module:
4309bd4ba655SAlex Elder 	module_put(THIS_MODULE);
431027cc2594SAlex Elder 
4311602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
431227cc2594SAlex Elder 
431327cc2594SAlex Elder 	return (ssize_t) rc;
4314602adf40SYehuda Sadeh }
4315602adf40SYehuda Sadeh 
4316de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4317602adf40SYehuda Sadeh {
4318602adf40SYehuda Sadeh 	struct list_head *tmp;
4319602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4320602adf40SYehuda Sadeh 
4321e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4322602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4323602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4324de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4325e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4326602adf40SYehuda Sadeh 			return rbd_dev;
4327602adf40SYehuda Sadeh 		}
4328e124a82fSAlex Elder 	}
4329e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4330602adf40SYehuda Sadeh 	return NULL;
4331602adf40SYehuda Sadeh }
4332602adf40SYehuda Sadeh 
4333dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4334602adf40SYehuda Sadeh {
4335593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4336602adf40SYehuda Sadeh 
433759c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
43389969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4339602adf40SYehuda Sadeh 
4340602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4341602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4342602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
434332eec68dSAlex Elder 
43442ac4e75dSAlex Elder 	/* release allocated disk header fields */
43452ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
43462ac4e75dSAlex Elder 
434732eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4348e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4349c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4350c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4351602adf40SYehuda Sadeh 
4352602adf40SYehuda Sadeh 	/* release module ref */
4353602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4354602adf40SYehuda Sadeh }
4355602adf40SYehuda Sadeh 
/*
 * Remove one rbd device: drop all of its snapshots first, then
 * delete the device from the rbd bus.
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);

	rbd_bus_del_dev(rbd_dev);
}
43612f82ee54SAlex Elder 
4362dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4363602adf40SYehuda Sadeh 			  const char *buf,
4364602adf40SYehuda Sadeh 			  size_t count)
4365602adf40SYehuda Sadeh {
4366602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4367602adf40SYehuda Sadeh 	int target_id, rc;
4368602adf40SYehuda Sadeh 	unsigned long ul;
4369602adf40SYehuda Sadeh 	int ret = count;
4370602adf40SYehuda Sadeh 
4371602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4372602adf40SYehuda Sadeh 	if (rc)
4373602adf40SYehuda Sadeh 		return rc;
4374602adf40SYehuda Sadeh 
4375602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4376602adf40SYehuda Sadeh 	target_id = (int) ul;
4377602adf40SYehuda Sadeh 	if (target_id != ul)
4378602adf40SYehuda Sadeh 		return -EINVAL;
4379602adf40SYehuda Sadeh 
4380602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4381602adf40SYehuda Sadeh 
4382602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4383602adf40SYehuda Sadeh 	if (!rbd_dev) {
4384602adf40SYehuda Sadeh 		ret = -ENOENT;
4385602adf40SYehuda Sadeh 		goto done;
4386602adf40SYehuda Sadeh 	}
4387602adf40SYehuda Sadeh 
4388a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4389b82d167bSAlex Elder 	if (rbd_dev->open_count)
439042382b70SAlex Elder 		ret = -EBUSY;
4391b82d167bSAlex Elder 	else
4392b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4393a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4394b82d167bSAlex Elder 	if (ret < 0)
439542382b70SAlex Elder 		goto done;
439642382b70SAlex Elder 
43972f82ee54SAlex Elder 	while (rbd_dev->parent_spec) {
43982f82ee54SAlex Elder 		struct rbd_device *first = rbd_dev;
43992f82ee54SAlex Elder 		struct rbd_device *second = first->parent;
44002f82ee54SAlex Elder 		struct rbd_device *third;
44012f82ee54SAlex Elder 
44022f82ee54SAlex Elder 		/*
44032f82ee54SAlex Elder 		 * Follow to the parent with no grandparent and
44042f82ee54SAlex Elder 		 * remove it.
44052f82ee54SAlex Elder 		 */
44062f82ee54SAlex Elder 		while (second && (third = second->parent)) {
44072f82ee54SAlex Elder 			first = second;
44082f82ee54SAlex Elder 			second = third;
44092f82ee54SAlex Elder 		}
44102f82ee54SAlex Elder 		__rbd_remove(second);
44112f82ee54SAlex Elder 		rbd_spec_put(first->parent_spec);
44122f82ee54SAlex Elder 		first->parent_spec = NULL;
44132f82ee54SAlex Elder 		first->parent_overlap = 0;
44142f82ee54SAlex Elder 		first->parent = NULL;
44152f82ee54SAlex Elder 	}
44162f82ee54SAlex Elder 	__rbd_remove(rbd_dev);
4417602adf40SYehuda Sadeh 
4418602adf40SYehuda Sadeh done:
4419602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4420aafb230eSAlex Elder 
4421602adf40SYehuda Sadeh 	return ret;
4422602adf40SYehuda Sadeh }
4423602adf40SYehuda Sadeh 
4424602adf40SYehuda Sadeh /*
4425602adf40SYehuda Sadeh  * create control files in sysfs
4426dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4427602adf40SYehuda Sadeh  */
4428602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4429602adf40SYehuda Sadeh {
4430dfc5606dSYehuda Sadeh 	int ret;
4431602adf40SYehuda Sadeh 
4432fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4433dfc5606dSYehuda Sadeh 	if (ret < 0)
4434dfc5606dSYehuda Sadeh 		return ret;
4435602adf40SYehuda Sadeh 
4436fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4437fed4c143SAlex Elder 	if (ret < 0)
4438fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4439602adf40SYehuda Sadeh 
4440602adf40SYehuda Sadeh 	return ret;
4441602adf40SYehuda Sadeh }
4442602adf40SYehuda Sadeh 
4443602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4444602adf40SYehuda Sadeh {
4445dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4446fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4447602adf40SYehuda Sadeh }
4448602adf40SYehuda Sadeh 
4449cc344fa1SAlex Elder static int __init rbd_init(void)
4450602adf40SYehuda Sadeh {
4451602adf40SYehuda Sadeh 	int rc;
4452602adf40SYehuda Sadeh 
44531e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
44541e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
44551e32d34cSAlex Elder 
44561e32d34cSAlex Elder 		return -EINVAL;
44571e32d34cSAlex Elder 	}
4458602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
4459602adf40SYehuda Sadeh 	if (rc)
4460602adf40SYehuda Sadeh 		return rc;
4461f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4462602adf40SYehuda Sadeh 	return 0;
4463602adf40SYehuda Sadeh }
4464602adf40SYehuda Sadeh 
4465cc344fa1SAlex Elder static void __exit rbd_exit(void)
4466602adf40SYehuda Sadeh {
4467602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
4468602adf40SYehuda Sadeh }
4469602adf40SYehuda Sadeh 
4470602adf40SYehuda Sadeh module_init(rbd_init);
4471602adf40SYehuda Sadeh module_exit(rbd_exit);
4472602adf40SYehuda Sadeh 
4473602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4474602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4475602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
4476602adf40SYehuda Sadeh 
4477602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
4478602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4479602adf40SYehuda Sadeh 
4480602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
4481