xref: /openbmc/linux/drivers/block/rbd.c (revision 9875201e)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44602adf40SYehuda Sadeh 
45602adf40SYehuda Sadeh #include "rbd_types.h"
46602adf40SYehuda Sadeh 
47aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
48aafb230eSAlex Elder 
49593a9e7bSAlex Elder /*
50593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
51593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
52593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
53593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
54593a9e7bSAlex Elder  */
55593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
56593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
57593a9e7bSAlex Elder 
58a2acd00eSAlex Elder /*
59a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
60a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
61a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
62a2acd00eSAlex Elder  * -EINVAL without updating it.
63a2acd00eSAlex Elder  */
64a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
65a2acd00eSAlex Elder {
66a2acd00eSAlex Elder 	unsigned int counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
69a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
70a2acd00eSAlex Elder 		return (int)counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	atomic_dec(v);
73a2acd00eSAlex Elder 
74a2acd00eSAlex Elder 	return -EINVAL;
75a2acd00eSAlex Elder }
76a2acd00eSAlex Elder 
77a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
78a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
79a2acd00eSAlex Elder {
80a2acd00eSAlex Elder 	int counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
83a2acd00eSAlex Elder 	if (counter >= 0)
84a2acd00eSAlex Elder 		return counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	atomic_inc(v);
87a2acd00eSAlex Elder 
88a2acd00eSAlex Elder 	return -EINVAL;
89a2acd00eSAlex Elder }
90a2acd00eSAlex Elder 
91f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
92f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)"
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
95602adf40SYehuda Sadeh 
96d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
97d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
98d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
99d4b125e9SAlex Elder 
10035d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
101602adf40SYehuda Sadeh 
102602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
103602adf40SYehuda Sadeh 
1049682fc6dSAlex Elder #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
1059682fc6dSAlex Elder 
1069e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
1079e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
108589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
1099e15b77dSAlex Elder 
1101e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
111589d30e0SAlex Elder 
112d889140cSAlex Elder /* Feature bits */
113d889140cSAlex Elder 
1145cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING	(1<<0)
1155cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2	(1<<1)
1165cbf6f12SAlex Elder #define RBD_FEATURES_ALL \
1175cbf6f12SAlex Elder 	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
118d889140cSAlex Elder 
119d889140cSAlex Elder /* Features supported by this (client software) implementation. */
120d889140cSAlex Elder 
121770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
122d889140cSAlex Elder 
12381a89793SAlex Elder /*
12481a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
12581a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
12681a89793SAlex Elder  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
12781a89793SAlex Elder  * enough to hold all possible device names.
12881a89793SAlex Elder  */
129602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
13081a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
131602adf40SYehuda Sadeh 
132602adf40SYehuda Sadeh /*
133602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
134602adf40SYehuda Sadeh  */
135602adf40SYehuda Sadeh struct rbd_image_header {
136f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
137849b4260SAlex Elder 	char *object_prefix;
138602adf40SYehuda Sadeh 	__u8 obj_order;
139602adf40SYehuda Sadeh 	__u8 crypt_type;
140602adf40SYehuda Sadeh 	__u8 comp_type;
141f35a4deeSAlex Elder 	u64 stripe_unit;
142f35a4deeSAlex Elder 	u64 stripe_count;
143f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
144602adf40SYehuda Sadeh 
145f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
146f84344f3SAlex Elder 	u64 image_size;
147f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
148f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
149f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15059c2be1eSYehuda Sadeh };
15159c2be1eSYehuda Sadeh 
1520d7dbfceSAlex Elder /*
1530d7dbfceSAlex Elder  * An rbd image specification.
1540d7dbfceSAlex Elder  *
1550d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
156c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
157c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
158c66c6e0cSAlex Elder  *
159c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
160c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
161c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
162c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
163c66c6e0cSAlex Elder  *
164c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
165c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
166c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
167c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
168c66c6e0cSAlex Elder  * is shared between the parent and child).
169c66c6e0cSAlex Elder  *
170c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
171c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
172c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
175c66c6e0cSAlex Elder  * could be a null pointer).
1760d7dbfceSAlex Elder  */
1770d7dbfceSAlex Elder struct rbd_spec {
1780d7dbfceSAlex Elder 	u64		pool_id;
179ecb4dc22SAlex Elder 	const char	*pool_name;
1800d7dbfceSAlex Elder 
181ecb4dc22SAlex Elder 	const char	*image_id;
182ecb4dc22SAlex Elder 	const char	*image_name;
1830d7dbfceSAlex Elder 
1840d7dbfceSAlex Elder 	u64		snap_id;
185ecb4dc22SAlex Elder 	const char	*snap_name;
1860d7dbfceSAlex Elder 
1870d7dbfceSAlex Elder 	struct kref	kref;
1880d7dbfceSAlex Elder };
1890d7dbfceSAlex Elder 
190602adf40SYehuda Sadeh /*
191f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
192602adf40SYehuda Sadeh  */
193602adf40SYehuda Sadeh struct rbd_client {
194602adf40SYehuda Sadeh 	struct ceph_client	*client;
195602adf40SYehuda Sadeh 	struct kref		kref;
196602adf40SYehuda Sadeh 	struct list_head	node;
197602adf40SYehuda Sadeh };
198602adf40SYehuda Sadeh 
199bf0d5f50SAlex Elder struct rbd_img_request;
200bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
201bf0d5f50SAlex Elder 
202bf0d5f50SAlex Elder #define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
203bf0d5f50SAlex Elder 
204bf0d5f50SAlex Elder struct rbd_obj_request;
205bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
206bf0d5f50SAlex Elder 
/* Kind of data attached to an object request (see rbd_obj_request.type) */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
210bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_obj_request.flags */
enum obj_req_flags {
	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_DONE,
	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_IMG_DATA,
	/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_KNOWN,
	/* target exists: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,
};
217926f9b3fSAlex Elder 
218bf0d5f50SAlex Elder struct rbd_obj_request {
219bf0d5f50SAlex Elder 	const char		*object_name;
220bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
221bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
222926f9b3fSAlex Elder 	unsigned long		flags;
223bf0d5f50SAlex Elder 
224c5b5ef6cSAlex Elder 	/*
225c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
226c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
227c5b5ef6cSAlex Elder 	 *
228c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
229c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
230c5b5ef6cSAlex Elder 	 *
231c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
232c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
233c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
234c5b5ef6cSAlex Elder 	 *
235c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
236c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
237c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
238c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
239c5b5ef6cSAlex Elder 	 */
240c5b5ef6cSAlex Elder 	union {
241c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
242c5b5ef6cSAlex Elder 		struct {
243bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
244c5b5ef6cSAlex Elder 			u64			img_offset;
245c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
246c5b5ef6cSAlex Elder 			struct list_head	links;
247c5b5ef6cSAlex Elder 		};
248c5b5ef6cSAlex Elder 	};
249bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
250bf0d5f50SAlex Elder 
251bf0d5f50SAlex Elder 	enum obj_request_type	type;
252788e2df3SAlex Elder 	union {
253bf0d5f50SAlex Elder 		struct bio	*bio_list;
254788e2df3SAlex Elder 		struct {
255788e2df3SAlex Elder 			struct page	**pages;
256788e2df3SAlex Elder 			u32		page_count;
257788e2df3SAlex Elder 		};
258788e2df3SAlex Elder 	};
2590eefd470SAlex Elder 	struct page		**copyup_pages;
260ebda6408SAlex Elder 	u32			copyup_page_count;
261bf0d5f50SAlex Elder 
262bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
263bf0d5f50SAlex Elder 
264bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2651b83bef2SSage Weil 	int			result;
266bf0d5f50SAlex Elder 
267bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
268788e2df3SAlex Elder 	struct completion	completion;
269bf0d5f50SAlex Elder 
270bf0d5f50SAlex Elder 	struct kref		kref;
271bf0d5f50SAlex Elder };
272bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request.flags */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
2780c425248SAlex Elder 
279bf0d5f50SAlex Elder struct rbd_img_request {
280bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
281bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
282bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2830c425248SAlex Elder 	unsigned long		flags;
284bf0d5f50SAlex Elder 	union {
285bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2869849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2879849e986SAlex Elder 	};
2889849e986SAlex Elder 	union {
2899849e986SAlex Elder 		struct request		*rq;		/* block request */
2909849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
291bf0d5f50SAlex Elder 	};
2923d7efd18SAlex Elder 	struct page		**copyup_pages;
293ebda6408SAlex Elder 	u32			copyup_page_count;
294bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
295bf0d5f50SAlex Elder 	u32			next_completion;
296bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
29755f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
298a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
299bf0d5f50SAlex Elder 
300bf0d5f50SAlex Elder 	u32			obj_request_count;
301bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
302bf0d5f50SAlex Elder 
303bf0d5f50SAlex Elder 	struct kref		kref;
304bf0d5f50SAlex Elder };
305bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests chained on an image request's
 * obj_requests list via each object request's "links" member.
 *
 * Note that the _safe variant walks the list in REVERSE order.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)

#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)

#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
312bf0d5f50SAlex Elder 
313f84344f3SAlex Elder struct rbd_mapping {
31499c1f08fSAlex Elder 	u64                     size;
31534b13184SAlex Elder 	u64                     features;
316f84344f3SAlex Elder 	bool			read_only;
317f84344f3SAlex Elder };
318f84344f3SAlex Elder 
319602adf40SYehuda Sadeh /*
320602adf40SYehuda Sadeh  * a single device
321602adf40SYehuda Sadeh  */
322602adf40SYehuda Sadeh struct rbd_device {
323de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
324602adf40SYehuda Sadeh 
325602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
326602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
327602adf40SYehuda Sadeh 
328a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
329602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
330602adf40SYehuda Sadeh 
331602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
332602adf40SYehuda Sadeh 
333b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
334602adf40SYehuda Sadeh 
335602adf40SYehuda Sadeh 	struct rbd_image_header	header;
336b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3370d7dbfceSAlex Elder 	struct rbd_spec		*spec;
338602adf40SYehuda Sadeh 
3390d7dbfceSAlex Elder 	char			*header_name;
340971f839aSAlex Elder 
3410903e875SAlex Elder 	struct ceph_file_layout	layout;
3420903e875SAlex Elder 
34359c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
344975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
34559c2be1eSYehuda Sadeh 
34686b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
34786b00e0dSAlex Elder 	u64			parent_overlap;
348a2acd00eSAlex Elder 	atomic_t		parent_ref;
3492f82ee54SAlex Elder 	struct rbd_device	*parent;
35086b00e0dSAlex Elder 
351c666601aSJosh Durgin 	/* protects updating the header */
352c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
353f84344f3SAlex Elder 
354f84344f3SAlex Elder 	struct rbd_mapping	mapping;
355602adf40SYehuda Sadeh 
356602adf40SYehuda Sadeh 	struct list_head	node;
357dfc5606dSYehuda Sadeh 
358dfc5606dSYehuda Sadeh 	/* sysfs related */
359dfc5606dSYehuda Sadeh 	struct device		dev;
360b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
361dfc5606dSYehuda Sadeh };
362dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomicity matters, access
 * is protected by rbd_dev->lock.
 *
 * At present only the "removing" flag (which is coupled with the
 * "open_count" field) needs atomic access.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3746d292906SAlex Elder 
375cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
376e124a82fSAlex Elder 
377602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
378e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
379e124a82fSAlex Elder 
380602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
381432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
382602adf40SYehuda Sadeh 
38378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
38478c2a44aSAlex Elder 
3851c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
386868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
38778c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
3881c2a9dfeSAlex Elder 
3893d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3903d7efd18SAlex Elder 
391200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
392dfc5606dSYehuda Sadeh 
393f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
394f0f8cef5SAlex Elder 		       size_t count);
395f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
396f0f8cef5SAlex Elder 			  size_t count);
3971f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
398a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
399f0f8cef5SAlex Elder 
400f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
401f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
402f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
403f0f8cef5SAlex Elder 	__ATTR_NULL
404f0f8cef5SAlex Elder };
405f0f8cef5SAlex Elder 
406f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
407f0f8cef5SAlex Elder 	.name		= "rbd",
408f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
409f0f8cef5SAlex Elder };
410f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
	/* rbd_root_dev is statically allocated */
}
414f0f8cef5SAlex Elder 
415f0f8cef5SAlex Elder static struct device rbd_root_dev = {
416f0f8cef5SAlex Elder 	.init_name =    "rbd",
417f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
418f0f8cef5SAlex Elder };
419f0f8cef5SAlex Elder 
42006ecc6cbSAlex Elder static __printf(2, 3)
42106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
42206ecc6cbSAlex Elder {
42306ecc6cbSAlex Elder 	struct va_format vaf;
42406ecc6cbSAlex Elder 	va_list args;
42506ecc6cbSAlex Elder 
42606ecc6cbSAlex Elder 	va_start(args, fmt);
42706ecc6cbSAlex Elder 	vaf.fmt = fmt;
42806ecc6cbSAlex Elder 	vaf.va = &args;
42906ecc6cbSAlex Elder 
43006ecc6cbSAlex Elder 	if (!rbd_dev)
43106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
43206ecc6cbSAlex Elder 	else if (rbd_dev->disk)
43306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
43406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
43506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
43606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
43706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
43806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
43906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
44006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
44106ecc6cbSAlex Elder 	else	/* punt */
44206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
44306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
44406ecc6cbSAlex Elder 	va_end(args);
44506ecc6cbSAlex Elder }
44606ecc6cbSAlex Elder 
#ifdef RBD_DEBUG
/*
 * Log the failed expression together with the enclosing function and
 * line number, then BUG().  The body is wrapped in do { } while (0)
 * so that the macro behaves as a single statement: it is safe in an
 * unbraced if/else and requires a terminating semicolon.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
/* Assertions compiled out: the expression is not evaluated */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
459dfc5606dSYehuda Sadeh 
460b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
46105a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
46205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
4638b3e1a56SAlex Elder 
464cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
4652df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
4662df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
46754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
46854cac61fSAlex Elder 					u64 snap_id);
4692ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4702ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
4712ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4722ad3d716SAlex Elder 		u64 *snap_features);
4732ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
47459c2be1eSYehuda Sadeh 
475602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
476602adf40SYehuda Sadeh {
477f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
478b82d167bSAlex Elder 	bool removing = false;
479602adf40SYehuda Sadeh 
480f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
481602adf40SYehuda Sadeh 		return -EROFS;
482602adf40SYehuda Sadeh 
483a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
484b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
485b82d167bSAlex Elder 		removing = true;
486b82d167bSAlex Elder 	else
487b82d167bSAlex Elder 		rbd_dev->open_count++;
488a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
489b82d167bSAlex Elder 	if (removing)
490b82d167bSAlex Elder 		return -ENOENT;
491b82d167bSAlex Elder 
492c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
493f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
494340c7a2bSAlex Elder 
495602adf40SYehuda Sadeh 	return 0;
496602adf40SYehuda Sadeh }
497602adf40SYehuda Sadeh 
498db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
499dfc5606dSYehuda Sadeh {
500dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
501b82d167bSAlex Elder 	unsigned long open_count_before;
502b82d167bSAlex Elder 
503a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
504b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
505a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
506b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
507dfc5606dSYehuda Sadeh 
508c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
509dfc5606dSYehuda Sadeh }
510dfc5606dSYehuda Sadeh 
511602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
512602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
513602adf40SYehuda Sadeh 	.open			= rbd_open,
514dfc5606dSYehuda Sadeh 	.release		= rbd_release,
515602adf40SYehuda Sadeh };
516602adf40SYehuda Sadeh 
517602adf40SYehuda Sadeh /*
5187262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
519cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
520602adf40SYehuda Sadeh  */
521f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
522602adf40SYehuda Sadeh {
523602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
524602adf40SYehuda Sadeh 	int ret = -ENOMEM;
525602adf40SYehuda Sadeh 
52637206ee5SAlex Elder 	dout("%s:\n", __func__);
527602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
528602adf40SYehuda Sadeh 	if (!rbdc)
529602adf40SYehuda Sadeh 		goto out_opt;
530602adf40SYehuda Sadeh 
531602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
532602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
533602adf40SYehuda Sadeh 
53443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
535602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
53608f75463SAlex Elder 		goto out_rbdc;
53743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
538602adf40SYehuda Sadeh 
539602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
540602adf40SYehuda Sadeh 	if (ret < 0)
54108f75463SAlex Elder 		goto out_client;
542602adf40SYehuda Sadeh 
543432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
544602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
545432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
546602adf40SYehuda Sadeh 
54737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
548bc534d86SAlex Elder 
549602adf40SYehuda Sadeh 	return rbdc;
55008f75463SAlex Elder out_client:
551602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
55208f75463SAlex Elder out_rbdc:
553602adf40SYehuda Sadeh 	kfree(rbdc);
554602adf40SYehuda Sadeh out_opt:
55543ae4701SAlex Elder 	if (ceph_opts)
55643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
55737206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
55837206ee5SAlex Elder 
55928f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
560602adf40SYehuda Sadeh }
561602adf40SYehuda Sadeh 
5622f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5632f82ee54SAlex Elder {
5642f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5652f82ee54SAlex Elder 
5662f82ee54SAlex Elder 	return rbdc;
5672f82ee54SAlex Elder }
5682f82ee54SAlex Elder 
569602adf40SYehuda Sadeh /*
5701f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5711f7ba331SAlex Elder  * found, bump its reference count.
572602adf40SYehuda Sadeh  */
5731f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
574602adf40SYehuda Sadeh {
575602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5761f7ba331SAlex Elder 	bool found = false;
577602adf40SYehuda Sadeh 
57843ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
579602adf40SYehuda Sadeh 		return NULL;
580602adf40SYehuda Sadeh 
5811f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5821f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5831f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5842f82ee54SAlex Elder 			__rbd_get_client(client_node);
5852f82ee54SAlex Elder 
5861f7ba331SAlex Elder 			found = true;
5871f7ba331SAlex Elder 			break;
5881f7ba331SAlex Elder 		}
5891f7ba331SAlex Elder 	}
5901f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5911f7ba331SAlex Elder 
5921f7ba331SAlex Elder 	return found ? client_node : NULL;
593602adf40SYehuda Sadeh }
594602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* entries are sentinels that
 * delimit the int, string and Boolean groups; there are currently
 * no int or string options.
 */
enum {
	/* (int args would go here) */
	Opt_last_int,

	/* (string args would go here) */
	Opt_last_string,

	/* Boolean args */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
60859c2be1eSYehuda Sadeh 
60943ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
61059c2be1eSYehuda Sadeh 	/* int args above */
61159c2be1eSYehuda Sadeh 	/* string args above */
612be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
613cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
614cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
615cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
616cc0538b6SAlex Elder 	/* Boolean args above */
61759c2be1eSYehuda Sadeh 	{-1, NULL}
61859c2be1eSYehuda Sadeh };
61959c2be1eSYehuda Sadeh 
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;	/* set by "read_only"/"ro", cleared by "read_write"/"rw" */
};
62398571b5aSAlex Elder 
62498571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
62598571b5aSAlex Elder 
62659c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
62759c2be1eSYehuda Sadeh {
62843ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
62959c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
63059c2be1eSYehuda Sadeh 	int token, intval, ret;
63159c2be1eSYehuda Sadeh 
63243ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
63359c2be1eSYehuda Sadeh 	if (token < 0)
63459c2be1eSYehuda Sadeh 		return -EINVAL;
63559c2be1eSYehuda Sadeh 
63659c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
63759c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
63859c2be1eSYehuda Sadeh 		if (ret < 0) {
63959c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
64059c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
64159c2be1eSYehuda Sadeh 			return ret;
64259c2be1eSYehuda Sadeh 		}
64359c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
64459c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
64559c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
64659c2be1eSYehuda Sadeh 		     argstr[0].from);
647cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
648cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
64959c2be1eSYehuda Sadeh 	} else {
65059c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
65159c2be1eSYehuda Sadeh 	}
65259c2be1eSYehuda Sadeh 
65359c2be1eSYehuda Sadeh 	switch (token) {
654cc0538b6SAlex Elder 	case Opt_read_only:
655cc0538b6SAlex Elder 		rbd_opts->read_only = true;
656cc0538b6SAlex Elder 		break;
657cc0538b6SAlex Elder 	case Opt_read_write:
658cc0538b6SAlex Elder 		rbd_opts->read_only = false;
659cc0538b6SAlex Elder 		break;
66059c2be1eSYehuda Sadeh 	default:
661aafb230eSAlex Elder 		rbd_assert(false);
662aafb230eSAlex Elder 		break;
66359c2be1eSYehuda Sadeh 	}
66459c2be1eSYehuda Sadeh 	return 0;
66559c2be1eSYehuda Sadeh }
66659c2be1eSYehuda Sadeh 
66759c2be1eSYehuda Sadeh /*
668602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
6697262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
6707262cfcaSAlex Elder  * function.
671602adf40SYehuda Sadeh  */
6729d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
673602adf40SYehuda Sadeh {
674f8c38929SAlex Elder 	struct rbd_client *rbdc;
67559c2be1eSYehuda Sadeh 
676cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
6771f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
6789d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
67943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
6809d3997fdSAlex Elder 	else
681f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
682cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
683d720bcb0SAlex Elder 
6849d3997fdSAlex Elder 	return rbdc;
685602adf40SYehuda Sadeh }
686602adf40SYehuda Sadeh 
687602adf40SYehuda Sadeh /*
688602adf40SYehuda Sadeh  * Destroy ceph client
689d23a4b3fSAlex Elder  *
690432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
691602adf40SYehuda Sadeh  */
692602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
693602adf40SYehuda Sadeh {
694602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
695602adf40SYehuda Sadeh 
69637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
697cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
698602adf40SYehuda Sadeh 	list_del(&rbdc->node);
699cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
700602adf40SYehuda Sadeh 
701602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
702602adf40SYehuda Sadeh 	kfree(rbdc);
703602adf40SYehuda Sadeh }
704602adf40SYehuda Sadeh 
705602adf40SYehuda Sadeh /*
706602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
707602adf40SYehuda Sadeh  * it.
708602adf40SYehuda Sadeh  */
7099d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
710602adf40SYehuda Sadeh {
711c53d5893SAlex Elder 	if (rbdc)
7129d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
713602adf40SYehuda Sadeh }
714602adf40SYehuda Sadeh 
715a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
716a30b71b9SAlex Elder {
717a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
718a30b71b9SAlex Elder }
719a30b71b9SAlex Elder 
7208e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
7218e94af8eSAlex Elder {
722103a150fSAlex Elder 	size_t size;
723103a150fSAlex Elder 	u32 snap_count;
724103a150fSAlex Elder 
725103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
726103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
727103a150fSAlex Elder 		return false;
728103a150fSAlex Elder 
729db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
730db2388b6SAlex Elder 
731db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
732db2388b6SAlex Elder 		return false;
733db2388b6SAlex Elder 
734db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
735db2388b6SAlex Elder 
736db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
737db2388b6SAlex Elder 		return false;
738db2388b6SAlex Elder 
739103a150fSAlex Elder 	/*
740103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
741103a150fSAlex Elder 	 * that limits the number of snapshots.
742103a150fSAlex Elder 	 */
743103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
744103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
745103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
746103a150fSAlex Elder 		return false;
747103a150fSAlex Elder 
748103a150fSAlex Elder 	/*
749103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
750103a150fSAlex Elder 	 * header must also be representable in a size_t.
751103a150fSAlex Elder 	 */
752103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
753103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
754103a150fSAlex Elder 		return false;
755103a150fSAlex Elder 
756103a150fSAlex Elder 	return true;
7578e94af8eSAlex Elder }
7588e94af8eSAlex Elder 
759602adf40SYehuda Sadeh /*
760bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
761bb23e37aSAlex Elder  * on-disk header.
762602adf40SYehuda Sadeh  */
763662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
7644156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
765602adf40SYehuda Sadeh {
766662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
767bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
768bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
769bb23e37aSAlex Elder 	char *object_prefix = NULL;
770bb23e37aSAlex Elder 	char *snap_names = NULL;
771bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
772ccece235SAlex Elder 	u32 snap_count;
773d2bb24e5SAlex Elder 	size_t size;
774bb23e37aSAlex Elder 	int ret = -ENOMEM;
775621901d6SAlex Elder 	u32 i;
776602adf40SYehuda Sadeh 
777bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
778103a150fSAlex Elder 
779bb23e37aSAlex Elder 	if (first_time) {
780bb23e37aSAlex Elder 		size_t len;
781bb23e37aSAlex Elder 
782bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
783bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
784bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
785bb23e37aSAlex Elder 		if (!object_prefix)
786602adf40SYehuda Sadeh 			return -ENOMEM;
787bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
788bb23e37aSAlex Elder 		object_prefix[len] = '\0';
789bb23e37aSAlex Elder 	}
79000f1f36fSAlex Elder 
791bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
792d2bb24e5SAlex Elder 
793602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
794bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
795bb23e37aSAlex Elder 	if (!snapc)
796bb23e37aSAlex Elder 		goto out_err;
797bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
798602adf40SYehuda Sadeh 	if (snap_count) {
799bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
800f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
801f785cc1dSAlex Elder 
802bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
803621901d6SAlex Elder 
804f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
805bb23e37aSAlex Elder 			goto out_2big;
806bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
807bb23e37aSAlex Elder 		if (!snap_names)
808602adf40SYehuda Sadeh 			goto out_err;
809bb23e37aSAlex Elder 
810bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
811bb23e37aSAlex Elder 
812bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
813bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
814bb23e37aSAlex Elder 		if (!snap_sizes)
815bb23e37aSAlex Elder 			goto out_err;
816bb23e37aSAlex Elder 
817f785cc1dSAlex Elder 		/*
818bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
819bb23e37aSAlex Elder 		 * and size.
820bb23e37aSAlex Elder 		 *
82199a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
822bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
823f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
824f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
825f785cc1dSAlex Elder 		 */
826bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
827bb23e37aSAlex Elder 		snaps = ondisk->snaps;
828bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
829bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
830bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
831bb23e37aSAlex Elder 		}
832602adf40SYehuda Sadeh 	}
833849b4260SAlex Elder 
834bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
835bb23e37aSAlex Elder 
836bb23e37aSAlex Elder 	if (first_time) {
837bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
838602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
839602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
840602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
841bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
842bb23e37aSAlex Elder 		header->stripe_unit = 0;
843bb23e37aSAlex Elder 		header->stripe_count = 0;
844bb23e37aSAlex Elder 		header->features = 0;
845662518b1SAlex Elder 	} else {
846662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
847662518b1SAlex Elder 		kfree(header->snap_names);
848662518b1SAlex Elder 		kfree(header->snap_sizes);
849bb23e37aSAlex Elder 	}
8506a52325fSAlex Elder 
851bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
852621901d6SAlex Elder 
853f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
854bb23e37aSAlex Elder 	header->snapc = snapc;
855bb23e37aSAlex Elder 	header->snap_names = snap_names;
856bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
857468521c1SAlex Elder 
858662518b1SAlex Elder 	/* Make sure mapping size is consistent with header info */
859662518b1SAlex Elder 
860662518b1SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
861662518b1SAlex Elder 		if (rbd_dev->mapping.size != header->image_size)
862662518b1SAlex Elder 			rbd_dev->mapping.size = header->image_size;
863662518b1SAlex Elder 
864602adf40SYehuda Sadeh 	return 0;
865bb23e37aSAlex Elder out_2big:
866bb23e37aSAlex Elder 	ret = -EIO;
8676a52325fSAlex Elder out_err:
868bb23e37aSAlex Elder 	kfree(snap_sizes);
869bb23e37aSAlex Elder 	kfree(snap_names);
870bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
871bb23e37aSAlex Elder 	kfree(object_prefix);
872ccece235SAlex Elder 
873bb23e37aSAlex Elder 	return ret;
874602adf40SYehuda Sadeh }
875602adf40SYehuda Sadeh 
8769682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
8779682fc6dSAlex Elder {
8789682fc6dSAlex Elder 	const char *snap_name;
8799682fc6dSAlex Elder 
8809682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
8819682fc6dSAlex Elder 
8829682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
8839682fc6dSAlex Elder 
8849682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
8859682fc6dSAlex Elder 	while (which--)
8869682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
8879682fc6dSAlex Elder 
8889682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
8899682fc6dSAlex Elder }
8909682fc6dSAlex Elder 
89130d1cff8SAlex Elder /*
89230d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
89330d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
89430d1cff8SAlex Elder  */
89530d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
89630d1cff8SAlex Elder {
89730d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
89830d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
89930d1cff8SAlex Elder 
90030d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
90130d1cff8SAlex Elder 		return 1;
90230d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
90330d1cff8SAlex Elder }
90430d1cff8SAlex Elder 
90530d1cff8SAlex Elder /*
90630d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
90730d1cff8SAlex Elder  * present.
90830d1cff8SAlex Elder  *
90930d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
91030d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
91130d1cff8SAlex Elder  *
91230d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
91330d1cff8SAlex Elder  * reverse order, highest snapshot id first.
91430d1cff8SAlex Elder  */
9159682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
9169682fc6dSAlex Elder {
9179682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
91830d1cff8SAlex Elder 	u64 *found;
9199682fc6dSAlex Elder 
92030d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
92130d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
9229682fc6dSAlex Elder 
92330d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
9249682fc6dSAlex Elder }
9259682fc6dSAlex Elder 
9262ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
9272ad3d716SAlex Elder 					u64 snap_id)
92854cac61fSAlex Elder {
92954cac61fSAlex Elder 	u32 which;
93054cac61fSAlex Elder 
93154cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
93254cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
93354cac61fSAlex Elder 		return NULL;
93454cac61fSAlex Elder 
93554cac61fSAlex Elder 	return _rbd_dev_v1_snap_name(rbd_dev, which);
93654cac61fSAlex Elder }
93754cac61fSAlex Elder 
9389e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
9399e15b77dSAlex Elder {
9409e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
9419e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
9429e15b77dSAlex Elder 
94354cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
94454cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
94554cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
9469e15b77dSAlex Elder 
94754cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
9489e15b77dSAlex Elder }
9499e15b77dSAlex Elder 
9502ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
9512ad3d716SAlex Elder 				u64 *snap_size)
952602adf40SYehuda Sadeh {
9532ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9542ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9552ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
9562ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9572ad3d716SAlex Elder 		u32 which;
95800f1f36fSAlex Elder 
9592ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
9602ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
9612ad3d716SAlex Elder 			return -ENOENT;
96200f1f36fSAlex Elder 
9632ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
9642ad3d716SAlex Elder 	} else {
9652ad3d716SAlex Elder 		u64 size = 0;
9662ad3d716SAlex Elder 		int ret;
9672ad3d716SAlex Elder 
9682ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
9692ad3d716SAlex Elder 		if (ret)
9702ad3d716SAlex Elder 			return ret;
9712ad3d716SAlex Elder 
9722ad3d716SAlex Elder 		*snap_size = size;
9732ad3d716SAlex Elder 	}
9742ad3d716SAlex Elder 	return 0;
9752ad3d716SAlex Elder }
9762ad3d716SAlex Elder 
9772ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
9782ad3d716SAlex Elder 			u64 *snap_features)
9792ad3d716SAlex Elder {
9802ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9812ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9822ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
9832ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9842ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
9852ad3d716SAlex Elder 	} else {
9862ad3d716SAlex Elder 		u64 features = 0;
9872ad3d716SAlex Elder 		int ret;
9882ad3d716SAlex Elder 
9892ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
9902ad3d716SAlex Elder 		if (ret)
9912ad3d716SAlex Elder 			return ret;
9922ad3d716SAlex Elder 
9932ad3d716SAlex Elder 		*snap_features = features;
9942ad3d716SAlex Elder 	}
9952ad3d716SAlex Elder 	return 0;
99600f1f36fSAlex Elder }
997602adf40SYehuda Sadeh 
998d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
999602adf40SYehuda Sadeh {
10008f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
10012ad3d716SAlex Elder 	u64 size = 0;
10022ad3d716SAlex Elder 	u64 features = 0;
10032ad3d716SAlex Elder 	int ret;
10048b0241f8SAlex Elder 
10052ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
10062ad3d716SAlex Elder 	if (ret)
10072ad3d716SAlex Elder 		return ret;
10082ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
10092ad3d716SAlex Elder 	if (ret)
10102ad3d716SAlex Elder 		return ret;
10112ad3d716SAlex Elder 
10122ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
10132ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
10142ad3d716SAlex Elder 
10158b0241f8SAlex Elder 	return 0;
1016602adf40SYehuda Sadeh }
1017602adf40SYehuda Sadeh 
1018d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1019d1cf5788SAlex Elder {
1020d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1021d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1022200a6a8bSAlex Elder }
1023200a6a8bSAlex Elder 
102498571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1025602adf40SYehuda Sadeh {
102665ccfe21SAlex Elder 	char *name;
102765ccfe21SAlex Elder 	u64 segment;
102865ccfe21SAlex Elder 	int ret;
10293a96d5cdSJosh Durgin 	char *name_format;
1030602adf40SYehuda Sadeh 
103178c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
103265ccfe21SAlex Elder 	if (!name)
103365ccfe21SAlex Elder 		return NULL;
103465ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
10353a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
10363a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
10373a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
10383a96d5cdSJosh Durgin 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
103965ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
10402fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
104165ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
104265ccfe21SAlex Elder 			segment, ret);
104365ccfe21SAlex Elder 		kfree(name);
104465ccfe21SAlex Elder 		name = NULL;
104565ccfe21SAlex Elder 	}
1046602adf40SYehuda Sadeh 
104765ccfe21SAlex Elder 	return name;
104865ccfe21SAlex Elder }
1049602adf40SYehuda Sadeh 
105078c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
105178c2a44aSAlex Elder {
105278c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
105378c2a44aSAlex Elder 
105478c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
105578c2a44aSAlex Elder }
105678c2a44aSAlex Elder 
105765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
105865ccfe21SAlex Elder {
105965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1060602adf40SYehuda Sadeh 
106165ccfe21SAlex Elder 	return offset & (segment_size - 1);
106265ccfe21SAlex Elder }
106365ccfe21SAlex Elder 
106465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
106565ccfe21SAlex Elder 				u64 offset, u64 length)
106665ccfe21SAlex Elder {
106765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
106865ccfe21SAlex Elder 
106965ccfe21SAlex Elder 	offset &= segment_size - 1;
107065ccfe21SAlex Elder 
1071aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
107265ccfe21SAlex Elder 	if (offset + length > segment_size)
107365ccfe21SAlex Elder 		length = segment_size - offset;
107465ccfe21SAlex Elder 
107565ccfe21SAlex Elder 	return length;
1076602adf40SYehuda Sadeh }
1077602adf40SYehuda Sadeh 
1078602adf40SYehuda Sadeh /*
1079029bcbd8SJosh Durgin  * returns the size of an object in the image
1080029bcbd8SJosh Durgin  */
1081029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1082029bcbd8SJosh Durgin {
1083029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1084029bcbd8SJosh Durgin }
1085029bcbd8SJosh Durgin 
1086029bcbd8SJosh Durgin /*
1087602adf40SYehuda Sadeh  * bio helpers
1088602adf40SYehuda Sadeh  */
1089602adf40SYehuda Sadeh 
1090602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1091602adf40SYehuda Sadeh {
1092602adf40SYehuda Sadeh 	struct bio *tmp;
1093602adf40SYehuda Sadeh 
1094602adf40SYehuda Sadeh 	while (chain) {
1095602adf40SYehuda Sadeh 		tmp = chain;
1096602adf40SYehuda Sadeh 		chain = chain->bi_next;
1097602adf40SYehuda Sadeh 		bio_put(tmp);
1098602adf40SYehuda Sadeh 	}
1099602adf40SYehuda Sadeh }
1100602adf40SYehuda Sadeh 
1101602adf40SYehuda Sadeh /*
1102602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1103602adf40SYehuda Sadeh  */
1104602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1105602adf40SYehuda Sadeh {
1106602adf40SYehuda Sadeh 	struct bio_vec *bv;
1107602adf40SYehuda Sadeh 	unsigned long flags;
1108602adf40SYehuda Sadeh 	void *buf;
1109602adf40SYehuda Sadeh 	int i;
1110602adf40SYehuda Sadeh 	int pos = 0;
1111602adf40SYehuda Sadeh 
1112602adf40SYehuda Sadeh 	while (chain) {
1113602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
1114602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
1115602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
1116602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
1117602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
1118602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
1119e2156054SAlex Elder 				flush_dcache_page(bv->bv_page);
112085b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1121602adf40SYehuda Sadeh 			}
1122602adf40SYehuda Sadeh 			pos += bv->bv_len;
1123602adf40SYehuda Sadeh 		}
1124602adf40SYehuda Sadeh 
1125602adf40SYehuda Sadeh 		chain = chain->bi_next;
1126602adf40SYehuda Sadeh 	}
1127602adf40SYehuda Sadeh }
1128602adf40SYehuda Sadeh 
1129602adf40SYehuda Sadeh /*
1130b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1131b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1132b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1133b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1134b9434c5bSAlex Elder  */
1135b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1136b9434c5bSAlex Elder {
1137b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1138b9434c5bSAlex Elder 
1139b9434c5bSAlex Elder 	rbd_assert(end > offset);
1140b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1141b9434c5bSAlex Elder 	while (offset < end) {
1142b9434c5bSAlex Elder 		size_t page_offset;
1143b9434c5bSAlex Elder 		size_t length;
1144b9434c5bSAlex Elder 		unsigned long flags;
1145b9434c5bSAlex Elder 		void *kaddr;
1146b9434c5bSAlex Elder 
1147491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1148491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1149b9434c5bSAlex Elder 		local_irq_save(flags);
1150b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1151b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1152e2156054SAlex Elder 		flush_dcache_page(*page);
1153b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1154b9434c5bSAlex Elder 		local_irq_restore(flags);
1155b9434c5bSAlex Elder 
1156b9434c5bSAlex Elder 		offset += length;
1157b9434c5bSAlex Elder 		page++;
1158b9434c5bSAlex Elder 	}
1159b9434c5bSAlex Elder }
1160b9434c5bSAlex Elder 
1161b9434c5bSAlex Elder /*
1162f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1163f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1164602adf40SYehuda Sadeh  */
1165f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1166f7760dadSAlex Elder 					unsigned int offset,
1167f7760dadSAlex Elder 					unsigned int len,
1168f7760dadSAlex Elder 					gfp_t gfpmask)
1169602adf40SYehuda Sadeh {
1170f7760dadSAlex Elder 	struct bio_vec *bv;
1171f7760dadSAlex Elder 	unsigned int resid;
1172f7760dadSAlex Elder 	unsigned short idx;
1173f7760dadSAlex Elder 	unsigned int voff;
1174f7760dadSAlex Elder 	unsigned short end_idx;
1175f7760dadSAlex Elder 	unsigned short vcnt;
1176f7760dadSAlex Elder 	struct bio *bio;
1177602adf40SYehuda Sadeh 
1178f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1179f7760dadSAlex Elder 
1180f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1181f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1182f7760dadSAlex Elder 
1183f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1184f7760dadSAlex Elder 		return NULL;
1185f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1186f7760dadSAlex Elder 		return NULL;
1187f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1188f7760dadSAlex Elder 		return NULL;
1189f7760dadSAlex Elder 
1190f7760dadSAlex Elder 	/* Find first affected segment... */
1191f7760dadSAlex Elder 
1192f7760dadSAlex Elder 	resid = offset;
1193d74c6d51SKent Overstreet 	bio_for_each_segment(bv, bio_src, idx) {
1194f7760dadSAlex Elder 		if (resid < bv->bv_len)
1195f7760dadSAlex Elder 			break;
1196f7760dadSAlex Elder 		resid -= bv->bv_len;
1197602adf40SYehuda Sadeh 	}
1198f7760dadSAlex Elder 	voff = resid;
1199602adf40SYehuda Sadeh 
1200f7760dadSAlex Elder 	/* ...and the last affected segment */
1201542582fcSAlex Elder 
1202f7760dadSAlex Elder 	resid += len;
1203f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1204f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1205f7760dadSAlex Elder 			break;
1206f7760dadSAlex Elder 		resid -= bv->bv_len;
1207f7760dadSAlex Elder 	}
1208f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1209602adf40SYehuda Sadeh 
1210f7760dadSAlex Elder 	/* Build the clone */
1211f7760dadSAlex Elder 
1212f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1213f7760dadSAlex Elder 	if (!bio)
1214f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1215f7760dadSAlex Elder 
1216f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1217f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1218f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1219f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1220602adf40SYehuda Sadeh 
1221602adf40SYehuda Sadeh 	/*
1222f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1223f7760dadSAlex Elder 	 * and last (or only) entries.
1224602adf40SYehuda Sadeh 	 */
1225f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1226f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1227f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1228f7760dadSAlex Elder 	if (vcnt > 1) {
1229f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1230f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1231602adf40SYehuda Sadeh 	} else {
1232f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1233602adf40SYehuda Sadeh 	}
1234602adf40SYehuda Sadeh 
1235f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1236f7760dadSAlex Elder 	bio->bi_size = len;
1237f7760dadSAlex Elder 	bio->bi_idx = 0;
1238602adf40SYehuda Sadeh 
1239f7760dadSAlex Elder 	return bio;
1240602adf40SYehuda Sadeh }
1241602adf40SYehuda Sadeh 
1242f7760dadSAlex Elder /*
1243f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1244f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1245f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1246f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1247f7760dadSAlex Elder  *
1248f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1249f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1250f7760dadSAlex Elder  * the start of data to be cloned is located.
1251f7760dadSAlex Elder  *
1252f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1253f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1254f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1255f7760dadSAlex Elder  */
1256f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1257f7760dadSAlex Elder 					unsigned int *offset,
1258f7760dadSAlex Elder 					unsigned int len,
1259f7760dadSAlex Elder 					gfp_t gfpmask)
1260f7760dadSAlex Elder {
1261f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1262f7760dadSAlex Elder 	unsigned int off = *offset;
1263f7760dadSAlex Elder 	struct bio *chain = NULL;
1264f7760dadSAlex Elder 	struct bio **end;
1265602adf40SYehuda Sadeh 
1266f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1267602adf40SYehuda Sadeh 
1268f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1269f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1270602adf40SYehuda Sadeh 
1271f7760dadSAlex Elder 	end = &chain;
1272f7760dadSAlex Elder 	while (len) {
1273f7760dadSAlex Elder 		unsigned int bi_size;
1274f7760dadSAlex Elder 		struct bio *bio;
1275f7760dadSAlex Elder 
1276f5400b7aSAlex Elder 		if (!bi) {
1277f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1278f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1279f5400b7aSAlex Elder 		}
1280f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1281f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1282f7760dadSAlex Elder 		if (!bio)
1283f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1284f7760dadSAlex Elder 
1285f7760dadSAlex Elder 		*end = bio;
1286f7760dadSAlex Elder 		end = &bio->bi_next;
1287f7760dadSAlex Elder 
1288f7760dadSAlex Elder 		off += bi_size;
1289f7760dadSAlex Elder 		if (off == bi->bi_size) {
1290f7760dadSAlex Elder 			bi = bi->bi_next;
1291f7760dadSAlex Elder 			off = 0;
1292f7760dadSAlex Elder 		}
1293f7760dadSAlex Elder 		len -= bi_size;
1294f7760dadSAlex Elder 	}
1295f7760dadSAlex Elder 	*bio_src = bi;
1296f7760dadSAlex Elder 	*offset = off;
1297f7760dadSAlex Elder 
1298f7760dadSAlex Elder 	return chain;
1299f7760dadSAlex Elder out_err:
1300f7760dadSAlex Elder 	bio_chain_put(chain);
1301f7760dadSAlex Elder 
1302602adf40SYehuda Sadeh 	return NULL;
1303602adf40SYehuda Sadeh }
1304602adf40SYehuda Sadeh 
1305926f9b3fSAlex Elder /*
1306926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1307926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1308926f9b3fSAlex Elder  * again.
1309926f9b3fSAlex Elder  */
13106365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13116365d33aSAlex Elder {
13126365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13136365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13146365d33aSAlex Elder 
131557acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13166365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
13176365d33aSAlex Elder 			obj_request);
13186365d33aSAlex Elder 	}
13196365d33aSAlex Elder }
13206365d33aSAlex Elder 
13216365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
13226365d33aSAlex Elder {
13236365d33aSAlex Elder 	smp_mb();
13246365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
13256365d33aSAlex Elder }
13266365d33aSAlex Elder 
132757acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
132857acbaa7SAlex Elder {
132957acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
133057acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
133157acbaa7SAlex Elder 
133257acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
133357acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
133457acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
133557acbaa7SAlex Elder 			obj_request);
133657acbaa7SAlex Elder 	}
133757acbaa7SAlex Elder }
133857acbaa7SAlex Elder 
133957acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
134057acbaa7SAlex Elder {
134157acbaa7SAlex Elder 	smp_mb();
134257acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
134357acbaa7SAlex Elder }
134457acbaa7SAlex Elder 
13455679c59fSAlex Elder /*
13465679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
13475679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
13485679c59fSAlex Elder  *
13495679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
13505679c59fSAlex Elder  * away again.  It's possible that the response from two existence
13515679c59fSAlex Elder  * checks are separated by the creation of the target object, and
13525679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
13535679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
13545679c59fSAlex Elder  */
13555679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
13565679c59fSAlex Elder 				bool exists)
13575679c59fSAlex Elder {
13585679c59fSAlex Elder 	if (exists)
13595679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
13605679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
13615679c59fSAlex Elder 	smp_mb();
13625679c59fSAlex Elder }
13635679c59fSAlex Elder 
13645679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
13655679c59fSAlex Elder {
13665679c59fSAlex Elder 	smp_mb();
13675679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
13685679c59fSAlex Elder }
13695679c59fSAlex Elder 
13705679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
13715679c59fSAlex Elder {
13725679c59fSAlex Elder 	smp_mb();
13735679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
13745679c59fSAlex Elder }
13755679c59fSAlex Elder 
1376bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1377bf0d5f50SAlex Elder {
137837206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
137937206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1380bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1381bf0d5f50SAlex Elder }
1382bf0d5f50SAlex Elder 
1383bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1384bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1385bf0d5f50SAlex Elder {
1386bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
138737206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
138837206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1389bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1390bf0d5f50SAlex Elder }
1391bf0d5f50SAlex Elder 
1392e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1393e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1394bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1395bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1396bf0d5f50SAlex Elder {
1397bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
139837206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
139937206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1400e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1401e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1402e93f3152SAlex Elder 	else
1403bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1404bf0d5f50SAlex Elder }
1405bf0d5f50SAlex Elder 
1406bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1407bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1408bf0d5f50SAlex Elder {
140925dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
141025dcf954SAlex Elder 
1411b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1412bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
141325dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
14146365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
14156365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1416bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
141725dcf954SAlex Elder 	img_request->obj_request_count++;
141825dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
141937206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
142037206ee5SAlex Elder 		obj_request->which);
1421bf0d5f50SAlex Elder }
1422bf0d5f50SAlex Elder 
1423bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1424bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1425bf0d5f50SAlex Elder {
1426bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
142725dcf954SAlex Elder 
142837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
142937206ee5SAlex Elder 		obj_request->which);
1430bf0d5f50SAlex Elder 	list_del(&obj_request->links);
143125dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
143225dcf954SAlex Elder 	img_request->obj_request_count--;
143325dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
143425dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
14356365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1436bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1437bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
143825dcf954SAlex Elder 	obj_request->callback = NULL;
1439bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1440bf0d5f50SAlex Elder }
1441bf0d5f50SAlex Elder 
1442bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1443bf0d5f50SAlex Elder {
1444bf0d5f50SAlex Elder 	switch (type) {
14459969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1446bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1447788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1448bf0d5f50SAlex Elder 		return true;
1449bf0d5f50SAlex Elder 	default:
1450bf0d5f50SAlex Elder 		return false;
1451bf0d5f50SAlex Elder 	}
1452bf0d5f50SAlex Elder }
1453bf0d5f50SAlex Elder 
1454bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1455bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1456bf0d5f50SAlex Elder {
145737206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
145837206ee5SAlex Elder 
1459bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1460bf0d5f50SAlex Elder }
1461bf0d5f50SAlex Elder 
1462bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1463bf0d5f50SAlex Elder {
146455f27e09SAlex Elder 
146537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
146655f27e09SAlex Elder 
146755f27e09SAlex Elder 	/*
146855f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
146955f27e09SAlex Elder 	 * count for the image request.  We could instead use
147055f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
147155f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
147255f27e09SAlex Elder 	 */
147355f27e09SAlex Elder 	if (!img_request->result) {
147455f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
147555f27e09SAlex Elder 		u64 xferred = 0;
147655f27e09SAlex Elder 
147755f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
147855f27e09SAlex Elder 			xferred += obj_request->xferred;
147955f27e09SAlex Elder 		img_request->xferred = xferred;
148055f27e09SAlex Elder 	}
148155f27e09SAlex Elder 
1482bf0d5f50SAlex Elder 	if (img_request->callback)
1483bf0d5f50SAlex Elder 		img_request->callback(img_request);
1484bf0d5f50SAlex Elder 	else
1485bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1486bf0d5f50SAlex Elder }
1487bf0d5f50SAlex Elder 
1488788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1489788e2df3SAlex Elder 
1490788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1491788e2df3SAlex Elder {
149237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
149337206ee5SAlex Elder 
1494788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1495788e2df3SAlex Elder }
1496788e2df3SAlex Elder 
14970c425248SAlex Elder /*
14980c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
14990c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
15000c425248SAlex Elder  * and currently never change thereafter.
15010c425248SAlex Elder  */
15020c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
15030c425248SAlex Elder {
15040c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
15050c425248SAlex Elder 	smp_mb();
15060c425248SAlex Elder }
15070c425248SAlex Elder 
15080c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
15090c425248SAlex Elder {
15100c425248SAlex Elder 	smp_mb();
15110c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
15120c425248SAlex Elder }
15130c425248SAlex Elder 
15149849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
15159849e986SAlex Elder {
15169849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
15179849e986SAlex Elder 	smp_mb();
15189849e986SAlex Elder }
15199849e986SAlex Elder 
1520e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1521e93f3152SAlex Elder {
1522e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1523e93f3152SAlex Elder 	smp_mb();
1524e93f3152SAlex Elder }
1525e93f3152SAlex Elder 
15269849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
15279849e986SAlex Elder {
15289849e986SAlex Elder 	smp_mb();
15299849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
15309849e986SAlex Elder }
15319849e986SAlex Elder 
1532d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1533d0b2e944SAlex Elder {
1534d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1535d0b2e944SAlex Elder 	smp_mb();
1536d0b2e944SAlex Elder }
1537d0b2e944SAlex Elder 
1538a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1539a2acd00eSAlex Elder {
1540a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1541a2acd00eSAlex Elder 	smp_mb();
1542a2acd00eSAlex Elder }
1543a2acd00eSAlex Elder 
1544d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1545d0b2e944SAlex Elder {
1546d0b2e944SAlex Elder 	smp_mb();
1547d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1548d0b2e944SAlex Elder }
1549d0b2e944SAlex Elder 
15506e2a4505SAlex Elder static void
15516e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
15526e2a4505SAlex Elder {
1553b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1554b9434c5bSAlex Elder 	u64 length = obj_request->length;
1555b9434c5bSAlex Elder 
15566e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15576e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1558b9434c5bSAlex Elder 		xferred, length);
15596e2a4505SAlex Elder 	/*
156017c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
156117c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
156217c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
156317c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
156417c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
156517c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
15666e2a4505SAlex Elder 	 */
1567b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
15686e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1569b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
15706e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1571b9434c5bSAlex Elder 		else
1572b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
15736e2a4505SAlex Elder 		obj_request->result = 0;
1574b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1575b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1576b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1577b9434c5bSAlex Elder 		else
1578b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
15796e2a4505SAlex Elder 	}
158017c1cc1dSJosh Durgin 	obj_request->xferred = length;
15816e2a4505SAlex Elder 	obj_request_done_set(obj_request);
15826e2a4505SAlex Elder }
15836e2a4505SAlex Elder 
1584bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1585bf0d5f50SAlex Elder {
158637206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
158737206ee5SAlex Elder 		obj_request->callback);
1588bf0d5f50SAlex Elder 	if (obj_request->callback)
1589bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1590788e2df3SAlex Elder 	else
1591788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1592bf0d5f50SAlex Elder }
1593bf0d5f50SAlex Elder 
/* Handler for ops that need no processing beyond being marked done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
159939bf2c5dSAlex Elder 
1600c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1601bf0d5f50SAlex Elder {
160257acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1603a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
160457acbaa7SAlex Elder 	bool layered = false;
160557acbaa7SAlex Elder 
160657acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
160757acbaa7SAlex Elder 		img_request = obj_request->img_request;
160857acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1609a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
161057acbaa7SAlex Elder 	}
16118b3e1a56SAlex Elder 
16128b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16138b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
16148b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1615a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1616a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
16178b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
16188b3e1a56SAlex Elder 	else if (img_request)
16196e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
16206e2a4505SAlex Elder 	else
162107741308SAlex Elder 		obj_request_done_set(obj_request);
1622bf0d5f50SAlex Elder }
1623bf0d5f50SAlex Elder 
1624c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1625bf0d5f50SAlex Elder {
16261b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
16271b83bef2SSage Weil 		obj_request->result, obj_request->length);
16281b83bef2SSage Weil 	/*
16298b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
16308b3e1a56SAlex Elder 	 * it to our originally-requested length.
16311b83bef2SSage Weil 	 */
16321b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
163307741308SAlex Elder 	obj_request_done_set(obj_request);
1634bf0d5f50SAlex Elder }
1635bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat op.  A simple stat call needs
 * nothing beyond being marked done.  More will be done when the
 * stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1645fbfab539SAlex Elder 
1646bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1647bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1648bf0d5f50SAlex Elder {
1649bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1650bf0d5f50SAlex Elder 	u16 opcode;
1651bf0d5f50SAlex Elder 
165237206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1653bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
165457acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
165557acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
165657acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
165757acbaa7SAlex Elder 	} else {
165857acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
165957acbaa7SAlex Elder 	}
1660bf0d5f50SAlex Elder 
16611b83bef2SSage Weil 	if (osd_req->r_result < 0)
16621b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1663bf0d5f50SAlex Elder 
16640eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1665bf0d5f50SAlex Elder 
1666c47f9371SAlex Elder 	/*
1667c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1668c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1669c47f9371SAlex Elder 	 */
16701b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1671c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
167279528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1673bf0d5f50SAlex Elder 	switch (opcode) {
1674bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1675c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1676bf0d5f50SAlex Elder 		break;
1677bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1678c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1679bf0d5f50SAlex Elder 		break;
1680fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1681c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1682fbfab539SAlex Elder 		break;
168336be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1684b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
16859969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1686c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
16879969ebc5SAlex Elder 		break;
1688bf0d5f50SAlex Elder 	default:
1689bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1690bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1691bf0d5f50SAlex Elder 		break;
1692bf0d5f50SAlex Elder 	}
1693bf0d5f50SAlex Elder 
169407741308SAlex Elder 	if (obj_request_done_test(obj_request))
1695bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1696bf0d5f50SAlex Elder }
1697bf0d5f50SAlex Elder 
16989d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1699430c28c3SAlex Elder {
1700430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17018c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17029d4df01fSAlex Elder 	u64 snap_id;
1703430c28c3SAlex Elder 
17048c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1705430c28c3SAlex Elder 
17069d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
17078c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17089d4df01fSAlex Elder 			NULL, snap_id, NULL);
17099d4df01fSAlex Elder }
17109d4df01fSAlex Elder 
17119d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
17129d4df01fSAlex Elder {
17139d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17149d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17159d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
17169d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
17179d4df01fSAlex Elder 
17189d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
17199d4df01fSAlex Elder 
17209d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
17219d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17229d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1723430c28c3SAlex Elder }
1724430c28c3SAlex Elder 
1725bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1726bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1727bf0d5f50SAlex Elder 					bool write_request,
1728430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1729bf0d5f50SAlex Elder {
1730bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1731bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1732bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1733bf0d5f50SAlex Elder 
17346365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
17356365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
17366365d33aSAlex Elder 
17370c425248SAlex Elder 		rbd_assert(write_request ==
17380c425248SAlex Elder 				img_request_write_test(img_request));
17390c425248SAlex Elder 		if (write_request)
1740bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1741bf0d5f50SAlex Elder 	}
1742bf0d5f50SAlex Elder 
1743bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1744bf0d5f50SAlex Elder 
1745bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1746bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1747bf0d5f50SAlex Elder 	if (!osd_req)
1748bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1749bf0d5f50SAlex Elder 
1750430c28c3SAlex Elder 	if (write_request)
1751bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1752430c28c3SAlex Elder 	else
1753bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1754bf0d5f50SAlex Elder 
1755bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1756bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1757bf0d5f50SAlex Elder 
1758bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1759bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1760bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1761bf0d5f50SAlex Elder 
1762bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1763bf0d5f50SAlex Elder 
1764bf0d5f50SAlex Elder 	return osd_req;
1765bf0d5f50SAlex Elder }
1766bf0d5f50SAlex Elder 
17670eefd470SAlex Elder /*
17680eefd470SAlex Elder  * Create a copyup osd request based on the information in the
17690eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
17700eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
17710eefd470SAlex Elder  */
17720eefd470SAlex Elder static struct ceph_osd_request *
17730eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
17740eefd470SAlex Elder {
17750eefd470SAlex Elder 	struct rbd_img_request *img_request;
17760eefd470SAlex Elder 	struct ceph_snap_context *snapc;
17770eefd470SAlex Elder 	struct rbd_device *rbd_dev;
17780eefd470SAlex Elder 	struct ceph_osd_client *osdc;
17790eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
17800eefd470SAlex Elder 
17810eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17820eefd470SAlex Elder 	img_request = obj_request->img_request;
17830eefd470SAlex Elder 	rbd_assert(img_request);
17840eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
17850eefd470SAlex Elder 
17860eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
17870eefd470SAlex Elder 
17880eefd470SAlex Elder 	snapc = img_request->snapc;
17890eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
17900eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
17910eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
17920eefd470SAlex Elder 	if (!osd_req)
17930eefd470SAlex Elder 		return NULL;	/* ENOMEM */
17940eefd470SAlex Elder 
17950eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
17960eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
17970eefd470SAlex Elder 	osd_req->r_priv = obj_request;
17980eefd470SAlex Elder 
17990eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
18000eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
18010eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
18020eefd470SAlex Elder 
18030eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
18040eefd470SAlex Elder 
18050eefd470SAlex Elder 	return osd_req;
18060eefd470SAlex Elder }
18070eefd470SAlex Elder 
18080eefd470SAlex Elder 
/* Release an osd request by dropping a reference on it */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1813bf0d5f50SAlex Elder 
1814bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1815bf0d5f50SAlex Elder 
1816bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1817bf0d5f50SAlex Elder 						u64 offset, u64 length,
1818bf0d5f50SAlex Elder 						enum obj_request_type type)
1819bf0d5f50SAlex Elder {
1820bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1821bf0d5f50SAlex Elder 	size_t size;
1822bf0d5f50SAlex Elder 	char *name;
1823bf0d5f50SAlex Elder 
1824bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1825bf0d5f50SAlex Elder 
1826bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1827f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1828f907ad55SAlex Elder 	if (!name)
1829bf0d5f50SAlex Elder 		return NULL;
1830bf0d5f50SAlex Elder 
1831868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1832f907ad55SAlex Elder 	if (!obj_request) {
1833f907ad55SAlex Elder 		kfree(name);
1834f907ad55SAlex Elder 		return NULL;
1835f907ad55SAlex Elder 	}
1836f907ad55SAlex Elder 
1837bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1838bf0d5f50SAlex Elder 	obj_request->offset = offset;
1839bf0d5f50SAlex Elder 	obj_request->length = length;
1840926f9b3fSAlex Elder 	obj_request->flags = 0;
1841bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1842bf0d5f50SAlex Elder 	obj_request->type = type;
1843bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1844788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1845bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1846bf0d5f50SAlex Elder 
184737206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
184837206ee5SAlex Elder 		offset, length, (int)type, obj_request);
184937206ee5SAlex Elder 
1850bf0d5f50SAlex Elder 	return obj_request;
1851bf0d5f50SAlex Elder }
1852bf0d5f50SAlex Elder 
1853bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1854bf0d5f50SAlex Elder {
1855bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1856bf0d5f50SAlex Elder 
1857bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1858bf0d5f50SAlex Elder 
185937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
186037206ee5SAlex Elder 
1861bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1862bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1863bf0d5f50SAlex Elder 
1864bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1865bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1866bf0d5f50SAlex Elder 
1867bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1868bf0d5f50SAlex Elder 	switch (obj_request->type) {
18699969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
18709969ebc5SAlex Elder 		break;		/* Nothing to do */
1871bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1872bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1873bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1874bf0d5f50SAlex Elder 		break;
1875788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1876788e2df3SAlex Elder 		if (obj_request->pages)
1877788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1878788e2df3SAlex Elder 						obj_request->page_count);
1879788e2df3SAlex Elder 		break;
1880bf0d5f50SAlex Elder 	}
1881bf0d5f50SAlex Elder 
1882f907ad55SAlex Elder 	kfree(obj_request->object_name);
1883868311b1SAlex Elder 	obj_request->object_name = NULL;
1884868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1885bf0d5f50SAlex Elder }
1886bf0d5f50SAlex Elder 
1887fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1888fb65d228SAlex Elder 
1889fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1890fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1891fb65d228SAlex Elder {
1892fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1893fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1894fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1895fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1896fb65d228SAlex Elder }
1897fb65d228SAlex Elder 
1898bf0d5f50SAlex Elder /*
1899a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1900a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1901a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1902a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1903a2acd00eSAlex Elder  */
1904a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1905a2acd00eSAlex Elder {
1906a2acd00eSAlex Elder 	int counter;
1907a2acd00eSAlex Elder 
1908a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1909a2acd00eSAlex Elder 		return;
1910a2acd00eSAlex Elder 
1911a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1912a2acd00eSAlex Elder 	if (counter > 0)
1913a2acd00eSAlex Elder 		return;
1914a2acd00eSAlex Elder 
1915a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1916a2acd00eSAlex Elder 
1917a2acd00eSAlex Elder 	if (!counter)
1918a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1919a2acd00eSAlex Elder 	else
1920a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference underflow\n");
1921a2acd00eSAlex Elder }
1922a2acd00eSAlex Elder 
1923a2acd00eSAlex Elder /*
1924a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1925a2acd00eSAlex Elder  * parent.
1926a2acd00eSAlex Elder  *
1927392a9dadSAlex Elder  * We must get the reference before checking for the overlap to
1928392a9dadSAlex Elder  * coordinate properly with zeroing the parent overlap in
1929392a9dadSAlex Elder  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1930392a9dadSAlex Elder  * drop it again if there is no overlap.
1931392a9dadSAlex Elder  *
1932a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1933a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1934a2acd00eSAlex Elder  * false otherwise.
1935a2acd00eSAlex Elder  */
1936a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1937a2acd00eSAlex Elder {
1938a2acd00eSAlex Elder 	int counter;
1939a2acd00eSAlex Elder 
1940a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1941a2acd00eSAlex Elder 		return false;
1942a2acd00eSAlex Elder 
1943a2acd00eSAlex Elder 	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1944a2acd00eSAlex Elder 	if (counter > 0 && rbd_dev->parent_overlap)
1945a2acd00eSAlex Elder 		return true;
1946a2acd00eSAlex Elder 
1947a2acd00eSAlex Elder 	/* Image was flattened, but parent is not yet torn down */
1948a2acd00eSAlex Elder 
1949a2acd00eSAlex Elder 	if (counter < 0)
1950a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference overflow\n");
1951a2acd00eSAlex Elder 
1952a2acd00eSAlex Elder 	return false;
1953a2acd00eSAlex Elder }
1954a2acd00eSAlex Elder 
1955bf0d5f50SAlex Elder /*
1956bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1957bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1958bf0d5f50SAlex Elder  * (if there is one).
1959bf0d5f50SAlex Elder  */
1960cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1961cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1962bf0d5f50SAlex Elder 					u64 offset, u64 length,
1963e93f3152SAlex Elder 					bool write_request)
1964bf0d5f50SAlex Elder {
1965bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1966bf0d5f50SAlex Elder 
19671c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1968bf0d5f50SAlex Elder 	if (!img_request)
1969bf0d5f50SAlex Elder 		return NULL;
1970bf0d5f50SAlex Elder 
1971bf0d5f50SAlex Elder 	if (write_request) {
1972bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1973812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1974bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1975bf0d5f50SAlex Elder 	}
1976bf0d5f50SAlex Elder 
1977bf0d5f50SAlex Elder 	img_request->rq = NULL;
1978bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1979bf0d5f50SAlex Elder 	img_request->offset = offset;
1980bf0d5f50SAlex Elder 	img_request->length = length;
19810c425248SAlex Elder 	img_request->flags = 0;
19820c425248SAlex Elder 	if (write_request) {
19830c425248SAlex Elder 		img_request_write_set(img_request);
1984468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
19850c425248SAlex Elder 	} else {
1986bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
19870c425248SAlex Elder 	}
1988a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1989d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1990bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1991bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1992bf0d5f50SAlex Elder 	img_request->callback = NULL;
1993a5a337d4SAlex Elder 	img_request->result = 0;
1994bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1995bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1996bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1997bf0d5f50SAlex Elder 
199837206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
199937206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
200037206ee5SAlex Elder 		img_request);
200137206ee5SAlex Elder 
2002bf0d5f50SAlex Elder 	return img_request;
2003bf0d5f50SAlex Elder }
2004bf0d5f50SAlex Elder 
2005bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2006bf0d5f50SAlex Elder {
2007bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2008bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2009bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2010bf0d5f50SAlex Elder 
2011bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2012bf0d5f50SAlex Elder 
201337206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
201437206ee5SAlex Elder 
2015bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2016bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
201725dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2018bf0d5f50SAlex Elder 
2019a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2020a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2021a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2022a2acd00eSAlex Elder 	}
2023a2acd00eSAlex Elder 
20240c425248SAlex Elder 	if (img_request_write_test(img_request))
2025812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2026bf0d5f50SAlex Elder 
20271c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2028bf0d5f50SAlex Elder }
2029bf0d5f50SAlex Elder 
2030e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2031e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2032e93f3152SAlex Elder 					u64 img_offset, u64 length)
2033e93f3152SAlex Elder {
2034e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2035e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2036e93f3152SAlex Elder 
2037e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2038e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2039e93f3152SAlex Elder 
2040e93f3152SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
2041e93f3152SAlex Elder 						img_offset, length, false);
2042e93f3152SAlex Elder 	if (!parent_request)
2043e93f3152SAlex Elder 		return NULL;
2044e93f3152SAlex Elder 
2045e93f3152SAlex Elder 	img_request_child_set(parent_request);
2046e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2047e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2048e93f3152SAlex Elder 
2049e93f3152SAlex Elder 	return parent_request;
2050e93f3152SAlex Elder }
2051e93f3152SAlex Elder 
2052e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2053e93f3152SAlex Elder {
2054e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2055e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2056e93f3152SAlex Elder 
2057e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2058e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2059e93f3152SAlex Elder 
2060e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2061e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2062e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2063e93f3152SAlex Elder 
2064e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2065e93f3152SAlex Elder }
2066e93f3152SAlex Elder 
20671217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
20681217857fSAlex Elder {
20696365d33aSAlex Elder 	struct rbd_img_request *img_request;
20701217857fSAlex Elder 	unsigned int xferred;
20711217857fSAlex Elder 	int result;
20728b3e1a56SAlex Elder 	bool more;
20731217857fSAlex Elder 
20746365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20756365d33aSAlex Elder 	img_request = obj_request->img_request;
20766365d33aSAlex Elder 
20771217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
20781217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
20791217857fSAlex Elder 	result = obj_request->result;
20801217857fSAlex Elder 	if (result) {
20811217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
20821217857fSAlex Elder 
20831217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
20841217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
20851217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
20861217857fSAlex Elder 			obj_request->offset);
20871217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
20881217857fSAlex Elder 			result, xferred);
20891217857fSAlex Elder 		if (!img_request->result)
20901217857fSAlex Elder 			img_request->result = result;
20911217857fSAlex Elder 	}
20921217857fSAlex Elder 
2093f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2094f1a4739fSAlex Elder 
2095f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2096f1a4739fSAlex Elder 		obj_request->pages = NULL;
2097f1a4739fSAlex Elder 		obj_request->page_count = 0;
2098f1a4739fSAlex Elder 	}
2099f1a4739fSAlex Elder 
21008b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
21018b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
21028b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
21038b3e1a56SAlex Elder 	} else {
21048b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
21058b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
21068b3e1a56SAlex Elder 	}
21078b3e1a56SAlex Elder 
21088b3e1a56SAlex Elder 	return more;
21091217857fSAlex Elder }
21101217857fSAlex Elder 
21112169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
21122169238dSAlex Elder {
21132169238dSAlex Elder 	struct rbd_img_request *img_request;
21142169238dSAlex Elder 	u32 which = obj_request->which;
21152169238dSAlex Elder 	bool more = true;
21162169238dSAlex Elder 
21176365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21182169238dSAlex Elder 	img_request = obj_request->img_request;
21192169238dSAlex Elder 
21202169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
21212169238dSAlex Elder 	rbd_assert(img_request != NULL);
21222169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
21232169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
21242169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
21252169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
21262169238dSAlex Elder 
21272169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
21282169238dSAlex Elder 	if (which != img_request->next_completion)
21292169238dSAlex Elder 		goto out;
21302169238dSAlex Elder 
21312169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
21322169238dSAlex Elder 		rbd_assert(more);
21332169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
21342169238dSAlex Elder 
21352169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
21362169238dSAlex Elder 			break;
21371217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
21382169238dSAlex Elder 		which++;
21392169238dSAlex Elder 	}
21402169238dSAlex Elder 
21412169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
21422169238dSAlex Elder 	img_request->next_completion = which;
21432169238dSAlex Elder out:
21442169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
21452169238dSAlex Elder 
21462169238dSAlex Elder 	if (!more)
21472169238dSAlex Elder 		rbd_img_request_complete(img_request);
21482169238dSAlex Elder }
21492169238dSAlex Elder 
2150f1a4739fSAlex Elder /*
2151f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2152f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2153f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2154f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2155f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2156f1a4739fSAlex Elder  * all data described by the image request.
2157f1a4739fSAlex Elder  */
2158f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2159f1a4739fSAlex Elder 					enum obj_request_type type,
2160f1a4739fSAlex Elder 					void *data_desc)
2161bf0d5f50SAlex Elder {
2162bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2163bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2164bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
21650c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2166a158073cSJingoo Han 	struct bio *bio_list = NULL;
2167f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2168a158073cSJingoo Han 	struct page **pages = NULL;
21697da22d29SAlex Elder 	u64 img_offset;
2170bf0d5f50SAlex Elder 	u64 resid;
2171bf0d5f50SAlex Elder 	u16 opcode;
2172bf0d5f50SAlex Elder 
2173f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2174f1a4739fSAlex Elder 		(int)type, data_desc);
217537206ee5SAlex Elder 
2176430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
21777da22d29SAlex Elder 	img_offset = img_request->offset;
2178bf0d5f50SAlex Elder 	resid = img_request->length;
21794dda41d3SAlex Elder 	rbd_assert(resid > 0);
2180f1a4739fSAlex Elder 
2181f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2182f1a4739fSAlex Elder 		bio_list = data_desc;
2183f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2184f1a4739fSAlex Elder 	} else {
2185f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2186f1a4739fSAlex Elder 		pages = data_desc;
2187f1a4739fSAlex Elder 	}
2188f1a4739fSAlex Elder 
2189bf0d5f50SAlex Elder 	while (resid) {
21902fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2191bf0d5f50SAlex Elder 		const char *object_name;
2192bf0d5f50SAlex Elder 		u64 offset;
2193bf0d5f50SAlex Elder 		u64 length;
2194bf0d5f50SAlex Elder 
21957da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2196bf0d5f50SAlex Elder 		if (!object_name)
2197bf0d5f50SAlex Elder 			goto out_unwind;
21987da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
21997da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2200bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2201f1a4739fSAlex Elder 						offset, length, type);
220278c2a44aSAlex Elder 		/* object request has its own copy of the object name */
220378c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2204bf0d5f50SAlex Elder 		if (!obj_request)
2205bf0d5f50SAlex Elder 			goto out_unwind;
220603507db6SJosh Durgin 		/*
220703507db6SJosh Durgin 		 * set obj_request->img_request before creating the
220803507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
220903507db6SJosh Durgin 		 */
221003507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2211bf0d5f50SAlex Elder 
2212f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2213f1a4739fSAlex Elder 			unsigned int clone_size;
2214f1a4739fSAlex Elder 
2215bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2216bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2217f1a4739fSAlex Elder 			obj_request->bio_list =
2218f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2219f1a4739fSAlex Elder 								&bio_offset,
2220f1a4739fSAlex Elder 								clone_size,
2221bf0d5f50SAlex Elder 								GFP_ATOMIC);
2222bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
2223bf0d5f50SAlex Elder 				goto out_partial;
2224f1a4739fSAlex Elder 		} else {
2225f1a4739fSAlex Elder 			unsigned int page_count;
2226f1a4739fSAlex Elder 
2227f1a4739fSAlex Elder 			obj_request->pages = pages;
2228f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2229f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2230f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2231f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2232f1a4739fSAlex Elder 			pages += page_count;
2233f1a4739fSAlex Elder 		}
2234bf0d5f50SAlex Elder 
22352fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
22362fa12320SAlex Elder 						obj_request);
22372fa12320SAlex Elder 		if (!osd_req)
2238bf0d5f50SAlex Elder 			goto out_partial;
22392fa12320SAlex Elder 		obj_request->osd_req = osd_req;
22402169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2241430c28c3SAlex Elder 
22422fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
22432fa12320SAlex Elder 						0, 0);
2244f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
2245406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
2246f1a4739fSAlex Elder 					obj_request->bio_list, length);
2247f1a4739fSAlex Elder 		else
2248f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
2249f1a4739fSAlex Elder 					obj_request->pages, length,
2250f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
22519d4df01fSAlex Elder 
22529d4df01fSAlex Elder 		if (write_request)
22539d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
22549d4df01fSAlex Elder 		else
22559d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2256430c28c3SAlex Elder 
22577da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2258bf0d5f50SAlex Elder 
22597da22d29SAlex Elder 		img_offset += length;
2260bf0d5f50SAlex Elder 		resid -= length;
2261bf0d5f50SAlex Elder 	}
2262bf0d5f50SAlex Elder 
2263bf0d5f50SAlex Elder 	return 0;
2264bf0d5f50SAlex Elder 
2265bf0d5f50SAlex Elder out_partial:
2266bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
2267bf0d5f50SAlex Elder out_unwind:
2268bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2269bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2270bf0d5f50SAlex Elder 
2271bf0d5f50SAlex Elder 	return -ENOMEM;
2272bf0d5f50SAlex Elder }
2273bf0d5f50SAlex Elder 
22743d7efd18SAlex Elder static void
22750eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
22760eefd470SAlex Elder {
22770eefd470SAlex Elder 	struct rbd_img_request *img_request;
22780eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2279ebda6408SAlex Elder 	struct page **pages;
22800eefd470SAlex Elder 	u32 page_count;
22810eefd470SAlex Elder 
22820eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
22830eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22840eefd470SAlex Elder 	img_request = obj_request->img_request;
22850eefd470SAlex Elder 	rbd_assert(img_request);
22860eefd470SAlex Elder 
22870eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
22880eefd470SAlex Elder 	rbd_assert(rbd_dev);
22890eefd470SAlex Elder 
2290ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2291ebda6408SAlex Elder 	rbd_assert(pages != NULL);
22920eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2293ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2294ebda6408SAlex Elder 	rbd_assert(page_count);
2295ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2296ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
22970eefd470SAlex Elder 
22980eefd470SAlex Elder 	/*
22990eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
23000eefd470SAlex Elder 	 * original write request.  There is no such thing as a
23010eefd470SAlex Elder 	 * successful short write, so if the request was successful
23020eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
23030eefd470SAlex Elder 	 */
23040eefd470SAlex Elder 	if (!obj_request->result)
23050eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
23060eefd470SAlex Elder 
23070eefd470SAlex Elder 	/* Finish up with the normal image object callback */
23080eefd470SAlex Elder 
23090eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
23100eefd470SAlex Elder }
23110eefd470SAlex Elder 
23120eefd470SAlex Elder static void
23133d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
23143d7efd18SAlex Elder {
23153d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
23160eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
23170eefd470SAlex Elder 	struct ceph_osd_client *osdc;
23180eefd470SAlex Elder 	struct rbd_device *rbd_dev;
23193d7efd18SAlex Elder 	struct page **pages;
2320ebda6408SAlex Elder 	u32 page_count;
2321bbea1c1aSAlex Elder 	int img_result;
2322ebda6408SAlex Elder 	u64 parent_length;
2323b91f09f1SAlex Elder 	u64 offset;
2324b91f09f1SAlex Elder 	u64 length;
23253d7efd18SAlex Elder 
23263d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
23273d7efd18SAlex Elder 
23283d7efd18SAlex Elder 	/* First get what we need from the image request */
23293d7efd18SAlex Elder 
23303d7efd18SAlex Elder 	pages = img_request->copyup_pages;
23313d7efd18SAlex Elder 	rbd_assert(pages != NULL);
23323d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2333ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2334ebda6408SAlex Elder 	rbd_assert(page_count);
2335ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
23363d7efd18SAlex Elder 
23373d7efd18SAlex Elder 	orig_request = img_request->obj_request;
23383d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2339b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2340bbea1c1aSAlex Elder 	img_result = img_request->result;
2341ebda6408SAlex Elder 	parent_length = img_request->length;
2342ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
23433d7efd18SAlex Elder 	rbd_img_request_put(img_request);
23443d7efd18SAlex Elder 
234591c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
234691c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
23473d7efd18SAlex Elder 	rbd_assert(rbd_dev);
23483d7efd18SAlex Elder 
2349bbea1c1aSAlex Elder 	/*
2350bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2351bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2352bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2353bbea1c1aSAlex Elder 	 */
2354bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2355bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2356bbea1c1aSAlex Elder 
2357bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2358bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2359bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2360bbea1c1aSAlex Elder 		if (!img_result)
2361bbea1c1aSAlex Elder 			return;
2362bbea1c1aSAlex Elder 	}
2363bbea1c1aSAlex Elder 
2364bbea1c1aSAlex Elder 	if (img_result)
23650eefd470SAlex Elder 		goto out_err;
23663d7efd18SAlex Elder 
23678785b1d4SAlex Elder 	/*
23688785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
23698785b1d4SAlex Elder 	 * We need a new one that can hold the two ops in a copyup
23708785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
23718785b1d4SAlex Elder 	 * original request, and release the old one.
23728785b1d4SAlex Elder 	 */
2373bbea1c1aSAlex Elder 	img_result = -ENOMEM;
23740eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
23750eefd470SAlex Elder 	if (!osd_req)
23760eefd470SAlex Elder 		goto out_err;
23778785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
23780eefd470SAlex Elder 	orig_request->osd_req = osd_req;
23790eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2380ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
23813d7efd18SAlex Elder 
23820eefd470SAlex Elder 	/* Initialize the copyup op */
23830eefd470SAlex Elder 
23840eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2385ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
23860eefd470SAlex Elder 						false, false);
23870eefd470SAlex Elder 
23880eefd470SAlex Elder 	/* Then the original write request op */
23890eefd470SAlex Elder 
2390b91f09f1SAlex Elder 	offset = orig_request->offset;
2391b91f09f1SAlex Elder 	length = orig_request->length;
23920eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2393b91f09f1SAlex Elder 					offset, length, 0, 0);
2394b91f09f1SAlex Elder 	if (orig_request->type == OBJ_REQUEST_BIO)
2395b91f09f1SAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 1,
2396b91f09f1SAlex Elder 					orig_request->bio_list, length);
2397b91f09f1SAlex Elder 	else
2398b91f09f1SAlex Elder 		osd_req_op_extent_osd_data_pages(osd_req, 1,
2399b91f09f1SAlex Elder 					orig_request->pages, length,
2400b91f09f1SAlex Elder 					offset & ~PAGE_MASK, false, false);
24010eefd470SAlex Elder 
24020eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
24030eefd470SAlex Elder 
24040eefd470SAlex Elder 	/* All set, send it off. */
24050eefd470SAlex Elder 
24060eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
24070eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2408bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2409bbea1c1aSAlex Elder 	if (!img_result)
24100eefd470SAlex Elder 		return;
24110eefd470SAlex Elder out_err:
24120eefd470SAlex Elder 	/* Record the error code and complete the request */
24130eefd470SAlex Elder 
2414bbea1c1aSAlex Elder 	orig_request->result = img_result;
24150eefd470SAlex Elder 	orig_request->xferred = 0;
24163d7efd18SAlex Elder 	obj_request_done_set(orig_request);
24173d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
24183d7efd18SAlex Elder }
24193d7efd18SAlex Elder 
24203d7efd18SAlex Elder /*
24213d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
24223d7efd18SAlex Elder  * entire target of the given object request.  This is used for
24233d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
24243d7efd18SAlex Elder  * object request from the image request does not exist.
24253d7efd18SAlex Elder  *
24263d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
24273d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
24283d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
24293d7efd18SAlex Elder  * the original object request for the copyup operation.
24303d7efd18SAlex Elder  *
24313d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
24323d7efd18SAlex Elder  * object request and mark it done so it gets completed.
24333d7efd18SAlex Elder  */
24343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
24353d7efd18SAlex Elder {
24363d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
24373d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
24383d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
24393d7efd18SAlex Elder 	u64 img_offset;
24403d7efd18SAlex Elder 	u64 length;
24413d7efd18SAlex Elder 	struct page **pages = NULL;
24423d7efd18SAlex Elder 	u32 page_count;
24433d7efd18SAlex Elder 	int result;
24443d7efd18SAlex Elder 
24453d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2446b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
24473d7efd18SAlex Elder 
24483d7efd18SAlex Elder 	img_request = obj_request->img_request;
24493d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
24503d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
24513d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
24523d7efd18SAlex Elder 
24533d7efd18SAlex Elder 	/*
24543d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
24553d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
24563d7efd18SAlex Elder 	 */
24573d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
24583d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
24593d7efd18SAlex Elder 
24603d7efd18SAlex Elder 	/*
2461a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2462a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2463a9e8ba2cSAlex Elder 	 * necessary.
2464a9e8ba2cSAlex Elder 	 */
2465a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2466a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2467a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2468a9e8ba2cSAlex Elder 	}
2469a9e8ba2cSAlex Elder 
2470a9e8ba2cSAlex Elder 	/*
24713d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
24723d7efd18SAlex Elder 	 * from the parent.
24733d7efd18SAlex Elder 	 */
24743d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
24753d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
24763d7efd18SAlex Elder 	if (IS_ERR(pages)) {
24773d7efd18SAlex Elder 		result = PTR_ERR(pages);
24783d7efd18SAlex Elder 		pages = NULL;
24793d7efd18SAlex Elder 		goto out_err;
24803d7efd18SAlex Elder 	}
24813d7efd18SAlex Elder 
24823d7efd18SAlex Elder 	result = -ENOMEM;
2483e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2484e93f3152SAlex Elder 						img_offset, length);
24853d7efd18SAlex Elder 	if (!parent_request)
24863d7efd18SAlex Elder 		goto out_err;
24873d7efd18SAlex Elder 
24883d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
24893d7efd18SAlex Elder 	if (result)
24903d7efd18SAlex Elder 		goto out_err;
24913d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2492ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
24933d7efd18SAlex Elder 
24943d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
24953d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
24963d7efd18SAlex Elder 	if (!result)
24973d7efd18SAlex Elder 		return 0;
24983d7efd18SAlex Elder 
24993d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2500ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
25013d7efd18SAlex Elder 	parent_request->obj_request = NULL;
25023d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
25033d7efd18SAlex Elder out_err:
25043d7efd18SAlex Elder 	if (pages)
25053d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
25063d7efd18SAlex Elder 	if (parent_request)
25073d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
25083d7efd18SAlex Elder 	obj_request->result = result;
25093d7efd18SAlex Elder 	obj_request->xferred = 0;
25103d7efd18SAlex Elder 	obj_request_done_set(obj_request);
25113d7efd18SAlex Elder 
25123d7efd18SAlex Elder 	return result;
25133d7efd18SAlex Elder }
25143d7efd18SAlex Elder 
2515c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2516c5b5ef6cSAlex Elder {
2517c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2518638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2519c5b5ef6cSAlex Elder 	int result;
2520c5b5ef6cSAlex Elder 
2521c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2522c5b5ef6cSAlex Elder 
2523c5b5ef6cSAlex Elder 	/*
2524c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2525c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2526c5b5ef6cSAlex Elder 	 * we're done with the request.
2527c5b5ef6cSAlex Elder 	 */
2528c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2529c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2530912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2531c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2532c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2533c5b5ef6cSAlex Elder 
2534c5b5ef6cSAlex Elder 	result = obj_request->result;
2535c5b5ef6cSAlex Elder 	obj_request->result = 0;
2536c5b5ef6cSAlex Elder 
2537c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2538c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2539c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2540c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2541c5b5ef6cSAlex Elder 
2542638f5abeSAlex Elder 	/*
2543638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2544638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2545638f5abeSAlex Elder 	 * and re-submit the original write request.
2546638f5abeSAlex Elder 	 */
2547638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2548638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2549638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2550638f5abeSAlex Elder 
2551638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2552638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2553638f5abeSAlex Elder 		if (!result)
2554638f5abeSAlex Elder 			return;
2555638f5abeSAlex Elder 	}
2556c5b5ef6cSAlex Elder 
2557c5b5ef6cSAlex Elder 	/*
2558c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2559c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2560c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2561c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2562c5b5ef6cSAlex Elder 	 */
2563c5b5ef6cSAlex Elder 	if (!result) {
2564c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2565c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2566c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2567c5b5ef6cSAlex Elder 	} else if (result) {
2568c5b5ef6cSAlex Elder 		orig_request->result = result;
25693d7efd18SAlex Elder 		goto out;
2570c5b5ef6cSAlex Elder 	}
2571c5b5ef6cSAlex Elder 
2572c5b5ef6cSAlex Elder 	/*
2573c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2574c5b5ef6cSAlex Elder 	 * whether the target object exists.
2575c5b5ef6cSAlex Elder 	 */
2576b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
25773d7efd18SAlex Elder out:
2578c5b5ef6cSAlex Elder 	if (orig_request->result)
2579c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2580c5b5ef6cSAlex Elder }
2581c5b5ef6cSAlex Elder 
2582c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2583c5b5ef6cSAlex Elder {
2584c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2585c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2586c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2587c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2588c5b5ef6cSAlex Elder 	u32 page_count;
2589c5b5ef6cSAlex Elder 	size_t size;
2590c5b5ef6cSAlex Elder 	int ret;
2591c5b5ef6cSAlex Elder 
2592c5b5ef6cSAlex Elder 	/*
2593c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2594c5b5ef6cSAlex Elder 	 *     le64 length;
2595c5b5ef6cSAlex Elder 	 *     struct {
2596c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2597c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2598c5b5ef6cSAlex Elder 	 *     } mtime;
2599c5b5ef6cSAlex Elder 	 */
2600c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2601c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2602c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2603c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2604c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2605c5b5ef6cSAlex Elder 
2606c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2607c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2608c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2609c5b5ef6cSAlex Elder 	if (!stat_request)
2610c5b5ef6cSAlex Elder 		goto out;
2611c5b5ef6cSAlex Elder 
2612c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2613c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2614c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2615c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2616c5b5ef6cSAlex Elder 
2617c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2618c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2619c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2620c5b5ef6cSAlex Elder 						stat_request);
2621c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2622c5b5ef6cSAlex Elder 		goto out;
2623c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2624c5b5ef6cSAlex Elder 
2625c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2626c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2627c5b5ef6cSAlex Elder 					false, false);
26289d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2629c5b5ef6cSAlex Elder 
2630c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2631c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2632c5b5ef6cSAlex Elder out:
2633c5b5ef6cSAlex Elder 	if (ret)
2634c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2635c5b5ef6cSAlex Elder 
2636c5b5ef6cSAlex Elder 	return ret;
2637c5b5ef6cSAlex Elder }
2638c5b5ef6cSAlex Elder 
2639b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2640b454e36dSAlex Elder {
2641b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2642a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
26433d7efd18SAlex Elder 	bool known;
2644b454e36dSAlex Elder 
2645b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2646b454e36dSAlex Elder 
2647b454e36dSAlex Elder 	img_request = obj_request->img_request;
2648b454e36dSAlex Elder 	rbd_assert(img_request);
2649a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2650b454e36dSAlex Elder 
2651b454e36dSAlex Elder 	/*
2652a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2653a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2654a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2655a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2656a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2657a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2658a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2659a9e8ba2cSAlex Elder 	 * simple object request.
2660b454e36dSAlex Elder 	 */
2661b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2662b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2663a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
26643d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
26653d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2666b454e36dSAlex Elder 
2667b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2668b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2669b454e36dSAlex Elder 
2670b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2671b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2672b454e36dSAlex Elder 
2673b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2674b454e36dSAlex Elder 	}
2675b454e36dSAlex Elder 
2676b454e36dSAlex Elder 	/*
26773d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
26783d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
26793d7efd18SAlex Elder 	 * start by reading the data for the full target object from
26803d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2681b454e36dSAlex Elder 	 */
26823d7efd18SAlex Elder 	if (known)
26833d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
26843d7efd18SAlex Elder 
26853d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2686b454e36dSAlex Elder 
2687b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2688b454e36dSAlex Elder }
2689b454e36dSAlex Elder 
2690bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2691bf0d5f50SAlex Elder {
2692bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
269346faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2694bf0d5f50SAlex Elder 
269537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
269646faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2697bf0d5f50SAlex Elder 		int ret;
2698bf0d5f50SAlex Elder 
2699b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2700bf0d5f50SAlex Elder 		if (ret)
2701bf0d5f50SAlex Elder 			return ret;
2702bf0d5f50SAlex Elder 	}
2703bf0d5f50SAlex Elder 
2704bf0d5f50SAlex Elder 	return 0;
2705bf0d5f50SAlex Elder }
2706bf0d5f50SAlex Elder 
27078b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
27088b3e1a56SAlex Elder {
27098b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2710a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2711a9e8ba2cSAlex Elder 	u64 obj_end;
271202c74fbaSAlex Elder 	u64 img_xferred;
271302c74fbaSAlex Elder 	int img_result;
27148b3e1a56SAlex Elder 
27158b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
27168b3e1a56SAlex Elder 
271702c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
271802c74fbaSAlex Elder 
27198b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
272002c74fbaSAlex Elder 	img_xferred = img_request->xferred;
272102c74fbaSAlex Elder 	img_result = img_request->result;
272202c74fbaSAlex Elder 	rbd_img_request_put(img_request);
272302c74fbaSAlex Elder 
272402c74fbaSAlex Elder 	/*
272502c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
272602c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
272702c74fbaSAlex Elder 	 * original request.
272802c74fbaSAlex Elder 	 */
2729a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2730a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
273102c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
273202c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
273302c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
27348b3e1a56SAlex Elder 
273502c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
273602c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
273702c74fbaSAlex Elder 		if (!img_result)
273802c74fbaSAlex Elder 			return;
273902c74fbaSAlex Elder 	}
274002c74fbaSAlex Elder 
274102c74fbaSAlex Elder 	obj_request->result = img_result;
2742a9e8ba2cSAlex Elder 	if (obj_request->result)
2743a9e8ba2cSAlex Elder 		goto out;
2744a9e8ba2cSAlex Elder 
2745a9e8ba2cSAlex Elder 	/*
2746a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2747a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2748a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2749a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2750a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2751a9e8ba2cSAlex Elder 	 */
2752a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2753a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2754a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2755a9e8ba2cSAlex Elder 		u64 xferred = 0;
2756a9e8ba2cSAlex Elder 
2757a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2758a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2759a9e8ba2cSAlex Elder 					obj_request->img_offset;
2760a9e8ba2cSAlex Elder 
276102c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2762a9e8ba2cSAlex Elder 	} else {
276302c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2764a9e8ba2cSAlex Elder 	}
2765a9e8ba2cSAlex Elder out:
27668b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
27678b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
27688b3e1a56SAlex Elder }
27698b3e1a56SAlex Elder 
27708b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
27718b3e1a56SAlex Elder {
27728b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
27738b3e1a56SAlex Elder 	int result;
27748b3e1a56SAlex Elder 
27758b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
27768b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
27778b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
27785b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
27798b3e1a56SAlex Elder 
27808b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2781e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
27828b3e1a56SAlex Elder 						obj_request->img_offset,
2783e93f3152SAlex Elder 						obj_request->length);
27848b3e1a56SAlex Elder 	result = -ENOMEM;
27858b3e1a56SAlex Elder 	if (!img_request)
27868b3e1a56SAlex Elder 		goto out_err;
27878b3e1a56SAlex Elder 
27885b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
2789f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2790f1a4739fSAlex Elder 						obj_request->bio_list);
27915b2ab72dSAlex Elder 	else
27925b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
27935b2ab72dSAlex Elder 						obj_request->pages);
27948b3e1a56SAlex Elder 	if (result)
27958b3e1a56SAlex Elder 		goto out_err;
27968b3e1a56SAlex Elder 
27978b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
27988b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
27998b3e1a56SAlex Elder 	if (result)
28008b3e1a56SAlex Elder 		goto out_err;
28018b3e1a56SAlex Elder 
28028b3e1a56SAlex Elder 	return;
28038b3e1a56SAlex Elder out_err:
28048b3e1a56SAlex Elder 	if (img_request)
28058b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
28068b3e1a56SAlex Elder 	obj_request->result = result;
28078b3e1a56SAlex Elder 	obj_request->xferred = 0;
28088b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
28098b3e1a56SAlex Elder }
28108b3e1a56SAlex Elder 
281120e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2812b8d70035SAlex Elder {
2813b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
28142169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2815b8d70035SAlex Elder 	int ret;
2816b8d70035SAlex Elder 
2817b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2818b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2819b8d70035SAlex Elder 	if (!obj_request)
2820b8d70035SAlex Elder 		return -ENOMEM;
2821b8d70035SAlex Elder 
2822b8d70035SAlex Elder 	ret = -ENOMEM;
2823430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2824b8d70035SAlex Elder 	if (!obj_request->osd_req)
2825b8d70035SAlex Elder 		goto out;
2826b8d70035SAlex Elder 
2827c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2828cc4a38bdSAlex Elder 					notify_id, 0, 0);
28299d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2830430c28c3SAlex Elder 
2831b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2832cf81b60eSAlex Elder 	if (ret)
283320e0af67SJosh Durgin 		goto out;
283420e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
283520e0af67SJosh Durgin out:
2836b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
2837b8d70035SAlex Elder 
2838b8d70035SAlex Elder 	return ret;
2839b8d70035SAlex Elder }
2840b8d70035SAlex Elder 
2841b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2842b8d70035SAlex Elder {
2843b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2844e627db08SAlex Elder 	int ret;
2845b8d70035SAlex Elder 
2846b8d70035SAlex Elder 	if (!rbd_dev)
2847b8d70035SAlex Elder 		return;
2848b8d70035SAlex Elder 
284937206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2850b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2851b8d70035SAlex Elder 		(unsigned int)opcode);
2852e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2853e627db08SAlex Elder 	if (ret)
28543b5cf2a2SAlex Elder 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2855b8d70035SAlex Elder 
285620e0af67SJosh Durgin 	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2857b8d70035SAlex Elder }
2858b8d70035SAlex Elder 
28599969ebc5SAlex Elder /*
28609969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
28619969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
28629969ebc5SAlex Elder  */
28631f3ef788SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
28649969ebc5SAlex Elder {
28659969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
28669969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
28679969ebc5SAlex Elder 	int ret;
28689969ebc5SAlex Elder 
28699969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
28709969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
28719969ebc5SAlex Elder 
28729969ebc5SAlex Elder 	if (start) {
28733c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
28749969ebc5SAlex Elder 						&rbd_dev->watch_event);
28759969ebc5SAlex Elder 		if (ret < 0)
28769969ebc5SAlex Elder 			return ret;
28778eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
28789969ebc5SAlex Elder 	}
28799969ebc5SAlex Elder 
28809969ebc5SAlex Elder 	ret = -ENOMEM;
28819969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
28829969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
28839969ebc5SAlex Elder 	if (!obj_request)
28849969ebc5SAlex Elder 		goto out_cancel;
28859969ebc5SAlex Elder 
2886430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2887430c28c3SAlex Elder 	if (!obj_request->osd_req)
2888430c28c3SAlex Elder 		goto out_cancel;
2889430c28c3SAlex Elder 
28908eb87565SAlex Elder 	if (start)
2891975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
28928eb87565SAlex Elder 	else
28936977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2894975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
28952169238dSAlex Elder 
28962169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
28971f3ef788SAlex Elder 				rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
28989d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
28992169238dSAlex Elder 
29009969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
29019969ebc5SAlex Elder 	if (ret)
29029969ebc5SAlex Elder 		goto out_cancel;
29039969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
29049969ebc5SAlex Elder 	if (ret)
29059969ebc5SAlex Elder 		goto out_cancel;
29069969ebc5SAlex Elder 	ret = obj_request->result;
29079969ebc5SAlex Elder 	if (ret)
29089969ebc5SAlex Elder 		goto out_cancel;
29099969ebc5SAlex Elder 
29108eb87565SAlex Elder 	/*
29118eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
29128eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
29138eb87565SAlex Elder 	 * a pointer to the object request during that time (in
29148eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
29158eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
29168eb87565SAlex Elder 	 * unregistered it.
29178eb87565SAlex Elder 	 */
29188eb87565SAlex Elder 	if (start) {
29198eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
29208eb87565SAlex Elder 
29218eb87565SAlex Elder 		return 0;
29228eb87565SAlex Elder 	}
29238eb87565SAlex Elder 
29248eb87565SAlex Elder 	/* We have successfully torn down the watch request */
29258eb87565SAlex Elder 
29268eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
29278eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
29289969ebc5SAlex Elder out_cancel:
29299969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
29309969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
29319969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
29329969ebc5SAlex Elder 	if (obj_request)
29339969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
29349969ebc5SAlex Elder 
29359969ebc5SAlex Elder 	return ret;
29369969ebc5SAlex Elder }
29379969ebc5SAlex Elder 
293836be9a76SAlex Elder /*
2939f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2940f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
294136be9a76SAlex Elder  */
294236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
294336be9a76SAlex Elder 			     const char *object_name,
294436be9a76SAlex Elder 			     const char *class_name,
294536be9a76SAlex Elder 			     const char *method_name,
29464157976bSAlex Elder 			     const void *outbound,
294736be9a76SAlex Elder 			     size_t outbound_size,
29484157976bSAlex Elder 			     void *inbound,
2949e2a58ee5SAlex Elder 			     size_t inbound_size)
295036be9a76SAlex Elder {
29512169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
295236be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
295336be9a76SAlex Elder 	struct page **pages;
295436be9a76SAlex Elder 	u32 page_count;
295536be9a76SAlex Elder 	int ret;
295636be9a76SAlex Elder 
295736be9a76SAlex Elder 	/*
29586010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
29596010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
29606010a451SAlex Elder 	 * also supply outbound data--parameters for the object
29616010a451SAlex Elder 	 * method.  Currently if this is present it will be a
29626010a451SAlex Elder 	 * snapshot id.
296336be9a76SAlex Elder 	 */
296436be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
296536be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
296636be9a76SAlex Elder 	if (IS_ERR(pages))
296736be9a76SAlex Elder 		return PTR_ERR(pages);
296836be9a76SAlex Elder 
296936be9a76SAlex Elder 	ret = -ENOMEM;
29706010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
297136be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
297236be9a76SAlex Elder 	if (!obj_request)
297336be9a76SAlex Elder 		goto out;
297436be9a76SAlex Elder 
297536be9a76SAlex Elder 	obj_request->pages = pages;
297636be9a76SAlex Elder 	obj_request->page_count = page_count;
297736be9a76SAlex Elder 
2978430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
297936be9a76SAlex Elder 	if (!obj_request->osd_req)
298036be9a76SAlex Elder 		goto out;
298136be9a76SAlex Elder 
2982c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
298304017e29SAlex Elder 					class_name, method_name);
298404017e29SAlex Elder 	if (outbound_size) {
298504017e29SAlex Elder 		struct ceph_pagelist *pagelist;
298604017e29SAlex Elder 
298704017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
298804017e29SAlex Elder 		if (!pagelist)
298904017e29SAlex Elder 			goto out;
299004017e29SAlex Elder 
299104017e29SAlex Elder 		ceph_pagelist_init(pagelist);
299204017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
299304017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
299404017e29SAlex Elder 						pagelist);
299504017e29SAlex Elder 	}
2996a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2997a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
299844cd188dSAlex Elder 					0, false, false);
29999d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3000430c28c3SAlex Elder 
300136be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
300236be9a76SAlex Elder 	if (ret)
300336be9a76SAlex Elder 		goto out;
300436be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
300536be9a76SAlex Elder 	if (ret)
300636be9a76SAlex Elder 		goto out;
300736be9a76SAlex Elder 
300836be9a76SAlex Elder 	ret = obj_request->result;
300936be9a76SAlex Elder 	if (ret < 0)
301036be9a76SAlex Elder 		goto out;
301157385b51SAlex Elder 
301257385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
301357385b51SAlex Elder 	ret = (int)obj_request->xferred;
3014903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
301536be9a76SAlex Elder out:
301636be9a76SAlex Elder 	if (obj_request)
301736be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
301836be9a76SAlex Elder 	else
301936be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
302036be9a76SAlex Elder 
302136be9a76SAlex Elder 	return ret;
302236be9a76SAlex Elder }
302336be9a76SAlex Elder 
3024bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
3025cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
3026bf0d5f50SAlex Elder {
3027bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
3028bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
3029bf0d5f50SAlex Elder 	struct request *rq;
3030bf0d5f50SAlex Elder 	int result;
3031bf0d5f50SAlex Elder 
3032bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
3033bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
3034bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
3035bf0d5f50SAlex Elder 		u64 offset;
3036bf0d5f50SAlex Elder 		u64 length;
3037bf0d5f50SAlex Elder 
3038bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
3039bf0d5f50SAlex Elder 
3040bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
30414dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
30424dda41d3SAlex Elder 				(int) rq->cmd_type);
30434dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
30444dda41d3SAlex Elder 			continue;
30454dda41d3SAlex Elder 		}
30464dda41d3SAlex Elder 
30474dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
30484dda41d3SAlex Elder 
30494dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
30504dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
30514dda41d3SAlex Elder 
30524dda41d3SAlex Elder 		if (!length) {
30534dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
3054bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
3055bf0d5f50SAlex Elder 			continue;
3056bf0d5f50SAlex Elder 		}
3057bf0d5f50SAlex Elder 
3058bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
3059bf0d5f50SAlex Elder 
3060bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
3061bf0d5f50SAlex Elder 
3062bf0d5f50SAlex Elder 		if (write_request) {
3063bf0d5f50SAlex Elder 			result = -EROFS;
3064bf0d5f50SAlex Elder 			if (read_only)
3065bf0d5f50SAlex Elder 				goto end_request;
3066bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3067bf0d5f50SAlex Elder 		}
3068bf0d5f50SAlex Elder 
30696d292906SAlex Elder 		/*
30706d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
30716d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
30726d292906SAlex Elder 		 * have disappeared by the time our request arrives
30736d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
30746d292906SAlex Elder 		 * we already know.
30756d292906SAlex Elder 		 */
30766d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3077bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
3078bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3079bf0d5f50SAlex Elder 			result = -ENXIO;
3080bf0d5f50SAlex Elder 			goto end_request;
3081bf0d5f50SAlex Elder 		}
3082bf0d5f50SAlex Elder 
3083bf0d5f50SAlex Elder 		result = -EINVAL;
3084c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
3085c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3086c0cd10dbSAlex Elder 				offset, length);
3087bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
3088c0cd10dbSAlex Elder 		}
3089bf0d5f50SAlex Elder 
309000a653e2SAlex Elder 		result = -EIO;
309100a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
309200a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
309300a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
309400a653e2SAlex Elder 			goto end_request;
309500a653e2SAlex Elder 		}
309600a653e2SAlex Elder 
3097bf0d5f50SAlex Elder 		result = -ENOMEM;
3098bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
3099e93f3152SAlex Elder 							write_request);
3100bf0d5f50SAlex Elder 		if (!img_request)
3101bf0d5f50SAlex Elder 			goto end_request;
3102bf0d5f50SAlex Elder 
3103bf0d5f50SAlex Elder 		img_request->rq = rq;
3104bf0d5f50SAlex Elder 
3105f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3106f1a4739fSAlex Elder 						rq->bio);
3107bf0d5f50SAlex Elder 		if (!result)
3108bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
3109bf0d5f50SAlex Elder 		if (result)
3110bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
3111bf0d5f50SAlex Elder end_request:
3112bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
3113bf0d5f50SAlex Elder 		if (result < 0) {
31147da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
31157da22d29SAlex Elder 				write_request ? "write" : "read",
31167da22d29SAlex Elder 				length, offset, result);
31177da22d29SAlex Elder 
3118bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
3119bf0d5f50SAlex Elder 		}
3120bf0d5f50SAlex Elder 	}
3121bf0d5f50SAlex Elder }
3122bf0d5f50SAlex Elder 
3123602adf40SYehuda Sadeh /*
3124602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3125602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3126f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3127602adf40SYehuda Sadeh  */
3128602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3129602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3130602adf40SYehuda Sadeh {
3131602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3132e5cfeed2SAlex Elder 	sector_t sector_offset;
3133e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3134e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3135e5cfeed2SAlex Elder 	int ret;
3136602adf40SYehuda Sadeh 
3137e5cfeed2SAlex Elder 	/*
3138e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3139e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3140e5cfeed2SAlex Elder 	 * device.
3141e5cfeed2SAlex Elder 	 */
3142e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3143e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3144e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3145593a9e7bSAlex Elder 
3146e5cfeed2SAlex Elder 	/*
3147e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3148e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3149e5cfeed2SAlex Elder 	 */
3150e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3151e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3152e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3153e5cfeed2SAlex Elder 	else
3154e5cfeed2SAlex Elder 		ret = 0;
3155e5cfeed2SAlex Elder 
3156e5cfeed2SAlex Elder 	/*
3157e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3158e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3159e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3160e5cfeed2SAlex Elder 	 * added to an empty bio."
3161e5cfeed2SAlex Elder 	 */
3162e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3163e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3164e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3165e5cfeed2SAlex Elder 
3166e5cfeed2SAlex Elder 	return ret;
3167602adf40SYehuda Sadeh }
3168602adf40SYehuda Sadeh 
3169602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3170602adf40SYehuda Sadeh {
3171602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3172602adf40SYehuda Sadeh 
3173602adf40SYehuda Sadeh 	if (!disk)
3174602adf40SYehuda Sadeh 		return;
3175602adf40SYehuda Sadeh 
3176a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3177a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3178602adf40SYehuda Sadeh 		del_gendisk(disk);
3179602adf40SYehuda Sadeh 		if (disk->queue)
3180602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
3181a0cab924SAlex Elder 	}
3182602adf40SYehuda Sadeh 	put_disk(disk);
3183602adf40SYehuda Sadeh }
3184602adf40SYehuda Sadeh 
3185788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3186788e2df3SAlex Elder 				const char *object_name,
31877097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3188788e2df3SAlex Elder 
3189788e2df3SAlex Elder {
31902169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3191788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3192788e2df3SAlex Elder 	struct page **pages = NULL;
3193788e2df3SAlex Elder 	u32 page_count;
31941ceae7efSAlex Elder 	size_t size;
3195788e2df3SAlex Elder 	int ret;
3196788e2df3SAlex Elder 
3197788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3198788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3199788e2df3SAlex Elder 	if (IS_ERR(pages))
3200788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3201788e2df3SAlex Elder 
3202788e2df3SAlex Elder 	ret = -ENOMEM;
3203788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3204788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3205788e2df3SAlex Elder 	if (!obj_request)
3206788e2df3SAlex Elder 		goto out;
3207788e2df3SAlex Elder 
3208788e2df3SAlex Elder 	obj_request->pages = pages;
3209788e2df3SAlex Elder 	obj_request->page_count = page_count;
3210788e2df3SAlex Elder 
3211430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3212788e2df3SAlex Elder 	if (!obj_request->osd_req)
3213788e2df3SAlex Elder 		goto out;
3214788e2df3SAlex Elder 
3215c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3216c99d2d4aSAlex Elder 					offset, length, 0, 0);
3217406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3218a4ce40a9SAlex Elder 					obj_request->pages,
321944cd188dSAlex Elder 					obj_request->length,
322044cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
322144cd188dSAlex Elder 					false, false);
32229d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3223430c28c3SAlex Elder 
3224788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3225788e2df3SAlex Elder 	if (ret)
3226788e2df3SAlex Elder 		goto out;
3227788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3228788e2df3SAlex Elder 	if (ret)
3229788e2df3SAlex Elder 		goto out;
3230788e2df3SAlex Elder 
3231788e2df3SAlex Elder 	ret = obj_request->result;
3232788e2df3SAlex Elder 	if (ret < 0)
3233788e2df3SAlex Elder 		goto out;
32341ceae7efSAlex Elder 
32351ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
32361ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3237903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
323823ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
323923ed6e13SAlex Elder 	ret = (int)size;
3240788e2df3SAlex Elder out:
3241788e2df3SAlex Elder 	if (obj_request)
3242788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3243788e2df3SAlex Elder 	else
3244788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3245788e2df3SAlex Elder 
3246788e2df3SAlex Elder 	return ret;
3247788e2df3SAlex Elder }
3248788e2df3SAlex Elder 
3249602adf40SYehuda Sadeh /*
3250662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3251662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3252662518b1SAlex Elder  * information about the image.
32534156d998SAlex Elder  */
325499a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
32554156d998SAlex Elder {
32564156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
32574156d998SAlex Elder 	u32 snap_count = 0;
32584156d998SAlex Elder 	u64 names_size = 0;
32594156d998SAlex Elder 	u32 want_count;
32604156d998SAlex Elder 	int ret;
32614156d998SAlex Elder 
32624156d998SAlex Elder 	/*
32634156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
32644156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
32654156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
32664156d998SAlex Elder 	 * the number of snapshots could change by the time we read
32674156d998SAlex Elder 	 * it in, in which case we re-read it.
32684156d998SAlex Elder 	 */
32694156d998SAlex Elder 	do {
32704156d998SAlex Elder 		size_t size;
32714156d998SAlex Elder 
32724156d998SAlex Elder 		kfree(ondisk);
32734156d998SAlex Elder 
32744156d998SAlex Elder 		size = sizeof (*ondisk);
32754156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
32764156d998SAlex Elder 		size += names_size;
32774156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
32784156d998SAlex Elder 		if (!ondisk)
3279662518b1SAlex Elder 			return -ENOMEM;
32804156d998SAlex Elder 
3281788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
32827097f8dfSAlex Elder 				       0, size, ondisk);
32834156d998SAlex Elder 		if (ret < 0)
3284662518b1SAlex Elder 			goto out;
3285c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
32864156d998SAlex Elder 			ret = -ENXIO;
328706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
328806ecc6cbSAlex Elder 				size, ret);
3289662518b1SAlex Elder 			goto out;
32904156d998SAlex Elder 		}
32914156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
32924156d998SAlex Elder 			ret = -ENXIO;
329306ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3294662518b1SAlex Elder 			goto out;
32954156d998SAlex Elder 		}
32964156d998SAlex Elder 
32974156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
32984156d998SAlex Elder 		want_count = snap_count;
32994156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
33004156d998SAlex Elder 	} while (snap_count != want_count);
33014156d998SAlex Elder 
3302662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3303662518b1SAlex Elder out:
33044156d998SAlex Elder 	kfree(ondisk);
33054156d998SAlex Elder 
3306dfc5606dSYehuda Sadeh 	return ret;
3307602adf40SYehuda Sadeh }
3308602adf40SYehuda Sadeh 
330915228edeSAlex Elder /*
331015228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
331115228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
331215228edeSAlex Elder  */
331315228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
331415228edeSAlex Elder {
331515228edeSAlex Elder 	u64 snap_id;
331615228edeSAlex Elder 
331715228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
331815228edeSAlex Elder 		return;
331915228edeSAlex Elder 
332015228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
332115228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
332215228edeSAlex Elder 		return;
332315228edeSAlex Elder 
332415228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
332515228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
332615228edeSAlex Elder }
332715228edeSAlex Elder 
33289875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
33299875201eSJosh Durgin {
33309875201eSJosh Durgin 	sector_t size;
33319875201eSJosh Durgin 	bool removing;
33329875201eSJosh Durgin 
33339875201eSJosh Durgin 	/*
33349875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
33359875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
33369875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
33379875201eSJosh Durgin 	 */
33389875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
33399875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
33409875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
33419875201eSJosh Durgin 	/*
33429875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
33439875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
33449875201eSJosh Durgin 	 */
33459875201eSJosh Durgin 	if (!removing) {
33469875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
33479875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
33489875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
33499875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
33509875201eSJosh Durgin 	}
33519875201eSJosh Durgin }
33529875201eSJosh Durgin 
3353cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
33541fe5e993SAlex Elder {
3355e627db08SAlex Elder 	u64 mapping_size;
33561fe5e993SAlex Elder 	int ret;
33571fe5e993SAlex Elder 
3358117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3359cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
33603b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3361117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
336299a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
3363117973fbSAlex Elder 	else
33642df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
336515228edeSAlex Elder 
336615228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
336715228edeSAlex Elder 
336815228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
3369cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
3370cfbf6377SAlex Elder 
337100a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
33729875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
337300a653e2SAlex Elder 	}
33741fe5e993SAlex Elder 
33751fe5e993SAlex Elder 	return ret;
33761fe5e993SAlex Elder }
33771fe5e993SAlex Elder 
3378602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3379602adf40SYehuda Sadeh {
3380602adf40SYehuda Sadeh 	struct gendisk *disk;
3381602adf40SYehuda Sadeh 	struct request_queue *q;
3382593a9e7bSAlex Elder 	u64 segment_size;
3383602adf40SYehuda Sadeh 
3384602adf40SYehuda Sadeh 	/* create gendisk info */
3385602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3386602adf40SYehuda Sadeh 	if (!disk)
33871fcdb8aaSAlex Elder 		return -ENOMEM;
3388602adf40SYehuda Sadeh 
3389f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3390de71a297SAlex Elder 		 rbd_dev->dev_id);
3391602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3392602adf40SYehuda Sadeh 	disk->first_minor = 0;
3393602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3394602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3395602adf40SYehuda Sadeh 
3396bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3397602adf40SYehuda Sadeh 	if (!q)
3398602adf40SYehuda Sadeh 		goto out_disk;
3399029bcbd8SJosh Durgin 
3400593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3401593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3402593a9e7bSAlex Elder 
3403029bcbd8SJosh Durgin 	/* set io sizes to object size */
3404593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3405593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3406593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3407593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3408593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3409029bcbd8SJosh Durgin 
3410602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3411602adf40SYehuda Sadeh 	disk->queue = q;
3412602adf40SYehuda Sadeh 
3413602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3414602adf40SYehuda Sadeh 
3415602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3416602adf40SYehuda Sadeh 
3417602adf40SYehuda Sadeh 	return 0;
3418602adf40SYehuda Sadeh out_disk:
3419602adf40SYehuda Sadeh 	put_disk(disk);
34201fcdb8aaSAlex Elder 
34211fcdb8aaSAlex Elder 	return -ENOMEM;
3422602adf40SYehuda Sadeh }
3423602adf40SYehuda Sadeh 
3424dfc5606dSYehuda Sadeh /*
3425dfc5606dSYehuda Sadeh   sysfs
3426dfc5606dSYehuda Sadeh */
3427602adf40SYehuda Sadeh 
3428593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3429593a9e7bSAlex Elder {
3430593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3431593a9e7bSAlex Elder }
3432593a9e7bSAlex Elder 
3433dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3434dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3435602adf40SYehuda Sadeh {
3436593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3437dfc5606dSYehuda Sadeh 
3438fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3439fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3440602adf40SYehuda Sadeh }
3441602adf40SYehuda Sadeh 
344234b13184SAlex Elder /*
344334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
344434b13184SAlex Elder  * necessarily the base image.
344534b13184SAlex Elder  */
344634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
344734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
344834b13184SAlex Elder {
344934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
345034b13184SAlex Elder 
345134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
345234b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
345334b13184SAlex Elder }
345434b13184SAlex Elder 
3455dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3456dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3457602adf40SYehuda Sadeh {
3458593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3459dfc5606dSYehuda Sadeh 
3460fc71d833SAlex Elder 	if (rbd_dev->major)
3461dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3462fc71d833SAlex Elder 
3463fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3464fc71d833SAlex Elder 
3465dfc5606dSYehuda Sadeh }
3466dfc5606dSYehuda Sadeh 
3467dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3468dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3469dfc5606dSYehuda Sadeh {
3470593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3471dfc5606dSYehuda Sadeh 
34721dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
34731dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3474dfc5606dSYehuda Sadeh }
3475dfc5606dSYehuda Sadeh 
3476dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3477dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3478dfc5606dSYehuda Sadeh {
3479593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3480dfc5606dSYehuda Sadeh 
34810d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3482dfc5606dSYehuda Sadeh }
3483dfc5606dSYehuda Sadeh 
34849bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
34859bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
34869bb2f334SAlex Elder {
34879bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
34889bb2f334SAlex Elder 
34890d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
34900d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
34919bb2f334SAlex Elder }
34929bb2f334SAlex Elder 
3493dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3494dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3495dfc5606dSYehuda Sadeh {
3496593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3497dfc5606dSYehuda Sadeh 
3498a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
34990d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3500a92ffdf8SAlex Elder 
3501a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3502dfc5606dSYehuda Sadeh }
3503dfc5606dSYehuda Sadeh 
3504589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3505589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3506589d30e0SAlex Elder {
3507589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3508589d30e0SAlex Elder 
35090d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3510589d30e0SAlex Elder }
3511589d30e0SAlex Elder 
351234b13184SAlex Elder /*
351334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
351434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
351534b13184SAlex Elder  */
3516dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3517dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3518dfc5606dSYehuda Sadeh 			     char *buf)
3519dfc5606dSYehuda Sadeh {
3520593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3521dfc5606dSYehuda Sadeh 
35220d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3523dfc5606dSYehuda Sadeh }
3524dfc5606dSYehuda Sadeh 
352586b00e0dSAlex Elder /*
352686b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
352786b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
352886b00e0dSAlex Elder  * "(no parent image)".
352986b00e0dSAlex Elder  */
353086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
353186b00e0dSAlex Elder 			     struct device_attribute *attr,
353286b00e0dSAlex Elder 			     char *buf)
353386b00e0dSAlex Elder {
353486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
353586b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
353686b00e0dSAlex Elder 	int count;
353786b00e0dSAlex Elder 	char *bufp = buf;
353886b00e0dSAlex Elder 
353986b00e0dSAlex Elder 	if (!spec)
354086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
354186b00e0dSAlex Elder 
354286b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
354386b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
354486b00e0dSAlex Elder 	if (count < 0)
354586b00e0dSAlex Elder 		return count;
354686b00e0dSAlex Elder 	bufp += count;
354786b00e0dSAlex Elder 
354886b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
354986b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
355086b00e0dSAlex Elder 	if (count < 0)
355186b00e0dSAlex Elder 		return count;
355286b00e0dSAlex Elder 	bufp += count;
355386b00e0dSAlex Elder 
355486b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
355586b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
355686b00e0dSAlex Elder 	if (count < 0)
355786b00e0dSAlex Elder 		return count;
355886b00e0dSAlex Elder 	bufp += count;
355986b00e0dSAlex Elder 
356086b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
356186b00e0dSAlex Elder 	if (count < 0)
356286b00e0dSAlex Elder 		return count;
356386b00e0dSAlex Elder 	bufp += count;
356486b00e0dSAlex Elder 
356586b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
356686b00e0dSAlex Elder }
356786b00e0dSAlex Elder 
3568dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3569dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3570dfc5606dSYehuda Sadeh 				 const char *buf,
3571dfc5606dSYehuda Sadeh 				 size_t size)
3572dfc5606dSYehuda Sadeh {
3573593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3574b813623aSAlex Elder 	int ret;
3575602adf40SYehuda Sadeh 
3576cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3577e627db08SAlex Elder 	if (ret)
3578e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3579b813623aSAlex Elder 
3580b813623aSAlex Elder 	return ret < 0 ? ret : size;
3581dfc5606dSYehuda Sadeh }
3582602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All are read-only (S_IRUGO) and
 * backed by the corresponding *_show() routine, except "refresh",
 * which is write-only for the owner (S_IWUSR) and backed by
 * rbd_image_refresh().
 */
3583dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
358434b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3585dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3586dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3587dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
35889bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3589dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3590589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3591dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3592dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
359386b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3594dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the sysfs attributes declared for an rbd device */
3595dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3596dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
359734b13184SAlex Elder 	&dev_attr_features.attr,
3598dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3599dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3600dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
36019bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3602dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3603589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3604dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
360586b00e0dSAlex Elder 	&dev_attr_parent.attr,
3606dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3607dfc5606dSYehuda Sadeh 	NULL
3608dfc5606dSYehuda Sadeh };
3609dfc5606dSYehuda Sadeh 
/* Unnamed attribute group wrapping rbd_attrs */
3610dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3611dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3612dfc5606dSYehuda Sadeh };
3613dfc5606dSYehuda Sadeh 
/* NULL-terminated list of attribute groups used by rbd_device_type */
3614dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3615dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3616dfc5606dSYehuda Sadeh 	NULL
3617dfc5606dSYehuda Sadeh };
3618dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  Intentionally does nothing.
 * NOTE(review): the rbd_device embedding this struct device is freed
 * by rbd_dev_destroy(); presumably that is why no work is done here --
 * confirm the device cannot outlive that kfree().
 */
3619dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
3620dfc5606dSYehuda Sadeh {
3621dfc5606dSYehuda Sadeh }
3622dfc5606dSYehuda Sadeh 
/*
 * Device type for rbd devices: names the type "rbd", attaches the
 * sysfs attribute groups above, and supplies the release callback.
 */
3623dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3624dfc5606dSYehuda Sadeh 	.name		= "rbd",
3625dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3626dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3627dfc5606dSYehuda Sadeh };
3628dfc5606dSYehuda Sadeh 
36298b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
36308b8fb99cSAlex Elder {
36318b8fb99cSAlex Elder 	kref_get(&spec->kref);
36328b8fb99cSAlex Elder 
36338b8fb99cSAlex Elder 	return spec;
36348b8fb99cSAlex Elder }
36358b8fb99cSAlex Elder 
36368b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
36378b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
36388b8fb99cSAlex Elder {
36398b8fb99cSAlex Elder 	if (spec)
36408b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
36418b8fb99cSAlex Elder }
36428b8fb99cSAlex Elder 
36438b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
36448b8fb99cSAlex Elder {
36458b8fb99cSAlex Elder 	struct rbd_spec *spec;
36468b8fb99cSAlex Elder 
36478b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
36488b8fb99cSAlex Elder 	if (!spec)
36498b8fb99cSAlex Elder 		return NULL;
36508b8fb99cSAlex Elder 	kref_init(&spec->kref);
36518b8fb99cSAlex Elder 
36528b8fb99cSAlex Elder 	return spec;
36538b8fb99cSAlex Elder }
36548b8fb99cSAlex Elder 
36558b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
36568b8fb99cSAlex Elder {
36578b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
36588b8fb99cSAlex Elder 
36598b8fb99cSAlex Elder 	kfree(spec->pool_name);
36608b8fb99cSAlex Elder 	kfree(spec->image_id);
36618b8fb99cSAlex Elder 	kfree(spec->image_name);
36628b8fb99cSAlex Elder 	kfree(spec->snap_name);
36638b8fb99cSAlex Elder 	kfree(spec);
36648b8fb99cSAlex Elder }
36658b8fb99cSAlex Elder 
3666cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3667c53d5893SAlex Elder 				struct rbd_spec *spec)
3668c53d5893SAlex Elder {
3669c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3670c53d5893SAlex Elder 
3671c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3672c53d5893SAlex Elder 	if (!rbd_dev)
3673c53d5893SAlex Elder 		return NULL;
3674c53d5893SAlex Elder 
3675c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
36766d292906SAlex Elder 	rbd_dev->flags = 0;
3677a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
3678c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3679c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3680c53d5893SAlex Elder 
3681c53d5893SAlex Elder 	rbd_dev->spec = spec;
3682c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3683c53d5893SAlex Elder 
36840903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
36850903e875SAlex Elder 
36860903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
36870903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
36880903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
36890903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
36900903e875SAlex Elder 
3691c53d5893SAlex Elder 	return rbd_dev;
3692c53d5893SAlex Elder }
3693c53d5893SAlex Elder 
3694c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3695c53d5893SAlex Elder {
3696c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3697c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3698c53d5893SAlex Elder 	kfree(rbd_dev);
3699c53d5893SAlex Elder }
3700c53d5893SAlex Elder 
3701dfc5606dSYehuda Sadeh /*
37029d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
37039d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
37049d475de5SAlex Elder  * image.
37059d475de5SAlex Elder  */
37069d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
37079d475de5SAlex Elder 				u8 *order, u64 *snap_size)
37089d475de5SAlex Elder {
37099d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
37109d475de5SAlex Elder 	int ret;
37119d475de5SAlex Elder 	struct {
37129d475de5SAlex Elder 		u8 order;
37139d475de5SAlex Elder 		__le64 size;
37149d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
37159d475de5SAlex Elder 
371636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
37179d475de5SAlex Elder 				"rbd", "get_size",
37184157976bSAlex Elder 				&snapid, sizeof (snapid),
3719e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
372036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
37219d475de5SAlex Elder 	if (ret < 0)
37229d475de5SAlex Elder 		return ret;
372357385b51SAlex Elder 	if (ret < sizeof (size_buf))
372457385b51SAlex Elder 		return -ERANGE;
37259d475de5SAlex Elder 
3726c3545579SJosh Durgin 	if (order) {
37279d475de5SAlex Elder 		*order = size_buf.order;
3728c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
3729c3545579SJosh Durgin 	}
37309d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
37319d475de5SAlex Elder 
3732c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
3733c3545579SJosh Durgin 		(unsigned long long)snap_id,
37349d475de5SAlex Elder 		(unsigned long long)*snap_size);
37359d475de5SAlex Elder 
37369d475de5SAlex Elder 	return 0;
37379d475de5SAlex Elder }
37389d475de5SAlex Elder 
37399d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
37409d475de5SAlex Elder {
37419d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
37429d475de5SAlex Elder 					&rbd_dev->header.obj_order,
37439d475de5SAlex Elder 					&rbd_dev->header.image_size);
37449d475de5SAlex Elder }
37459d475de5SAlex Elder 
37461e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
37471e130199SAlex Elder {
37481e130199SAlex Elder 	void *reply_buf;
37491e130199SAlex Elder 	int ret;
37501e130199SAlex Elder 	void *p;
37511e130199SAlex Elder 
37521e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
37531e130199SAlex Elder 	if (!reply_buf)
37541e130199SAlex Elder 		return -ENOMEM;
37551e130199SAlex Elder 
375636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
37574157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3758e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
375936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
37601e130199SAlex Elder 	if (ret < 0)
37611e130199SAlex Elder 		goto out;
37621e130199SAlex Elder 
37631e130199SAlex Elder 	p = reply_buf;
37641e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
376557385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
376657385b51SAlex Elder 	ret = 0;
37671e130199SAlex Elder 
37681e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
37691e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
37701e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
37711e130199SAlex Elder 	} else {
37721e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
37731e130199SAlex Elder 	}
37741e130199SAlex Elder out:
37751e130199SAlex Elder 	kfree(reply_buf);
37761e130199SAlex Elder 
37771e130199SAlex Elder 	return ret;
37781e130199SAlex Elder }
37791e130199SAlex Elder 
3780b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3781b1b5402aSAlex Elder 		u64 *snap_features)
3782b1b5402aSAlex Elder {
3783b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3784b1b5402aSAlex Elder 	struct {
3785b1b5402aSAlex Elder 		__le64 features;
3786b1b5402aSAlex Elder 		__le64 incompat;
37874157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3788d889140cSAlex Elder 	u64 incompat;
3789b1b5402aSAlex Elder 	int ret;
3790b1b5402aSAlex Elder 
379136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3792b1b5402aSAlex Elder 				"rbd", "get_features",
37934157976bSAlex Elder 				&snapid, sizeof (snapid),
3794e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
379536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3796b1b5402aSAlex Elder 	if (ret < 0)
3797b1b5402aSAlex Elder 		return ret;
379857385b51SAlex Elder 	if (ret < sizeof (features_buf))
379957385b51SAlex Elder 		return -ERANGE;
3800d889140cSAlex Elder 
3801d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
38025cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3803b8f5c6edSAlex Elder 		return -ENXIO;
3804d889140cSAlex Elder 
3805b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3806b1b5402aSAlex Elder 
3807b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3808b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3809b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3810b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3811b1b5402aSAlex Elder 
3812b1b5402aSAlex Elder 	return 0;
3813b1b5402aSAlex Elder }
3814b1b5402aSAlex Elder 
3815b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3816b1b5402aSAlex Elder {
3817b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3818b1b5402aSAlex Elder 						&rbd_dev->header.features);
3819b1b5402aSAlex Elder }
3820b1b5402aSAlex Elder 
382186b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
382286b00e0dSAlex Elder {
382386b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
382486b00e0dSAlex Elder 	size_t size;
382586b00e0dSAlex Elder 	void *reply_buf = NULL;
382686b00e0dSAlex Elder 	__le64 snapid;
382786b00e0dSAlex Elder 	void *p;
382886b00e0dSAlex Elder 	void *end;
3829642a2537SAlex Elder 	u64 pool_id;
383086b00e0dSAlex Elder 	char *image_id;
38313b5cf2a2SAlex Elder 	u64 snap_id;
383286b00e0dSAlex Elder 	u64 overlap;
383386b00e0dSAlex Elder 	int ret;
383486b00e0dSAlex Elder 
383586b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
383686b00e0dSAlex Elder 	if (!parent_spec)
383786b00e0dSAlex Elder 		return -ENOMEM;
383886b00e0dSAlex Elder 
383986b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
384086b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
384186b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
384286b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
384386b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
384486b00e0dSAlex Elder 	if (!reply_buf) {
384586b00e0dSAlex Elder 		ret = -ENOMEM;
384686b00e0dSAlex Elder 		goto out_err;
384786b00e0dSAlex Elder 	}
384886b00e0dSAlex Elder 
384986b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
385036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
385186b00e0dSAlex Elder 				"rbd", "get_parent",
38524157976bSAlex Elder 				&snapid, sizeof (snapid),
3853e2a58ee5SAlex Elder 				reply_buf, size);
385436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
385586b00e0dSAlex Elder 	if (ret < 0)
385686b00e0dSAlex Elder 		goto out_err;
385786b00e0dSAlex Elder 
385886b00e0dSAlex Elder 	p = reply_buf;
385957385b51SAlex Elder 	end = reply_buf + ret;
386057385b51SAlex Elder 	ret = -ERANGE;
3861642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
3862392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
3863392a9dadSAlex Elder 		/*
3864392a9dadSAlex Elder 		 * Either the parent never existed, or we have
3865392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
3866392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
3867392a9dadSAlex Elder 		 * layered image disappears we immediately set the
3868392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
3869392a9dadSAlex Elder 		 * requests will be treated as if the image had no
3870392a9dadSAlex Elder 		 * parent.
3871392a9dadSAlex Elder 		 */
3872392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
3873392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
3874392a9dadSAlex Elder 			smp_mb();
3875392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
3876392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
3877392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
3878392a9dadSAlex Elder 		}
3879392a9dadSAlex Elder 
388086b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
3881392a9dadSAlex Elder 	}
388286b00e0dSAlex Elder 
38830903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
38840903e875SAlex Elder 
38850903e875SAlex Elder 	ret = -EIO;
3886642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
3887c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3888642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
388957385b51SAlex Elder 		goto out_err;
3890c0cd10dbSAlex Elder 	}
38910903e875SAlex Elder 
3892979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
389386b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
389486b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
389586b00e0dSAlex Elder 		goto out_err;
389686b00e0dSAlex Elder 	}
38973b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
389886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
389986b00e0dSAlex Elder 
39003b5cf2a2SAlex Elder 	/*
39013b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
39023b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
39033b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
39043b5cf2a2SAlex Elder 	 */
39053b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
39063b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
39073b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
39083b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
390986b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
391086b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
39113b5cf2a2SAlex Elder 	}
39123b5cf2a2SAlex Elder 
39133b5cf2a2SAlex Elder 	/*
39143b5cf2a2SAlex Elder 	 * We always update the parent overlap.  If it's zero we
39153b5cf2a2SAlex Elder 	 * treat it specially.
39163b5cf2a2SAlex Elder 	 */
391770cf49cfSAlex Elder 	rbd_dev->parent_overlap = overlap;
39183b5cf2a2SAlex Elder 	smp_mb();
39193b5cf2a2SAlex Elder 	if (!overlap) {
39203b5cf2a2SAlex Elder 
39213b5cf2a2SAlex Elder 		/* A null parent_spec indicates it's the initial probe */
39223b5cf2a2SAlex Elder 
39233b5cf2a2SAlex Elder 		if (parent_spec) {
39243b5cf2a2SAlex Elder 			/*
39253b5cf2a2SAlex Elder 			 * The overlap has become zero, so the clone
39263b5cf2a2SAlex Elder 			 * must have been resized down to 0 at some
39273b5cf2a2SAlex Elder 			 * point.  Treat this the same as a flatten.
39283b5cf2a2SAlex Elder 			 */
39293b5cf2a2SAlex Elder 			rbd_dev_parent_put(rbd_dev);
39303b5cf2a2SAlex Elder 			pr_info("%s: clone image now standalone\n",
39313b5cf2a2SAlex Elder 				rbd_dev->disk->disk_name);
393270cf49cfSAlex Elder 		} else {
39333b5cf2a2SAlex Elder 			/*
39343b5cf2a2SAlex Elder 			 * For the initial probe, if we find the
39353b5cf2a2SAlex Elder 			 * overlap is zero we just pretend there was
39363b5cf2a2SAlex Elder 			 * no parent image.
39373b5cf2a2SAlex Elder 			 */
39383b5cf2a2SAlex Elder 			rbd_warn(rbd_dev, "ignoring parent of "
39393b5cf2a2SAlex Elder 						"clone with overlap 0\n");
39403b5cf2a2SAlex Elder 		}
394170cf49cfSAlex Elder 	}
394286b00e0dSAlex Elder out:
394386b00e0dSAlex Elder 	ret = 0;
394486b00e0dSAlex Elder out_err:
394586b00e0dSAlex Elder 	kfree(reply_buf);
394686b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
394786b00e0dSAlex Elder 
394886b00e0dSAlex Elder 	return ret;
394986b00e0dSAlex Elder }
395086b00e0dSAlex Elder 
3951cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3952cc070d59SAlex Elder {
3953cc070d59SAlex Elder 	struct {
3954cc070d59SAlex Elder 		__le64 stripe_unit;
3955cc070d59SAlex Elder 		__le64 stripe_count;
3956cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3957cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3958cc070d59SAlex Elder 	void *p;
3959cc070d59SAlex Elder 	u64 obj_size;
3960cc070d59SAlex Elder 	u64 stripe_unit;
3961cc070d59SAlex Elder 	u64 stripe_count;
3962cc070d59SAlex Elder 	int ret;
3963cc070d59SAlex Elder 
3964cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3965cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3966e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
3967cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3968cc070d59SAlex Elder 	if (ret < 0)
3969cc070d59SAlex Elder 		return ret;
3970cc070d59SAlex Elder 	if (ret < size)
3971cc070d59SAlex Elder 		return -ERANGE;
3972cc070d59SAlex Elder 
3973cc070d59SAlex Elder 	/*
3974cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3975cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3976cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3977cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3978cc070d59SAlex Elder 	 */
3979cc070d59SAlex Elder 	ret = -EINVAL;
3980cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3981cc070d59SAlex Elder 	p = &striping_info_buf;
3982cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3983cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3984cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3985cc070d59SAlex Elder 				"(got %llu want %llu)",
3986cc070d59SAlex Elder 				stripe_unit, obj_size);
3987cc070d59SAlex Elder 		return -EINVAL;
3988cc070d59SAlex Elder 	}
3989cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3990cc070d59SAlex Elder 	if (stripe_count != 1) {
3991cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3992cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3993cc070d59SAlex Elder 		return -EINVAL;
3994cc070d59SAlex Elder 	}
3995500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
3996500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
3997cc070d59SAlex Elder 
3998cc070d59SAlex Elder 	return 0;
3999cc070d59SAlex Elder }
4000cc070d59SAlex Elder 
40019e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
40029e15b77dSAlex Elder {
40039e15b77dSAlex Elder 	size_t image_id_size;
40049e15b77dSAlex Elder 	char *image_id;
40059e15b77dSAlex Elder 	void *p;
40069e15b77dSAlex Elder 	void *end;
40079e15b77dSAlex Elder 	size_t size;
40089e15b77dSAlex Elder 	void *reply_buf = NULL;
40099e15b77dSAlex Elder 	size_t len = 0;
40109e15b77dSAlex Elder 	char *image_name = NULL;
40119e15b77dSAlex Elder 	int ret;
40129e15b77dSAlex Elder 
40139e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
40149e15b77dSAlex Elder 
401569e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
401669e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
40179e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
40189e15b77dSAlex Elder 	if (!image_id)
40199e15b77dSAlex Elder 		return NULL;
40209e15b77dSAlex Elder 
40219e15b77dSAlex Elder 	p = image_id;
40224157976bSAlex Elder 	end = image_id + image_id_size;
402369e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
40249e15b77dSAlex Elder 
40259e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
40269e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
40279e15b77dSAlex Elder 	if (!reply_buf)
40289e15b77dSAlex Elder 		goto out;
40299e15b77dSAlex Elder 
403036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
40319e15b77dSAlex Elder 				"rbd", "dir_get_name",
40329e15b77dSAlex Elder 				image_id, image_id_size,
4033e2a58ee5SAlex Elder 				reply_buf, size);
40349e15b77dSAlex Elder 	if (ret < 0)
40359e15b77dSAlex Elder 		goto out;
40369e15b77dSAlex Elder 	p = reply_buf;
4037f40eb349SAlex Elder 	end = reply_buf + ret;
4038f40eb349SAlex Elder 
40399e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
40409e15b77dSAlex Elder 	if (IS_ERR(image_name))
40419e15b77dSAlex Elder 		image_name = NULL;
40429e15b77dSAlex Elder 	else
40439e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
40449e15b77dSAlex Elder out:
40459e15b77dSAlex Elder 	kfree(reply_buf);
40469e15b77dSAlex Elder 	kfree(image_id);
40479e15b77dSAlex Elder 
40489e15b77dSAlex Elder 	return image_name;
40499e15b77dSAlex Elder }
40509e15b77dSAlex Elder 
40512ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40522ad3d716SAlex Elder {
40532ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
40542ad3d716SAlex Elder 	const char *snap_name;
40552ad3d716SAlex Elder 	u32 which = 0;
40562ad3d716SAlex Elder 
40572ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
40582ad3d716SAlex Elder 
40592ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
40602ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
40612ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
40622ad3d716SAlex Elder 			return snapc->snaps[which];
40632ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
40642ad3d716SAlex Elder 		which++;
40652ad3d716SAlex Elder 	}
40662ad3d716SAlex Elder 	return CEPH_NOSNAP;
40672ad3d716SAlex Elder }
40682ad3d716SAlex Elder 
40692ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40702ad3d716SAlex Elder {
40712ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
40722ad3d716SAlex Elder 	u32 which;
40732ad3d716SAlex Elder 	bool found = false;
40742ad3d716SAlex Elder 	u64 snap_id;
40752ad3d716SAlex Elder 
40762ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
40772ad3d716SAlex Elder 		const char *snap_name;
40782ad3d716SAlex Elder 
40792ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
40802ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
40812ad3d716SAlex Elder 		if (IS_ERR(snap_name))
40822ad3d716SAlex Elder 			break;
40832ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
40842ad3d716SAlex Elder 		kfree(snap_name);
40852ad3d716SAlex Elder 	}
40862ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
40872ad3d716SAlex Elder }
40882ad3d716SAlex Elder 
40892ad3d716SAlex Elder /*
40902ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
40912ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
40922ad3d716SAlex Elder  */
40932ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40942ad3d716SAlex Elder {
40952ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
40962ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
40972ad3d716SAlex Elder 
40982ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
40992ad3d716SAlex Elder }
41002ad3d716SAlex Elder 
41019e15b77dSAlex Elder /*
41022e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
41032e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
41042e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
41052e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
41062e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
41072e9f7f1cSAlex Elder  * allocated.
4108e1d4213fSAlex Elder  *
4109e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
4110e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
4111e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
41129e15b77dSAlex Elder  */
41132e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
41149e15b77dSAlex Elder {
41152e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
41162e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
41172e9f7f1cSAlex Elder 	const char *pool_name;
41182e9f7f1cSAlex Elder 	const char *image_name;
41192e9f7f1cSAlex Elder 	const char *snap_name;
41209e15b77dSAlex Elder 	int ret;
41219e15b77dSAlex Elder 
4122e1d4213fSAlex Elder 	/*
4123e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
4124e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
4125e1d4213fSAlex Elder 	 */
41262e9f7f1cSAlex Elder 	if (spec->pool_name) {
41272e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
41282ad3d716SAlex Elder 			u64 snap_id;
4129e1d4213fSAlex Elder 
41302ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
41312ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
4132e1d4213fSAlex Elder 				return -ENOENT;
41332ad3d716SAlex Elder 			spec->snap_id = snap_id;
4134e1d4213fSAlex Elder 		} else {
41352e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
4136e1d4213fSAlex Elder 		}
4137e1d4213fSAlex Elder 
4138e1d4213fSAlex Elder 		return 0;
4139e1d4213fSAlex Elder 	}
41409e15b77dSAlex Elder 
41412e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
41429e15b77dSAlex Elder 
41432e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
41442e9f7f1cSAlex Elder 	if (!pool_name) {
41452e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4146935dc89fSAlex Elder 		return -EIO;
4147935dc89fSAlex Elder 	}
41482e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
41492e9f7f1cSAlex Elder 	if (!pool_name)
41509e15b77dSAlex Elder 		return -ENOMEM;
41519e15b77dSAlex Elder 
41529e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
41539e15b77dSAlex Elder 
41542e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
41552e9f7f1cSAlex Elder 	if (!image_name)
415606ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
41579e15b77dSAlex Elder 
41582e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
41599e15b77dSAlex Elder 
41602e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
41612e9f7f1cSAlex Elder 	if (!snap_name) {
41622e9f7f1cSAlex Elder 		ret = -ENOMEM;
41639e15b77dSAlex Elder 		goto out_err;
41642e9f7f1cSAlex Elder 	}
41652e9f7f1cSAlex Elder 
41662e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
41672e9f7f1cSAlex Elder 	spec->image_name = image_name;
41682e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
41699e15b77dSAlex Elder 
41709e15b77dSAlex Elder 	return 0;
41719e15b77dSAlex Elder out_err:
41722e9f7f1cSAlex Elder 	kfree(image_name);
41732e9f7f1cSAlex Elder 	kfree(pool_name);
41749e15b77dSAlex Elder 
41759e15b77dSAlex Elder 	return ret;
41769e15b77dSAlex Elder }
41779e15b77dSAlex Elder 
4178cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
417935d489f9SAlex Elder {
418035d489f9SAlex Elder 	size_t size;
418135d489f9SAlex Elder 	int ret;
418235d489f9SAlex Elder 	void *reply_buf;
418335d489f9SAlex Elder 	void *p;
418435d489f9SAlex Elder 	void *end;
418535d489f9SAlex Elder 	u64 seq;
418635d489f9SAlex Elder 	u32 snap_count;
418735d489f9SAlex Elder 	struct ceph_snap_context *snapc;
418835d489f9SAlex Elder 	u32 i;
418935d489f9SAlex Elder 
419035d489f9SAlex Elder 	/*
419135d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
419235d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
419335d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
419435d489f9SAlex Elder 	 * prepared to receive.
419535d489f9SAlex Elder 	 */
419635d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
419735d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
419835d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
419935d489f9SAlex Elder 	if (!reply_buf)
420035d489f9SAlex Elder 		return -ENOMEM;
420135d489f9SAlex Elder 
420236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
42034157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4204e2a58ee5SAlex Elder 				reply_buf, size);
420536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
420635d489f9SAlex Elder 	if (ret < 0)
420735d489f9SAlex Elder 		goto out;
420835d489f9SAlex Elder 
420935d489f9SAlex Elder 	p = reply_buf;
421057385b51SAlex Elder 	end = reply_buf + ret;
421157385b51SAlex Elder 	ret = -ERANGE;
421235d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
421335d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
421435d489f9SAlex Elder 
421535d489f9SAlex Elder 	/*
421635d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
421735d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
421835d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
421935d489f9SAlex Elder 	 * allocate is representable in a size_t.
422035d489f9SAlex Elder 	 */
422135d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
422235d489f9SAlex Elder 				 / sizeof (u64)) {
422335d489f9SAlex Elder 		ret = -EINVAL;
422435d489f9SAlex Elder 		goto out;
422535d489f9SAlex Elder 	}
422635d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
422735d489f9SAlex Elder 		goto out;
4228468521c1SAlex Elder 	ret = 0;
422935d489f9SAlex Elder 
4230812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
423135d489f9SAlex Elder 	if (!snapc) {
423235d489f9SAlex Elder 		ret = -ENOMEM;
423335d489f9SAlex Elder 		goto out;
423435d489f9SAlex Elder 	}
423535d489f9SAlex Elder 	snapc->seq = seq;
423635d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
423735d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
423835d489f9SAlex Elder 
423949ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
424035d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
424135d489f9SAlex Elder 
424235d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
424335d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
424435d489f9SAlex Elder out:
424535d489f9SAlex Elder 	kfree(reply_buf);
424635d489f9SAlex Elder 
424757385b51SAlex Elder 	return ret;
424835d489f9SAlex Elder }
424935d489f9SAlex Elder 
425054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
425154cac61fSAlex Elder 					u64 snap_id)
4252b8b1e2dbSAlex Elder {
4253b8b1e2dbSAlex Elder 	size_t size;
4254b8b1e2dbSAlex Elder 	void *reply_buf;
425554cac61fSAlex Elder 	__le64 snapid;
4256b8b1e2dbSAlex Elder 	int ret;
4257b8b1e2dbSAlex Elder 	void *p;
4258b8b1e2dbSAlex Elder 	void *end;
4259b8b1e2dbSAlex Elder 	char *snap_name;
4260b8b1e2dbSAlex Elder 
4261b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4262b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4263b8b1e2dbSAlex Elder 	if (!reply_buf)
4264b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4265b8b1e2dbSAlex Elder 
426654cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
426736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4268b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
426954cac61fSAlex Elder 				&snapid, sizeof (snapid),
4270e2a58ee5SAlex Elder 				reply_buf, size);
427136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4272f40eb349SAlex Elder 	if (ret < 0) {
4273f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4274b8b1e2dbSAlex Elder 		goto out;
4275f40eb349SAlex Elder 	}
4276b8b1e2dbSAlex Elder 
4277b8b1e2dbSAlex Elder 	p = reply_buf;
4278f40eb349SAlex Elder 	end = reply_buf + ret;
4279e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4280f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4281b8b1e2dbSAlex Elder 		goto out;
4282f40eb349SAlex Elder 
4283b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
428454cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4285b8b1e2dbSAlex Elder out:
4286b8b1e2dbSAlex Elder 	kfree(reply_buf);
4287b8b1e2dbSAlex Elder 
4288f40eb349SAlex Elder 	return snap_name;
4289b8b1e2dbSAlex Elder }
4290b8b1e2dbSAlex Elder 
42912df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4292117973fbSAlex Elder {
42932df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4294117973fbSAlex Elder 	int ret;
4295117973fbSAlex Elder 
42961617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
42971617e40cSJosh Durgin 	if (ret)
4298cfbf6377SAlex Elder 		return ret;
42991617e40cSJosh Durgin 
43002df3fac7SAlex Elder 	if (first_time) {
43012df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
43022df3fac7SAlex Elder 		if (ret)
4303cfbf6377SAlex Elder 			return ret;
43042df3fac7SAlex Elder 	}
43052df3fac7SAlex Elder 
4306642a2537SAlex Elder 	/*
4307642a2537SAlex Elder 	 * If the image supports layering, get the parent info.  We
4308642a2537SAlex Elder 	 * need to probe the first time regardless.  Thereafter we
4309642a2537SAlex Elder 	 * only need to if there's a parent, to see if it has
4310642a2537SAlex Elder 	 * disappeared due to the mapped image getting flattened.
4311642a2537SAlex Elder 	 */
4312642a2537SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4313642a2537SAlex Elder 			(first_time || rbd_dev->parent_spec)) {
4314642a2537SAlex Elder 		bool warn;
4315642a2537SAlex Elder 
4316642a2537SAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
4317642a2537SAlex Elder 		if (ret)
4318cfbf6377SAlex Elder 			return ret;
4319642a2537SAlex Elder 
4320642a2537SAlex Elder 		/*
4321642a2537SAlex Elder 		 * Print a warning if this is the initial probe and
4322642a2537SAlex Elder 		 * the image has a parent.  Don't print it if the
4323642a2537SAlex Elder 		 * image now being probed is itself a parent.  We
4324642a2537SAlex Elder 		 * can tell at this point because we won't know its
4325642a2537SAlex Elder 		 * pool name yet (just its pool id).
4326642a2537SAlex Elder 		 */
4327642a2537SAlex Elder 		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4328642a2537SAlex Elder 		if (first_time && warn)
4329642a2537SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
4330642a2537SAlex Elder 					"is EXPERIMENTAL!");
4331642a2537SAlex Elder 	}
4332642a2537SAlex Elder 
433329334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
433429334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
433529334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4336117973fbSAlex Elder 
4337cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4338117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4339117973fbSAlex Elder 
4340117973fbSAlex Elder 	return ret;
4341117973fbSAlex Elder }
4342117973fbSAlex Elder 
4343dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4344dfc5606dSYehuda Sadeh {
4345dfc5606dSYehuda Sadeh 	struct device *dev;
4346cd789ab9SAlex Elder 	int ret;
4347dfc5606dSYehuda Sadeh 
4348cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4349dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4350dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4351dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4352200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4353de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4354dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4355dfc5606dSYehuda Sadeh 
4356dfc5606dSYehuda Sadeh 	return ret;
4357602adf40SYehuda Sadeh }
4358602adf40SYehuda Sadeh 
4359dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4360dfc5606dSYehuda Sadeh {
4361dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4362dfc5606dSYehuda Sadeh }
4363dfc5606dSYehuda Sadeh 
4364e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
43651ddbe94eSAlex Elder 
43661ddbe94eSAlex Elder /*
4367499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4368499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
43691ddbe94eSAlex Elder  */
4370e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4371b7f23c36SAlex Elder {
4372e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4373499afd5bSAlex Elder 
4374499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4375499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4376499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4377e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4378e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4379b7f23c36SAlex Elder }
4380b7f23c36SAlex Elder 
43811ddbe94eSAlex Elder /*
4382499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4383499afd5bSAlex Elder  * identifier is no longer in use.
43841ddbe94eSAlex Elder  */
4385e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
43861ddbe94eSAlex Elder {
4387d184f6bfSAlex Elder 	struct list_head *tmp;
4388de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4389d184f6bfSAlex Elder 	int max_id;
4390d184f6bfSAlex Elder 
4391aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4392499afd5bSAlex Elder 
4393e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4394e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4395499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4396499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4397d184f6bfSAlex Elder 
4398d184f6bfSAlex Elder 	/*
4399d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4400d184f6bfSAlex Elder 	 * is nothing special we need to do.
4401d184f6bfSAlex Elder 	 */
4402e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4403d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4404d184f6bfSAlex Elder 		return;
4405d184f6bfSAlex Elder 	}
4406d184f6bfSAlex Elder 
4407d184f6bfSAlex Elder 	/*
4408d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4409d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4410d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4411d184f6bfSAlex Elder 	 */
4412d184f6bfSAlex Elder 	max_id = 0;
4413d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4414d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4415d184f6bfSAlex Elder 
4416d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4417b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4418b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4419d184f6bfSAlex Elder 	}
4420499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
44211ddbe94eSAlex Elder 
44221ddbe94eSAlex Elder 	/*
4423e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4424d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4425d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4426d184f6bfSAlex Elder 	 * case.
44271ddbe94eSAlex Elder 	 */
4428e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4429e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4430b7f23c36SAlex Elder }
4431b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the length of the token that starts
 * there, i.e. the run of non-white space characters.  The string
 * at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* skip to the token */
	*buf = start;

	return strcspn(start, whitespace);	/* length of the token */
}
4450e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer followed by a terminating '\0'.  The
 * string at *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * A return of 0 means there was no token; a return >= token_size
 * means the token (plus terminator) did not fit, in which case the
 * token buffer is left untouched.
 *
 * In every case *buf is advanced to just past the end of the token
 * that was found, even when it was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	/* Consume the token whether or not it can be copied */
	*buf = start + token_len;

	if (token_len >= token_size)
		return token_len;

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
4480e28fff26SAlex Elder 
4481e28fff26SAlex Elder /*
4482ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4483ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4484ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4485ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4486ea3352f4SAlex Elder  *
4487ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4488ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4489ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4490ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4491ea3352f4SAlex Elder  *
4492ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4493ea3352f4SAlex Elder  * the end of the found token.
4494ea3352f4SAlex Elder  *
4495ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4496ea3352f4SAlex Elder  */
4497ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4498ea3352f4SAlex Elder {
4499ea3352f4SAlex Elder 	char *dup;
4500ea3352f4SAlex Elder 	size_t len;
4501ea3352f4SAlex Elder 
4502ea3352f4SAlex Elder 	len = next_token(buf);
45034caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4504ea3352f4SAlex Elder 	if (!dup)
4505ea3352f4SAlex Elder 		return NULL;
4506ea3352f4SAlex Elder 	*(dup + len) = '\0';
4507ea3352f4SAlex Elder 	*buf += len;
4508ea3352f4SAlex Elder 
4509ea3352f4SAlex Elder 	if (lenp)
4510ea3352f4SAlex Elder 		*lenp = len;
4511ea3352f4SAlex Elder 
4512ea3352f4SAlex Elder 	return dup;
4513ea3352f4SAlex Elder }
4514ea3352f4SAlex Elder 
4515ea3352f4SAlex Elder /*
4516859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4517859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4518859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4519859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4520d22f76e7SAlex Elder  *
4521859c31dfSAlex Elder  * The information extracted from these options is recorded in
4522859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4523859c31dfSAlex Elder  * structures:
4524859c31dfSAlex Elder  *  ceph_opts
4525859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4526859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4527859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4528859c31dfSAlex Elder  *  rbd_opts
4529859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4530859c31dfSAlex Elder  *	this function; caller must release with kfree().
4531859c31dfSAlex Elder  *  spec
4532859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4533859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4534859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4535859c31dfSAlex Elder  *
4536859c31dfSAlex Elder  * The options passed take this form:
4537859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4538859c31dfSAlex Elder  * where:
4539859c31dfSAlex Elder  *  <mon_addrs>
4540859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4541859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4542859c31dfSAlex Elder  *      by a port number (separated by a colon).
4543859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4544859c31dfSAlex Elder  *  <options>
4545859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4546859c31dfSAlex Elder  *  <pool_name>
4547859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4548859c31dfSAlex Elder  *  <image_name>
4549859c31dfSAlex Elder  *      The name of the image in that pool to map.
4550859c31dfSAlex Elder  *  <snap_id>
4551859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4552859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4553859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4554859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4555a725f65eSAlex Elder  */
4556859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4557dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4558859c31dfSAlex Elder 				struct rbd_options **opts,
4559859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4560a725f65eSAlex Elder {
4561e28fff26SAlex Elder 	size_t len;
4562859c31dfSAlex Elder 	char *options;
45630ddebc0cSAlex Elder 	const char *mon_addrs;
4564ecb4dc22SAlex Elder 	char *snap_name;
45650ddebc0cSAlex Elder 	size_t mon_addrs_size;
4566859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
45674e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4568859c31dfSAlex Elder 	struct ceph_options *copts;
4569dc79b113SAlex Elder 	int ret;
4570e28fff26SAlex Elder 
4571e28fff26SAlex Elder 	/* The first four tokens are required */
4572e28fff26SAlex Elder 
45737ef3214aSAlex Elder 	len = next_token(&buf);
45744fb5d671SAlex Elder 	if (!len) {
45754fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
45764fb5d671SAlex Elder 		return -EINVAL;
45774fb5d671SAlex Elder 	}
45780ddebc0cSAlex Elder 	mon_addrs = buf;
4579f28e565aSAlex Elder 	mon_addrs_size = len + 1;
45807ef3214aSAlex Elder 	buf += len;
4581a725f65eSAlex Elder 
4582dc79b113SAlex Elder 	ret = -EINVAL;
4583f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4584f28e565aSAlex Elder 	if (!options)
4585dc79b113SAlex Elder 		return -ENOMEM;
45864fb5d671SAlex Elder 	if (!*options) {
45874fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
45884fb5d671SAlex Elder 		goto out_err;
45894fb5d671SAlex Elder 	}
4590a725f65eSAlex Elder 
4591859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4592859c31dfSAlex Elder 	if (!spec)
4593f28e565aSAlex Elder 		goto out_mem;
4594859c31dfSAlex Elder 
4595859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4596859c31dfSAlex Elder 	if (!spec->pool_name)
4597859c31dfSAlex Elder 		goto out_mem;
45984fb5d671SAlex Elder 	if (!*spec->pool_name) {
45994fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
46004fb5d671SAlex Elder 		goto out_err;
46014fb5d671SAlex Elder 	}
4602e28fff26SAlex Elder 
460369e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4604859c31dfSAlex Elder 	if (!spec->image_name)
4605f28e565aSAlex Elder 		goto out_mem;
46064fb5d671SAlex Elder 	if (!*spec->image_name) {
46074fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
46084fb5d671SAlex Elder 		goto out_err;
46094fb5d671SAlex Elder 	}
4610e28fff26SAlex Elder 
4611f28e565aSAlex Elder 	/*
4612f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4613f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4614f28e565aSAlex Elder 	 */
46153feeb894SAlex Elder 	len = next_token(&buf);
4616820a5f3eSAlex Elder 	if (!len) {
46173feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
46183feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4619f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4620dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4621f28e565aSAlex Elder 		goto out_err;
4622849b4260SAlex Elder 	}
4623ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4624ecb4dc22SAlex Elder 	if (!snap_name)
4625f28e565aSAlex Elder 		goto out_mem;
4626ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4627ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4628e5c35534SAlex Elder 
46290ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4630e28fff26SAlex Elder 
46314e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
46324e9afebaSAlex Elder 	if (!rbd_opts)
46334e9afebaSAlex Elder 		goto out_mem;
46344e9afebaSAlex Elder 
46354e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4636d22f76e7SAlex Elder 
4637859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
46380ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
46394e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4640859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4641859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4642dc79b113SAlex Elder 		goto out_err;
4643dc79b113SAlex Elder 	}
4644859c31dfSAlex Elder 	kfree(options);
4645859c31dfSAlex Elder 
4646859c31dfSAlex Elder 	*ceph_opts = copts;
46474e9afebaSAlex Elder 	*opts = rbd_opts;
4648859c31dfSAlex Elder 	*rbd_spec = spec;
46490ddebc0cSAlex Elder 
4650dc79b113SAlex Elder 	return 0;
4651f28e565aSAlex Elder out_mem:
4652dc79b113SAlex Elder 	ret = -ENOMEM;
4653d22f76e7SAlex Elder out_err:
4654859c31dfSAlex Elder 	kfree(rbd_opts);
4655859c31dfSAlex Elder 	rbd_spec_put(spec);
4656f28e565aSAlex Elder 	kfree(options);
4657d22f76e7SAlex Elder 
4658dc79b113SAlex Elder 	return ret;
4659a725f65eSAlex Elder }
4660a725f65eSAlex Elder 
4661589d30e0SAlex Elder /*
4662589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4663589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4664589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4665589d30e0SAlex Elder  *
4666589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4667589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4668589d30e0SAlex Elder  * with the supplied name.
4669589d30e0SAlex Elder  *
4670589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4671589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4672589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4673589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4674589d30e0SAlex Elder  */
4675589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4676589d30e0SAlex Elder {
4677589d30e0SAlex Elder 	int ret;
4678589d30e0SAlex Elder 	size_t size;
4679589d30e0SAlex Elder 	char *object_name;
4680589d30e0SAlex Elder 	void *response;
4681c0fba368SAlex Elder 	char *image_id;
46822f82ee54SAlex Elder 
4683589d30e0SAlex Elder 	/*
46842c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
46852c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4686c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4687c0fba368SAlex Elder 	 * do still need to set the image format though.
46882c0d0a10SAlex Elder 	 */
4689c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4690c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4691c0fba368SAlex Elder 
46922c0d0a10SAlex Elder 		return 0;
4693c0fba368SAlex Elder 	}
46942c0d0a10SAlex Elder 
46952c0d0a10SAlex Elder 	/*
4696589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4697589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4698589d30e0SAlex Elder 	 */
469969e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4700589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4701589d30e0SAlex Elder 	if (!object_name)
4702589d30e0SAlex Elder 		return -ENOMEM;
47030d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4704589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4705589d30e0SAlex Elder 
4706589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4707589d30e0SAlex Elder 
4708589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4709589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4710589d30e0SAlex Elder 	if (!response) {
4711589d30e0SAlex Elder 		ret = -ENOMEM;
4712589d30e0SAlex Elder 		goto out;
4713589d30e0SAlex Elder 	}
4714589d30e0SAlex Elder 
4715c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4716c0fba368SAlex Elder 
471736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
47184157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4719e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
472036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4721c0fba368SAlex Elder 	if (ret == -ENOENT) {
4722c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4723c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4724c0fba368SAlex Elder 		if (!ret)
4725c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4726c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4727c0fba368SAlex Elder 		void *p = response;
4728589d30e0SAlex Elder 
4729c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4730979ed480SAlex Elder 						NULL, GFP_NOIO);
4731c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4732c0fba368SAlex Elder 		if (!ret)
4733c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4734589d30e0SAlex Elder 	} else {
4735c0fba368SAlex Elder 		ret = -EINVAL;
4736c0fba368SAlex Elder 	}
4737c0fba368SAlex Elder 
4738c0fba368SAlex Elder 	if (!ret) {
4739c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4740c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4741589d30e0SAlex Elder 	}
4742589d30e0SAlex Elder out:
4743589d30e0SAlex Elder 	kfree(response);
4744589d30e0SAlex Elder 	kfree(object_name);
4745589d30e0SAlex Elder 
4746589d30e0SAlex Elder 	return ret;
4747589d30e0SAlex Elder }
4748589d30e0SAlex Elder 
47493abef3b3SAlex Elder /*
47503abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
47513abef3b3SAlex Elder  * call.
47523abef3b3SAlex Elder  */
47536fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
47546fd48b3bSAlex Elder {
47556fd48b3bSAlex Elder 	struct rbd_image_header	*header;
47566fd48b3bSAlex Elder 
4757392a9dadSAlex Elder 	/* Drop parent reference unless it's already been done (or none) */
4758392a9dadSAlex Elder 
4759392a9dadSAlex Elder 	if (rbd_dev->parent_overlap)
4760a2acd00eSAlex Elder 		rbd_dev_parent_put(rbd_dev);
47616fd48b3bSAlex Elder 
47626fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
47636fd48b3bSAlex Elder 
47646fd48b3bSAlex Elder 	header = &rbd_dev->header;
4765812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
47666fd48b3bSAlex Elder 	kfree(header->snap_sizes);
47676fd48b3bSAlex Elder 	kfree(header->snap_names);
47686fd48b3bSAlex Elder 	kfree(header->object_prefix);
47696fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
47706fd48b3bSAlex Elder }
47716fd48b3bSAlex Elder 
47722df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4773a30b71b9SAlex Elder {
4774a30b71b9SAlex Elder 	int ret;
4775a30b71b9SAlex Elder 
47761e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
477757385b51SAlex Elder 	if (ret)
47781e130199SAlex Elder 		goto out_err;
4779b1b5402aSAlex Elder 
47802df3fac7SAlex Elder 	/*
47812df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
47822df3fac7SAlex Elder 	 * features are assumed to never change.
47832df3fac7SAlex Elder 	 */
4784b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
478557385b51SAlex Elder 	if (ret)
4786b1b5402aSAlex Elder 		goto out_err;
478735d489f9SAlex Elder 
4788cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4789cc070d59SAlex Elder 
4790cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4791cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4792cc070d59SAlex Elder 		if (ret < 0)
4793cc070d59SAlex Elder 			goto out_err;
4794cc070d59SAlex Elder 	}
47952df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
4796a30b71b9SAlex Elder 
479735152979SAlex Elder 	return 0;
47989d475de5SAlex Elder out_err:
4799642a2537SAlex Elder 	rbd_dev->header.features = 0;
48001e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
48011e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
48029d475de5SAlex Elder 
48039d475de5SAlex Elder 	return ret;
4804a30b71b9SAlex Elder }
4805a30b71b9SAlex Elder 
4806124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
480783a06263SAlex Elder {
48082f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4809124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4810124afba2SAlex Elder 	struct rbd_client *rbdc;
4811124afba2SAlex Elder 	int ret;
4812124afba2SAlex Elder 
4813124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4814124afba2SAlex Elder 		return 0;
4815124afba2SAlex Elder 	/*
4816124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4817124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4818124afba2SAlex Elder 	 * parent/child relationships always share both.
4819124afba2SAlex Elder 	 */
4820124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4821124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4822124afba2SAlex Elder 
4823124afba2SAlex Elder 	ret = -ENOMEM;
4824124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4825124afba2SAlex Elder 	if (!parent)
4826124afba2SAlex Elder 		goto out_err;
4827124afba2SAlex Elder 
48281f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
4829124afba2SAlex Elder 	if (ret < 0)
4830124afba2SAlex Elder 		goto out_err;
4831124afba2SAlex Elder 	rbd_dev->parent = parent;
4832a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
4833124afba2SAlex Elder 
4834124afba2SAlex Elder 	return 0;
4835124afba2SAlex Elder out_err:
4836124afba2SAlex Elder 	if (parent) {
4837fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
4838124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4839124afba2SAlex Elder 		rbd_dev_destroy(parent);
4840124afba2SAlex Elder 	} else {
4841124afba2SAlex Elder 		rbd_put_client(rbdc);
4842124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4843124afba2SAlex Elder 	}
4844124afba2SAlex Elder 
4845124afba2SAlex Elder 	return ret;
4846124afba2SAlex Elder }
4847124afba2SAlex Elder 
4848200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4849124afba2SAlex Elder {
485083a06263SAlex Elder 	int ret;
485183a06263SAlex Elder 
485283a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
485383a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
485483a06263SAlex Elder 
485583a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
485683a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
485783a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
485883a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
485983a06263SAlex Elder 
486083a06263SAlex Elder 	/* Get our block major device number. */
486183a06263SAlex Elder 
486283a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
486383a06263SAlex Elder 	if (ret < 0)
486483a06263SAlex Elder 		goto err_out_id;
486583a06263SAlex Elder 	rbd_dev->major = ret;
486683a06263SAlex Elder 
486783a06263SAlex Elder 	/* Set up the blkdev mapping. */
486883a06263SAlex Elder 
486983a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
487083a06263SAlex Elder 	if (ret)
487183a06263SAlex Elder 		goto err_out_blkdev;
487283a06263SAlex Elder 
4873f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
487483a06263SAlex Elder 	if (ret)
487583a06263SAlex Elder 		goto err_out_disk;
4876f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4877f35a4deeSAlex Elder 
4878f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
4879f35a4deeSAlex Elder 	if (ret)
4880f35a4deeSAlex Elder 		goto err_out_mapping;
488183a06263SAlex Elder 
488283a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
488383a06263SAlex Elder 
4884129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
488583a06263SAlex Elder 	add_disk(rbd_dev->disk);
488683a06263SAlex Elder 
488783a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
488883a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
488983a06263SAlex Elder 
489083a06263SAlex Elder 	return ret;
48912f82ee54SAlex Elder 
4892f35a4deeSAlex Elder err_out_mapping:
4893f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
489483a06263SAlex Elder err_out_disk:
489583a06263SAlex Elder 	rbd_free_disk(rbd_dev);
489683a06263SAlex Elder err_out_blkdev:
489783a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
489883a06263SAlex Elder err_out_id:
489983a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4900d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
490183a06263SAlex Elder 
490283a06263SAlex Elder 	return ret;
490383a06263SAlex Elder }
490483a06263SAlex Elder 
4905332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4906332bb12dSAlex Elder {
4907332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4908332bb12dSAlex Elder 	size_t size;
4909332bb12dSAlex Elder 
4910332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4911332bb12dSAlex Elder 
4912332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4913332bb12dSAlex Elder 
4914332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4915332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4916332bb12dSAlex Elder 	else
4917332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4918332bb12dSAlex Elder 
4919332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4920332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4921332bb12dSAlex Elder 		return -ENOMEM;
4922332bb12dSAlex Elder 
4923332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4924332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4925332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4926332bb12dSAlex Elder 	else
4927332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4928332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4929332bb12dSAlex Elder 	return 0;
4930332bb12dSAlex Elder }
4931332bb12dSAlex Elder 
4932200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4933200a6a8bSAlex Elder {
49346fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4935200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
49366fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
49376fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
49386fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
49396fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
49406fd48b3bSAlex Elder 
4941200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
4942200a6a8bSAlex Elder }
4943200a6a8bSAlex Elder 
4944a30b71b9SAlex Elder /*
4945a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
49461f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
49471f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
49481f3ef788SAlex Elder  * object to get detailed information about the rbd image.
4949a30b71b9SAlex Elder  */
49501f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
4951a30b71b9SAlex Elder {
4952a30b71b9SAlex Elder 	int ret;
4953b644de2bSAlex Elder 	int tmp;
4954a30b71b9SAlex Elder 
4955a30b71b9SAlex Elder 	/*
49563abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
49573abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
49583abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
49593abef3b3SAlex Elder 	 * will be set to either 1 or 2.
4960a30b71b9SAlex Elder 	 */
4961a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4962a30b71b9SAlex Elder 	if (ret)
4963c0fba368SAlex Elder 		return ret;
4964c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
4965c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4966c0fba368SAlex Elder 
4967332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
4968332bb12dSAlex Elder 	if (ret)
4969332bb12dSAlex Elder 		goto err_out_format;
4970332bb12dSAlex Elder 
49711f3ef788SAlex Elder 	if (mapping) {
49721f3ef788SAlex Elder 		ret = rbd_dev_header_watch_sync(rbd_dev, true);
4973b644de2bSAlex Elder 		if (ret)
4974b644de2bSAlex Elder 			goto out_header_name;
49751f3ef788SAlex Elder 	}
4976b644de2bSAlex Elder 
4977c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
497899a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
4979a30b71b9SAlex Elder 	else
49802df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
49815655c4d9SAlex Elder 	if (ret)
4982b644de2bSAlex Elder 		goto err_out_watch;
4983a30b71b9SAlex Elder 
49849bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
49859bb81c9bSAlex Elder 	if (ret)
498633dca39fSAlex Elder 		goto err_out_probe;
49879bb81c9bSAlex Elder 
49889bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
498930d60ba2SAlex Elder 	if (ret)
499030d60ba2SAlex Elder 		goto err_out_probe;
499183a06263SAlex Elder 
499230d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
499330d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
499430d60ba2SAlex Elder 
499530d60ba2SAlex Elder 	return 0;
49966fd48b3bSAlex Elder err_out_probe:
49976fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4998b644de2bSAlex Elder err_out_watch:
49991f3ef788SAlex Elder 	if (mapping) {
50001f3ef788SAlex Elder 		tmp = rbd_dev_header_watch_sync(rbd_dev, false);
5001b644de2bSAlex Elder 		if (tmp)
50021f3ef788SAlex Elder 			rbd_warn(rbd_dev, "unable to tear down "
50031f3ef788SAlex Elder 					"watch request (%d)\n", tmp);
50041f3ef788SAlex Elder 	}
5005332bb12dSAlex Elder out_header_name:
5006332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5007332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5008332bb12dSAlex Elder err_out_format:
5009332bb12dSAlex Elder 	rbd_dev->image_format = 0;
50105655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
50115655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
50125655c4d9SAlex Elder 
50135655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
50145655c4d9SAlex Elder 
50155655c4d9SAlex Elder 	return ret;
501683a06263SAlex Elder }
501783a06263SAlex Elder 
501859c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
501959c2be1eSYehuda Sadeh 		       const char *buf,
502059c2be1eSYehuda Sadeh 		       size_t count)
5021602adf40SYehuda Sadeh {
5022cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5023dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
50244e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5025859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
50269d3997fdSAlex Elder 	struct rbd_client *rbdc;
502727cc2594SAlex Elder 	struct ceph_osd_client *osdc;
502851344a38SAlex Elder 	bool read_only;
502927cc2594SAlex Elder 	int rc = -ENOMEM;
5030602adf40SYehuda Sadeh 
5031602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5032602adf40SYehuda Sadeh 		return -ENODEV;
5033602adf40SYehuda Sadeh 
5034a725f65eSAlex Elder 	/* parse add command */
5035859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5036dc79b113SAlex Elder 	if (rc < 0)
5037bd4ba655SAlex Elder 		goto err_out_module;
503851344a38SAlex Elder 	read_only = rbd_opts->read_only;
503951344a38SAlex Elder 	kfree(rbd_opts);
504051344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5041a725f65eSAlex Elder 
50429d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
50439d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
50449d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
50450ddebc0cSAlex Elder 		goto err_out_args;
50469d3997fdSAlex Elder 	}
5047602adf40SYehuda Sadeh 
5048602adf40SYehuda Sadeh 	/* pick the pool */
50499d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
5050859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5051602adf40SYehuda Sadeh 	if (rc < 0)
5052602adf40SYehuda Sadeh 		goto err_out_client;
5053859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5054859c31dfSAlex Elder 
50550903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
50560903e875SAlex Elder 
5057c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
5058c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5059c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
50600903e875SAlex Elder 		rc = -EIO;
50610903e875SAlex Elder 		goto err_out_client;
50620903e875SAlex Elder 	}
50630903e875SAlex Elder 
5064c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5065bd4ba655SAlex Elder 	if (!rbd_dev)
5066bd4ba655SAlex Elder 		goto err_out_client;
5067c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5068c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5069602adf40SYehuda Sadeh 
50701f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5071a30b71b9SAlex Elder 	if (rc < 0)
5072c53d5893SAlex Elder 		goto err_out_rbd_dev;
507305fd6f6fSAlex Elder 
50747ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
50757ce4eef7SAlex Elder 
50767ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
50777ce4eef7SAlex Elder 		read_only = true;
50787ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
50797ce4eef7SAlex Elder 
5080b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
50813abef3b3SAlex Elder 	if (rc) {
50823abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
50833abef3b3SAlex Elder 		goto err_out_module;
50843abef3b3SAlex Elder 	}
50853abef3b3SAlex Elder 
5086602adf40SYehuda Sadeh 	return count;
5087b536f69aSAlex Elder 
5088c53d5893SAlex Elder err_out_rbd_dev:
5089c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5090bd4ba655SAlex Elder err_out_client:
50919d3997fdSAlex Elder 	rbd_put_client(rbdc);
50920ddebc0cSAlex Elder err_out_args:
5093859c31dfSAlex Elder 	rbd_spec_put(spec);
5094bd4ba655SAlex Elder err_out_module:
5095bd4ba655SAlex Elder 	module_put(THIS_MODULE);
509627cc2594SAlex Elder 
5097602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
509827cc2594SAlex Elder 
509927cc2594SAlex Elder 	return (ssize_t)rc;
5100602adf40SYehuda Sadeh }
5101602adf40SYehuda Sadeh 
5102200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5103602adf40SYehuda Sadeh {
5104593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5105602adf40SYehuda Sadeh 
5106602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5107200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
51086d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5109602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
5110200a6a8bSAlex Elder 	rbd_dev->major = 0;
5111e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5112d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5113602adf40SYehuda Sadeh }
5114602adf40SYehuda Sadeh 
511505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
511605a46afdSAlex Elder {
5117ad945fc1SAlex Elder 	while (rbd_dev->parent) {
511805a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
511905a46afdSAlex Elder 		struct rbd_device *second = first->parent;
512005a46afdSAlex Elder 		struct rbd_device *third;
512105a46afdSAlex Elder 
512205a46afdSAlex Elder 		/*
512305a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
512405a46afdSAlex Elder 		 * remove it.
512505a46afdSAlex Elder 		 */
512605a46afdSAlex Elder 		while (second && (third = second->parent)) {
512705a46afdSAlex Elder 			first = second;
512805a46afdSAlex Elder 			second = third;
512905a46afdSAlex Elder 		}
5130ad945fc1SAlex Elder 		rbd_assert(second);
51318ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5132ad945fc1SAlex Elder 		first->parent = NULL;
5133ad945fc1SAlex Elder 		first->parent_overlap = 0;
5134ad945fc1SAlex Elder 
5135ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
513605a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
513705a46afdSAlex Elder 		first->parent_spec = NULL;
513805a46afdSAlex Elder 	}
513905a46afdSAlex Elder }
514005a46afdSAlex Elder 
5141dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
5142602adf40SYehuda Sadeh 			  const char *buf,
5143602adf40SYehuda Sadeh 			  size_t count)
5144602adf40SYehuda Sadeh {
5145602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5146751cc0e3SAlex Elder 	struct list_head *tmp;
5147751cc0e3SAlex Elder 	int dev_id;
5148602adf40SYehuda Sadeh 	unsigned long ul;
514982a442d2SAlex Elder 	bool already = false;
51500d8189e1SAlex Elder 	int ret;
5151602adf40SYehuda Sadeh 
51520d8189e1SAlex Elder 	ret = strict_strtoul(buf, 10, &ul);
51530d8189e1SAlex Elder 	if (ret)
51540d8189e1SAlex Elder 		return ret;
5155602adf40SYehuda Sadeh 
5156602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5157751cc0e3SAlex Elder 	dev_id = (int)ul;
5158751cc0e3SAlex Elder 	if (dev_id != ul)
5159602adf40SYehuda Sadeh 		return -EINVAL;
5160602adf40SYehuda Sadeh 
5161602adf40SYehuda Sadeh 	ret = -ENOENT;
5162751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5163751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5164751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5165751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5166751cc0e3SAlex Elder 			ret = 0;
5167751cc0e3SAlex Elder 			break;
5168602adf40SYehuda Sadeh 		}
5169751cc0e3SAlex Elder 	}
5170751cc0e3SAlex Elder 	if (!ret) {
5171a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5172b82d167bSAlex Elder 		if (rbd_dev->open_count)
517342382b70SAlex Elder 			ret = -EBUSY;
5174b82d167bSAlex Elder 		else
517582a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
517682a442d2SAlex Elder 							&rbd_dev->flags);
5177a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5178751cc0e3SAlex Elder 	}
5179751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
518082a442d2SAlex Elder 	if (ret < 0 || already)
51811ba0f1e7SAlex Elder 		return ret;
5182751cc0e3SAlex Elder 
51831f3ef788SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, false);
51841f3ef788SAlex Elder 	if (ret)
51851f3ef788SAlex Elder 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
51869abc5990SJosh Durgin 
51879abc5990SJosh Durgin 	/*
51889abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
51899abc5990SJosh Durgin 	 * before the osd_client is shutdown
51909abc5990SJosh Durgin 	 */
51919abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
51929abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
51939875201eSJosh Durgin 	/*
51949875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
51959875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
51969875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
51979875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
51989875201eSJosh Durgin 	 */
51999875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
52008ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
520179ab7558SAlex Elder 	module_put(THIS_MODULE);
5202aafb230eSAlex Elder 
52031ba0f1e7SAlex Elder 	return count;
5204602adf40SYehuda Sadeh }
5205602adf40SYehuda Sadeh 
5206602adf40SYehuda Sadeh /*
5207602adf40SYehuda Sadeh  * create control files in sysfs
5208dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5209602adf40SYehuda Sadeh  */
5210602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5211602adf40SYehuda Sadeh {
5212dfc5606dSYehuda Sadeh 	int ret;
5213602adf40SYehuda Sadeh 
5214fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5215dfc5606dSYehuda Sadeh 	if (ret < 0)
5216dfc5606dSYehuda Sadeh 		return ret;
5217602adf40SYehuda Sadeh 
5218fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5219fed4c143SAlex Elder 	if (ret < 0)
5220fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5221602adf40SYehuda Sadeh 
5222602adf40SYehuda Sadeh 	return ret;
5223602adf40SYehuda Sadeh }
5224602adf40SYehuda Sadeh 
5225602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5226602adf40SYehuda Sadeh {
5227dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5228fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5229602adf40SYehuda Sadeh }
5230602adf40SYehuda Sadeh 
52311c2a9dfeSAlex Elder static int rbd_slab_init(void)
52321c2a9dfeSAlex Elder {
52331c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
52341c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
52351c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
52361c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
52371c2a9dfeSAlex Elder 					0, NULL);
5238868311b1SAlex Elder 	if (!rbd_img_request_cache)
5239868311b1SAlex Elder 		return -ENOMEM;
5240868311b1SAlex Elder 
5241868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5242868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5243868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5244868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5245868311b1SAlex Elder 					0, NULL);
524678c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
524778c2a44aSAlex Elder 		goto out_err;
524878c2a44aSAlex Elder 
524978c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
525078c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
525178c2a44aSAlex Elder 					MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
525278c2a44aSAlex Elder 	if (rbd_segment_name_cache)
52531c2a9dfeSAlex Elder 		return 0;
525478c2a44aSAlex Elder out_err:
525578c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
525678c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
525778c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
525878c2a44aSAlex Elder 	}
52591c2a9dfeSAlex Elder 
5260868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5261868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5262868311b1SAlex Elder 
52631c2a9dfeSAlex Elder 	return -ENOMEM;
52641c2a9dfeSAlex Elder }
52651c2a9dfeSAlex Elder 
52661c2a9dfeSAlex Elder static void rbd_slab_exit(void)
52671c2a9dfeSAlex Elder {
526878c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
526978c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
527078c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
527178c2a44aSAlex Elder 
5272868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5273868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5274868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5275868311b1SAlex Elder 
52761c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
52771c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
52781c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
52791c2a9dfeSAlex Elder }
52801c2a9dfeSAlex Elder 
5281cc344fa1SAlex Elder static int __init rbd_init(void)
5282602adf40SYehuda Sadeh {
5283602adf40SYehuda Sadeh 	int rc;
5284602adf40SYehuda Sadeh 
52851e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
52861e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
52871e32d34cSAlex Elder 
52881e32d34cSAlex Elder 		return -EINVAL;
52891e32d34cSAlex Elder 	}
52901c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5291602adf40SYehuda Sadeh 	if (rc)
5292602adf40SYehuda Sadeh 		return rc;
52931c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
52941c2a9dfeSAlex Elder 	if (rc)
52951c2a9dfeSAlex Elder 		rbd_slab_exit();
52961c2a9dfeSAlex Elder 	else
5297f0f8cef5SAlex Elder 		pr_info("loaded " RBD_DRV_NAME_LONG "\n");
52981c2a9dfeSAlex Elder 
52991c2a9dfeSAlex Elder 	return rc;
5300602adf40SYehuda Sadeh }
5301602adf40SYehuda Sadeh 
5302cc344fa1SAlex Elder static void __exit rbd_exit(void)
5303602adf40SYehuda Sadeh {
5304602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
53051c2a9dfeSAlex Elder 	rbd_slab_exit();
5306602adf40SYehuda Sadeh }
5307602adf40SYehuda Sadeh 
5308602adf40SYehuda Sadeh module_init(rbd_init);
5309602adf40SYehuda Sadeh module_exit(rbd_exit);
5310602adf40SYehuda Sadeh 
5311d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5312602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5313602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5314602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5315602adf40SYehuda Sadeh 
5316602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5317602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5318602adf40SYehuda Sadeh 
5319602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5320