xref: /openbmc/linux/drivers/block/rbd.c (revision a158073c)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44602adf40SYehuda Sadeh 
45602adf40SYehuda Sadeh #include "rbd_types.h"
46602adf40SYehuda Sadeh 
47aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
48aafb230eSAlex Elder 
49593a9e7bSAlex Elder /*
50593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
51593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
52593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
53593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
54593a9e7bSAlex Elder  */
55593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
56593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
57593a9e7bSAlex Elder 
58a2acd00eSAlex Elder /*
59a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
60a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
61a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
62a2acd00eSAlex Elder  * -EINVAL without updating it.
63a2acd00eSAlex Elder  */
64a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
65a2acd00eSAlex Elder {
66a2acd00eSAlex Elder 	unsigned int counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
69a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
70a2acd00eSAlex Elder 		return (int)counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	atomic_dec(v);
73a2acd00eSAlex Elder 
74a2acd00eSAlex Elder 	return -EINVAL;
75a2acd00eSAlex Elder }
76a2acd00eSAlex Elder 
77a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
78a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
79a2acd00eSAlex Elder {
80a2acd00eSAlex Elder 	int counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
83a2acd00eSAlex Elder 	if (counter >= 0)
84a2acd00eSAlex Elder 		return counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	atomic_inc(v);
87a2acd00eSAlex Elder 
88a2acd00eSAlex Elder 	return -EINVAL;
89a2acd00eSAlex Elder }
90a2acd00eSAlex Elder 
91f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
92f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)"
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
95602adf40SYehuda Sadeh 
96d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
97d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
98d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
99d4b125e9SAlex Elder 
10035d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
101602adf40SYehuda Sadeh 
102602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
103602adf40SYehuda Sadeh 
1049682fc6dSAlex Elder #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
1059682fc6dSAlex Elder 
1069e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
1079e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
108589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
1099e15b77dSAlex Elder 
1101e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
111589d30e0SAlex Elder 
112d889140cSAlex Elder /* Feature bits */
113d889140cSAlex Elder 
1145cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING	(1<<0)
1155cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2	(1<<1)
1165cbf6f12SAlex Elder #define RBD_FEATURES_ALL \
1175cbf6f12SAlex Elder 	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
118d889140cSAlex Elder 
119d889140cSAlex Elder /* Features supported by this (client software) implementation. */
120d889140cSAlex Elder 
121770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
122d889140cSAlex Elder 
12381a89793SAlex Elder /*
12481a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
12581a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
12681a89793SAlex Elder  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
12781a89793SAlex Elder  * enough to hold all possible device names.
12881a89793SAlex Elder  */
129602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
13081a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
131602adf40SYehuda Sadeh 
132602adf40SYehuda Sadeh /*
133602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
134602adf40SYehuda Sadeh  */
135602adf40SYehuda Sadeh struct rbd_image_header {
136f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
137849b4260SAlex Elder 	char *object_prefix;
138602adf40SYehuda Sadeh 	__u8 obj_order;
139602adf40SYehuda Sadeh 	__u8 crypt_type;
140602adf40SYehuda Sadeh 	__u8 comp_type;
141f35a4deeSAlex Elder 	u64 stripe_unit;
142f35a4deeSAlex Elder 	u64 stripe_count;
143f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
144602adf40SYehuda Sadeh 
145f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
146f84344f3SAlex Elder 	u64 image_size;
147f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
148f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
149f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15059c2be1eSYehuda Sadeh };
15159c2be1eSYehuda Sadeh 
1520d7dbfceSAlex Elder /*
1530d7dbfceSAlex Elder  * An rbd image specification.
1540d7dbfceSAlex Elder  *
1550d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
156c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
157c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
158c66c6e0cSAlex Elder  *
159c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
160c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
161c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
162c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
163c66c6e0cSAlex Elder  *
164c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
165c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
166c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
167c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
168c66c6e0cSAlex Elder  * is shared between the parent and child).
169c66c6e0cSAlex Elder  *
170c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
171c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
172c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
173c66c6e0cSAlex Elder  *
174c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
175c66c6e0cSAlex Elder  * could be a null pointer).
1760d7dbfceSAlex Elder  */
1770d7dbfceSAlex Elder struct rbd_spec {
1780d7dbfceSAlex Elder 	u64		pool_id;
179ecb4dc22SAlex Elder 	const char	*pool_name;
1800d7dbfceSAlex Elder 
181ecb4dc22SAlex Elder 	const char	*image_id;
182ecb4dc22SAlex Elder 	const char	*image_name;
1830d7dbfceSAlex Elder 
1840d7dbfceSAlex Elder 	u64		snap_id;
185ecb4dc22SAlex Elder 	const char	*snap_name;
1860d7dbfceSAlex Elder 
1870d7dbfceSAlex Elder 	struct kref	kref;
1880d7dbfceSAlex Elder };
1890d7dbfceSAlex Elder 
190602adf40SYehuda Sadeh /*
191f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
192602adf40SYehuda Sadeh  */
193602adf40SYehuda Sadeh struct rbd_client {
194602adf40SYehuda Sadeh 	struct ceph_client	*client;
195602adf40SYehuda Sadeh 	struct kref		kref;
196602adf40SYehuda Sadeh 	struct list_head	node;
197602adf40SYehuda Sadeh };
198602adf40SYehuda Sadeh 
199bf0d5f50SAlex Elder struct rbd_img_request;
200bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
201bf0d5f50SAlex Elder 
202bf0d5f50SAlex Elder #define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
203bf0d5f50SAlex Elder 
204bf0d5f50SAlex Elder struct rbd_obj_request;
205bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
206bf0d5f50SAlex Elder 
/* Value stored in the "type" field of an rbd_obj_request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES,
};
210bf0d5f50SAlex Elder 
/* Per-object-request flags; each one reads 0 = first meaning, 1 = second */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done / done */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone / image */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no / yes */
	OBJ_REQ_EXISTS,		/* target exists: no / yes */
};
217926f9b3fSAlex Elder 
218bf0d5f50SAlex Elder struct rbd_obj_request {
219bf0d5f50SAlex Elder 	const char		*object_name;
220bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
221bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
222926f9b3fSAlex Elder 	unsigned long		flags;
223bf0d5f50SAlex Elder 
224c5b5ef6cSAlex Elder 	/*
225c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
226c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
227c5b5ef6cSAlex Elder 	 *
228c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
229c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
230c5b5ef6cSAlex Elder 	 *
231c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
232c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
233c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
234c5b5ef6cSAlex Elder 	 *
235c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
236c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
237c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
238c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
239c5b5ef6cSAlex Elder 	 */
240c5b5ef6cSAlex Elder 	union {
241c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
242c5b5ef6cSAlex Elder 		struct {
243bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
244c5b5ef6cSAlex Elder 			u64			img_offset;
245c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
246c5b5ef6cSAlex Elder 			struct list_head	links;
247c5b5ef6cSAlex Elder 		};
248c5b5ef6cSAlex Elder 	};
249bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
250bf0d5f50SAlex Elder 
251bf0d5f50SAlex Elder 	enum obj_request_type	type;
252788e2df3SAlex Elder 	union {
253bf0d5f50SAlex Elder 		struct bio	*bio_list;
254788e2df3SAlex Elder 		struct {
255788e2df3SAlex Elder 			struct page	**pages;
256788e2df3SAlex Elder 			u32		page_count;
257788e2df3SAlex Elder 		};
258788e2df3SAlex Elder 	};
2590eefd470SAlex Elder 	struct page		**copyup_pages;
260ebda6408SAlex Elder 	u32			copyup_page_count;
261bf0d5f50SAlex Elder 
262bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
263bf0d5f50SAlex Elder 
264bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2651b83bef2SSage Weil 	int			result;
266bf0d5f50SAlex Elder 
267bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
268788e2df3SAlex Elder 	struct completion	completion;
269bf0d5f50SAlex Elder 
270bf0d5f50SAlex Elder 	struct kref		kref;
271bf0d5f50SAlex Elder };
272bf0d5f50SAlex Elder 
/* Per-image-request flags; each one reads 0 = first meaning, 1 = second */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read / write */
	IMG_REQ_CHILD,		/* initiator: block / child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal / layered */
};
2780c425248SAlex Elder 
279bf0d5f50SAlex Elder struct rbd_img_request {
280bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
281bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
282bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2830c425248SAlex Elder 	unsigned long		flags;
284bf0d5f50SAlex Elder 	union {
285bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2869849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2879849e986SAlex Elder 	};
2889849e986SAlex Elder 	union {
2899849e986SAlex Elder 		struct request		*rq;		/* block request */
2909849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
291bf0d5f50SAlex Elder 	};
2923d7efd18SAlex Elder 	struct page		**copyup_pages;
293ebda6408SAlex Elder 	u32			copyup_page_count;
294bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
295bf0d5f50SAlex Elder 	u32			next_completion;
296bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
29755f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
298a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
299bf0d5f50SAlex Elder 
300bf0d5f50SAlex Elder 	u32			obj_request_count;
301bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
302bf0d5f50SAlex Elder 
303bf0d5f50SAlex Elder 	struct kref		kref;
304bf0d5f50SAlex Elder };
305bf0d5f50SAlex Elder 
/* Visit each object request on an image request's obj_requests list */
#define for_each_obj_request(img_req, obj_req) \
	list_for_each_entry(obj_req, &(img_req)->obj_requests, links)
/* Same walk, built on list_for_each_entry_from() */
#define for_each_obj_request_from(img_req, obj_req) \
	list_for_each_entry_from(obj_req, &(img_req)->obj_requests, links)
/* Built on the _safe_reverse list iterator, so this one walks backward */
#define for_each_obj_request_safe(img_req, obj_req, next) \
	list_for_each_entry_safe_reverse(obj_req, next, \
					 &(img_req)->obj_requests, links)
312bf0d5f50SAlex Elder 
313f84344f3SAlex Elder struct rbd_mapping {
31499c1f08fSAlex Elder 	u64                     size;
31534b13184SAlex Elder 	u64                     features;
316f84344f3SAlex Elder 	bool			read_only;
317f84344f3SAlex Elder };
318f84344f3SAlex Elder 
319602adf40SYehuda Sadeh /*
320602adf40SYehuda Sadeh  * a single device
321602adf40SYehuda Sadeh  */
322602adf40SYehuda Sadeh struct rbd_device {
323de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
324602adf40SYehuda Sadeh 
325602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
326602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
327602adf40SYehuda Sadeh 
328a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
329602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
330602adf40SYehuda Sadeh 
331602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
332602adf40SYehuda Sadeh 
333b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
334602adf40SYehuda Sadeh 
335602adf40SYehuda Sadeh 	struct rbd_image_header	header;
336b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3370d7dbfceSAlex Elder 	struct rbd_spec		*spec;
338602adf40SYehuda Sadeh 
3390d7dbfceSAlex Elder 	char			*header_name;
340971f839aSAlex Elder 
3410903e875SAlex Elder 	struct ceph_file_layout	layout;
3420903e875SAlex Elder 
34359c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
344975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
34559c2be1eSYehuda Sadeh 
34686b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
34786b00e0dSAlex Elder 	u64			parent_overlap;
348a2acd00eSAlex Elder 	atomic_t		parent_ref;
3492f82ee54SAlex Elder 	struct rbd_device	*parent;
35086b00e0dSAlex Elder 
351c666601aSJosh Durgin 	/* protects updating the header */
352c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
353f84344f3SAlex Elder 
354f84344f3SAlex Elder 	struct rbd_mapping	mapping;
355602adf40SYehuda Sadeh 
356602adf40SYehuda Sadeh 	struct list_head	node;
357dfc5606dSYehuda Sadeh 
358dfc5606dSYehuda Sadeh 	/* sysfs related */
359dfc5606dSYehuda Sadeh 	struct device		dev;
360b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
361dfc5606dSYehuda Sadeh };
362dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomicity is needed,
 * access is protected by rbd_dev->lock.
 *
 * At present that applies only to the "removing" flag, which is
 * coupled with the "open_count" field.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3746d292906SAlex Elder 
375cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
376e124a82fSAlex Elder 
377602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
378e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
379e124a82fSAlex Elder 
380602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
381432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
382602adf40SYehuda Sadeh 
38378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
38478c2a44aSAlex Elder 
3851c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
386868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
38778c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
3881c2a9dfeSAlex Elder 
3893d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3903d7efd18SAlex Elder 
391200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
392dfc5606dSYehuda Sadeh 
393f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
394f0f8cef5SAlex Elder 		       size_t count);
395f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
396f0f8cef5SAlex Elder 			  size_t count);
3971f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
398a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
399f0f8cef5SAlex Elder 
400f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
401f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
402f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
403f0f8cef5SAlex Elder 	__ATTR_NULL
404f0f8cef5SAlex Elder };
405f0f8cef5SAlex Elder 
406f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
407f0f8cef5SAlex Elder 	.name		= "rbd",
408f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
409f0f8cef5SAlex Elder };
410f0f8cef5SAlex Elder 
/*
 * Release method for rbd_root_dev.  That device is a static object,
 * so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
414f0f8cef5SAlex Elder 
415f0f8cef5SAlex Elder static struct device rbd_root_dev = {
416f0f8cef5SAlex Elder 	.init_name =    "rbd",
417f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
418f0f8cef5SAlex Elder };
419f0f8cef5SAlex Elder 
42006ecc6cbSAlex Elder static __printf(2, 3)
42106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
42206ecc6cbSAlex Elder {
42306ecc6cbSAlex Elder 	struct va_format vaf;
42406ecc6cbSAlex Elder 	va_list args;
42506ecc6cbSAlex Elder 
42606ecc6cbSAlex Elder 	va_start(args, fmt);
42706ecc6cbSAlex Elder 	vaf.fmt = fmt;
42806ecc6cbSAlex Elder 	vaf.va = &args;
42906ecc6cbSAlex Elder 
43006ecc6cbSAlex Elder 	if (!rbd_dev)
43106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
43206ecc6cbSAlex Elder 	else if (rbd_dev->disk)
43306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
43406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
43506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
43606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
43706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
43806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
43906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
44006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
44106ecc6cbSAlex Elder 	else	/* punt */
44206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
44306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
44406ecc6cbSAlex Elder 	va_end(args);
44506ecc6cbSAlex Elder }
44606ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, log the failed
 * expression together with its function and line, then BUG().
 * Without RBD_DEBUG it expands to nothing and @expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it is a
 * single statement and remains safe inside an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
459dfc5606dSYehuda Sadeh 
460b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
46105a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
46205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
4638b3e1a56SAlex Elder 
464cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
4652df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
4662df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
46754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
46854cac61fSAlex Elder 					u64 snap_id);
4692ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4702ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
4712ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4722ad3d716SAlex Elder 		u64 *snap_features);
4732ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
47459c2be1eSYehuda Sadeh 
475602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
476602adf40SYehuda Sadeh {
477f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
478b82d167bSAlex Elder 	bool removing = false;
479602adf40SYehuda Sadeh 
480f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
481602adf40SYehuda Sadeh 		return -EROFS;
482602adf40SYehuda Sadeh 
483a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
484b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
485b82d167bSAlex Elder 		removing = true;
486b82d167bSAlex Elder 	else
487b82d167bSAlex Elder 		rbd_dev->open_count++;
488a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
489b82d167bSAlex Elder 	if (removing)
490b82d167bSAlex Elder 		return -ENOENT;
491b82d167bSAlex Elder 
492c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
493f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
494340c7a2bSAlex Elder 
495602adf40SYehuda Sadeh 	return 0;
496602adf40SYehuda Sadeh }
497602adf40SYehuda Sadeh 
498db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
499dfc5606dSYehuda Sadeh {
500dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
501b82d167bSAlex Elder 	unsigned long open_count_before;
502b82d167bSAlex Elder 
503a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
504b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
505a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
506b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
507dfc5606dSYehuda Sadeh 
508c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
509dfc5606dSYehuda Sadeh }
510dfc5606dSYehuda Sadeh 
511602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
512602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
513602adf40SYehuda Sadeh 	.open			= rbd_open,
514dfc5606dSYehuda Sadeh 	.release		= rbd_release,
515602adf40SYehuda Sadeh };
516602adf40SYehuda Sadeh 
517602adf40SYehuda Sadeh /*
5187262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
519cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
520602adf40SYehuda Sadeh  */
521f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
522602adf40SYehuda Sadeh {
523602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
524602adf40SYehuda Sadeh 	int ret = -ENOMEM;
525602adf40SYehuda Sadeh 
52637206ee5SAlex Elder 	dout("%s:\n", __func__);
527602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
528602adf40SYehuda Sadeh 	if (!rbdc)
529602adf40SYehuda Sadeh 		goto out_opt;
530602adf40SYehuda Sadeh 
531602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
532602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
533602adf40SYehuda Sadeh 
53443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
535602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
53608f75463SAlex Elder 		goto out_rbdc;
53743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
538602adf40SYehuda Sadeh 
539602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
540602adf40SYehuda Sadeh 	if (ret < 0)
54108f75463SAlex Elder 		goto out_client;
542602adf40SYehuda Sadeh 
543432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
544602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
545432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
546602adf40SYehuda Sadeh 
54737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
548bc534d86SAlex Elder 
549602adf40SYehuda Sadeh 	return rbdc;
55008f75463SAlex Elder out_client:
551602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
55208f75463SAlex Elder out_rbdc:
553602adf40SYehuda Sadeh 	kfree(rbdc);
554602adf40SYehuda Sadeh out_opt:
55543ae4701SAlex Elder 	if (ceph_opts)
55643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
55737206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
55837206ee5SAlex Elder 
55928f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
560602adf40SYehuda Sadeh }
561602adf40SYehuda Sadeh 
5622f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5632f82ee54SAlex Elder {
5642f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5652f82ee54SAlex Elder 
5662f82ee54SAlex Elder 	return rbdc;
5672f82ee54SAlex Elder }
5682f82ee54SAlex Elder 
569602adf40SYehuda Sadeh /*
5701f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5711f7ba331SAlex Elder  * found, bump its reference count.
572602adf40SYehuda Sadeh  */
5731f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
574602adf40SYehuda Sadeh {
575602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5761f7ba331SAlex Elder 	bool found = false;
577602adf40SYehuda Sadeh 
57843ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
579602adf40SYehuda Sadeh 		return NULL;
580602adf40SYehuda Sadeh 
5811f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5821f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5831f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5842f82ee54SAlex Elder 			__rbd_get_client(client_node);
5852f82ee54SAlex Elder 
5861f7ba331SAlex Elder 			found = true;
5871f7ba331SAlex Elder 			break;
5881f7ba331SAlex Elder 		}
5891f7ba331SAlex Elder 	}
5901f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5911f7ba331SAlex Elder 
5921f7ba331SAlex Elder 	return found ? client_node : NULL;
593602adf40SYehuda Sadeh }
594602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* values are boundary markers:
 * parse_rbd_opts_token() compares a token against them to tell which
 * kind of argument (int, string, Boolean) it carries.
 */
enum {
	/* tokens taking an int argument go before this marker */
	Opt_last_int,
	/* tokens taking a string argument go before this marker */
	Opt_last_string,
	/* Boolean tokens lie between Opt_last_string and Opt_last_bool */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
60859c2be1eSYehuda Sadeh 
60943ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
61059c2be1eSYehuda Sadeh 	/* int args above */
61159c2be1eSYehuda Sadeh 	/* string args above */
612be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
613cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
614cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
615cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
616cc0538b6SAlex Elder 	/* Boolean args above */
61759c2be1eSYehuda Sadeh 	{-1, NULL}
61859c2be1eSYehuda Sadeh };
61959c2be1eSYehuda Sadeh 
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

/* Default value for rbd_options.read_only */
#define RBD_READ_ONLY_DEFAULT	false
62598571b5aSAlex Elder 
62659c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
62759c2be1eSYehuda Sadeh {
62843ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
62959c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
63059c2be1eSYehuda Sadeh 	int token, intval, ret;
63159c2be1eSYehuda Sadeh 
63243ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
63359c2be1eSYehuda Sadeh 	if (token < 0)
63459c2be1eSYehuda Sadeh 		return -EINVAL;
63559c2be1eSYehuda Sadeh 
63659c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
63759c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
63859c2be1eSYehuda Sadeh 		if (ret < 0) {
63959c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
64059c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
64159c2be1eSYehuda Sadeh 			return ret;
64259c2be1eSYehuda Sadeh 		}
64359c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
64459c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
64559c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
64659c2be1eSYehuda Sadeh 		     argstr[0].from);
647cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
648cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
64959c2be1eSYehuda Sadeh 	} else {
65059c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
65159c2be1eSYehuda Sadeh 	}
65259c2be1eSYehuda Sadeh 
65359c2be1eSYehuda Sadeh 	switch (token) {
654cc0538b6SAlex Elder 	case Opt_read_only:
655cc0538b6SAlex Elder 		rbd_opts->read_only = true;
656cc0538b6SAlex Elder 		break;
657cc0538b6SAlex Elder 	case Opt_read_write:
658cc0538b6SAlex Elder 		rbd_opts->read_only = false;
659cc0538b6SAlex Elder 		break;
66059c2be1eSYehuda Sadeh 	default:
661aafb230eSAlex Elder 		rbd_assert(false);
662aafb230eSAlex Elder 		break;
66359c2be1eSYehuda Sadeh 	}
66459c2be1eSYehuda Sadeh 	return 0;
66559c2be1eSYehuda Sadeh }
66659c2be1eSYehuda Sadeh 
66759c2be1eSYehuda Sadeh /*
668602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
6697262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
6707262cfcaSAlex Elder  * function.
671602adf40SYehuda Sadeh  */
6729d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
673602adf40SYehuda Sadeh {
674f8c38929SAlex Elder 	struct rbd_client *rbdc;
67559c2be1eSYehuda Sadeh 
676cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
6771f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
6789d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
67943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
6809d3997fdSAlex Elder 	else
681f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
682cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
683d720bcb0SAlex Elder 
6849d3997fdSAlex Elder 	return rbdc;
685602adf40SYehuda Sadeh }
686602adf40SYehuda Sadeh 
687602adf40SYehuda Sadeh /*
688602adf40SYehuda Sadeh  * Destroy ceph client
689d23a4b3fSAlex Elder  *
690432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
691602adf40SYehuda Sadeh  */
692602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
693602adf40SYehuda Sadeh {
694602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
695602adf40SYehuda Sadeh 
69637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
697cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
698602adf40SYehuda Sadeh 	list_del(&rbdc->node);
699cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
700602adf40SYehuda Sadeh 
701602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
702602adf40SYehuda Sadeh 	kfree(rbdc);
703602adf40SYehuda Sadeh }
704602adf40SYehuda Sadeh 
705602adf40SYehuda Sadeh /*
706602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
707602adf40SYehuda Sadeh  * it.
708602adf40SYehuda Sadeh  */
7099d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
710602adf40SYehuda Sadeh {
711c53d5893SAlex Elder 	if (rbdc)
7129d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
713602adf40SYehuda Sadeh }
714602adf40SYehuda Sadeh 
715a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
716a30b71b9SAlex Elder {
717a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
718a30b71b9SAlex Elder }
719a30b71b9SAlex Elder 
7208e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
7218e94af8eSAlex Elder {
722103a150fSAlex Elder 	size_t size;
723103a150fSAlex Elder 	u32 snap_count;
724103a150fSAlex Elder 
725103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
726103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
727103a150fSAlex Elder 		return false;
728103a150fSAlex Elder 
729db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
730db2388b6SAlex Elder 
731db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
732db2388b6SAlex Elder 		return false;
733db2388b6SAlex Elder 
734db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
735db2388b6SAlex Elder 
736db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
737db2388b6SAlex Elder 		return false;
738db2388b6SAlex Elder 
739103a150fSAlex Elder 	/*
740103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
741103a150fSAlex Elder 	 * that limits the number of snapshots.
742103a150fSAlex Elder 	 */
743103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
744103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
745103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
746103a150fSAlex Elder 		return false;
747103a150fSAlex Elder 
748103a150fSAlex Elder 	/*
749103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
750103a150fSAlex Elder 	 * header must also be representable in a size_t.
751103a150fSAlex Elder 	 */
752103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
753103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
754103a150fSAlex Elder 		return false;
755103a150fSAlex Elder 
756103a150fSAlex Elder 	return true;
7578e94af8eSAlex Elder }
7588e94af8eSAlex Elder 
759602adf40SYehuda Sadeh /*
760bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
761bb23e37aSAlex Elder  * on-disk header.
762602adf40SYehuda Sadeh  */
763662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
7644156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
765602adf40SYehuda Sadeh {
766662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
767bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
768bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
769bb23e37aSAlex Elder 	char *object_prefix = NULL;
770bb23e37aSAlex Elder 	char *snap_names = NULL;
771bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
772ccece235SAlex Elder 	u32 snap_count;
773d2bb24e5SAlex Elder 	size_t size;
774bb23e37aSAlex Elder 	int ret = -ENOMEM;
775621901d6SAlex Elder 	u32 i;
776602adf40SYehuda Sadeh 
777bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
778103a150fSAlex Elder 
779bb23e37aSAlex Elder 	if (first_time) {
780bb23e37aSAlex Elder 		size_t len;
781bb23e37aSAlex Elder 
782bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
783bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
784bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
785bb23e37aSAlex Elder 		if (!object_prefix)
786602adf40SYehuda Sadeh 			return -ENOMEM;
787bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
788bb23e37aSAlex Elder 		object_prefix[len] = '\0';
789bb23e37aSAlex Elder 	}
79000f1f36fSAlex Elder 
791bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
792d2bb24e5SAlex Elder 
793602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
794bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
795bb23e37aSAlex Elder 	if (!snapc)
796bb23e37aSAlex Elder 		goto out_err;
797bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
798602adf40SYehuda Sadeh 	if (snap_count) {
799bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
800f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
801f785cc1dSAlex Elder 
802bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
803621901d6SAlex Elder 
804f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
805bb23e37aSAlex Elder 			goto out_2big;
806bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
807bb23e37aSAlex Elder 		if (!snap_names)
808602adf40SYehuda Sadeh 			goto out_err;
809bb23e37aSAlex Elder 
810bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
811bb23e37aSAlex Elder 
812bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
813bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
814bb23e37aSAlex Elder 		if (!snap_sizes)
815bb23e37aSAlex Elder 			goto out_err;
816bb23e37aSAlex Elder 
817f785cc1dSAlex Elder 		/*
818bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
819bb23e37aSAlex Elder 		 * and size.
820bb23e37aSAlex Elder 		 *
82199a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
822bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
823f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
824f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
825f785cc1dSAlex Elder 		 */
826bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
827bb23e37aSAlex Elder 		snaps = ondisk->snaps;
828bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
829bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
830bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
831bb23e37aSAlex Elder 		}
832602adf40SYehuda Sadeh 	}
833849b4260SAlex Elder 
834bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
835bb23e37aSAlex Elder 
836bb23e37aSAlex Elder 	if (first_time) {
837bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
838602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
839602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
840602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
841bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
842bb23e37aSAlex Elder 		header->stripe_unit = 0;
843bb23e37aSAlex Elder 		header->stripe_count = 0;
844bb23e37aSAlex Elder 		header->features = 0;
845662518b1SAlex Elder 	} else {
846662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
847662518b1SAlex Elder 		kfree(header->snap_names);
848662518b1SAlex Elder 		kfree(header->snap_sizes);
849bb23e37aSAlex Elder 	}
8506a52325fSAlex Elder 
851bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
852621901d6SAlex Elder 
853f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
854bb23e37aSAlex Elder 	header->snapc = snapc;
855bb23e37aSAlex Elder 	header->snap_names = snap_names;
856bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
857468521c1SAlex Elder 
858662518b1SAlex Elder 	/* Make sure mapping size is consistent with header info */
859662518b1SAlex Elder 
860662518b1SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
861662518b1SAlex Elder 		if (rbd_dev->mapping.size != header->image_size)
862662518b1SAlex Elder 			rbd_dev->mapping.size = header->image_size;
863662518b1SAlex Elder 
864602adf40SYehuda Sadeh 	return 0;
865bb23e37aSAlex Elder out_2big:
866bb23e37aSAlex Elder 	ret = -EIO;
8676a52325fSAlex Elder out_err:
868bb23e37aSAlex Elder 	kfree(snap_sizes);
869bb23e37aSAlex Elder 	kfree(snap_names);
870bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
871bb23e37aSAlex Elder 	kfree(object_prefix);
872ccece235SAlex Elder 
873bb23e37aSAlex Elder 	return ret;
874602adf40SYehuda Sadeh }
875602adf40SYehuda Sadeh 
8769682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
8779682fc6dSAlex Elder {
8789682fc6dSAlex Elder 	const char *snap_name;
8799682fc6dSAlex Elder 
8809682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
8819682fc6dSAlex Elder 
8829682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
8839682fc6dSAlex Elder 
8849682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
8859682fc6dSAlex Elder 	while (which--)
8869682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
8879682fc6dSAlex Elder 
8889682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
8899682fc6dSAlex Elder }
8909682fc6dSAlex Elder 
89130d1cff8SAlex Elder /*
89230d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
89330d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
89430d1cff8SAlex Elder  */
89530d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
89630d1cff8SAlex Elder {
89730d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
89830d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
89930d1cff8SAlex Elder 
90030d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
90130d1cff8SAlex Elder 		return 1;
90230d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
90330d1cff8SAlex Elder }
90430d1cff8SAlex Elder 
90530d1cff8SAlex Elder /*
90630d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
90730d1cff8SAlex Elder  * present.
90830d1cff8SAlex Elder  *
90930d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
91030d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
91130d1cff8SAlex Elder  *
91230d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
91330d1cff8SAlex Elder  * reverse order, highest snapshot id first.
91430d1cff8SAlex Elder  */
9159682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
9169682fc6dSAlex Elder {
9179682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
91830d1cff8SAlex Elder 	u64 *found;
9199682fc6dSAlex Elder 
92030d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
92130d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
9229682fc6dSAlex Elder 
92330d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
9249682fc6dSAlex Elder }
9259682fc6dSAlex Elder 
9262ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
9272ad3d716SAlex Elder 					u64 snap_id)
92854cac61fSAlex Elder {
92954cac61fSAlex Elder 	u32 which;
93054cac61fSAlex Elder 
93154cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
93254cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
93354cac61fSAlex Elder 		return NULL;
93454cac61fSAlex Elder 
93554cac61fSAlex Elder 	return _rbd_dev_v1_snap_name(rbd_dev, which);
93654cac61fSAlex Elder }
93754cac61fSAlex Elder 
9389e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
9399e15b77dSAlex Elder {
9409e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
9419e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
9429e15b77dSAlex Elder 
94354cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
94454cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
94554cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
9469e15b77dSAlex Elder 
94754cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
9489e15b77dSAlex Elder }
9499e15b77dSAlex Elder 
9502ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
9512ad3d716SAlex Elder 				u64 *snap_size)
952602adf40SYehuda Sadeh {
9532ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9542ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9552ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
9562ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9572ad3d716SAlex Elder 		u32 which;
95800f1f36fSAlex Elder 
9592ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
9602ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
9612ad3d716SAlex Elder 			return -ENOENT;
96200f1f36fSAlex Elder 
9632ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
9642ad3d716SAlex Elder 	} else {
9652ad3d716SAlex Elder 		u64 size = 0;
9662ad3d716SAlex Elder 		int ret;
9672ad3d716SAlex Elder 
9682ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
9692ad3d716SAlex Elder 		if (ret)
9702ad3d716SAlex Elder 			return ret;
9712ad3d716SAlex Elder 
9722ad3d716SAlex Elder 		*snap_size = size;
9732ad3d716SAlex Elder 	}
9742ad3d716SAlex Elder 	return 0;
9752ad3d716SAlex Elder }
9762ad3d716SAlex Elder 
9772ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
9782ad3d716SAlex Elder 			u64 *snap_features)
9792ad3d716SAlex Elder {
9802ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9812ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9822ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
9832ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9842ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
9852ad3d716SAlex Elder 	} else {
9862ad3d716SAlex Elder 		u64 features = 0;
9872ad3d716SAlex Elder 		int ret;
9882ad3d716SAlex Elder 
9892ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
9902ad3d716SAlex Elder 		if (ret)
9912ad3d716SAlex Elder 			return ret;
9922ad3d716SAlex Elder 
9932ad3d716SAlex Elder 		*snap_features = features;
9942ad3d716SAlex Elder 	}
9952ad3d716SAlex Elder 	return 0;
99600f1f36fSAlex Elder }
997602adf40SYehuda Sadeh 
998d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
999602adf40SYehuda Sadeh {
10008f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
10012ad3d716SAlex Elder 	u64 size = 0;
10022ad3d716SAlex Elder 	u64 features = 0;
10032ad3d716SAlex Elder 	int ret;
10048b0241f8SAlex Elder 
10052ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
10062ad3d716SAlex Elder 	if (ret)
10072ad3d716SAlex Elder 		return ret;
10082ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
10092ad3d716SAlex Elder 	if (ret)
10102ad3d716SAlex Elder 		return ret;
10112ad3d716SAlex Elder 
10122ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
10132ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
10142ad3d716SAlex Elder 
10158b0241f8SAlex Elder 	return 0;
1016602adf40SYehuda Sadeh }
1017602adf40SYehuda Sadeh 
1018d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1019d1cf5788SAlex Elder {
1020d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1021d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1022200a6a8bSAlex Elder }
1023200a6a8bSAlex Elder 
102498571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1025602adf40SYehuda Sadeh {
102665ccfe21SAlex Elder 	char *name;
102765ccfe21SAlex Elder 	u64 segment;
102865ccfe21SAlex Elder 	int ret;
10293a96d5cdSJosh Durgin 	char *name_format;
1030602adf40SYehuda Sadeh 
103178c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
103265ccfe21SAlex Elder 	if (!name)
103365ccfe21SAlex Elder 		return NULL;
103465ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
10353a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
10363a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
10373a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
10383a96d5cdSJosh Durgin 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
103965ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
10402fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
104165ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
104265ccfe21SAlex Elder 			segment, ret);
104365ccfe21SAlex Elder 		kfree(name);
104465ccfe21SAlex Elder 		name = NULL;
104565ccfe21SAlex Elder 	}
1046602adf40SYehuda Sadeh 
104765ccfe21SAlex Elder 	return name;
104865ccfe21SAlex Elder }
1049602adf40SYehuda Sadeh 
105078c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
105178c2a44aSAlex Elder {
105278c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
105378c2a44aSAlex Elder 
105478c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
105578c2a44aSAlex Elder }
105678c2a44aSAlex Elder 
105765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
105865ccfe21SAlex Elder {
105965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1060602adf40SYehuda Sadeh 
106165ccfe21SAlex Elder 	return offset & (segment_size - 1);
106265ccfe21SAlex Elder }
106365ccfe21SAlex Elder 
106465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
106565ccfe21SAlex Elder 				u64 offset, u64 length)
106665ccfe21SAlex Elder {
106765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
106865ccfe21SAlex Elder 
106965ccfe21SAlex Elder 	offset &= segment_size - 1;
107065ccfe21SAlex Elder 
1071aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
107265ccfe21SAlex Elder 	if (offset + length > segment_size)
107365ccfe21SAlex Elder 		length = segment_size - offset;
107465ccfe21SAlex Elder 
107565ccfe21SAlex Elder 	return length;
1076602adf40SYehuda Sadeh }
1077602adf40SYehuda Sadeh 
1078602adf40SYehuda Sadeh /*
1079029bcbd8SJosh Durgin  * returns the size of an object in the image
1080029bcbd8SJosh Durgin  */
1081029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1082029bcbd8SJosh Durgin {
1083029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1084029bcbd8SJosh Durgin }
1085029bcbd8SJosh Durgin 
1086029bcbd8SJosh Durgin /*
1087602adf40SYehuda Sadeh  * bio helpers
1088602adf40SYehuda Sadeh  */
1089602adf40SYehuda Sadeh 
1090602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1091602adf40SYehuda Sadeh {
1092602adf40SYehuda Sadeh 	struct bio *tmp;
1093602adf40SYehuda Sadeh 
1094602adf40SYehuda Sadeh 	while (chain) {
1095602adf40SYehuda Sadeh 		tmp = chain;
1096602adf40SYehuda Sadeh 		chain = chain->bi_next;
1097602adf40SYehuda Sadeh 		bio_put(tmp);
1098602adf40SYehuda Sadeh 	}
1099602adf40SYehuda Sadeh }
1100602adf40SYehuda Sadeh 
1101602adf40SYehuda Sadeh /*
1102602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1103602adf40SYehuda Sadeh  */
1104602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1105602adf40SYehuda Sadeh {
1106602adf40SYehuda Sadeh 	struct bio_vec *bv;
1107602adf40SYehuda Sadeh 	unsigned long flags;
1108602adf40SYehuda Sadeh 	void *buf;
1109602adf40SYehuda Sadeh 	int i;
1110602adf40SYehuda Sadeh 	int pos = 0;
1111602adf40SYehuda Sadeh 
1112602adf40SYehuda Sadeh 	while (chain) {
1113602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
1114602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
1115602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
1116602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
1117602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
1118602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
1119e2156054SAlex Elder 				flush_dcache_page(bv->bv_page);
112085b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1121602adf40SYehuda Sadeh 			}
1122602adf40SYehuda Sadeh 			pos += bv->bv_len;
1123602adf40SYehuda Sadeh 		}
1124602adf40SYehuda Sadeh 
1125602adf40SYehuda Sadeh 		chain = chain->bi_next;
1126602adf40SYehuda Sadeh 	}
1127602adf40SYehuda Sadeh }
1128602adf40SYehuda Sadeh 
1129602adf40SYehuda Sadeh /*
1130b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1131b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1132b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1133b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1134b9434c5bSAlex Elder  */
1135b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1136b9434c5bSAlex Elder {
1137b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1138b9434c5bSAlex Elder 
1139b9434c5bSAlex Elder 	rbd_assert(end > offset);
1140b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1141b9434c5bSAlex Elder 	while (offset < end) {
1142b9434c5bSAlex Elder 		size_t page_offset;
1143b9434c5bSAlex Elder 		size_t length;
1144b9434c5bSAlex Elder 		unsigned long flags;
1145b9434c5bSAlex Elder 		void *kaddr;
1146b9434c5bSAlex Elder 
1147491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1148491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1149b9434c5bSAlex Elder 		local_irq_save(flags);
1150b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1151b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1152e2156054SAlex Elder 		flush_dcache_page(*page);
1153b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1154b9434c5bSAlex Elder 		local_irq_restore(flags);
1155b9434c5bSAlex Elder 
1156b9434c5bSAlex Elder 		offset += length;
1157b9434c5bSAlex Elder 		page++;
1158b9434c5bSAlex Elder 	}
1159b9434c5bSAlex Elder }
1160b9434c5bSAlex Elder 
1161b9434c5bSAlex Elder /*
1162f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1163f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1164602adf40SYehuda Sadeh  */
1165f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1166f7760dadSAlex Elder 					unsigned int offset,
1167f7760dadSAlex Elder 					unsigned int len,
1168f7760dadSAlex Elder 					gfp_t gfpmask)
1169602adf40SYehuda Sadeh {
1170f7760dadSAlex Elder 	struct bio_vec *bv;
1171f7760dadSAlex Elder 	unsigned int resid;
1172f7760dadSAlex Elder 	unsigned short idx;
1173f7760dadSAlex Elder 	unsigned int voff;
1174f7760dadSAlex Elder 	unsigned short end_idx;
1175f7760dadSAlex Elder 	unsigned short vcnt;
1176f7760dadSAlex Elder 	struct bio *bio;
1177602adf40SYehuda Sadeh 
1178f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1179f7760dadSAlex Elder 
1180f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1181f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1182f7760dadSAlex Elder 
1183f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1184f7760dadSAlex Elder 		return NULL;
1185f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1186f7760dadSAlex Elder 		return NULL;
1187f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1188f7760dadSAlex Elder 		return NULL;
1189f7760dadSAlex Elder 
1190f7760dadSAlex Elder 	/* Find first affected segment... */
1191f7760dadSAlex Elder 
1192f7760dadSAlex Elder 	resid = offset;
1193d74c6d51SKent Overstreet 	bio_for_each_segment(bv, bio_src, idx) {
1194f7760dadSAlex Elder 		if (resid < bv->bv_len)
1195f7760dadSAlex Elder 			break;
1196f7760dadSAlex Elder 		resid -= bv->bv_len;
1197602adf40SYehuda Sadeh 	}
1198f7760dadSAlex Elder 	voff = resid;
1199602adf40SYehuda Sadeh 
1200f7760dadSAlex Elder 	/* ...and the last affected segment */
1201542582fcSAlex Elder 
1202f7760dadSAlex Elder 	resid += len;
1203f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1204f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1205f7760dadSAlex Elder 			break;
1206f7760dadSAlex Elder 		resid -= bv->bv_len;
1207f7760dadSAlex Elder 	}
1208f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1209602adf40SYehuda Sadeh 
1210f7760dadSAlex Elder 	/* Build the clone */
1211f7760dadSAlex Elder 
1212f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1213f7760dadSAlex Elder 	if (!bio)
1214f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1215f7760dadSAlex Elder 
1216f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1217f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1218f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1219f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1220602adf40SYehuda Sadeh 
1221602adf40SYehuda Sadeh 	/*
1222f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1223f7760dadSAlex Elder 	 * and last (or only) entries.
1224602adf40SYehuda Sadeh 	 */
1225f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1226f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1227f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1228f7760dadSAlex Elder 	if (vcnt > 1) {
1229f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1230f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1231602adf40SYehuda Sadeh 	} else {
1232f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1233602adf40SYehuda Sadeh 	}
1234602adf40SYehuda Sadeh 
1235f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1236f7760dadSAlex Elder 	bio->bi_size = len;
1237f7760dadSAlex Elder 	bio->bi_idx = 0;
1238602adf40SYehuda Sadeh 
1239f7760dadSAlex Elder 	return bio;
1240602adf40SYehuda Sadeh }
1241602adf40SYehuda Sadeh 
1242f7760dadSAlex Elder /*
1243f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1244f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1245f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1246f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1247f7760dadSAlex Elder  *
1248f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1249f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1250f7760dadSAlex Elder  * the start of data to be cloned is located.
1251f7760dadSAlex Elder  *
1252f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1253f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1254f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1255f7760dadSAlex Elder  */
1256f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1257f7760dadSAlex Elder 					unsigned int *offset,
1258f7760dadSAlex Elder 					unsigned int len,
1259f7760dadSAlex Elder 					gfp_t gfpmask)
1260f7760dadSAlex Elder {
1261f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1262f7760dadSAlex Elder 	unsigned int off = *offset;
1263f7760dadSAlex Elder 	struct bio *chain = NULL;
1264f7760dadSAlex Elder 	struct bio **end;
1265602adf40SYehuda Sadeh 
1266f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1267602adf40SYehuda Sadeh 
1268f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1269f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1270602adf40SYehuda Sadeh 
1271f7760dadSAlex Elder 	end = &chain;
1272f7760dadSAlex Elder 	while (len) {
1273f7760dadSAlex Elder 		unsigned int bi_size;
1274f7760dadSAlex Elder 		struct bio *bio;
1275f7760dadSAlex Elder 
1276f5400b7aSAlex Elder 		if (!bi) {
1277f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1278f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1279f5400b7aSAlex Elder 		}
1280f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1281f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1282f7760dadSAlex Elder 		if (!bio)
1283f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1284f7760dadSAlex Elder 
1285f7760dadSAlex Elder 		*end = bio;
1286f7760dadSAlex Elder 		end = &bio->bi_next;
1287f7760dadSAlex Elder 
1288f7760dadSAlex Elder 		off += bi_size;
1289f7760dadSAlex Elder 		if (off == bi->bi_size) {
1290f7760dadSAlex Elder 			bi = bi->bi_next;
1291f7760dadSAlex Elder 			off = 0;
1292f7760dadSAlex Elder 		}
1293f7760dadSAlex Elder 		len -= bi_size;
1294f7760dadSAlex Elder 	}
1295f7760dadSAlex Elder 	*bio_src = bi;
1296f7760dadSAlex Elder 	*offset = off;
1297f7760dadSAlex Elder 
1298f7760dadSAlex Elder 	return chain;
1299f7760dadSAlex Elder out_err:
1300f7760dadSAlex Elder 	bio_chain_put(chain);
1301f7760dadSAlex Elder 
1302602adf40SYehuda Sadeh 	return NULL;
1303602adf40SYehuda Sadeh }
1304602adf40SYehuda Sadeh 
1305926f9b3fSAlex Elder /*
1306926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1307926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1308926f9b3fSAlex Elder  * again.
1309926f9b3fSAlex Elder  */
13106365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13116365d33aSAlex Elder {
13126365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13136365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13146365d33aSAlex Elder 
131557acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13166365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
13176365d33aSAlex Elder 			obj_request);
13186365d33aSAlex Elder 	}
13196365d33aSAlex Elder }
13206365d33aSAlex Elder 
13216365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
13226365d33aSAlex Elder {
13236365d33aSAlex Elder 	smp_mb();
13246365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
13256365d33aSAlex Elder }
13266365d33aSAlex Elder 
132757acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
132857acbaa7SAlex Elder {
132957acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
133057acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
133157acbaa7SAlex Elder 
133257acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
133357acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
133457acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
133557acbaa7SAlex Elder 			obj_request);
133657acbaa7SAlex Elder 	}
133757acbaa7SAlex Elder }
133857acbaa7SAlex Elder 
133957acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
134057acbaa7SAlex Elder {
134157acbaa7SAlex Elder 	smp_mb();
134257acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
134357acbaa7SAlex Elder }
134457acbaa7SAlex Elder 
13455679c59fSAlex Elder /*
13465679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
13475679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
13485679c59fSAlex Elder  *
13495679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
13505679c59fSAlex Elder  * away again.  It's possible that the response from two existence
13515679c59fSAlex Elder  * checks are separated by the creation of the target object, and
13525679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
13535679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
13545679c59fSAlex Elder  */
13555679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
13565679c59fSAlex Elder 				bool exists)
13575679c59fSAlex Elder {
13585679c59fSAlex Elder 	if (exists)
13595679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
13605679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
13615679c59fSAlex Elder 	smp_mb();
13625679c59fSAlex Elder }
13635679c59fSAlex Elder 
13645679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
13655679c59fSAlex Elder {
13665679c59fSAlex Elder 	smp_mb();
13675679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
13685679c59fSAlex Elder }
13695679c59fSAlex Elder 
13705679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
13715679c59fSAlex Elder {
13725679c59fSAlex Elder 	smp_mb();
13735679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
13745679c59fSAlex Elder }
13755679c59fSAlex Elder 
1376bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1377bf0d5f50SAlex Elder {
137837206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
137937206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1380bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1381bf0d5f50SAlex Elder }
1382bf0d5f50SAlex Elder 
1383bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1384bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1385bf0d5f50SAlex Elder {
1386bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
138737206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
138837206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1389bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1390bf0d5f50SAlex Elder }
1391bf0d5f50SAlex Elder 
1392e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1393e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1394bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1395bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1396bf0d5f50SAlex Elder {
1397bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
139837206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
139937206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1400e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1401e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1402e93f3152SAlex Elder 	else
1403bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1404bf0d5f50SAlex Elder }
1405bf0d5f50SAlex Elder 
1406bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1407bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1408bf0d5f50SAlex Elder {
140925dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
141025dcf954SAlex Elder 
1411b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1412bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
141325dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
14146365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
14156365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1416bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
141725dcf954SAlex Elder 	img_request->obj_request_count++;
141825dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
141937206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
142037206ee5SAlex Elder 		obj_request->which);
1421bf0d5f50SAlex Elder }
1422bf0d5f50SAlex Elder 
1423bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1424bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1425bf0d5f50SAlex Elder {
1426bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
142725dcf954SAlex Elder 
142837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
142937206ee5SAlex Elder 		obj_request->which);
1430bf0d5f50SAlex Elder 	list_del(&obj_request->links);
143125dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
143225dcf954SAlex Elder 	img_request->obj_request_count--;
143325dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
143425dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
14356365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1436bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1437bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
143825dcf954SAlex Elder 	obj_request->callback = NULL;
1439bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1440bf0d5f50SAlex Elder }
1441bf0d5f50SAlex Elder 
1442bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1443bf0d5f50SAlex Elder {
1444bf0d5f50SAlex Elder 	switch (type) {
14459969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1446bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1447788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1448bf0d5f50SAlex Elder 		return true;
1449bf0d5f50SAlex Elder 	default:
1450bf0d5f50SAlex Elder 		return false;
1451bf0d5f50SAlex Elder 	}
1452bf0d5f50SAlex Elder }
1453bf0d5f50SAlex Elder 
1454bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1455bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1456bf0d5f50SAlex Elder {
145737206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
145837206ee5SAlex Elder 
1459bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1460bf0d5f50SAlex Elder }
1461bf0d5f50SAlex Elder 
1462bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1463bf0d5f50SAlex Elder {
146455f27e09SAlex Elder 
146537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
146655f27e09SAlex Elder 
146755f27e09SAlex Elder 	/*
146855f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
146955f27e09SAlex Elder 	 * count for the image request.  We could instead use
147055f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
147155f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
147255f27e09SAlex Elder 	 */
147355f27e09SAlex Elder 	if (!img_request->result) {
147455f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
147555f27e09SAlex Elder 		u64 xferred = 0;
147655f27e09SAlex Elder 
147755f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
147855f27e09SAlex Elder 			xferred += obj_request->xferred;
147955f27e09SAlex Elder 		img_request->xferred = xferred;
148055f27e09SAlex Elder 	}
148155f27e09SAlex Elder 
1482bf0d5f50SAlex Elder 	if (img_request->callback)
1483bf0d5f50SAlex Elder 		img_request->callback(img_request);
1484bf0d5f50SAlex Elder 	else
1485bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1486bf0d5f50SAlex Elder }
1487bf0d5f50SAlex Elder 
1488788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1489788e2df3SAlex Elder 
1490788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1491788e2df3SAlex Elder {
149237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
149337206ee5SAlex Elder 
1494788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1495788e2df3SAlex Elder }
1496788e2df3SAlex Elder 
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time.
 * The write flag never changes thereafter; the child and layered
 * flags can be cleared again with img_request_child_clear() and
 * img_request_layered_clear().
 */
15020c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
15030c425248SAlex Elder {
15040c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
15050c425248SAlex Elder 	smp_mb();
15060c425248SAlex Elder }
15070c425248SAlex Elder 
15080c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
15090c425248SAlex Elder {
15100c425248SAlex Elder 	smp_mb();
15110c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
15120c425248SAlex Elder }
15130c425248SAlex Elder 
15149849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
15159849e986SAlex Elder {
15169849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
15179849e986SAlex Elder 	smp_mb();
15189849e986SAlex Elder }
15199849e986SAlex Elder 
1520e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1521e93f3152SAlex Elder {
1522e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1523e93f3152SAlex Elder 	smp_mb();
1524e93f3152SAlex Elder }
1525e93f3152SAlex Elder 
15269849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
15279849e986SAlex Elder {
15289849e986SAlex Elder 	smp_mb();
15299849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
15309849e986SAlex Elder }
15319849e986SAlex Elder 
1532d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1533d0b2e944SAlex Elder {
1534d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1535d0b2e944SAlex Elder 	smp_mb();
1536d0b2e944SAlex Elder }
1537d0b2e944SAlex Elder 
1538a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1539a2acd00eSAlex Elder {
1540a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1541a2acd00eSAlex Elder 	smp_mb();
1542a2acd00eSAlex Elder }
1543a2acd00eSAlex Elder 
1544d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1545d0b2e944SAlex Elder {
1546d0b2e944SAlex Elder 	smp_mb();
1547d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1548d0b2e944SAlex Elder }
1549d0b2e944SAlex Elder 
15506e2a4505SAlex Elder static void
15516e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
15526e2a4505SAlex Elder {
1553b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1554b9434c5bSAlex Elder 	u64 length = obj_request->length;
1555b9434c5bSAlex Elder 
15566e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15576e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1558b9434c5bSAlex Elder 		xferred, length);
15596e2a4505SAlex Elder 	/*
15606e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
15616e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
15626e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
15636e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
15646e2a4505SAlex Elder 	 * was satisfied.
15656e2a4505SAlex Elder 	 */
1566b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
15676e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1568b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
15696e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1570b9434c5bSAlex Elder 		else
1571b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
15726e2a4505SAlex Elder 		obj_request->result = 0;
1573b9434c5bSAlex Elder 		obj_request->xferred = length;
1574b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1575b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1576b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1577b9434c5bSAlex Elder 		else
1578b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1579b9434c5bSAlex Elder 		obj_request->xferred = length;
15806e2a4505SAlex Elder 	}
15816e2a4505SAlex Elder 	obj_request_done_set(obj_request);
15826e2a4505SAlex Elder }
15836e2a4505SAlex Elder 
1584bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1585bf0d5f50SAlex Elder {
158637206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
158737206ee5SAlex Elder 		obj_request->callback);
1588bf0d5f50SAlex Elder 	if (obj_request->callback)
1589bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1590788e2df3SAlex Elder 	else
1591788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1592bf0d5f50SAlex Elder }
1593bf0d5f50SAlex Elder 
/*
 * Completion handling for osd ops that need no post-processing:
 * just mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
159939bf2c5dSAlex Elder 
1600c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1601bf0d5f50SAlex Elder {
160257acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1603a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
160457acbaa7SAlex Elder 	bool layered = false;
160557acbaa7SAlex Elder 
160657acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
160757acbaa7SAlex Elder 		img_request = obj_request->img_request;
160857acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1609a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
161057acbaa7SAlex Elder 	}
16118b3e1a56SAlex Elder 
16128b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16138b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
16148b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1615a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1616a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
16178b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
16188b3e1a56SAlex Elder 	else if (img_request)
16196e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
16206e2a4505SAlex Elder 	else
162107741308SAlex Elder 		obj_request_done_set(obj_request);
1622bf0d5f50SAlex Elder }
1623bf0d5f50SAlex Elder 
1624c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1625bf0d5f50SAlex Elder {
16261b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
16271b83bef2SSage Weil 		obj_request->result, obj_request->length);
16281b83bef2SSage Weil 	/*
16298b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
16308b3e1a56SAlex Elder 	 * it to our originally-requested length.
16311b83bef2SSage Weil 	 */
16321b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
163307741308SAlex Elder 	obj_request_done_set(obj_request);
1634bf0d5f50SAlex Elder }
1635bf0d5f50SAlex Elder 
/*
 * Completion handling for an osd stat.  For a simple stat call
 * there's nothing to do beyond marking the request done.  We'll do
 * more if this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1645fbfab539SAlex Elder 
1646bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1647bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1648bf0d5f50SAlex Elder {
1649bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1650bf0d5f50SAlex Elder 	u16 opcode;
1651bf0d5f50SAlex Elder 
165237206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1653bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
165457acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
165557acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
165657acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
165757acbaa7SAlex Elder 	} else {
165857acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
165957acbaa7SAlex Elder 	}
1660bf0d5f50SAlex Elder 
16611b83bef2SSage Weil 	if (osd_req->r_result < 0)
16621b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1663bf0d5f50SAlex Elder 
16640eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1665bf0d5f50SAlex Elder 
1666c47f9371SAlex Elder 	/*
1667c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1668c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1669c47f9371SAlex Elder 	 */
16701b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1671c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
167279528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1673bf0d5f50SAlex Elder 	switch (opcode) {
1674bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1675c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1676bf0d5f50SAlex Elder 		break;
1677bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1678c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1679bf0d5f50SAlex Elder 		break;
1680fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1681c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1682fbfab539SAlex Elder 		break;
168336be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1684b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
16859969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1686c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
16879969ebc5SAlex Elder 		break;
1688bf0d5f50SAlex Elder 	default:
1689bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1690bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1691bf0d5f50SAlex Elder 		break;
1692bf0d5f50SAlex Elder 	}
1693bf0d5f50SAlex Elder 
169407741308SAlex Elder 	if (obj_request_done_test(obj_request))
1695bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1696bf0d5f50SAlex Elder }
1697bf0d5f50SAlex Elder 
16989d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1699430c28c3SAlex Elder {
1700430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17018c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17029d4df01fSAlex Elder 	u64 snap_id;
1703430c28c3SAlex Elder 
17048c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1705430c28c3SAlex Elder 
17069d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
17078c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17089d4df01fSAlex Elder 			NULL, snap_id, NULL);
17099d4df01fSAlex Elder }
17109d4df01fSAlex Elder 
17119d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
17129d4df01fSAlex Elder {
17139d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17149d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17159d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
17169d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
17179d4df01fSAlex Elder 
17189d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
17199d4df01fSAlex Elder 
17209d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
17219d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17229d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1723430c28c3SAlex Elder }
1724430c28c3SAlex Elder 
1725bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1726bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1727bf0d5f50SAlex Elder 					bool write_request,
1728430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1729bf0d5f50SAlex Elder {
1730bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1731bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1732bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1733bf0d5f50SAlex Elder 
17346365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
17356365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
17366365d33aSAlex Elder 
17370c425248SAlex Elder 		rbd_assert(write_request ==
17380c425248SAlex Elder 				img_request_write_test(img_request));
17390c425248SAlex Elder 		if (write_request)
1740bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1741bf0d5f50SAlex Elder 	}
1742bf0d5f50SAlex Elder 
1743bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1744bf0d5f50SAlex Elder 
1745bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1746bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1747bf0d5f50SAlex Elder 	if (!osd_req)
1748bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1749bf0d5f50SAlex Elder 
1750430c28c3SAlex Elder 	if (write_request)
1751bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1752430c28c3SAlex Elder 	else
1753bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1754bf0d5f50SAlex Elder 
1755bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1756bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1757bf0d5f50SAlex Elder 
1758bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1759bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1760bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1761bf0d5f50SAlex Elder 
1762bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1763bf0d5f50SAlex Elder 
1764bf0d5f50SAlex Elder 	return osd_req;
1765bf0d5f50SAlex Elder }
1766bf0d5f50SAlex Elder 
17670eefd470SAlex Elder /*
17680eefd470SAlex Elder  * Create a copyup osd request based on the information in the
17690eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
17700eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
17710eefd470SAlex Elder  */
17720eefd470SAlex Elder static struct ceph_osd_request *
17730eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
17740eefd470SAlex Elder {
17750eefd470SAlex Elder 	struct rbd_img_request *img_request;
17760eefd470SAlex Elder 	struct ceph_snap_context *snapc;
17770eefd470SAlex Elder 	struct rbd_device *rbd_dev;
17780eefd470SAlex Elder 	struct ceph_osd_client *osdc;
17790eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
17800eefd470SAlex Elder 
17810eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17820eefd470SAlex Elder 	img_request = obj_request->img_request;
17830eefd470SAlex Elder 	rbd_assert(img_request);
17840eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
17850eefd470SAlex Elder 
17860eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
17870eefd470SAlex Elder 
17880eefd470SAlex Elder 	snapc = img_request->snapc;
17890eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
17900eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
17910eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
17920eefd470SAlex Elder 	if (!osd_req)
17930eefd470SAlex Elder 		return NULL;	/* ENOMEM */
17940eefd470SAlex Elder 
17950eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
17960eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
17970eefd470SAlex Elder 	osd_req->r_priv = obj_request;
17980eefd470SAlex Elder 
17990eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
18000eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
18010eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
18020eefd470SAlex Elder 
18030eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
18040eefd470SAlex Elder 
18050eefd470SAlex Elder 	return osd_req;
18060eefd470SAlex Elder }
18070eefd470SAlex Elder 
18080eefd470SAlex Elder 
/* Release an osd request by dropping a reference on it. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1813bf0d5f50SAlex Elder 
1814bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1815bf0d5f50SAlex Elder 
1816bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1817bf0d5f50SAlex Elder 						u64 offset, u64 length,
1818bf0d5f50SAlex Elder 						enum obj_request_type type)
1819bf0d5f50SAlex Elder {
1820bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1821bf0d5f50SAlex Elder 	size_t size;
1822bf0d5f50SAlex Elder 	char *name;
1823bf0d5f50SAlex Elder 
1824bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1825bf0d5f50SAlex Elder 
1826bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1827f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1828f907ad55SAlex Elder 	if (!name)
1829bf0d5f50SAlex Elder 		return NULL;
1830bf0d5f50SAlex Elder 
1831868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1832f907ad55SAlex Elder 	if (!obj_request) {
1833f907ad55SAlex Elder 		kfree(name);
1834f907ad55SAlex Elder 		return NULL;
1835f907ad55SAlex Elder 	}
1836f907ad55SAlex Elder 
1837bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1838bf0d5f50SAlex Elder 	obj_request->offset = offset;
1839bf0d5f50SAlex Elder 	obj_request->length = length;
1840926f9b3fSAlex Elder 	obj_request->flags = 0;
1841bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1842bf0d5f50SAlex Elder 	obj_request->type = type;
1843bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1844788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1845bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1846bf0d5f50SAlex Elder 
184737206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
184837206ee5SAlex Elder 		offset, length, (int)type, obj_request);
184937206ee5SAlex Elder 
1850bf0d5f50SAlex Elder 	return obj_request;
1851bf0d5f50SAlex Elder }
1852bf0d5f50SAlex Elder 
1853bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1854bf0d5f50SAlex Elder {
1855bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1856bf0d5f50SAlex Elder 
1857bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1858bf0d5f50SAlex Elder 
185937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
186037206ee5SAlex Elder 
1861bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1862bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1863bf0d5f50SAlex Elder 
1864bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1865bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1866bf0d5f50SAlex Elder 
1867bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1868bf0d5f50SAlex Elder 	switch (obj_request->type) {
18699969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
18709969ebc5SAlex Elder 		break;		/* Nothing to do */
1871bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1872bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1873bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1874bf0d5f50SAlex Elder 		break;
1875788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1876788e2df3SAlex Elder 		if (obj_request->pages)
1877788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1878788e2df3SAlex Elder 						obj_request->page_count);
1879788e2df3SAlex Elder 		break;
1880bf0d5f50SAlex Elder 	}
1881bf0d5f50SAlex Elder 
1882f907ad55SAlex Elder 	kfree(obj_request->object_name);
1883868311b1SAlex Elder 	obj_request->object_name = NULL;
1884868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1885bf0d5f50SAlex Elder }
1886bf0d5f50SAlex Elder 
1887fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1888fb65d228SAlex Elder 
1889fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1890fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1891fb65d228SAlex Elder {
1892fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1893fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1894fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1895fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1896fb65d228SAlex Elder }
1897fb65d228SAlex Elder 
1898bf0d5f50SAlex Elder /*
1899a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1900a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1901a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1902a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1903a2acd00eSAlex Elder  */
1904a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1905a2acd00eSAlex Elder {
1906a2acd00eSAlex Elder 	int counter;
1907a2acd00eSAlex Elder 
1908a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1909a2acd00eSAlex Elder 		return;
1910a2acd00eSAlex Elder 
1911a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1912a2acd00eSAlex Elder 	if (counter > 0)
1913a2acd00eSAlex Elder 		return;
1914a2acd00eSAlex Elder 
1915a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1916a2acd00eSAlex Elder 
1917a2acd00eSAlex Elder 	if (!counter)
1918a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1919a2acd00eSAlex Elder 	else
1920a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference underflow\n");
1921a2acd00eSAlex Elder }
1922a2acd00eSAlex Elder 
1923a2acd00eSAlex Elder /*
1924a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1925a2acd00eSAlex Elder  * parent.
1926a2acd00eSAlex Elder  *
1927392a9dadSAlex Elder  * We must get the reference before checking for the overlap to
1928392a9dadSAlex Elder  * coordinate properly with zeroing the parent overlap in
1929392a9dadSAlex Elder  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1930392a9dadSAlex Elder  * drop it again if there is no overlap.
1931392a9dadSAlex Elder  *
1932a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1933a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1934a2acd00eSAlex Elder  * false otherwise.
1935a2acd00eSAlex Elder  */
1936a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1937a2acd00eSAlex Elder {
1938a2acd00eSAlex Elder 	int counter;
1939a2acd00eSAlex Elder 
1940a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1941a2acd00eSAlex Elder 		return false;
1942a2acd00eSAlex Elder 
1943a2acd00eSAlex Elder 	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1944a2acd00eSAlex Elder 	if (counter > 0 && rbd_dev->parent_overlap)
1945a2acd00eSAlex Elder 		return true;
1946a2acd00eSAlex Elder 
1947a2acd00eSAlex Elder 	/* Image was flattened, but parent is not yet torn down */
1948a2acd00eSAlex Elder 
1949a2acd00eSAlex Elder 	if (counter < 0)
1950a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference overflow\n");
1951a2acd00eSAlex Elder 
1952a2acd00eSAlex Elder 	return false;
1953a2acd00eSAlex Elder }
1954a2acd00eSAlex Elder 
1955bf0d5f50SAlex Elder /*
1956bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1957bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1958bf0d5f50SAlex Elder  * (if there is one).
1959bf0d5f50SAlex Elder  */
1960cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1961cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1962bf0d5f50SAlex Elder 					u64 offset, u64 length,
1963e93f3152SAlex Elder 					bool write_request)
1964bf0d5f50SAlex Elder {
1965bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1966bf0d5f50SAlex Elder 
19671c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1968bf0d5f50SAlex Elder 	if (!img_request)
1969bf0d5f50SAlex Elder 		return NULL;
1970bf0d5f50SAlex Elder 
1971bf0d5f50SAlex Elder 	if (write_request) {
1972bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1973812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1974bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1975bf0d5f50SAlex Elder 	}
1976bf0d5f50SAlex Elder 
1977bf0d5f50SAlex Elder 	img_request->rq = NULL;
1978bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1979bf0d5f50SAlex Elder 	img_request->offset = offset;
1980bf0d5f50SAlex Elder 	img_request->length = length;
19810c425248SAlex Elder 	img_request->flags = 0;
19820c425248SAlex Elder 	if (write_request) {
19830c425248SAlex Elder 		img_request_write_set(img_request);
1984468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
19850c425248SAlex Elder 	} else {
1986bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
19870c425248SAlex Elder 	}
1988a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1989d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1990bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1991bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1992bf0d5f50SAlex Elder 	img_request->callback = NULL;
1993a5a337d4SAlex Elder 	img_request->result = 0;
1994bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1995bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1996bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1997bf0d5f50SAlex Elder 
199837206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
199937206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
200037206ee5SAlex Elder 		img_request);
200137206ee5SAlex Elder 
2002bf0d5f50SAlex Elder 	return img_request;
2003bf0d5f50SAlex Elder }
2004bf0d5f50SAlex Elder 
2005bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2006bf0d5f50SAlex Elder {
2007bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2008bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2009bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2010bf0d5f50SAlex Elder 
2011bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2012bf0d5f50SAlex Elder 
201337206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
201437206ee5SAlex Elder 
2015bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2016bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
201725dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2018bf0d5f50SAlex Elder 
2019a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2020a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2021a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2022a2acd00eSAlex Elder 	}
2023a2acd00eSAlex Elder 
20240c425248SAlex Elder 	if (img_request_write_test(img_request))
2025812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2026bf0d5f50SAlex Elder 
20271c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2028bf0d5f50SAlex Elder }
2029bf0d5f50SAlex Elder 
2030e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2031e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2032e93f3152SAlex Elder 					u64 img_offset, u64 length)
2033e93f3152SAlex Elder {
2034e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2035e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2036e93f3152SAlex Elder 
2037e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2038e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2039e93f3152SAlex Elder 
2040e93f3152SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
2041e93f3152SAlex Elder 						img_offset, length, false);
2042e93f3152SAlex Elder 	if (!parent_request)
2043e93f3152SAlex Elder 		return NULL;
2044e93f3152SAlex Elder 
2045e93f3152SAlex Elder 	img_request_child_set(parent_request);
2046e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2047e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2048e93f3152SAlex Elder 
2049e93f3152SAlex Elder 	return parent_request;
2050e93f3152SAlex Elder }
2051e93f3152SAlex Elder 
2052e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2053e93f3152SAlex Elder {
2054e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2055e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2056e93f3152SAlex Elder 
2057e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2058e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2059e93f3152SAlex Elder 
2060e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2061e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2062e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2063e93f3152SAlex Elder 
2064e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2065e93f3152SAlex Elder }
2066e93f3152SAlex Elder 
20671217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
20681217857fSAlex Elder {
20696365d33aSAlex Elder 	struct rbd_img_request *img_request;
20701217857fSAlex Elder 	unsigned int xferred;
20711217857fSAlex Elder 	int result;
20728b3e1a56SAlex Elder 	bool more;
20731217857fSAlex Elder 
20746365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20756365d33aSAlex Elder 	img_request = obj_request->img_request;
20766365d33aSAlex Elder 
20771217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
20781217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
20791217857fSAlex Elder 	result = obj_request->result;
20801217857fSAlex Elder 	if (result) {
20811217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
20821217857fSAlex Elder 
20831217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
20841217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
20851217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
20861217857fSAlex Elder 			obj_request->offset);
20871217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
20881217857fSAlex Elder 			result, xferred);
20891217857fSAlex Elder 		if (!img_request->result)
20901217857fSAlex Elder 			img_request->result = result;
20911217857fSAlex Elder 	}
20921217857fSAlex Elder 
2093f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2094f1a4739fSAlex Elder 
2095f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2096f1a4739fSAlex Elder 		obj_request->pages = NULL;
2097f1a4739fSAlex Elder 		obj_request->page_count = 0;
2098f1a4739fSAlex Elder 	}
2099f1a4739fSAlex Elder 
21008b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
21018b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
21028b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
21038b3e1a56SAlex Elder 	} else {
21048b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
21058b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
21068b3e1a56SAlex Elder 	}
21078b3e1a56SAlex Elder 
21088b3e1a56SAlex Elder 	return more;
21091217857fSAlex Elder }
21101217857fSAlex Elder 
21112169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
21122169238dSAlex Elder {
21132169238dSAlex Elder 	struct rbd_img_request *img_request;
21142169238dSAlex Elder 	u32 which = obj_request->which;
21152169238dSAlex Elder 	bool more = true;
21162169238dSAlex Elder 
21176365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21182169238dSAlex Elder 	img_request = obj_request->img_request;
21192169238dSAlex Elder 
21202169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
21212169238dSAlex Elder 	rbd_assert(img_request != NULL);
21222169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
21232169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
21242169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
21252169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
21262169238dSAlex Elder 
21272169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
21282169238dSAlex Elder 	if (which != img_request->next_completion)
21292169238dSAlex Elder 		goto out;
21302169238dSAlex Elder 
21312169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
21322169238dSAlex Elder 		rbd_assert(more);
21332169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
21342169238dSAlex Elder 
21352169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
21362169238dSAlex Elder 			break;
21371217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
21382169238dSAlex Elder 		which++;
21392169238dSAlex Elder 	}
21402169238dSAlex Elder 
21412169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
21422169238dSAlex Elder 	img_request->next_completion = which;
21432169238dSAlex Elder out:
21442169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
21452169238dSAlex Elder 
21462169238dSAlex Elder 	if (!more)
21472169238dSAlex Elder 		rbd_img_request_complete(img_request);
21482169238dSAlex Elder }
21492169238dSAlex Elder 
2150f1a4739fSAlex Elder /*
2151f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2152f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2153f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2154f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2155f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2156f1a4739fSAlex Elder  * all data described by the image request.
2157f1a4739fSAlex Elder  */
2158f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2159f1a4739fSAlex Elder 					enum obj_request_type type,
2160f1a4739fSAlex Elder 					void *data_desc)
2161bf0d5f50SAlex Elder {
2162bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2163bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2164bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
21650c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2166a158073cSJingoo Han 	struct bio *bio_list = NULL;
2167f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2168a158073cSJingoo Han 	struct page **pages = NULL;
21697da22d29SAlex Elder 	u64 img_offset;
2170bf0d5f50SAlex Elder 	u64 resid;
2171bf0d5f50SAlex Elder 	u16 opcode;
2172bf0d5f50SAlex Elder 
2173f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2174f1a4739fSAlex Elder 		(int)type, data_desc);
217537206ee5SAlex Elder 
2176430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
21777da22d29SAlex Elder 	img_offset = img_request->offset;
2178bf0d5f50SAlex Elder 	resid = img_request->length;
21794dda41d3SAlex Elder 	rbd_assert(resid > 0);
2180f1a4739fSAlex Elder 
2181f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2182f1a4739fSAlex Elder 		bio_list = data_desc;
2183f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2184f1a4739fSAlex Elder 	} else {
2185f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2186f1a4739fSAlex Elder 		pages = data_desc;
2187f1a4739fSAlex Elder 	}
2188f1a4739fSAlex Elder 
2189bf0d5f50SAlex Elder 	while (resid) {
21902fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2191bf0d5f50SAlex Elder 		const char *object_name;
2192bf0d5f50SAlex Elder 		u64 offset;
2193bf0d5f50SAlex Elder 		u64 length;
2194bf0d5f50SAlex Elder 
21957da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2196bf0d5f50SAlex Elder 		if (!object_name)
2197bf0d5f50SAlex Elder 			goto out_unwind;
21987da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
21997da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2200bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2201f1a4739fSAlex Elder 						offset, length, type);
220278c2a44aSAlex Elder 		/* object request has its own copy of the object name */
220378c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2204bf0d5f50SAlex Elder 		if (!obj_request)
2205bf0d5f50SAlex Elder 			goto out_unwind;
2206bf0d5f50SAlex Elder 
2207f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2208f1a4739fSAlex Elder 			unsigned int clone_size;
2209f1a4739fSAlex Elder 
2210bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2211bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2212f1a4739fSAlex Elder 			obj_request->bio_list =
2213f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2214f1a4739fSAlex Elder 								&bio_offset,
2215f1a4739fSAlex Elder 								clone_size,
2216bf0d5f50SAlex Elder 								GFP_ATOMIC);
2217bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
2218bf0d5f50SAlex Elder 				goto out_partial;
2219f1a4739fSAlex Elder 		} else {
2220f1a4739fSAlex Elder 			unsigned int page_count;
2221f1a4739fSAlex Elder 
2222f1a4739fSAlex Elder 			obj_request->pages = pages;
2223f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2224f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2225f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2226f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2227f1a4739fSAlex Elder 			pages += page_count;
2228f1a4739fSAlex Elder 		}
2229bf0d5f50SAlex Elder 
22302fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
22312fa12320SAlex Elder 						obj_request);
22322fa12320SAlex Elder 		if (!osd_req)
2233bf0d5f50SAlex Elder 			goto out_partial;
22342fa12320SAlex Elder 		obj_request->osd_req = osd_req;
22352169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2236430c28c3SAlex Elder 
22372fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
22382fa12320SAlex Elder 						0, 0);
2239f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
2240406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
2241f1a4739fSAlex Elder 					obj_request->bio_list, length);
2242f1a4739fSAlex Elder 		else
2243f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
2244f1a4739fSAlex Elder 					obj_request->pages, length,
2245f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
22469d4df01fSAlex Elder 
2247d2d1f17aSJosh Durgin 		/*
2248d2d1f17aSJosh Durgin 		 * set obj_request->img_request before formatting
2249d2d1f17aSJosh Durgin 		 * the osd_request so that it gets the right snapc
2250d2d1f17aSJosh Durgin 		 */
2251d2d1f17aSJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
22529d4df01fSAlex Elder 		if (write_request)
22539d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
22549d4df01fSAlex Elder 		else
22559d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2256430c28c3SAlex Elder 
22577da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2258bf0d5f50SAlex Elder 
22597da22d29SAlex Elder 		img_offset += length;
2260bf0d5f50SAlex Elder 		resid -= length;
2261bf0d5f50SAlex Elder 	}
2262bf0d5f50SAlex Elder 
2263bf0d5f50SAlex Elder 	return 0;
2264bf0d5f50SAlex Elder 
2265bf0d5f50SAlex Elder out_partial:
2266bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
2267bf0d5f50SAlex Elder out_unwind:
2268bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2269bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2270bf0d5f50SAlex Elder 
2271bf0d5f50SAlex Elder 	return -ENOMEM;
2272bf0d5f50SAlex Elder }
2273bf0d5f50SAlex Elder 
22743d7efd18SAlex Elder static void
22750eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
22760eefd470SAlex Elder {
22770eefd470SAlex Elder 	struct rbd_img_request *img_request;
22780eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2279ebda6408SAlex Elder 	struct page **pages;
22800eefd470SAlex Elder 	u32 page_count;
22810eefd470SAlex Elder 
22820eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
22830eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22840eefd470SAlex Elder 	img_request = obj_request->img_request;
22850eefd470SAlex Elder 	rbd_assert(img_request);
22860eefd470SAlex Elder 
22870eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
22880eefd470SAlex Elder 	rbd_assert(rbd_dev);
22890eefd470SAlex Elder 
2290ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2291ebda6408SAlex Elder 	rbd_assert(pages != NULL);
22920eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2293ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2294ebda6408SAlex Elder 	rbd_assert(page_count);
2295ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2296ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
22970eefd470SAlex Elder 
22980eefd470SAlex Elder 	/*
22990eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
23000eefd470SAlex Elder 	 * original write request.  There is no such thing as a
23010eefd470SAlex Elder 	 * successful short write, so if the request was successful
23020eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
23030eefd470SAlex Elder 	 */
23040eefd470SAlex Elder 	if (!obj_request->result)
23050eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
23060eefd470SAlex Elder 
23070eefd470SAlex Elder 	/* Finish up with the normal image object callback */
23080eefd470SAlex Elder 
23090eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
23100eefd470SAlex Elder }
23110eefd470SAlex Elder 
23120eefd470SAlex Elder static void
23133d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
23143d7efd18SAlex Elder {
23153d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
23160eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
23170eefd470SAlex Elder 	struct ceph_osd_client *osdc;
23180eefd470SAlex Elder 	struct rbd_device *rbd_dev;
23193d7efd18SAlex Elder 	struct page **pages;
2320ebda6408SAlex Elder 	u32 page_count;
2321bbea1c1aSAlex Elder 	int img_result;
2322ebda6408SAlex Elder 	u64 parent_length;
2323b91f09f1SAlex Elder 	u64 offset;
2324b91f09f1SAlex Elder 	u64 length;
23253d7efd18SAlex Elder 
23263d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
23273d7efd18SAlex Elder 
23283d7efd18SAlex Elder 	/* First get what we need from the image request */
23293d7efd18SAlex Elder 
23303d7efd18SAlex Elder 	pages = img_request->copyup_pages;
23313d7efd18SAlex Elder 	rbd_assert(pages != NULL);
23323d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2333ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2334ebda6408SAlex Elder 	rbd_assert(page_count);
2335ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
23363d7efd18SAlex Elder 
23373d7efd18SAlex Elder 	orig_request = img_request->obj_request;
23383d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2339b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2340bbea1c1aSAlex Elder 	img_result = img_request->result;
2341ebda6408SAlex Elder 	parent_length = img_request->length;
2342ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
23433d7efd18SAlex Elder 	rbd_img_request_put(img_request);
23443d7efd18SAlex Elder 
234591c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
234691c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
23473d7efd18SAlex Elder 	rbd_assert(rbd_dev);
23483d7efd18SAlex Elder 
2349bbea1c1aSAlex Elder 	/*
2350bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2351bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2352bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2353bbea1c1aSAlex Elder 	 */
2354bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2355bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2356bbea1c1aSAlex Elder 
2357bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2358bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2359bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2360bbea1c1aSAlex Elder 		if (!img_result)
2361bbea1c1aSAlex Elder 			return;
2362bbea1c1aSAlex Elder 	}
2363bbea1c1aSAlex Elder 
2364bbea1c1aSAlex Elder 	if (img_result)
23650eefd470SAlex Elder 		goto out_err;
23663d7efd18SAlex Elder 
23678785b1d4SAlex Elder 	/*
23688785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
23698785b1d4SAlex Elder 	 * We need a new one that can hold the two ops in a copyup
23708785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
23718785b1d4SAlex Elder 	 * original request, and release the old one.
23728785b1d4SAlex Elder 	 */
2373bbea1c1aSAlex Elder 	img_result = -ENOMEM;
23740eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
23750eefd470SAlex Elder 	if (!osd_req)
23760eefd470SAlex Elder 		goto out_err;
23778785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
23780eefd470SAlex Elder 	orig_request->osd_req = osd_req;
23790eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2380ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
23813d7efd18SAlex Elder 
23820eefd470SAlex Elder 	/* Initialize the copyup op */
23830eefd470SAlex Elder 
23840eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2385ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
23860eefd470SAlex Elder 						false, false);
23870eefd470SAlex Elder 
23880eefd470SAlex Elder 	/* Then the original write request op */
23890eefd470SAlex Elder 
2390b91f09f1SAlex Elder 	offset = orig_request->offset;
2391b91f09f1SAlex Elder 	length = orig_request->length;
23920eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2393b91f09f1SAlex Elder 					offset, length, 0, 0);
2394b91f09f1SAlex Elder 	if (orig_request->type == OBJ_REQUEST_BIO)
2395b91f09f1SAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 1,
2396b91f09f1SAlex Elder 					orig_request->bio_list, length);
2397b91f09f1SAlex Elder 	else
2398b91f09f1SAlex Elder 		osd_req_op_extent_osd_data_pages(osd_req, 1,
2399b91f09f1SAlex Elder 					orig_request->pages, length,
2400b91f09f1SAlex Elder 					offset & ~PAGE_MASK, false, false);
24010eefd470SAlex Elder 
24020eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
24030eefd470SAlex Elder 
24040eefd470SAlex Elder 	/* All set, send it off. */
24050eefd470SAlex Elder 
24060eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
24070eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2408bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2409bbea1c1aSAlex Elder 	if (!img_result)
24100eefd470SAlex Elder 		return;
24110eefd470SAlex Elder out_err:
24120eefd470SAlex Elder 	/* Record the error code and complete the request */
24130eefd470SAlex Elder 
2414bbea1c1aSAlex Elder 	orig_request->result = img_result;
24150eefd470SAlex Elder 	orig_request->xferred = 0;
24163d7efd18SAlex Elder 	obj_request_done_set(orig_request);
24173d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
24183d7efd18SAlex Elder }
24193d7efd18SAlex Elder 
24203d7efd18SAlex Elder /*
24213d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
24223d7efd18SAlex Elder  * entire target of the given object request.  This is used for
24233d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
24243d7efd18SAlex Elder  * object request from the image request does not exist.
24253d7efd18SAlex Elder  *
24263d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
24273d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
24283d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
24293d7efd18SAlex Elder  * the original object request for the copyup operation.
24303d7efd18SAlex Elder  *
24313d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
24323d7efd18SAlex Elder  * object request and mark it done so it gets completed.
24333d7efd18SAlex Elder  */
24343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
24353d7efd18SAlex Elder {
24363d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
24373d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
24383d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
24393d7efd18SAlex Elder 	u64 img_offset;
24403d7efd18SAlex Elder 	u64 length;
24413d7efd18SAlex Elder 	struct page **pages = NULL;
24423d7efd18SAlex Elder 	u32 page_count;
24433d7efd18SAlex Elder 	int result;
24443d7efd18SAlex Elder 
24453d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2446b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
24473d7efd18SAlex Elder 
24483d7efd18SAlex Elder 	img_request = obj_request->img_request;
24493d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
24503d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
24513d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
24523d7efd18SAlex Elder 
24533d7efd18SAlex Elder 	/*
24543d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
24553d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
24563d7efd18SAlex Elder 	 */
24573d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
24583d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
24593d7efd18SAlex Elder 
24603d7efd18SAlex Elder 	/*
2461a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2462a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2463a9e8ba2cSAlex Elder 	 * necessary.
2464a9e8ba2cSAlex Elder 	 */
2465a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2466a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2467a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2468a9e8ba2cSAlex Elder 	}
2469a9e8ba2cSAlex Elder 
2470a9e8ba2cSAlex Elder 	/*
24713d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
24723d7efd18SAlex Elder 	 * from the parent.
24733d7efd18SAlex Elder 	 */
24743d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
24753d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
24763d7efd18SAlex Elder 	if (IS_ERR(pages)) {
24773d7efd18SAlex Elder 		result = PTR_ERR(pages);
24783d7efd18SAlex Elder 		pages = NULL;
24793d7efd18SAlex Elder 		goto out_err;
24803d7efd18SAlex Elder 	}
24813d7efd18SAlex Elder 
24823d7efd18SAlex Elder 	result = -ENOMEM;
2483e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2484e93f3152SAlex Elder 						img_offset, length);
24853d7efd18SAlex Elder 	if (!parent_request)
24863d7efd18SAlex Elder 		goto out_err;
24873d7efd18SAlex Elder 
24883d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
24893d7efd18SAlex Elder 	if (result)
24903d7efd18SAlex Elder 		goto out_err;
24913d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2492ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
24933d7efd18SAlex Elder 
24943d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
24953d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
24963d7efd18SAlex Elder 	if (!result)
24973d7efd18SAlex Elder 		return 0;
24983d7efd18SAlex Elder 
24993d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2500ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
25013d7efd18SAlex Elder 	parent_request->obj_request = NULL;
25023d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
25033d7efd18SAlex Elder out_err:
25043d7efd18SAlex Elder 	if (pages)
25053d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
25063d7efd18SAlex Elder 	if (parent_request)
25073d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
25083d7efd18SAlex Elder 	obj_request->result = result;
25093d7efd18SAlex Elder 	obj_request->xferred = 0;
25103d7efd18SAlex Elder 	obj_request_done_set(obj_request);
25113d7efd18SAlex Elder 
25123d7efd18SAlex Elder 	return result;
25133d7efd18SAlex Elder }
25143d7efd18SAlex Elder 
2515c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2516c5b5ef6cSAlex Elder {
2517c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2518638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2519c5b5ef6cSAlex Elder 	int result;
2520c5b5ef6cSAlex Elder 
2521c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2522c5b5ef6cSAlex Elder 
2523c5b5ef6cSAlex Elder 	/*
2524c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2525c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2526c5b5ef6cSAlex Elder 	 * we're done with the request.
2527c5b5ef6cSAlex Elder 	 */
2528c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2529c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2530912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2531c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2532c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2533c5b5ef6cSAlex Elder 
2534c5b5ef6cSAlex Elder 	result = obj_request->result;
2535c5b5ef6cSAlex Elder 	obj_request->result = 0;
2536c5b5ef6cSAlex Elder 
2537c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2538c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2539c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2540c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2541c5b5ef6cSAlex Elder 
2542638f5abeSAlex Elder 	/*
2543638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2544638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2545638f5abeSAlex Elder 	 * and re-submit the original write request.
2546638f5abeSAlex Elder 	 */
2547638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2548638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2549638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2550638f5abeSAlex Elder 
2551638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2552638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2553638f5abeSAlex Elder 		if (!result)
2554638f5abeSAlex Elder 			return;
2555638f5abeSAlex Elder 	}
2556c5b5ef6cSAlex Elder 
2557c5b5ef6cSAlex Elder 	/*
2558c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2559c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2560c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2561c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2562c5b5ef6cSAlex Elder 	 */
2563c5b5ef6cSAlex Elder 	if (!result) {
2564c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2565c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2566c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2567c5b5ef6cSAlex Elder 	} else if (result) {
2568c5b5ef6cSAlex Elder 		orig_request->result = result;
25693d7efd18SAlex Elder 		goto out;
2570c5b5ef6cSAlex Elder 	}
2571c5b5ef6cSAlex Elder 
2572c5b5ef6cSAlex Elder 	/*
2573c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2574c5b5ef6cSAlex Elder 	 * whether the target object exists.
2575c5b5ef6cSAlex Elder 	 */
2576b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
25773d7efd18SAlex Elder out:
2578c5b5ef6cSAlex Elder 	if (orig_request->result)
2579c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2580c5b5ef6cSAlex Elder }
2581c5b5ef6cSAlex Elder 
2582c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2583c5b5ef6cSAlex Elder {
2584c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2585c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2586c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2587c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2588c5b5ef6cSAlex Elder 	u32 page_count;
2589c5b5ef6cSAlex Elder 	size_t size;
2590c5b5ef6cSAlex Elder 	int ret;
2591c5b5ef6cSAlex Elder 
2592c5b5ef6cSAlex Elder 	/*
2593c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2594c5b5ef6cSAlex Elder 	 *     le64 length;
2595c5b5ef6cSAlex Elder 	 *     struct {
2596c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2597c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2598c5b5ef6cSAlex Elder 	 *     } mtime;
2599c5b5ef6cSAlex Elder 	 */
2600c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2601c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2602c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2603c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2604c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2605c5b5ef6cSAlex Elder 
2606c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2607c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2608c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2609c5b5ef6cSAlex Elder 	if (!stat_request)
2610c5b5ef6cSAlex Elder 		goto out;
2611c5b5ef6cSAlex Elder 
2612c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2613c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2614c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2615c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2616c5b5ef6cSAlex Elder 
2617c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2618c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2619c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2620c5b5ef6cSAlex Elder 						stat_request);
2621c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2622c5b5ef6cSAlex Elder 		goto out;
2623c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2624c5b5ef6cSAlex Elder 
2625c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2626c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2627c5b5ef6cSAlex Elder 					false, false);
26289d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2629c5b5ef6cSAlex Elder 
2630c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2631c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2632c5b5ef6cSAlex Elder out:
2633c5b5ef6cSAlex Elder 	if (ret)
2634c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2635c5b5ef6cSAlex Elder 
2636c5b5ef6cSAlex Elder 	return ret;
2637c5b5ef6cSAlex Elder }
2638c5b5ef6cSAlex Elder 
2639b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2640b454e36dSAlex Elder {
2641b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2642a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
26433d7efd18SAlex Elder 	bool known;
2644b454e36dSAlex Elder 
2645b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2646b454e36dSAlex Elder 
2647b454e36dSAlex Elder 	img_request = obj_request->img_request;
2648b454e36dSAlex Elder 	rbd_assert(img_request);
2649a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2650b454e36dSAlex Elder 
2651b454e36dSAlex Elder 	/*
2652a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2653a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2654a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2655a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2656a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2657a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2658a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2659a9e8ba2cSAlex Elder 	 * simple object request.
2660b454e36dSAlex Elder 	 */
2661b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2662b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2663a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
26643d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
26653d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2666b454e36dSAlex Elder 
2667b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2668b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2669b454e36dSAlex Elder 
2670b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2671b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2672b454e36dSAlex Elder 
2673b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2674b454e36dSAlex Elder 	}
2675b454e36dSAlex Elder 
2676b454e36dSAlex Elder 	/*
26773d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
26783d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
26793d7efd18SAlex Elder 	 * start by reading the data for the full target object from
26803d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2681b454e36dSAlex Elder 	 */
26823d7efd18SAlex Elder 	if (known)
26833d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
26843d7efd18SAlex Elder 
26853d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2686b454e36dSAlex Elder 
2687b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2688b454e36dSAlex Elder }
2689b454e36dSAlex Elder 
2690bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2691bf0d5f50SAlex Elder {
2692bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
269346faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2694bf0d5f50SAlex Elder 
269537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
269646faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2697bf0d5f50SAlex Elder 		int ret;
2698bf0d5f50SAlex Elder 
2699b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2700bf0d5f50SAlex Elder 		if (ret)
2701bf0d5f50SAlex Elder 			return ret;
2702bf0d5f50SAlex Elder 	}
2703bf0d5f50SAlex Elder 
2704bf0d5f50SAlex Elder 	return 0;
2705bf0d5f50SAlex Elder }
2706bf0d5f50SAlex Elder 
27078b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
27088b3e1a56SAlex Elder {
27098b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2710a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2711a9e8ba2cSAlex Elder 	u64 obj_end;
271202c74fbaSAlex Elder 	u64 img_xferred;
271302c74fbaSAlex Elder 	int img_result;
27148b3e1a56SAlex Elder 
27158b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
27168b3e1a56SAlex Elder 
271702c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
271802c74fbaSAlex Elder 
27198b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
272002c74fbaSAlex Elder 	img_xferred = img_request->xferred;
272102c74fbaSAlex Elder 	img_result = img_request->result;
272202c74fbaSAlex Elder 	rbd_img_request_put(img_request);
272302c74fbaSAlex Elder 
272402c74fbaSAlex Elder 	/*
272502c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
272602c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
272702c74fbaSAlex Elder 	 * original request.
272802c74fbaSAlex Elder 	 */
2729a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2730a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
273102c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
273202c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
273302c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
27348b3e1a56SAlex Elder 
273502c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
273602c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
273702c74fbaSAlex Elder 		if (!img_result)
273802c74fbaSAlex Elder 			return;
273902c74fbaSAlex Elder 	}
274002c74fbaSAlex Elder 
274102c74fbaSAlex Elder 	obj_request->result = img_result;
2742a9e8ba2cSAlex Elder 	if (obj_request->result)
2743a9e8ba2cSAlex Elder 		goto out;
2744a9e8ba2cSAlex Elder 
2745a9e8ba2cSAlex Elder 	/*
2746a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2747a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2748a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2749a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2750a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2751a9e8ba2cSAlex Elder 	 */
2752a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2753a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2754a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2755a9e8ba2cSAlex Elder 		u64 xferred = 0;
2756a9e8ba2cSAlex Elder 
2757a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2758a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2759a9e8ba2cSAlex Elder 					obj_request->img_offset;
2760a9e8ba2cSAlex Elder 
276102c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2762a9e8ba2cSAlex Elder 	} else {
276302c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2764a9e8ba2cSAlex Elder 	}
2765a9e8ba2cSAlex Elder out:
27668b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
27678b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
27688b3e1a56SAlex Elder }
27698b3e1a56SAlex Elder 
27708b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
27718b3e1a56SAlex Elder {
27728b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
27738b3e1a56SAlex Elder 	int result;
27748b3e1a56SAlex Elder 
27758b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
27768b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
27778b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
27785b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
27798b3e1a56SAlex Elder 
27808b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2781e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
27828b3e1a56SAlex Elder 						obj_request->img_offset,
2783e93f3152SAlex Elder 						obj_request->length);
27848b3e1a56SAlex Elder 	result = -ENOMEM;
27858b3e1a56SAlex Elder 	if (!img_request)
27868b3e1a56SAlex Elder 		goto out_err;
27878b3e1a56SAlex Elder 
27885b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
2789f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2790f1a4739fSAlex Elder 						obj_request->bio_list);
27915b2ab72dSAlex Elder 	else
27925b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
27935b2ab72dSAlex Elder 						obj_request->pages);
27948b3e1a56SAlex Elder 	if (result)
27958b3e1a56SAlex Elder 		goto out_err;
27968b3e1a56SAlex Elder 
27978b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
27988b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
27998b3e1a56SAlex Elder 	if (result)
28008b3e1a56SAlex Elder 		goto out_err;
28018b3e1a56SAlex Elder 
28028b3e1a56SAlex Elder 	return;
28038b3e1a56SAlex Elder out_err:
28048b3e1a56SAlex Elder 	if (img_request)
28058b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
28068b3e1a56SAlex Elder 	obj_request->result = result;
28078b3e1a56SAlex Elder 	obj_request->xferred = 0;
28088b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
28098b3e1a56SAlex Elder }
28108b3e1a56SAlex Elder 
2811cc4a38bdSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2812b8d70035SAlex Elder {
2813b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
28142169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2815b8d70035SAlex Elder 	int ret;
2816b8d70035SAlex Elder 
2817b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2818b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2819b8d70035SAlex Elder 	if (!obj_request)
2820b8d70035SAlex Elder 		return -ENOMEM;
2821b8d70035SAlex Elder 
2822b8d70035SAlex Elder 	ret = -ENOMEM;
2823430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2824b8d70035SAlex Elder 	if (!obj_request->osd_req)
2825b8d70035SAlex Elder 		goto out;
28262169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2827b8d70035SAlex Elder 
2828c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2829cc4a38bdSAlex Elder 					notify_id, 0, 0);
28309d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2831430c28c3SAlex Elder 
2832b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2833b8d70035SAlex Elder out:
2834cf81b60eSAlex Elder 	if (ret)
2835b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2836b8d70035SAlex Elder 
2837b8d70035SAlex Elder 	return ret;
2838b8d70035SAlex Elder }
2839b8d70035SAlex Elder 
2840b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2841b8d70035SAlex Elder {
2842b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2843e627db08SAlex Elder 	int ret;
2844b8d70035SAlex Elder 
2845b8d70035SAlex Elder 	if (!rbd_dev)
2846b8d70035SAlex Elder 		return;
2847b8d70035SAlex Elder 
284837206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2849b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2850b8d70035SAlex Elder 		(unsigned int)opcode);
2851e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2852e627db08SAlex Elder 	if (ret)
28533b5cf2a2SAlex Elder 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2854b8d70035SAlex Elder 
2855cc4a38bdSAlex Elder 	rbd_obj_notify_ack(rbd_dev, notify_id);
2856b8d70035SAlex Elder }
2857b8d70035SAlex Elder 
28589969ebc5SAlex Elder /*
28599969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
28609969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
28619969ebc5SAlex Elder  */
28621f3ef788SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
28639969ebc5SAlex Elder {
28649969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
28659969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
28669969ebc5SAlex Elder 	int ret;
28679969ebc5SAlex Elder 
28689969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
28699969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
28709969ebc5SAlex Elder 
28719969ebc5SAlex Elder 	if (start) {
28723c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
28739969ebc5SAlex Elder 						&rbd_dev->watch_event);
28749969ebc5SAlex Elder 		if (ret < 0)
28759969ebc5SAlex Elder 			return ret;
28768eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
28779969ebc5SAlex Elder 	}
28789969ebc5SAlex Elder 
28799969ebc5SAlex Elder 	ret = -ENOMEM;
28809969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
28819969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
28829969ebc5SAlex Elder 	if (!obj_request)
28839969ebc5SAlex Elder 		goto out_cancel;
28849969ebc5SAlex Elder 
2885430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2886430c28c3SAlex Elder 	if (!obj_request->osd_req)
2887430c28c3SAlex Elder 		goto out_cancel;
2888430c28c3SAlex Elder 
28898eb87565SAlex Elder 	if (start)
2890975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
28918eb87565SAlex Elder 	else
28926977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2893975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
28942169238dSAlex Elder 
28952169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
28961f3ef788SAlex Elder 				rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
28979d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
28982169238dSAlex Elder 
28999969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
29009969ebc5SAlex Elder 	if (ret)
29019969ebc5SAlex Elder 		goto out_cancel;
29029969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
29039969ebc5SAlex Elder 	if (ret)
29049969ebc5SAlex Elder 		goto out_cancel;
29059969ebc5SAlex Elder 	ret = obj_request->result;
29069969ebc5SAlex Elder 	if (ret)
29079969ebc5SAlex Elder 		goto out_cancel;
29089969ebc5SAlex Elder 
29098eb87565SAlex Elder 	/*
29108eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
29118eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
29128eb87565SAlex Elder 	 * a pointer to the object request during that time (in
29138eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
29148eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
29158eb87565SAlex Elder 	 * unregistered it.
29168eb87565SAlex Elder 	 */
29178eb87565SAlex Elder 	if (start) {
29188eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
29198eb87565SAlex Elder 
29208eb87565SAlex Elder 		return 0;
29218eb87565SAlex Elder 	}
29228eb87565SAlex Elder 
29238eb87565SAlex Elder 	/* We have successfully torn down the watch request */
29248eb87565SAlex Elder 
29258eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
29268eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
29279969ebc5SAlex Elder out_cancel:
29289969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
29299969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
29309969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
29319969ebc5SAlex Elder 	if (obj_request)
29329969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
29339969ebc5SAlex Elder 
29349969ebc5SAlex Elder 	return ret;
29359969ebc5SAlex Elder }
29369969ebc5SAlex Elder 
293736be9a76SAlex Elder /*
2938f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2939f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
294036be9a76SAlex Elder  */
294136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
294236be9a76SAlex Elder 			     const char *object_name,
294336be9a76SAlex Elder 			     const char *class_name,
294436be9a76SAlex Elder 			     const char *method_name,
29454157976bSAlex Elder 			     const void *outbound,
294636be9a76SAlex Elder 			     size_t outbound_size,
29474157976bSAlex Elder 			     void *inbound,
2948e2a58ee5SAlex Elder 			     size_t inbound_size)
294936be9a76SAlex Elder {
29502169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
295136be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
295236be9a76SAlex Elder 	struct page **pages;
295336be9a76SAlex Elder 	u32 page_count;
295436be9a76SAlex Elder 	int ret;
295536be9a76SAlex Elder 
295636be9a76SAlex Elder 	/*
29576010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
29586010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
29596010a451SAlex Elder 	 * also supply outbound data--parameters for the object
29606010a451SAlex Elder 	 * method.  Currently if this is present it will be a
29616010a451SAlex Elder 	 * snapshot id.
296236be9a76SAlex Elder 	 */
296336be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
296436be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
296536be9a76SAlex Elder 	if (IS_ERR(pages))
296636be9a76SAlex Elder 		return PTR_ERR(pages);
296736be9a76SAlex Elder 
296836be9a76SAlex Elder 	ret = -ENOMEM;
29696010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
297036be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
297136be9a76SAlex Elder 	if (!obj_request)
297236be9a76SAlex Elder 		goto out;
297336be9a76SAlex Elder 
297436be9a76SAlex Elder 	obj_request->pages = pages;
297536be9a76SAlex Elder 	obj_request->page_count = page_count;
297636be9a76SAlex Elder 
2977430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
297836be9a76SAlex Elder 	if (!obj_request->osd_req)
297936be9a76SAlex Elder 		goto out;
298036be9a76SAlex Elder 
2981c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
298204017e29SAlex Elder 					class_name, method_name);
298304017e29SAlex Elder 	if (outbound_size) {
298404017e29SAlex Elder 		struct ceph_pagelist *pagelist;
298504017e29SAlex Elder 
298604017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
298704017e29SAlex Elder 		if (!pagelist)
298804017e29SAlex Elder 			goto out;
298904017e29SAlex Elder 
299004017e29SAlex Elder 		ceph_pagelist_init(pagelist);
299104017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
299204017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
299304017e29SAlex Elder 						pagelist);
299404017e29SAlex Elder 	}
2995a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2996a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
299744cd188dSAlex Elder 					0, false, false);
29989d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2999430c28c3SAlex Elder 
300036be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
300136be9a76SAlex Elder 	if (ret)
300236be9a76SAlex Elder 		goto out;
300336be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
300436be9a76SAlex Elder 	if (ret)
300536be9a76SAlex Elder 		goto out;
300636be9a76SAlex Elder 
300736be9a76SAlex Elder 	ret = obj_request->result;
300836be9a76SAlex Elder 	if (ret < 0)
300936be9a76SAlex Elder 		goto out;
301057385b51SAlex Elder 
301157385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
301257385b51SAlex Elder 	ret = (int)obj_request->xferred;
3013903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
301436be9a76SAlex Elder out:
301536be9a76SAlex Elder 	if (obj_request)
301636be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
301736be9a76SAlex Elder 	else
301836be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
301936be9a76SAlex Elder 
302036be9a76SAlex Elder 	return ret;
302136be9a76SAlex Elder }
302236be9a76SAlex Elder 
3023bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
3024cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
3025bf0d5f50SAlex Elder {
3026bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
3027bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
3028bf0d5f50SAlex Elder 	struct request *rq;
3029bf0d5f50SAlex Elder 	int result;
3030bf0d5f50SAlex Elder 
3031bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
3032bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
3033bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
3034bf0d5f50SAlex Elder 		u64 offset;
3035bf0d5f50SAlex Elder 		u64 length;
3036bf0d5f50SAlex Elder 
3037bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
3038bf0d5f50SAlex Elder 
3039bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
30404dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
30414dda41d3SAlex Elder 				(int) rq->cmd_type);
30424dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
30434dda41d3SAlex Elder 			continue;
30444dda41d3SAlex Elder 		}
30454dda41d3SAlex Elder 
30464dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
30474dda41d3SAlex Elder 
30484dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
30494dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
30504dda41d3SAlex Elder 
30514dda41d3SAlex Elder 		if (!length) {
30524dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
3053bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
3054bf0d5f50SAlex Elder 			continue;
3055bf0d5f50SAlex Elder 		}
3056bf0d5f50SAlex Elder 
3057bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
3058bf0d5f50SAlex Elder 
3059bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
3060bf0d5f50SAlex Elder 
3061bf0d5f50SAlex Elder 		if (write_request) {
3062bf0d5f50SAlex Elder 			result = -EROFS;
3063bf0d5f50SAlex Elder 			if (read_only)
3064bf0d5f50SAlex Elder 				goto end_request;
3065bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3066bf0d5f50SAlex Elder 		}
3067bf0d5f50SAlex Elder 
30686d292906SAlex Elder 		/*
30696d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
30706d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
30716d292906SAlex Elder 		 * have disappeared by the time our request arrives
30726d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
30736d292906SAlex Elder 		 * we already know.
30746d292906SAlex Elder 		 */
30756d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3076bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
3077bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3078bf0d5f50SAlex Elder 			result = -ENXIO;
3079bf0d5f50SAlex Elder 			goto end_request;
3080bf0d5f50SAlex Elder 		}
3081bf0d5f50SAlex Elder 
3082bf0d5f50SAlex Elder 		result = -EINVAL;
3083c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
3084c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3085c0cd10dbSAlex Elder 				offset, length);
3086bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
3087c0cd10dbSAlex Elder 		}
3088bf0d5f50SAlex Elder 
308900a653e2SAlex Elder 		result = -EIO;
309000a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
309100a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
309200a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
309300a653e2SAlex Elder 			goto end_request;
309400a653e2SAlex Elder 		}
309500a653e2SAlex Elder 
3096bf0d5f50SAlex Elder 		result = -ENOMEM;
3097bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
3098e93f3152SAlex Elder 							write_request);
3099bf0d5f50SAlex Elder 		if (!img_request)
3100bf0d5f50SAlex Elder 			goto end_request;
3101bf0d5f50SAlex Elder 
3102bf0d5f50SAlex Elder 		img_request->rq = rq;
3103bf0d5f50SAlex Elder 
3104f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3105f1a4739fSAlex Elder 						rq->bio);
3106bf0d5f50SAlex Elder 		if (!result)
3107bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
3108bf0d5f50SAlex Elder 		if (result)
3109bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
3110bf0d5f50SAlex Elder end_request:
3111bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
3112bf0d5f50SAlex Elder 		if (result < 0) {
31137da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
31147da22d29SAlex Elder 				write_request ? "write" : "read",
31157da22d29SAlex Elder 				length, offset, result);
31167da22d29SAlex Elder 
3117bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
3118bf0d5f50SAlex Elder 		}
3119bf0d5f50SAlex Elder 	}
3120bf0d5f50SAlex Elder }
3121bf0d5f50SAlex Elder 
3122602adf40SYehuda Sadeh /*
3123602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3124602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3125f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3126602adf40SYehuda Sadeh  */
3127602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3128602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3129602adf40SYehuda Sadeh {
3130602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3131e5cfeed2SAlex Elder 	sector_t sector_offset;
3132e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3133e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3134e5cfeed2SAlex Elder 	int ret;
3135602adf40SYehuda Sadeh 
3136e5cfeed2SAlex Elder 	/*
3137e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3138e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3139e5cfeed2SAlex Elder 	 * device.
3140e5cfeed2SAlex Elder 	 */
3141e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3142e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3143e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3144593a9e7bSAlex Elder 
3145e5cfeed2SAlex Elder 	/*
3146e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3147e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3148e5cfeed2SAlex Elder 	 */
3149e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3150e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3151e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3152e5cfeed2SAlex Elder 	else
3153e5cfeed2SAlex Elder 		ret = 0;
3154e5cfeed2SAlex Elder 
3155e5cfeed2SAlex Elder 	/*
3156e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3157e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3158e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3159e5cfeed2SAlex Elder 	 * added to an empty bio."
3160e5cfeed2SAlex Elder 	 */
3161e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3162e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3163e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3164e5cfeed2SAlex Elder 
3165e5cfeed2SAlex Elder 	return ret;
3166602adf40SYehuda Sadeh }
3167602adf40SYehuda Sadeh 
3168602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3169602adf40SYehuda Sadeh {
3170602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3171602adf40SYehuda Sadeh 
3172602adf40SYehuda Sadeh 	if (!disk)
3173602adf40SYehuda Sadeh 		return;
3174602adf40SYehuda Sadeh 
3175a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3176a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3177602adf40SYehuda Sadeh 		del_gendisk(disk);
3178602adf40SYehuda Sadeh 		if (disk->queue)
3179602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
3180a0cab924SAlex Elder 	}
3181602adf40SYehuda Sadeh 	put_disk(disk);
3182602adf40SYehuda Sadeh }
3183602adf40SYehuda Sadeh 
3184788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3185788e2df3SAlex Elder 				const char *object_name,
31867097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3187788e2df3SAlex Elder 
3188788e2df3SAlex Elder {
31892169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3190788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3191788e2df3SAlex Elder 	struct page **pages = NULL;
3192788e2df3SAlex Elder 	u32 page_count;
31931ceae7efSAlex Elder 	size_t size;
3194788e2df3SAlex Elder 	int ret;
3195788e2df3SAlex Elder 
3196788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3197788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3198788e2df3SAlex Elder 	if (IS_ERR(pages))
3199788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3200788e2df3SAlex Elder 
3201788e2df3SAlex Elder 	ret = -ENOMEM;
3202788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3203788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3204788e2df3SAlex Elder 	if (!obj_request)
3205788e2df3SAlex Elder 		goto out;
3206788e2df3SAlex Elder 
3207788e2df3SAlex Elder 	obj_request->pages = pages;
3208788e2df3SAlex Elder 	obj_request->page_count = page_count;
3209788e2df3SAlex Elder 
3210430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3211788e2df3SAlex Elder 	if (!obj_request->osd_req)
3212788e2df3SAlex Elder 		goto out;
3213788e2df3SAlex Elder 
3214c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3215c99d2d4aSAlex Elder 					offset, length, 0, 0);
3216406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3217a4ce40a9SAlex Elder 					obj_request->pages,
321844cd188dSAlex Elder 					obj_request->length,
321944cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
322044cd188dSAlex Elder 					false, false);
32219d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3222430c28c3SAlex Elder 
3223788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3224788e2df3SAlex Elder 	if (ret)
3225788e2df3SAlex Elder 		goto out;
3226788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3227788e2df3SAlex Elder 	if (ret)
3228788e2df3SAlex Elder 		goto out;
3229788e2df3SAlex Elder 
3230788e2df3SAlex Elder 	ret = obj_request->result;
3231788e2df3SAlex Elder 	if (ret < 0)
3232788e2df3SAlex Elder 		goto out;
32331ceae7efSAlex Elder 
32341ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
32351ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3236903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
323723ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
323823ed6e13SAlex Elder 	ret = (int)size;
3239788e2df3SAlex Elder out:
3240788e2df3SAlex Elder 	if (obj_request)
3241788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3242788e2df3SAlex Elder 	else
3243788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3244788e2df3SAlex Elder 
3245788e2df3SAlex Elder 	return ret;
3246788e2df3SAlex Elder }
3247788e2df3SAlex Elder 
3248602adf40SYehuda Sadeh /*
3249662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3250662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3251662518b1SAlex Elder  * information about the image.
32524156d998SAlex Elder  */
325399a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
32544156d998SAlex Elder {
32554156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
32564156d998SAlex Elder 	u32 snap_count = 0;
32574156d998SAlex Elder 	u64 names_size = 0;
32584156d998SAlex Elder 	u32 want_count;
32594156d998SAlex Elder 	int ret;
32604156d998SAlex Elder 
32614156d998SAlex Elder 	/*
32624156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
32634156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
32644156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
32654156d998SAlex Elder 	 * the number of snapshots could change by the time we read
32664156d998SAlex Elder 	 * it in, in which case we re-read it.
32674156d998SAlex Elder 	 */
32684156d998SAlex Elder 	do {
32694156d998SAlex Elder 		size_t size;
32704156d998SAlex Elder 
32714156d998SAlex Elder 		kfree(ondisk);
32724156d998SAlex Elder 
32734156d998SAlex Elder 		size = sizeof (*ondisk);
32744156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
32754156d998SAlex Elder 		size += names_size;
32764156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
32774156d998SAlex Elder 		if (!ondisk)
3278662518b1SAlex Elder 			return -ENOMEM;
32794156d998SAlex Elder 
3280788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
32817097f8dfSAlex Elder 				       0, size, ondisk);
32824156d998SAlex Elder 		if (ret < 0)
3283662518b1SAlex Elder 			goto out;
3284c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
32854156d998SAlex Elder 			ret = -ENXIO;
328606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
328706ecc6cbSAlex Elder 				size, ret);
3288662518b1SAlex Elder 			goto out;
32894156d998SAlex Elder 		}
32904156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
32914156d998SAlex Elder 			ret = -ENXIO;
329206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3293662518b1SAlex Elder 			goto out;
32944156d998SAlex Elder 		}
32954156d998SAlex Elder 
32964156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
32974156d998SAlex Elder 		want_count = snap_count;
32984156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
32994156d998SAlex Elder 	} while (snap_count != want_count);
33004156d998SAlex Elder 
3301662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3302662518b1SAlex Elder out:
33034156d998SAlex Elder 	kfree(ondisk);
33044156d998SAlex Elder 
3305dfc5606dSYehuda Sadeh 	return ret;
3306602adf40SYehuda Sadeh }
3307602adf40SYehuda Sadeh 
330815228edeSAlex Elder /*
330915228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
331015228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
331115228edeSAlex Elder  */
331215228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
331315228edeSAlex Elder {
331415228edeSAlex Elder 	u64 snap_id;
331515228edeSAlex Elder 
331615228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
331715228edeSAlex Elder 		return;
331815228edeSAlex Elder 
331915228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
332015228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
332115228edeSAlex Elder 		return;
332215228edeSAlex Elder 
332315228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
332415228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
332515228edeSAlex Elder }
332615228edeSAlex Elder 
3327cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
33281fe5e993SAlex Elder {
3329e627db08SAlex Elder 	u64 mapping_size;
33301fe5e993SAlex Elder 	int ret;
33311fe5e993SAlex Elder 
3332117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3333cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
33343b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3335117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
333699a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
3337117973fbSAlex Elder 	else
33382df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
333915228edeSAlex Elder 
334015228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
334115228edeSAlex Elder 
334215228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
3343cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
3344cfbf6377SAlex Elder 
334500a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
334600a653e2SAlex Elder 		sector_t size;
334700a653e2SAlex Elder 
334800a653e2SAlex Elder 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
334900a653e2SAlex Elder 		dout("setting size to %llu sectors", (unsigned long long)size);
335000a653e2SAlex Elder 		set_capacity(rbd_dev->disk, size);
3351a3fbe5d4SAlex Elder 		revalidate_disk(rbd_dev->disk);
335200a653e2SAlex Elder 	}
33531fe5e993SAlex Elder 
33541fe5e993SAlex Elder 	return ret;
33551fe5e993SAlex Elder }
33561fe5e993SAlex Elder 
3357602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3358602adf40SYehuda Sadeh {
3359602adf40SYehuda Sadeh 	struct gendisk *disk;
3360602adf40SYehuda Sadeh 	struct request_queue *q;
3361593a9e7bSAlex Elder 	u64 segment_size;
3362602adf40SYehuda Sadeh 
3363602adf40SYehuda Sadeh 	/* create gendisk info */
3364602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3365602adf40SYehuda Sadeh 	if (!disk)
33661fcdb8aaSAlex Elder 		return -ENOMEM;
3367602adf40SYehuda Sadeh 
3368f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3369de71a297SAlex Elder 		 rbd_dev->dev_id);
3370602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3371602adf40SYehuda Sadeh 	disk->first_minor = 0;
3372602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3373602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3374602adf40SYehuda Sadeh 
3375bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3376602adf40SYehuda Sadeh 	if (!q)
3377602adf40SYehuda Sadeh 		goto out_disk;
3378029bcbd8SJosh Durgin 
3379593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3380593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3381593a9e7bSAlex Elder 
3382029bcbd8SJosh Durgin 	/* set io sizes to object size */
3383593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3384593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3385593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3386593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3387593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3388029bcbd8SJosh Durgin 
3389602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3390602adf40SYehuda Sadeh 	disk->queue = q;
3391602adf40SYehuda Sadeh 
3392602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3393602adf40SYehuda Sadeh 
3394602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3395602adf40SYehuda Sadeh 
3396602adf40SYehuda Sadeh 	return 0;
3397602adf40SYehuda Sadeh out_disk:
3398602adf40SYehuda Sadeh 	put_disk(disk);
33991fcdb8aaSAlex Elder 
34001fcdb8aaSAlex Elder 	return -ENOMEM;
3401602adf40SYehuda Sadeh }
3402602adf40SYehuda Sadeh 
3403dfc5606dSYehuda Sadeh /*
3404dfc5606dSYehuda Sadeh   sysfs
3405dfc5606dSYehuda Sadeh */
3406602adf40SYehuda Sadeh 
3407593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3408593a9e7bSAlex Elder {
3409593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3410593a9e7bSAlex Elder }
3411593a9e7bSAlex Elder 
3412dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3413dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3414602adf40SYehuda Sadeh {
3415593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3416dfc5606dSYehuda Sadeh 
3417fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3418fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3419602adf40SYehuda Sadeh }
3420602adf40SYehuda Sadeh 
342134b13184SAlex Elder /*
342234b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
342334b13184SAlex Elder  * necessarily the base image.
342434b13184SAlex Elder  */
342534b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
342634b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
342734b13184SAlex Elder {
342834b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
342934b13184SAlex Elder 
343034b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
343134b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
343234b13184SAlex Elder }
343334b13184SAlex Elder 
3434dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3435dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3436602adf40SYehuda Sadeh {
3437593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3438dfc5606dSYehuda Sadeh 
3439fc71d833SAlex Elder 	if (rbd_dev->major)
3440dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3441fc71d833SAlex Elder 
3442fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3443fc71d833SAlex Elder 
3444dfc5606dSYehuda Sadeh }
3445dfc5606dSYehuda Sadeh 
3446dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3447dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3448dfc5606dSYehuda Sadeh {
3449593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3450dfc5606dSYehuda Sadeh 
34511dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
34521dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3453dfc5606dSYehuda Sadeh }
3454dfc5606dSYehuda Sadeh 
3455dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3456dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3457dfc5606dSYehuda Sadeh {
3458593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3459dfc5606dSYehuda Sadeh 
34600d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3461dfc5606dSYehuda Sadeh }
3462dfc5606dSYehuda Sadeh 
34639bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
34649bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
34659bb2f334SAlex Elder {
34669bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
34679bb2f334SAlex Elder 
34680d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
34690d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
34709bb2f334SAlex Elder }
34719bb2f334SAlex Elder 
3472dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3473dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3474dfc5606dSYehuda Sadeh {
3475593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3476dfc5606dSYehuda Sadeh 
3477a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
34780d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3479a92ffdf8SAlex Elder 
3480a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3481dfc5606dSYehuda Sadeh }
3482dfc5606dSYehuda Sadeh 
3483589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3484589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3485589d30e0SAlex Elder {
3486589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3487589d30e0SAlex Elder 
34880d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3489589d30e0SAlex Elder }
3490589d30e0SAlex Elder 
349134b13184SAlex Elder /*
349234b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
349334b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
349434b13184SAlex Elder  */
3495dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3496dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3497dfc5606dSYehuda Sadeh 			     char *buf)
3498dfc5606dSYehuda Sadeh {
3499593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3500dfc5606dSYehuda Sadeh 
35010d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3502dfc5606dSYehuda Sadeh }
3503dfc5606dSYehuda Sadeh 
350486b00e0dSAlex Elder /*
350586b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
350686b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
350786b00e0dSAlex Elder  * "(no parent image)".
350886b00e0dSAlex Elder  */
350986b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
351086b00e0dSAlex Elder 			     struct device_attribute *attr,
351186b00e0dSAlex Elder 			     char *buf)
351286b00e0dSAlex Elder {
351386b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
351486b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
351586b00e0dSAlex Elder 	int count;
351686b00e0dSAlex Elder 	char *bufp = buf;
351786b00e0dSAlex Elder 
351886b00e0dSAlex Elder 	if (!spec)
351986b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
352086b00e0dSAlex Elder 
352186b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
352286b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
352386b00e0dSAlex Elder 	if (count < 0)
352486b00e0dSAlex Elder 		return count;
352586b00e0dSAlex Elder 	bufp += count;
352686b00e0dSAlex Elder 
352786b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
352886b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
352986b00e0dSAlex Elder 	if (count < 0)
353086b00e0dSAlex Elder 		return count;
353186b00e0dSAlex Elder 	bufp += count;
353286b00e0dSAlex Elder 
353386b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
353486b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
353586b00e0dSAlex Elder 	if (count < 0)
353686b00e0dSAlex Elder 		return count;
353786b00e0dSAlex Elder 	bufp += count;
353886b00e0dSAlex Elder 
353986b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
354086b00e0dSAlex Elder 	if (count < 0)
354186b00e0dSAlex Elder 		return count;
354286b00e0dSAlex Elder 	bufp += count;
354386b00e0dSAlex Elder 
354486b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
354586b00e0dSAlex Elder }
354686b00e0dSAlex Elder 
3547dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3548dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3549dfc5606dSYehuda Sadeh 				 const char *buf,
3550dfc5606dSYehuda Sadeh 				 size_t size)
3551dfc5606dSYehuda Sadeh {
3552593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3553b813623aSAlex Elder 	int ret;
3554602adf40SYehuda Sadeh 
3555cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3556e627db08SAlex Elder 	if (ret)
3557e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3558b813623aSAlex Elder 
3559b813623aSAlex Elder 	return ret < 0 ? ret : size;
3560dfc5606dSYehuda Sadeh }
3561602adf40SYehuda Sadeh 
3562dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
356334b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3564dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3565dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3566dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
35679bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3568dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3569589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3570dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3571dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
357286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3573dfc5606dSYehuda Sadeh 
3574dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3575dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
357634b13184SAlex Elder 	&dev_attr_features.attr,
3577dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3578dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3579dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
35809bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3581dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3582589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3583dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
358486b00e0dSAlex Elder 	&dev_attr_parent.attr,
3585dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3586dfc5606dSYehuda Sadeh 	NULL
3587dfc5606dSYehuda Sadeh };
3588dfc5606dSYehuda Sadeh 
3589dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3590dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3591dfc5606dSYehuda Sadeh };
3592dfc5606dSYehuda Sadeh 
3593dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3594dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3595dfc5606dSYehuda Sadeh 	NULL
3596dfc5606dSYehuda Sadeh };
3597dfc5606dSYehuda Sadeh 
/* Release callback for rbd_device_type; intentionally does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to release here */
}
3601dfc5606dSYehuda Sadeh 
3602dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3603dfc5606dSYehuda Sadeh 	.name		= "rbd",
3604dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3605dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3606dfc5606dSYehuda Sadeh };
3607dfc5606dSYehuda Sadeh 
36088b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
36098b8fb99cSAlex Elder {
36108b8fb99cSAlex Elder 	kref_get(&spec->kref);
36118b8fb99cSAlex Elder 
36128b8fb99cSAlex Elder 	return spec;
36138b8fb99cSAlex Elder }
36148b8fb99cSAlex Elder 
36158b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
36168b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
36178b8fb99cSAlex Elder {
36188b8fb99cSAlex Elder 	if (spec)
36198b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
36208b8fb99cSAlex Elder }
36218b8fb99cSAlex Elder 
36228b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
36238b8fb99cSAlex Elder {
36248b8fb99cSAlex Elder 	struct rbd_spec *spec;
36258b8fb99cSAlex Elder 
36268b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
36278b8fb99cSAlex Elder 	if (!spec)
36288b8fb99cSAlex Elder 		return NULL;
36298b8fb99cSAlex Elder 	kref_init(&spec->kref);
36308b8fb99cSAlex Elder 
36318b8fb99cSAlex Elder 	return spec;
36328b8fb99cSAlex Elder }
36338b8fb99cSAlex Elder 
36348b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
36358b8fb99cSAlex Elder {
36368b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
36378b8fb99cSAlex Elder 
36388b8fb99cSAlex Elder 	kfree(spec->pool_name);
36398b8fb99cSAlex Elder 	kfree(spec->image_id);
36408b8fb99cSAlex Elder 	kfree(spec->image_name);
36418b8fb99cSAlex Elder 	kfree(spec->snap_name);
36428b8fb99cSAlex Elder 	kfree(spec);
36438b8fb99cSAlex Elder }
36448b8fb99cSAlex Elder 
3645cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3646c53d5893SAlex Elder 				struct rbd_spec *spec)
3647c53d5893SAlex Elder {
3648c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3649c53d5893SAlex Elder 
3650c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3651c53d5893SAlex Elder 	if (!rbd_dev)
3652c53d5893SAlex Elder 		return NULL;
3653c53d5893SAlex Elder 
3654c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
36556d292906SAlex Elder 	rbd_dev->flags = 0;
3656a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
3657c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3658c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3659c53d5893SAlex Elder 
3660c53d5893SAlex Elder 	rbd_dev->spec = spec;
3661c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3662c53d5893SAlex Elder 
36630903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
36640903e875SAlex Elder 
36650903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
36660903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
36670903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
36680903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
36690903e875SAlex Elder 
3670c53d5893SAlex Elder 	return rbd_dev;
3671c53d5893SAlex Elder }
3672c53d5893SAlex Elder 
3673c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3674c53d5893SAlex Elder {
3675c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3676c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3677c53d5893SAlex Elder 	kfree(rbd_dev);
3678c53d5893SAlex Elder }
3679c53d5893SAlex Elder 
3680dfc5606dSYehuda Sadeh /*
36819d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
36829d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
36839d475de5SAlex Elder  * image.
36849d475de5SAlex Elder  */
36859d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
36869d475de5SAlex Elder 				u8 *order, u64 *snap_size)
36879d475de5SAlex Elder {
36889d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
36899d475de5SAlex Elder 	int ret;
36909d475de5SAlex Elder 	struct {
36919d475de5SAlex Elder 		u8 order;
36929d475de5SAlex Elder 		__le64 size;
36939d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
36949d475de5SAlex Elder 
369536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
36969d475de5SAlex Elder 				"rbd", "get_size",
36974157976bSAlex Elder 				&snapid, sizeof (snapid),
3698e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
369936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
37009d475de5SAlex Elder 	if (ret < 0)
37019d475de5SAlex Elder 		return ret;
370257385b51SAlex Elder 	if (ret < sizeof (size_buf))
370357385b51SAlex Elder 		return -ERANGE;
37049d475de5SAlex Elder 
3705c86f86e9SAlex Elder 	if (order)
37069d475de5SAlex Elder 		*order = size_buf.order;
37079d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
37089d475de5SAlex Elder 
37099d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
37109d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
37119d475de5SAlex Elder 		(unsigned long long)*snap_size);
37129d475de5SAlex Elder 
37139d475de5SAlex Elder 	return 0;
37149d475de5SAlex Elder }
37159d475de5SAlex Elder 
37169d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
37179d475de5SAlex Elder {
37189d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
37199d475de5SAlex Elder 					&rbd_dev->header.obj_order,
37209d475de5SAlex Elder 					&rbd_dev->header.image_size);
37219d475de5SAlex Elder }
37229d475de5SAlex Elder 
37231e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
37241e130199SAlex Elder {
37251e130199SAlex Elder 	void *reply_buf;
37261e130199SAlex Elder 	int ret;
37271e130199SAlex Elder 	void *p;
37281e130199SAlex Elder 
37291e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
37301e130199SAlex Elder 	if (!reply_buf)
37311e130199SAlex Elder 		return -ENOMEM;
37321e130199SAlex Elder 
373336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
37344157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3735e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
373636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
37371e130199SAlex Elder 	if (ret < 0)
37381e130199SAlex Elder 		goto out;
37391e130199SAlex Elder 
37401e130199SAlex Elder 	p = reply_buf;
37411e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
374257385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
374357385b51SAlex Elder 	ret = 0;
37441e130199SAlex Elder 
37451e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
37461e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
37471e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
37481e130199SAlex Elder 	} else {
37491e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
37501e130199SAlex Elder 	}
37511e130199SAlex Elder out:
37521e130199SAlex Elder 	kfree(reply_buf);
37531e130199SAlex Elder 
37541e130199SAlex Elder 	return ret;
37551e130199SAlex Elder }
37561e130199SAlex Elder 
3757b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3758b1b5402aSAlex Elder 		u64 *snap_features)
3759b1b5402aSAlex Elder {
3760b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3761b1b5402aSAlex Elder 	struct {
3762b1b5402aSAlex Elder 		__le64 features;
3763b1b5402aSAlex Elder 		__le64 incompat;
37644157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3765d889140cSAlex Elder 	u64 incompat;
3766b1b5402aSAlex Elder 	int ret;
3767b1b5402aSAlex Elder 
376836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3769b1b5402aSAlex Elder 				"rbd", "get_features",
37704157976bSAlex Elder 				&snapid, sizeof (snapid),
3771e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
377236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3773b1b5402aSAlex Elder 	if (ret < 0)
3774b1b5402aSAlex Elder 		return ret;
377557385b51SAlex Elder 	if (ret < sizeof (features_buf))
377657385b51SAlex Elder 		return -ERANGE;
3777d889140cSAlex Elder 
3778d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
37795cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3780b8f5c6edSAlex Elder 		return -ENXIO;
3781d889140cSAlex Elder 
3782b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3783b1b5402aSAlex Elder 
3784b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3785b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3786b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3787b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3788b1b5402aSAlex Elder 
3789b1b5402aSAlex Elder 	return 0;
3790b1b5402aSAlex Elder }
3791b1b5402aSAlex Elder 
3792b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3793b1b5402aSAlex Elder {
3794b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3795b1b5402aSAlex Elder 						&rbd_dev->header.features);
3796b1b5402aSAlex Elder }
3797b1b5402aSAlex Elder 
379886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
379986b00e0dSAlex Elder {
380086b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
380186b00e0dSAlex Elder 	size_t size;
380286b00e0dSAlex Elder 	void *reply_buf = NULL;
380386b00e0dSAlex Elder 	__le64 snapid;
380486b00e0dSAlex Elder 	void *p;
380586b00e0dSAlex Elder 	void *end;
3806642a2537SAlex Elder 	u64 pool_id;
380786b00e0dSAlex Elder 	char *image_id;
38083b5cf2a2SAlex Elder 	u64 snap_id;
380986b00e0dSAlex Elder 	u64 overlap;
381086b00e0dSAlex Elder 	int ret;
381186b00e0dSAlex Elder 
381286b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
381386b00e0dSAlex Elder 	if (!parent_spec)
381486b00e0dSAlex Elder 		return -ENOMEM;
381586b00e0dSAlex Elder 
381686b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
381786b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
381886b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
381986b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
382086b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
382186b00e0dSAlex Elder 	if (!reply_buf) {
382286b00e0dSAlex Elder 		ret = -ENOMEM;
382386b00e0dSAlex Elder 		goto out_err;
382486b00e0dSAlex Elder 	}
382586b00e0dSAlex Elder 
382686b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
382736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
382886b00e0dSAlex Elder 				"rbd", "get_parent",
38294157976bSAlex Elder 				&snapid, sizeof (snapid),
3830e2a58ee5SAlex Elder 				reply_buf, size);
383136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
383286b00e0dSAlex Elder 	if (ret < 0)
383386b00e0dSAlex Elder 		goto out_err;
383486b00e0dSAlex Elder 
383586b00e0dSAlex Elder 	p = reply_buf;
383657385b51SAlex Elder 	end = reply_buf + ret;
383757385b51SAlex Elder 	ret = -ERANGE;
3838642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
3839392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
3840392a9dadSAlex Elder 		/*
3841392a9dadSAlex Elder 		 * Either the parent never existed, or we have
3842392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
3843392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
3844392a9dadSAlex Elder 		 * layered image disappears we immediately set the
3845392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
3846392a9dadSAlex Elder 		 * requests will be treated as if the image had no
3847392a9dadSAlex Elder 		 * parent.
3848392a9dadSAlex Elder 		 */
3849392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
3850392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
3851392a9dadSAlex Elder 			smp_mb();
3852392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
3853392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
3854392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
3855392a9dadSAlex Elder 		}
3856392a9dadSAlex Elder 
385786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
3858392a9dadSAlex Elder 	}
385986b00e0dSAlex Elder 
38600903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
38610903e875SAlex Elder 
38620903e875SAlex Elder 	ret = -EIO;
3863642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
3864c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3865642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
386657385b51SAlex Elder 		goto out_err;
3867c0cd10dbSAlex Elder 	}
38680903e875SAlex Elder 
3869979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
387086b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
387186b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
387286b00e0dSAlex Elder 		goto out_err;
387386b00e0dSAlex Elder 	}
38743b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
387586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
387686b00e0dSAlex Elder 
38773b5cf2a2SAlex Elder 	/*
38783b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
38793b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
38803b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
38813b5cf2a2SAlex Elder 	 */
38823b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
38833b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
38843b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
38853b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
388686b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
388786b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
38883b5cf2a2SAlex Elder 	}
38893b5cf2a2SAlex Elder 
38903b5cf2a2SAlex Elder 	/*
38913b5cf2a2SAlex Elder 	 * We always update the parent overlap.  If it's zero we
38923b5cf2a2SAlex Elder 	 * treat it specially.
38933b5cf2a2SAlex Elder 	 */
389470cf49cfSAlex Elder 	rbd_dev->parent_overlap = overlap;
38953b5cf2a2SAlex Elder 	smp_mb();
38963b5cf2a2SAlex Elder 	if (!overlap) {
38973b5cf2a2SAlex Elder 
38983b5cf2a2SAlex Elder 		/* A null parent_spec indicates it's the initial probe */
38993b5cf2a2SAlex Elder 
39003b5cf2a2SAlex Elder 		if (parent_spec) {
39013b5cf2a2SAlex Elder 			/*
39023b5cf2a2SAlex Elder 			 * The overlap has become zero, so the clone
39033b5cf2a2SAlex Elder 			 * must have been resized down to 0 at some
39043b5cf2a2SAlex Elder 			 * point.  Treat this the same as a flatten.
39053b5cf2a2SAlex Elder 			 */
39063b5cf2a2SAlex Elder 			rbd_dev_parent_put(rbd_dev);
39073b5cf2a2SAlex Elder 			pr_info("%s: clone image now standalone\n",
39083b5cf2a2SAlex Elder 				rbd_dev->disk->disk_name);
390970cf49cfSAlex Elder 		} else {
39103b5cf2a2SAlex Elder 			/*
39113b5cf2a2SAlex Elder 			 * For the initial probe, if we find the
39123b5cf2a2SAlex Elder 			 * overlap is zero we just pretend there was
39133b5cf2a2SAlex Elder 			 * no parent image.
39143b5cf2a2SAlex Elder 			 */
39153b5cf2a2SAlex Elder 			rbd_warn(rbd_dev, "ignoring parent of "
39163b5cf2a2SAlex Elder 						"clone with overlap 0\n");
39173b5cf2a2SAlex Elder 		}
391870cf49cfSAlex Elder 	}
391986b00e0dSAlex Elder out:
392086b00e0dSAlex Elder 	ret = 0;
392186b00e0dSAlex Elder out_err:
392286b00e0dSAlex Elder 	kfree(reply_buf);
392386b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
392486b00e0dSAlex Elder 
392586b00e0dSAlex Elder 	return ret;
392686b00e0dSAlex Elder }
392786b00e0dSAlex Elder 
3928cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3929cc070d59SAlex Elder {
3930cc070d59SAlex Elder 	struct {
3931cc070d59SAlex Elder 		__le64 stripe_unit;
3932cc070d59SAlex Elder 		__le64 stripe_count;
3933cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3934cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3935cc070d59SAlex Elder 	void *p;
3936cc070d59SAlex Elder 	u64 obj_size;
3937cc070d59SAlex Elder 	u64 stripe_unit;
3938cc070d59SAlex Elder 	u64 stripe_count;
3939cc070d59SAlex Elder 	int ret;
3940cc070d59SAlex Elder 
3941cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3942cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3943e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
3944cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3945cc070d59SAlex Elder 	if (ret < 0)
3946cc070d59SAlex Elder 		return ret;
3947cc070d59SAlex Elder 	if (ret < size)
3948cc070d59SAlex Elder 		return -ERANGE;
3949cc070d59SAlex Elder 
3950cc070d59SAlex Elder 	/*
3951cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3952cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3953cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3954cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3955cc070d59SAlex Elder 	 */
3956cc070d59SAlex Elder 	ret = -EINVAL;
3957cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3958cc070d59SAlex Elder 	p = &striping_info_buf;
3959cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3960cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3961cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3962cc070d59SAlex Elder 				"(got %llu want %llu)",
3963cc070d59SAlex Elder 				stripe_unit, obj_size);
3964cc070d59SAlex Elder 		return -EINVAL;
3965cc070d59SAlex Elder 	}
3966cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3967cc070d59SAlex Elder 	if (stripe_count != 1) {
3968cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3969cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3970cc070d59SAlex Elder 		return -EINVAL;
3971cc070d59SAlex Elder 	}
3972500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
3973500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
3974cc070d59SAlex Elder 
3975cc070d59SAlex Elder 	return 0;
3976cc070d59SAlex Elder }
3977cc070d59SAlex Elder 
39789e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
39799e15b77dSAlex Elder {
39809e15b77dSAlex Elder 	size_t image_id_size;
39819e15b77dSAlex Elder 	char *image_id;
39829e15b77dSAlex Elder 	void *p;
39839e15b77dSAlex Elder 	void *end;
39849e15b77dSAlex Elder 	size_t size;
39859e15b77dSAlex Elder 	void *reply_buf = NULL;
39869e15b77dSAlex Elder 	size_t len = 0;
39879e15b77dSAlex Elder 	char *image_name = NULL;
39889e15b77dSAlex Elder 	int ret;
39899e15b77dSAlex Elder 
39909e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
39919e15b77dSAlex Elder 
399269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
399369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
39949e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
39959e15b77dSAlex Elder 	if (!image_id)
39969e15b77dSAlex Elder 		return NULL;
39979e15b77dSAlex Elder 
39989e15b77dSAlex Elder 	p = image_id;
39994157976bSAlex Elder 	end = image_id + image_id_size;
400069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
40019e15b77dSAlex Elder 
40029e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
40039e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
40049e15b77dSAlex Elder 	if (!reply_buf)
40059e15b77dSAlex Elder 		goto out;
40069e15b77dSAlex Elder 
400736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
40089e15b77dSAlex Elder 				"rbd", "dir_get_name",
40099e15b77dSAlex Elder 				image_id, image_id_size,
4010e2a58ee5SAlex Elder 				reply_buf, size);
40119e15b77dSAlex Elder 	if (ret < 0)
40129e15b77dSAlex Elder 		goto out;
40139e15b77dSAlex Elder 	p = reply_buf;
4014f40eb349SAlex Elder 	end = reply_buf + ret;
4015f40eb349SAlex Elder 
40169e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
40179e15b77dSAlex Elder 	if (IS_ERR(image_name))
40189e15b77dSAlex Elder 		image_name = NULL;
40199e15b77dSAlex Elder 	else
40209e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
40219e15b77dSAlex Elder out:
40229e15b77dSAlex Elder 	kfree(reply_buf);
40239e15b77dSAlex Elder 	kfree(image_id);
40249e15b77dSAlex Elder 
40259e15b77dSAlex Elder 	return image_name;
40269e15b77dSAlex Elder }
40279e15b77dSAlex Elder 
40282ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40292ad3d716SAlex Elder {
40302ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
40312ad3d716SAlex Elder 	const char *snap_name;
40322ad3d716SAlex Elder 	u32 which = 0;
40332ad3d716SAlex Elder 
40342ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
40352ad3d716SAlex Elder 
40362ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
40372ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
40382ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
40392ad3d716SAlex Elder 			return snapc->snaps[which];
40402ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
40412ad3d716SAlex Elder 		which++;
40422ad3d716SAlex Elder 	}
40432ad3d716SAlex Elder 	return CEPH_NOSNAP;
40442ad3d716SAlex Elder }
40452ad3d716SAlex Elder 
40462ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40472ad3d716SAlex Elder {
40482ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
40492ad3d716SAlex Elder 	u32 which;
40502ad3d716SAlex Elder 	bool found = false;
40512ad3d716SAlex Elder 	u64 snap_id;
40522ad3d716SAlex Elder 
40532ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
40542ad3d716SAlex Elder 		const char *snap_name;
40552ad3d716SAlex Elder 
40562ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
40572ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
40582ad3d716SAlex Elder 		if (IS_ERR(snap_name))
40592ad3d716SAlex Elder 			break;
40602ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
40612ad3d716SAlex Elder 		kfree(snap_name);
40622ad3d716SAlex Elder 	}
40632ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
40642ad3d716SAlex Elder }
40652ad3d716SAlex Elder 
40662ad3d716SAlex Elder /*
40672ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
40682ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
40692ad3d716SAlex Elder  */
40702ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40712ad3d716SAlex Elder {
40722ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
40732ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
40742ad3d716SAlex Elder 
40752ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
40762ad3d716SAlex Elder }
40772ad3d716SAlex Elder 
40789e15b77dSAlex Elder /*
40792e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
40802e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
40812e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
40822e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
40832e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
40842e9f7f1cSAlex Elder  * allocated.
4085e1d4213fSAlex Elder  *
4086e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
4087e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
4088e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
40899e15b77dSAlex Elder  */
40902e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
40919e15b77dSAlex Elder {
40922e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
40932e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
40942e9f7f1cSAlex Elder 	const char *pool_name;
40952e9f7f1cSAlex Elder 	const char *image_name;
40962e9f7f1cSAlex Elder 	const char *snap_name;
40979e15b77dSAlex Elder 	int ret;
40989e15b77dSAlex Elder 
4099e1d4213fSAlex Elder 	/*
4100e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
4101e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
4102e1d4213fSAlex Elder 	 */
41032e9f7f1cSAlex Elder 	if (spec->pool_name) {
41042e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
41052ad3d716SAlex Elder 			u64 snap_id;
4106e1d4213fSAlex Elder 
41072ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
41082ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
4109e1d4213fSAlex Elder 				return -ENOENT;
41102ad3d716SAlex Elder 			spec->snap_id = snap_id;
4111e1d4213fSAlex Elder 		} else {
41122e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
4113e1d4213fSAlex Elder 		}
4114e1d4213fSAlex Elder 
4115e1d4213fSAlex Elder 		return 0;
4116e1d4213fSAlex Elder 	}
41179e15b77dSAlex Elder 
41182e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
41199e15b77dSAlex Elder 
41202e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
41212e9f7f1cSAlex Elder 	if (!pool_name) {
41222e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4123935dc89fSAlex Elder 		return -EIO;
4124935dc89fSAlex Elder 	}
41252e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
41262e9f7f1cSAlex Elder 	if (!pool_name)
41279e15b77dSAlex Elder 		return -ENOMEM;
41289e15b77dSAlex Elder 
41299e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
41309e15b77dSAlex Elder 
41312e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
41322e9f7f1cSAlex Elder 	if (!image_name)
413306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
41349e15b77dSAlex Elder 
41352e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
41369e15b77dSAlex Elder 
41372e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
41382e9f7f1cSAlex Elder 	if (!snap_name) {
41392e9f7f1cSAlex Elder 		ret = -ENOMEM;
41409e15b77dSAlex Elder 		goto out_err;
41412e9f7f1cSAlex Elder 	}
41422e9f7f1cSAlex Elder 
41432e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
41442e9f7f1cSAlex Elder 	spec->image_name = image_name;
41452e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
41469e15b77dSAlex Elder 
41479e15b77dSAlex Elder 	return 0;
41489e15b77dSAlex Elder out_err:
41492e9f7f1cSAlex Elder 	kfree(image_name);
41502e9f7f1cSAlex Elder 	kfree(pool_name);
41519e15b77dSAlex Elder 
41529e15b77dSAlex Elder 	return ret;
41539e15b77dSAlex Elder }
41549e15b77dSAlex Elder 
4155cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
415635d489f9SAlex Elder {
415735d489f9SAlex Elder 	size_t size;
415835d489f9SAlex Elder 	int ret;
415935d489f9SAlex Elder 	void *reply_buf;
416035d489f9SAlex Elder 	void *p;
416135d489f9SAlex Elder 	void *end;
416235d489f9SAlex Elder 	u64 seq;
416335d489f9SAlex Elder 	u32 snap_count;
416435d489f9SAlex Elder 	struct ceph_snap_context *snapc;
416535d489f9SAlex Elder 	u32 i;
416635d489f9SAlex Elder 
416735d489f9SAlex Elder 	/*
416835d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
416935d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
417035d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
417135d489f9SAlex Elder 	 * prepared to receive.
417235d489f9SAlex Elder 	 */
417335d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
417435d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
417535d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
417635d489f9SAlex Elder 	if (!reply_buf)
417735d489f9SAlex Elder 		return -ENOMEM;
417835d489f9SAlex Elder 
417936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
41804157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4181e2a58ee5SAlex Elder 				reply_buf, size);
418236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
418335d489f9SAlex Elder 	if (ret < 0)
418435d489f9SAlex Elder 		goto out;
418535d489f9SAlex Elder 
418635d489f9SAlex Elder 	p = reply_buf;
418757385b51SAlex Elder 	end = reply_buf + ret;
418857385b51SAlex Elder 	ret = -ERANGE;
418935d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
419035d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
419135d489f9SAlex Elder 
419235d489f9SAlex Elder 	/*
419335d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
419435d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
419535d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
419635d489f9SAlex Elder 	 * allocate is representable in a size_t.
419735d489f9SAlex Elder 	 */
419835d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
419935d489f9SAlex Elder 				 / sizeof (u64)) {
420035d489f9SAlex Elder 		ret = -EINVAL;
420135d489f9SAlex Elder 		goto out;
420235d489f9SAlex Elder 	}
420335d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
420435d489f9SAlex Elder 		goto out;
4205468521c1SAlex Elder 	ret = 0;
420635d489f9SAlex Elder 
4207812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
420835d489f9SAlex Elder 	if (!snapc) {
420935d489f9SAlex Elder 		ret = -ENOMEM;
421035d489f9SAlex Elder 		goto out;
421135d489f9SAlex Elder 	}
421235d489f9SAlex Elder 	snapc->seq = seq;
421335d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
421435d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
421535d489f9SAlex Elder 
421649ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
421735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
421835d489f9SAlex Elder 
421935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
422035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
422135d489f9SAlex Elder out:
422235d489f9SAlex Elder 	kfree(reply_buf);
422335d489f9SAlex Elder 
422457385b51SAlex Elder 	return ret;
422535d489f9SAlex Elder }
422635d489f9SAlex Elder 
422754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
422854cac61fSAlex Elder 					u64 snap_id)
4229b8b1e2dbSAlex Elder {
4230b8b1e2dbSAlex Elder 	size_t size;
4231b8b1e2dbSAlex Elder 	void *reply_buf;
423254cac61fSAlex Elder 	__le64 snapid;
4233b8b1e2dbSAlex Elder 	int ret;
4234b8b1e2dbSAlex Elder 	void *p;
4235b8b1e2dbSAlex Elder 	void *end;
4236b8b1e2dbSAlex Elder 	char *snap_name;
4237b8b1e2dbSAlex Elder 
4238b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4239b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4240b8b1e2dbSAlex Elder 	if (!reply_buf)
4241b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4242b8b1e2dbSAlex Elder 
424354cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
424436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4245b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
424654cac61fSAlex Elder 				&snapid, sizeof (snapid),
4247e2a58ee5SAlex Elder 				reply_buf, size);
424836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4249f40eb349SAlex Elder 	if (ret < 0) {
4250f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4251b8b1e2dbSAlex Elder 		goto out;
4252f40eb349SAlex Elder 	}
4253b8b1e2dbSAlex Elder 
4254b8b1e2dbSAlex Elder 	p = reply_buf;
4255f40eb349SAlex Elder 	end = reply_buf + ret;
4256e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4257f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4258b8b1e2dbSAlex Elder 		goto out;
4259f40eb349SAlex Elder 
4260b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
426154cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4262b8b1e2dbSAlex Elder out:
4263b8b1e2dbSAlex Elder 	kfree(reply_buf);
4264b8b1e2dbSAlex Elder 
4265f40eb349SAlex Elder 	return snap_name;
4266b8b1e2dbSAlex Elder }
4267b8b1e2dbSAlex Elder 
42682df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4269117973fbSAlex Elder {
42702df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4271117973fbSAlex Elder 	int ret;
4272117973fbSAlex Elder 
42731617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
42741617e40cSJosh Durgin 	if (ret)
4275cfbf6377SAlex Elder 		return ret;
42761617e40cSJosh Durgin 
42772df3fac7SAlex Elder 	if (first_time) {
42782df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
42792df3fac7SAlex Elder 		if (ret)
4280cfbf6377SAlex Elder 			return ret;
42812df3fac7SAlex Elder 	}
42822df3fac7SAlex Elder 
4283642a2537SAlex Elder 	/*
4284642a2537SAlex Elder 	 * If the image supports layering, get the parent info.  We
4285642a2537SAlex Elder 	 * need to probe the first time regardless.  Thereafter we
4286642a2537SAlex Elder 	 * only need to if there's a parent, to see if it has
4287642a2537SAlex Elder 	 * disappeared due to the mapped image getting flattened.
4288642a2537SAlex Elder 	 */
4289642a2537SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4290642a2537SAlex Elder 			(first_time || rbd_dev->parent_spec)) {
4291642a2537SAlex Elder 		bool warn;
4292642a2537SAlex Elder 
4293642a2537SAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
4294642a2537SAlex Elder 		if (ret)
4295cfbf6377SAlex Elder 			return ret;
4296642a2537SAlex Elder 
4297642a2537SAlex Elder 		/*
4298642a2537SAlex Elder 		 * Print a warning if this is the initial probe and
4299642a2537SAlex Elder 		 * the image has a parent.  Don't print it if the
4300642a2537SAlex Elder 		 * image now being probed is itself a parent.  We
4301642a2537SAlex Elder 		 * can tell at this point because we won't know its
4302642a2537SAlex Elder 		 * pool name yet (just its pool id).
4303642a2537SAlex Elder 		 */
4304642a2537SAlex Elder 		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4305642a2537SAlex Elder 		if (first_time && warn)
4306642a2537SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
4307642a2537SAlex Elder 					"is EXPERIMENTAL!");
4308642a2537SAlex Elder 	}
4309642a2537SAlex Elder 
431029334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
431129334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
431229334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4313117973fbSAlex Elder 
4314cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4315117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4316117973fbSAlex Elder 
4317117973fbSAlex Elder 	return ret;
4318117973fbSAlex Elder }
4319117973fbSAlex Elder 
4320dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4321dfc5606dSYehuda Sadeh {
4322dfc5606dSYehuda Sadeh 	struct device *dev;
4323cd789ab9SAlex Elder 	int ret;
4324dfc5606dSYehuda Sadeh 
4325cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4326dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4327dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4328dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4329200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4330de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4331dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4332dfc5606dSYehuda Sadeh 
4333dfc5606dSYehuda Sadeh 	return ret;
4334602adf40SYehuda Sadeh }
4335602adf40SYehuda Sadeh 
4336dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4337dfc5606dSYehuda Sadeh {
4338dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4339dfc5606dSYehuda Sadeh }
4340dfc5606dSYehuda Sadeh 
4341e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
43421ddbe94eSAlex Elder 
43431ddbe94eSAlex Elder /*
4344499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4345499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
43461ddbe94eSAlex Elder  */
4347e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4348b7f23c36SAlex Elder {
4349e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4350499afd5bSAlex Elder 
4351499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4352499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4353499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4354e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4355e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4356b7f23c36SAlex Elder }
4357b7f23c36SAlex Elder 
43581ddbe94eSAlex Elder /*
4359499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4360499afd5bSAlex Elder  * identifier is no longer in use.
43611ddbe94eSAlex Elder  */
4362e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
43631ddbe94eSAlex Elder {
4364d184f6bfSAlex Elder 	struct list_head *tmp;
4365de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4366d184f6bfSAlex Elder 	int max_id;
4367d184f6bfSAlex Elder 
4368aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4369499afd5bSAlex Elder 
4370e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4371e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4372499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4373499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4374d184f6bfSAlex Elder 
4375d184f6bfSAlex Elder 	/*
4376d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4377d184f6bfSAlex Elder 	 * is nothing special we need to do.
4378d184f6bfSAlex Elder 	 */
4379e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4380d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4381d184f6bfSAlex Elder 		return;
4382d184f6bfSAlex Elder 	}
4383d184f6bfSAlex Elder 
4384d184f6bfSAlex Elder 	/*
4385d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4386d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4387d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4388d184f6bfSAlex Elder 	 */
4389d184f6bfSAlex Elder 	max_id = 0;
4390d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4391d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4392d184f6bfSAlex Elder 
4393d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4394b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4395b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4396d184f6bfSAlex Elder 	}
4397499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
43981ddbe94eSAlex Elder 
43991ddbe94eSAlex Elder 	/*
4400e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4401d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4402d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4403d184f6bfSAlex Elder 	 * case.
44041ddbe94eSAlex Elder 	 */
4405e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4406e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4407b7f23c36SAlex Elder }
4408b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none).  Returns the length of the token found there, i.e. the run
 * of non-white-space characters starting at the updated *buf.  The
 * string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, spaces);

	*buf = start;			/* Start of the token */

	return strcspn(start, spaces);	/* Length of the token */
}
4427e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, if it fits, copy it into the
 * caller's token buffer with a terminating '\0'.  The string at *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the token length, not counting the '\0'.  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and nothing was written to the token buffer.
 *
 * In every case *buf is left pointing just past the token that was
 * found, including when the token was too long to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t len = next_token(buf);
	const char *start = *buf;

	if (len < token_size) {
		memcpy(token, start, len);
		token[len] = '\0';
	}
	*buf = start + len;

	return len;
}
4457e28fff26SAlex Elder 
4458e28fff26SAlex Elder /*
4459ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4460ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4461ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4462ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4463ea3352f4SAlex Elder  *
4464ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4465ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4466ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4467ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4468ea3352f4SAlex Elder  *
4469ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4470ea3352f4SAlex Elder  * the end of the found token.
4471ea3352f4SAlex Elder  *
4472ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4473ea3352f4SAlex Elder  */
4474ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4475ea3352f4SAlex Elder {
4476ea3352f4SAlex Elder 	char *dup;
4477ea3352f4SAlex Elder 	size_t len;
4478ea3352f4SAlex Elder 
4479ea3352f4SAlex Elder 	len = next_token(buf);
44804caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4481ea3352f4SAlex Elder 	if (!dup)
4482ea3352f4SAlex Elder 		return NULL;
4483ea3352f4SAlex Elder 	*(dup + len) = '\0';
4484ea3352f4SAlex Elder 	*buf += len;
4485ea3352f4SAlex Elder 
4486ea3352f4SAlex Elder 	if (lenp)
4487ea3352f4SAlex Elder 		*lenp = len;
4488ea3352f4SAlex Elder 
4489ea3352f4SAlex Elder 	return dup;
4490ea3352f4SAlex Elder }
4491ea3352f4SAlex Elder 
4492ea3352f4SAlex Elder /*
4493859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4494859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4495859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4496859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4497d22f76e7SAlex Elder  *
4498859c31dfSAlex Elder  * The information extracted from these options is recorded in
4499859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4500859c31dfSAlex Elder  * structures:
4501859c31dfSAlex Elder  *  ceph_opts
4502859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4503859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4504859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4505859c31dfSAlex Elder  *  rbd_opts
4506859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4507859c31dfSAlex Elder  *	this function; caller must release with kfree().
4508859c31dfSAlex Elder  *  spec
4509859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4510859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4511859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4512859c31dfSAlex Elder  *
4513859c31dfSAlex Elder  * The options passed take this form:
4514859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4515859c31dfSAlex Elder  * where:
4516859c31dfSAlex Elder  *  <mon_addrs>
4517859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4518859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4519859c31dfSAlex Elder  *      by a port number (separated by a colon).
4520859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4521859c31dfSAlex Elder  *  <options>
4522859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4523859c31dfSAlex Elder  *  <pool_name>
4524859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4525859c31dfSAlex Elder  *  <image_name>
4526859c31dfSAlex Elder  *      The name of the image in that pool to map.
4527859c31dfSAlex Elder  *  <snap_id>
4528859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4529859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4530859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4531859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4532a725f65eSAlex Elder  */
4533859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4534dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4535859c31dfSAlex Elder 				struct rbd_options **opts,
4536859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4537a725f65eSAlex Elder {
4538e28fff26SAlex Elder 	size_t len;
4539859c31dfSAlex Elder 	char *options;
45400ddebc0cSAlex Elder 	const char *mon_addrs;
4541ecb4dc22SAlex Elder 	char *snap_name;
45420ddebc0cSAlex Elder 	size_t mon_addrs_size;
4543859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
45444e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4545859c31dfSAlex Elder 	struct ceph_options *copts;
4546dc79b113SAlex Elder 	int ret;
4547e28fff26SAlex Elder 
4548e28fff26SAlex Elder 	/* The first four tokens are required */
4549e28fff26SAlex Elder 
45507ef3214aSAlex Elder 	len = next_token(&buf);
45514fb5d671SAlex Elder 	if (!len) {
45524fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
45534fb5d671SAlex Elder 		return -EINVAL;
45544fb5d671SAlex Elder 	}
45550ddebc0cSAlex Elder 	mon_addrs = buf;
4556f28e565aSAlex Elder 	mon_addrs_size = len + 1;
45577ef3214aSAlex Elder 	buf += len;
4558a725f65eSAlex Elder 
4559dc79b113SAlex Elder 	ret = -EINVAL;
4560f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4561f28e565aSAlex Elder 	if (!options)
4562dc79b113SAlex Elder 		return -ENOMEM;
45634fb5d671SAlex Elder 	if (!*options) {
45644fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
45654fb5d671SAlex Elder 		goto out_err;
45664fb5d671SAlex Elder 	}
4567a725f65eSAlex Elder 
4568859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4569859c31dfSAlex Elder 	if (!spec)
4570f28e565aSAlex Elder 		goto out_mem;
4571859c31dfSAlex Elder 
4572859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4573859c31dfSAlex Elder 	if (!spec->pool_name)
4574859c31dfSAlex Elder 		goto out_mem;
45754fb5d671SAlex Elder 	if (!*spec->pool_name) {
45764fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
45774fb5d671SAlex Elder 		goto out_err;
45784fb5d671SAlex Elder 	}
4579e28fff26SAlex Elder 
458069e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4581859c31dfSAlex Elder 	if (!spec->image_name)
4582f28e565aSAlex Elder 		goto out_mem;
45834fb5d671SAlex Elder 	if (!*spec->image_name) {
45844fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
45854fb5d671SAlex Elder 		goto out_err;
45864fb5d671SAlex Elder 	}
4587e28fff26SAlex Elder 
4588f28e565aSAlex Elder 	/*
4589f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4590f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4591f28e565aSAlex Elder 	 */
45923feeb894SAlex Elder 	len = next_token(&buf);
4593820a5f3eSAlex Elder 	if (!len) {
45943feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
45953feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4596f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4597dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4598f28e565aSAlex Elder 		goto out_err;
4599849b4260SAlex Elder 	}
4600ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4601ecb4dc22SAlex Elder 	if (!snap_name)
4602f28e565aSAlex Elder 		goto out_mem;
4603ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4604ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4605e5c35534SAlex Elder 
46060ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4607e28fff26SAlex Elder 
46084e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
46094e9afebaSAlex Elder 	if (!rbd_opts)
46104e9afebaSAlex Elder 		goto out_mem;
46114e9afebaSAlex Elder 
46124e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4613d22f76e7SAlex Elder 
4614859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
46150ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
46164e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4617859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4618859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4619dc79b113SAlex Elder 		goto out_err;
4620dc79b113SAlex Elder 	}
4621859c31dfSAlex Elder 	kfree(options);
4622859c31dfSAlex Elder 
4623859c31dfSAlex Elder 	*ceph_opts = copts;
46244e9afebaSAlex Elder 	*opts = rbd_opts;
4625859c31dfSAlex Elder 	*rbd_spec = spec;
46260ddebc0cSAlex Elder 
4627dc79b113SAlex Elder 	return 0;
4628f28e565aSAlex Elder out_mem:
4629dc79b113SAlex Elder 	ret = -ENOMEM;
4630d22f76e7SAlex Elder out_err:
4631859c31dfSAlex Elder 	kfree(rbd_opts);
4632859c31dfSAlex Elder 	rbd_spec_put(spec);
4633f28e565aSAlex Elder 	kfree(options);
4634d22f76e7SAlex Elder 
4635dc79b113SAlex Elder 	return ret;
4636a725f65eSAlex Elder }
4637a725f65eSAlex Elder 
4638589d30e0SAlex Elder /*
4639589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4640589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4641589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4642589d30e0SAlex Elder  *
4643589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4644589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4645589d30e0SAlex Elder  * with the supplied name.
4646589d30e0SAlex Elder  *
4647589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4648589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4649589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4650589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4651589d30e0SAlex Elder  */
4652589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4653589d30e0SAlex Elder {
4654589d30e0SAlex Elder 	int ret;
4655589d30e0SAlex Elder 	size_t size;
4656589d30e0SAlex Elder 	char *object_name;
4657589d30e0SAlex Elder 	void *response;
4658c0fba368SAlex Elder 	char *image_id;
46592f82ee54SAlex Elder 
4660589d30e0SAlex Elder 	/*
46612c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
46622c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4663c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4664c0fba368SAlex Elder 	 * do still need to set the image format though.
46652c0d0a10SAlex Elder 	 */
4666c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4667c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4668c0fba368SAlex Elder 
46692c0d0a10SAlex Elder 		return 0;
4670c0fba368SAlex Elder 	}
46712c0d0a10SAlex Elder 
46722c0d0a10SAlex Elder 	/*
4673589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4674589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4675589d30e0SAlex Elder 	 */
467669e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4677589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4678589d30e0SAlex Elder 	if (!object_name)
4679589d30e0SAlex Elder 		return -ENOMEM;
46800d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4681589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4682589d30e0SAlex Elder 
4683589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4684589d30e0SAlex Elder 
4685589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4686589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4687589d30e0SAlex Elder 	if (!response) {
4688589d30e0SAlex Elder 		ret = -ENOMEM;
4689589d30e0SAlex Elder 		goto out;
4690589d30e0SAlex Elder 	}
4691589d30e0SAlex Elder 
4692c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4693c0fba368SAlex Elder 
469436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
46954157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4696e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
469736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4698c0fba368SAlex Elder 	if (ret == -ENOENT) {
4699c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4700c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4701c0fba368SAlex Elder 		if (!ret)
4702c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4703c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4704c0fba368SAlex Elder 		void *p = response;
4705589d30e0SAlex Elder 
4706c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4707979ed480SAlex Elder 						NULL, GFP_NOIO);
4708c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4709c0fba368SAlex Elder 		if (!ret)
4710c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4711589d30e0SAlex Elder 	} else {
4712c0fba368SAlex Elder 		ret = -EINVAL;
4713c0fba368SAlex Elder 	}
4714c0fba368SAlex Elder 
4715c0fba368SAlex Elder 	if (!ret) {
4716c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4717c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4718589d30e0SAlex Elder 	}
4719589d30e0SAlex Elder out:
4720589d30e0SAlex Elder 	kfree(response);
4721589d30e0SAlex Elder 	kfree(object_name);
4722589d30e0SAlex Elder 
4723589d30e0SAlex Elder 	return ret;
4724589d30e0SAlex Elder }
4725589d30e0SAlex Elder 
47263abef3b3SAlex Elder /*
47273abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
47283abef3b3SAlex Elder  * call.
47293abef3b3SAlex Elder  */
47306fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
47316fd48b3bSAlex Elder {
47326fd48b3bSAlex Elder 	struct rbd_image_header	*header;
47336fd48b3bSAlex Elder 
4734392a9dadSAlex Elder 	/* Drop parent reference unless it's already been done (or none) */
4735392a9dadSAlex Elder 
4736392a9dadSAlex Elder 	if (rbd_dev->parent_overlap)
4737a2acd00eSAlex Elder 		rbd_dev_parent_put(rbd_dev);
47386fd48b3bSAlex Elder 
47396fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
47406fd48b3bSAlex Elder 
47416fd48b3bSAlex Elder 	header = &rbd_dev->header;
4742812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
47436fd48b3bSAlex Elder 	kfree(header->snap_sizes);
47446fd48b3bSAlex Elder 	kfree(header->snap_names);
47456fd48b3bSAlex Elder 	kfree(header->object_prefix);
47466fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
47476fd48b3bSAlex Elder }
47486fd48b3bSAlex Elder 
47492df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4750a30b71b9SAlex Elder {
4751a30b71b9SAlex Elder 	int ret;
4752a30b71b9SAlex Elder 
47531e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
475457385b51SAlex Elder 	if (ret)
47551e130199SAlex Elder 		goto out_err;
4756b1b5402aSAlex Elder 
47572df3fac7SAlex Elder 	/*
47582df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
47592df3fac7SAlex Elder 	 * features are assumed to never change.
47602df3fac7SAlex Elder 	 */
4761b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
476257385b51SAlex Elder 	if (ret)
4763b1b5402aSAlex Elder 		goto out_err;
476435d489f9SAlex Elder 
4765cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4766cc070d59SAlex Elder 
4767cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4768cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4769cc070d59SAlex Elder 		if (ret < 0)
4770cc070d59SAlex Elder 			goto out_err;
4771cc070d59SAlex Elder 	}
47722df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
4773a30b71b9SAlex Elder 
477435152979SAlex Elder 	return 0;
47759d475de5SAlex Elder out_err:
4776642a2537SAlex Elder 	rbd_dev->header.features = 0;
47771e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
47781e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
47799d475de5SAlex Elder 
47809d475de5SAlex Elder 	return ret;
4781a30b71b9SAlex Elder }
4782a30b71b9SAlex Elder 
4783124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
478483a06263SAlex Elder {
47852f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4786124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4787124afba2SAlex Elder 	struct rbd_client *rbdc;
4788124afba2SAlex Elder 	int ret;
4789124afba2SAlex Elder 
4790124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4791124afba2SAlex Elder 		return 0;
4792124afba2SAlex Elder 	/*
4793124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4794124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4795124afba2SAlex Elder 	 * parent/child relationships always share both.
4796124afba2SAlex Elder 	 */
4797124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4798124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4799124afba2SAlex Elder 
4800124afba2SAlex Elder 	ret = -ENOMEM;
4801124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4802124afba2SAlex Elder 	if (!parent)
4803124afba2SAlex Elder 		goto out_err;
4804124afba2SAlex Elder 
48051f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
4806124afba2SAlex Elder 	if (ret < 0)
4807124afba2SAlex Elder 		goto out_err;
4808124afba2SAlex Elder 	rbd_dev->parent = parent;
4809a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
4810124afba2SAlex Elder 
4811124afba2SAlex Elder 	return 0;
4812124afba2SAlex Elder out_err:
4813124afba2SAlex Elder 	if (parent) {
4814fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
4815124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4816124afba2SAlex Elder 		rbd_dev_destroy(parent);
4817124afba2SAlex Elder 	} else {
4818124afba2SAlex Elder 		rbd_put_client(rbdc);
4819124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4820124afba2SAlex Elder 	}
4821124afba2SAlex Elder 
4822124afba2SAlex Elder 	return ret;
4823124afba2SAlex Elder }
4824124afba2SAlex Elder 
4825200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4826124afba2SAlex Elder {
482783a06263SAlex Elder 	int ret;
482883a06263SAlex Elder 
482983a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
483083a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
483183a06263SAlex Elder 
483283a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
483383a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
483483a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
483583a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
483683a06263SAlex Elder 
483783a06263SAlex Elder 	/* Get our block major device number. */
483883a06263SAlex Elder 
483983a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
484083a06263SAlex Elder 	if (ret < 0)
484183a06263SAlex Elder 		goto err_out_id;
484283a06263SAlex Elder 	rbd_dev->major = ret;
484383a06263SAlex Elder 
484483a06263SAlex Elder 	/* Set up the blkdev mapping. */
484583a06263SAlex Elder 
484683a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
484783a06263SAlex Elder 	if (ret)
484883a06263SAlex Elder 		goto err_out_blkdev;
484983a06263SAlex Elder 
4850f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
485183a06263SAlex Elder 	if (ret)
485283a06263SAlex Elder 		goto err_out_disk;
4853f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4854f35a4deeSAlex Elder 
4855f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
4856f35a4deeSAlex Elder 	if (ret)
4857f35a4deeSAlex Elder 		goto err_out_mapping;
485883a06263SAlex Elder 
485983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
486083a06263SAlex Elder 
4861129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
486283a06263SAlex Elder 	add_disk(rbd_dev->disk);
486383a06263SAlex Elder 
486483a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
486583a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
486683a06263SAlex Elder 
486783a06263SAlex Elder 	return ret;
48682f82ee54SAlex Elder 
4869f35a4deeSAlex Elder err_out_mapping:
4870f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
487183a06263SAlex Elder err_out_disk:
487283a06263SAlex Elder 	rbd_free_disk(rbd_dev);
487383a06263SAlex Elder err_out_blkdev:
487483a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
487583a06263SAlex Elder err_out_id:
487683a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4877d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
487883a06263SAlex Elder 
487983a06263SAlex Elder 	return ret;
488083a06263SAlex Elder }
488183a06263SAlex Elder 
4882332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4883332bb12dSAlex Elder {
4884332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4885332bb12dSAlex Elder 	size_t size;
4886332bb12dSAlex Elder 
4887332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4888332bb12dSAlex Elder 
4889332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4890332bb12dSAlex Elder 
4891332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4892332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4893332bb12dSAlex Elder 	else
4894332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4895332bb12dSAlex Elder 
4896332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4897332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4898332bb12dSAlex Elder 		return -ENOMEM;
4899332bb12dSAlex Elder 
4900332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4901332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4902332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4903332bb12dSAlex Elder 	else
4904332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4905332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4906332bb12dSAlex Elder 	return 0;
4907332bb12dSAlex Elder }
4908332bb12dSAlex Elder 
4909200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4910200a6a8bSAlex Elder {
49116fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4912200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
49136fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
49146fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
49156fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
49166fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
49176fd48b3bSAlex Elder 
4918200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
4919200a6a8bSAlex Elder }
4920200a6a8bSAlex Elder 
4921a30b71b9SAlex Elder /*
4922a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
49231f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
49241f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
49251f3ef788SAlex Elder  * object to get detailed information about the rbd image.
4926a30b71b9SAlex Elder  */
49271f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
4928a30b71b9SAlex Elder {
4929a30b71b9SAlex Elder 	int ret;
4930b644de2bSAlex Elder 	int tmp;
4931a30b71b9SAlex Elder 
4932a30b71b9SAlex Elder 	/*
49333abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
49343abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
49353abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
49363abef3b3SAlex Elder 	 * will be set to either 1 or 2.
4937a30b71b9SAlex Elder 	 */
4938a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4939a30b71b9SAlex Elder 	if (ret)
4940c0fba368SAlex Elder 		return ret;
4941c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
4942c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4943c0fba368SAlex Elder 
4944332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
4945332bb12dSAlex Elder 	if (ret)
4946332bb12dSAlex Elder 		goto err_out_format;
4947332bb12dSAlex Elder 
49481f3ef788SAlex Elder 	if (mapping) {
49491f3ef788SAlex Elder 		ret = rbd_dev_header_watch_sync(rbd_dev, true);
4950b644de2bSAlex Elder 		if (ret)
4951b644de2bSAlex Elder 			goto out_header_name;
49521f3ef788SAlex Elder 	}
4953b644de2bSAlex Elder 
4954c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
495599a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
4956a30b71b9SAlex Elder 	else
49572df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
49585655c4d9SAlex Elder 	if (ret)
4959b644de2bSAlex Elder 		goto err_out_watch;
4960a30b71b9SAlex Elder 
49619bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
49629bb81c9bSAlex Elder 	if (ret)
496333dca39fSAlex Elder 		goto err_out_probe;
49649bb81c9bSAlex Elder 
49659bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
496630d60ba2SAlex Elder 	if (ret)
496730d60ba2SAlex Elder 		goto err_out_probe;
496883a06263SAlex Elder 
496930d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
497030d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
497130d60ba2SAlex Elder 
497230d60ba2SAlex Elder 	return 0;
49736fd48b3bSAlex Elder err_out_probe:
49746fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4975b644de2bSAlex Elder err_out_watch:
49761f3ef788SAlex Elder 	if (mapping) {
49771f3ef788SAlex Elder 		tmp = rbd_dev_header_watch_sync(rbd_dev, false);
4978b644de2bSAlex Elder 		if (tmp)
49791f3ef788SAlex Elder 			rbd_warn(rbd_dev, "unable to tear down "
49801f3ef788SAlex Elder 					"watch request (%d)\n", tmp);
49811f3ef788SAlex Elder 	}
4982332bb12dSAlex Elder out_header_name:
4983332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
4984332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
4985332bb12dSAlex Elder err_out_format:
4986332bb12dSAlex Elder 	rbd_dev->image_format = 0;
49875655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
49885655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
49895655c4d9SAlex Elder 
49905655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
49915655c4d9SAlex Elder 
49925655c4d9SAlex Elder 	return ret;
499383a06263SAlex Elder }
499483a06263SAlex Elder 
499559c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
499659c2be1eSYehuda Sadeh 		       const char *buf,
499759c2be1eSYehuda Sadeh 		       size_t count)
4998602adf40SYehuda Sadeh {
4999cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5000dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
50014e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5002859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
50039d3997fdSAlex Elder 	struct rbd_client *rbdc;
500427cc2594SAlex Elder 	struct ceph_osd_client *osdc;
500551344a38SAlex Elder 	bool read_only;
500627cc2594SAlex Elder 	int rc = -ENOMEM;
5007602adf40SYehuda Sadeh 
5008602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5009602adf40SYehuda Sadeh 		return -ENODEV;
5010602adf40SYehuda Sadeh 
5011a725f65eSAlex Elder 	/* parse add command */
5012859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5013dc79b113SAlex Elder 	if (rc < 0)
5014bd4ba655SAlex Elder 		goto err_out_module;
501551344a38SAlex Elder 	read_only = rbd_opts->read_only;
501651344a38SAlex Elder 	kfree(rbd_opts);
501751344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5018a725f65eSAlex Elder 
50199d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
50209d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
50219d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
50220ddebc0cSAlex Elder 		goto err_out_args;
50239d3997fdSAlex Elder 	}
5024602adf40SYehuda Sadeh 
5025602adf40SYehuda Sadeh 	/* pick the pool */
50269d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
5027859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5028602adf40SYehuda Sadeh 	if (rc < 0)
5029602adf40SYehuda Sadeh 		goto err_out_client;
5030859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5031859c31dfSAlex Elder 
50320903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
50330903e875SAlex Elder 
5034c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
5035c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5036c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
50370903e875SAlex Elder 		rc = -EIO;
50380903e875SAlex Elder 		goto err_out_client;
50390903e875SAlex Elder 	}
50400903e875SAlex Elder 
5041c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5042bd4ba655SAlex Elder 	if (!rbd_dev)
5043bd4ba655SAlex Elder 		goto err_out_client;
5044c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5045c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5046602adf40SYehuda Sadeh 
50471f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5048a30b71b9SAlex Elder 	if (rc < 0)
5049c53d5893SAlex Elder 		goto err_out_rbd_dev;
505005fd6f6fSAlex Elder 
50517ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
50527ce4eef7SAlex Elder 
50537ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
50547ce4eef7SAlex Elder 		read_only = true;
50557ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
50567ce4eef7SAlex Elder 
5057b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
50583abef3b3SAlex Elder 	if (rc) {
50593abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
50603abef3b3SAlex Elder 		goto err_out_module;
50613abef3b3SAlex Elder 	}
50623abef3b3SAlex Elder 
5063602adf40SYehuda Sadeh 	return count;
5064b536f69aSAlex Elder 
5065c53d5893SAlex Elder err_out_rbd_dev:
5066c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5067bd4ba655SAlex Elder err_out_client:
50689d3997fdSAlex Elder 	rbd_put_client(rbdc);
50690ddebc0cSAlex Elder err_out_args:
5070859c31dfSAlex Elder 	rbd_spec_put(spec);
5071bd4ba655SAlex Elder err_out_module:
5072bd4ba655SAlex Elder 	module_put(THIS_MODULE);
507327cc2594SAlex Elder 
5074602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
507527cc2594SAlex Elder 
507627cc2594SAlex Elder 	return (ssize_t)rc;
5077602adf40SYehuda Sadeh }
5078602adf40SYehuda Sadeh 
5079200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5080602adf40SYehuda Sadeh {
5081593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5082602adf40SYehuda Sadeh 
5083602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5084200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
50856d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5086602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
5087200a6a8bSAlex Elder 	rbd_dev->major = 0;
5088e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5089d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5090602adf40SYehuda Sadeh }
5091602adf40SYehuda Sadeh 
509205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
509305a46afdSAlex Elder {
5094ad945fc1SAlex Elder 	while (rbd_dev->parent) {
509505a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
509605a46afdSAlex Elder 		struct rbd_device *second = first->parent;
509705a46afdSAlex Elder 		struct rbd_device *third;
509805a46afdSAlex Elder 
509905a46afdSAlex Elder 		/*
510005a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
510105a46afdSAlex Elder 		 * remove it.
510205a46afdSAlex Elder 		 */
510305a46afdSAlex Elder 		while (second && (third = second->parent)) {
510405a46afdSAlex Elder 			first = second;
510505a46afdSAlex Elder 			second = third;
510605a46afdSAlex Elder 		}
5107ad945fc1SAlex Elder 		rbd_assert(second);
51088ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5109ad945fc1SAlex Elder 		first->parent = NULL;
5110ad945fc1SAlex Elder 		first->parent_overlap = 0;
5111ad945fc1SAlex Elder 
5112ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
511305a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
511405a46afdSAlex Elder 		first->parent_spec = NULL;
511505a46afdSAlex Elder 	}
511605a46afdSAlex Elder }
511705a46afdSAlex Elder 
5118dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
5119602adf40SYehuda Sadeh 			  const char *buf,
5120602adf40SYehuda Sadeh 			  size_t count)
5121602adf40SYehuda Sadeh {
5122602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5123751cc0e3SAlex Elder 	struct list_head *tmp;
5124751cc0e3SAlex Elder 	int dev_id;
5125602adf40SYehuda Sadeh 	unsigned long ul;
512682a442d2SAlex Elder 	bool already = false;
51270d8189e1SAlex Elder 	int ret;
5128602adf40SYehuda Sadeh 
51290d8189e1SAlex Elder 	ret = strict_strtoul(buf, 10, &ul);
51300d8189e1SAlex Elder 	if (ret)
51310d8189e1SAlex Elder 		return ret;
5132602adf40SYehuda Sadeh 
5133602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5134751cc0e3SAlex Elder 	dev_id = (int)ul;
5135751cc0e3SAlex Elder 	if (dev_id != ul)
5136602adf40SYehuda Sadeh 		return -EINVAL;
5137602adf40SYehuda Sadeh 
5138602adf40SYehuda Sadeh 	ret = -ENOENT;
5139751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5140751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5141751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5142751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5143751cc0e3SAlex Elder 			ret = 0;
5144751cc0e3SAlex Elder 			break;
5145602adf40SYehuda Sadeh 		}
5146751cc0e3SAlex Elder 	}
5147751cc0e3SAlex Elder 	if (!ret) {
5148a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5149b82d167bSAlex Elder 		if (rbd_dev->open_count)
515042382b70SAlex Elder 			ret = -EBUSY;
5151b82d167bSAlex Elder 		else
515282a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
515382a442d2SAlex Elder 							&rbd_dev->flags);
5154a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5155751cc0e3SAlex Elder 	}
5156751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
515782a442d2SAlex Elder 	if (ret < 0 || already)
51581ba0f1e7SAlex Elder 		return ret;
5159751cc0e3SAlex Elder 
5160b480815aSAlex Elder 	rbd_bus_del_dev(rbd_dev);
51611f3ef788SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, false);
51621f3ef788SAlex Elder 	if (ret)
51631f3ef788SAlex Elder 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
51648ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
516579ab7558SAlex Elder 	module_put(THIS_MODULE);
5166aafb230eSAlex Elder 
51671ba0f1e7SAlex Elder 	return count;
5168602adf40SYehuda Sadeh }
5169602adf40SYehuda Sadeh 
5170602adf40SYehuda Sadeh /*
5171602adf40SYehuda Sadeh  * create control files in sysfs
5172dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5173602adf40SYehuda Sadeh  */
5174602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5175602adf40SYehuda Sadeh {
5176dfc5606dSYehuda Sadeh 	int ret;
5177602adf40SYehuda Sadeh 
5178fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5179dfc5606dSYehuda Sadeh 	if (ret < 0)
5180dfc5606dSYehuda Sadeh 		return ret;
5181602adf40SYehuda Sadeh 
5182fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5183fed4c143SAlex Elder 	if (ret < 0)
5184fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5185602adf40SYehuda Sadeh 
5186602adf40SYehuda Sadeh 	return ret;
5187602adf40SYehuda Sadeh }
5188602adf40SYehuda Sadeh 
5189602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5190602adf40SYehuda Sadeh {
5191dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5192fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5193602adf40SYehuda Sadeh }
5194602adf40SYehuda Sadeh 
51951c2a9dfeSAlex Elder static int rbd_slab_init(void)
51961c2a9dfeSAlex Elder {
51971c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
51981c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
51991c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
52001c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
52011c2a9dfeSAlex Elder 					0, NULL);
5202868311b1SAlex Elder 	if (!rbd_img_request_cache)
5203868311b1SAlex Elder 		return -ENOMEM;
5204868311b1SAlex Elder 
5205868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5206868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5207868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5208868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5209868311b1SAlex Elder 					0, NULL);
521078c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
521178c2a44aSAlex Elder 		goto out_err;
521278c2a44aSAlex Elder 
521378c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
521478c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
521578c2a44aSAlex Elder 					MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
521678c2a44aSAlex Elder 	if (rbd_segment_name_cache)
52171c2a9dfeSAlex Elder 		return 0;
521878c2a44aSAlex Elder out_err:
521978c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
522078c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
522178c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
522278c2a44aSAlex Elder 	}
52231c2a9dfeSAlex Elder 
5224868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5225868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5226868311b1SAlex Elder 
52271c2a9dfeSAlex Elder 	return -ENOMEM;
52281c2a9dfeSAlex Elder }
52291c2a9dfeSAlex Elder 
52301c2a9dfeSAlex Elder static void rbd_slab_exit(void)
52311c2a9dfeSAlex Elder {
523278c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
523378c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
523478c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
523578c2a44aSAlex Elder 
5236868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5237868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5238868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5239868311b1SAlex Elder 
52401c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
52411c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
52421c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
52431c2a9dfeSAlex Elder }
52441c2a9dfeSAlex Elder 
5245cc344fa1SAlex Elder static int __init rbd_init(void)
5246602adf40SYehuda Sadeh {
5247602adf40SYehuda Sadeh 	int rc;
5248602adf40SYehuda Sadeh 
52491e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
52501e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
52511e32d34cSAlex Elder 
52521e32d34cSAlex Elder 		return -EINVAL;
52531e32d34cSAlex Elder 	}
52541c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5255602adf40SYehuda Sadeh 	if (rc)
5256602adf40SYehuda Sadeh 		return rc;
52571c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
52581c2a9dfeSAlex Elder 	if (rc)
52591c2a9dfeSAlex Elder 		rbd_slab_exit();
52601c2a9dfeSAlex Elder 	else
5261f0f8cef5SAlex Elder 		pr_info("loaded " RBD_DRV_NAME_LONG "\n");
52621c2a9dfeSAlex Elder 
52631c2a9dfeSAlex Elder 	return rc;
5264602adf40SYehuda Sadeh }
5265602adf40SYehuda Sadeh 
5266cc344fa1SAlex Elder static void __exit rbd_exit(void)
5267602adf40SYehuda Sadeh {
5268602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
52691c2a9dfeSAlex Elder 	rbd_slab_exit();
5270602adf40SYehuda Sadeh }
5271602adf40SYehuda Sadeh 
5272602adf40SYehuda Sadeh module_init(rbd_init);
5273602adf40SYehuda Sadeh module_exit(rbd_exit);
5274602adf40SYehuda Sadeh 
5275d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5276602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5277602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5278602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5279602adf40SYehuda Sadeh 
5280602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5281602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5282602adf40SYehuda Sadeh 
5283602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5284