xref: /openbmc/linux/drivers/block/rbd.c (revision 2d0ebc5d)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44f8a22fc2SIlya Dryomov #include <linux/idr.h>
45602adf40SYehuda Sadeh 
46602adf40SYehuda Sadeh #include "rbd_types.h"
47602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() checks. */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Several Linux layers (blk, bio,
 * genhd) each interpret a sector, but the default size is 512 bytes
 * everywhere; these names are only slightly more meaningful than the
 * bare numbers.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
58593a9e7bSAlex Elder 
59a2acd00eSAlex Elder /*
60a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
61a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
62a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
63a2acd00eSAlex Elder  * -EINVAL without updating it.
64a2acd00eSAlex Elder  */
65a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
66a2acd00eSAlex Elder {
67a2acd00eSAlex Elder 	unsigned int counter;
68a2acd00eSAlex Elder 
69a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
70a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
71a2acd00eSAlex Elder 		return (int)counter;
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder 	atomic_dec(v);
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	return -EINVAL;
76a2acd00eSAlex Elder }
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
79a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
80a2acd00eSAlex Elder {
81a2acd00eSAlex Elder 	int counter;
82a2acd00eSAlex Elder 
83a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
84a2acd00eSAlex Elder 	if (counter >= 0)
85a2acd00eSAlex Elder 		return counter;
86a2acd00eSAlex Elder 
87a2acd00eSAlex Elder 	atomic_inc(v);
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	return -EINVAL;
90a2acd00eSAlex Elder }
91a2acd00eSAlex Elder 
#define RBD_DRV_NAME			"rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

/* Longest snapshot name that still fits in NAME_MAX once prefixed */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define BAD_SNAP_INDEX		U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */
#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */
#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device is named "rbd#": RBD_DRV_NAME followed by a unique
 * integer identifier.  MAX_INT_FORMAT_WIDTH is used in ensuring
 * DEV_NAME_LEN is big enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof(int)) / 2 + 1)
132602adf40SYehuda Sadeh 
133602adf40SYehuda Sadeh /*
134602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
135602adf40SYehuda Sadeh  */
136602adf40SYehuda Sadeh struct rbd_image_header {
137f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
138849b4260SAlex Elder 	char *object_prefix;
139602adf40SYehuda Sadeh 	__u8 obj_order;
140602adf40SYehuda Sadeh 	__u8 crypt_type;
141602adf40SYehuda Sadeh 	__u8 comp_type;
142f35a4deeSAlex Elder 	u64 stripe_unit;
143f35a4deeSAlex Elder 	u64 stripe_count;
144f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
145602adf40SYehuda Sadeh 
146f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
147f84344f3SAlex Elder 	u64 image_size;
148f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
149f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
150f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15159c2be1eSYehuda Sadeh };
15259c2be1eSYehuda Sadeh 
1530d7dbfceSAlex Elder /*
1540d7dbfceSAlex Elder  * An rbd image specification.
1550d7dbfceSAlex Elder  *
1560d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
157c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
158c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
159c66c6e0cSAlex Elder  *
160c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
161c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
162c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
163c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
164c66c6e0cSAlex Elder  *
165c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
166c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
167c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
168c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
169c66c6e0cSAlex Elder  * is shared between the parent and child).
170c66c6e0cSAlex Elder  *
171c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
172c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
173c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
174c66c6e0cSAlex Elder  *
175c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
176c66c6e0cSAlex Elder  * could be a null pointer).
1770d7dbfceSAlex Elder  */
1780d7dbfceSAlex Elder struct rbd_spec {
1790d7dbfceSAlex Elder 	u64		pool_id;
180ecb4dc22SAlex Elder 	const char	*pool_name;
1810d7dbfceSAlex Elder 
182ecb4dc22SAlex Elder 	const char	*image_id;
183ecb4dc22SAlex Elder 	const char	*image_name;
1840d7dbfceSAlex Elder 
1850d7dbfceSAlex Elder 	u64		snap_id;
186ecb4dc22SAlex Elder 	const char	*snap_name;
1870d7dbfceSAlex Elder 
1880d7dbfceSAlex Elder 	struct kref	kref;
1890d7dbfceSAlex Elder };
1900d7dbfceSAlex Elder 
191602adf40SYehuda Sadeh /*
192f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
193602adf40SYehuda Sadeh  */
194602adf40SYehuda Sadeh struct rbd_client {
195602adf40SYehuda Sadeh 	struct ceph_client	*client;
196602adf40SYehuda Sadeh 	struct kref		kref;
197602adf40SYehuda Sadeh 	struct list_head	node;
198602adf40SYehuda Sadeh };
199602adf40SYehuda Sadeh 
/* Callback types taking an image request and an object request */
struct rbd_img_request;
struct rbd_obj_request;

typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

#define BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
207bf0d5f50SAlex Elder 
/* Kind of data buffer an object request carries */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};

/* Bit numbers used in rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
218926f9b3fSAlex Elder 
219bf0d5f50SAlex Elder struct rbd_obj_request {
220bf0d5f50SAlex Elder 	const char		*object_name;
221bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
222bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
223926f9b3fSAlex Elder 	unsigned long		flags;
224bf0d5f50SAlex Elder 
225c5b5ef6cSAlex Elder 	/*
226c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
227c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
228c5b5ef6cSAlex Elder 	 *
229c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
230c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
231c5b5ef6cSAlex Elder 	 *
232c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
233c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
234c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
235c5b5ef6cSAlex Elder 	 *
236c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
237c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
238c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
239c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
240c5b5ef6cSAlex Elder 	 */
241c5b5ef6cSAlex Elder 	union {
242c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
243c5b5ef6cSAlex Elder 		struct {
244bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
245c5b5ef6cSAlex Elder 			u64			img_offset;
246c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
247c5b5ef6cSAlex Elder 			struct list_head	links;
248c5b5ef6cSAlex Elder 		};
249c5b5ef6cSAlex Elder 	};
250bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
251bf0d5f50SAlex Elder 
252bf0d5f50SAlex Elder 	enum obj_request_type	type;
253788e2df3SAlex Elder 	union {
254bf0d5f50SAlex Elder 		struct bio	*bio_list;
255788e2df3SAlex Elder 		struct {
256788e2df3SAlex Elder 			struct page	**pages;
257788e2df3SAlex Elder 			u32		page_count;
258788e2df3SAlex Elder 		};
259788e2df3SAlex Elder 	};
2600eefd470SAlex Elder 	struct page		**copyup_pages;
261ebda6408SAlex Elder 	u32			copyup_page_count;
262bf0d5f50SAlex Elder 
263bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2661b83bef2SSage Weil 	int			result;
267bf0d5f50SAlex Elder 
268bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
269788e2df3SAlex Elder 	struct completion	completion;
270bf0d5f50SAlex Elder 
271bf0d5f50SAlex Elder 	struct kref		kref;
272bf0d5f50SAlex Elder };
273bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2790c425248SAlex Elder 
280bf0d5f50SAlex Elder struct rbd_img_request {
281bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
282bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
283bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2840c425248SAlex Elder 	unsigned long		flags;
285bf0d5f50SAlex Elder 	union {
286bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2879849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2889849e986SAlex Elder 	};
2899849e986SAlex Elder 	union {
2909849e986SAlex Elder 		struct request		*rq;		/* block request */
2919849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
292bf0d5f50SAlex Elder 	};
2933d7efd18SAlex Elder 	struct page		**copyup_pages;
294ebda6408SAlex Elder 	u32			copyup_page_count;
295bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
296bf0d5f50SAlex Elder 	u32			next_completion;
297bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
29855f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
299a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
300bf0d5f50SAlex Elder 
301bf0d5f50SAlex Elder 	u32			obj_request_count;
302bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
303bf0d5f50SAlex Elder 
304bf0d5f50SAlex Elder 	struct kref		kref;
305bf0d5f50SAlex Elder };
306bf0d5f50SAlex Elder 
/*
 * Iterators over an image request's obj_requests list.  Note that the
 * _safe variant is built on list_for_each_entry_safe_reverse(), so it
 * walks the list from the tail.
 */
#define for_each_obj_request(img_req, obj_req) \
	list_for_each_entry(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_from(img_req, obj_req) \
	list_for_each_entry_from(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_safe(img_req, obj_req, next) \
	list_for_each_entry_safe_reverse(obj_req, next, \
					 &(img_req)->obj_requests, links)
313bf0d5f50SAlex Elder 
314f84344f3SAlex Elder struct rbd_mapping {
31599c1f08fSAlex Elder 	u64                     size;
31634b13184SAlex Elder 	u64                     features;
317f84344f3SAlex Elder 	bool			read_only;
318f84344f3SAlex Elder };
319f84344f3SAlex Elder 
320602adf40SYehuda Sadeh /*
321602adf40SYehuda Sadeh  * a single device
322602adf40SYehuda Sadeh  */
323602adf40SYehuda Sadeh struct rbd_device {
324de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
325602adf40SYehuda Sadeh 
326602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
327dd82fff1SIlya Dryomov 	int			minor;
328602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
329602adf40SYehuda Sadeh 
330a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
331602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
332602adf40SYehuda Sadeh 
333602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
334602adf40SYehuda Sadeh 
335b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
336602adf40SYehuda Sadeh 
337602adf40SYehuda Sadeh 	struct rbd_image_header	header;
338b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3390d7dbfceSAlex Elder 	struct rbd_spec		*spec;
340602adf40SYehuda Sadeh 
3410d7dbfceSAlex Elder 	char			*header_name;
342971f839aSAlex Elder 
3430903e875SAlex Elder 	struct ceph_file_layout	layout;
3440903e875SAlex Elder 
34559c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
346975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
34759c2be1eSYehuda Sadeh 
34886b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
34986b00e0dSAlex Elder 	u64			parent_overlap;
350a2acd00eSAlex Elder 	atomic_t		parent_ref;
3512f82ee54SAlex Elder 	struct rbd_device	*parent;
35286b00e0dSAlex Elder 
353c666601aSJosh Durgin 	/* protects updating the header */
354c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
355f84344f3SAlex Elder 
356f84344f3SAlex Elder 	struct rbd_mapping	mapping;
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	struct list_head	node;
359dfc5606dSYehuda Sadeh 
360dfc5606dSYehuda Sadeh 	/* sysfs related */
361dfc5606dSYehuda Sadeh 	struct device		dev;
362b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
363dfc5606dSYehuda Sadeh };
364dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomicity is required,
 * rbd_dev->lock protects access.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, needs atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3766d292906SAlex Elder 
377cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
378e124a82fSAlex Elder 
379602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
380e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
381e124a82fSAlex Elder 
382602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
383432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
384602adf40SYehuda Sadeh 
38578c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
38678c2a44aSAlex Elder 
3871c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
388868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
38978c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
3901c2a9dfeSAlex Elder 
3919b60e70bSIlya Dryomov static int rbd_major;
392f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
393f8a22fc2SIlya Dryomov 
3949b60e70bSIlya Dryomov /*
3959b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
3969b60e70bSIlya Dryomov  * userspace rbd utility.
3979b60e70bSIlya Dryomov  */
3989b60e70bSIlya Dryomov static bool single_major = false;
3999b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4009b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4019b60e70bSIlya Dryomov 
4023d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4033d7efd18SAlex Elder 
404200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
405dfc5606dSYehuda Sadeh 
406f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
407f0f8cef5SAlex Elder 		       size_t count);
408f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
409f0f8cef5SAlex Elder 			  size_t count);
4109b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4119b60e70bSIlya Dryomov 				    size_t count);
4129b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4139b60e70bSIlya Dryomov 				       size_t count);
4141f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
415a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
416f0f8cef5SAlex Elder 
4179b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4189b60e70bSIlya Dryomov {
4197e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4209b60e70bSIlya Dryomov }
4219b60e70bSIlya Dryomov 
4229b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4239b60e70bSIlya Dryomov {
4247e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4259b60e70bSIlya Dryomov }
4269b60e70bSIlya Dryomov 
427b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
428b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
4299b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
4309b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
431b15a21ddSGreg Kroah-Hartman 
432b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
433b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
434b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4359b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4369b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
437b15a21ddSGreg Kroah-Hartman 	NULL,
438f0f8cef5SAlex Elder };
43992c76dc0SIlya Dryomov 
44092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
44192c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
44292c76dc0SIlya Dryomov {
4439b60e70bSIlya Dryomov 	if (!single_major &&
4449b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4459b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4469b60e70bSIlya Dryomov 		return 0;
4479b60e70bSIlya Dryomov 
44892c76dc0SIlya Dryomov 	return attr->mode;
44992c76dc0SIlya Dryomov }
45092c76dc0SIlya Dryomov 
45192c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
45292c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
45392c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
45492c76dc0SIlya Dryomov };
45592c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
456f0f8cef5SAlex Elder 
457f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
458f0f8cef5SAlex Elder 	.name		= "rbd",
459b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
460f0f8cef5SAlex Elder };
461f0f8cef5SAlex Elder 
462f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
463f0f8cef5SAlex Elder {
464f0f8cef5SAlex Elder }
465f0f8cef5SAlex Elder 
466f0f8cef5SAlex Elder static struct device rbd_root_dev = {
467f0f8cef5SAlex Elder 	.init_name =    "rbd",
468f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
469f0f8cef5SAlex Elder };
470f0f8cef5SAlex Elder 
47106ecc6cbSAlex Elder static __printf(2, 3)
47206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
47306ecc6cbSAlex Elder {
47406ecc6cbSAlex Elder 	struct va_format vaf;
47506ecc6cbSAlex Elder 	va_list args;
47606ecc6cbSAlex Elder 
47706ecc6cbSAlex Elder 	va_start(args, fmt);
47806ecc6cbSAlex Elder 	vaf.fmt = fmt;
47906ecc6cbSAlex Elder 	vaf.va = &args;
48006ecc6cbSAlex Elder 
48106ecc6cbSAlex Elder 	if (!rbd_dev)
48206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
48306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
48406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
48506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
48606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
48706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
48806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
48906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
49006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
49106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
49206ecc6cbSAlex Elder 	else	/* punt */
49306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
49406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
49506ecc6cbSAlex Elder 	va_end(args);
49606ecc6cbSAlex Elder }
49706ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-internal sanity check.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and the text of the expression at KERN_ERR and then calls
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and expr is
 * not evaluated, so expr must not have side effects.
 *
 * The debug form is wrapped in do { } while (0) so that it behaves as
 * a single statement, e.g. as the body of an if that has an else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
510dfc5606dSYehuda Sadeh 
511b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
51205a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
51305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5148b3e1a56SAlex Elder 
515cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5162df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
5172df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
51854cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
51954cac61fSAlex Elder 					u64 snap_id);
5202ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5212ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5222ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5232ad3d716SAlex Elder 		u64 *snap_features);
5242ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
52559c2be1eSYehuda Sadeh 
526602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
527602adf40SYehuda Sadeh {
528f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
529b82d167bSAlex Elder 	bool removing = false;
530602adf40SYehuda Sadeh 
531f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
532602adf40SYehuda Sadeh 		return -EROFS;
533602adf40SYehuda Sadeh 
534a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
535b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
536b82d167bSAlex Elder 		removing = true;
537b82d167bSAlex Elder 	else
538b82d167bSAlex Elder 		rbd_dev->open_count++;
539a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
540b82d167bSAlex Elder 	if (removing)
541b82d167bSAlex Elder 		return -ENOENT;
542b82d167bSAlex Elder 
543c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
544f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
545340c7a2bSAlex Elder 
546602adf40SYehuda Sadeh 	return 0;
547602adf40SYehuda Sadeh }
548602adf40SYehuda Sadeh 
549db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
550dfc5606dSYehuda Sadeh {
551dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
552b82d167bSAlex Elder 	unsigned long open_count_before;
553b82d167bSAlex Elder 
554a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
555b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
556a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
557b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
558dfc5606dSYehuda Sadeh 
559c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
560dfc5606dSYehuda Sadeh }
561dfc5606dSYehuda Sadeh 
562602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
563602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
564602adf40SYehuda Sadeh 	.open			= rbd_open,
565dfc5606dSYehuda Sadeh 	.release		= rbd_release,
566602adf40SYehuda Sadeh };
567602adf40SYehuda Sadeh 
568602adf40SYehuda Sadeh /*
5697262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
570cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
571602adf40SYehuda Sadeh  */
572f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
573602adf40SYehuda Sadeh {
574602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
575602adf40SYehuda Sadeh 	int ret = -ENOMEM;
576602adf40SYehuda Sadeh 
57737206ee5SAlex Elder 	dout("%s:\n", __func__);
578602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
579602adf40SYehuda Sadeh 	if (!rbdc)
580602adf40SYehuda Sadeh 		goto out_opt;
581602adf40SYehuda Sadeh 
582602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
583602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
584602adf40SYehuda Sadeh 
58543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
586602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
58708f75463SAlex Elder 		goto out_rbdc;
58843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
589602adf40SYehuda Sadeh 
590602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
591602adf40SYehuda Sadeh 	if (ret < 0)
59208f75463SAlex Elder 		goto out_client;
593602adf40SYehuda Sadeh 
594432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
595602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
596432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
597602adf40SYehuda Sadeh 
59837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
599bc534d86SAlex Elder 
600602adf40SYehuda Sadeh 	return rbdc;
60108f75463SAlex Elder out_client:
602602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
60308f75463SAlex Elder out_rbdc:
604602adf40SYehuda Sadeh 	kfree(rbdc);
605602adf40SYehuda Sadeh out_opt:
60643ae4701SAlex Elder 	if (ceph_opts)
60743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
60837206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
60937206ee5SAlex Elder 
61028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
611602adf40SYehuda Sadeh }
612602adf40SYehuda Sadeh 
6132f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
6142f82ee54SAlex Elder {
6152f82ee54SAlex Elder 	kref_get(&rbdc->kref);
6162f82ee54SAlex Elder 
6172f82ee54SAlex Elder 	return rbdc;
6182f82ee54SAlex Elder }
6192f82ee54SAlex Elder 
620602adf40SYehuda Sadeh /*
6211f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
6221f7ba331SAlex Elder  * found, bump its reference count.
623602adf40SYehuda Sadeh  */
6241f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
625602adf40SYehuda Sadeh {
626602adf40SYehuda Sadeh 	struct rbd_client *client_node;
6271f7ba331SAlex Elder 	bool found = false;
628602adf40SYehuda Sadeh 
62943ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
630602adf40SYehuda Sadeh 		return NULL;
631602adf40SYehuda Sadeh 
6321f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
6331f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
6341f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
6352f82ee54SAlex Elder 			__rbd_get_client(client_node);
6362f82ee54SAlex Elder 
6371f7ba331SAlex Elder 			found = true;
6381f7ba331SAlex Elder 			break;
6391f7ba331SAlex Elder 		}
6401f7ba331SAlex Elder 	}
6411f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
6421f7ba331SAlex Elder 
6431f7ba331SAlex Elder 	return found ? client_node : NULL;
644602adf40SYehuda Sadeh }
645602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Token ids are grouped by argument type, with a marker value closing
 * each group.  parse_rbd_opts_token() classifies a token by comparing
 * it against the markers: below Opt_last_int takes an int argument,
 * between Opt_last_int and Opt_last_string takes a string argument,
 * between Opt_last_string and Opt_last_bool is a Boolean flag.
 * The int and string groups are currently empty.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
65959c2be1eSYehuda Sadeh 
/*
 * Maps option strings to the token ids above; handed to match_token()
 * by parse_rbd_opts_token().  Each Boolean option has a long and a
 * short spelling that map to the same token.  The {-1, NULL} entry
 * ends the table.
 */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
67059c2be1eSYehuda Sadeh 
/*
 * rbd-specific option state filled in by parse_rbd_opts_token().
 */
struct rbd_options {
	bool	read_only;	/* true for "read_only"/"ro", false for "read_write"/"rw" */
};

/* NOTE(review): presumably the initial read_only value set before parsing - the user is not in this part of the file */
#define RBD_READ_ONLY_DEFAULT	false
67698571b5aSAlex Elder 
67759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
67859c2be1eSYehuda Sadeh {
67943ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
68059c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
68159c2be1eSYehuda Sadeh 	int token, intval, ret;
68259c2be1eSYehuda Sadeh 
68343ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
68459c2be1eSYehuda Sadeh 	if (token < 0)
68559c2be1eSYehuda Sadeh 		return -EINVAL;
68659c2be1eSYehuda Sadeh 
68759c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
68859c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
68959c2be1eSYehuda Sadeh 		if (ret < 0) {
69059c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
69159c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
69259c2be1eSYehuda Sadeh 			return ret;
69359c2be1eSYehuda Sadeh 		}
69459c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
69559c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
69659c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
69759c2be1eSYehuda Sadeh 		     argstr[0].from);
698cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
699cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
70059c2be1eSYehuda Sadeh 	} else {
70159c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
70259c2be1eSYehuda Sadeh 	}
70359c2be1eSYehuda Sadeh 
70459c2be1eSYehuda Sadeh 	switch (token) {
705cc0538b6SAlex Elder 	case Opt_read_only:
706cc0538b6SAlex Elder 		rbd_opts->read_only = true;
707cc0538b6SAlex Elder 		break;
708cc0538b6SAlex Elder 	case Opt_read_write:
709cc0538b6SAlex Elder 		rbd_opts->read_only = false;
710cc0538b6SAlex Elder 		break;
71159c2be1eSYehuda Sadeh 	default:
712aafb230eSAlex Elder 		rbd_assert(false);
713aafb230eSAlex Elder 		break;
71459c2be1eSYehuda Sadeh 	}
71559c2be1eSYehuda Sadeh 	return 0;
71659c2be1eSYehuda Sadeh }
71759c2be1eSYehuda Sadeh 
71859c2be1eSYehuda Sadeh /*
719602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
7207262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
7217262cfcaSAlex Elder  * function.
722602adf40SYehuda Sadeh  */
7239d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
724602adf40SYehuda Sadeh {
725f8c38929SAlex Elder 	struct rbd_client *rbdc;
72659c2be1eSYehuda Sadeh 
727cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
7281f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
7299d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
73043ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
7319d3997fdSAlex Elder 	else
732f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
733cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
734d720bcb0SAlex Elder 
7359d3997fdSAlex Elder 	return rbdc;
736602adf40SYehuda Sadeh }
737602adf40SYehuda Sadeh 
738602adf40SYehuda Sadeh /*
739602adf40SYehuda Sadeh  * Destroy ceph client
740d23a4b3fSAlex Elder  *
741432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
742602adf40SYehuda Sadeh  */
743602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
744602adf40SYehuda Sadeh {
745602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
746602adf40SYehuda Sadeh 
74737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
748cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
749602adf40SYehuda Sadeh 	list_del(&rbdc->node);
750cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
751602adf40SYehuda Sadeh 
752602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
753602adf40SYehuda Sadeh 	kfree(rbdc);
754602adf40SYehuda Sadeh }
755602adf40SYehuda Sadeh 
756602adf40SYehuda Sadeh /*
757602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
758602adf40SYehuda Sadeh  * it.
759602adf40SYehuda Sadeh  */
7609d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
761602adf40SYehuda Sadeh {
762c53d5893SAlex Elder 	if (rbdc)
7639d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
764602adf40SYehuda Sadeh }
765602adf40SYehuda Sadeh 
766a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
767a30b71b9SAlex Elder {
768a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
769a30b71b9SAlex Elder }
770a30b71b9SAlex Elder 
7718e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
7728e94af8eSAlex Elder {
773103a150fSAlex Elder 	size_t size;
774103a150fSAlex Elder 	u32 snap_count;
775103a150fSAlex Elder 
776103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
777103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
778103a150fSAlex Elder 		return false;
779103a150fSAlex Elder 
780db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
781db2388b6SAlex Elder 
782db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
783db2388b6SAlex Elder 		return false;
784db2388b6SAlex Elder 
785db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
786db2388b6SAlex Elder 
787db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
788db2388b6SAlex Elder 		return false;
789db2388b6SAlex Elder 
790103a150fSAlex Elder 	/*
791103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
792103a150fSAlex Elder 	 * that limits the number of snapshots.
793103a150fSAlex Elder 	 */
794103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
795103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
796103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
797103a150fSAlex Elder 		return false;
798103a150fSAlex Elder 
799103a150fSAlex Elder 	/*
800103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
801103a150fSAlex Elder 	 * header must also be representable in a size_t.
802103a150fSAlex Elder 	 */
803103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
804103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
805103a150fSAlex Elder 		return false;
806103a150fSAlex Elder 
807103a150fSAlex Elder 	return true;
8088e94af8eSAlex Elder }
8098e94af8eSAlex Elder 
/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 *
 * @rbd_dev: device whose in-memory header (and mapping size) is updated
 * @ondisk:  format 1 on-disk header, followed in the same buffer by the
 *           snapshot records and the snapshot names
 *
 * The call is treated as the initial fill when header->object_prefix
 * is still NULL; otherwise it is a refresh, which leaves the object
 * prefix and the fixed options alone and replaces only the image size
 * and snapshot information (releasing the previous snapshot data).
 *
 * All allocations are done before the header is touched, so on failure
 * the existing header is left unmodified.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or -EIO if the
 * on-disk snapshot name length does not fit in a size_t.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	size_t size;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		size_t len;

		/*
		 * The on-disk prefix is a fixed-size field that may not
		 * be NUL-terminated, so bound the length and terminate
		 * the copy explicitly.
		 */
		len = strnlen(ondisk->object_prefix,
				sizeof (ondisk->object_prefix));
		object_prefix = kmalloc(len + 1, GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
		memcpy(object_prefix, ondisk->object_prefix, len);
		object_prefix[len] = '\0';
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		/* The length is passed to kmalloc()/memcpy() as a size_t */
		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */

		size = snap_count * sizeof (*header->snap_sizes);
		snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		header->crypt_type = ondisk->options.crypt_type;
		header->comp_type = ondisk->options.comp_type;
		/* The rest aren't used for format 1 images */
		header->stripe_unit = 0;
		header->stripe_count = 0;
		header->features = 0;
	} else {
		/* Refresh: let go of the snapshot data being replaced */
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	/* Make sure mapping size is consistent with header info */

	/*
	 * The mapping size follows the image size only when the base
	 * image (not a snapshot) is mapped, or on the initial fill.
	 */
	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
		if (rbd_dev->mapping.size != header->image_size)
			rbd_dev->mapping.size = header->image_size;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	/*
	 * Any of these may still be NULL here, depending on how far we
	 * got.  NOTE(review): this relies on ceph_put_snap_context()
	 * accepting a NULL snapc - confirm against libceph.
	 */
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
926602adf40SYehuda Sadeh 
9279682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
9289682fc6dSAlex Elder {
9299682fc6dSAlex Elder 	const char *snap_name;
9309682fc6dSAlex Elder 
9319682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
9329682fc6dSAlex Elder 
9339682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
9349682fc6dSAlex Elder 
9359682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
9369682fc6dSAlex Elder 	while (which--)
9379682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
9389682fc6dSAlex Elder 
9399682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
9409682fc6dSAlex Elder }
9419682fc6dSAlex Elder 
94230d1cff8SAlex Elder /*
94330d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
94430d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
94530d1cff8SAlex Elder  */
94630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
94730d1cff8SAlex Elder {
94830d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
94930d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
95030d1cff8SAlex Elder 
95130d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
95230d1cff8SAlex Elder 		return 1;
95330d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
95430d1cff8SAlex Elder }
95530d1cff8SAlex Elder 
95630d1cff8SAlex Elder /*
95730d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
95830d1cff8SAlex Elder  * present.
95930d1cff8SAlex Elder  *
96030d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
96130d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
96230d1cff8SAlex Elder  *
96330d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
96430d1cff8SAlex Elder  * reverse order, highest snapshot id first.
96530d1cff8SAlex Elder  */
9669682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
9679682fc6dSAlex Elder {
9689682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
96930d1cff8SAlex Elder 	u64 *found;
9709682fc6dSAlex Elder 
97130d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
97230d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
9739682fc6dSAlex Elder 
97430d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
9759682fc6dSAlex Elder }
9769682fc6dSAlex Elder 
9772ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
9782ad3d716SAlex Elder 					u64 snap_id)
97954cac61fSAlex Elder {
98054cac61fSAlex Elder 	u32 which;
981da6a6b63SJosh Durgin 	const char *snap_name;
98254cac61fSAlex Elder 
98354cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
98454cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
985da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
98654cac61fSAlex Elder 
987da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
988da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
98954cac61fSAlex Elder }
99054cac61fSAlex Elder 
9919e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
9929e15b77dSAlex Elder {
9939e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
9949e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
9959e15b77dSAlex Elder 
99654cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
99754cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
99854cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
9999e15b77dSAlex Elder 
100054cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10019e15b77dSAlex Elder }
10029e15b77dSAlex Elder 
10032ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10042ad3d716SAlex Elder 				u64 *snap_size)
1005602adf40SYehuda Sadeh {
10062ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10072ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10082ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
10092ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10102ad3d716SAlex Elder 		u32 which;
101100f1f36fSAlex Elder 
10122ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
10132ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
10142ad3d716SAlex Elder 			return -ENOENT;
101500f1f36fSAlex Elder 
10162ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
10172ad3d716SAlex Elder 	} else {
10182ad3d716SAlex Elder 		u64 size = 0;
10192ad3d716SAlex Elder 		int ret;
10202ad3d716SAlex Elder 
10212ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
10222ad3d716SAlex Elder 		if (ret)
10232ad3d716SAlex Elder 			return ret;
10242ad3d716SAlex Elder 
10252ad3d716SAlex Elder 		*snap_size = size;
10262ad3d716SAlex Elder 	}
10272ad3d716SAlex Elder 	return 0;
10282ad3d716SAlex Elder }
10292ad3d716SAlex Elder 
10302ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
10312ad3d716SAlex Elder 			u64 *snap_features)
10322ad3d716SAlex Elder {
10332ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10342ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10352ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
10362ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10372ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
10382ad3d716SAlex Elder 	} else {
10392ad3d716SAlex Elder 		u64 features = 0;
10402ad3d716SAlex Elder 		int ret;
10412ad3d716SAlex Elder 
10422ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
10432ad3d716SAlex Elder 		if (ret)
10442ad3d716SAlex Elder 			return ret;
10452ad3d716SAlex Elder 
10462ad3d716SAlex Elder 		*snap_features = features;
10472ad3d716SAlex Elder 	}
10482ad3d716SAlex Elder 	return 0;
104900f1f36fSAlex Elder }
1050602adf40SYehuda Sadeh 
1051d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1052602adf40SYehuda Sadeh {
10538f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
10542ad3d716SAlex Elder 	u64 size = 0;
10552ad3d716SAlex Elder 	u64 features = 0;
10562ad3d716SAlex Elder 	int ret;
10578b0241f8SAlex Elder 
10582ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
10592ad3d716SAlex Elder 	if (ret)
10602ad3d716SAlex Elder 		return ret;
10612ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
10622ad3d716SAlex Elder 	if (ret)
10632ad3d716SAlex Elder 		return ret;
10642ad3d716SAlex Elder 
10652ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
10662ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
10672ad3d716SAlex Elder 
10688b0241f8SAlex Elder 	return 0;
1069602adf40SYehuda Sadeh }
1070602adf40SYehuda Sadeh 
1071d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1072d1cf5788SAlex Elder {
1073d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1074d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1075200a6a8bSAlex Elder }
1076200a6a8bSAlex Elder 
107798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1078602adf40SYehuda Sadeh {
107965ccfe21SAlex Elder 	char *name;
108065ccfe21SAlex Elder 	u64 segment;
108165ccfe21SAlex Elder 	int ret;
10823a96d5cdSJosh Durgin 	char *name_format;
1083602adf40SYehuda Sadeh 
108478c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
108565ccfe21SAlex Elder 	if (!name)
108665ccfe21SAlex Elder 		return NULL;
108765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
10883a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
10893a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
10903a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
10912d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
109265ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
10932d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
109465ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
109565ccfe21SAlex Elder 			segment, ret);
109665ccfe21SAlex Elder 		kfree(name);
109765ccfe21SAlex Elder 		name = NULL;
109865ccfe21SAlex Elder 	}
1099602adf40SYehuda Sadeh 
110065ccfe21SAlex Elder 	return name;
110165ccfe21SAlex Elder }
1102602adf40SYehuda Sadeh 
110378c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
110478c2a44aSAlex Elder {
110578c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
110678c2a44aSAlex Elder 
110778c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
110878c2a44aSAlex Elder }
110978c2a44aSAlex Elder 
111065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
111165ccfe21SAlex Elder {
111265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1113602adf40SYehuda Sadeh 
111465ccfe21SAlex Elder 	return offset & (segment_size - 1);
111565ccfe21SAlex Elder }
111665ccfe21SAlex Elder 
111765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
111865ccfe21SAlex Elder 				u64 offset, u64 length)
111965ccfe21SAlex Elder {
112065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
112165ccfe21SAlex Elder 
112265ccfe21SAlex Elder 	offset &= segment_size - 1;
112365ccfe21SAlex Elder 
1124aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
112565ccfe21SAlex Elder 	if (offset + length > segment_size)
112665ccfe21SAlex Elder 		length = segment_size - offset;
112765ccfe21SAlex Elder 
112865ccfe21SAlex Elder 	return length;
1129602adf40SYehuda Sadeh }
1130602adf40SYehuda Sadeh 
1131602adf40SYehuda Sadeh /*
1132029bcbd8SJosh Durgin  * returns the size of an object in the image
1133029bcbd8SJosh Durgin  */
1134029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1135029bcbd8SJosh Durgin {
1136029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1137029bcbd8SJosh Durgin }
1138029bcbd8SJosh Durgin 
1139029bcbd8SJosh Durgin /*
1140602adf40SYehuda Sadeh  * bio helpers
1141602adf40SYehuda Sadeh  */
1142602adf40SYehuda Sadeh 
1143602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1144602adf40SYehuda Sadeh {
1145602adf40SYehuda Sadeh 	struct bio *tmp;
1146602adf40SYehuda Sadeh 
1147602adf40SYehuda Sadeh 	while (chain) {
1148602adf40SYehuda Sadeh 		tmp = chain;
1149602adf40SYehuda Sadeh 		chain = chain->bi_next;
1150602adf40SYehuda Sadeh 		bio_put(tmp);
1151602adf40SYehuda Sadeh 	}
1152602adf40SYehuda Sadeh }
1153602adf40SYehuda Sadeh 
1154602adf40SYehuda Sadeh /*
1155602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1156602adf40SYehuda Sadeh  */
1157602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1158602adf40SYehuda Sadeh {
1159602adf40SYehuda Sadeh 	struct bio_vec *bv;
1160602adf40SYehuda Sadeh 	unsigned long flags;
1161602adf40SYehuda Sadeh 	void *buf;
1162602adf40SYehuda Sadeh 	int i;
1163602adf40SYehuda Sadeh 	int pos = 0;
1164602adf40SYehuda Sadeh 
1165602adf40SYehuda Sadeh 	while (chain) {
1166602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
1167602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
1168602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
1169602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
1170602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
1171602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
1172e2156054SAlex Elder 				flush_dcache_page(bv->bv_page);
117385b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1174602adf40SYehuda Sadeh 			}
1175602adf40SYehuda Sadeh 			pos += bv->bv_len;
1176602adf40SYehuda Sadeh 		}
1177602adf40SYehuda Sadeh 
1178602adf40SYehuda Sadeh 		chain = chain->bi_next;
1179602adf40SYehuda Sadeh 	}
1180602adf40SYehuda Sadeh }
1181602adf40SYehuda Sadeh 
1182602adf40SYehuda Sadeh /*
1183b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1184b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1185b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1186b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1187b9434c5bSAlex Elder  */
1188b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1189b9434c5bSAlex Elder {
1190b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1191b9434c5bSAlex Elder 
1192b9434c5bSAlex Elder 	rbd_assert(end > offset);
1193b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1194b9434c5bSAlex Elder 	while (offset < end) {
1195b9434c5bSAlex Elder 		size_t page_offset;
1196b9434c5bSAlex Elder 		size_t length;
1197b9434c5bSAlex Elder 		unsigned long flags;
1198b9434c5bSAlex Elder 		void *kaddr;
1199b9434c5bSAlex Elder 
1200491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1201491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1202b9434c5bSAlex Elder 		local_irq_save(flags);
1203b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1204b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1205e2156054SAlex Elder 		flush_dcache_page(*page);
1206b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1207b9434c5bSAlex Elder 		local_irq_restore(flags);
1208b9434c5bSAlex Elder 
1209b9434c5bSAlex Elder 		offset += length;
1210b9434c5bSAlex Elder 		page++;
1211b9434c5bSAlex Elder 	}
1212b9434c5bSAlex Elder }
1213b9434c5bSAlex Elder 
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * @bio_src: bio to clone from
 * @offset:  byte offset into bio_src's data where the clone starts
 * @len:     number of bytes the clone covers
 * @gfpmask: allocation flags passed on to bio_clone()/bio_alloc()
 *
 * The clone gets its own bio_vec array, copied from the affected part
 * of the source's array, with the first and last entries trimmed to
 * the requested range.
 *
 * Returns the new bio, or NULL if the range is empty or does not lie
 * within bio_src (a one-time warning is issued) or if the allocation
 * fails.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	/* Same as offset + len > bi_size, written so it cannot wrap */
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	/* On exit idx is that segment and resid the offset within it */
	resid = offset;
	bio_for_each_segment(bv, bio_src, idx) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	/*
	 * resid now counts bytes from the start of segment idx to the
	 * end of the range; on exit it is the number of bytes of
	 * segment end_idx that belong to the range.
	 */
	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		/* First entry loses its leading voff bytes, last is cut short */
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* A single entry holds the whole range */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
1294602adf40SYehuda Sadeh 
1295f7760dadSAlex Elder /*
1296f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1297f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1298f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1299f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1300f7760dadSAlex Elder  *
1301f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1302f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1303f7760dadSAlex Elder  * the start of data to be cloned is located.
1304f7760dadSAlex Elder  *
1305f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1306f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1307f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1308f7760dadSAlex Elder  */
1309f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1310f7760dadSAlex Elder 					unsigned int *offset,
1311f7760dadSAlex Elder 					unsigned int len,
1312f7760dadSAlex Elder 					gfp_t gfpmask)
1313f7760dadSAlex Elder {
1314f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1315f7760dadSAlex Elder 	unsigned int off = *offset;
1316f7760dadSAlex Elder 	struct bio *chain = NULL;
1317f7760dadSAlex Elder 	struct bio **end;
1318602adf40SYehuda Sadeh 
1319f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1320602adf40SYehuda Sadeh 
1321f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1322f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1323602adf40SYehuda Sadeh 
1324f7760dadSAlex Elder 	end = &chain;
1325f7760dadSAlex Elder 	while (len) {
1326f7760dadSAlex Elder 		unsigned int bi_size;
1327f7760dadSAlex Elder 		struct bio *bio;
1328f7760dadSAlex Elder 
1329f5400b7aSAlex Elder 		if (!bi) {
1330f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1331f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1332f5400b7aSAlex Elder 		}
1333f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1334f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1335f7760dadSAlex Elder 		if (!bio)
1336f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1337f7760dadSAlex Elder 
1338f7760dadSAlex Elder 		*end = bio;
1339f7760dadSAlex Elder 		end = &bio->bi_next;
1340f7760dadSAlex Elder 
1341f7760dadSAlex Elder 		off += bi_size;
1342f7760dadSAlex Elder 		if (off == bi->bi_size) {
1343f7760dadSAlex Elder 			bi = bi->bi_next;
1344f7760dadSAlex Elder 			off = 0;
1345f7760dadSAlex Elder 		}
1346f7760dadSAlex Elder 		len -= bi_size;
1347f7760dadSAlex Elder 	}
1348f7760dadSAlex Elder 	*bio_src = bi;
1349f7760dadSAlex Elder 	*offset = off;
1350f7760dadSAlex Elder 
1351f7760dadSAlex Elder 	return chain;
1352f7760dadSAlex Elder out_err:
1353f7760dadSAlex Elder 	bio_chain_put(chain);
1354f7760dadSAlex Elder 
1355602adf40SYehuda Sadeh 	return NULL;
1356602adf40SYehuda Sadeh }
1357602adf40SYehuda Sadeh 
1358926f9b3fSAlex Elder /*
1359926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1360926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1361926f9b3fSAlex Elder  * again.
1362926f9b3fSAlex Elder  */
13636365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13646365d33aSAlex Elder {
13656365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13666365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13676365d33aSAlex Elder 
136857acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13696365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
13706365d33aSAlex Elder 			obj_request);
13716365d33aSAlex Elder 	}
13726365d33aSAlex Elder }
13736365d33aSAlex Elder 
13746365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
13756365d33aSAlex Elder {
13766365d33aSAlex Elder 	smp_mb();
13776365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
13786365d33aSAlex Elder }
13796365d33aSAlex Elder 
138057acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
138157acbaa7SAlex Elder {
138257acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
138357acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
138457acbaa7SAlex Elder 
138557acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
138657acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
138757acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
138857acbaa7SAlex Elder 			obj_request);
138957acbaa7SAlex Elder 	}
139057acbaa7SAlex Elder }
139157acbaa7SAlex Elder 
139257acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
139357acbaa7SAlex Elder {
139457acbaa7SAlex Elder 	smp_mb();
139557acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
139657acbaa7SAlex Elder }
139757acbaa7SAlex Elder 
13985679c59fSAlex Elder /*
13995679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14005679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14015679c59fSAlex Elder  *
14025679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14035679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14045679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14055679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14065679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14075679c59fSAlex Elder  */
14085679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14095679c59fSAlex Elder 				bool exists)
14105679c59fSAlex Elder {
14115679c59fSAlex Elder 	if (exists)
14125679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14135679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14145679c59fSAlex Elder 	smp_mb();
14155679c59fSAlex Elder }
14165679c59fSAlex Elder 
14175679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14185679c59fSAlex Elder {
14195679c59fSAlex Elder 	smp_mb();
14205679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14215679c59fSAlex Elder }
14225679c59fSAlex Elder 
14235679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14245679c59fSAlex Elder {
14255679c59fSAlex Elder 	smp_mb();
14265679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14275679c59fSAlex Elder }
14285679c59fSAlex Elder 
1429bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1430bf0d5f50SAlex Elder {
143137206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
143237206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1433bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1434bf0d5f50SAlex Elder }
1435bf0d5f50SAlex Elder 
1436bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1437bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1438bf0d5f50SAlex Elder {
1439bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
144037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
144137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1442bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1443bf0d5f50SAlex Elder }
1444bf0d5f50SAlex Elder 
1445e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1446e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1447bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1448bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1449bf0d5f50SAlex Elder {
1450bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
145137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
145237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1453e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1454e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1455e93f3152SAlex Elder 	else
1456bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1457bf0d5f50SAlex Elder }
1458bf0d5f50SAlex Elder 
1459bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1460bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1461bf0d5f50SAlex Elder {
146225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
146325dcf954SAlex Elder 
1464b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1465bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
146625dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
14676365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
14686365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1469bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
147025dcf954SAlex Elder 	img_request->obj_request_count++;
147125dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
147237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
147337206ee5SAlex Elder 		obj_request->which);
1474bf0d5f50SAlex Elder }
1475bf0d5f50SAlex Elder 
1476bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1477bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1478bf0d5f50SAlex Elder {
1479bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
148025dcf954SAlex Elder 
148137206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
148237206ee5SAlex Elder 		obj_request->which);
1483bf0d5f50SAlex Elder 	list_del(&obj_request->links);
148425dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
148525dcf954SAlex Elder 	img_request->obj_request_count--;
148625dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
148725dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
14886365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1489bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1490bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
149125dcf954SAlex Elder 	obj_request->callback = NULL;
1492bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1493bf0d5f50SAlex Elder }
1494bf0d5f50SAlex Elder 
1495bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1496bf0d5f50SAlex Elder {
1497bf0d5f50SAlex Elder 	switch (type) {
14989969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1499bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1500788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1501bf0d5f50SAlex Elder 		return true;
1502bf0d5f50SAlex Elder 	default:
1503bf0d5f50SAlex Elder 		return false;
1504bf0d5f50SAlex Elder 	}
1505bf0d5f50SAlex Elder }
1506bf0d5f50SAlex Elder 
1507bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1508bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1509bf0d5f50SAlex Elder {
151037206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
151137206ee5SAlex Elder 
1512bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1513bf0d5f50SAlex Elder }
1514bf0d5f50SAlex Elder 
1515bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1516bf0d5f50SAlex Elder {
151755f27e09SAlex Elder 
151837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
151955f27e09SAlex Elder 
152055f27e09SAlex Elder 	/*
152155f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
152255f27e09SAlex Elder 	 * count for the image request.  We could instead use
152355f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
152455f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
152555f27e09SAlex Elder 	 */
152655f27e09SAlex Elder 	if (!img_request->result) {
152755f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
152855f27e09SAlex Elder 		u64 xferred = 0;
152955f27e09SAlex Elder 
153055f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
153155f27e09SAlex Elder 			xferred += obj_request->xferred;
153255f27e09SAlex Elder 		img_request->xferred = xferred;
153355f27e09SAlex Elder 	}
153455f27e09SAlex Elder 
1535bf0d5f50SAlex Elder 	if (img_request->callback)
1536bf0d5f50SAlex Elder 		img_request->callback(img_request);
1537bf0d5f50SAlex Elder 	else
1538bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1539bf0d5f50SAlex Elder }
1540bf0d5f50SAlex Elder 
1541788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1542788e2df3SAlex Elder 
1543788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1544788e2df3SAlex Elder {
154537206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
154637206ee5SAlex Elder 
1547788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1548788e2df3SAlex Elder }
1549788e2df3SAlex Elder 
15500c425248SAlex Elder /*
15510c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
15520c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
15530c425248SAlex Elder  * and currently never change thereafter.
15540c425248SAlex Elder  */
15550c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
15560c425248SAlex Elder {
15570c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
15580c425248SAlex Elder 	smp_mb();
15590c425248SAlex Elder }
15600c425248SAlex Elder 
15610c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
15620c425248SAlex Elder {
15630c425248SAlex Elder 	smp_mb();
15640c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
15650c425248SAlex Elder }
15660c425248SAlex Elder 
15679849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
15689849e986SAlex Elder {
15699849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
15709849e986SAlex Elder 	smp_mb();
15719849e986SAlex Elder }
15729849e986SAlex Elder 
1573e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1574e93f3152SAlex Elder {
1575e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1576e93f3152SAlex Elder 	smp_mb();
1577e93f3152SAlex Elder }
1578e93f3152SAlex Elder 
15799849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
15809849e986SAlex Elder {
15819849e986SAlex Elder 	smp_mb();
15829849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
15839849e986SAlex Elder }
15849849e986SAlex Elder 
1585d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1586d0b2e944SAlex Elder {
1587d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1588d0b2e944SAlex Elder 	smp_mb();
1589d0b2e944SAlex Elder }
1590d0b2e944SAlex Elder 
1591a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1592a2acd00eSAlex Elder {
1593a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1594a2acd00eSAlex Elder 	smp_mb();
1595a2acd00eSAlex Elder }
1596a2acd00eSAlex Elder 
1597d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1598d0b2e944SAlex Elder {
1599d0b2e944SAlex Elder 	smp_mb();
1600d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1601d0b2e944SAlex Elder }
1602d0b2e944SAlex Elder 
16036e2a4505SAlex Elder static void
16046e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
16056e2a4505SAlex Elder {
1606b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1607b9434c5bSAlex Elder 	u64 length = obj_request->length;
1608b9434c5bSAlex Elder 
16096e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16106e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1611b9434c5bSAlex Elder 		xferred, length);
16126e2a4505SAlex Elder 	/*
161317c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
161417c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
161517c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
161617c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
161717c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
161817c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
16196e2a4505SAlex Elder 	 */
1620b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
16216e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1622b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
16236e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1624b9434c5bSAlex Elder 		else
1625b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
16266e2a4505SAlex Elder 		obj_request->result = 0;
1627b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1628b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1629b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1630b9434c5bSAlex Elder 		else
1631b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
16326e2a4505SAlex Elder 	}
163317c1cc1dSJosh Durgin 	obj_request->xferred = length;
16346e2a4505SAlex Elder 	obj_request_done_set(obj_request);
16356e2a4505SAlex Elder }
16366e2a4505SAlex Elder 
1637bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1638bf0d5f50SAlex Elder {
163937206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
164037206ee5SAlex Elder 		obj_request->callback);
1641bf0d5f50SAlex Elder 	if (obj_request->callback)
1642bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1643788e2df3SAlex Elder 	else
1644788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1645bf0d5f50SAlex Elder }
1646bf0d5f50SAlex Elder 
/*
 * Completion handling for osd ops that need no processing of their
 * own: the object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
165239bf2c5dSAlex Elder 
1653c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1654bf0d5f50SAlex Elder {
165557acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1656a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
165757acbaa7SAlex Elder 	bool layered = false;
165857acbaa7SAlex Elder 
165957acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
166057acbaa7SAlex Elder 		img_request = obj_request->img_request;
166157acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1662a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
166357acbaa7SAlex Elder 	}
16648b3e1a56SAlex Elder 
16658b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16668b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
16678b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1668a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1669a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
16708b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
16718b3e1a56SAlex Elder 	else if (img_request)
16726e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
16736e2a4505SAlex Elder 	else
167407741308SAlex Elder 		obj_request_done_set(obj_request);
1675bf0d5f50SAlex Elder }
1676bf0d5f50SAlex Elder 
1677c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1678bf0d5f50SAlex Elder {
16791b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
16801b83bef2SSage Weil 		obj_request->result, obj_request->length);
16811b83bef2SSage Weil 	/*
16828b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
16838b3e1a56SAlex Elder 	 * it to our originally-requested length.
16841b83bef2SSage Weil 	 */
16851b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
168607741308SAlex Elder 	obj_request_done_set(obj_request);
1687bf0d5f50SAlex Elder }
1688bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat for an object request.  A simple
 * stat call needs nothing beyond marking the request done.  More
 * happens when the stat is part of a write sequence for a layered
 * image, but not here.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1698fbfab539SAlex Elder 
1699bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1700bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1701bf0d5f50SAlex Elder {
1702bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1703bf0d5f50SAlex Elder 	u16 opcode;
1704bf0d5f50SAlex Elder 
170537206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1706bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
170757acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
170857acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
170957acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
171057acbaa7SAlex Elder 	} else {
171157acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
171257acbaa7SAlex Elder 	}
1713bf0d5f50SAlex Elder 
17141b83bef2SSage Weil 	if (osd_req->r_result < 0)
17151b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1716bf0d5f50SAlex Elder 
17170eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1718bf0d5f50SAlex Elder 
1719c47f9371SAlex Elder 	/*
1720c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1721c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1722c47f9371SAlex Elder 	 */
17231b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1724c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
172579528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1726bf0d5f50SAlex Elder 	switch (opcode) {
1727bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1728c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1729bf0d5f50SAlex Elder 		break;
1730bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1731c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1732bf0d5f50SAlex Elder 		break;
1733fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1734c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1735fbfab539SAlex Elder 		break;
173636be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1737b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
17389969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1739c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
17409969ebc5SAlex Elder 		break;
1741bf0d5f50SAlex Elder 	default:
1742bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1743bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1744bf0d5f50SAlex Elder 		break;
1745bf0d5f50SAlex Elder 	}
1746bf0d5f50SAlex Elder 
174707741308SAlex Elder 	if (obj_request_done_test(obj_request))
1748bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1749bf0d5f50SAlex Elder }
1750bf0d5f50SAlex Elder 
17519d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1752430c28c3SAlex Elder {
1753430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17548c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17559d4df01fSAlex Elder 	u64 snap_id;
1756430c28c3SAlex Elder 
17578c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1758430c28c3SAlex Elder 
17599d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
17608c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17619d4df01fSAlex Elder 			NULL, snap_id, NULL);
17629d4df01fSAlex Elder }
17639d4df01fSAlex Elder 
17649d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
17659d4df01fSAlex Elder {
17669d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17679d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17689d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
17699d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
17709d4df01fSAlex Elder 
17719d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
17729d4df01fSAlex Elder 
17739d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
17749d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17759d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1776430c28c3SAlex Elder }
1777430c28c3SAlex Elder 
1778bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1779bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1780bf0d5f50SAlex Elder 					bool write_request,
1781430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1782bf0d5f50SAlex Elder {
1783bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1784bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1785bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1786bf0d5f50SAlex Elder 
17876365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
17886365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
17896365d33aSAlex Elder 
17900c425248SAlex Elder 		rbd_assert(write_request ==
17910c425248SAlex Elder 				img_request_write_test(img_request));
17920c425248SAlex Elder 		if (write_request)
1793bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1794bf0d5f50SAlex Elder 	}
1795bf0d5f50SAlex Elder 
1796bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1797bf0d5f50SAlex Elder 
1798bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1799bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1800bf0d5f50SAlex Elder 	if (!osd_req)
1801bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1802bf0d5f50SAlex Elder 
1803430c28c3SAlex Elder 	if (write_request)
1804bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1805430c28c3SAlex Elder 	else
1806bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1807bf0d5f50SAlex Elder 
1808bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1809bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1810bf0d5f50SAlex Elder 
181122116525SIlya Dryomov 	osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
181222116525SIlya Dryomov 
1813bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1814bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1815bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1816bf0d5f50SAlex Elder 
1817bf0d5f50SAlex Elder 	return osd_req;
1818bf0d5f50SAlex Elder }
1819bf0d5f50SAlex Elder 
18200eefd470SAlex Elder /*
18210eefd470SAlex Elder  * Create a copyup osd request based on the information in the
18220eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
18230eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
18240eefd470SAlex Elder  */
18250eefd470SAlex Elder static struct ceph_osd_request *
18260eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
18270eefd470SAlex Elder {
18280eefd470SAlex Elder 	struct rbd_img_request *img_request;
18290eefd470SAlex Elder 	struct ceph_snap_context *snapc;
18300eefd470SAlex Elder 	struct rbd_device *rbd_dev;
18310eefd470SAlex Elder 	struct ceph_osd_client *osdc;
18320eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
18330eefd470SAlex Elder 
18340eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18350eefd470SAlex Elder 	img_request = obj_request->img_request;
18360eefd470SAlex Elder 	rbd_assert(img_request);
18370eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
18380eefd470SAlex Elder 
18390eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
18400eefd470SAlex Elder 
18410eefd470SAlex Elder 	snapc = img_request->snapc;
18420eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
18430eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
18440eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
18450eefd470SAlex Elder 	if (!osd_req)
18460eefd470SAlex Elder 		return NULL;	/* ENOMEM */
18470eefd470SAlex Elder 
18480eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
18490eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
18500eefd470SAlex Elder 	osd_req->r_priv = obj_request;
18510eefd470SAlex Elder 
185222116525SIlya Dryomov 	osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
185322116525SIlya Dryomov 
18540eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
18550eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
18560eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
18570eefd470SAlex Elder 
18580eefd470SAlex Elder 	return osd_req;
18590eefd470SAlex Elder }
18600eefd470SAlex Elder 
18610eefd470SAlex Elder 
/* Release an osd request by dropping a reference to it */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1866bf0d5f50SAlex Elder 
1867bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1868bf0d5f50SAlex Elder 
1869bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1870bf0d5f50SAlex Elder 						u64 offset, u64 length,
1871bf0d5f50SAlex Elder 						enum obj_request_type type)
1872bf0d5f50SAlex Elder {
1873bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1874bf0d5f50SAlex Elder 	size_t size;
1875bf0d5f50SAlex Elder 	char *name;
1876bf0d5f50SAlex Elder 
1877bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1878bf0d5f50SAlex Elder 
1879bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1880f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1881f907ad55SAlex Elder 	if (!name)
1882bf0d5f50SAlex Elder 		return NULL;
1883bf0d5f50SAlex Elder 
1884868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1885f907ad55SAlex Elder 	if (!obj_request) {
1886f907ad55SAlex Elder 		kfree(name);
1887f907ad55SAlex Elder 		return NULL;
1888f907ad55SAlex Elder 	}
1889f907ad55SAlex Elder 
1890bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1891bf0d5f50SAlex Elder 	obj_request->offset = offset;
1892bf0d5f50SAlex Elder 	obj_request->length = length;
1893926f9b3fSAlex Elder 	obj_request->flags = 0;
1894bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1895bf0d5f50SAlex Elder 	obj_request->type = type;
1896bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1897788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1898bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1899bf0d5f50SAlex Elder 
190037206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
190137206ee5SAlex Elder 		offset, length, (int)type, obj_request);
190237206ee5SAlex Elder 
1903bf0d5f50SAlex Elder 	return obj_request;
1904bf0d5f50SAlex Elder }
1905bf0d5f50SAlex Elder 
1906bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1907bf0d5f50SAlex Elder {
1908bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1909bf0d5f50SAlex Elder 
1910bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1911bf0d5f50SAlex Elder 
191237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
191337206ee5SAlex Elder 
1914bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1915bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1916bf0d5f50SAlex Elder 
1917bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1918bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1919bf0d5f50SAlex Elder 
1920bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1921bf0d5f50SAlex Elder 	switch (obj_request->type) {
19229969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
19239969ebc5SAlex Elder 		break;		/* Nothing to do */
1924bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1925bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1926bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1927bf0d5f50SAlex Elder 		break;
1928788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1929788e2df3SAlex Elder 		if (obj_request->pages)
1930788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1931788e2df3SAlex Elder 						obj_request->page_count);
1932788e2df3SAlex Elder 		break;
1933bf0d5f50SAlex Elder 	}
1934bf0d5f50SAlex Elder 
1935f907ad55SAlex Elder 	kfree(obj_request->object_name);
1936868311b1SAlex Elder 	obj_request->object_name = NULL;
1937868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1938bf0d5f50SAlex Elder }
1939bf0d5f50SAlex Elder 
1940fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1941fb65d228SAlex Elder 
1942fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1943fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1944fb65d228SAlex Elder {
1945fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1946fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1947fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1948fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1949fb65d228SAlex Elder }
1950fb65d228SAlex Elder 
1951bf0d5f50SAlex Elder /*
1952a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1953a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1954a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1955a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1956a2acd00eSAlex Elder  */
1957a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1958a2acd00eSAlex Elder {
1959a2acd00eSAlex Elder 	int counter;
1960a2acd00eSAlex Elder 
1961a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1962a2acd00eSAlex Elder 		return;
1963a2acd00eSAlex Elder 
1964a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1965a2acd00eSAlex Elder 	if (counter > 0)
1966a2acd00eSAlex Elder 		return;
1967a2acd00eSAlex Elder 
1968a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1969a2acd00eSAlex Elder 
1970a2acd00eSAlex Elder 	if (!counter)
1971a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1972a2acd00eSAlex Elder 	else
1973a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference underflow\n");
1974a2acd00eSAlex Elder }
1975a2acd00eSAlex Elder 
1976a2acd00eSAlex Elder /*
1977a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1978a2acd00eSAlex Elder  * parent.
1979a2acd00eSAlex Elder  *
1980392a9dadSAlex Elder  * We must get the reference before checking for the overlap to
1981392a9dadSAlex Elder  * coordinate properly with zeroing the parent overlap in
1982392a9dadSAlex Elder  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1983392a9dadSAlex Elder  * drop it again if there is no overlap.
1984392a9dadSAlex Elder  *
1985a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1986a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1987a2acd00eSAlex Elder  * false otherwise.
1988a2acd00eSAlex Elder  */
1989a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1990a2acd00eSAlex Elder {
1991a2acd00eSAlex Elder 	int counter;
1992a2acd00eSAlex Elder 
1993a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1994a2acd00eSAlex Elder 		return false;
1995a2acd00eSAlex Elder 
1996a2acd00eSAlex Elder 	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1997a2acd00eSAlex Elder 	if (counter > 0 && rbd_dev->parent_overlap)
1998a2acd00eSAlex Elder 		return true;
1999a2acd00eSAlex Elder 
2000a2acd00eSAlex Elder 	/* Image was flattened, but parent is not yet torn down */
2001a2acd00eSAlex Elder 
2002a2acd00eSAlex Elder 	if (counter < 0)
2003a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference overflow\n");
2004a2acd00eSAlex Elder 
2005a2acd00eSAlex Elder 	return false;
2006a2acd00eSAlex Elder }
2007a2acd00eSAlex Elder 
2008bf0d5f50SAlex Elder /*
2009bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2010bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2011bf0d5f50SAlex Elder  * (if there is one).
2012bf0d5f50SAlex Elder  */
2013cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2014cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2015bf0d5f50SAlex Elder 					u64 offset, u64 length,
2016e93f3152SAlex Elder 					bool write_request)
2017bf0d5f50SAlex Elder {
2018bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2019bf0d5f50SAlex Elder 
20201c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
2021bf0d5f50SAlex Elder 	if (!img_request)
2022bf0d5f50SAlex Elder 		return NULL;
2023bf0d5f50SAlex Elder 
2024bf0d5f50SAlex Elder 	if (write_request) {
2025bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
2026812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
2027bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
2028bf0d5f50SAlex Elder 	}
2029bf0d5f50SAlex Elder 
2030bf0d5f50SAlex Elder 	img_request->rq = NULL;
2031bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2032bf0d5f50SAlex Elder 	img_request->offset = offset;
2033bf0d5f50SAlex Elder 	img_request->length = length;
20340c425248SAlex Elder 	img_request->flags = 0;
20350c425248SAlex Elder 	if (write_request) {
20360c425248SAlex Elder 		img_request_write_set(img_request);
2037468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
20380c425248SAlex Elder 	} else {
2039bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
20400c425248SAlex Elder 	}
2041a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2042d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2043bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2044bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2045bf0d5f50SAlex Elder 	img_request->callback = NULL;
2046a5a337d4SAlex Elder 	img_request->result = 0;
2047bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2048bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2049bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2050bf0d5f50SAlex Elder 
205137206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
205237206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
205337206ee5SAlex Elder 		img_request);
205437206ee5SAlex Elder 
2055bf0d5f50SAlex Elder 	return img_request;
2056bf0d5f50SAlex Elder }
2057bf0d5f50SAlex Elder 
2058bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2059bf0d5f50SAlex Elder {
2060bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2061bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2062bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2063bf0d5f50SAlex Elder 
2064bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2065bf0d5f50SAlex Elder 
206637206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
206737206ee5SAlex Elder 
2068bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2069bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
207025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2071bf0d5f50SAlex Elder 
2072a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2073a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2074a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2075a2acd00eSAlex Elder 	}
2076a2acd00eSAlex Elder 
20770c425248SAlex Elder 	if (img_request_write_test(img_request))
2078812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2079bf0d5f50SAlex Elder 
20801c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2081bf0d5f50SAlex Elder }
2082bf0d5f50SAlex Elder 
2083e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2084e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2085e93f3152SAlex Elder 					u64 img_offset, u64 length)
2086e93f3152SAlex Elder {
2087e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2088e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2089e93f3152SAlex Elder 
2090e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2091e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2092e93f3152SAlex Elder 
2093e93f3152SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
2094e93f3152SAlex Elder 						img_offset, length, false);
2095e93f3152SAlex Elder 	if (!parent_request)
2096e93f3152SAlex Elder 		return NULL;
2097e93f3152SAlex Elder 
2098e93f3152SAlex Elder 	img_request_child_set(parent_request);
2099e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2100e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2101e93f3152SAlex Elder 
2102e93f3152SAlex Elder 	return parent_request;
2103e93f3152SAlex Elder }
2104e93f3152SAlex Elder 
2105e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2106e93f3152SAlex Elder {
2107e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2108e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2109e93f3152SAlex Elder 
2110e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2111e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2112e93f3152SAlex Elder 
2113e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2114e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2115e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2116e93f3152SAlex Elder 
2117e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2118e93f3152SAlex Elder }
2119e93f3152SAlex Elder 
21201217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
21211217857fSAlex Elder {
21226365d33aSAlex Elder 	struct rbd_img_request *img_request;
21231217857fSAlex Elder 	unsigned int xferred;
21241217857fSAlex Elder 	int result;
21258b3e1a56SAlex Elder 	bool more;
21261217857fSAlex Elder 
21276365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21286365d33aSAlex Elder 	img_request = obj_request->img_request;
21296365d33aSAlex Elder 
21301217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
21311217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
21321217857fSAlex Elder 	result = obj_request->result;
21331217857fSAlex Elder 	if (result) {
21341217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
21351217857fSAlex Elder 
21361217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
21371217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
21381217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
21391217857fSAlex Elder 			obj_request->offset);
21401217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
21411217857fSAlex Elder 			result, xferred);
21421217857fSAlex Elder 		if (!img_request->result)
21431217857fSAlex Elder 			img_request->result = result;
21441217857fSAlex Elder 	}
21451217857fSAlex Elder 
2146f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2147f1a4739fSAlex Elder 
2148f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2149f1a4739fSAlex Elder 		obj_request->pages = NULL;
2150f1a4739fSAlex Elder 		obj_request->page_count = 0;
2151f1a4739fSAlex Elder 	}
2152f1a4739fSAlex Elder 
21538b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
21548b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
21558b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
21568b3e1a56SAlex Elder 	} else {
21578b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
21588b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
21598b3e1a56SAlex Elder 	}
21608b3e1a56SAlex Elder 
21618b3e1a56SAlex Elder 	return more;
21621217857fSAlex Elder }
21631217857fSAlex Elder 
21642169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
21652169238dSAlex Elder {
21662169238dSAlex Elder 	struct rbd_img_request *img_request;
21672169238dSAlex Elder 	u32 which = obj_request->which;
21682169238dSAlex Elder 	bool more = true;
21692169238dSAlex Elder 
21706365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21712169238dSAlex Elder 	img_request = obj_request->img_request;
21722169238dSAlex Elder 
21732169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
21742169238dSAlex Elder 	rbd_assert(img_request != NULL);
21752169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
21762169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
21772169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
21782169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
21792169238dSAlex Elder 
21802169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
21812169238dSAlex Elder 	if (which != img_request->next_completion)
21822169238dSAlex Elder 		goto out;
21832169238dSAlex Elder 
21842169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
21852169238dSAlex Elder 		rbd_assert(more);
21862169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
21872169238dSAlex Elder 
21882169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
21892169238dSAlex Elder 			break;
21901217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
21912169238dSAlex Elder 		which++;
21922169238dSAlex Elder 	}
21932169238dSAlex Elder 
21942169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
21952169238dSAlex Elder 	img_request->next_completion = which;
21962169238dSAlex Elder out:
21972169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
21982169238dSAlex Elder 
21992169238dSAlex Elder 	if (!more)
22002169238dSAlex Elder 		rbd_img_request_complete(img_request);
22012169238dSAlex Elder }
22022169238dSAlex Elder 
2203f1a4739fSAlex Elder /*
2204f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2205f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2206f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2207f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2208f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2209f1a4739fSAlex Elder  * all data described by the image request.
2210f1a4739fSAlex Elder  */
2211f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2212f1a4739fSAlex Elder 					enum obj_request_type type,
2213f1a4739fSAlex Elder 					void *data_desc)
2214bf0d5f50SAlex Elder {
2215bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2216bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2217bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
22180c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2219a158073cSJingoo Han 	struct bio *bio_list = NULL;
2220f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2221a158073cSJingoo Han 	struct page **pages = NULL;
22227da22d29SAlex Elder 	u64 img_offset;
2223bf0d5f50SAlex Elder 	u64 resid;
2224bf0d5f50SAlex Elder 	u16 opcode;
2225bf0d5f50SAlex Elder 
2226f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2227f1a4739fSAlex Elder 		(int)type, data_desc);
222837206ee5SAlex Elder 
2229430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
22307da22d29SAlex Elder 	img_offset = img_request->offset;
2231bf0d5f50SAlex Elder 	resid = img_request->length;
22324dda41d3SAlex Elder 	rbd_assert(resid > 0);
2233f1a4739fSAlex Elder 
2234f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2235f1a4739fSAlex Elder 		bio_list = data_desc;
2236f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2237f1a4739fSAlex Elder 	} else {
2238f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2239f1a4739fSAlex Elder 		pages = data_desc;
2240f1a4739fSAlex Elder 	}
2241f1a4739fSAlex Elder 
2242bf0d5f50SAlex Elder 	while (resid) {
22432fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2244bf0d5f50SAlex Elder 		const char *object_name;
2245bf0d5f50SAlex Elder 		u64 offset;
2246bf0d5f50SAlex Elder 		u64 length;
2247bf0d5f50SAlex Elder 
22487da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2249bf0d5f50SAlex Elder 		if (!object_name)
2250bf0d5f50SAlex Elder 			goto out_unwind;
22517da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
22527da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2253bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2254f1a4739fSAlex Elder 						offset, length, type);
225578c2a44aSAlex Elder 		/* object request has its own copy of the object name */
225678c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2257bf0d5f50SAlex Elder 		if (!obj_request)
2258bf0d5f50SAlex Elder 			goto out_unwind;
225903507db6SJosh Durgin 		/*
226003507db6SJosh Durgin 		 * set obj_request->img_request before creating the
226103507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
226203507db6SJosh Durgin 		 */
226303507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2264bf0d5f50SAlex Elder 
2265f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2266f1a4739fSAlex Elder 			unsigned int clone_size;
2267f1a4739fSAlex Elder 
2268bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2269bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2270f1a4739fSAlex Elder 			obj_request->bio_list =
2271f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2272f1a4739fSAlex Elder 								&bio_offset,
2273f1a4739fSAlex Elder 								clone_size,
2274bf0d5f50SAlex Elder 								GFP_ATOMIC);
2275bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
2276bf0d5f50SAlex Elder 				goto out_partial;
2277f1a4739fSAlex Elder 		} else {
2278f1a4739fSAlex Elder 			unsigned int page_count;
2279f1a4739fSAlex Elder 
2280f1a4739fSAlex Elder 			obj_request->pages = pages;
2281f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2282f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2283f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2284f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2285f1a4739fSAlex Elder 			pages += page_count;
2286f1a4739fSAlex Elder 		}
2287bf0d5f50SAlex Elder 
22882fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
22892fa12320SAlex Elder 						obj_request);
22902fa12320SAlex Elder 		if (!osd_req)
2291bf0d5f50SAlex Elder 			goto out_partial;
22922fa12320SAlex Elder 		obj_request->osd_req = osd_req;
22932169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2294430c28c3SAlex Elder 
22952fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
22962fa12320SAlex Elder 						0, 0);
2297f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
2298406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
2299f1a4739fSAlex Elder 					obj_request->bio_list, length);
2300f1a4739fSAlex Elder 		else
2301f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
2302f1a4739fSAlex Elder 					obj_request->pages, length,
2303f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
23049d4df01fSAlex Elder 
23059d4df01fSAlex Elder 		if (write_request)
23069d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
23079d4df01fSAlex Elder 		else
23089d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2309430c28c3SAlex Elder 
23107da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2311bf0d5f50SAlex Elder 
23127da22d29SAlex Elder 		img_offset += length;
2313bf0d5f50SAlex Elder 		resid -= length;
2314bf0d5f50SAlex Elder 	}
2315bf0d5f50SAlex Elder 
2316bf0d5f50SAlex Elder 	return 0;
2317bf0d5f50SAlex Elder 
2318bf0d5f50SAlex Elder out_partial:
2319bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
2320bf0d5f50SAlex Elder out_unwind:
2321bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2322bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2323bf0d5f50SAlex Elder 
2324bf0d5f50SAlex Elder 	return -ENOMEM;
2325bf0d5f50SAlex Elder }
2326bf0d5f50SAlex Elder 
23273d7efd18SAlex Elder static void
23280eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
23290eefd470SAlex Elder {
23300eefd470SAlex Elder 	struct rbd_img_request *img_request;
23310eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2332ebda6408SAlex Elder 	struct page **pages;
23330eefd470SAlex Elder 	u32 page_count;
23340eefd470SAlex Elder 
23350eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
23360eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23370eefd470SAlex Elder 	img_request = obj_request->img_request;
23380eefd470SAlex Elder 	rbd_assert(img_request);
23390eefd470SAlex Elder 
23400eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
23410eefd470SAlex Elder 	rbd_assert(rbd_dev);
23420eefd470SAlex Elder 
2343ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2344ebda6408SAlex Elder 	rbd_assert(pages != NULL);
23450eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2346ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2347ebda6408SAlex Elder 	rbd_assert(page_count);
2348ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2349ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
23500eefd470SAlex Elder 
23510eefd470SAlex Elder 	/*
23520eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
23530eefd470SAlex Elder 	 * original write request.  There is no such thing as a
23540eefd470SAlex Elder 	 * successful short write, so if the request was successful
23550eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
23560eefd470SAlex Elder 	 */
23570eefd470SAlex Elder 	if (!obj_request->result)
23580eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
23590eefd470SAlex Elder 
23600eefd470SAlex Elder 	/* Finish up with the normal image object callback */
23610eefd470SAlex Elder 
23620eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
23630eefd470SAlex Elder }
23640eefd470SAlex Elder 
23650eefd470SAlex Elder static void
23663d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
23673d7efd18SAlex Elder {
23683d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
23690eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
23700eefd470SAlex Elder 	struct ceph_osd_client *osdc;
23710eefd470SAlex Elder 	struct rbd_device *rbd_dev;
23723d7efd18SAlex Elder 	struct page **pages;
2373ebda6408SAlex Elder 	u32 page_count;
2374bbea1c1aSAlex Elder 	int img_result;
2375ebda6408SAlex Elder 	u64 parent_length;
2376b91f09f1SAlex Elder 	u64 offset;
2377b91f09f1SAlex Elder 	u64 length;
23783d7efd18SAlex Elder 
23793d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
23803d7efd18SAlex Elder 
23813d7efd18SAlex Elder 	/* First get what we need from the image request */
23823d7efd18SAlex Elder 
23833d7efd18SAlex Elder 	pages = img_request->copyup_pages;
23843d7efd18SAlex Elder 	rbd_assert(pages != NULL);
23853d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2386ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2387ebda6408SAlex Elder 	rbd_assert(page_count);
2388ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
23893d7efd18SAlex Elder 
23903d7efd18SAlex Elder 	orig_request = img_request->obj_request;
23913d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2392b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2393bbea1c1aSAlex Elder 	img_result = img_request->result;
2394ebda6408SAlex Elder 	parent_length = img_request->length;
2395ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
23963d7efd18SAlex Elder 	rbd_img_request_put(img_request);
23973d7efd18SAlex Elder 
239891c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
239991c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
24003d7efd18SAlex Elder 	rbd_assert(rbd_dev);
24013d7efd18SAlex Elder 
2402bbea1c1aSAlex Elder 	/*
2403bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2404bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2405bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2406bbea1c1aSAlex Elder 	 */
2407bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2408bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2409bbea1c1aSAlex Elder 
2410bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2411bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2412bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2413bbea1c1aSAlex Elder 		if (!img_result)
2414bbea1c1aSAlex Elder 			return;
2415bbea1c1aSAlex Elder 	}
2416bbea1c1aSAlex Elder 
2417bbea1c1aSAlex Elder 	if (img_result)
24180eefd470SAlex Elder 		goto out_err;
24193d7efd18SAlex Elder 
24208785b1d4SAlex Elder 	/*
24218785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
24228785b1d4SAlex Elder 	 * We need a new one that can hold the two ops in a copyup
24238785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
24248785b1d4SAlex Elder 	 * original request, and release the old one.
24258785b1d4SAlex Elder 	 */
2426bbea1c1aSAlex Elder 	img_result = -ENOMEM;
24270eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
24280eefd470SAlex Elder 	if (!osd_req)
24290eefd470SAlex Elder 		goto out_err;
24308785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
24310eefd470SAlex Elder 	orig_request->osd_req = osd_req;
24320eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2433ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
24343d7efd18SAlex Elder 
24350eefd470SAlex Elder 	/* Initialize the copyup op */
24360eefd470SAlex Elder 
24370eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2438ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
24390eefd470SAlex Elder 						false, false);
24400eefd470SAlex Elder 
24410eefd470SAlex Elder 	/* Then the original write request op */
24420eefd470SAlex Elder 
2443b91f09f1SAlex Elder 	offset = orig_request->offset;
2444b91f09f1SAlex Elder 	length = orig_request->length;
24450eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2446b91f09f1SAlex Elder 					offset, length, 0, 0);
2447b91f09f1SAlex Elder 	if (orig_request->type == OBJ_REQUEST_BIO)
2448b91f09f1SAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 1,
2449b91f09f1SAlex Elder 					orig_request->bio_list, length);
2450b91f09f1SAlex Elder 	else
2451b91f09f1SAlex Elder 		osd_req_op_extent_osd_data_pages(osd_req, 1,
2452b91f09f1SAlex Elder 					orig_request->pages, length,
2453b91f09f1SAlex Elder 					offset & ~PAGE_MASK, false, false);
24540eefd470SAlex Elder 
24550eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
24560eefd470SAlex Elder 
24570eefd470SAlex Elder 	/* All set, send it off. */
24580eefd470SAlex Elder 
24590eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
24600eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2461bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2462bbea1c1aSAlex Elder 	if (!img_result)
24630eefd470SAlex Elder 		return;
24640eefd470SAlex Elder out_err:
24650eefd470SAlex Elder 	/* Record the error code and complete the request */
24660eefd470SAlex Elder 
2467bbea1c1aSAlex Elder 	orig_request->result = img_result;
24680eefd470SAlex Elder 	orig_request->xferred = 0;
24693d7efd18SAlex Elder 	obj_request_done_set(orig_request);
24703d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
24713d7efd18SAlex Elder }
24723d7efd18SAlex Elder 
24733d7efd18SAlex Elder /*
24743d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
24753d7efd18SAlex Elder  * entire target of the given object request.  This is used for
24763d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
24773d7efd18SAlex Elder  * object request from the image request does not exist.
24783d7efd18SAlex Elder  *
24793d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
24803d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
24813d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
24823d7efd18SAlex Elder  * the original object request for the copyup operation.
24833d7efd18SAlex Elder  *
24843d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
24853d7efd18SAlex Elder  * object request and mark it done so it gets completed.
24863d7efd18SAlex Elder  */
24873d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
24883d7efd18SAlex Elder {
24893d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
24903d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
24913d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
24923d7efd18SAlex Elder 	u64 img_offset;
24933d7efd18SAlex Elder 	u64 length;
24943d7efd18SAlex Elder 	struct page **pages = NULL;
24953d7efd18SAlex Elder 	u32 page_count;
24963d7efd18SAlex Elder 	int result;
24973d7efd18SAlex Elder 
24983d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2499b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
25003d7efd18SAlex Elder 
25013d7efd18SAlex Elder 	img_request = obj_request->img_request;
25023d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
25033d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
25043d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
25053d7efd18SAlex Elder 
25063d7efd18SAlex Elder 	/*
25073d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
25083d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
25093d7efd18SAlex Elder 	 */
25103d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
25113d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
25123d7efd18SAlex Elder 
25133d7efd18SAlex Elder 	/*
2514a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2515a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2516a9e8ba2cSAlex Elder 	 * necessary.
2517a9e8ba2cSAlex Elder 	 */
2518a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2519a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2520a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2521a9e8ba2cSAlex Elder 	}
2522a9e8ba2cSAlex Elder 
2523a9e8ba2cSAlex Elder 	/*
25243d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
25253d7efd18SAlex Elder 	 * from the parent.
25263d7efd18SAlex Elder 	 */
25273d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
25283d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
25293d7efd18SAlex Elder 	if (IS_ERR(pages)) {
25303d7efd18SAlex Elder 		result = PTR_ERR(pages);
25313d7efd18SAlex Elder 		pages = NULL;
25323d7efd18SAlex Elder 		goto out_err;
25333d7efd18SAlex Elder 	}
25343d7efd18SAlex Elder 
25353d7efd18SAlex Elder 	result = -ENOMEM;
2536e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2537e93f3152SAlex Elder 						img_offset, length);
25383d7efd18SAlex Elder 	if (!parent_request)
25393d7efd18SAlex Elder 		goto out_err;
25403d7efd18SAlex Elder 
25413d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
25423d7efd18SAlex Elder 	if (result)
25433d7efd18SAlex Elder 		goto out_err;
25443d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2545ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
25463d7efd18SAlex Elder 
25473d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
25483d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
25493d7efd18SAlex Elder 	if (!result)
25503d7efd18SAlex Elder 		return 0;
25513d7efd18SAlex Elder 
25523d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2553ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
25543d7efd18SAlex Elder 	parent_request->obj_request = NULL;
25553d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
25563d7efd18SAlex Elder out_err:
25573d7efd18SAlex Elder 	if (pages)
25583d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
25593d7efd18SAlex Elder 	if (parent_request)
25603d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
25613d7efd18SAlex Elder 	obj_request->result = result;
25623d7efd18SAlex Elder 	obj_request->xferred = 0;
25633d7efd18SAlex Elder 	obj_request_done_set(obj_request);
25643d7efd18SAlex Elder 
25653d7efd18SAlex Elder 	return result;
25663d7efd18SAlex Elder }
25673d7efd18SAlex Elder 
2568c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2569c5b5ef6cSAlex Elder {
2570c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2571638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2572c5b5ef6cSAlex Elder 	int result;
2573c5b5ef6cSAlex Elder 
2574c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2575c5b5ef6cSAlex Elder 
2576c5b5ef6cSAlex Elder 	/*
2577c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2578c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2579c5b5ef6cSAlex Elder 	 * we're done with the request.
2580c5b5ef6cSAlex Elder 	 */
2581c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2582c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2583912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2584c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2585c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2586c5b5ef6cSAlex Elder 
2587c5b5ef6cSAlex Elder 	result = obj_request->result;
2588c5b5ef6cSAlex Elder 	obj_request->result = 0;
2589c5b5ef6cSAlex Elder 
2590c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2591c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2592c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2593c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2594c5b5ef6cSAlex Elder 
2595638f5abeSAlex Elder 	/*
2596638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2597638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2598638f5abeSAlex Elder 	 * and re-submit the original write request.
2599638f5abeSAlex Elder 	 */
2600638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2601638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2602638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2603638f5abeSAlex Elder 
2604638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2605638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2606638f5abeSAlex Elder 		if (!result)
2607638f5abeSAlex Elder 			return;
2608638f5abeSAlex Elder 	}
2609c5b5ef6cSAlex Elder 
2610c5b5ef6cSAlex Elder 	/*
2611c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2612c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2613c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2614c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2615c5b5ef6cSAlex Elder 	 */
2616c5b5ef6cSAlex Elder 	if (!result) {
2617c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2618c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2619c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2620c5b5ef6cSAlex Elder 	} else if (result) {
2621c5b5ef6cSAlex Elder 		orig_request->result = result;
26223d7efd18SAlex Elder 		goto out;
2623c5b5ef6cSAlex Elder 	}
2624c5b5ef6cSAlex Elder 
2625c5b5ef6cSAlex Elder 	/*
2626c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2627c5b5ef6cSAlex Elder 	 * whether the target object exists.
2628c5b5ef6cSAlex Elder 	 */
2629b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
26303d7efd18SAlex Elder out:
2631c5b5ef6cSAlex Elder 	if (orig_request->result)
2632c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2633c5b5ef6cSAlex Elder }
2634c5b5ef6cSAlex Elder 
2635c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2636c5b5ef6cSAlex Elder {
2637c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2638c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2639c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2640c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2641c5b5ef6cSAlex Elder 	u32 page_count;
2642c5b5ef6cSAlex Elder 	size_t size;
2643c5b5ef6cSAlex Elder 	int ret;
2644c5b5ef6cSAlex Elder 
2645c5b5ef6cSAlex Elder 	/*
2646c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2647c5b5ef6cSAlex Elder 	 *     le64 length;
2648c5b5ef6cSAlex Elder 	 *     struct {
2649c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2650c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2651c5b5ef6cSAlex Elder 	 *     } mtime;
2652c5b5ef6cSAlex Elder 	 */
2653c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2654c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2655c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2656c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2657c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2658c5b5ef6cSAlex Elder 
2659c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2660c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2661c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2662c5b5ef6cSAlex Elder 	if (!stat_request)
2663c5b5ef6cSAlex Elder 		goto out;
2664c5b5ef6cSAlex Elder 
2665c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2666c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2667c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2668c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2669c5b5ef6cSAlex Elder 
2670c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2671c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2672c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2673c5b5ef6cSAlex Elder 						stat_request);
2674c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2675c5b5ef6cSAlex Elder 		goto out;
2676c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2677c5b5ef6cSAlex Elder 
2678c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2679c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2680c5b5ef6cSAlex Elder 					false, false);
26819d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2682c5b5ef6cSAlex Elder 
2683c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2684c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2685c5b5ef6cSAlex Elder out:
2686c5b5ef6cSAlex Elder 	if (ret)
2687c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2688c5b5ef6cSAlex Elder 
2689c5b5ef6cSAlex Elder 	return ret;
2690c5b5ef6cSAlex Elder }
2691c5b5ef6cSAlex Elder 
2692b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2693b454e36dSAlex Elder {
2694b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2695a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
26963d7efd18SAlex Elder 	bool known;
2697b454e36dSAlex Elder 
2698b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2699b454e36dSAlex Elder 
2700b454e36dSAlex Elder 	img_request = obj_request->img_request;
2701b454e36dSAlex Elder 	rbd_assert(img_request);
2702a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2703b454e36dSAlex Elder 
2704b454e36dSAlex Elder 	/*
2705a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2706a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2707a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2708a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2709a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2710a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2711a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2712a9e8ba2cSAlex Elder 	 * simple object request.
2713b454e36dSAlex Elder 	 */
2714b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2715b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2716a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
27173d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
27183d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2719b454e36dSAlex Elder 
2720b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2721b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2722b454e36dSAlex Elder 
2723b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2724b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2725b454e36dSAlex Elder 
2726b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2727b454e36dSAlex Elder 	}
2728b454e36dSAlex Elder 
2729b454e36dSAlex Elder 	/*
27303d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
27313d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
27323d7efd18SAlex Elder 	 * start by reading the data for the full target object from
27333d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2734b454e36dSAlex Elder 	 */
27353d7efd18SAlex Elder 	if (known)
27363d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
27373d7efd18SAlex Elder 
27383d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2739b454e36dSAlex Elder 
2740b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2741b454e36dSAlex Elder }
2742b454e36dSAlex Elder 
2743bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2744bf0d5f50SAlex Elder {
2745bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
274646faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2747bf0d5f50SAlex Elder 
274837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
274946faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2750bf0d5f50SAlex Elder 		int ret;
2751bf0d5f50SAlex Elder 
2752b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2753bf0d5f50SAlex Elder 		if (ret)
2754bf0d5f50SAlex Elder 			return ret;
2755bf0d5f50SAlex Elder 	}
2756bf0d5f50SAlex Elder 
2757bf0d5f50SAlex Elder 	return 0;
2758bf0d5f50SAlex Elder }
2759bf0d5f50SAlex Elder 
27608b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
27618b3e1a56SAlex Elder {
27628b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2763a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2764a9e8ba2cSAlex Elder 	u64 obj_end;
276502c74fbaSAlex Elder 	u64 img_xferred;
276602c74fbaSAlex Elder 	int img_result;
27678b3e1a56SAlex Elder 
27688b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
27698b3e1a56SAlex Elder 
277002c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
277102c74fbaSAlex Elder 
27728b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
277302c74fbaSAlex Elder 	img_xferred = img_request->xferred;
277402c74fbaSAlex Elder 	img_result = img_request->result;
277502c74fbaSAlex Elder 	rbd_img_request_put(img_request);
277602c74fbaSAlex Elder 
277702c74fbaSAlex Elder 	/*
277802c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
277902c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
278002c74fbaSAlex Elder 	 * original request.
278102c74fbaSAlex Elder 	 */
2782a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2783a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
278402c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
278502c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
278602c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
27878b3e1a56SAlex Elder 
278802c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
278902c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
279002c74fbaSAlex Elder 		if (!img_result)
279102c74fbaSAlex Elder 			return;
279202c74fbaSAlex Elder 	}
279302c74fbaSAlex Elder 
279402c74fbaSAlex Elder 	obj_request->result = img_result;
2795a9e8ba2cSAlex Elder 	if (obj_request->result)
2796a9e8ba2cSAlex Elder 		goto out;
2797a9e8ba2cSAlex Elder 
2798a9e8ba2cSAlex Elder 	/*
2799a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2800a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2801a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2802a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2803a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2804a9e8ba2cSAlex Elder 	 */
2805a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2806a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2807a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2808a9e8ba2cSAlex Elder 		u64 xferred = 0;
2809a9e8ba2cSAlex Elder 
2810a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2811a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2812a9e8ba2cSAlex Elder 					obj_request->img_offset;
2813a9e8ba2cSAlex Elder 
281402c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2815a9e8ba2cSAlex Elder 	} else {
281602c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2817a9e8ba2cSAlex Elder 	}
2818a9e8ba2cSAlex Elder out:
28198b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
28208b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
28218b3e1a56SAlex Elder }
28228b3e1a56SAlex Elder 
28238b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
28248b3e1a56SAlex Elder {
28258b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
28268b3e1a56SAlex Elder 	int result;
28278b3e1a56SAlex Elder 
28288b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
28298b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
28308b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
28315b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
28328b3e1a56SAlex Elder 
28338b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2834e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
28358b3e1a56SAlex Elder 						obj_request->img_offset,
2836e93f3152SAlex Elder 						obj_request->length);
28378b3e1a56SAlex Elder 	result = -ENOMEM;
28388b3e1a56SAlex Elder 	if (!img_request)
28398b3e1a56SAlex Elder 		goto out_err;
28408b3e1a56SAlex Elder 
28415b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
2842f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2843f1a4739fSAlex Elder 						obj_request->bio_list);
28445b2ab72dSAlex Elder 	else
28455b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
28465b2ab72dSAlex Elder 						obj_request->pages);
28478b3e1a56SAlex Elder 	if (result)
28488b3e1a56SAlex Elder 		goto out_err;
28498b3e1a56SAlex Elder 
28508b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
28518b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
28528b3e1a56SAlex Elder 	if (result)
28538b3e1a56SAlex Elder 		goto out_err;
28548b3e1a56SAlex Elder 
28558b3e1a56SAlex Elder 	return;
28568b3e1a56SAlex Elder out_err:
28578b3e1a56SAlex Elder 	if (img_request)
28588b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
28598b3e1a56SAlex Elder 	obj_request->result = result;
28608b3e1a56SAlex Elder 	obj_request->xferred = 0;
28618b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
28628b3e1a56SAlex Elder }
28638b3e1a56SAlex Elder 
286420e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2865b8d70035SAlex Elder {
2866b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
28672169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2868b8d70035SAlex Elder 	int ret;
2869b8d70035SAlex Elder 
2870b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2871b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2872b8d70035SAlex Elder 	if (!obj_request)
2873b8d70035SAlex Elder 		return -ENOMEM;
2874b8d70035SAlex Elder 
2875b8d70035SAlex Elder 	ret = -ENOMEM;
2876430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2877b8d70035SAlex Elder 	if (!obj_request->osd_req)
2878b8d70035SAlex Elder 		goto out;
2879b8d70035SAlex Elder 
2880c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2881cc4a38bdSAlex Elder 					notify_id, 0, 0);
28829d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2883430c28c3SAlex Elder 
2884b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2885cf81b60eSAlex Elder 	if (ret)
288620e0af67SJosh Durgin 		goto out;
288720e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
288820e0af67SJosh Durgin out:
2889b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
2890b8d70035SAlex Elder 
2891b8d70035SAlex Elder 	return ret;
2892b8d70035SAlex Elder }
2893b8d70035SAlex Elder 
2894b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2895b8d70035SAlex Elder {
2896b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2897e627db08SAlex Elder 	int ret;
2898b8d70035SAlex Elder 
2899b8d70035SAlex Elder 	if (!rbd_dev)
2900b8d70035SAlex Elder 		return;
2901b8d70035SAlex Elder 
290237206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2903b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2904b8d70035SAlex Elder 		(unsigned int)opcode);
2905e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2906e627db08SAlex Elder 	if (ret)
29073b5cf2a2SAlex Elder 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2908b8d70035SAlex Elder 
290920e0af67SJosh Durgin 	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2910b8d70035SAlex Elder }
2911b8d70035SAlex Elder 
29129969ebc5SAlex Elder /*
29139969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
29149969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
29159969ebc5SAlex Elder  */
2916fca27065SIlya Dryomov static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
29179969ebc5SAlex Elder {
29189969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
29199969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
29209969ebc5SAlex Elder 	int ret;
29219969ebc5SAlex Elder 
29229969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
29239969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
29249969ebc5SAlex Elder 
29259969ebc5SAlex Elder 	if (start) {
29263c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
29279969ebc5SAlex Elder 						&rbd_dev->watch_event);
29289969ebc5SAlex Elder 		if (ret < 0)
29299969ebc5SAlex Elder 			return ret;
29308eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
29319969ebc5SAlex Elder 	}
29329969ebc5SAlex Elder 
29339969ebc5SAlex Elder 	ret = -ENOMEM;
29349969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
29359969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
29369969ebc5SAlex Elder 	if (!obj_request)
29379969ebc5SAlex Elder 		goto out_cancel;
29389969ebc5SAlex Elder 
2939430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2940430c28c3SAlex Elder 	if (!obj_request->osd_req)
2941430c28c3SAlex Elder 		goto out_cancel;
2942430c28c3SAlex Elder 
29438eb87565SAlex Elder 	if (start)
2944975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
29458eb87565SAlex Elder 	else
29466977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2947975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
29482169238dSAlex Elder 
29492169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
29501f3ef788SAlex Elder 				rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
29519d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
29522169238dSAlex Elder 
29539969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
29549969ebc5SAlex Elder 	if (ret)
29559969ebc5SAlex Elder 		goto out_cancel;
29569969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
29579969ebc5SAlex Elder 	if (ret)
29589969ebc5SAlex Elder 		goto out_cancel;
29599969ebc5SAlex Elder 	ret = obj_request->result;
29609969ebc5SAlex Elder 	if (ret)
29619969ebc5SAlex Elder 		goto out_cancel;
29629969ebc5SAlex Elder 
29638eb87565SAlex Elder 	/*
29648eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
29658eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
29668eb87565SAlex Elder 	 * a pointer to the object request during that time (in
29678eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
29688eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
29698eb87565SAlex Elder 	 * unregistered it.
29708eb87565SAlex Elder 	 */
29718eb87565SAlex Elder 	if (start) {
29728eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
29738eb87565SAlex Elder 
29748eb87565SAlex Elder 		return 0;
29758eb87565SAlex Elder 	}
29768eb87565SAlex Elder 
29778eb87565SAlex Elder 	/* We have successfully torn down the watch request */
29788eb87565SAlex Elder 
29798eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
29808eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
29819969ebc5SAlex Elder out_cancel:
29829969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
29839969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
29849969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
29859969ebc5SAlex Elder 	if (obj_request)
29869969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
29879969ebc5SAlex Elder 
29889969ebc5SAlex Elder 	return ret;
29899969ebc5SAlex Elder }
29909969ebc5SAlex Elder 
2991fca27065SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
2992fca27065SIlya Dryomov {
2993fca27065SIlya Dryomov 	return __rbd_dev_header_watch_sync(rbd_dev, true);
2994fca27065SIlya Dryomov }
2995fca27065SIlya Dryomov 
2996fca27065SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
2997fca27065SIlya Dryomov {
2998fca27065SIlya Dryomov 	int ret;
2999fca27065SIlya Dryomov 
3000fca27065SIlya Dryomov 	ret = __rbd_dev_header_watch_sync(rbd_dev, false);
3001fca27065SIlya Dryomov 	if (ret) {
3002fca27065SIlya Dryomov 		rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
3003fca27065SIlya Dryomov 			 ret);
3004fca27065SIlya Dryomov 	}
3005fca27065SIlya Dryomov }
3006fca27065SIlya Dryomov 
300736be9a76SAlex Elder /*
3008f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3009f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
301036be9a76SAlex Elder  */
301136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
301236be9a76SAlex Elder 			     const char *object_name,
301336be9a76SAlex Elder 			     const char *class_name,
301436be9a76SAlex Elder 			     const char *method_name,
30154157976bSAlex Elder 			     const void *outbound,
301636be9a76SAlex Elder 			     size_t outbound_size,
30174157976bSAlex Elder 			     void *inbound,
3018e2a58ee5SAlex Elder 			     size_t inbound_size)
301936be9a76SAlex Elder {
30202169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
302136be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
302236be9a76SAlex Elder 	struct page **pages;
302336be9a76SAlex Elder 	u32 page_count;
302436be9a76SAlex Elder 	int ret;
302536be9a76SAlex Elder 
302636be9a76SAlex Elder 	/*
30276010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
30286010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
30296010a451SAlex Elder 	 * also supply outbound data--parameters for the object
30306010a451SAlex Elder 	 * method.  Currently if this is present it will be a
30316010a451SAlex Elder 	 * snapshot id.
303236be9a76SAlex Elder 	 */
303336be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
303436be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
303536be9a76SAlex Elder 	if (IS_ERR(pages))
303636be9a76SAlex Elder 		return PTR_ERR(pages);
303736be9a76SAlex Elder 
303836be9a76SAlex Elder 	ret = -ENOMEM;
30396010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
304036be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
304136be9a76SAlex Elder 	if (!obj_request)
304236be9a76SAlex Elder 		goto out;
304336be9a76SAlex Elder 
304436be9a76SAlex Elder 	obj_request->pages = pages;
304536be9a76SAlex Elder 	obj_request->page_count = page_count;
304636be9a76SAlex Elder 
3047430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
304836be9a76SAlex Elder 	if (!obj_request->osd_req)
304936be9a76SAlex Elder 		goto out;
305036be9a76SAlex Elder 
3051c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
305204017e29SAlex Elder 					class_name, method_name);
305304017e29SAlex Elder 	if (outbound_size) {
305404017e29SAlex Elder 		struct ceph_pagelist *pagelist;
305504017e29SAlex Elder 
305604017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
305704017e29SAlex Elder 		if (!pagelist)
305804017e29SAlex Elder 			goto out;
305904017e29SAlex Elder 
306004017e29SAlex Elder 		ceph_pagelist_init(pagelist);
306104017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
306204017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
306304017e29SAlex Elder 						pagelist);
306404017e29SAlex Elder 	}
3065a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3066a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
306744cd188dSAlex Elder 					0, false, false);
30689d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3069430c28c3SAlex Elder 
307036be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
307136be9a76SAlex Elder 	if (ret)
307236be9a76SAlex Elder 		goto out;
307336be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
307436be9a76SAlex Elder 	if (ret)
307536be9a76SAlex Elder 		goto out;
307636be9a76SAlex Elder 
307736be9a76SAlex Elder 	ret = obj_request->result;
307836be9a76SAlex Elder 	if (ret < 0)
307936be9a76SAlex Elder 		goto out;
308057385b51SAlex Elder 
308157385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
308257385b51SAlex Elder 	ret = (int)obj_request->xferred;
3083903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
308436be9a76SAlex Elder out:
308536be9a76SAlex Elder 	if (obj_request)
308636be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
308736be9a76SAlex Elder 	else
308836be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
308936be9a76SAlex Elder 
309036be9a76SAlex Elder 	return ret;
309136be9a76SAlex Elder }
309236be9a76SAlex Elder 
3093bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
3094cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
3095bf0d5f50SAlex Elder {
3096bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
3097bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
3098bf0d5f50SAlex Elder 	struct request *rq;
3099bf0d5f50SAlex Elder 	int result;
3100bf0d5f50SAlex Elder 
3101bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
3102bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
3103bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
3104bf0d5f50SAlex Elder 		u64 offset;
3105bf0d5f50SAlex Elder 		u64 length;
3106bf0d5f50SAlex Elder 
3107bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
3108bf0d5f50SAlex Elder 
3109bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
31104dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
31114dda41d3SAlex Elder 				(int) rq->cmd_type);
31124dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
31134dda41d3SAlex Elder 			continue;
31144dda41d3SAlex Elder 		}
31154dda41d3SAlex Elder 
31164dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
31174dda41d3SAlex Elder 
31184dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
31194dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
31204dda41d3SAlex Elder 
31214dda41d3SAlex Elder 		if (!length) {
31224dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
3123bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
3124bf0d5f50SAlex Elder 			continue;
3125bf0d5f50SAlex Elder 		}
3126bf0d5f50SAlex Elder 
3127bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
3128bf0d5f50SAlex Elder 
3129bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
3130bf0d5f50SAlex Elder 
3131bf0d5f50SAlex Elder 		if (write_request) {
3132bf0d5f50SAlex Elder 			result = -EROFS;
3133bf0d5f50SAlex Elder 			if (read_only)
3134bf0d5f50SAlex Elder 				goto end_request;
3135bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3136bf0d5f50SAlex Elder 		}
3137bf0d5f50SAlex Elder 
31386d292906SAlex Elder 		/*
31396d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
31406d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
31416d292906SAlex Elder 		 * have disappeared by the time our request arrives
31426d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
31436d292906SAlex Elder 		 * we already know.
31446d292906SAlex Elder 		 */
31456d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3146bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
3147bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3148bf0d5f50SAlex Elder 			result = -ENXIO;
3149bf0d5f50SAlex Elder 			goto end_request;
3150bf0d5f50SAlex Elder 		}
3151bf0d5f50SAlex Elder 
3152bf0d5f50SAlex Elder 		result = -EINVAL;
3153c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
3154c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3155c0cd10dbSAlex Elder 				offset, length);
3156bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
3157c0cd10dbSAlex Elder 		}
3158bf0d5f50SAlex Elder 
315900a653e2SAlex Elder 		result = -EIO;
316000a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
316100a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
316200a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
316300a653e2SAlex Elder 			goto end_request;
316400a653e2SAlex Elder 		}
316500a653e2SAlex Elder 
3166bf0d5f50SAlex Elder 		result = -ENOMEM;
3167bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
3168e93f3152SAlex Elder 							write_request);
3169bf0d5f50SAlex Elder 		if (!img_request)
3170bf0d5f50SAlex Elder 			goto end_request;
3171bf0d5f50SAlex Elder 
3172bf0d5f50SAlex Elder 		img_request->rq = rq;
3173bf0d5f50SAlex Elder 
3174f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3175f1a4739fSAlex Elder 						rq->bio);
3176bf0d5f50SAlex Elder 		if (!result)
3177bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
3178bf0d5f50SAlex Elder 		if (result)
3179bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
3180bf0d5f50SAlex Elder end_request:
3181bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
3182bf0d5f50SAlex Elder 		if (result < 0) {
31837da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
31847da22d29SAlex Elder 				write_request ? "write" : "read",
31857da22d29SAlex Elder 				length, offset, result);
31867da22d29SAlex Elder 
3187bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
3188bf0d5f50SAlex Elder 		}
3189bf0d5f50SAlex Elder 	}
3190bf0d5f50SAlex Elder }
3191bf0d5f50SAlex Elder 
3192602adf40SYehuda Sadeh /*
3193602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3194602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3195f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3196602adf40SYehuda Sadeh  */
3197602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3198602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3199602adf40SYehuda Sadeh {
3200602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3201e5cfeed2SAlex Elder 	sector_t sector_offset;
3202e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3203e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3204e5cfeed2SAlex Elder 	int ret;
3205602adf40SYehuda Sadeh 
3206e5cfeed2SAlex Elder 	/*
3207e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3208e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3209e5cfeed2SAlex Elder 	 * device.
3210e5cfeed2SAlex Elder 	 */
3211e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3212e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3213e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3214593a9e7bSAlex Elder 
3215e5cfeed2SAlex Elder 	/*
3216e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3217e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3218e5cfeed2SAlex Elder 	 */
3219e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3220e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3221e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3222e5cfeed2SAlex Elder 	else
3223e5cfeed2SAlex Elder 		ret = 0;
3224e5cfeed2SAlex Elder 
3225e5cfeed2SAlex Elder 	/*
3226e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3227e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3228e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3229e5cfeed2SAlex Elder 	 * added to an empty bio."
3230e5cfeed2SAlex Elder 	 */
3231e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3232e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3233e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3234e5cfeed2SAlex Elder 
3235e5cfeed2SAlex Elder 	return ret;
3236602adf40SYehuda Sadeh }
3237602adf40SYehuda Sadeh 
3238602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3239602adf40SYehuda Sadeh {
3240602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3241602adf40SYehuda Sadeh 
3242602adf40SYehuda Sadeh 	if (!disk)
3243602adf40SYehuda Sadeh 		return;
3244602adf40SYehuda Sadeh 
3245a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3246a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3247602adf40SYehuda Sadeh 		del_gendisk(disk);
3248602adf40SYehuda Sadeh 		if (disk->queue)
3249602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
3250a0cab924SAlex Elder 	}
3251602adf40SYehuda Sadeh 	put_disk(disk);
3252602adf40SYehuda Sadeh }
3253602adf40SYehuda Sadeh 
3254788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3255788e2df3SAlex Elder 				const char *object_name,
32567097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3257788e2df3SAlex Elder 
3258788e2df3SAlex Elder {
32592169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3260788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3261788e2df3SAlex Elder 	struct page **pages = NULL;
3262788e2df3SAlex Elder 	u32 page_count;
32631ceae7efSAlex Elder 	size_t size;
3264788e2df3SAlex Elder 	int ret;
3265788e2df3SAlex Elder 
3266788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3267788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3268788e2df3SAlex Elder 	if (IS_ERR(pages))
3269788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3270788e2df3SAlex Elder 
3271788e2df3SAlex Elder 	ret = -ENOMEM;
3272788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3273788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3274788e2df3SAlex Elder 	if (!obj_request)
3275788e2df3SAlex Elder 		goto out;
3276788e2df3SAlex Elder 
3277788e2df3SAlex Elder 	obj_request->pages = pages;
3278788e2df3SAlex Elder 	obj_request->page_count = page_count;
3279788e2df3SAlex Elder 
3280430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3281788e2df3SAlex Elder 	if (!obj_request->osd_req)
3282788e2df3SAlex Elder 		goto out;
3283788e2df3SAlex Elder 
3284c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3285c99d2d4aSAlex Elder 					offset, length, 0, 0);
3286406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3287a4ce40a9SAlex Elder 					obj_request->pages,
328844cd188dSAlex Elder 					obj_request->length,
328944cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
329044cd188dSAlex Elder 					false, false);
32919d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3292430c28c3SAlex Elder 
3293788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3294788e2df3SAlex Elder 	if (ret)
3295788e2df3SAlex Elder 		goto out;
3296788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3297788e2df3SAlex Elder 	if (ret)
3298788e2df3SAlex Elder 		goto out;
3299788e2df3SAlex Elder 
3300788e2df3SAlex Elder 	ret = obj_request->result;
3301788e2df3SAlex Elder 	if (ret < 0)
3302788e2df3SAlex Elder 		goto out;
33031ceae7efSAlex Elder 
33041ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
33051ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3306903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
330723ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
330823ed6e13SAlex Elder 	ret = (int)size;
3309788e2df3SAlex Elder out:
3310788e2df3SAlex Elder 	if (obj_request)
3311788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3312788e2df3SAlex Elder 	else
3313788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3314788e2df3SAlex Elder 
3315788e2df3SAlex Elder 	return ret;
3316788e2df3SAlex Elder }
3317788e2df3SAlex Elder 
3318602adf40SYehuda Sadeh /*
3319662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3320662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3321662518b1SAlex Elder  * information about the image.
33224156d998SAlex Elder  */
332399a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
33244156d998SAlex Elder {
33254156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
33264156d998SAlex Elder 	u32 snap_count = 0;
33274156d998SAlex Elder 	u64 names_size = 0;
33284156d998SAlex Elder 	u32 want_count;
33294156d998SAlex Elder 	int ret;
33304156d998SAlex Elder 
33314156d998SAlex Elder 	/*
33324156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
33334156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
33344156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
33354156d998SAlex Elder 	 * the number of snapshots could change by the time we read
33364156d998SAlex Elder 	 * it in, in which case we re-read it.
33374156d998SAlex Elder 	 */
33384156d998SAlex Elder 	do {
33394156d998SAlex Elder 		size_t size;
33404156d998SAlex Elder 
33414156d998SAlex Elder 		kfree(ondisk);
33424156d998SAlex Elder 
33434156d998SAlex Elder 		size = sizeof (*ondisk);
33444156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
33454156d998SAlex Elder 		size += names_size;
33464156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
33474156d998SAlex Elder 		if (!ondisk)
3348662518b1SAlex Elder 			return -ENOMEM;
33494156d998SAlex Elder 
3350788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
33517097f8dfSAlex Elder 				       0, size, ondisk);
33524156d998SAlex Elder 		if (ret < 0)
3353662518b1SAlex Elder 			goto out;
3354c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
33554156d998SAlex Elder 			ret = -ENXIO;
335606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
335706ecc6cbSAlex Elder 				size, ret);
3358662518b1SAlex Elder 			goto out;
33594156d998SAlex Elder 		}
33604156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
33614156d998SAlex Elder 			ret = -ENXIO;
336206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3363662518b1SAlex Elder 			goto out;
33644156d998SAlex Elder 		}
33654156d998SAlex Elder 
33664156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
33674156d998SAlex Elder 		want_count = snap_count;
33684156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
33694156d998SAlex Elder 	} while (snap_count != want_count);
33704156d998SAlex Elder 
3371662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3372662518b1SAlex Elder out:
33734156d998SAlex Elder 	kfree(ondisk);
33744156d998SAlex Elder 
3375dfc5606dSYehuda Sadeh 	return ret;
3376602adf40SYehuda Sadeh }
3377602adf40SYehuda Sadeh 
337815228edeSAlex Elder /*
337915228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
338015228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
338115228edeSAlex Elder  */
338215228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
338315228edeSAlex Elder {
338415228edeSAlex Elder 	u64 snap_id;
338515228edeSAlex Elder 
338615228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
338715228edeSAlex Elder 		return;
338815228edeSAlex Elder 
338915228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
339015228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
339115228edeSAlex Elder 		return;
339215228edeSAlex Elder 
339315228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
339415228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
339515228edeSAlex Elder }
339615228edeSAlex Elder 
33979875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
33989875201eSJosh Durgin {
33999875201eSJosh Durgin 	sector_t size;
34009875201eSJosh Durgin 	bool removing;
34019875201eSJosh Durgin 
34029875201eSJosh Durgin 	/*
34039875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
34049875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
34059875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
34069875201eSJosh Durgin 	 */
34079875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
34089875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
34099875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
34109875201eSJosh Durgin 	/*
34119875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
34129875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
34139875201eSJosh Durgin 	 */
34149875201eSJosh Durgin 	if (!removing) {
34159875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
34169875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
34179875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
34189875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
34199875201eSJosh Durgin 	}
34209875201eSJosh Durgin }
34219875201eSJosh Durgin 
3422cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
34231fe5e993SAlex Elder {
3424e627db08SAlex Elder 	u64 mapping_size;
34251fe5e993SAlex Elder 	int ret;
34261fe5e993SAlex Elder 
3427117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3428cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
34293b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3430117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
343199a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
3432117973fbSAlex Elder 	else
34332df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
343415228edeSAlex Elder 
343515228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
343615228edeSAlex Elder 
343715228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
3438cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
3439cfbf6377SAlex Elder 
344000a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
34419875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
344200a653e2SAlex Elder 	}
34431fe5e993SAlex Elder 
34441fe5e993SAlex Elder 	return ret;
34451fe5e993SAlex Elder }
34461fe5e993SAlex Elder 
3447602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3448602adf40SYehuda Sadeh {
3449602adf40SYehuda Sadeh 	struct gendisk *disk;
3450602adf40SYehuda Sadeh 	struct request_queue *q;
3451593a9e7bSAlex Elder 	u64 segment_size;
3452602adf40SYehuda Sadeh 
3453602adf40SYehuda Sadeh 	/* create gendisk info */
34547e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
34557e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
34567e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3457602adf40SYehuda Sadeh 	if (!disk)
34581fcdb8aaSAlex Elder 		return -ENOMEM;
3459602adf40SYehuda Sadeh 
3460f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3461de71a297SAlex Elder 		 rbd_dev->dev_id);
3462602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3463dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
34647e513d43SIlya Dryomov 	if (single_major)
34657e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3466602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3467602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3468602adf40SYehuda Sadeh 
3469bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3470602adf40SYehuda Sadeh 	if (!q)
3471602adf40SYehuda Sadeh 		goto out_disk;
3472029bcbd8SJosh Durgin 
3473593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3474593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3475593a9e7bSAlex Elder 
3476029bcbd8SJosh Durgin 	/* set io sizes to object size */
3477593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3478593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3479593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3480593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3481593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3482029bcbd8SJosh Durgin 
3483602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3484602adf40SYehuda Sadeh 	disk->queue = q;
3485602adf40SYehuda Sadeh 
3486602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3487602adf40SYehuda Sadeh 
3488602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3489602adf40SYehuda Sadeh 
3490602adf40SYehuda Sadeh 	return 0;
3491602adf40SYehuda Sadeh out_disk:
3492602adf40SYehuda Sadeh 	put_disk(disk);
34931fcdb8aaSAlex Elder 
34941fcdb8aaSAlex Elder 	return -ENOMEM;
3495602adf40SYehuda Sadeh }
3496602adf40SYehuda Sadeh 
3497dfc5606dSYehuda Sadeh /*
3498dfc5606dSYehuda Sadeh   sysfs
3499dfc5606dSYehuda Sadeh */
3500602adf40SYehuda Sadeh 
3501593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3502593a9e7bSAlex Elder {
3503593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3504593a9e7bSAlex Elder }
3505593a9e7bSAlex Elder 
3506dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3507dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3508602adf40SYehuda Sadeh {
3509593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3510dfc5606dSYehuda Sadeh 
3511fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3512fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3513602adf40SYehuda Sadeh }
3514602adf40SYehuda Sadeh 
351534b13184SAlex Elder /*
351634b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
351734b13184SAlex Elder  * necessarily the base image.
351834b13184SAlex Elder  */
351934b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
352034b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
352134b13184SAlex Elder {
352234b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
352334b13184SAlex Elder 
352434b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
352534b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
352634b13184SAlex Elder }
352734b13184SAlex Elder 
3528dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3529dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3530602adf40SYehuda Sadeh {
3531593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3532dfc5606dSYehuda Sadeh 
3533fc71d833SAlex Elder 	if (rbd_dev->major)
3534dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3535fc71d833SAlex Elder 
3536fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3537dd82fff1SIlya Dryomov }
3538fc71d833SAlex Elder 
3539dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3540dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3541dd82fff1SIlya Dryomov {
3542dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3543dd82fff1SIlya Dryomov 
3544dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3545dfc5606dSYehuda Sadeh }
3546dfc5606dSYehuda Sadeh 
3547dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3548dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3549dfc5606dSYehuda Sadeh {
3550593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3551dfc5606dSYehuda Sadeh 
35521dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
35531dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3554dfc5606dSYehuda Sadeh }
3555dfc5606dSYehuda Sadeh 
3556dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3557dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3558dfc5606dSYehuda Sadeh {
3559593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3560dfc5606dSYehuda Sadeh 
35610d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3562dfc5606dSYehuda Sadeh }
3563dfc5606dSYehuda Sadeh 
35649bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
35659bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
35669bb2f334SAlex Elder {
35679bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
35689bb2f334SAlex Elder 
35690d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
35700d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
35719bb2f334SAlex Elder }
35729bb2f334SAlex Elder 
3573dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3574dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3575dfc5606dSYehuda Sadeh {
3576593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3577dfc5606dSYehuda Sadeh 
3578a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
35790d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3580a92ffdf8SAlex Elder 
3581a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3582dfc5606dSYehuda Sadeh }
3583dfc5606dSYehuda Sadeh 
3584589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3585589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3586589d30e0SAlex Elder {
3587589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3588589d30e0SAlex Elder 
35890d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3590589d30e0SAlex Elder }
3591589d30e0SAlex Elder 
359234b13184SAlex Elder /*
359334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
359434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
359534b13184SAlex Elder  */
3596dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3597dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3598dfc5606dSYehuda Sadeh 			     char *buf)
3599dfc5606dSYehuda Sadeh {
3600593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3601dfc5606dSYehuda Sadeh 
36020d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3603dfc5606dSYehuda Sadeh }
3604dfc5606dSYehuda Sadeh 
360586b00e0dSAlex Elder /*
360686b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
360786b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
360886b00e0dSAlex Elder  * "(no parent image)".
360986b00e0dSAlex Elder  */
361086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
361186b00e0dSAlex Elder 			     struct device_attribute *attr,
361286b00e0dSAlex Elder 			     char *buf)
361386b00e0dSAlex Elder {
361486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
361586b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
361686b00e0dSAlex Elder 	int count;
361786b00e0dSAlex Elder 	char *bufp = buf;
361886b00e0dSAlex Elder 
361986b00e0dSAlex Elder 	if (!spec)
362086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
362186b00e0dSAlex Elder 
362286b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
362386b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
362486b00e0dSAlex Elder 	if (count < 0)
362586b00e0dSAlex Elder 		return count;
362686b00e0dSAlex Elder 	bufp += count;
362786b00e0dSAlex Elder 
362886b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
362986b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
363086b00e0dSAlex Elder 	if (count < 0)
363186b00e0dSAlex Elder 		return count;
363286b00e0dSAlex Elder 	bufp += count;
363386b00e0dSAlex Elder 
363486b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
363586b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
363686b00e0dSAlex Elder 	if (count < 0)
363786b00e0dSAlex Elder 		return count;
363886b00e0dSAlex Elder 	bufp += count;
363986b00e0dSAlex Elder 
364086b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
364186b00e0dSAlex Elder 	if (count < 0)
364286b00e0dSAlex Elder 		return count;
364386b00e0dSAlex Elder 	bufp += count;
364486b00e0dSAlex Elder 
364586b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
364686b00e0dSAlex Elder }
364786b00e0dSAlex Elder 
3648dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3649dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3650dfc5606dSYehuda Sadeh 				 const char *buf,
3651dfc5606dSYehuda Sadeh 				 size_t size)
3652dfc5606dSYehuda Sadeh {
3653593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3654b813623aSAlex Elder 	int ret;
3655602adf40SYehuda Sadeh 
3656cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3657e627db08SAlex Elder 	if (ret)
3658e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3659b813623aSAlex Elder 
3660b813623aSAlex Elder 	return ret < 0 ? ret : size;
3661dfc5606dSYehuda Sadeh }
3662602adf40SYehuda Sadeh 
3663dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
366434b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3665dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3666dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
3667dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3668dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
36699bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3670dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3671589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3672dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3673dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
367486b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3675dfc5606dSYehuda Sadeh 
3676dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3677dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
367834b13184SAlex Elder 	&dev_attr_features.attr,
3679dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3680dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
3681dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3682dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
36839bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3684dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3685589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3686dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
368786b00e0dSAlex Elder 	&dev_attr_parent.attr,
3688dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3689dfc5606dSYehuda Sadeh 	NULL
3690dfc5606dSYehuda Sadeh };
3691dfc5606dSYehuda Sadeh 
3692dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3693dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3694dfc5606dSYehuda Sadeh };
3695dfc5606dSYehuda Sadeh 
3696dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3697dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3698dfc5606dSYehuda Sadeh 	NULL
3699dfc5606dSYehuda Sadeh };
3700dfc5606dSYehuda Sadeh 
3701dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
3702dfc5606dSYehuda Sadeh {
3703dfc5606dSYehuda Sadeh }
3704dfc5606dSYehuda Sadeh 
3705dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3706dfc5606dSYehuda Sadeh 	.name		= "rbd",
3707dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3708dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3709dfc5606dSYehuda Sadeh };
3710dfc5606dSYehuda Sadeh 
37118b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
37128b8fb99cSAlex Elder {
37138b8fb99cSAlex Elder 	kref_get(&spec->kref);
37148b8fb99cSAlex Elder 
37158b8fb99cSAlex Elder 	return spec;
37168b8fb99cSAlex Elder }
37178b8fb99cSAlex Elder 
37188b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
37198b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
37208b8fb99cSAlex Elder {
37218b8fb99cSAlex Elder 	if (spec)
37228b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
37238b8fb99cSAlex Elder }
37248b8fb99cSAlex Elder 
37258b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
37268b8fb99cSAlex Elder {
37278b8fb99cSAlex Elder 	struct rbd_spec *spec;
37288b8fb99cSAlex Elder 
37298b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
37308b8fb99cSAlex Elder 	if (!spec)
37318b8fb99cSAlex Elder 		return NULL;
37328b8fb99cSAlex Elder 	kref_init(&spec->kref);
37338b8fb99cSAlex Elder 
37348b8fb99cSAlex Elder 	return spec;
37358b8fb99cSAlex Elder }
37368b8fb99cSAlex Elder 
37378b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
37388b8fb99cSAlex Elder {
37398b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
37408b8fb99cSAlex Elder 
37418b8fb99cSAlex Elder 	kfree(spec->pool_name);
37428b8fb99cSAlex Elder 	kfree(spec->image_id);
37438b8fb99cSAlex Elder 	kfree(spec->image_name);
37448b8fb99cSAlex Elder 	kfree(spec->snap_name);
37458b8fb99cSAlex Elder 	kfree(spec);
37468b8fb99cSAlex Elder }
37478b8fb99cSAlex Elder 
3748cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3749c53d5893SAlex Elder 				struct rbd_spec *spec)
3750c53d5893SAlex Elder {
3751c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3752c53d5893SAlex Elder 
3753c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3754c53d5893SAlex Elder 	if (!rbd_dev)
3755c53d5893SAlex Elder 		return NULL;
3756c53d5893SAlex Elder 
3757c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
37586d292906SAlex Elder 	rbd_dev->flags = 0;
3759a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
3760c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3761c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3762c53d5893SAlex Elder 
3763c53d5893SAlex Elder 	rbd_dev->spec = spec;
3764c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3765c53d5893SAlex Elder 
37660903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
37670903e875SAlex Elder 
37680903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
37690903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
37700903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
37710903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
37720903e875SAlex Elder 
3773c53d5893SAlex Elder 	return rbd_dev;
3774c53d5893SAlex Elder }
3775c53d5893SAlex Elder 
3776c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3777c53d5893SAlex Elder {
3778c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3779c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3780c53d5893SAlex Elder 	kfree(rbd_dev);
3781c53d5893SAlex Elder }
3782c53d5893SAlex Elder 
3783dfc5606dSYehuda Sadeh /*
37849d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
37859d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
37869d475de5SAlex Elder  * image.
37879d475de5SAlex Elder  */
37889d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
37899d475de5SAlex Elder 				u8 *order, u64 *snap_size)
37909d475de5SAlex Elder {
37919d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
37929d475de5SAlex Elder 	int ret;
37939d475de5SAlex Elder 	struct {
37949d475de5SAlex Elder 		u8 order;
37959d475de5SAlex Elder 		__le64 size;
37969d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
37979d475de5SAlex Elder 
379836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
37999d475de5SAlex Elder 				"rbd", "get_size",
38004157976bSAlex Elder 				&snapid, sizeof (snapid),
3801e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
380236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
38039d475de5SAlex Elder 	if (ret < 0)
38049d475de5SAlex Elder 		return ret;
380557385b51SAlex Elder 	if (ret < sizeof (size_buf))
380657385b51SAlex Elder 		return -ERANGE;
38079d475de5SAlex Elder 
3808c3545579SJosh Durgin 	if (order) {
38099d475de5SAlex Elder 		*order = size_buf.order;
3810c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
3811c3545579SJosh Durgin 	}
38129d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
38139d475de5SAlex Elder 
3814c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
3815c3545579SJosh Durgin 		(unsigned long long)snap_id,
38169d475de5SAlex Elder 		(unsigned long long)*snap_size);
38179d475de5SAlex Elder 
38189d475de5SAlex Elder 	return 0;
38199d475de5SAlex Elder }
38209d475de5SAlex Elder 
38219d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
38229d475de5SAlex Elder {
38239d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
38249d475de5SAlex Elder 					&rbd_dev->header.obj_order,
38259d475de5SAlex Elder 					&rbd_dev->header.image_size);
38269d475de5SAlex Elder }
38279d475de5SAlex Elder 
38281e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
38291e130199SAlex Elder {
38301e130199SAlex Elder 	void *reply_buf;
38311e130199SAlex Elder 	int ret;
38321e130199SAlex Elder 	void *p;
38331e130199SAlex Elder 
38341e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
38351e130199SAlex Elder 	if (!reply_buf)
38361e130199SAlex Elder 		return -ENOMEM;
38371e130199SAlex Elder 
383836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38394157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3840e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
384136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
38421e130199SAlex Elder 	if (ret < 0)
38431e130199SAlex Elder 		goto out;
38441e130199SAlex Elder 
38451e130199SAlex Elder 	p = reply_buf;
38461e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
384757385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
384857385b51SAlex Elder 	ret = 0;
38491e130199SAlex Elder 
38501e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
38511e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
38521e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
38531e130199SAlex Elder 	} else {
38541e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
38551e130199SAlex Elder 	}
38561e130199SAlex Elder out:
38571e130199SAlex Elder 	kfree(reply_buf);
38581e130199SAlex Elder 
38591e130199SAlex Elder 	return ret;
38601e130199SAlex Elder }
38611e130199SAlex Elder 
3862b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3863b1b5402aSAlex Elder 		u64 *snap_features)
3864b1b5402aSAlex Elder {
3865b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3866b1b5402aSAlex Elder 	struct {
3867b1b5402aSAlex Elder 		__le64 features;
3868b1b5402aSAlex Elder 		__le64 incompat;
38694157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3870d889140cSAlex Elder 	u64 incompat;
3871b1b5402aSAlex Elder 	int ret;
3872b1b5402aSAlex Elder 
387336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3874b1b5402aSAlex Elder 				"rbd", "get_features",
38754157976bSAlex Elder 				&snapid, sizeof (snapid),
3876e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
387736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3878b1b5402aSAlex Elder 	if (ret < 0)
3879b1b5402aSAlex Elder 		return ret;
388057385b51SAlex Elder 	if (ret < sizeof (features_buf))
388157385b51SAlex Elder 		return -ERANGE;
3882d889140cSAlex Elder 
3883d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
38845cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3885b8f5c6edSAlex Elder 		return -ENXIO;
3886d889140cSAlex Elder 
3887b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3888b1b5402aSAlex Elder 
3889b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3890b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3891b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3892b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3893b1b5402aSAlex Elder 
3894b1b5402aSAlex Elder 	return 0;
3895b1b5402aSAlex Elder }
3896b1b5402aSAlex Elder 
3897b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3898b1b5402aSAlex Elder {
3899b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3900b1b5402aSAlex Elder 						&rbd_dev->header.features);
3901b1b5402aSAlex Elder }
3902b1b5402aSAlex Elder 
390386b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
390486b00e0dSAlex Elder {
390586b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
390686b00e0dSAlex Elder 	size_t size;
390786b00e0dSAlex Elder 	void *reply_buf = NULL;
390886b00e0dSAlex Elder 	__le64 snapid;
390986b00e0dSAlex Elder 	void *p;
391086b00e0dSAlex Elder 	void *end;
3911642a2537SAlex Elder 	u64 pool_id;
391286b00e0dSAlex Elder 	char *image_id;
39133b5cf2a2SAlex Elder 	u64 snap_id;
391486b00e0dSAlex Elder 	u64 overlap;
391586b00e0dSAlex Elder 	int ret;
391686b00e0dSAlex Elder 
391786b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
391886b00e0dSAlex Elder 	if (!parent_spec)
391986b00e0dSAlex Elder 		return -ENOMEM;
392086b00e0dSAlex Elder 
392186b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
392286b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
392386b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
392486b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
392586b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
392686b00e0dSAlex Elder 	if (!reply_buf) {
392786b00e0dSAlex Elder 		ret = -ENOMEM;
392886b00e0dSAlex Elder 		goto out_err;
392986b00e0dSAlex Elder 	}
393086b00e0dSAlex Elder 
393186b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
393236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
393386b00e0dSAlex Elder 				"rbd", "get_parent",
39344157976bSAlex Elder 				&snapid, sizeof (snapid),
3935e2a58ee5SAlex Elder 				reply_buf, size);
393636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
393786b00e0dSAlex Elder 	if (ret < 0)
393886b00e0dSAlex Elder 		goto out_err;
393986b00e0dSAlex Elder 
394086b00e0dSAlex Elder 	p = reply_buf;
394157385b51SAlex Elder 	end = reply_buf + ret;
394257385b51SAlex Elder 	ret = -ERANGE;
3943642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
3944392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
3945392a9dadSAlex Elder 		/*
3946392a9dadSAlex Elder 		 * Either the parent never existed, or we have
3947392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
3948392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
3949392a9dadSAlex Elder 		 * layered image disappears we immediately set the
3950392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
3951392a9dadSAlex Elder 		 * requests will be treated as if the image had no
3952392a9dadSAlex Elder 		 * parent.
3953392a9dadSAlex Elder 		 */
3954392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
3955392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
3956392a9dadSAlex Elder 			smp_mb();
3957392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
3958392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
3959392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
3960392a9dadSAlex Elder 		}
3961392a9dadSAlex Elder 
396286b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
3963392a9dadSAlex Elder 	}
396486b00e0dSAlex Elder 
39650903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
39660903e875SAlex Elder 
39670903e875SAlex Elder 	ret = -EIO;
3968642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
3969c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3970642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
397157385b51SAlex Elder 		goto out_err;
3972c0cd10dbSAlex Elder 	}
39730903e875SAlex Elder 
3974979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
397586b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
397686b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
397786b00e0dSAlex Elder 		goto out_err;
397886b00e0dSAlex Elder 	}
39793b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
398086b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
398186b00e0dSAlex Elder 
39823b5cf2a2SAlex Elder 	/*
39833b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
39843b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
39853b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
39863b5cf2a2SAlex Elder 	 */
39873b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
39883b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
39893b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
39903b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
399186b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
399286b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
39933b5cf2a2SAlex Elder 	}
39943b5cf2a2SAlex Elder 
39953b5cf2a2SAlex Elder 	/*
39963b5cf2a2SAlex Elder 	 * We always update the parent overlap.  If it's zero we
39973b5cf2a2SAlex Elder 	 * treat it specially.
39983b5cf2a2SAlex Elder 	 */
399970cf49cfSAlex Elder 	rbd_dev->parent_overlap = overlap;
40003b5cf2a2SAlex Elder 	smp_mb();
40013b5cf2a2SAlex Elder 	if (!overlap) {
40023b5cf2a2SAlex Elder 
40033b5cf2a2SAlex Elder 		/* A null parent_spec indicates it's the initial probe */
40043b5cf2a2SAlex Elder 
40053b5cf2a2SAlex Elder 		if (parent_spec) {
40063b5cf2a2SAlex Elder 			/*
40073b5cf2a2SAlex Elder 			 * The overlap has become zero, so the clone
40083b5cf2a2SAlex Elder 			 * must have been resized down to 0 at some
40093b5cf2a2SAlex Elder 			 * point.  Treat this the same as a flatten.
40103b5cf2a2SAlex Elder 			 */
40113b5cf2a2SAlex Elder 			rbd_dev_parent_put(rbd_dev);
40123b5cf2a2SAlex Elder 			pr_info("%s: clone image now standalone\n",
40133b5cf2a2SAlex Elder 				rbd_dev->disk->disk_name);
401470cf49cfSAlex Elder 		} else {
40153b5cf2a2SAlex Elder 			/*
40163b5cf2a2SAlex Elder 			 * For the initial probe, if we find the
40173b5cf2a2SAlex Elder 			 * overlap is zero we just pretend there was
40183b5cf2a2SAlex Elder 			 * no parent image.
40193b5cf2a2SAlex Elder 			 */
40203b5cf2a2SAlex Elder 			rbd_warn(rbd_dev, "ignoring parent of "
40213b5cf2a2SAlex Elder 						"clone with overlap 0\n");
40223b5cf2a2SAlex Elder 		}
402370cf49cfSAlex Elder 	}
402486b00e0dSAlex Elder out:
402586b00e0dSAlex Elder 	ret = 0;
402686b00e0dSAlex Elder out_err:
402786b00e0dSAlex Elder 	kfree(reply_buf);
402886b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
402986b00e0dSAlex Elder 
403086b00e0dSAlex Elder 	return ret;
403186b00e0dSAlex Elder }
403286b00e0dSAlex Elder 
4033cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4034cc070d59SAlex Elder {
4035cc070d59SAlex Elder 	struct {
4036cc070d59SAlex Elder 		__le64 stripe_unit;
4037cc070d59SAlex Elder 		__le64 stripe_count;
4038cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4039cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4040cc070d59SAlex Elder 	void *p;
4041cc070d59SAlex Elder 	u64 obj_size;
4042cc070d59SAlex Elder 	u64 stripe_unit;
4043cc070d59SAlex Elder 	u64 stripe_count;
4044cc070d59SAlex Elder 	int ret;
4045cc070d59SAlex Elder 
4046cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4047cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4048e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4049cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4050cc070d59SAlex Elder 	if (ret < 0)
4051cc070d59SAlex Elder 		return ret;
4052cc070d59SAlex Elder 	if (ret < size)
4053cc070d59SAlex Elder 		return -ERANGE;
4054cc070d59SAlex Elder 
4055cc070d59SAlex Elder 	/*
4056cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4057cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4058cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4059cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4060cc070d59SAlex Elder 	 */
4061cc070d59SAlex Elder 	ret = -EINVAL;
4062cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4063cc070d59SAlex Elder 	p = &striping_info_buf;
4064cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4065cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4066cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4067cc070d59SAlex Elder 				"(got %llu want %llu)",
4068cc070d59SAlex Elder 				stripe_unit, obj_size);
4069cc070d59SAlex Elder 		return -EINVAL;
4070cc070d59SAlex Elder 	}
4071cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4072cc070d59SAlex Elder 	if (stripe_count != 1) {
4073cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4074cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4075cc070d59SAlex Elder 		return -EINVAL;
4076cc070d59SAlex Elder 	}
4077500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4078500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4079cc070d59SAlex Elder 
4080cc070d59SAlex Elder 	return 0;
4081cc070d59SAlex Elder }
4082cc070d59SAlex Elder 
40839e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
40849e15b77dSAlex Elder {
40859e15b77dSAlex Elder 	size_t image_id_size;
40869e15b77dSAlex Elder 	char *image_id;
40879e15b77dSAlex Elder 	void *p;
40889e15b77dSAlex Elder 	void *end;
40899e15b77dSAlex Elder 	size_t size;
40909e15b77dSAlex Elder 	void *reply_buf = NULL;
40919e15b77dSAlex Elder 	size_t len = 0;
40929e15b77dSAlex Elder 	char *image_name = NULL;
40939e15b77dSAlex Elder 	int ret;
40949e15b77dSAlex Elder 
40959e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
40969e15b77dSAlex Elder 
409769e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
409869e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
40999e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
41009e15b77dSAlex Elder 	if (!image_id)
41019e15b77dSAlex Elder 		return NULL;
41029e15b77dSAlex Elder 
41039e15b77dSAlex Elder 	p = image_id;
41044157976bSAlex Elder 	end = image_id + image_id_size;
410569e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
41069e15b77dSAlex Elder 
41079e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
41089e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
41099e15b77dSAlex Elder 	if (!reply_buf)
41109e15b77dSAlex Elder 		goto out;
41119e15b77dSAlex Elder 
411236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
41139e15b77dSAlex Elder 				"rbd", "dir_get_name",
41149e15b77dSAlex Elder 				image_id, image_id_size,
4115e2a58ee5SAlex Elder 				reply_buf, size);
41169e15b77dSAlex Elder 	if (ret < 0)
41179e15b77dSAlex Elder 		goto out;
41189e15b77dSAlex Elder 	p = reply_buf;
4119f40eb349SAlex Elder 	end = reply_buf + ret;
4120f40eb349SAlex Elder 
41219e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
41229e15b77dSAlex Elder 	if (IS_ERR(image_name))
41239e15b77dSAlex Elder 		image_name = NULL;
41249e15b77dSAlex Elder 	else
41259e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
41269e15b77dSAlex Elder out:
41279e15b77dSAlex Elder 	kfree(reply_buf);
41289e15b77dSAlex Elder 	kfree(image_id);
41299e15b77dSAlex Elder 
41309e15b77dSAlex Elder 	return image_name;
41319e15b77dSAlex Elder }
41329e15b77dSAlex Elder 
41332ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
41342ad3d716SAlex Elder {
41352ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
41362ad3d716SAlex Elder 	const char *snap_name;
41372ad3d716SAlex Elder 	u32 which = 0;
41382ad3d716SAlex Elder 
41392ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
41402ad3d716SAlex Elder 
41412ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
41422ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
41432ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
41442ad3d716SAlex Elder 			return snapc->snaps[which];
41452ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
41462ad3d716SAlex Elder 		which++;
41472ad3d716SAlex Elder 	}
41482ad3d716SAlex Elder 	return CEPH_NOSNAP;
41492ad3d716SAlex Elder }
41502ad3d716SAlex Elder 
41512ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
41522ad3d716SAlex Elder {
41532ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
41542ad3d716SAlex Elder 	u32 which;
41552ad3d716SAlex Elder 	bool found = false;
41562ad3d716SAlex Elder 	u64 snap_id;
41572ad3d716SAlex Elder 
41582ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
41592ad3d716SAlex Elder 		const char *snap_name;
41602ad3d716SAlex Elder 
41612ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
41622ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4163efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4164efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4165efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4166efadc98aSJosh Durgin 				continue;
4167efadc98aSJosh Durgin 			else
41682ad3d716SAlex Elder 				break;
4169efadc98aSJosh Durgin 		}
41702ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
41712ad3d716SAlex Elder 		kfree(snap_name);
41722ad3d716SAlex Elder 	}
41732ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
41742ad3d716SAlex Elder }
41752ad3d716SAlex Elder 
41762ad3d716SAlex Elder /*
41772ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
41782ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
41792ad3d716SAlex Elder  */
41802ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
41812ad3d716SAlex Elder {
41822ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
41832ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
41842ad3d716SAlex Elder 
41852ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
41862ad3d716SAlex Elder }
41872ad3d716SAlex Elder 
41889e15b77dSAlex Elder /*
41892e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
41902e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
41912e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
41922e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
41932e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
41942e9f7f1cSAlex Elder  * allocated.
4195e1d4213fSAlex Elder  *
4196e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
4197e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
4198e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
41999e15b77dSAlex Elder  */
42002e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
42019e15b77dSAlex Elder {
42022e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
42032e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
42042e9f7f1cSAlex Elder 	const char *pool_name;
42052e9f7f1cSAlex Elder 	const char *image_name;
42062e9f7f1cSAlex Elder 	const char *snap_name;
42079e15b77dSAlex Elder 	int ret;
42089e15b77dSAlex Elder 
4209e1d4213fSAlex Elder 	/*
4210e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
4211e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
4212e1d4213fSAlex Elder 	 */
42132e9f7f1cSAlex Elder 	if (spec->pool_name) {
42142e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
42152ad3d716SAlex Elder 			u64 snap_id;
4216e1d4213fSAlex Elder 
42172ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
42182ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
4219e1d4213fSAlex Elder 				return -ENOENT;
42202ad3d716SAlex Elder 			spec->snap_id = snap_id;
4221e1d4213fSAlex Elder 		} else {
42222e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
4223e1d4213fSAlex Elder 		}
4224e1d4213fSAlex Elder 
4225e1d4213fSAlex Elder 		return 0;
4226e1d4213fSAlex Elder 	}
42279e15b77dSAlex Elder 
42282e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
42299e15b77dSAlex Elder 
42302e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
42312e9f7f1cSAlex Elder 	if (!pool_name) {
42322e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4233935dc89fSAlex Elder 		return -EIO;
4234935dc89fSAlex Elder 	}
42352e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
42362e9f7f1cSAlex Elder 	if (!pool_name)
42379e15b77dSAlex Elder 		return -ENOMEM;
42389e15b77dSAlex Elder 
42399e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
42409e15b77dSAlex Elder 
42412e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
42422e9f7f1cSAlex Elder 	if (!image_name)
424306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
42449e15b77dSAlex Elder 
42452e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
42469e15b77dSAlex Elder 
42472e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4248da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4249da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
42509e15b77dSAlex Elder 		goto out_err;
42512e9f7f1cSAlex Elder 	}
42522e9f7f1cSAlex Elder 
42532e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
42542e9f7f1cSAlex Elder 	spec->image_name = image_name;
42552e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
42569e15b77dSAlex Elder 
42579e15b77dSAlex Elder 	return 0;
42589e15b77dSAlex Elder out_err:
42592e9f7f1cSAlex Elder 	kfree(image_name);
42602e9f7f1cSAlex Elder 	kfree(pool_name);
42619e15b77dSAlex Elder 
42629e15b77dSAlex Elder 	return ret;
42639e15b77dSAlex Elder }
42649e15b77dSAlex Elder 
4265cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
426635d489f9SAlex Elder {
426735d489f9SAlex Elder 	size_t size;
426835d489f9SAlex Elder 	int ret;
426935d489f9SAlex Elder 	void *reply_buf;
427035d489f9SAlex Elder 	void *p;
427135d489f9SAlex Elder 	void *end;
427235d489f9SAlex Elder 	u64 seq;
427335d489f9SAlex Elder 	u32 snap_count;
427435d489f9SAlex Elder 	struct ceph_snap_context *snapc;
427535d489f9SAlex Elder 	u32 i;
427635d489f9SAlex Elder 
427735d489f9SAlex Elder 	/*
427835d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
427935d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
428035d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
428135d489f9SAlex Elder 	 * prepared to receive.
428235d489f9SAlex Elder 	 */
428335d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
428435d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
428535d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
428635d489f9SAlex Elder 	if (!reply_buf)
428735d489f9SAlex Elder 		return -ENOMEM;
428835d489f9SAlex Elder 
428936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
42904157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4291e2a58ee5SAlex Elder 				reply_buf, size);
429236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
429335d489f9SAlex Elder 	if (ret < 0)
429435d489f9SAlex Elder 		goto out;
429535d489f9SAlex Elder 
429635d489f9SAlex Elder 	p = reply_buf;
429757385b51SAlex Elder 	end = reply_buf + ret;
429857385b51SAlex Elder 	ret = -ERANGE;
429935d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
430035d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
430135d489f9SAlex Elder 
430235d489f9SAlex Elder 	/*
430335d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
430435d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
430535d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
430635d489f9SAlex Elder 	 * allocate is representable in a size_t.
430735d489f9SAlex Elder 	 */
430835d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
430935d489f9SAlex Elder 				 / sizeof (u64)) {
431035d489f9SAlex Elder 		ret = -EINVAL;
431135d489f9SAlex Elder 		goto out;
431235d489f9SAlex Elder 	}
431335d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
431435d489f9SAlex Elder 		goto out;
4315468521c1SAlex Elder 	ret = 0;
431635d489f9SAlex Elder 
4317812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
431835d489f9SAlex Elder 	if (!snapc) {
431935d489f9SAlex Elder 		ret = -ENOMEM;
432035d489f9SAlex Elder 		goto out;
432135d489f9SAlex Elder 	}
432235d489f9SAlex Elder 	snapc->seq = seq;
432335d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
432435d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
432535d489f9SAlex Elder 
432649ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
432735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
432835d489f9SAlex Elder 
432935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
433035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
433135d489f9SAlex Elder out:
433235d489f9SAlex Elder 	kfree(reply_buf);
433335d489f9SAlex Elder 
433457385b51SAlex Elder 	return ret;
433535d489f9SAlex Elder }
433635d489f9SAlex Elder 
433754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
433854cac61fSAlex Elder 					u64 snap_id)
4339b8b1e2dbSAlex Elder {
4340b8b1e2dbSAlex Elder 	size_t size;
4341b8b1e2dbSAlex Elder 	void *reply_buf;
434254cac61fSAlex Elder 	__le64 snapid;
4343b8b1e2dbSAlex Elder 	int ret;
4344b8b1e2dbSAlex Elder 	void *p;
4345b8b1e2dbSAlex Elder 	void *end;
4346b8b1e2dbSAlex Elder 	char *snap_name;
4347b8b1e2dbSAlex Elder 
4348b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4349b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4350b8b1e2dbSAlex Elder 	if (!reply_buf)
4351b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4352b8b1e2dbSAlex Elder 
435354cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
435436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4355b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
435654cac61fSAlex Elder 				&snapid, sizeof (snapid),
4357e2a58ee5SAlex Elder 				reply_buf, size);
435836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4359f40eb349SAlex Elder 	if (ret < 0) {
4360f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4361b8b1e2dbSAlex Elder 		goto out;
4362f40eb349SAlex Elder 	}
4363b8b1e2dbSAlex Elder 
4364b8b1e2dbSAlex Elder 	p = reply_buf;
4365f40eb349SAlex Elder 	end = reply_buf + ret;
4366e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4367f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4368b8b1e2dbSAlex Elder 		goto out;
4369f40eb349SAlex Elder 
4370b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
437154cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4372b8b1e2dbSAlex Elder out:
4373b8b1e2dbSAlex Elder 	kfree(reply_buf);
4374b8b1e2dbSAlex Elder 
4375f40eb349SAlex Elder 	return snap_name;
4376b8b1e2dbSAlex Elder }
4377b8b1e2dbSAlex Elder 
43782df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4379117973fbSAlex Elder {
43802df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4381117973fbSAlex Elder 	int ret;
4382117973fbSAlex Elder 
43831617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
43841617e40cSJosh Durgin 	if (ret)
4385cfbf6377SAlex Elder 		return ret;
43861617e40cSJosh Durgin 
43872df3fac7SAlex Elder 	if (first_time) {
43882df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
43892df3fac7SAlex Elder 		if (ret)
4390cfbf6377SAlex Elder 			return ret;
43912df3fac7SAlex Elder 	}
43922df3fac7SAlex Elder 
4393642a2537SAlex Elder 	/*
4394642a2537SAlex Elder 	 * If the image supports layering, get the parent info.  We
4395642a2537SAlex Elder 	 * need to probe the first time regardless.  Thereafter we
4396642a2537SAlex Elder 	 * only need to if there's a parent, to see if it has
4397642a2537SAlex Elder 	 * disappeared due to the mapped image getting flattened.
4398642a2537SAlex Elder 	 */
4399642a2537SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4400642a2537SAlex Elder 			(first_time || rbd_dev->parent_spec)) {
4401642a2537SAlex Elder 		bool warn;
4402642a2537SAlex Elder 
4403642a2537SAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
4404642a2537SAlex Elder 		if (ret)
4405cfbf6377SAlex Elder 			return ret;
4406642a2537SAlex Elder 
4407642a2537SAlex Elder 		/*
4408642a2537SAlex Elder 		 * Print a warning if this is the initial probe and
4409642a2537SAlex Elder 		 * the image has a parent.  Don't print it if the
4410642a2537SAlex Elder 		 * image now being probed is itself a parent.  We
4411642a2537SAlex Elder 		 * can tell at this point because we won't know its
4412642a2537SAlex Elder 		 * pool name yet (just its pool id).
4413642a2537SAlex Elder 		 */
4414642a2537SAlex Elder 		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4415642a2537SAlex Elder 		if (first_time && warn)
4416642a2537SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
4417642a2537SAlex Elder 					"is EXPERIMENTAL!");
4418642a2537SAlex Elder 	}
4419642a2537SAlex Elder 
442029334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
442129334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
442229334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4423117973fbSAlex Elder 
4424cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4425117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4426117973fbSAlex Elder 
4427117973fbSAlex Elder 	return ret;
4428117973fbSAlex Elder }
4429117973fbSAlex Elder 
4430dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4431dfc5606dSYehuda Sadeh {
4432dfc5606dSYehuda Sadeh 	struct device *dev;
4433cd789ab9SAlex Elder 	int ret;
4434dfc5606dSYehuda Sadeh 
4435cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4436dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4437dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4438dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4439200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4440de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4441dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4442dfc5606dSYehuda Sadeh 
4443dfc5606dSYehuda Sadeh 	return ret;
4444602adf40SYehuda Sadeh }
4445602adf40SYehuda Sadeh 
4446dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4447dfc5606dSYehuda Sadeh {
4448dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4449dfc5606dSYehuda Sadeh }
4450dfc5606dSYehuda Sadeh 
44511ddbe94eSAlex Elder /*
4452499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4453f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
44541ddbe94eSAlex Elder  */
4455f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4456b7f23c36SAlex Elder {
4457f8a22fc2SIlya Dryomov 	int new_dev_id;
4458f8a22fc2SIlya Dryomov 
44599b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
44609b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
44619b60e70bSIlya Dryomov 				    GFP_KERNEL);
4462f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4463f8a22fc2SIlya Dryomov 		return new_dev_id;
4464f8a22fc2SIlya Dryomov 
4465f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4466499afd5bSAlex Elder 
4467499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4468499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4469499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4470f8a22fc2SIlya Dryomov 
447170eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4472f8a22fc2SIlya Dryomov 
4473f8a22fc2SIlya Dryomov 	return 0;
4474b7f23c36SAlex Elder }
4475b7f23c36SAlex Elder 
44761ddbe94eSAlex Elder /*
4477499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4478499afd5bSAlex Elder  * identifier is no longer in use.
44791ddbe94eSAlex Elder  */
4480e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
44811ddbe94eSAlex Elder {
4482499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4483499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4484499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
44851ddbe94eSAlex Elder 
4486f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4487f8a22fc2SIlya Dryomov 
4488f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4489b7f23c36SAlex Elder }
4490b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none).  Returns the length of the token found there, i.e. the
 * number of consecutive non-white-space characters; 0 means no token.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to start of token */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
4509e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer with a terminating '\0'.  Nothing is
 * written to the token buffer when the token (plus its terminator)
 * would not fit in token_size bytes.  The string at *buf must be
 * terminated with '\0' on entry.
 *
 * Returns the token length, excluding the '\0'.  A return of 0 means
 * no token was found; a return >= token_size means the token did not
 * fit and was not copied.
 *
 * In every case *buf is advanced to just past the end of the token,
 * even when the token was too long to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);
	const char *start = *buf;

	if (len < token_size) {
		memcpy(token, start, len);
		token[len] = '\0';
	}
	*buf = start + len;

	return len;
}
4539e28fff26SAlex Elder 
4540e28fff26SAlex Elder /*
4541ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4542ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4543ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4544ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4545ea3352f4SAlex Elder  *
4546ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4547ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4548ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4549ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4550ea3352f4SAlex Elder  *
4551ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4552ea3352f4SAlex Elder  * the end of the found token.
4553ea3352f4SAlex Elder  *
4554ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4555ea3352f4SAlex Elder  */
4556ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4557ea3352f4SAlex Elder {
4558ea3352f4SAlex Elder 	char *dup;
4559ea3352f4SAlex Elder 	size_t len;
4560ea3352f4SAlex Elder 
4561ea3352f4SAlex Elder 	len = next_token(buf);
45624caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4563ea3352f4SAlex Elder 	if (!dup)
4564ea3352f4SAlex Elder 		return NULL;
4565ea3352f4SAlex Elder 	*(dup + len) = '\0';
4566ea3352f4SAlex Elder 	*buf += len;
4567ea3352f4SAlex Elder 
4568ea3352f4SAlex Elder 	if (lenp)
4569ea3352f4SAlex Elder 		*lenp = len;
4570ea3352f4SAlex Elder 
4571ea3352f4SAlex Elder 	return dup;
4572ea3352f4SAlex Elder }
4573ea3352f4SAlex Elder 
4574ea3352f4SAlex Elder /*
4575859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4576859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4577859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4578859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4579d22f76e7SAlex Elder  *
4580859c31dfSAlex Elder  * The information extracted from these options is recorded in
4581859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4582859c31dfSAlex Elder  * structures:
4583859c31dfSAlex Elder  *  ceph_opts
4584859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4585859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4586859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4587859c31dfSAlex Elder  *  rbd_opts
4588859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4589859c31dfSAlex Elder  *	this function; caller must release with kfree().
4590859c31dfSAlex Elder  *  spec
4591859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4592859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4593859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4594859c31dfSAlex Elder  *
4595859c31dfSAlex Elder  * The options passed take this form:
4596859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4597859c31dfSAlex Elder  * where:
4598859c31dfSAlex Elder  *  <mon_addrs>
4599859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4600859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4601859c31dfSAlex Elder  *      by a port number (separated by a colon).
4602859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4603859c31dfSAlex Elder  *  <options>
4604859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4605859c31dfSAlex Elder  *  <pool_name>
4606859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4607859c31dfSAlex Elder  *  <image_name>
4608859c31dfSAlex Elder  *      The name of the image in that pool to map.
4609859c31dfSAlex Elder  *  <snap_id>
4610859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4611859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4612859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4613859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4614a725f65eSAlex Elder  */
4615859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4616dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4617859c31dfSAlex Elder 				struct rbd_options **opts,
4618859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4619a725f65eSAlex Elder {
4620e28fff26SAlex Elder 	size_t len;
4621859c31dfSAlex Elder 	char *options;
46220ddebc0cSAlex Elder 	const char *mon_addrs;
4623ecb4dc22SAlex Elder 	char *snap_name;
46240ddebc0cSAlex Elder 	size_t mon_addrs_size;
4625859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
46264e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4627859c31dfSAlex Elder 	struct ceph_options *copts;
4628dc79b113SAlex Elder 	int ret;
4629e28fff26SAlex Elder 
4630e28fff26SAlex Elder 	/* The first four tokens are required */
4631e28fff26SAlex Elder 
46327ef3214aSAlex Elder 	len = next_token(&buf);
46334fb5d671SAlex Elder 	if (!len) {
46344fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
46354fb5d671SAlex Elder 		return -EINVAL;
46364fb5d671SAlex Elder 	}
46370ddebc0cSAlex Elder 	mon_addrs = buf;
4638f28e565aSAlex Elder 	mon_addrs_size = len + 1;
46397ef3214aSAlex Elder 	buf += len;
4640a725f65eSAlex Elder 
4641dc79b113SAlex Elder 	ret = -EINVAL;
4642f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4643f28e565aSAlex Elder 	if (!options)
4644dc79b113SAlex Elder 		return -ENOMEM;
46454fb5d671SAlex Elder 	if (!*options) {
46464fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
46474fb5d671SAlex Elder 		goto out_err;
46484fb5d671SAlex Elder 	}
4649a725f65eSAlex Elder 
4650859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4651859c31dfSAlex Elder 	if (!spec)
4652f28e565aSAlex Elder 		goto out_mem;
4653859c31dfSAlex Elder 
4654859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4655859c31dfSAlex Elder 	if (!spec->pool_name)
4656859c31dfSAlex Elder 		goto out_mem;
46574fb5d671SAlex Elder 	if (!*spec->pool_name) {
46584fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
46594fb5d671SAlex Elder 		goto out_err;
46604fb5d671SAlex Elder 	}
4661e28fff26SAlex Elder 
466269e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4663859c31dfSAlex Elder 	if (!spec->image_name)
4664f28e565aSAlex Elder 		goto out_mem;
46654fb5d671SAlex Elder 	if (!*spec->image_name) {
46664fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
46674fb5d671SAlex Elder 		goto out_err;
46684fb5d671SAlex Elder 	}
4669e28fff26SAlex Elder 
4670f28e565aSAlex Elder 	/*
4671f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4672f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4673f28e565aSAlex Elder 	 */
46743feeb894SAlex Elder 	len = next_token(&buf);
4675820a5f3eSAlex Elder 	if (!len) {
46763feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
46773feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4678f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4679dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4680f28e565aSAlex Elder 		goto out_err;
4681849b4260SAlex Elder 	}
4682ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4683ecb4dc22SAlex Elder 	if (!snap_name)
4684f28e565aSAlex Elder 		goto out_mem;
4685ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4686ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4687e5c35534SAlex Elder 
46880ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4689e28fff26SAlex Elder 
46904e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
46914e9afebaSAlex Elder 	if (!rbd_opts)
46924e9afebaSAlex Elder 		goto out_mem;
46934e9afebaSAlex Elder 
46944e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4695d22f76e7SAlex Elder 
4696859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
46970ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
46984e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4699859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4700859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4701dc79b113SAlex Elder 		goto out_err;
4702dc79b113SAlex Elder 	}
4703859c31dfSAlex Elder 	kfree(options);
4704859c31dfSAlex Elder 
4705859c31dfSAlex Elder 	*ceph_opts = copts;
47064e9afebaSAlex Elder 	*opts = rbd_opts;
4707859c31dfSAlex Elder 	*rbd_spec = spec;
47080ddebc0cSAlex Elder 
4709dc79b113SAlex Elder 	return 0;
4710f28e565aSAlex Elder out_mem:
4711dc79b113SAlex Elder 	ret = -ENOMEM;
4712d22f76e7SAlex Elder out_err:
4713859c31dfSAlex Elder 	kfree(rbd_opts);
4714859c31dfSAlex Elder 	rbd_spec_put(spec);
4715f28e565aSAlex Elder 	kfree(options);
4716d22f76e7SAlex Elder 
4717dc79b113SAlex Elder 	return ret;
4718a725f65eSAlex Elder }
4719a725f65eSAlex Elder 
4720589d30e0SAlex Elder /*
4721589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4722589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4723589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4724589d30e0SAlex Elder  *
4725589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4726589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4727589d30e0SAlex Elder  * with the supplied name.
4728589d30e0SAlex Elder  *
4729589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4730589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4731589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4732589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4733589d30e0SAlex Elder  */
4734589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4735589d30e0SAlex Elder {
4736589d30e0SAlex Elder 	int ret;
4737589d30e0SAlex Elder 	size_t size;
4738589d30e0SAlex Elder 	char *object_name;
4739589d30e0SAlex Elder 	void *response;
4740c0fba368SAlex Elder 	char *image_id;
47412f82ee54SAlex Elder 
4742589d30e0SAlex Elder 	/*
47432c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
47442c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4745c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4746c0fba368SAlex Elder 	 * do still need to set the image format though.
47472c0d0a10SAlex Elder 	 */
4748c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4749c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4750c0fba368SAlex Elder 
47512c0d0a10SAlex Elder 		return 0;
4752c0fba368SAlex Elder 	}
47532c0d0a10SAlex Elder 
47542c0d0a10SAlex Elder 	/*
4755589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4756589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4757589d30e0SAlex Elder 	 */
475869e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4759589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4760589d30e0SAlex Elder 	if (!object_name)
4761589d30e0SAlex Elder 		return -ENOMEM;
47620d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4763589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4764589d30e0SAlex Elder 
4765589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4766589d30e0SAlex Elder 
4767589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4768589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4769589d30e0SAlex Elder 	if (!response) {
4770589d30e0SAlex Elder 		ret = -ENOMEM;
4771589d30e0SAlex Elder 		goto out;
4772589d30e0SAlex Elder 	}
4773589d30e0SAlex Elder 
4774c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4775c0fba368SAlex Elder 
477636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
47774157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4778e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
477936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4780c0fba368SAlex Elder 	if (ret == -ENOENT) {
4781c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4782c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4783c0fba368SAlex Elder 		if (!ret)
4784c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4785c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4786c0fba368SAlex Elder 		void *p = response;
4787589d30e0SAlex Elder 
4788c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4789979ed480SAlex Elder 						NULL, GFP_NOIO);
4790c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4791c0fba368SAlex Elder 		if (!ret)
4792c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4793589d30e0SAlex Elder 	} else {
4794c0fba368SAlex Elder 		ret = -EINVAL;
4795c0fba368SAlex Elder 	}
4796c0fba368SAlex Elder 
4797c0fba368SAlex Elder 	if (!ret) {
4798c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4799c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4800589d30e0SAlex Elder 	}
4801589d30e0SAlex Elder out:
4802589d30e0SAlex Elder 	kfree(response);
4803589d30e0SAlex Elder 	kfree(object_name);
4804589d30e0SAlex Elder 
4805589d30e0SAlex Elder 	return ret;
4806589d30e0SAlex Elder }
4807589d30e0SAlex Elder 
48083abef3b3SAlex Elder /*
48093abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
48103abef3b3SAlex Elder  * call.
48113abef3b3SAlex Elder  */
48126fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
48136fd48b3bSAlex Elder {
48146fd48b3bSAlex Elder 	struct rbd_image_header	*header;
48156fd48b3bSAlex Elder 
4816392a9dadSAlex Elder 	/* Drop parent reference unless it's already been done (or none) */
4817392a9dadSAlex Elder 
4818392a9dadSAlex Elder 	if (rbd_dev->parent_overlap)
4819a2acd00eSAlex Elder 		rbd_dev_parent_put(rbd_dev);
48206fd48b3bSAlex Elder 
48216fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
48226fd48b3bSAlex Elder 
48236fd48b3bSAlex Elder 	header = &rbd_dev->header;
4824812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
48256fd48b3bSAlex Elder 	kfree(header->snap_sizes);
48266fd48b3bSAlex Elder 	kfree(header->snap_names);
48276fd48b3bSAlex Elder 	kfree(header->object_prefix);
48286fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
48296fd48b3bSAlex Elder }
48306fd48b3bSAlex Elder 
48312df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4832a30b71b9SAlex Elder {
4833a30b71b9SAlex Elder 	int ret;
4834a30b71b9SAlex Elder 
48351e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
483657385b51SAlex Elder 	if (ret)
48371e130199SAlex Elder 		goto out_err;
4838b1b5402aSAlex Elder 
48392df3fac7SAlex Elder 	/*
48402df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
48412df3fac7SAlex Elder 	 * features are assumed to never change.
48422df3fac7SAlex Elder 	 */
4843b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
484457385b51SAlex Elder 	if (ret)
4845b1b5402aSAlex Elder 		goto out_err;
484635d489f9SAlex Elder 
4847cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4848cc070d59SAlex Elder 
4849cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4850cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4851cc070d59SAlex Elder 		if (ret < 0)
4852cc070d59SAlex Elder 			goto out_err;
4853cc070d59SAlex Elder 	}
48542df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
4855a30b71b9SAlex Elder 
485635152979SAlex Elder 	return 0;
48579d475de5SAlex Elder out_err:
4858642a2537SAlex Elder 	rbd_dev->header.features = 0;
48591e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
48601e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
48619d475de5SAlex Elder 
48629d475de5SAlex Elder 	return ret;
4863a30b71b9SAlex Elder }
4864a30b71b9SAlex Elder 
4865124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
486683a06263SAlex Elder {
48672f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4868124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4869124afba2SAlex Elder 	struct rbd_client *rbdc;
4870124afba2SAlex Elder 	int ret;
4871124afba2SAlex Elder 
4872124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4873124afba2SAlex Elder 		return 0;
4874124afba2SAlex Elder 	/*
4875124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4876124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4877124afba2SAlex Elder 	 * parent/child relationships always share both.
4878124afba2SAlex Elder 	 */
4879124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4880124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4881124afba2SAlex Elder 
4882124afba2SAlex Elder 	ret = -ENOMEM;
4883124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4884124afba2SAlex Elder 	if (!parent)
4885124afba2SAlex Elder 		goto out_err;
4886124afba2SAlex Elder 
48871f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
4888124afba2SAlex Elder 	if (ret < 0)
4889124afba2SAlex Elder 		goto out_err;
4890124afba2SAlex Elder 	rbd_dev->parent = parent;
4891a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
4892124afba2SAlex Elder 
4893124afba2SAlex Elder 	return 0;
4894124afba2SAlex Elder out_err:
4895124afba2SAlex Elder 	if (parent) {
4896fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
4897124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4898124afba2SAlex Elder 		rbd_dev_destroy(parent);
4899124afba2SAlex Elder 	} else {
4900124afba2SAlex Elder 		rbd_put_client(rbdc);
4901124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4902124afba2SAlex Elder 	}
4903124afba2SAlex Elder 
4904124afba2SAlex Elder 	return ret;
4905124afba2SAlex Elder }
4906124afba2SAlex Elder 
4907200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4908124afba2SAlex Elder {
490983a06263SAlex Elder 	int ret;
491083a06263SAlex Elder 
4911f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
491283a06263SAlex Elder 
4913f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
4914f8a22fc2SIlya Dryomov 	if (ret)
4915f8a22fc2SIlya Dryomov 		return ret;
4916f8a22fc2SIlya Dryomov 
491783a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
491883a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
491983a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
492083a06263SAlex Elder 
49219b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
492283a06263SAlex Elder 
49239b60e70bSIlya Dryomov 	if (!single_major) {
492483a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
492583a06263SAlex Elder 		if (ret < 0)
492683a06263SAlex Elder 			goto err_out_id;
49279b60e70bSIlya Dryomov 
492883a06263SAlex Elder 		rbd_dev->major = ret;
4929dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
49309b60e70bSIlya Dryomov 	} else {
49319b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
49329b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
49339b60e70bSIlya Dryomov 	}
493483a06263SAlex Elder 
493583a06263SAlex Elder 	/* Set up the blkdev mapping. */
493683a06263SAlex Elder 
493783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
493883a06263SAlex Elder 	if (ret)
493983a06263SAlex Elder 		goto err_out_blkdev;
494083a06263SAlex Elder 
4941f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
494283a06263SAlex Elder 	if (ret)
494383a06263SAlex Elder 		goto err_out_disk;
4944f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4945f35a4deeSAlex Elder 
4946f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
4947f35a4deeSAlex Elder 	if (ret)
4948f35a4deeSAlex Elder 		goto err_out_mapping;
494983a06263SAlex Elder 
495083a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
495183a06263SAlex Elder 
4952129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
495383a06263SAlex Elder 	add_disk(rbd_dev->disk);
495483a06263SAlex Elder 
495583a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
495683a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
495783a06263SAlex Elder 
495883a06263SAlex Elder 	return ret;
49592f82ee54SAlex Elder 
4960f35a4deeSAlex Elder err_out_mapping:
4961f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
496283a06263SAlex Elder err_out_disk:
496383a06263SAlex Elder 	rbd_free_disk(rbd_dev);
496483a06263SAlex Elder err_out_blkdev:
49659b60e70bSIlya Dryomov 	if (!single_major)
496683a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
496783a06263SAlex Elder err_out_id:
496883a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4969d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
497083a06263SAlex Elder 
497183a06263SAlex Elder 	return ret;
497283a06263SAlex Elder }
497383a06263SAlex Elder 
4974332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4975332bb12dSAlex Elder {
4976332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4977332bb12dSAlex Elder 	size_t size;
4978332bb12dSAlex Elder 
4979332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4980332bb12dSAlex Elder 
4981332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4982332bb12dSAlex Elder 
4983332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4984332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4985332bb12dSAlex Elder 	else
4986332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4987332bb12dSAlex Elder 
4988332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4989332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4990332bb12dSAlex Elder 		return -ENOMEM;
4991332bb12dSAlex Elder 
4992332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4993332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4994332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4995332bb12dSAlex Elder 	else
4996332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4997332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4998332bb12dSAlex Elder 	return 0;
4999332bb12dSAlex Elder }
5000332bb12dSAlex Elder 
5001200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5002200a6a8bSAlex Elder {
50036fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5004200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
50056fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
50066fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
50076fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
50086fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
50096fd48b3bSAlex Elder 
5010200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5011200a6a8bSAlex Elder }
5012200a6a8bSAlex Elder 
5013a30b71b9SAlex Elder /*
5014a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
50151f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
50161f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
50171f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5018a30b71b9SAlex Elder  */
50191f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
5020a30b71b9SAlex Elder {
5021a30b71b9SAlex Elder 	int ret;
5022a30b71b9SAlex Elder 
5023a30b71b9SAlex Elder 	/*
50243abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
50253abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
50263abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
50273abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5028a30b71b9SAlex Elder 	 */
5029a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5030a30b71b9SAlex Elder 	if (ret)
5031c0fba368SAlex Elder 		return ret;
5032c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
5033c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5034c0fba368SAlex Elder 
5035332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5036332bb12dSAlex Elder 	if (ret)
5037332bb12dSAlex Elder 		goto err_out_format;
5038332bb12dSAlex Elder 
50391f3ef788SAlex Elder 	if (mapping) {
5040fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
5041b644de2bSAlex Elder 		if (ret)
5042b644de2bSAlex Elder 			goto out_header_name;
50431f3ef788SAlex Elder 	}
5044b644de2bSAlex Elder 
5045c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
504699a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
5047a30b71b9SAlex Elder 	else
50482df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
50495655c4d9SAlex Elder 	if (ret)
5050b644de2bSAlex Elder 		goto err_out_watch;
5051a30b71b9SAlex Elder 
50529bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
50539bb81c9bSAlex Elder 	if (ret)
505433dca39fSAlex Elder 		goto err_out_probe;
50559bb81c9bSAlex Elder 
50569bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
505730d60ba2SAlex Elder 	if (ret)
505830d60ba2SAlex Elder 		goto err_out_probe;
505983a06263SAlex Elder 
506030d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
506130d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
506230d60ba2SAlex Elder 
506330d60ba2SAlex Elder 	return 0;
50646fd48b3bSAlex Elder err_out_probe:
50656fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5066b644de2bSAlex Elder err_out_watch:
5067fca27065SIlya Dryomov 	if (mapping)
5068fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5069332bb12dSAlex Elder out_header_name:
5070332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5071332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5072332bb12dSAlex Elder err_out_format:
5073332bb12dSAlex Elder 	rbd_dev->image_format = 0;
50745655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
50755655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
50765655c4d9SAlex Elder 
50775655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
50785655c4d9SAlex Elder 
50795655c4d9SAlex Elder 	return ret;
508083a06263SAlex Elder }
508183a06263SAlex Elder 
50829b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
508359c2be1eSYehuda Sadeh 			  const char *buf,
508459c2be1eSYehuda Sadeh 			  size_t count)
5085602adf40SYehuda Sadeh {
5086cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5087dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
50884e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5089859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
50909d3997fdSAlex Elder 	struct rbd_client *rbdc;
509127cc2594SAlex Elder 	struct ceph_osd_client *osdc;
509251344a38SAlex Elder 	bool read_only;
509327cc2594SAlex Elder 	int rc = -ENOMEM;
5094602adf40SYehuda Sadeh 
5095602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5096602adf40SYehuda Sadeh 		return -ENODEV;
5097602adf40SYehuda Sadeh 
5098a725f65eSAlex Elder 	/* parse add command */
5099859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5100dc79b113SAlex Elder 	if (rc < 0)
5101bd4ba655SAlex Elder 		goto err_out_module;
510251344a38SAlex Elder 	read_only = rbd_opts->read_only;
510351344a38SAlex Elder 	kfree(rbd_opts);
510451344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5105a725f65eSAlex Elder 
51069d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
51079d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
51089d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
51090ddebc0cSAlex Elder 		goto err_out_args;
51109d3997fdSAlex Elder 	}
5111602adf40SYehuda Sadeh 
5112602adf40SYehuda Sadeh 	/* pick the pool */
51139d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
5114859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5115602adf40SYehuda Sadeh 	if (rc < 0)
5116602adf40SYehuda Sadeh 		goto err_out_client;
5117859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5118859c31dfSAlex Elder 
51190903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
51200903e875SAlex Elder 
5121c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
5122c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5123c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
51240903e875SAlex Elder 		rc = -EIO;
51250903e875SAlex Elder 		goto err_out_client;
51260903e875SAlex Elder 	}
51270903e875SAlex Elder 
5128c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5129bd4ba655SAlex Elder 	if (!rbd_dev)
5130bd4ba655SAlex Elder 		goto err_out_client;
5131c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5132c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5133602adf40SYehuda Sadeh 
51341f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5135a30b71b9SAlex Elder 	if (rc < 0)
5136c53d5893SAlex Elder 		goto err_out_rbd_dev;
513705fd6f6fSAlex Elder 
51387ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
51397ce4eef7SAlex Elder 
51407ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
51417ce4eef7SAlex Elder 		read_only = true;
51427ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
51437ce4eef7SAlex Elder 
5144b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
51453abef3b3SAlex Elder 	if (rc) {
5146e37180c0SIlya Dryomov 		/*
5147e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5148e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5149e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5150e37180c0SIlya Dryomov 		 */
5151e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
51523abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
51533abef3b3SAlex Elder 		goto err_out_module;
51543abef3b3SAlex Elder 	}
51553abef3b3SAlex Elder 
5156602adf40SYehuda Sadeh 	return count;
5157b536f69aSAlex Elder 
5158c53d5893SAlex Elder err_out_rbd_dev:
5159c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5160bd4ba655SAlex Elder err_out_client:
51619d3997fdSAlex Elder 	rbd_put_client(rbdc);
51620ddebc0cSAlex Elder err_out_args:
5163859c31dfSAlex Elder 	rbd_spec_put(spec);
5164bd4ba655SAlex Elder err_out_module:
5165bd4ba655SAlex Elder 	module_put(THIS_MODULE);
516627cc2594SAlex Elder 
5167602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
516827cc2594SAlex Elder 
516927cc2594SAlex Elder 	return (ssize_t)rc;
5170602adf40SYehuda Sadeh }
5171602adf40SYehuda Sadeh 
51729b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
51739b60e70bSIlya Dryomov 		       const char *buf,
51749b60e70bSIlya Dryomov 		       size_t count)
51759b60e70bSIlya Dryomov {
51769b60e70bSIlya Dryomov 	if (single_major)
51779b60e70bSIlya Dryomov 		return -EINVAL;
51789b60e70bSIlya Dryomov 
51799b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
51809b60e70bSIlya Dryomov }
51819b60e70bSIlya Dryomov 
51829b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
51839b60e70bSIlya Dryomov 				    const char *buf,
51849b60e70bSIlya Dryomov 				    size_t count)
51859b60e70bSIlya Dryomov {
51869b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
51879b60e70bSIlya Dryomov }
51889b60e70bSIlya Dryomov 
5189200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5190602adf40SYehuda Sadeh {
5191593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5192602adf40SYehuda Sadeh 
5193602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5194200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
51956d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
51969b60e70bSIlya Dryomov 	if (!single_major)
5197602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5198e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5199d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5200602adf40SYehuda Sadeh }
5201602adf40SYehuda Sadeh 
520205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
520305a46afdSAlex Elder {
5204ad945fc1SAlex Elder 	while (rbd_dev->parent) {
520505a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
520605a46afdSAlex Elder 		struct rbd_device *second = first->parent;
520705a46afdSAlex Elder 		struct rbd_device *third;
520805a46afdSAlex Elder 
520905a46afdSAlex Elder 		/*
521005a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
521105a46afdSAlex Elder 		 * remove it.
521205a46afdSAlex Elder 		 */
521305a46afdSAlex Elder 		while (second && (third = second->parent)) {
521405a46afdSAlex Elder 			first = second;
521505a46afdSAlex Elder 			second = third;
521605a46afdSAlex Elder 		}
5217ad945fc1SAlex Elder 		rbd_assert(second);
52188ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5219ad945fc1SAlex Elder 		first->parent = NULL;
5220ad945fc1SAlex Elder 		first->parent_overlap = 0;
5221ad945fc1SAlex Elder 
5222ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
522305a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
522405a46afdSAlex Elder 		first->parent_spec = NULL;
522505a46afdSAlex Elder 	}
522605a46afdSAlex Elder }
522705a46afdSAlex Elder 
52289b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5229602adf40SYehuda Sadeh 			     const char *buf,
5230602adf40SYehuda Sadeh 			     size_t count)
5231602adf40SYehuda Sadeh {
5232602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5233751cc0e3SAlex Elder 	struct list_head *tmp;
5234751cc0e3SAlex Elder 	int dev_id;
5235602adf40SYehuda Sadeh 	unsigned long ul;
523682a442d2SAlex Elder 	bool already = false;
52370d8189e1SAlex Elder 	int ret;
5238602adf40SYehuda Sadeh 
5239bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
52400d8189e1SAlex Elder 	if (ret)
52410d8189e1SAlex Elder 		return ret;
5242602adf40SYehuda Sadeh 
5243602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5244751cc0e3SAlex Elder 	dev_id = (int)ul;
5245751cc0e3SAlex Elder 	if (dev_id != ul)
5246602adf40SYehuda Sadeh 		return -EINVAL;
5247602adf40SYehuda Sadeh 
5248602adf40SYehuda Sadeh 	ret = -ENOENT;
5249751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5250751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5251751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5252751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5253751cc0e3SAlex Elder 			ret = 0;
5254751cc0e3SAlex Elder 			break;
5255602adf40SYehuda Sadeh 		}
5256751cc0e3SAlex Elder 	}
5257751cc0e3SAlex Elder 	if (!ret) {
5258a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5259b82d167bSAlex Elder 		if (rbd_dev->open_count)
526042382b70SAlex Elder 			ret = -EBUSY;
5261b82d167bSAlex Elder 		else
526282a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
526382a442d2SAlex Elder 							&rbd_dev->flags);
5264a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5265751cc0e3SAlex Elder 	}
5266751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
526782a442d2SAlex Elder 	if (ret < 0 || already)
52681ba0f1e7SAlex Elder 		return ret;
5269751cc0e3SAlex Elder 
5270fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
52719abc5990SJosh Durgin 	/*
52729abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
52739abc5990SJosh Durgin 	 * before the osd_client is shutdown
52749abc5990SJosh Durgin 	 */
52759abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
52769abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5277fca27065SIlya Dryomov 
52789875201eSJosh Durgin 	/*
52799875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
52809875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
52819875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
52829875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
52839875201eSJosh Durgin 	 */
52849875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
52858ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
528679ab7558SAlex Elder 	module_put(THIS_MODULE);
5287aafb230eSAlex Elder 
52881ba0f1e7SAlex Elder 	return count;
5289602adf40SYehuda Sadeh }
5290602adf40SYehuda Sadeh 
52919b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
52929b60e70bSIlya Dryomov 			  const char *buf,
52939b60e70bSIlya Dryomov 			  size_t count)
52949b60e70bSIlya Dryomov {
52959b60e70bSIlya Dryomov 	if (single_major)
52969b60e70bSIlya Dryomov 		return -EINVAL;
52979b60e70bSIlya Dryomov 
52989b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
52999b60e70bSIlya Dryomov }
53009b60e70bSIlya Dryomov 
53019b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
53029b60e70bSIlya Dryomov 				       const char *buf,
53039b60e70bSIlya Dryomov 				       size_t count)
53049b60e70bSIlya Dryomov {
53059b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
53069b60e70bSIlya Dryomov }
53079b60e70bSIlya Dryomov 
5308602adf40SYehuda Sadeh /*
5309602adf40SYehuda Sadeh  * create control files in sysfs
5310dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5311602adf40SYehuda Sadeh  */
5312602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5313602adf40SYehuda Sadeh {
5314dfc5606dSYehuda Sadeh 	int ret;
5315602adf40SYehuda Sadeh 
5316fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5317dfc5606dSYehuda Sadeh 	if (ret < 0)
5318dfc5606dSYehuda Sadeh 		return ret;
5319602adf40SYehuda Sadeh 
5320fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5321fed4c143SAlex Elder 	if (ret < 0)
5322fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5323602adf40SYehuda Sadeh 
5324602adf40SYehuda Sadeh 	return ret;
5325602adf40SYehuda Sadeh }
5326602adf40SYehuda Sadeh 
5327602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5328602adf40SYehuda Sadeh {
5329dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5330fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5331602adf40SYehuda Sadeh }
5332602adf40SYehuda Sadeh 
53331c2a9dfeSAlex Elder static int rbd_slab_init(void)
53341c2a9dfeSAlex Elder {
53351c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
53361c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
53371c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
53381c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
53391c2a9dfeSAlex Elder 					0, NULL);
5340868311b1SAlex Elder 	if (!rbd_img_request_cache)
5341868311b1SAlex Elder 		return -ENOMEM;
5342868311b1SAlex Elder 
5343868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5344868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5345868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5346868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5347868311b1SAlex Elder 					0, NULL);
534878c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
534978c2a44aSAlex Elder 		goto out_err;
535078c2a44aSAlex Elder 
535178c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
535278c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
53532d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
535478c2a44aSAlex Elder 	if (rbd_segment_name_cache)
53551c2a9dfeSAlex Elder 		return 0;
535678c2a44aSAlex Elder out_err:
535778c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
535878c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
535978c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
536078c2a44aSAlex Elder 	}
53611c2a9dfeSAlex Elder 
5362868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5363868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5364868311b1SAlex Elder 
53651c2a9dfeSAlex Elder 	return -ENOMEM;
53661c2a9dfeSAlex Elder }
53671c2a9dfeSAlex Elder 
53681c2a9dfeSAlex Elder static void rbd_slab_exit(void)
53691c2a9dfeSAlex Elder {
537078c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
537178c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
537278c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
537378c2a44aSAlex Elder 
5374868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5375868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5376868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5377868311b1SAlex Elder 
53781c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
53791c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
53801c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
53811c2a9dfeSAlex Elder }
53821c2a9dfeSAlex Elder 
5383cc344fa1SAlex Elder static int __init rbd_init(void)
5384602adf40SYehuda Sadeh {
5385602adf40SYehuda Sadeh 	int rc;
5386602adf40SYehuda Sadeh 
53871e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
53881e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
53891e32d34cSAlex Elder 		return -EINVAL;
53901e32d34cSAlex Elder 	}
5391e1b4d96dSIlya Dryomov 
53921c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5393602adf40SYehuda Sadeh 	if (rc)
5394602adf40SYehuda Sadeh 		return rc;
5395e1b4d96dSIlya Dryomov 
53969b60e70bSIlya Dryomov 	if (single_major) {
53979b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
53989b60e70bSIlya Dryomov 		if (rbd_major < 0) {
53999b60e70bSIlya Dryomov 			rc = rbd_major;
54009b60e70bSIlya Dryomov 			goto err_out_slab;
54019b60e70bSIlya Dryomov 		}
54029b60e70bSIlya Dryomov 	}
54039b60e70bSIlya Dryomov 
54041c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
54051c2a9dfeSAlex Elder 	if (rc)
54069b60e70bSIlya Dryomov 		goto err_out_blkdev;
54071c2a9dfeSAlex Elder 
54089b60e70bSIlya Dryomov 	if (single_major)
54099b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
54109b60e70bSIlya Dryomov 	else
5411e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
54129b60e70bSIlya Dryomov 
5413e1b4d96dSIlya Dryomov 	return 0;
5414e1b4d96dSIlya Dryomov 
54159b60e70bSIlya Dryomov err_out_blkdev:
54169b60e70bSIlya Dryomov 	if (single_major)
54179b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5418e1b4d96dSIlya Dryomov err_out_slab:
5419e1b4d96dSIlya Dryomov 	rbd_slab_exit();
54201c2a9dfeSAlex Elder 	return rc;
5421602adf40SYehuda Sadeh }
5422602adf40SYehuda Sadeh 
5423cc344fa1SAlex Elder static void __exit rbd_exit(void)
5424602adf40SYehuda Sadeh {
5425602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
54269b60e70bSIlya Dryomov 	if (single_major)
54279b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
54281c2a9dfeSAlex Elder 	rbd_slab_exit();
5429602adf40SYehuda Sadeh }
5430602adf40SYehuda Sadeh 
5431602adf40SYehuda Sadeh module_init(rbd_init);
5432602adf40SYehuda Sadeh module_exit(rbd_exit);
5433602adf40SYehuda Sadeh 
5434d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5435602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5436602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5437602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5438602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5439602adf40SYehuda Sadeh 
544090da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5441602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5442