xref: /openbmc/linux/drivers/block/rbd.c (revision 7d5079aa)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44f8a22fc2SIlya Dryomov #include <linux/idr.h>
45602adf40SYehuda Sadeh 
46602adf40SYehuda Sadeh #include "rbd_types.h"
47602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() checks. */
#define RBD_DEBUG

/*
 * Block I/O is counted in sectors.  Linux interprets the term in
 * several contexts (blk, bio, genhd), but 512 bytes is the universal
 * default; these names are simply more readable than bare numbers.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
58593a9e7bSAlex Elder 
59a2acd00eSAlex Elder /*
60a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
61a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
62a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
63a2acd00eSAlex Elder  * -EINVAL without updating it.
64a2acd00eSAlex Elder  */
65a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
66a2acd00eSAlex Elder {
67a2acd00eSAlex Elder 	unsigned int counter;
68a2acd00eSAlex Elder 
69a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
70a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
71a2acd00eSAlex Elder 		return (int)counter;
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder 	atomic_dec(v);
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	return -EINVAL;
76a2acd00eSAlex Elder }
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
79a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
80a2acd00eSAlex Elder {
81a2acd00eSAlex Elder 	int counter;
82a2acd00eSAlex Elder 
83a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
84a2acd00eSAlex Elder 	if (counter >= 0)
85a2acd00eSAlex Elder 		return counter;
86a2acd00eSAlex Elder 
87a2acd00eSAlex Elder 	atomic_inc(v);
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	return -EINVAL;
90a2acd00eSAlex Elder }
91a2acd00eSAlex Elder 
/* Driver name; device names are built from it as "rbd#" */
#define RBD_DRV_NAME "rbd"

/* Minor-number layout used by the single-major code */
#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

/* A snapshot name plus the "snap_" prefix must fit within NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* This bound lets the largest snapshot context fit in 4KB */
#define RBD_MAX_SNAP_COUNT	510

#define RBD_SNAP_HEAD_NAME	"-"

/* Marks an invalid index into the snapshot array */
#define	BAD_SNAP_INDEX	U32_MAX

/* Sized so an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
112589d30e0SAlex Elder 
/*
 * Feature bits.
 *
 * Feature masks are stored in u64 fields (see the "features" members
 * of the structures below), so the bits are defined as 64-bit
 * constants.  This keeps expressions such as ~RBD_FEATURES_SUPPORTED
 * in unsigned 64-bit arithmetic instead of relying on sign extension
 * of an int, and allows bits above 31 to be added later.
 */

#define RBD_FEATURE_LAYERING	(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2	(1ULL<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
123d889140cSAlex Elder 
/*
 * Device names take the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer identifier.  MAX_INT_FORMAT_WIDTH is used when
 * checking that DEV_NAME_LEN is large enough for every possible
 * device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
132602adf40SYehuda Sadeh 
133602adf40SYehuda Sadeh /*
134602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
135602adf40SYehuda Sadeh  */
136602adf40SYehuda Sadeh struct rbd_image_header {
137f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
138849b4260SAlex Elder 	char *object_prefix;
139602adf40SYehuda Sadeh 	__u8 obj_order;
140602adf40SYehuda Sadeh 	__u8 crypt_type;
141602adf40SYehuda Sadeh 	__u8 comp_type;
142f35a4deeSAlex Elder 	u64 stripe_unit;
143f35a4deeSAlex Elder 	u64 stripe_count;
144f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
145602adf40SYehuda Sadeh 
146f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
147f84344f3SAlex Elder 	u64 image_size;
148f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
149f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
150f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15159c2be1eSYehuda Sadeh };
15259c2be1eSYehuda Sadeh 
1530d7dbfceSAlex Elder /*
1540d7dbfceSAlex Elder  * An rbd image specification.
1550d7dbfceSAlex Elder  *
1560d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
157c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
158c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
159c66c6e0cSAlex Elder  *
160c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
161c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
162c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
163c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
164c66c6e0cSAlex Elder  *
165c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
166c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
167c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
168c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
169c66c6e0cSAlex Elder  * is shared between the parent and child).
170c66c6e0cSAlex Elder  *
171c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
172c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
173c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
174c66c6e0cSAlex Elder  *
175c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
176c66c6e0cSAlex Elder  * could be a null pointer).
1770d7dbfceSAlex Elder  */
1780d7dbfceSAlex Elder struct rbd_spec {
1790d7dbfceSAlex Elder 	u64		pool_id;
180ecb4dc22SAlex Elder 	const char	*pool_name;
1810d7dbfceSAlex Elder 
182ecb4dc22SAlex Elder 	const char	*image_id;
183ecb4dc22SAlex Elder 	const char	*image_name;
1840d7dbfceSAlex Elder 
1850d7dbfceSAlex Elder 	u64		snap_id;
186ecb4dc22SAlex Elder 	const char	*snap_name;
1870d7dbfceSAlex Elder 
1880d7dbfceSAlex Elder 	struct kref	kref;
1890d7dbfceSAlex Elder };
1900d7dbfceSAlex Elder 
191602adf40SYehuda Sadeh /*
192f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
193602adf40SYehuda Sadeh  */
194602adf40SYehuda Sadeh struct rbd_client {
195602adf40SYehuda Sadeh 	struct ceph_client	*client;
196602adf40SYehuda Sadeh 	struct kref		kref;
197602adf40SYehuda Sadeh 	struct list_head	node;
198602adf40SYehuda Sadeh };
199602adf40SYehuda Sadeh 
/* Callback type invoked with an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback type invoked with an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Kind of data attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};

/* Flag values for an object request */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
218926f9b3fSAlex Elder 
219bf0d5f50SAlex Elder struct rbd_obj_request {
220bf0d5f50SAlex Elder 	const char		*object_name;
221bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
222bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
223926f9b3fSAlex Elder 	unsigned long		flags;
224bf0d5f50SAlex Elder 
225c5b5ef6cSAlex Elder 	/*
226c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
227c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
228c5b5ef6cSAlex Elder 	 *
229c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
230c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
231c5b5ef6cSAlex Elder 	 *
232c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
233c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
234c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
235c5b5ef6cSAlex Elder 	 *
236c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
237c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
238c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
239c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
240c5b5ef6cSAlex Elder 	 */
241c5b5ef6cSAlex Elder 	union {
242c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
243c5b5ef6cSAlex Elder 		struct {
244bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
245c5b5ef6cSAlex Elder 			u64			img_offset;
246c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
247c5b5ef6cSAlex Elder 			struct list_head	links;
248c5b5ef6cSAlex Elder 		};
249c5b5ef6cSAlex Elder 	};
250bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
251bf0d5f50SAlex Elder 
252bf0d5f50SAlex Elder 	enum obj_request_type	type;
253788e2df3SAlex Elder 	union {
254bf0d5f50SAlex Elder 		struct bio	*bio_list;
255788e2df3SAlex Elder 		struct {
256788e2df3SAlex Elder 			struct page	**pages;
257788e2df3SAlex Elder 			u32		page_count;
258788e2df3SAlex Elder 		};
259788e2df3SAlex Elder 	};
2600eefd470SAlex Elder 	struct page		**copyup_pages;
261ebda6408SAlex Elder 	u32			copyup_page_count;
262bf0d5f50SAlex Elder 
263bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2661b83bef2SSage Weil 	int			result;
267bf0d5f50SAlex Elder 
268bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
269788e2df3SAlex Elder 	struct completion	completion;
270bf0d5f50SAlex Elder 
271bf0d5f50SAlex Elder 	struct kref		kref;
272bf0d5f50SAlex Elder };
273bf0d5f50SAlex Elder 
/* Flag values for an image request */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2790c425248SAlex Elder 
280bf0d5f50SAlex Elder struct rbd_img_request {
281bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
282bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
283bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2840c425248SAlex Elder 	unsigned long		flags;
285bf0d5f50SAlex Elder 	union {
286bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2879849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2889849e986SAlex Elder 	};
2899849e986SAlex Elder 	union {
2909849e986SAlex Elder 		struct request		*rq;		/* block request */
2919849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
292bf0d5f50SAlex Elder 	};
2933d7efd18SAlex Elder 	struct page		**copyup_pages;
294ebda6408SAlex Elder 	u32			copyup_page_count;
295bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
296bf0d5f50SAlex Elder 	u32			next_completion;
297bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
29855f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
299a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
300bf0d5f50SAlex Elder 
301bf0d5f50SAlex Elder 	u32			obj_request_count;
302bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
303bf0d5f50SAlex Elder 
304bf0d5f50SAlex Elder 	struct kref		kref;
305bf0d5f50SAlex Elder };
306bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests of an image request (ireq),
 * linked through each object request's "links" member.
 *
 * Note that the _safe variant is built on
 * list_for_each_entry_safe_reverse(), so it walks the list backwards.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
313bf0d5f50SAlex Elder 
314f84344f3SAlex Elder struct rbd_mapping {
31599c1f08fSAlex Elder 	u64                     size;
31634b13184SAlex Elder 	u64                     features;
317f84344f3SAlex Elder 	bool			read_only;
318f84344f3SAlex Elder };
319f84344f3SAlex Elder 
320602adf40SYehuda Sadeh /*
321602adf40SYehuda Sadeh  * a single device
322602adf40SYehuda Sadeh  */
323602adf40SYehuda Sadeh struct rbd_device {
324de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
325602adf40SYehuda Sadeh 
326602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
327dd82fff1SIlya Dryomov 	int			minor;
328602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
329602adf40SYehuda Sadeh 
330a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
331602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
332602adf40SYehuda Sadeh 
333602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
334602adf40SYehuda Sadeh 
335b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
336602adf40SYehuda Sadeh 
337602adf40SYehuda Sadeh 	struct rbd_image_header	header;
338b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3390d7dbfceSAlex Elder 	struct rbd_spec		*spec;
340602adf40SYehuda Sadeh 
3410d7dbfceSAlex Elder 	char			*header_name;
342971f839aSAlex Elder 
3430903e875SAlex Elder 	struct ceph_file_layout	layout;
3440903e875SAlex Elder 
34559c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
346975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
34759c2be1eSYehuda Sadeh 
34886b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
34986b00e0dSAlex Elder 	u64			parent_overlap;
350a2acd00eSAlex Elder 	atomic_t		parent_ref;
3512f82ee54SAlex Elder 	struct rbd_device	*parent;
35286b00e0dSAlex Elder 
353c666601aSJosh Durgin 	/* protects updating the header */
354c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
355f84344f3SAlex Elder 
356f84344f3SAlex Elder 	struct rbd_mapping	mapping;
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	struct list_head	node;
359dfc5606dSYehuda Sadeh 
360dfc5606dSYehuda Sadeh 	/* sysfs related */
361dfc5606dSYehuda Sadeh 	struct device		dev;
362b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
363dfc5606dSYehuda Sadeh };
364dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomicity matters, access is
 * protected by rbd_dev->lock.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, needs atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3766d292906SAlex Elder 
377cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
378e124a82fSAlex Elder 
379602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
380e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
381e124a82fSAlex Elder 
382602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
383432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
384602adf40SYehuda Sadeh 
38578c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
38678c2a44aSAlex Elder 
3871c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
388868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
38978c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
3901c2a9dfeSAlex Elder 
3919b60e70bSIlya Dryomov static int rbd_major;
392f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
393f8a22fc2SIlya Dryomov 
3949b60e70bSIlya Dryomov /*
3959b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
3969b60e70bSIlya Dryomov  * userspace rbd utility.
3979b60e70bSIlya Dryomov  */
3989b60e70bSIlya Dryomov static bool single_major = false;
3999b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4009b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4019b60e70bSIlya Dryomov 
4023d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4033d7efd18SAlex Elder 
404200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
405dfc5606dSYehuda Sadeh 
406f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
407f0f8cef5SAlex Elder 		       size_t count);
408f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
409f0f8cef5SAlex Elder 			  size_t count);
4109b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4119b60e70bSIlya Dryomov 				    size_t count);
4129b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4139b60e70bSIlya Dryomov 				       size_t count);
4141f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
415a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
416f0f8cef5SAlex Elder 
4179b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4189b60e70bSIlya Dryomov {
4197e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4209b60e70bSIlya Dryomov }
4219b60e70bSIlya Dryomov 
4229b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4239b60e70bSIlya Dryomov {
4247e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4259b60e70bSIlya Dryomov }
4269b60e70bSIlya Dryomov 
427b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
428b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
4299b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
4309b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
431b15a21ddSGreg Kroah-Hartman 
432b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
433b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
434b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4359b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4369b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
437b15a21ddSGreg Kroah-Hartman 	NULL,
438f0f8cef5SAlex Elder };
43992c76dc0SIlya Dryomov 
44092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
44192c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
44292c76dc0SIlya Dryomov {
4439b60e70bSIlya Dryomov 	if (!single_major &&
4449b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4459b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4469b60e70bSIlya Dryomov 		return 0;
4479b60e70bSIlya Dryomov 
44892c76dc0SIlya Dryomov 	return attr->mode;
44992c76dc0SIlya Dryomov }
45092c76dc0SIlya Dryomov 
45192c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
45292c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
45392c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
45492c76dc0SIlya Dryomov };
45592c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
456f0f8cef5SAlex Elder 
457f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
458f0f8cef5SAlex Elder 	.name		= "rbd",
459b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
460f0f8cef5SAlex Elder };
461f0f8cef5SAlex Elder 
462f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
463f0f8cef5SAlex Elder {
464f0f8cef5SAlex Elder }
465f0f8cef5SAlex Elder 
466f0f8cef5SAlex Elder static struct device rbd_root_dev = {
467f0f8cef5SAlex Elder 	.init_name =    "rbd",
468f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
469f0f8cef5SAlex Elder };
470f0f8cef5SAlex Elder 
47106ecc6cbSAlex Elder static __printf(2, 3)
47206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
47306ecc6cbSAlex Elder {
47406ecc6cbSAlex Elder 	struct va_format vaf;
47506ecc6cbSAlex Elder 	va_list args;
47606ecc6cbSAlex Elder 
47706ecc6cbSAlex Elder 	va_start(args, fmt);
47806ecc6cbSAlex Elder 	vaf.fmt = fmt;
47906ecc6cbSAlex Elder 	vaf.va = &args;
48006ecc6cbSAlex Elder 
48106ecc6cbSAlex Elder 	if (!rbd_dev)
48206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
48306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
48406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
48506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
48606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
48706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
48806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
48906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
49006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
49106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
49206ecc6cbSAlex Elder 	else	/* punt */
49306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
49406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
49506ecc6cbSAlex Elder 	va_end(args);
49606ecc6cbSAlex Elder }
49706ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, log the failing
 * expression with its function and line number and then BUG() if expr
 * is false.  Without RBD_DEBUG it expands to a no-op and expr is not
 * evaluated.
 *
 * The checking form is wrapped in do { } while (0) so the macro acts
 * as a single statement: it can be used as the body of an if/else
 * without capturing a following "else", and it requires the trailing
 * semicolon like an ordinary function call.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
509aafb230eSAlex Elder #endif /* !RBD_DEBUG */
510dfc5606dSYehuda Sadeh 
511b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
51205a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
51305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5148b3e1a56SAlex Elder 
515cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5162df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
5172df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
51854cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
51954cac61fSAlex Elder 					u64 snap_id);
5202ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5212ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5222ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5232ad3d716SAlex Elder 		u64 *snap_features);
5242ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
52559c2be1eSYehuda Sadeh 
526602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
527602adf40SYehuda Sadeh {
528f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
529b82d167bSAlex Elder 	bool removing = false;
530602adf40SYehuda Sadeh 
531f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
532602adf40SYehuda Sadeh 		return -EROFS;
533602adf40SYehuda Sadeh 
534a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
535b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
536b82d167bSAlex Elder 		removing = true;
537b82d167bSAlex Elder 	else
538b82d167bSAlex Elder 		rbd_dev->open_count++;
539a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
540b82d167bSAlex Elder 	if (removing)
541b82d167bSAlex Elder 		return -ENOENT;
542b82d167bSAlex Elder 
543c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
544340c7a2bSAlex Elder 
545602adf40SYehuda Sadeh 	return 0;
546602adf40SYehuda Sadeh }
547602adf40SYehuda Sadeh 
548db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
549dfc5606dSYehuda Sadeh {
550dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
551b82d167bSAlex Elder 	unsigned long open_count_before;
552b82d167bSAlex Elder 
553a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
554b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
555a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
556b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
557dfc5606dSYehuda Sadeh 
558c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
559dfc5606dSYehuda Sadeh }
560dfc5606dSYehuda Sadeh 
561131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
562131fd9f6SGuangliang Zhao {
56377f33c03SJosh Durgin 	int ret = 0;
564131fd9f6SGuangliang Zhao 	int val;
565131fd9f6SGuangliang Zhao 	bool ro;
56677f33c03SJosh Durgin 	bool ro_changed = false;
567131fd9f6SGuangliang Zhao 
56877f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
569131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
570131fd9f6SGuangliang Zhao 		return -EFAULT;
571131fd9f6SGuangliang Zhao 
572131fd9f6SGuangliang Zhao 	ro = val ? true : false;
573131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
574131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
575131fd9f6SGuangliang Zhao 		return -EROFS;
576131fd9f6SGuangliang Zhao 
57777f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
57877f33c03SJosh Durgin 	/* prevent others open this device */
57977f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
58077f33c03SJosh Durgin 		ret = -EBUSY;
58177f33c03SJosh Durgin 		goto out;
582131fd9f6SGuangliang Zhao 	}
583131fd9f6SGuangliang Zhao 
58477f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
58577f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
58677f33c03SJosh Durgin 		ro_changed = true;
58777f33c03SJosh Durgin 	}
58877f33c03SJosh Durgin 
58977f33c03SJosh Durgin out:
59077f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
59177f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
59277f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
59377f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
59477f33c03SJosh Durgin 
59577f33c03SJosh Durgin 	return ret;
596131fd9f6SGuangliang Zhao }
597131fd9f6SGuangliang Zhao 
598131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
599131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
600131fd9f6SGuangliang Zhao {
601131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602131fd9f6SGuangliang Zhao 	int ret = 0;
603131fd9f6SGuangliang Zhao 
604131fd9f6SGuangliang Zhao 	switch (cmd) {
605131fd9f6SGuangliang Zhao 	case BLKROSET:
606131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
607131fd9f6SGuangliang Zhao 		break;
608131fd9f6SGuangliang Zhao 	default:
609131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
610131fd9f6SGuangliang Zhao 	}
611131fd9f6SGuangliang Zhao 
612131fd9f6SGuangliang Zhao 	return ret;
613131fd9f6SGuangliang Zhao }
614131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards unchanged to rbd_ioctl() */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
622131fd9f6SGuangliang Zhao 
/*
 * Block device operations for rbd: open/release plus the ioctl
 * handlers (the compat variant only when CONFIG_COMPAT is set).
 */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};
632602adf40SYehuda Sadeh 
633602adf40SYehuda Sadeh /*
6347262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
635cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
636602adf40SYehuda Sadeh  */
637f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
638602adf40SYehuda Sadeh {
639602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
640602adf40SYehuda Sadeh 	int ret = -ENOMEM;
641602adf40SYehuda Sadeh 
64237206ee5SAlex Elder 	dout("%s:\n", __func__);
643602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
644602adf40SYehuda Sadeh 	if (!rbdc)
645602adf40SYehuda Sadeh 		goto out_opt;
646602adf40SYehuda Sadeh 
647602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
648602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
649602adf40SYehuda Sadeh 
65043ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
651602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
65208f75463SAlex Elder 		goto out_rbdc;
65343ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
654602adf40SYehuda Sadeh 
655602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
656602adf40SYehuda Sadeh 	if (ret < 0)
65708f75463SAlex Elder 		goto out_client;
658602adf40SYehuda Sadeh 
659432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
660602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
661432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
662602adf40SYehuda Sadeh 
66337206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
664bc534d86SAlex Elder 
665602adf40SYehuda Sadeh 	return rbdc;
66608f75463SAlex Elder out_client:
667602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
66808f75463SAlex Elder out_rbdc:
669602adf40SYehuda Sadeh 	kfree(rbdc);
670602adf40SYehuda Sadeh out_opt:
67143ae4701SAlex Elder 	if (ceph_opts)
67243ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
67337206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
67437206ee5SAlex Elder 
67528f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
676602adf40SYehuda Sadeh }
677602adf40SYehuda Sadeh 
/*
 * Take an additional reference on rbdc and return it, so the call
 * can be used directly in an assignment or return statement.
 */
static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}
6842f82ee54SAlex Elder 
685602adf40SYehuda Sadeh /*
6861f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
6871f7ba331SAlex Elder  * found, bump its reference count.
688602adf40SYehuda Sadeh  */
6891f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
690602adf40SYehuda Sadeh {
691602adf40SYehuda Sadeh 	struct rbd_client *client_node;
6921f7ba331SAlex Elder 	bool found = false;
693602adf40SYehuda Sadeh 
69443ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
695602adf40SYehuda Sadeh 		return NULL;
696602adf40SYehuda Sadeh 
6971f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
6981f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
6991f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7002f82ee54SAlex Elder 			__rbd_get_client(client_node);
7012f82ee54SAlex Elder 
7021f7ba331SAlex Elder 			found = true;
7031f7ba331SAlex Elder 			break;
7041f7ba331SAlex Elder 		}
7051f7ba331SAlex Elder 	}
7061f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7071f7ba331SAlex Elder 
7081f7ba331SAlex Elder 	return found ? client_node : NULL;
709602adf40SYehuda Sadeh }
710602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Tokens are grouped by argument type.  The Opt_last_* values are
 * boundary markers, not options: parse_rbd_opts_token() compares a
 * token against them to tell which group it belongs to.  No int or
 * string options are defined at present, so those groups are empty.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
72459c2be1eSYehuda Sadeh 
/*
 * Option strings accepted by parse_rbd_opts_token() and the token
 * each one maps to.  The {-1, NULL} entry ends the table.
 */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
73559c2be1eSYehuda Sadeh 
/* rbd-specific options filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;	/* set by "ro"/"read_only", cleared by "rw"/"read_write" */
};

/* NOTE(review): presumably the initial read_only value before parsing; its use is not visible here */
#define RBD_READ_ONLY_DEFAULT	false
74198571b5aSAlex Elder 
74259c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
74359c2be1eSYehuda Sadeh {
74443ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
74559c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
74659c2be1eSYehuda Sadeh 	int token, intval, ret;
74759c2be1eSYehuda Sadeh 
74843ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
74959c2be1eSYehuda Sadeh 	if (token < 0)
75059c2be1eSYehuda Sadeh 		return -EINVAL;
75159c2be1eSYehuda Sadeh 
75259c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
75359c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
75459c2be1eSYehuda Sadeh 		if (ret < 0) {
75559c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
75659c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
75759c2be1eSYehuda Sadeh 			return ret;
75859c2be1eSYehuda Sadeh 		}
75959c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
76059c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
76159c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
76259c2be1eSYehuda Sadeh 		     argstr[0].from);
763cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
764cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
76559c2be1eSYehuda Sadeh 	} else {
76659c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
76759c2be1eSYehuda Sadeh 	}
76859c2be1eSYehuda Sadeh 
76959c2be1eSYehuda Sadeh 	switch (token) {
770cc0538b6SAlex Elder 	case Opt_read_only:
771cc0538b6SAlex Elder 		rbd_opts->read_only = true;
772cc0538b6SAlex Elder 		break;
773cc0538b6SAlex Elder 	case Opt_read_write:
774cc0538b6SAlex Elder 		rbd_opts->read_only = false;
775cc0538b6SAlex Elder 		break;
77659c2be1eSYehuda Sadeh 	default:
777aafb230eSAlex Elder 		rbd_assert(false);
778aafb230eSAlex Elder 		break;
77959c2be1eSYehuda Sadeh 	}
78059c2be1eSYehuda Sadeh 	return 0;
78159c2be1eSYehuda Sadeh }
78259c2be1eSYehuda Sadeh 
78359c2be1eSYehuda Sadeh /*
784602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
7857262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
7867262cfcaSAlex Elder  * function.
787602adf40SYehuda Sadeh  */
7889d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
789602adf40SYehuda Sadeh {
790f8c38929SAlex Elder 	struct rbd_client *rbdc;
79159c2be1eSYehuda Sadeh 
792cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
7931f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
7949d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
79543ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
7969d3997fdSAlex Elder 	else
797f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
798cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
799d720bcb0SAlex Elder 
8009d3997fdSAlex Elder 	return rbdc;
801602adf40SYehuda Sadeh }
802602adf40SYehuda Sadeh 
/*
 * Destroy ceph client
 *
 * kref release callback used by rbd_put_client().  Removes the
 * client from rbd_client_list, destroys the underlying ceph client
 * and frees the rbd_client itself.
 *
 * rbd_client_list_lock is taken here, so callers must not already
 * hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
820602adf40SYehuda Sadeh 
821602adf40SYehuda Sadeh /*
822602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
823602adf40SYehuda Sadeh  * it.
824602adf40SYehuda Sadeh  */
8259d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
826602adf40SYehuda Sadeh {
827c53d5893SAlex Elder 	if (rbdc)
8289d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
829602adf40SYehuda Sadeh }
830602adf40SYehuda Sadeh 
831a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
832a30b71b9SAlex Elder {
833a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
834a30b71b9SAlex Elder }
835a30b71b9SAlex Elder 
8368e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
8378e94af8eSAlex Elder {
838103a150fSAlex Elder 	size_t size;
839103a150fSAlex Elder 	u32 snap_count;
840103a150fSAlex Elder 
841103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
842103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
843103a150fSAlex Elder 		return false;
844103a150fSAlex Elder 
845db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
846db2388b6SAlex Elder 
847db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
848db2388b6SAlex Elder 		return false;
849db2388b6SAlex Elder 
850db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
851db2388b6SAlex Elder 
852db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
853db2388b6SAlex Elder 		return false;
854db2388b6SAlex Elder 
855103a150fSAlex Elder 	/*
856103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
857103a150fSAlex Elder 	 * that limits the number of snapshots.
858103a150fSAlex Elder 	 */
859103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
860103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
861103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
862103a150fSAlex Elder 		return false;
863103a150fSAlex Elder 
864103a150fSAlex Elder 	/*
865103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
866103a150fSAlex Elder 	 * header must also be representable in a size_t.
867103a150fSAlex Elder 	 */
868103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
869103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
870103a150fSAlex Elder 		return false;
871103a150fSAlex Elder 
872103a150fSAlex Elder 	return true;
8738e94af8eSAlex Elder }
8748e94af8eSAlex Elder 
875602adf40SYehuda Sadeh /*
876bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
877bb23e37aSAlex Elder  * on-disk header.
878602adf40SYehuda Sadeh  */
879662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
8804156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
881602adf40SYehuda Sadeh {
882662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
883bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
884bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
885bb23e37aSAlex Elder 	char *object_prefix = NULL;
886bb23e37aSAlex Elder 	char *snap_names = NULL;
887bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
888ccece235SAlex Elder 	u32 snap_count;
889d2bb24e5SAlex Elder 	size_t size;
890bb23e37aSAlex Elder 	int ret = -ENOMEM;
891621901d6SAlex Elder 	u32 i;
892602adf40SYehuda Sadeh 
893bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
894103a150fSAlex Elder 
895bb23e37aSAlex Elder 	if (first_time) {
896bb23e37aSAlex Elder 		size_t len;
897bb23e37aSAlex Elder 
898bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
899bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
900bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
901bb23e37aSAlex Elder 		if (!object_prefix)
902602adf40SYehuda Sadeh 			return -ENOMEM;
903bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
904bb23e37aSAlex Elder 		object_prefix[len] = '\0';
905bb23e37aSAlex Elder 	}
90600f1f36fSAlex Elder 
907bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
908d2bb24e5SAlex Elder 
909602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
910bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
911bb23e37aSAlex Elder 	if (!snapc)
912bb23e37aSAlex Elder 		goto out_err;
913bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
914602adf40SYehuda Sadeh 	if (snap_count) {
915bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
916f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
917f785cc1dSAlex Elder 
918bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
919621901d6SAlex Elder 
920f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
921bb23e37aSAlex Elder 			goto out_2big;
922bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
923bb23e37aSAlex Elder 		if (!snap_names)
924602adf40SYehuda Sadeh 			goto out_err;
925bb23e37aSAlex Elder 
926bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
927bb23e37aSAlex Elder 
928bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
929bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
930bb23e37aSAlex Elder 		if (!snap_sizes)
931bb23e37aSAlex Elder 			goto out_err;
932bb23e37aSAlex Elder 
933f785cc1dSAlex Elder 		/*
934bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
935bb23e37aSAlex Elder 		 * and size.
936bb23e37aSAlex Elder 		 *
93799a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
938bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
939f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
940f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
941f785cc1dSAlex Elder 		 */
942bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
943bb23e37aSAlex Elder 		snaps = ondisk->snaps;
944bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
945bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
946bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
947bb23e37aSAlex Elder 		}
948602adf40SYehuda Sadeh 	}
949849b4260SAlex Elder 
950bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
951bb23e37aSAlex Elder 
952bb23e37aSAlex Elder 	if (first_time) {
953bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
954602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
955602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
956602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
957bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
958bb23e37aSAlex Elder 		header->stripe_unit = 0;
959bb23e37aSAlex Elder 		header->stripe_count = 0;
960bb23e37aSAlex Elder 		header->features = 0;
961662518b1SAlex Elder 	} else {
962662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
963662518b1SAlex Elder 		kfree(header->snap_names);
964662518b1SAlex Elder 		kfree(header->snap_sizes);
965bb23e37aSAlex Elder 	}
9666a52325fSAlex Elder 
967bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
968621901d6SAlex Elder 
969f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
970bb23e37aSAlex Elder 	header->snapc = snapc;
971bb23e37aSAlex Elder 	header->snap_names = snap_names;
972bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
973468521c1SAlex Elder 
974662518b1SAlex Elder 	/* Make sure mapping size is consistent with header info */
975662518b1SAlex Elder 
976662518b1SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
977662518b1SAlex Elder 		if (rbd_dev->mapping.size != header->image_size)
978662518b1SAlex Elder 			rbd_dev->mapping.size = header->image_size;
979662518b1SAlex Elder 
980602adf40SYehuda Sadeh 	return 0;
981bb23e37aSAlex Elder out_2big:
982bb23e37aSAlex Elder 	ret = -EIO;
9836a52325fSAlex Elder out_err:
984bb23e37aSAlex Elder 	kfree(snap_sizes);
985bb23e37aSAlex Elder 	kfree(snap_names);
986bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
987bb23e37aSAlex Elder 	kfree(object_prefix);
988ccece235SAlex Elder 
989bb23e37aSAlex Elder 	return ret;
990602adf40SYehuda Sadeh }
991602adf40SYehuda Sadeh 
9929682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
9939682fc6dSAlex Elder {
9949682fc6dSAlex Elder 	const char *snap_name;
9959682fc6dSAlex Elder 
9969682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
9979682fc6dSAlex Elder 
9989682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
9999682fc6dSAlex Elder 
10009682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10019682fc6dSAlex Elder 	while (which--)
10029682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10039682fc6dSAlex Elder 
10049682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10059682fc6dSAlex Elder }
10069682fc6dSAlex Elder 
100730d1cff8SAlex Elder /*
100830d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
100930d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
101030d1cff8SAlex Elder  */
101130d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
101230d1cff8SAlex Elder {
101330d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
101430d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
101530d1cff8SAlex Elder 
101630d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
101730d1cff8SAlex Elder 		return 1;
101830d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
101930d1cff8SAlex Elder }
102030d1cff8SAlex Elder 
102130d1cff8SAlex Elder /*
102230d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
102330d1cff8SAlex Elder  * present.
102430d1cff8SAlex Elder  *
102530d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
102630d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
102730d1cff8SAlex Elder  *
102830d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
102930d1cff8SAlex Elder  * reverse order, highest snapshot id first.
103030d1cff8SAlex Elder  */
10319682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
10329682fc6dSAlex Elder {
10339682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
103430d1cff8SAlex Elder 	u64 *found;
10359682fc6dSAlex Elder 
103630d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
103730d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
10389682fc6dSAlex Elder 
103930d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
10409682fc6dSAlex Elder }
10419682fc6dSAlex Elder 
10422ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
10432ad3d716SAlex Elder 					u64 snap_id)
104454cac61fSAlex Elder {
104554cac61fSAlex Elder 	u32 which;
1046da6a6b63SJosh Durgin 	const char *snap_name;
104754cac61fSAlex Elder 
104854cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
104954cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1050da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
105154cac61fSAlex Elder 
1052da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1053da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
105454cac61fSAlex Elder }
105554cac61fSAlex Elder 
10569e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
10579e15b77dSAlex Elder {
10589e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
10599e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
10609e15b77dSAlex Elder 
106154cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
106254cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
106354cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
10649e15b77dSAlex Elder 
106554cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10669e15b77dSAlex Elder }
10679e15b77dSAlex Elder 
10682ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10692ad3d716SAlex Elder 				u64 *snap_size)
1070602adf40SYehuda Sadeh {
10712ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10722ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10732ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
10742ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10752ad3d716SAlex Elder 		u32 which;
107600f1f36fSAlex Elder 
10772ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
10782ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
10792ad3d716SAlex Elder 			return -ENOENT;
108000f1f36fSAlex Elder 
10812ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
10822ad3d716SAlex Elder 	} else {
10832ad3d716SAlex Elder 		u64 size = 0;
10842ad3d716SAlex Elder 		int ret;
10852ad3d716SAlex Elder 
10862ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
10872ad3d716SAlex Elder 		if (ret)
10882ad3d716SAlex Elder 			return ret;
10892ad3d716SAlex Elder 
10902ad3d716SAlex Elder 		*snap_size = size;
10912ad3d716SAlex Elder 	}
10922ad3d716SAlex Elder 	return 0;
10932ad3d716SAlex Elder }
10942ad3d716SAlex Elder 
10952ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
10962ad3d716SAlex Elder 			u64 *snap_features)
10972ad3d716SAlex Elder {
10982ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10992ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11002ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11012ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11022ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11032ad3d716SAlex Elder 	} else {
11042ad3d716SAlex Elder 		u64 features = 0;
11052ad3d716SAlex Elder 		int ret;
11062ad3d716SAlex Elder 
11072ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11082ad3d716SAlex Elder 		if (ret)
11092ad3d716SAlex Elder 			return ret;
11102ad3d716SAlex Elder 
11112ad3d716SAlex Elder 		*snap_features = features;
11122ad3d716SAlex Elder 	}
11132ad3d716SAlex Elder 	return 0;
111400f1f36fSAlex Elder }
1115602adf40SYehuda Sadeh 
1116d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1117602adf40SYehuda Sadeh {
11188f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
11192ad3d716SAlex Elder 	u64 size = 0;
11202ad3d716SAlex Elder 	u64 features = 0;
11212ad3d716SAlex Elder 	int ret;
11228b0241f8SAlex Elder 
11232ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
11242ad3d716SAlex Elder 	if (ret)
11252ad3d716SAlex Elder 		return ret;
11262ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
11272ad3d716SAlex Elder 	if (ret)
11282ad3d716SAlex Elder 		return ret;
11292ad3d716SAlex Elder 
11302ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
11312ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
11322ad3d716SAlex Elder 
11338b0241f8SAlex Elder 	return 0;
1134602adf40SYehuda Sadeh }
1135602adf40SYehuda Sadeh 
/* Reset the mapping's size and features to zero */
static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}
1141200a6a8bSAlex Elder 
/*
 * Give a name obtained from rbd_segment_name() back to
 * rbd_segment_name_cache.
 */
static void rbd_segment_name_free(const char *name)
{
	/* The explicit cast here is needed to drop the const qualifier */

	kmem_cache_free(rbd_segment_name_cache, (void *)name);
}
11487d5079aaSHimangi Saraogi 
114998571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1150602adf40SYehuda Sadeh {
115165ccfe21SAlex Elder 	char *name;
115265ccfe21SAlex Elder 	u64 segment;
115365ccfe21SAlex Elder 	int ret;
11543a96d5cdSJosh Durgin 	char *name_format;
1155602adf40SYehuda Sadeh 
115678c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
115765ccfe21SAlex Elder 	if (!name)
115865ccfe21SAlex Elder 		return NULL;
115965ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
11603a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
11613a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
11623a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
11632d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
116465ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
11652d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
116665ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
116765ccfe21SAlex Elder 			segment, ret);
11687d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
116965ccfe21SAlex Elder 		name = NULL;
117065ccfe21SAlex Elder 	}
1171602adf40SYehuda Sadeh 
117265ccfe21SAlex Elder 	return name;
117365ccfe21SAlex Elder }
1174602adf40SYehuda Sadeh 
117565ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
117665ccfe21SAlex Elder {
117765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1178602adf40SYehuda Sadeh 
117965ccfe21SAlex Elder 	return offset & (segment_size - 1);
118065ccfe21SAlex Elder }
118165ccfe21SAlex Elder 
118265ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
118365ccfe21SAlex Elder 				u64 offset, u64 length)
118465ccfe21SAlex Elder {
118565ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
118665ccfe21SAlex Elder 
118765ccfe21SAlex Elder 	offset &= segment_size - 1;
118865ccfe21SAlex Elder 
1189aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
119065ccfe21SAlex Elder 	if (offset + length > segment_size)
119165ccfe21SAlex Elder 		length = segment_size - offset;
119265ccfe21SAlex Elder 
119365ccfe21SAlex Elder 	return length;
1194602adf40SYehuda Sadeh }
1195602adf40SYehuda Sadeh 
1196602adf40SYehuda Sadeh /*
1197029bcbd8SJosh Durgin  * returns the size of an object in the image
1198029bcbd8SJosh Durgin  */
1199029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1200029bcbd8SJosh Durgin {
1201029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1202029bcbd8SJosh Durgin }
1203029bcbd8SJosh Durgin 
1204029bcbd8SJosh Durgin /*
1205602adf40SYehuda Sadeh  * bio helpers
1206602adf40SYehuda Sadeh  */
1207602adf40SYehuda Sadeh 
1208602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1209602adf40SYehuda Sadeh {
1210602adf40SYehuda Sadeh 	struct bio *tmp;
1211602adf40SYehuda Sadeh 
1212602adf40SYehuda Sadeh 	while (chain) {
1213602adf40SYehuda Sadeh 		tmp = chain;
1214602adf40SYehuda Sadeh 		chain = chain->bi_next;
1215602adf40SYehuda Sadeh 		bio_put(tmp);
1216602adf40SYehuda Sadeh 	}
1217602adf40SYehuda Sadeh }
1218602adf40SYehuda Sadeh 
1219602adf40SYehuda Sadeh /*
1220602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1221602adf40SYehuda Sadeh  */
1222602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1223602adf40SYehuda Sadeh {
12247988613bSKent Overstreet 	struct bio_vec bv;
12257988613bSKent Overstreet 	struct bvec_iter iter;
1226602adf40SYehuda Sadeh 	unsigned long flags;
1227602adf40SYehuda Sadeh 	void *buf;
1228602adf40SYehuda Sadeh 	int pos = 0;
1229602adf40SYehuda Sadeh 
1230602adf40SYehuda Sadeh 	while (chain) {
12317988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
12327988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1233602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
12347988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1235602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
12367988613bSKent Overstreet 				       bv.bv_len - remainder);
12377988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
123885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1239602adf40SYehuda Sadeh 			}
12407988613bSKent Overstreet 			pos += bv.bv_len;
1241602adf40SYehuda Sadeh 		}
1242602adf40SYehuda Sadeh 
1243602adf40SYehuda Sadeh 		chain = chain->bi_next;
1244602adf40SYehuda Sadeh 	}
1245602adf40SYehuda Sadeh }
1246602adf40SYehuda Sadeh 
1247602adf40SYehuda Sadeh /*
1248b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1249b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1250b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1251b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1252b9434c5bSAlex Elder  */
1253b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1254b9434c5bSAlex Elder {
1255b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1256b9434c5bSAlex Elder 
1257b9434c5bSAlex Elder 	rbd_assert(end > offset);
1258b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1259b9434c5bSAlex Elder 	while (offset < end) {
1260b9434c5bSAlex Elder 		size_t page_offset;
1261b9434c5bSAlex Elder 		size_t length;
1262b9434c5bSAlex Elder 		unsigned long flags;
1263b9434c5bSAlex Elder 		void *kaddr;
1264b9434c5bSAlex Elder 
1265491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1266491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1267b9434c5bSAlex Elder 		local_irq_save(flags);
1268b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1269b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1270e2156054SAlex Elder 		flush_dcache_page(*page);
1271b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1272b9434c5bSAlex Elder 		local_irq_restore(flags);
1273b9434c5bSAlex Elder 
1274b9434c5bSAlex Elder 		offset += length;
1275b9434c5bSAlex Elder 		page++;
1276b9434c5bSAlex Elder 	}
1277b9434c5bSAlex Elder }
1278b9434c5bSAlex Elder 
1279b9434c5bSAlex Elder /*
1280f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1281f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1282602adf40SYehuda Sadeh  */
1283f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1284f7760dadSAlex Elder 					unsigned int offset,
1285f7760dadSAlex Elder 					unsigned int len,
1286f7760dadSAlex Elder 					gfp_t gfpmask)
1287602adf40SYehuda Sadeh {
1288f7760dadSAlex Elder 	struct bio *bio;
1289602adf40SYehuda Sadeh 
12905341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1291f7760dadSAlex Elder 	if (!bio)
1292f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1293f7760dadSAlex Elder 
12945341a627SKent Overstreet 	bio_advance(bio, offset);
12954f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1296602adf40SYehuda Sadeh 
1297f7760dadSAlex Elder 	return bio;
1298602adf40SYehuda Sadeh }
1299602adf40SYehuda Sadeh 
1300f7760dadSAlex Elder /*
1301f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1302f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1303f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1304f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1305f7760dadSAlex Elder  *
1306f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1307f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1308f7760dadSAlex Elder  * the start of data to be cloned is located.
1309f7760dadSAlex Elder  *
1310f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1311f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1312f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1313f7760dadSAlex Elder  */
1314f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1315f7760dadSAlex Elder 					unsigned int *offset,
1316f7760dadSAlex Elder 					unsigned int len,
1317f7760dadSAlex Elder 					gfp_t gfpmask)
1318f7760dadSAlex Elder {
1319f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1320f7760dadSAlex Elder 	unsigned int off = *offset;
1321f7760dadSAlex Elder 	struct bio *chain = NULL;
1322f7760dadSAlex Elder 	struct bio **end;
1323602adf40SYehuda Sadeh 
1324f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1325602adf40SYehuda Sadeh 
13264f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1327f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1328602adf40SYehuda Sadeh 
1329f7760dadSAlex Elder 	end = &chain;
1330f7760dadSAlex Elder 	while (len) {
1331f7760dadSAlex Elder 		unsigned int bi_size;
1332f7760dadSAlex Elder 		struct bio *bio;
1333f7760dadSAlex Elder 
1334f5400b7aSAlex Elder 		if (!bi) {
1335f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1336f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1337f5400b7aSAlex Elder 		}
13384f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1339f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1340f7760dadSAlex Elder 		if (!bio)
1341f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1342f7760dadSAlex Elder 
1343f7760dadSAlex Elder 		*end = bio;
1344f7760dadSAlex Elder 		end = &bio->bi_next;
1345f7760dadSAlex Elder 
1346f7760dadSAlex Elder 		off += bi_size;
13474f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1348f7760dadSAlex Elder 			bi = bi->bi_next;
1349f7760dadSAlex Elder 			off = 0;
1350f7760dadSAlex Elder 		}
1351f7760dadSAlex Elder 		len -= bi_size;
1352f7760dadSAlex Elder 	}
1353f7760dadSAlex Elder 	*bio_src = bi;
1354f7760dadSAlex Elder 	*offset = off;
1355f7760dadSAlex Elder 
1356f7760dadSAlex Elder 	return chain;
1357f7760dadSAlex Elder out_err:
1358f7760dadSAlex Elder 	bio_chain_put(chain);
1359f7760dadSAlex Elder 
1360602adf40SYehuda Sadeh 	return NULL;
1361602adf40SYehuda Sadeh }
1362602adf40SYehuda Sadeh 
1363926f9b3fSAlex Elder /*
1364926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1365926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1366926f9b3fSAlex Elder  * again.
1367926f9b3fSAlex Elder  */
13686365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13696365d33aSAlex Elder {
13706365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13716365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13726365d33aSAlex Elder 
137357acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13746365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
13756365d33aSAlex Elder 			obj_request);
13766365d33aSAlex Elder 	}
13776365d33aSAlex Elder }
13786365d33aSAlex Elder 
13796365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
13806365d33aSAlex Elder {
13816365d33aSAlex Elder 	smp_mb();
13826365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
13836365d33aSAlex Elder }
13846365d33aSAlex Elder 
138557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
138657acbaa7SAlex Elder {
138757acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
138857acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
138957acbaa7SAlex Elder 
139057acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
139157acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
139257acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
139357acbaa7SAlex Elder 			obj_request);
139457acbaa7SAlex Elder 	}
139557acbaa7SAlex Elder }
139657acbaa7SAlex Elder 
139757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
139857acbaa7SAlex Elder {
139957acbaa7SAlex Elder 	smp_mb();
140057acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
140157acbaa7SAlex Elder }
140257acbaa7SAlex Elder 
14035679c59fSAlex Elder /*
14045679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14055679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14065679c59fSAlex Elder  *
14075679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14085679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14095679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14105679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14115679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14125679c59fSAlex Elder  */
14135679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14145679c59fSAlex Elder 				bool exists)
14155679c59fSAlex Elder {
14165679c59fSAlex Elder 	if (exists)
14175679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14185679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14195679c59fSAlex Elder 	smp_mb();
14205679c59fSAlex Elder }
14215679c59fSAlex Elder 
14225679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14235679c59fSAlex Elder {
14245679c59fSAlex Elder 	smp_mb();
14255679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14265679c59fSAlex Elder }
14275679c59fSAlex Elder 
14285679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14295679c59fSAlex Elder {
14305679c59fSAlex Elder 	smp_mb();
14315679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14325679c59fSAlex Elder }
14335679c59fSAlex Elder 
14349638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
14359638556aSIlya Dryomov {
14369638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
14379638556aSIlya Dryomov 
14389638556aSIlya Dryomov 	return obj_request->img_offset <
14399638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
14409638556aSIlya Dryomov }
14419638556aSIlya Dryomov 
1442bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1443bf0d5f50SAlex Elder {
144437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
144537206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1446bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1447bf0d5f50SAlex Elder }
1448bf0d5f50SAlex Elder 
1449bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1450bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1451bf0d5f50SAlex Elder {
1452bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
145337206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
145437206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1455bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1456bf0d5f50SAlex Elder }
1457bf0d5f50SAlex Elder 
14580f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
14590f2d5be7SAlex Elder {
14600f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14610f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
14620f2d5be7SAlex Elder 	kref_get(&img_request->kref);
14630f2d5be7SAlex Elder }
14640f2d5be7SAlex Elder 
1465e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1466e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1467bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1468bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1469bf0d5f50SAlex Elder {
1470bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
147137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
147237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1473e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1474e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1475e93f3152SAlex Elder 	else
1476bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1477bf0d5f50SAlex Elder }
1478bf0d5f50SAlex Elder 
1479bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1480bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1481bf0d5f50SAlex Elder {
148225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
148325dcf954SAlex Elder 
1484b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1485bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
148625dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
14876365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
14886365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1489bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
149025dcf954SAlex Elder 	img_request->obj_request_count++;
149125dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
149237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
149337206ee5SAlex Elder 		obj_request->which);
1494bf0d5f50SAlex Elder }
1495bf0d5f50SAlex Elder 
1496bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1497bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1498bf0d5f50SAlex Elder {
1499bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
150025dcf954SAlex Elder 
150137206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
150237206ee5SAlex Elder 		obj_request->which);
1503bf0d5f50SAlex Elder 	list_del(&obj_request->links);
150425dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
150525dcf954SAlex Elder 	img_request->obj_request_count--;
150625dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
150725dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15086365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1509bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1510bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
151125dcf954SAlex Elder 	obj_request->callback = NULL;
1512bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1513bf0d5f50SAlex Elder }
1514bf0d5f50SAlex Elder 
1515bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1516bf0d5f50SAlex Elder {
1517bf0d5f50SAlex Elder 	switch (type) {
15189969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1519bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1520788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1521bf0d5f50SAlex Elder 		return true;
1522bf0d5f50SAlex Elder 	default:
1523bf0d5f50SAlex Elder 		return false;
1524bf0d5f50SAlex Elder 	}
1525bf0d5f50SAlex Elder }
1526bf0d5f50SAlex Elder 
1527bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1528bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1529bf0d5f50SAlex Elder {
153071c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1531bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1532bf0d5f50SAlex Elder }
1533bf0d5f50SAlex Elder 
153471c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
153571c20a06SIlya Dryomov {
153671c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
153771c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
153871c20a06SIlya Dryomov }
153971c20a06SIlya Dryomov 
154071c20a06SIlya Dryomov /*
154171c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
154271c20a06SIlya Dryomov  * underlying osd request.
154371c20a06SIlya Dryomov  */
154471c20a06SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
154571c20a06SIlya Dryomov {
154671c20a06SIlya Dryomov 	int ret;
154771c20a06SIlya Dryomov 
154871c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
154971c20a06SIlya Dryomov 
155071c20a06SIlya Dryomov 	ret = wait_for_completion_interruptible(&obj_request->completion);
155171c20a06SIlya Dryomov 	if (ret < 0) {
155271c20a06SIlya Dryomov 		dout("%s %p interrupted\n", __func__, obj_request);
155371c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
155471c20a06SIlya Dryomov 		return ret;
155571c20a06SIlya Dryomov 	}
155671c20a06SIlya Dryomov 
155771c20a06SIlya Dryomov 	dout("%s %p done\n", __func__, obj_request);
155871c20a06SIlya Dryomov 	return 0;
155971c20a06SIlya Dryomov }
156071c20a06SIlya Dryomov 
1561bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1562bf0d5f50SAlex Elder {
156355f27e09SAlex Elder 
156437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
156555f27e09SAlex Elder 
156655f27e09SAlex Elder 	/*
156755f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
156855f27e09SAlex Elder 	 * count for the image request.  We could instead use
156955f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
157055f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
157155f27e09SAlex Elder 	 */
157255f27e09SAlex Elder 	if (!img_request->result) {
157355f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
157455f27e09SAlex Elder 		u64 xferred = 0;
157555f27e09SAlex Elder 
157655f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
157755f27e09SAlex Elder 			xferred += obj_request->xferred;
157855f27e09SAlex Elder 		img_request->xferred = xferred;
157955f27e09SAlex Elder 	}
158055f27e09SAlex Elder 
1581bf0d5f50SAlex Elder 	if (img_request->callback)
1582bf0d5f50SAlex Elder 		img_request->callback(img_request);
1583bf0d5f50SAlex Elder 	else
1584bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1585bf0d5f50SAlex Elder }
1586bf0d5f50SAlex Elder 
15870c425248SAlex Elder /*
15880c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
15890c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
15900c425248SAlex Elder  * and currently never change thereafter.
15910c425248SAlex Elder  */
15920c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
15930c425248SAlex Elder {
15940c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
15950c425248SAlex Elder 	smp_mb();
15960c425248SAlex Elder }
15970c425248SAlex Elder 
15980c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
15990c425248SAlex Elder {
16000c425248SAlex Elder 	smp_mb();
16010c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16020c425248SAlex Elder }
16030c425248SAlex Elder 
16049849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
16059849e986SAlex Elder {
16069849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
16079849e986SAlex Elder 	smp_mb();
16089849e986SAlex Elder }
16099849e986SAlex Elder 
1610e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1611e93f3152SAlex Elder {
1612e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1613e93f3152SAlex Elder 	smp_mb();
1614e93f3152SAlex Elder }
1615e93f3152SAlex Elder 
16169849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
16179849e986SAlex Elder {
16189849e986SAlex Elder 	smp_mb();
16199849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
16209849e986SAlex Elder }
16219849e986SAlex Elder 
1622d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1623d0b2e944SAlex Elder {
1624d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1625d0b2e944SAlex Elder 	smp_mb();
1626d0b2e944SAlex Elder }
1627d0b2e944SAlex Elder 
1628a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1629a2acd00eSAlex Elder {
1630a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1631a2acd00eSAlex Elder 	smp_mb();
1632a2acd00eSAlex Elder }
1633a2acd00eSAlex Elder 
1634d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1635d0b2e944SAlex Elder {
1636d0b2e944SAlex Elder 	smp_mb();
1637d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1638d0b2e944SAlex Elder }
1639d0b2e944SAlex Elder 
16406e2a4505SAlex Elder static void
16416e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
16426e2a4505SAlex Elder {
1643b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1644b9434c5bSAlex Elder 	u64 length = obj_request->length;
1645b9434c5bSAlex Elder 
16466e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16476e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1648b9434c5bSAlex Elder 		xferred, length);
16496e2a4505SAlex Elder 	/*
165017c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
165117c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
165217c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
165317c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
165417c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
165517c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
16566e2a4505SAlex Elder 	 */
1657b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
16586e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1659b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
16606e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1661b9434c5bSAlex Elder 		else
1662b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
16636e2a4505SAlex Elder 		obj_request->result = 0;
1664b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1665b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1666b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1667b9434c5bSAlex Elder 		else
1668b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
16696e2a4505SAlex Elder 	}
167017c1cc1dSJosh Durgin 	obj_request->xferred = length;
16716e2a4505SAlex Elder 	obj_request_done_set(obj_request);
16726e2a4505SAlex Elder }
16736e2a4505SAlex Elder 
1674bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1675bf0d5f50SAlex Elder {
167637206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
167737206ee5SAlex Elder 		obj_request->callback);
1678bf0d5f50SAlex Elder 	if (obj_request->callback)
1679bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1680788e2df3SAlex Elder 	else
1681788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1682bf0d5f50SAlex Elder }
1683bf0d5f50SAlex Elder 
/* Nothing to post-process for this request; just mark it done. */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
168939bf2c5dSAlex Elder 
1690c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1691bf0d5f50SAlex Elder {
169257acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1693a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
169457acbaa7SAlex Elder 	bool layered = false;
169557acbaa7SAlex Elder 
169657acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
169757acbaa7SAlex Elder 		img_request = obj_request->img_request;
169857acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1699a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
170057acbaa7SAlex Elder 	}
17018b3e1a56SAlex Elder 
17028b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17038b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
17048b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1705a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1706a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
17078b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
17088b3e1a56SAlex Elder 	else if (img_request)
17096e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
17106e2a4505SAlex Elder 	else
171107741308SAlex Elder 		obj_request_done_set(obj_request);
1712bf0d5f50SAlex Elder }
1713bf0d5f50SAlex Elder 
1714c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1715bf0d5f50SAlex Elder {
17161b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
17171b83bef2SSage Weil 		obj_request->result, obj_request->length);
17181b83bef2SSage Weil 	/*
17198b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
17208b3e1a56SAlex Elder 	 * it to our originally-requested length.
17211b83bef2SSage Weil 	 */
17221b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
172307741308SAlex Elder 	obj_request_done_set(obj_request);
1724bf0d5f50SAlex Elder }
1725bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat op.  A simple stat call needs no
 * post-processing, so the request is just marked done.  More is done
 * when the stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1735fbfab539SAlex Elder 
1736bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1737bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1738bf0d5f50SAlex Elder {
1739bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1740bf0d5f50SAlex Elder 	u16 opcode;
1741bf0d5f50SAlex Elder 
174237206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1743bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
174457acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
174557acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
174657acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
174757acbaa7SAlex Elder 	} else {
174857acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
174957acbaa7SAlex Elder 	}
1750bf0d5f50SAlex Elder 
17511b83bef2SSage Weil 	if (osd_req->r_result < 0)
17521b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1753bf0d5f50SAlex Elder 
17547cc69d42SIlya Dryomov 	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1755bf0d5f50SAlex Elder 
1756c47f9371SAlex Elder 	/*
1757c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1758c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1759c47f9371SAlex Elder 	 */
17601b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1761c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
17620ccd5926SIlya Dryomov 
176379528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1764bf0d5f50SAlex Elder 	switch (opcode) {
1765bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1766c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1767bf0d5f50SAlex Elder 		break;
17680ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
17690ccd5926SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
17700ccd5926SIlya Dryomov 		/* fall through */
1771bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1772c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1773bf0d5f50SAlex Elder 		break;
1774fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1775c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1776fbfab539SAlex Elder 		break;
177736be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1778b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
17799969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1780c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
17819969ebc5SAlex Elder 		break;
1782bf0d5f50SAlex Elder 	default:
1783bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1784bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1785bf0d5f50SAlex Elder 		break;
1786bf0d5f50SAlex Elder 	}
1787bf0d5f50SAlex Elder 
178807741308SAlex Elder 	if (obj_request_done_test(obj_request))
1789bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1790bf0d5f50SAlex Elder }
1791bf0d5f50SAlex Elder 
17929d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1793430c28c3SAlex Elder {
1794430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17958c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17969d4df01fSAlex Elder 	u64 snap_id;
1797430c28c3SAlex Elder 
17988c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1799430c28c3SAlex Elder 
18009d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
18018c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
18029d4df01fSAlex Elder 			NULL, snap_id, NULL);
18039d4df01fSAlex Elder }
18049d4df01fSAlex Elder 
18059d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
18069d4df01fSAlex Elder {
18079d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
18089d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
18099d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
18109d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
18119d4df01fSAlex Elder 
18129d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
18139d4df01fSAlex Elder 
18149d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
18159d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
18169d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1817430c28c3SAlex Elder }
1818430c28c3SAlex Elder 
18190ccd5926SIlya Dryomov /*
18200ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
18210ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
18220ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
18230ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
18240ccd5926SIlya Dryomov  */
1825bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1826bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1827bf0d5f50SAlex Elder 					bool write_request,
1828deb236b3SIlya Dryomov 					unsigned int num_ops,
1829430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1830bf0d5f50SAlex Elder {
1831bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1832bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1833bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1834bf0d5f50SAlex Elder 
18356365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
18366365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
18376365d33aSAlex Elder 
18380c425248SAlex Elder 		rbd_assert(write_request ==
18390c425248SAlex Elder 				img_request_write_test(img_request));
18400c425248SAlex Elder 		if (write_request)
1841bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1842bf0d5f50SAlex Elder 	}
1843bf0d5f50SAlex Elder 
18440ccd5926SIlya Dryomov 	rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
1845deb236b3SIlya Dryomov 
1846deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1847bf0d5f50SAlex Elder 
1848bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1849deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1850deb236b3SIlya Dryomov 					  GFP_ATOMIC);
1851bf0d5f50SAlex Elder 	if (!osd_req)
1852bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1853bf0d5f50SAlex Elder 
1854430c28c3SAlex Elder 	if (write_request)
1855bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1856430c28c3SAlex Elder 	else
1857bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1858bf0d5f50SAlex Elder 
1859bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1860bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1861bf0d5f50SAlex Elder 
18623c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
18633c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1864bf0d5f50SAlex Elder 
1865bf0d5f50SAlex Elder 	return osd_req;
1866bf0d5f50SAlex Elder }
1867bf0d5f50SAlex Elder 
18680eefd470SAlex Elder /*
18690eefd470SAlex Elder  * Create a copyup osd request based on the information in the
18700ccd5926SIlya Dryomov  * object request supplied.  A copyup request has three osd ops,
18710ccd5926SIlya Dryomov  * a copyup method call, a hint op, and a write op.
18720eefd470SAlex Elder  */
18730eefd470SAlex Elder static struct ceph_osd_request *
18740eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
18750eefd470SAlex Elder {
18760eefd470SAlex Elder 	struct rbd_img_request *img_request;
18770eefd470SAlex Elder 	struct ceph_snap_context *snapc;
18780eefd470SAlex Elder 	struct rbd_device *rbd_dev;
18790eefd470SAlex Elder 	struct ceph_osd_client *osdc;
18800eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
18810eefd470SAlex Elder 
18820eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18830eefd470SAlex Elder 	img_request = obj_request->img_request;
18840eefd470SAlex Elder 	rbd_assert(img_request);
18850eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
18860eefd470SAlex Elder 
18870ccd5926SIlya Dryomov 	/* Allocate and initialize the request, for the three ops */
18880eefd470SAlex Elder 
18890eefd470SAlex Elder 	snapc = img_request->snapc;
18900eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
18910eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
18920ccd5926SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
18930eefd470SAlex Elder 	if (!osd_req)
18940eefd470SAlex Elder 		return NULL;	/* ENOMEM */
18950eefd470SAlex Elder 
18960eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
18970eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
18980eefd470SAlex Elder 	osd_req->r_priv = obj_request;
18990eefd470SAlex Elder 
19003c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
19013c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
19020eefd470SAlex Elder 
19030eefd470SAlex Elder 	return osd_req;
19040eefd470SAlex Elder }
19050eefd470SAlex Elder 
19060eefd470SAlex Elder 
/* Drop the reference rbd holds on an osd request. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1911bf0d5f50SAlex Elder 
1912bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1913bf0d5f50SAlex Elder 
1914bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1915bf0d5f50SAlex Elder 						u64 offset, u64 length,
1916bf0d5f50SAlex Elder 						enum obj_request_type type)
1917bf0d5f50SAlex Elder {
1918bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1919bf0d5f50SAlex Elder 	size_t size;
1920bf0d5f50SAlex Elder 	char *name;
1921bf0d5f50SAlex Elder 
1922bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1923bf0d5f50SAlex Elder 
1924bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1925f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1926f907ad55SAlex Elder 	if (!name)
1927bf0d5f50SAlex Elder 		return NULL;
1928bf0d5f50SAlex Elder 
1929868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1930f907ad55SAlex Elder 	if (!obj_request) {
1931f907ad55SAlex Elder 		kfree(name);
1932f907ad55SAlex Elder 		return NULL;
1933f907ad55SAlex Elder 	}
1934f907ad55SAlex Elder 
1935bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1936bf0d5f50SAlex Elder 	obj_request->offset = offset;
1937bf0d5f50SAlex Elder 	obj_request->length = length;
1938926f9b3fSAlex Elder 	obj_request->flags = 0;
1939bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1940bf0d5f50SAlex Elder 	obj_request->type = type;
1941bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1942788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1943bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1944bf0d5f50SAlex Elder 
194537206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
194637206ee5SAlex Elder 		offset, length, (int)type, obj_request);
194737206ee5SAlex Elder 
1948bf0d5f50SAlex Elder 	return obj_request;
1949bf0d5f50SAlex Elder }
1950bf0d5f50SAlex Elder 
1951bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1952bf0d5f50SAlex Elder {
1953bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1954bf0d5f50SAlex Elder 
1955bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1956bf0d5f50SAlex Elder 
195737206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
195837206ee5SAlex Elder 
1959bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1960bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1961bf0d5f50SAlex Elder 
1962bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1963bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1964bf0d5f50SAlex Elder 
1965bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1966bf0d5f50SAlex Elder 	switch (obj_request->type) {
19679969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
19689969ebc5SAlex Elder 		break;		/* Nothing to do */
1969bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1970bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1971bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1972bf0d5f50SAlex Elder 		break;
1973788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1974788e2df3SAlex Elder 		if (obj_request->pages)
1975788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1976788e2df3SAlex Elder 						obj_request->page_count);
1977788e2df3SAlex Elder 		break;
1978bf0d5f50SAlex Elder 	}
1979bf0d5f50SAlex Elder 
1980f907ad55SAlex Elder 	kfree(obj_request->object_name);
1981868311b1SAlex Elder 	obj_request->object_name = NULL;
1982868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1983bf0d5f50SAlex Elder }
1984bf0d5f50SAlex Elder 
1985fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1986fb65d228SAlex Elder 
1987fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1988fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1989fb65d228SAlex Elder {
1990fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1991fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1992fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1993fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1994fb65d228SAlex Elder }
1995fb65d228SAlex Elder 
1996bf0d5f50SAlex Elder /*
1997a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1998a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1999a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2000a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2001a2acd00eSAlex Elder  */
2002a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2003a2acd00eSAlex Elder {
2004a2acd00eSAlex Elder 	int counter;
2005a2acd00eSAlex Elder 
2006a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2007a2acd00eSAlex Elder 		return;
2008a2acd00eSAlex Elder 
2009a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2010a2acd00eSAlex Elder 	if (counter > 0)
2011a2acd00eSAlex Elder 		return;
2012a2acd00eSAlex Elder 
2013a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2014a2acd00eSAlex Elder 
2015a2acd00eSAlex Elder 	if (!counter)
2016a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2017a2acd00eSAlex Elder 	else
2018a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference underflow\n");
2019a2acd00eSAlex Elder }
2020a2acd00eSAlex Elder 
2021a2acd00eSAlex Elder /*
2022a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2023a2acd00eSAlex Elder  * parent.
2024a2acd00eSAlex Elder  *
2025392a9dadSAlex Elder  * We must get the reference before checking for the overlap to
2026392a9dadSAlex Elder  * coordinate properly with zeroing the parent overlap in
2027392a9dadSAlex Elder  * rbd_dev_v2_parent_info() when an image gets flattened.  We
2028392a9dadSAlex Elder  * drop it again if there is no overlap.
2029392a9dadSAlex Elder  *
2030a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2031a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2032a2acd00eSAlex Elder  * false otherwise.
2033a2acd00eSAlex Elder  */
2034a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2035a2acd00eSAlex Elder {
2036a2acd00eSAlex Elder 	int counter;
2037a2acd00eSAlex Elder 
2038a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2039a2acd00eSAlex Elder 		return false;
2040a2acd00eSAlex Elder 
2041a2acd00eSAlex Elder 	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2042a2acd00eSAlex Elder 	if (counter > 0 && rbd_dev->parent_overlap)
2043a2acd00eSAlex Elder 		return true;
2044a2acd00eSAlex Elder 
2045a2acd00eSAlex Elder 	/* Image was flattened, but parent is not yet torn down */
2046a2acd00eSAlex Elder 
2047a2acd00eSAlex Elder 	if (counter < 0)
2048a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference overflow\n");
2049a2acd00eSAlex Elder 
2050a2acd00eSAlex Elder 	return false;
2051a2acd00eSAlex Elder }
2052a2acd00eSAlex Elder 
2053bf0d5f50SAlex Elder /*
2054bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2055bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2056bf0d5f50SAlex Elder  * (if there is one).
2057bf0d5f50SAlex Elder  */
2058cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2059cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2060bf0d5f50SAlex Elder 					u64 offset, u64 length,
2061e93f3152SAlex Elder 					bool write_request)
2062bf0d5f50SAlex Elder {
2063bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2064bf0d5f50SAlex Elder 
20651c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
2066bf0d5f50SAlex Elder 	if (!img_request)
2067bf0d5f50SAlex Elder 		return NULL;
2068bf0d5f50SAlex Elder 
2069bf0d5f50SAlex Elder 	if (write_request) {
2070bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
2071812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
2072bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
2073bf0d5f50SAlex Elder 	}
2074bf0d5f50SAlex Elder 
2075bf0d5f50SAlex Elder 	img_request->rq = NULL;
2076bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2077bf0d5f50SAlex Elder 	img_request->offset = offset;
2078bf0d5f50SAlex Elder 	img_request->length = length;
20790c425248SAlex Elder 	img_request->flags = 0;
20800c425248SAlex Elder 	if (write_request) {
20810c425248SAlex Elder 		img_request_write_set(img_request);
2082468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
20830c425248SAlex Elder 	} else {
2084bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
20850c425248SAlex Elder 	}
2086a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2087d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2088bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2089bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2090bf0d5f50SAlex Elder 	img_request->callback = NULL;
2091a5a337d4SAlex Elder 	img_request->result = 0;
2092bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2093bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2094bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2095bf0d5f50SAlex Elder 
209637206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
209737206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
209837206ee5SAlex Elder 		img_request);
209937206ee5SAlex Elder 
2100bf0d5f50SAlex Elder 	return img_request;
2101bf0d5f50SAlex Elder }
2102bf0d5f50SAlex Elder 
2103bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2104bf0d5f50SAlex Elder {
2105bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2106bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2107bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2108bf0d5f50SAlex Elder 
2109bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2110bf0d5f50SAlex Elder 
211137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
211237206ee5SAlex Elder 
2113bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2114bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
211525dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2116bf0d5f50SAlex Elder 
2117a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2118a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2119a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2120a2acd00eSAlex Elder 	}
2121a2acd00eSAlex Elder 
21220c425248SAlex Elder 	if (img_request_write_test(img_request))
2123812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2124bf0d5f50SAlex Elder 
21251c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2126bf0d5f50SAlex Elder }
2127bf0d5f50SAlex Elder 
2128e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2129e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2130e93f3152SAlex Elder 					u64 img_offset, u64 length)
2131e93f3152SAlex Elder {
2132e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2133e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2134e93f3152SAlex Elder 
2135e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2136e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2137e93f3152SAlex Elder 
2138e93f3152SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
2139e93f3152SAlex Elder 						img_offset, length, false);
2140e93f3152SAlex Elder 	if (!parent_request)
2141e93f3152SAlex Elder 		return NULL;
2142e93f3152SAlex Elder 
2143e93f3152SAlex Elder 	img_request_child_set(parent_request);
2144e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2145e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2146e93f3152SAlex Elder 
2147e93f3152SAlex Elder 	return parent_request;
2148e93f3152SAlex Elder }
2149e93f3152SAlex Elder 
2150e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2151e93f3152SAlex Elder {
2152e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2153e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2154e93f3152SAlex Elder 
2155e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2156e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2157e93f3152SAlex Elder 
2158e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2159e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2160e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2161e93f3152SAlex Elder 
2162e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2163e93f3152SAlex Elder }
2164e93f3152SAlex Elder 
21651217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
21661217857fSAlex Elder {
21676365d33aSAlex Elder 	struct rbd_img_request *img_request;
21681217857fSAlex Elder 	unsigned int xferred;
21691217857fSAlex Elder 	int result;
21708b3e1a56SAlex Elder 	bool more;
21711217857fSAlex Elder 
21726365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21736365d33aSAlex Elder 	img_request = obj_request->img_request;
21746365d33aSAlex Elder 
21751217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
21761217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
21771217857fSAlex Elder 	result = obj_request->result;
21781217857fSAlex Elder 	if (result) {
21791217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
21801217857fSAlex Elder 
21811217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
21821217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
21831217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
21841217857fSAlex Elder 			obj_request->offset);
21851217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
21861217857fSAlex Elder 			result, xferred);
21871217857fSAlex Elder 		if (!img_request->result)
21881217857fSAlex Elder 			img_request->result = result;
21891217857fSAlex Elder 	}
21901217857fSAlex Elder 
2191f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2192f1a4739fSAlex Elder 
2193f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2194f1a4739fSAlex Elder 		obj_request->pages = NULL;
2195f1a4739fSAlex Elder 		obj_request->page_count = 0;
2196f1a4739fSAlex Elder 	}
2197f1a4739fSAlex Elder 
21988b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
21998b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
22008b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
22018b3e1a56SAlex Elder 	} else {
22028b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
22038b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
22048b3e1a56SAlex Elder 	}
22058b3e1a56SAlex Elder 
22068b3e1a56SAlex Elder 	return more;
22071217857fSAlex Elder }
22081217857fSAlex Elder 
22092169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
22102169238dSAlex Elder {
22112169238dSAlex Elder 	struct rbd_img_request *img_request;
22122169238dSAlex Elder 	u32 which = obj_request->which;
22132169238dSAlex Elder 	bool more = true;
22142169238dSAlex Elder 
22156365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22162169238dSAlex Elder 	img_request = obj_request->img_request;
22172169238dSAlex Elder 
22182169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
22192169238dSAlex Elder 	rbd_assert(img_request != NULL);
22202169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
22212169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
22222169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
22232169238dSAlex Elder 
22242169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
22252169238dSAlex Elder 	if (which != img_request->next_completion)
22262169238dSAlex Elder 		goto out;
22272169238dSAlex Elder 
22282169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
22292169238dSAlex Elder 		rbd_assert(more);
22302169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
22312169238dSAlex Elder 
22322169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
22332169238dSAlex Elder 			break;
22341217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
22352169238dSAlex Elder 		which++;
22362169238dSAlex Elder 	}
22372169238dSAlex Elder 
22382169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
22392169238dSAlex Elder 	img_request->next_completion = which;
22402169238dSAlex Elder out:
22412169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
22420f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
22432169238dSAlex Elder 
22442169238dSAlex Elder 	if (!more)
22452169238dSAlex Elder 		rbd_img_request_complete(img_request);
22462169238dSAlex Elder }
22472169238dSAlex Elder 
2248f1a4739fSAlex Elder /*
2249f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2250f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2251f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2252f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2253f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2254f1a4739fSAlex Elder  * all data described by the image request.
2255f1a4739fSAlex Elder  */
2256f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2257f1a4739fSAlex Elder 					enum obj_request_type type,
2258f1a4739fSAlex Elder 					void *data_desc)
2259bf0d5f50SAlex Elder {
2260bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2261bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2262bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
22630c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2264a158073cSJingoo Han 	struct bio *bio_list = NULL;
2265f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2266a158073cSJingoo Han 	struct page **pages = NULL;
22677da22d29SAlex Elder 	u64 img_offset;
2268bf0d5f50SAlex Elder 	u64 resid;
2269bf0d5f50SAlex Elder 	u16 opcode;
2270bf0d5f50SAlex Elder 
2271f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2272f1a4739fSAlex Elder 		(int)type, data_desc);
227337206ee5SAlex Elder 
2274430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
22757da22d29SAlex Elder 	img_offset = img_request->offset;
2276bf0d5f50SAlex Elder 	resid = img_request->length;
22774dda41d3SAlex Elder 	rbd_assert(resid > 0);
2278f1a4739fSAlex Elder 
2279f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2280f1a4739fSAlex Elder 		bio_list = data_desc;
22814f024f37SKent Overstreet 		rbd_assert(img_offset ==
22824f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2283f1a4739fSAlex Elder 	} else {
2284f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2285f1a4739fSAlex Elder 		pages = data_desc;
2286f1a4739fSAlex Elder 	}
2287f1a4739fSAlex Elder 
2288bf0d5f50SAlex Elder 	while (resid) {
22892fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2290bf0d5f50SAlex Elder 		const char *object_name;
2291bf0d5f50SAlex Elder 		u64 offset;
2292bf0d5f50SAlex Elder 		u64 length;
22930ccd5926SIlya Dryomov 		unsigned int which = 0;
2294bf0d5f50SAlex Elder 
22957da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2296bf0d5f50SAlex Elder 		if (!object_name)
2297bf0d5f50SAlex Elder 			goto out_unwind;
22987da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
22997da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2300bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2301f1a4739fSAlex Elder 						offset, length, type);
230278c2a44aSAlex Elder 		/* object request has its own copy of the object name */
230378c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2304bf0d5f50SAlex Elder 		if (!obj_request)
2305bf0d5f50SAlex Elder 			goto out_unwind;
230662054da6SIlya Dryomov 
230703507db6SJosh Durgin 		/*
230803507db6SJosh Durgin 		 * set obj_request->img_request before creating the
230903507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
231003507db6SJosh Durgin 		 */
231103507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2312bf0d5f50SAlex Elder 
2313f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2314f1a4739fSAlex Elder 			unsigned int clone_size;
2315f1a4739fSAlex Elder 
2316bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2317bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2318f1a4739fSAlex Elder 			obj_request->bio_list =
2319f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2320f1a4739fSAlex Elder 								&bio_offset,
2321f1a4739fSAlex Elder 								clone_size,
2322bf0d5f50SAlex Elder 								GFP_ATOMIC);
2323bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
232462054da6SIlya Dryomov 				goto out_unwind;
2325f1a4739fSAlex Elder 		} else {
2326f1a4739fSAlex Elder 			unsigned int page_count;
2327f1a4739fSAlex Elder 
2328f1a4739fSAlex Elder 			obj_request->pages = pages;
2329f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2330f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2331f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2332f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2333f1a4739fSAlex Elder 			pages += page_count;
2334f1a4739fSAlex Elder 		}
2335bf0d5f50SAlex Elder 
23360ccd5926SIlya Dryomov 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
23370ccd5926SIlya Dryomov 					     (write_request ? 2 : 1),
23382fa12320SAlex Elder 					     obj_request);
23392fa12320SAlex Elder 		if (!osd_req)
234062054da6SIlya Dryomov 			goto out_unwind;
23412fa12320SAlex Elder 		obj_request->osd_req = osd_req;
23422169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
23430f2d5be7SAlex Elder 		rbd_img_request_get(img_request);
2344430c28c3SAlex Elder 
23450ccd5926SIlya Dryomov 		if (write_request) {
23460ccd5926SIlya Dryomov 			osd_req_op_alloc_hint_init(osd_req, which,
23470ccd5926SIlya Dryomov 					     rbd_obj_bytes(&rbd_dev->header),
23480ccd5926SIlya Dryomov 					     rbd_obj_bytes(&rbd_dev->header));
23490ccd5926SIlya Dryomov 			which++;
23500ccd5926SIlya Dryomov 		}
23510ccd5926SIlya Dryomov 
23520ccd5926SIlya Dryomov 		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
23532fa12320SAlex Elder 				       0, 0);
2354f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
23550ccd5926SIlya Dryomov 			osd_req_op_extent_osd_data_bio(osd_req, which,
2356f1a4739fSAlex Elder 					obj_request->bio_list, length);
2357f1a4739fSAlex Elder 		else
23580ccd5926SIlya Dryomov 			osd_req_op_extent_osd_data_pages(osd_req, which,
2359f1a4739fSAlex Elder 					obj_request->pages, length,
2360f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
23619d4df01fSAlex Elder 
23629d4df01fSAlex Elder 		if (write_request)
23639d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
23649d4df01fSAlex Elder 		else
23659d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2366430c28c3SAlex Elder 
23677da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2368bf0d5f50SAlex Elder 
23697da22d29SAlex Elder 		img_offset += length;
2370bf0d5f50SAlex Elder 		resid -= length;
2371bf0d5f50SAlex Elder 	}
2372bf0d5f50SAlex Elder 
2373bf0d5f50SAlex Elder 	return 0;
2374bf0d5f50SAlex Elder 
2375bf0d5f50SAlex Elder out_unwind:
2376bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
237742dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2378bf0d5f50SAlex Elder 
2379bf0d5f50SAlex Elder 	return -ENOMEM;
2380bf0d5f50SAlex Elder }
2381bf0d5f50SAlex Elder 
23823d7efd18SAlex Elder static void
23830eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
23840eefd470SAlex Elder {
23850eefd470SAlex Elder 	struct rbd_img_request *img_request;
23860eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2387ebda6408SAlex Elder 	struct page **pages;
23880eefd470SAlex Elder 	u32 page_count;
23890eefd470SAlex Elder 
23900eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
23910eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23920eefd470SAlex Elder 	img_request = obj_request->img_request;
23930eefd470SAlex Elder 	rbd_assert(img_request);
23940eefd470SAlex Elder 
23950eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
23960eefd470SAlex Elder 	rbd_assert(rbd_dev);
23970eefd470SAlex Elder 
2398ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2399ebda6408SAlex Elder 	rbd_assert(pages != NULL);
24000eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2401ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2402ebda6408SAlex Elder 	rbd_assert(page_count);
2403ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2404ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
24050eefd470SAlex Elder 
24060eefd470SAlex Elder 	/*
24070eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
24080eefd470SAlex Elder 	 * original write request.  There is no such thing as a
24090eefd470SAlex Elder 	 * successful short write, so if the request was successful
24100eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
24110eefd470SAlex Elder 	 */
24120eefd470SAlex Elder 	if (!obj_request->result)
24130eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
24140eefd470SAlex Elder 
24150eefd470SAlex Elder 	/* Finish up with the normal image object callback */
24160eefd470SAlex Elder 
24170eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
24180eefd470SAlex Elder }
24190eefd470SAlex Elder 
24200eefd470SAlex Elder static void
24213d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
24223d7efd18SAlex Elder {
24233d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
24240eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
24250eefd470SAlex Elder 	struct ceph_osd_client *osdc;
24260eefd470SAlex Elder 	struct rbd_device *rbd_dev;
24273d7efd18SAlex Elder 	struct page **pages;
2428ebda6408SAlex Elder 	u32 page_count;
2429bbea1c1aSAlex Elder 	int img_result;
2430ebda6408SAlex Elder 	u64 parent_length;
2431b91f09f1SAlex Elder 	u64 offset;
2432b91f09f1SAlex Elder 	u64 length;
24333d7efd18SAlex Elder 
24343d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
24353d7efd18SAlex Elder 
24363d7efd18SAlex Elder 	/* First get what we need from the image request */
24373d7efd18SAlex Elder 
24383d7efd18SAlex Elder 	pages = img_request->copyup_pages;
24393d7efd18SAlex Elder 	rbd_assert(pages != NULL);
24403d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2441ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2442ebda6408SAlex Elder 	rbd_assert(page_count);
2443ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
24443d7efd18SAlex Elder 
24453d7efd18SAlex Elder 	orig_request = img_request->obj_request;
24463d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2447b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2448bbea1c1aSAlex Elder 	img_result = img_request->result;
2449ebda6408SAlex Elder 	parent_length = img_request->length;
2450ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
24513d7efd18SAlex Elder 	rbd_img_request_put(img_request);
24523d7efd18SAlex Elder 
245391c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
245491c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
24553d7efd18SAlex Elder 	rbd_assert(rbd_dev);
24563d7efd18SAlex Elder 
2457bbea1c1aSAlex Elder 	/*
2458bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2459bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2460bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2461bbea1c1aSAlex Elder 	 */
2462bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2463bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2464bbea1c1aSAlex Elder 
2465bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2466bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2467bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2468bbea1c1aSAlex Elder 		if (!img_result)
2469bbea1c1aSAlex Elder 			return;
2470bbea1c1aSAlex Elder 	}
2471bbea1c1aSAlex Elder 
2472bbea1c1aSAlex Elder 	if (img_result)
24730eefd470SAlex Elder 		goto out_err;
24743d7efd18SAlex Elder 
24758785b1d4SAlex Elder 	/*
24768785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
24770ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
24788785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
24798785b1d4SAlex Elder 	 * original request, and release the old one.
24808785b1d4SAlex Elder 	 */
2481bbea1c1aSAlex Elder 	img_result = -ENOMEM;
24820eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
24830eefd470SAlex Elder 	if (!osd_req)
24840eefd470SAlex Elder 		goto out_err;
24858785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
24860eefd470SAlex Elder 	orig_request->osd_req = osd_req;
24870eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2488ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
24893d7efd18SAlex Elder 
24900eefd470SAlex Elder 	/* Initialize the copyup op */
24910eefd470SAlex Elder 
24920eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2493ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
24940eefd470SAlex Elder 						false, false);
24950eefd470SAlex Elder 
24960ccd5926SIlya Dryomov 	/* Then the hint op */
24970ccd5926SIlya Dryomov 
24980ccd5926SIlya Dryomov 	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
24990ccd5926SIlya Dryomov 				   rbd_obj_bytes(&rbd_dev->header));
25000ccd5926SIlya Dryomov 
25010ccd5926SIlya Dryomov 	/* And the original write request op */
25020eefd470SAlex Elder 
2503b91f09f1SAlex Elder 	offset = orig_request->offset;
2504b91f09f1SAlex Elder 	length = orig_request->length;
25050ccd5926SIlya Dryomov 	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2506b91f09f1SAlex Elder 					offset, length, 0, 0);
2507b91f09f1SAlex Elder 	if (orig_request->type == OBJ_REQUEST_BIO)
25080ccd5926SIlya Dryomov 		osd_req_op_extent_osd_data_bio(osd_req, 2,
2509b91f09f1SAlex Elder 					orig_request->bio_list, length);
2510b91f09f1SAlex Elder 	else
25110ccd5926SIlya Dryomov 		osd_req_op_extent_osd_data_pages(osd_req, 2,
2512b91f09f1SAlex Elder 					orig_request->pages, length,
2513b91f09f1SAlex Elder 					offset & ~PAGE_MASK, false, false);
25140eefd470SAlex Elder 
25150eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
25160eefd470SAlex Elder 
25170eefd470SAlex Elder 	/* All set, send it off. */
25180eefd470SAlex Elder 
25190eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
25200eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2521bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2522bbea1c1aSAlex Elder 	if (!img_result)
25230eefd470SAlex Elder 		return;
25240eefd470SAlex Elder out_err:
25250eefd470SAlex Elder 	/* Record the error code and complete the request */
25260eefd470SAlex Elder 
2527bbea1c1aSAlex Elder 	orig_request->result = img_result;
25280eefd470SAlex Elder 	orig_request->xferred = 0;
25293d7efd18SAlex Elder 	obj_request_done_set(orig_request);
25303d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
25313d7efd18SAlex Elder }
25323d7efd18SAlex Elder 
25333d7efd18SAlex Elder /*
25343d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
25353d7efd18SAlex Elder  * entire target of the given object request.  This is used for
25363d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
25373d7efd18SAlex Elder  * object request from the image request does not exist.
25383d7efd18SAlex Elder  *
25393d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
25403d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
25413d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
25423d7efd18SAlex Elder  * the original object request for the copyup operation.
25433d7efd18SAlex Elder  *
25443d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
25453d7efd18SAlex Elder  * object request and mark it done so it gets completed.
25463d7efd18SAlex Elder  */
25473d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
25483d7efd18SAlex Elder {
25493d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
25503d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
25513d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
25523d7efd18SAlex Elder 	u64 img_offset;
25533d7efd18SAlex Elder 	u64 length;
25543d7efd18SAlex Elder 	struct page **pages = NULL;
25553d7efd18SAlex Elder 	u32 page_count;
25563d7efd18SAlex Elder 	int result;
25573d7efd18SAlex Elder 
25583d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2559b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
25603d7efd18SAlex Elder 
25613d7efd18SAlex Elder 	img_request = obj_request->img_request;
25623d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
25633d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
25643d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
25653d7efd18SAlex Elder 
25663d7efd18SAlex Elder 	/*
25673d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
25683d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
25693d7efd18SAlex Elder 	 */
25703d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
25713d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
25723d7efd18SAlex Elder 
25733d7efd18SAlex Elder 	/*
2574a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2575a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2576a9e8ba2cSAlex Elder 	 * necessary.
2577a9e8ba2cSAlex Elder 	 */
2578a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2579a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2580a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2581a9e8ba2cSAlex Elder 	}
2582a9e8ba2cSAlex Elder 
2583a9e8ba2cSAlex Elder 	/*
25843d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
25853d7efd18SAlex Elder 	 * from the parent.
25863d7efd18SAlex Elder 	 */
25873d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
25883d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
25893d7efd18SAlex Elder 	if (IS_ERR(pages)) {
25903d7efd18SAlex Elder 		result = PTR_ERR(pages);
25913d7efd18SAlex Elder 		pages = NULL;
25923d7efd18SAlex Elder 		goto out_err;
25933d7efd18SAlex Elder 	}
25943d7efd18SAlex Elder 
25953d7efd18SAlex Elder 	result = -ENOMEM;
2596e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2597e93f3152SAlex Elder 						img_offset, length);
25983d7efd18SAlex Elder 	if (!parent_request)
25993d7efd18SAlex Elder 		goto out_err;
26003d7efd18SAlex Elder 
26013d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
26023d7efd18SAlex Elder 	if (result)
26033d7efd18SAlex Elder 		goto out_err;
26043d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2605ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
26063d7efd18SAlex Elder 
26073d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
26083d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
26093d7efd18SAlex Elder 	if (!result)
26103d7efd18SAlex Elder 		return 0;
26113d7efd18SAlex Elder 
26123d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2613ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
26143d7efd18SAlex Elder 	parent_request->obj_request = NULL;
26153d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
26163d7efd18SAlex Elder out_err:
26173d7efd18SAlex Elder 	if (pages)
26183d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
26193d7efd18SAlex Elder 	if (parent_request)
26203d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
26213d7efd18SAlex Elder 	obj_request->result = result;
26223d7efd18SAlex Elder 	obj_request->xferred = 0;
26233d7efd18SAlex Elder 	obj_request_done_set(obj_request);
26243d7efd18SAlex Elder 
26253d7efd18SAlex Elder 	return result;
26263d7efd18SAlex Elder }
26273d7efd18SAlex Elder 
2628c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2629c5b5ef6cSAlex Elder {
2630c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2631638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2632c5b5ef6cSAlex Elder 	int result;
2633c5b5ef6cSAlex Elder 
2634c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2635c5b5ef6cSAlex Elder 
2636c5b5ef6cSAlex Elder 	/*
2637c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2638c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2639c5b5ef6cSAlex Elder 	 * we're done with the request.
2640c5b5ef6cSAlex Elder 	 */
2641c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2642c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2643912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2644c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2645c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2646c5b5ef6cSAlex Elder 
2647c5b5ef6cSAlex Elder 	result = obj_request->result;
2648c5b5ef6cSAlex Elder 	obj_request->result = 0;
2649c5b5ef6cSAlex Elder 
2650c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2651c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2652c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2653c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2654c5b5ef6cSAlex Elder 
2655638f5abeSAlex Elder 	/*
2656638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2657638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2658638f5abeSAlex Elder 	 * and re-submit the original write request.
2659638f5abeSAlex Elder 	 */
2660638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2661638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2662638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2663638f5abeSAlex Elder 
2664638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2665638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2666638f5abeSAlex Elder 		if (!result)
2667638f5abeSAlex Elder 			return;
2668638f5abeSAlex Elder 	}
2669c5b5ef6cSAlex Elder 
2670c5b5ef6cSAlex Elder 	/*
2671c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2672c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2673c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2674c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2675c5b5ef6cSAlex Elder 	 */
2676c5b5ef6cSAlex Elder 	if (!result) {
2677c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2678c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2679c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2680c5b5ef6cSAlex Elder 	} else if (result) {
2681c5b5ef6cSAlex Elder 		orig_request->result = result;
26823d7efd18SAlex Elder 		goto out;
2683c5b5ef6cSAlex Elder 	}
2684c5b5ef6cSAlex Elder 
2685c5b5ef6cSAlex Elder 	/*
2686c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2687c5b5ef6cSAlex Elder 	 * whether the target object exists.
2688c5b5ef6cSAlex Elder 	 */
2689b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
26903d7efd18SAlex Elder out:
2691c5b5ef6cSAlex Elder 	if (orig_request->result)
2692c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2693c5b5ef6cSAlex Elder }
2694c5b5ef6cSAlex Elder 
2695c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2696c5b5ef6cSAlex Elder {
2697c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2698c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2699c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2700c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2701c5b5ef6cSAlex Elder 	u32 page_count;
2702c5b5ef6cSAlex Elder 	size_t size;
2703c5b5ef6cSAlex Elder 	int ret;
2704c5b5ef6cSAlex Elder 
2705c5b5ef6cSAlex Elder 	/*
2706c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2707c5b5ef6cSAlex Elder 	 *     le64 length;
2708c5b5ef6cSAlex Elder 	 *     struct {
2709c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2710c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2711c5b5ef6cSAlex Elder 	 *     } mtime;
2712c5b5ef6cSAlex Elder 	 */
2713c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2714c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2715c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2716c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2717c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2718c5b5ef6cSAlex Elder 
2719c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2720c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2721c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2722c5b5ef6cSAlex Elder 	if (!stat_request)
2723c5b5ef6cSAlex Elder 		goto out;
2724c5b5ef6cSAlex Elder 
2725c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2726c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2727c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2728c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2729c5b5ef6cSAlex Elder 
2730c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2731c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2732deb236b3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2733c5b5ef6cSAlex Elder 						   stat_request);
2734c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2735c5b5ef6cSAlex Elder 		goto out;
2736c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2737c5b5ef6cSAlex Elder 
2738c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2739c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2740c5b5ef6cSAlex Elder 					false, false);
27419d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2742c5b5ef6cSAlex Elder 
2743c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2744c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2745c5b5ef6cSAlex Elder out:
2746c5b5ef6cSAlex Elder 	if (ret)
2747c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2748c5b5ef6cSAlex Elder 
2749c5b5ef6cSAlex Elder 	return ret;
2750c5b5ef6cSAlex Elder }
2751c5b5ef6cSAlex Elder 
2752b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2753b454e36dSAlex Elder {
2754b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2755a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
27563d7efd18SAlex Elder 	bool known;
2757b454e36dSAlex Elder 
2758b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2759b454e36dSAlex Elder 
2760b454e36dSAlex Elder 	img_request = obj_request->img_request;
2761b454e36dSAlex Elder 	rbd_assert(img_request);
2762a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2763b454e36dSAlex Elder 
2764b454e36dSAlex Elder 	/*
2765a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2766a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2767a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2768a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2769a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2770a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2771a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2772a9e8ba2cSAlex Elder 	 * simple object request.
2773b454e36dSAlex Elder 	 */
2774b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2775b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
27769638556aSIlya Dryomov 		!obj_request_overlaps_parent(obj_request) ||
27773d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
27783d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2779b454e36dSAlex Elder 
2780b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2781b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2782b454e36dSAlex Elder 
2783b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2784b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2785b454e36dSAlex Elder 
2786b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2787b454e36dSAlex Elder 	}
2788b454e36dSAlex Elder 
2789b454e36dSAlex Elder 	/*
27903d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
27913d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
27923d7efd18SAlex Elder 	 * start by reading the data for the full target object from
27933d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2794b454e36dSAlex Elder 	 */
27953d7efd18SAlex Elder 	if (known)
27963d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
27973d7efd18SAlex Elder 
27983d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2799b454e36dSAlex Elder 
2800b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2801b454e36dSAlex Elder }
2802b454e36dSAlex Elder 
2803bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2804bf0d5f50SAlex Elder {
2805bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
280646faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2807bf0d5f50SAlex Elder 
280837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
280946faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2810bf0d5f50SAlex Elder 		int ret;
2811bf0d5f50SAlex Elder 
2812b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2813bf0d5f50SAlex Elder 		if (ret)
2814bf0d5f50SAlex Elder 			return ret;
2815bf0d5f50SAlex Elder 	}
2816bf0d5f50SAlex Elder 
2817bf0d5f50SAlex Elder 	return 0;
2818bf0d5f50SAlex Elder }
2819bf0d5f50SAlex Elder 
28208b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
28218b3e1a56SAlex Elder {
28228b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2823a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2824a9e8ba2cSAlex Elder 	u64 obj_end;
282502c74fbaSAlex Elder 	u64 img_xferred;
282602c74fbaSAlex Elder 	int img_result;
28278b3e1a56SAlex Elder 
28288b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
28298b3e1a56SAlex Elder 
283002c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
283102c74fbaSAlex Elder 
28328b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
283302c74fbaSAlex Elder 	img_xferred = img_request->xferred;
283402c74fbaSAlex Elder 	img_result = img_request->result;
283502c74fbaSAlex Elder 	rbd_img_request_put(img_request);
283602c74fbaSAlex Elder 
283702c74fbaSAlex Elder 	/*
283802c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
283902c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
284002c74fbaSAlex Elder 	 * original request.
284102c74fbaSAlex Elder 	 */
2842a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2843a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
284402c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
284502c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
284602c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
28478b3e1a56SAlex Elder 
284802c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
284902c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
285002c74fbaSAlex Elder 		if (!img_result)
285102c74fbaSAlex Elder 			return;
285202c74fbaSAlex Elder 	}
285302c74fbaSAlex Elder 
285402c74fbaSAlex Elder 	obj_request->result = img_result;
2855a9e8ba2cSAlex Elder 	if (obj_request->result)
2856a9e8ba2cSAlex Elder 		goto out;
2857a9e8ba2cSAlex Elder 
2858a9e8ba2cSAlex Elder 	/*
2859a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2860a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2861a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2862a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2863a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2864a9e8ba2cSAlex Elder 	 */
2865a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2866a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2867a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2868a9e8ba2cSAlex Elder 		u64 xferred = 0;
2869a9e8ba2cSAlex Elder 
2870a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2871a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2872a9e8ba2cSAlex Elder 					obj_request->img_offset;
2873a9e8ba2cSAlex Elder 
287402c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2875a9e8ba2cSAlex Elder 	} else {
287602c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2877a9e8ba2cSAlex Elder 	}
2878a9e8ba2cSAlex Elder out:
28798b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
28808b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
28818b3e1a56SAlex Elder }
28828b3e1a56SAlex Elder 
28838b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
28848b3e1a56SAlex Elder {
28858b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
28868b3e1a56SAlex Elder 	int result;
28878b3e1a56SAlex Elder 
28888b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
28898b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
28908b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
28915b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
28928b3e1a56SAlex Elder 
28938b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2894e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
28958b3e1a56SAlex Elder 						obj_request->img_offset,
2896e93f3152SAlex Elder 						obj_request->length);
28978b3e1a56SAlex Elder 	result = -ENOMEM;
28988b3e1a56SAlex Elder 	if (!img_request)
28998b3e1a56SAlex Elder 		goto out_err;
29008b3e1a56SAlex Elder 
29015b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
2902f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2903f1a4739fSAlex Elder 						obj_request->bio_list);
29045b2ab72dSAlex Elder 	else
29055b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
29065b2ab72dSAlex Elder 						obj_request->pages);
29078b3e1a56SAlex Elder 	if (result)
29088b3e1a56SAlex Elder 		goto out_err;
29098b3e1a56SAlex Elder 
29108b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
29118b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
29128b3e1a56SAlex Elder 	if (result)
29138b3e1a56SAlex Elder 		goto out_err;
29148b3e1a56SAlex Elder 
29158b3e1a56SAlex Elder 	return;
29168b3e1a56SAlex Elder out_err:
29178b3e1a56SAlex Elder 	if (img_request)
29188b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
29198b3e1a56SAlex Elder 	obj_request->result = result;
29208b3e1a56SAlex Elder 	obj_request->xferred = 0;
29218b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
29228b3e1a56SAlex Elder }
29238b3e1a56SAlex Elder 
292420e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2925b8d70035SAlex Elder {
2926b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
29272169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2928b8d70035SAlex Elder 	int ret;
2929b8d70035SAlex Elder 
2930b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2931b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2932b8d70035SAlex Elder 	if (!obj_request)
2933b8d70035SAlex Elder 		return -ENOMEM;
2934b8d70035SAlex Elder 
2935b8d70035SAlex Elder 	ret = -ENOMEM;
2936deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2937deb236b3SIlya Dryomov 						  obj_request);
2938b8d70035SAlex Elder 	if (!obj_request->osd_req)
2939b8d70035SAlex Elder 		goto out;
2940b8d70035SAlex Elder 
2941c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2942cc4a38bdSAlex Elder 					notify_id, 0, 0);
29439d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2944430c28c3SAlex Elder 
2945b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2946cf81b60eSAlex Elder 	if (ret)
294720e0af67SJosh Durgin 		goto out;
294820e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
294920e0af67SJosh Durgin out:
2950b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
2951b8d70035SAlex Elder 
2952b8d70035SAlex Elder 	return ret;
2953b8d70035SAlex Elder }
2954b8d70035SAlex Elder 
2955b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2956b8d70035SAlex Elder {
2957b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2958e627db08SAlex Elder 	int ret;
2959b8d70035SAlex Elder 
2960b8d70035SAlex Elder 	if (!rbd_dev)
2961b8d70035SAlex Elder 		return;
2962b8d70035SAlex Elder 
296337206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2964b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2965b8d70035SAlex Elder 		(unsigned int)opcode);
2966e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2967e627db08SAlex Elder 	if (ret)
29683b5cf2a2SAlex Elder 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2969b8d70035SAlex Elder 
297020e0af67SJosh Durgin 	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2971b8d70035SAlex Elder }
2972b8d70035SAlex Elder 
29739969ebc5SAlex Elder /*
2974bb040aa0SIlya Dryomov  * Send a (un)watch request and wait for the ack.  Return a request
2975bb040aa0SIlya Dryomov  * with a ref held on success or error.
2976bb040aa0SIlya Dryomov  */
2977bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper(
2978bb040aa0SIlya Dryomov 						struct rbd_device *rbd_dev,
2979bb040aa0SIlya Dryomov 						bool watch)
2980bb040aa0SIlya Dryomov {
2981bb040aa0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2982bb040aa0SIlya Dryomov 	struct rbd_obj_request *obj_request;
2983bb040aa0SIlya Dryomov 	int ret;
2984bb040aa0SIlya Dryomov 
2985bb040aa0SIlya Dryomov 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2986bb040aa0SIlya Dryomov 					     OBJ_REQUEST_NODATA);
2987bb040aa0SIlya Dryomov 	if (!obj_request)
2988bb040aa0SIlya Dryomov 		return ERR_PTR(-ENOMEM);
2989bb040aa0SIlya Dryomov 
2990bb040aa0SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
2991bb040aa0SIlya Dryomov 						  obj_request);
2992bb040aa0SIlya Dryomov 	if (!obj_request->osd_req) {
2993bb040aa0SIlya Dryomov 		ret = -ENOMEM;
2994bb040aa0SIlya Dryomov 		goto out;
2995bb040aa0SIlya Dryomov 	}
2996bb040aa0SIlya Dryomov 
2997bb040aa0SIlya Dryomov 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2998bb040aa0SIlya Dryomov 			      rbd_dev->watch_event->cookie, 0, watch);
2999bb040aa0SIlya Dryomov 	rbd_osd_req_format_write(obj_request);
3000bb040aa0SIlya Dryomov 
3001bb040aa0SIlya Dryomov 	if (watch)
3002bb040aa0SIlya Dryomov 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
3003bb040aa0SIlya Dryomov 
3004bb040aa0SIlya Dryomov 	ret = rbd_obj_request_submit(osdc, obj_request);
3005bb040aa0SIlya Dryomov 	if (ret)
3006bb040aa0SIlya Dryomov 		goto out;
3007bb040aa0SIlya Dryomov 
3008bb040aa0SIlya Dryomov 	ret = rbd_obj_request_wait(obj_request);
3009bb040aa0SIlya Dryomov 	if (ret)
3010bb040aa0SIlya Dryomov 		goto out;
3011bb040aa0SIlya Dryomov 
3012bb040aa0SIlya Dryomov 	ret = obj_request->result;
3013bb040aa0SIlya Dryomov 	if (ret) {
3014bb040aa0SIlya Dryomov 		if (watch)
3015bb040aa0SIlya Dryomov 			rbd_obj_request_end(obj_request);
3016bb040aa0SIlya Dryomov 		goto out;
3017bb040aa0SIlya Dryomov 	}
3018bb040aa0SIlya Dryomov 
3019bb040aa0SIlya Dryomov 	return obj_request;
3020bb040aa0SIlya Dryomov 
3021bb040aa0SIlya Dryomov out:
3022bb040aa0SIlya Dryomov 	rbd_obj_request_put(obj_request);
3023bb040aa0SIlya Dryomov 	return ERR_PTR(ret);
3024bb040aa0SIlya Dryomov }
3025bb040aa0SIlya Dryomov 
3026bb040aa0SIlya Dryomov /*
3027b30a01f2SIlya Dryomov  * Initiate a watch request, synchronously.
30289969ebc5SAlex Elder  */
3029b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
30309969ebc5SAlex Elder {
30319969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
30329969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
30339969ebc5SAlex Elder 	int ret;
30349969ebc5SAlex Elder 
3035b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_event);
3036b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_request);
30379969ebc5SAlex Elder 
30383c663bbdSAlex Elder 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
30399969ebc5SAlex Elder 				     &rbd_dev->watch_event);
30409969ebc5SAlex Elder 	if (ret < 0)
30419969ebc5SAlex Elder 		return ret;
30429969ebc5SAlex Elder 
304376756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
304476756a51SIlya Dryomov 	if (IS_ERR(obj_request)) {
304576756a51SIlya Dryomov 		ceph_osdc_cancel_event(rbd_dev->watch_event);
304676756a51SIlya Dryomov 		rbd_dev->watch_event = NULL;
304776756a51SIlya Dryomov 		return PTR_ERR(obj_request);
3048b30a01f2SIlya Dryomov 	}
30499969ebc5SAlex Elder 
30508eb87565SAlex Elder 	/*
30518eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
30528eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
30538eb87565SAlex Elder 	 * a pointer to the object request during that time (in
305476756a51SIlya Dryomov 	 * rbd_dev->watch_request), so we'll keep a reference to it.
305576756a51SIlya Dryomov 	 * We'll drop that reference after we've unregistered it in
305676756a51SIlya Dryomov 	 * rbd_dev_header_unwatch_sync().
30578eb87565SAlex Elder 	 */
30588eb87565SAlex Elder 	rbd_dev->watch_request = obj_request;
30598eb87565SAlex Elder 
30608eb87565SAlex Elder 	return 0;
30619969ebc5SAlex Elder }
30629969ebc5SAlex Elder 
3063b30a01f2SIlya Dryomov /*
3064b30a01f2SIlya Dryomov  * Tear down a watch request, synchronously.
3065b30a01f2SIlya Dryomov  */
306676756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3067fca27065SIlya Dryomov {
3068b30a01f2SIlya Dryomov 	struct rbd_obj_request *obj_request;
3069b30a01f2SIlya Dryomov 
3070b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_event);
3071b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_request);
3072b30a01f2SIlya Dryomov 
307376756a51SIlya Dryomov 	rbd_obj_request_end(rbd_dev->watch_request);
3074b30a01f2SIlya Dryomov 	rbd_obj_request_put(rbd_dev->watch_request);
3075b30a01f2SIlya Dryomov 	rbd_dev->watch_request = NULL;
3076b30a01f2SIlya Dryomov 
307776756a51SIlya Dryomov 	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
307876756a51SIlya Dryomov 	if (!IS_ERR(obj_request))
3079b30a01f2SIlya Dryomov 		rbd_obj_request_put(obj_request);
308076756a51SIlya Dryomov 	else
308176756a51SIlya Dryomov 		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
308276756a51SIlya Dryomov 			 PTR_ERR(obj_request));
308376756a51SIlya Dryomov 
3084b30a01f2SIlya Dryomov 	ceph_osdc_cancel_event(rbd_dev->watch_event);
3085b30a01f2SIlya Dryomov 	rbd_dev->watch_event = NULL;
3086fca27065SIlya Dryomov }
3087fca27065SIlya Dryomov 
308836be9a76SAlex Elder /*
3089f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3090f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
309136be9a76SAlex Elder  */
309236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
309336be9a76SAlex Elder 			     const char *object_name,
309436be9a76SAlex Elder 			     const char *class_name,
309536be9a76SAlex Elder 			     const char *method_name,
30964157976bSAlex Elder 			     const void *outbound,
309736be9a76SAlex Elder 			     size_t outbound_size,
30984157976bSAlex Elder 			     void *inbound,
3099e2a58ee5SAlex Elder 			     size_t inbound_size)
310036be9a76SAlex Elder {
31012169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
310236be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
310336be9a76SAlex Elder 	struct page **pages;
310436be9a76SAlex Elder 	u32 page_count;
310536be9a76SAlex Elder 	int ret;
310636be9a76SAlex Elder 
310736be9a76SAlex Elder 	/*
31086010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
31096010a451SAlex Elder 	 * should be placed into the inbound buffer provided.  They
31106010a451SAlex Elder 	 * also supply outbound data--parameters for the object
31116010a451SAlex Elder 	 * method.  Currently if this is present it will be a
31126010a451SAlex Elder 	 * snapshot id.
311336be9a76SAlex Elder 	 */
311436be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
311536be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
311636be9a76SAlex Elder 	if (IS_ERR(pages))
311736be9a76SAlex Elder 		return PTR_ERR(pages);
311836be9a76SAlex Elder 
311936be9a76SAlex Elder 	ret = -ENOMEM;
31206010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
312136be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
312236be9a76SAlex Elder 	if (!obj_request)
312336be9a76SAlex Elder 		goto out;
312436be9a76SAlex Elder 
312536be9a76SAlex Elder 	obj_request->pages = pages;
312636be9a76SAlex Elder 	obj_request->page_count = page_count;
312736be9a76SAlex Elder 
3128deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3129deb236b3SIlya Dryomov 						  obj_request);
313036be9a76SAlex Elder 	if (!obj_request->osd_req)
313136be9a76SAlex Elder 		goto out;
313236be9a76SAlex Elder 
3133c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
313404017e29SAlex Elder 					class_name, method_name);
313504017e29SAlex Elder 	if (outbound_size) {
313604017e29SAlex Elder 		struct ceph_pagelist *pagelist;
313704017e29SAlex Elder 
313804017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
313904017e29SAlex Elder 		if (!pagelist)
314004017e29SAlex Elder 			goto out;
314104017e29SAlex Elder 
314204017e29SAlex Elder 		ceph_pagelist_init(pagelist);
314304017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
314404017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
314504017e29SAlex Elder 						pagelist);
314604017e29SAlex Elder 	}
3147a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3148a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
314944cd188dSAlex Elder 					0, false, false);
31509d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3151430c28c3SAlex Elder 
315236be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
315336be9a76SAlex Elder 	if (ret)
315436be9a76SAlex Elder 		goto out;
315536be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
315636be9a76SAlex Elder 	if (ret)
315736be9a76SAlex Elder 		goto out;
315836be9a76SAlex Elder 
315936be9a76SAlex Elder 	ret = obj_request->result;
316036be9a76SAlex Elder 	if (ret < 0)
316136be9a76SAlex Elder 		goto out;
316257385b51SAlex Elder 
316357385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
316457385b51SAlex Elder 	ret = (int)obj_request->xferred;
3165903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
316636be9a76SAlex Elder out:
316736be9a76SAlex Elder 	if (obj_request)
316836be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
316936be9a76SAlex Elder 	else
317036be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
317136be9a76SAlex Elder 
317236be9a76SAlex Elder 	return ret;
317336be9a76SAlex Elder }
317436be9a76SAlex Elder 
3175bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
3176cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
3177bf0d5f50SAlex Elder {
3178bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
3179bf0d5f50SAlex Elder 	struct request *rq;
3180bf0d5f50SAlex Elder 	int result;
3181bf0d5f50SAlex Elder 
3182bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
3183bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
3184bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
3185bf0d5f50SAlex Elder 		u64 offset;
3186bf0d5f50SAlex Elder 		u64 length;
3187bf0d5f50SAlex Elder 
3188bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
3189bf0d5f50SAlex Elder 
3190bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
31914dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
31924dda41d3SAlex Elder 				(int) rq->cmd_type);
31934dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
31944dda41d3SAlex Elder 			continue;
31954dda41d3SAlex Elder 		}
31964dda41d3SAlex Elder 
31974dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
31984dda41d3SAlex Elder 
31994dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
32004dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
32014dda41d3SAlex Elder 
32024dda41d3SAlex Elder 		if (!length) {
32034dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
3204bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
3205bf0d5f50SAlex Elder 			continue;
3206bf0d5f50SAlex Elder 		}
3207bf0d5f50SAlex Elder 
3208bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
3209bf0d5f50SAlex Elder 
3210bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
3211bf0d5f50SAlex Elder 
3212bf0d5f50SAlex Elder 		if (write_request) {
3213bf0d5f50SAlex Elder 			result = -EROFS;
3214131fd9f6SGuangliang Zhao 			if (rbd_dev->mapping.read_only)
3215bf0d5f50SAlex Elder 				goto end_request;
3216bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3217bf0d5f50SAlex Elder 		}
3218bf0d5f50SAlex Elder 
32196d292906SAlex Elder 		/*
32206d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
32216d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
32226d292906SAlex Elder 		 * have disappeared by the time our request arrives
32236d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
32246d292906SAlex Elder 		 * we already know.
32256d292906SAlex Elder 		 */
32266d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3227bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
3228bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3229bf0d5f50SAlex Elder 			result = -ENXIO;
3230bf0d5f50SAlex Elder 			goto end_request;
3231bf0d5f50SAlex Elder 		}
3232bf0d5f50SAlex Elder 
3233bf0d5f50SAlex Elder 		result = -EINVAL;
3234c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
3235c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3236c0cd10dbSAlex Elder 				offset, length);
3237bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
3238c0cd10dbSAlex Elder 		}
3239bf0d5f50SAlex Elder 
324000a653e2SAlex Elder 		result = -EIO;
324100a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
324200a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
324300a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
324400a653e2SAlex Elder 			goto end_request;
324500a653e2SAlex Elder 		}
324600a653e2SAlex Elder 
3247bf0d5f50SAlex Elder 		result = -ENOMEM;
3248bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
3249e93f3152SAlex Elder 							write_request);
3250bf0d5f50SAlex Elder 		if (!img_request)
3251bf0d5f50SAlex Elder 			goto end_request;
3252bf0d5f50SAlex Elder 
3253bf0d5f50SAlex Elder 		img_request->rq = rq;
3254bf0d5f50SAlex Elder 
3255f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3256f1a4739fSAlex Elder 						rq->bio);
3257bf0d5f50SAlex Elder 		if (!result)
3258bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
3259bf0d5f50SAlex Elder 		if (result)
3260bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
3261bf0d5f50SAlex Elder end_request:
3262bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
3263bf0d5f50SAlex Elder 		if (result < 0) {
32647da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
32657da22d29SAlex Elder 				write_request ? "write" : "read",
32667da22d29SAlex Elder 				length, offset, result);
32677da22d29SAlex Elder 
3268bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
3269bf0d5f50SAlex Elder 		}
3270bf0d5f50SAlex Elder 	}
3271bf0d5f50SAlex Elder }
3272bf0d5f50SAlex Elder 
3273602adf40SYehuda Sadeh /*
3274602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3275602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3276f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3277602adf40SYehuda Sadeh  */
3278602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3279602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3280602adf40SYehuda Sadeh {
3281602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3282e5cfeed2SAlex Elder 	sector_t sector_offset;
3283e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3284e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3285e5cfeed2SAlex Elder 	int ret;
3286602adf40SYehuda Sadeh 
3287e5cfeed2SAlex Elder 	/*
3288e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3289e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3290e5cfeed2SAlex Elder 	 * device.
3291e5cfeed2SAlex Elder 	 */
3292e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3293e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3294e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3295593a9e7bSAlex Elder 
3296e5cfeed2SAlex Elder 	/*
3297e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3298e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3299e5cfeed2SAlex Elder 	 */
3300e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3301e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3302e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3303e5cfeed2SAlex Elder 	else
3304e5cfeed2SAlex Elder 		ret = 0;
3305e5cfeed2SAlex Elder 
3306e5cfeed2SAlex Elder 	/*
3307e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3308e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3309e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3310e5cfeed2SAlex Elder 	 * added to an empty bio."
3311e5cfeed2SAlex Elder 	 */
3312e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3313e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3314e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3315e5cfeed2SAlex Elder 
3316e5cfeed2SAlex Elder 	return ret;
3317602adf40SYehuda Sadeh }
3318602adf40SYehuda Sadeh 
3319602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3320602adf40SYehuda Sadeh {
3321602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3322602adf40SYehuda Sadeh 
3323602adf40SYehuda Sadeh 	if (!disk)
3324602adf40SYehuda Sadeh 		return;
3325602adf40SYehuda Sadeh 
3326a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3327a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3328602adf40SYehuda Sadeh 		del_gendisk(disk);
3329602adf40SYehuda Sadeh 		if (disk->queue)
3330602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
3331a0cab924SAlex Elder 	}
3332602adf40SYehuda Sadeh 	put_disk(disk);
3333602adf40SYehuda Sadeh }
3334602adf40SYehuda Sadeh 
3335788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3336788e2df3SAlex Elder 				const char *object_name,
33377097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3338788e2df3SAlex Elder 
3339788e2df3SAlex Elder {
33402169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3341788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3342788e2df3SAlex Elder 	struct page **pages = NULL;
3343788e2df3SAlex Elder 	u32 page_count;
33441ceae7efSAlex Elder 	size_t size;
3345788e2df3SAlex Elder 	int ret;
3346788e2df3SAlex Elder 
3347788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3348788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3349788e2df3SAlex Elder 	if (IS_ERR(pages))
3350788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3351788e2df3SAlex Elder 
3352788e2df3SAlex Elder 	ret = -ENOMEM;
3353788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3354788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3355788e2df3SAlex Elder 	if (!obj_request)
3356788e2df3SAlex Elder 		goto out;
3357788e2df3SAlex Elder 
3358788e2df3SAlex Elder 	obj_request->pages = pages;
3359788e2df3SAlex Elder 	obj_request->page_count = page_count;
3360788e2df3SAlex Elder 
3361deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3362deb236b3SIlya Dryomov 						  obj_request);
3363788e2df3SAlex Elder 	if (!obj_request->osd_req)
3364788e2df3SAlex Elder 		goto out;
3365788e2df3SAlex Elder 
3366c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3367c99d2d4aSAlex Elder 					offset, length, 0, 0);
3368406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3369a4ce40a9SAlex Elder 					obj_request->pages,
337044cd188dSAlex Elder 					obj_request->length,
337144cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
337244cd188dSAlex Elder 					false, false);
33739d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3374430c28c3SAlex Elder 
3375788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3376788e2df3SAlex Elder 	if (ret)
3377788e2df3SAlex Elder 		goto out;
3378788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3379788e2df3SAlex Elder 	if (ret)
3380788e2df3SAlex Elder 		goto out;
3381788e2df3SAlex Elder 
3382788e2df3SAlex Elder 	ret = obj_request->result;
3383788e2df3SAlex Elder 	if (ret < 0)
3384788e2df3SAlex Elder 		goto out;
33851ceae7efSAlex Elder 
33861ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
33871ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3388903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
338923ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
339023ed6e13SAlex Elder 	ret = (int)size;
3391788e2df3SAlex Elder out:
3392788e2df3SAlex Elder 	if (obj_request)
3393788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3394788e2df3SAlex Elder 	else
3395788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3396788e2df3SAlex Elder 
3397788e2df3SAlex Elder 	return ret;
3398788e2df3SAlex Elder }
3399788e2df3SAlex Elder 
3400602adf40SYehuda Sadeh /*
3401662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3402662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3403662518b1SAlex Elder  * information about the image.
34044156d998SAlex Elder  */
340599a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
34064156d998SAlex Elder {
34074156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
34084156d998SAlex Elder 	u32 snap_count = 0;
34094156d998SAlex Elder 	u64 names_size = 0;
34104156d998SAlex Elder 	u32 want_count;
34114156d998SAlex Elder 	int ret;
34124156d998SAlex Elder 
34134156d998SAlex Elder 	/*
34144156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
34154156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
34164156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
34174156d998SAlex Elder 	 * the number of snapshots could change by the time we read
34184156d998SAlex Elder 	 * it in, in which case we re-read it.
34194156d998SAlex Elder 	 */
34204156d998SAlex Elder 	do {
34214156d998SAlex Elder 		size_t size;
34224156d998SAlex Elder 
34234156d998SAlex Elder 		kfree(ondisk);
34244156d998SAlex Elder 
34254156d998SAlex Elder 		size = sizeof (*ondisk);
34264156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
34274156d998SAlex Elder 		size += names_size;
34284156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
34294156d998SAlex Elder 		if (!ondisk)
3430662518b1SAlex Elder 			return -ENOMEM;
34314156d998SAlex Elder 
3432788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
34337097f8dfSAlex Elder 				       0, size, ondisk);
34344156d998SAlex Elder 		if (ret < 0)
3435662518b1SAlex Elder 			goto out;
3436c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
34374156d998SAlex Elder 			ret = -ENXIO;
343806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
343906ecc6cbSAlex Elder 				size, ret);
3440662518b1SAlex Elder 			goto out;
34414156d998SAlex Elder 		}
34424156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
34434156d998SAlex Elder 			ret = -ENXIO;
344406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3445662518b1SAlex Elder 			goto out;
34464156d998SAlex Elder 		}
34474156d998SAlex Elder 
34484156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
34494156d998SAlex Elder 		want_count = snap_count;
34504156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
34514156d998SAlex Elder 	} while (snap_count != want_count);
34524156d998SAlex Elder 
3453662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3454662518b1SAlex Elder out:
34554156d998SAlex Elder 	kfree(ondisk);
34564156d998SAlex Elder 
3457dfc5606dSYehuda Sadeh 	return ret;
3458602adf40SYehuda Sadeh }
3459602adf40SYehuda Sadeh 
346015228edeSAlex Elder /*
346115228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
346215228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
346315228edeSAlex Elder  */
346415228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
346515228edeSAlex Elder {
346615228edeSAlex Elder 	u64 snap_id;
346715228edeSAlex Elder 
346815228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
346915228edeSAlex Elder 		return;
347015228edeSAlex Elder 
347115228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
347215228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
347315228edeSAlex Elder 		return;
347415228edeSAlex Elder 
347515228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
347615228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
347715228edeSAlex Elder }
347815228edeSAlex Elder 
34799875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
34809875201eSJosh Durgin {
34819875201eSJosh Durgin 	sector_t size;
34829875201eSJosh Durgin 	bool removing;
34839875201eSJosh Durgin 
34849875201eSJosh Durgin 	/*
34859875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
34869875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
34879875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
34889875201eSJosh Durgin 	 */
34899875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
34909875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
34919875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
34929875201eSJosh Durgin 	/*
34939875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
34949875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
34959875201eSJosh Durgin 	 */
34969875201eSJosh Durgin 	if (!removing) {
34979875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
34989875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
34999875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
35009875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
35019875201eSJosh Durgin 	}
35029875201eSJosh Durgin }
35039875201eSJosh Durgin 
3504cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
35051fe5e993SAlex Elder {
3506e627db08SAlex Elder 	u64 mapping_size;
35071fe5e993SAlex Elder 	int ret;
35081fe5e993SAlex Elder 
3509117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3510cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
35113b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3512117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
351399a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
3514117973fbSAlex Elder 	else
35152df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
351615228edeSAlex Elder 
351715228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
351815228edeSAlex Elder 
351915228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
3520cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
3521cfbf6377SAlex Elder 
352200a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
35239875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
352400a653e2SAlex Elder 	}
35251fe5e993SAlex Elder 
35261fe5e993SAlex Elder 	return ret;
35271fe5e993SAlex Elder }
35281fe5e993SAlex Elder 
3529602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3530602adf40SYehuda Sadeh {
3531602adf40SYehuda Sadeh 	struct gendisk *disk;
3532602adf40SYehuda Sadeh 	struct request_queue *q;
3533593a9e7bSAlex Elder 	u64 segment_size;
3534602adf40SYehuda Sadeh 
3535602adf40SYehuda Sadeh 	/* create gendisk info */
35367e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
35377e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
35387e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3539602adf40SYehuda Sadeh 	if (!disk)
35401fcdb8aaSAlex Elder 		return -ENOMEM;
3541602adf40SYehuda Sadeh 
3542f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3543de71a297SAlex Elder 		 rbd_dev->dev_id);
3544602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3545dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
35467e513d43SIlya Dryomov 	if (single_major)
35477e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3548602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3549602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3550602adf40SYehuda Sadeh 
3551bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3552602adf40SYehuda Sadeh 	if (!q)
3553602adf40SYehuda Sadeh 		goto out_disk;
3554029bcbd8SJosh Durgin 
3555593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3556593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3557593a9e7bSAlex Elder 
3558029bcbd8SJosh Durgin 	/* set io sizes to object size */
3559593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3560593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3561593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3562593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3563593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3564029bcbd8SJosh Durgin 
3565602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3566602adf40SYehuda Sadeh 	disk->queue = q;
3567602adf40SYehuda Sadeh 
3568602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3569602adf40SYehuda Sadeh 
3570602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3571602adf40SYehuda Sadeh 
3572602adf40SYehuda Sadeh 	return 0;
3573602adf40SYehuda Sadeh out_disk:
3574602adf40SYehuda Sadeh 	put_disk(disk);
35751fcdb8aaSAlex Elder 
35761fcdb8aaSAlex Elder 	return -ENOMEM;
3577602adf40SYehuda Sadeh }
3578602adf40SYehuda Sadeh 
3579dfc5606dSYehuda Sadeh /*
3580dfc5606dSYehuda Sadeh   sysfs
3581dfc5606dSYehuda Sadeh */
3582602adf40SYehuda Sadeh 
/*
 * Map a struct device back to the rbd_device that embeds it as its
 * "dev" member.
 */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
3587593a9e7bSAlex Elder 
3588dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3589dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3590602adf40SYehuda Sadeh {
3591593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3592dfc5606dSYehuda Sadeh 
3593fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3594fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3595602adf40SYehuda Sadeh }
3596602adf40SYehuda Sadeh 
359734b13184SAlex Elder /*
359834b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
359934b13184SAlex Elder  * necessarily the base image.
360034b13184SAlex Elder  */
360134b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
360234b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
360334b13184SAlex Elder {
360434b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
360534b13184SAlex Elder 
360634b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
360734b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
360834b13184SAlex Elder }
360934b13184SAlex Elder 
3610dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3611dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3612602adf40SYehuda Sadeh {
3613593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3614dfc5606dSYehuda Sadeh 
3615fc71d833SAlex Elder 	if (rbd_dev->major)
3616dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3617fc71d833SAlex Elder 
3618fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3619dd82fff1SIlya Dryomov }
3620fc71d833SAlex Elder 
3621dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3622dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3623dd82fff1SIlya Dryomov {
3624dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3625dd82fff1SIlya Dryomov 
3626dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3627dfc5606dSYehuda Sadeh }
3628dfc5606dSYehuda Sadeh 
3629dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3630dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3631dfc5606dSYehuda Sadeh {
3632593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3633dfc5606dSYehuda Sadeh 
36341dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
36351dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3636dfc5606dSYehuda Sadeh }
3637dfc5606dSYehuda Sadeh 
3638dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3639dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3640dfc5606dSYehuda Sadeh {
3641593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3642dfc5606dSYehuda Sadeh 
36430d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3644dfc5606dSYehuda Sadeh }
3645dfc5606dSYehuda Sadeh 
36469bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
36479bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
36489bb2f334SAlex Elder {
36499bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
36509bb2f334SAlex Elder 
36510d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
36520d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
36539bb2f334SAlex Elder }
36549bb2f334SAlex Elder 
3655dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3656dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3657dfc5606dSYehuda Sadeh {
3658593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3659dfc5606dSYehuda Sadeh 
3660a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
36610d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3662a92ffdf8SAlex Elder 
3663a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3664dfc5606dSYehuda Sadeh }
3665dfc5606dSYehuda Sadeh 
3666589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3667589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3668589d30e0SAlex Elder {
3669589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3670589d30e0SAlex Elder 
36710d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3672589d30e0SAlex Elder }
3673589d30e0SAlex Elder 
367434b13184SAlex Elder /*
367534b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
367634b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
367734b13184SAlex Elder  */
3678dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3679dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3680dfc5606dSYehuda Sadeh 			     char *buf)
3681dfc5606dSYehuda Sadeh {
3682593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3683dfc5606dSYehuda Sadeh 
36840d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3685dfc5606dSYehuda Sadeh }
3686dfc5606dSYehuda Sadeh 
368786b00e0dSAlex Elder /*
368886b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
368986b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
369086b00e0dSAlex Elder  * "(no parent image)".
369186b00e0dSAlex Elder  */
369286b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
369386b00e0dSAlex Elder 			     struct device_attribute *attr,
369486b00e0dSAlex Elder 			     char *buf)
369586b00e0dSAlex Elder {
369686b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
369786b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
369886b00e0dSAlex Elder 	int count;
369986b00e0dSAlex Elder 	char *bufp = buf;
370086b00e0dSAlex Elder 
370186b00e0dSAlex Elder 	if (!spec)
370286b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
370386b00e0dSAlex Elder 
370486b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
370586b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
370686b00e0dSAlex Elder 	if (count < 0)
370786b00e0dSAlex Elder 		return count;
370886b00e0dSAlex Elder 	bufp += count;
370986b00e0dSAlex Elder 
371086b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
371186b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
371286b00e0dSAlex Elder 	if (count < 0)
371386b00e0dSAlex Elder 		return count;
371486b00e0dSAlex Elder 	bufp += count;
371586b00e0dSAlex Elder 
371686b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
371786b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
371886b00e0dSAlex Elder 	if (count < 0)
371986b00e0dSAlex Elder 		return count;
372086b00e0dSAlex Elder 	bufp += count;
372186b00e0dSAlex Elder 
372286b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
372386b00e0dSAlex Elder 	if (count < 0)
372486b00e0dSAlex Elder 		return count;
372586b00e0dSAlex Elder 	bufp += count;
372686b00e0dSAlex Elder 
372786b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
372886b00e0dSAlex Elder }
372986b00e0dSAlex Elder 
3730dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3731dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3732dfc5606dSYehuda Sadeh 				 const char *buf,
3733dfc5606dSYehuda Sadeh 				 size_t size)
3734dfc5606dSYehuda Sadeh {
3735593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3736b813623aSAlex Elder 	int ret;
3737602adf40SYehuda Sadeh 
3738cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3739e627db08SAlex Elder 	if (ret)
3740e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3741b813623aSAlex Elder 
3742b813623aSAlex Elder 	return ret < 0 ? ret : size;
3743dfc5606dSYehuda Sadeh }
3744602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  Every attribute is declared S_IRUGO
 * with only a show handler, except "refresh", which is S_IWUSR with
 * only a store handler.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3757dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the attributes declared with DEVICE_ATTR(). */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
3773dfc5606dSYehuda Sadeh 
3774dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3775dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3776dfc5606dSYehuda Sadeh };
3777dfc5606dSYehuda Sadeh 
3778dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3779dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3780dfc5606dSYehuda Sadeh 	NULL
3781dfc5606dSYehuda Sadeh };
3782dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  Intentionally empty:
 * nothing is freed from here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
3786dfc5606dSYehuda Sadeh 
3787dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3788dfc5606dSYehuda Sadeh 	.name		= "rbd",
3789dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3790dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3791dfc5606dSYehuda Sadeh };
3792dfc5606dSYehuda Sadeh 
37938b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
37948b8fb99cSAlex Elder {
37958b8fb99cSAlex Elder 	kref_get(&spec->kref);
37968b8fb99cSAlex Elder 
37978b8fb99cSAlex Elder 	return spec;
37988b8fb99cSAlex Elder }
37998b8fb99cSAlex Elder 
38008b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
38018b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
38028b8fb99cSAlex Elder {
38038b8fb99cSAlex Elder 	if (spec)
38048b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
38058b8fb99cSAlex Elder }
38068b8fb99cSAlex Elder 
38078b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
38088b8fb99cSAlex Elder {
38098b8fb99cSAlex Elder 	struct rbd_spec *spec;
38108b8fb99cSAlex Elder 
38118b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
38128b8fb99cSAlex Elder 	if (!spec)
38138b8fb99cSAlex Elder 		return NULL;
38148b8fb99cSAlex Elder 	kref_init(&spec->kref);
38158b8fb99cSAlex Elder 
38168b8fb99cSAlex Elder 	return spec;
38178b8fb99cSAlex Elder }
38188b8fb99cSAlex Elder 
38198b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
38208b8fb99cSAlex Elder {
38218b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
38228b8fb99cSAlex Elder 
38238b8fb99cSAlex Elder 	kfree(spec->pool_name);
38248b8fb99cSAlex Elder 	kfree(spec->image_id);
38258b8fb99cSAlex Elder 	kfree(spec->image_name);
38268b8fb99cSAlex Elder 	kfree(spec->snap_name);
38278b8fb99cSAlex Elder 	kfree(spec);
38288b8fb99cSAlex Elder }
38298b8fb99cSAlex Elder 
3830cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3831c53d5893SAlex Elder 				struct rbd_spec *spec)
3832c53d5893SAlex Elder {
3833c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3834c53d5893SAlex Elder 
3835c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3836c53d5893SAlex Elder 	if (!rbd_dev)
3837c53d5893SAlex Elder 		return NULL;
3838c53d5893SAlex Elder 
3839c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
38406d292906SAlex Elder 	rbd_dev->flags = 0;
3841a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
3842c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3843c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3844c53d5893SAlex Elder 
3845c53d5893SAlex Elder 	rbd_dev->spec = spec;
3846c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3847c53d5893SAlex Elder 
38480903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
38490903e875SAlex Elder 
38500903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
38510903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
38520903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
38530903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
38540903e875SAlex Elder 
3855c53d5893SAlex Elder 	return rbd_dev;
3856c53d5893SAlex Elder }
3857c53d5893SAlex Elder 
3858c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3859c53d5893SAlex Elder {
3860c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3861c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3862c53d5893SAlex Elder 	kfree(rbd_dev);
3863c53d5893SAlex Elder }
3864c53d5893SAlex Elder 
3865dfc5606dSYehuda Sadeh /*
38669d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
38679d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
38689d475de5SAlex Elder  * image.
38699d475de5SAlex Elder  */
38709d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
38719d475de5SAlex Elder 				u8 *order, u64 *snap_size)
38729d475de5SAlex Elder {
38739d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
38749d475de5SAlex Elder 	int ret;
38759d475de5SAlex Elder 	struct {
38769d475de5SAlex Elder 		u8 order;
38779d475de5SAlex Elder 		__le64 size;
38789d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
38799d475de5SAlex Elder 
388036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38819d475de5SAlex Elder 				"rbd", "get_size",
38824157976bSAlex Elder 				&snapid, sizeof (snapid),
3883e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
388436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
38859d475de5SAlex Elder 	if (ret < 0)
38869d475de5SAlex Elder 		return ret;
388757385b51SAlex Elder 	if (ret < sizeof (size_buf))
388857385b51SAlex Elder 		return -ERANGE;
38899d475de5SAlex Elder 
3890c3545579SJosh Durgin 	if (order) {
38919d475de5SAlex Elder 		*order = size_buf.order;
3892c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
3893c3545579SJosh Durgin 	}
38949d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
38959d475de5SAlex Elder 
3896c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
3897c3545579SJosh Durgin 		(unsigned long long)snap_id,
38989d475de5SAlex Elder 		(unsigned long long)*snap_size);
38999d475de5SAlex Elder 
39009d475de5SAlex Elder 	return 0;
39019d475de5SAlex Elder }
39029d475de5SAlex Elder 
39039d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
39049d475de5SAlex Elder {
39059d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
39069d475de5SAlex Elder 					&rbd_dev->header.obj_order,
39079d475de5SAlex Elder 					&rbd_dev->header.image_size);
39089d475de5SAlex Elder }
39099d475de5SAlex Elder 
39101e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
39111e130199SAlex Elder {
39121e130199SAlex Elder 	void *reply_buf;
39131e130199SAlex Elder 	int ret;
39141e130199SAlex Elder 	void *p;
39151e130199SAlex Elder 
39161e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
39171e130199SAlex Elder 	if (!reply_buf)
39181e130199SAlex Elder 		return -ENOMEM;
39191e130199SAlex Elder 
392036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
39214157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3922e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
392336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
39241e130199SAlex Elder 	if (ret < 0)
39251e130199SAlex Elder 		goto out;
39261e130199SAlex Elder 
39271e130199SAlex Elder 	p = reply_buf;
39281e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
392957385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
393057385b51SAlex Elder 	ret = 0;
39311e130199SAlex Elder 
39321e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
39331e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
39341e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
39351e130199SAlex Elder 	} else {
39361e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
39371e130199SAlex Elder 	}
39381e130199SAlex Elder out:
39391e130199SAlex Elder 	kfree(reply_buf);
39401e130199SAlex Elder 
39411e130199SAlex Elder 	return ret;
39421e130199SAlex Elder }
39431e130199SAlex Elder 
3944b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3945b1b5402aSAlex Elder 		u64 *snap_features)
3946b1b5402aSAlex Elder {
3947b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3948b1b5402aSAlex Elder 	struct {
3949b1b5402aSAlex Elder 		__le64 features;
3950b1b5402aSAlex Elder 		__le64 incompat;
39514157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3952d889140cSAlex Elder 	u64 incompat;
3953b1b5402aSAlex Elder 	int ret;
3954b1b5402aSAlex Elder 
395536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3956b1b5402aSAlex Elder 				"rbd", "get_features",
39574157976bSAlex Elder 				&snapid, sizeof (snapid),
3958e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
395936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3960b1b5402aSAlex Elder 	if (ret < 0)
3961b1b5402aSAlex Elder 		return ret;
396257385b51SAlex Elder 	if (ret < sizeof (features_buf))
396357385b51SAlex Elder 		return -ERANGE;
3964d889140cSAlex Elder 
3965d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
39665cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3967b8f5c6edSAlex Elder 		return -ENXIO;
3968d889140cSAlex Elder 
3969b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3970b1b5402aSAlex Elder 
3971b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3972b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3973b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3974b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3975b1b5402aSAlex Elder 
3976b1b5402aSAlex Elder 	return 0;
3977b1b5402aSAlex Elder }
3978b1b5402aSAlex Elder 
3979b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3980b1b5402aSAlex Elder {
3981b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3982b1b5402aSAlex Elder 						&rbd_dev->header.features);
3983b1b5402aSAlex Elder }
3984b1b5402aSAlex Elder 
398586b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
398686b00e0dSAlex Elder {
398786b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
398886b00e0dSAlex Elder 	size_t size;
398986b00e0dSAlex Elder 	void *reply_buf = NULL;
399086b00e0dSAlex Elder 	__le64 snapid;
399186b00e0dSAlex Elder 	void *p;
399286b00e0dSAlex Elder 	void *end;
3993642a2537SAlex Elder 	u64 pool_id;
399486b00e0dSAlex Elder 	char *image_id;
39953b5cf2a2SAlex Elder 	u64 snap_id;
399686b00e0dSAlex Elder 	u64 overlap;
399786b00e0dSAlex Elder 	int ret;
399886b00e0dSAlex Elder 
399986b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
400086b00e0dSAlex Elder 	if (!parent_spec)
400186b00e0dSAlex Elder 		return -ENOMEM;
400286b00e0dSAlex Elder 
400386b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
400486b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
400586b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
400686b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
400786b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
400886b00e0dSAlex Elder 	if (!reply_buf) {
400986b00e0dSAlex Elder 		ret = -ENOMEM;
401086b00e0dSAlex Elder 		goto out_err;
401186b00e0dSAlex Elder 	}
401286b00e0dSAlex Elder 
401386b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
401436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
401586b00e0dSAlex Elder 				"rbd", "get_parent",
40164157976bSAlex Elder 				&snapid, sizeof (snapid),
4017e2a58ee5SAlex Elder 				reply_buf, size);
401836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
401986b00e0dSAlex Elder 	if (ret < 0)
402086b00e0dSAlex Elder 		goto out_err;
402186b00e0dSAlex Elder 
402286b00e0dSAlex Elder 	p = reply_buf;
402357385b51SAlex Elder 	end = reply_buf + ret;
402457385b51SAlex Elder 	ret = -ERANGE;
4025642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4026392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4027392a9dadSAlex Elder 		/*
4028392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4029392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4030392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4031392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4032392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4033392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4034392a9dadSAlex Elder 		 * parent.
4035392a9dadSAlex Elder 		 */
4036392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4037392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4038392a9dadSAlex Elder 			smp_mb();
4039392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4040392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4041392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4042392a9dadSAlex Elder 		}
4043392a9dadSAlex Elder 
404486b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4045392a9dadSAlex Elder 	}
404686b00e0dSAlex Elder 
40470903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
40480903e875SAlex Elder 
40490903e875SAlex Elder 	ret = -EIO;
4050642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
4051c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
4052642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
405357385b51SAlex Elder 		goto out_err;
4054c0cd10dbSAlex Elder 	}
40550903e875SAlex Elder 
4056979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
405786b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
405886b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
405986b00e0dSAlex Elder 		goto out_err;
406086b00e0dSAlex Elder 	}
40613b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
406286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
406386b00e0dSAlex Elder 
40643b5cf2a2SAlex Elder 	/*
40653b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
40663b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
40673b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
40683b5cf2a2SAlex Elder 	 */
40693b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
40703b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
40713b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
40723b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
407386b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
407486b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
4075fbba11b3SIlya Dryomov 	} else {
4076fbba11b3SIlya Dryomov 		kfree(image_id);
40773b5cf2a2SAlex Elder 	}
40783b5cf2a2SAlex Elder 
40793b5cf2a2SAlex Elder 	/*
40803b5cf2a2SAlex Elder 	 * We always update the parent overlap.  If it's zero we
40813b5cf2a2SAlex Elder 	 * treat it specially.
40823b5cf2a2SAlex Elder 	 */
408370cf49cfSAlex Elder 	rbd_dev->parent_overlap = overlap;
40843b5cf2a2SAlex Elder 	smp_mb();
40853b5cf2a2SAlex Elder 	if (!overlap) {
40863b5cf2a2SAlex Elder 
40873b5cf2a2SAlex Elder 		/* A null parent_spec indicates it's the initial probe */
40883b5cf2a2SAlex Elder 
40893b5cf2a2SAlex Elder 		if (parent_spec) {
40903b5cf2a2SAlex Elder 			/*
40913b5cf2a2SAlex Elder 			 * The overlap has become zero, so the clone
40923b5cf2a2SAlex Elder 			 * must have been resized down to 0 at some
40933b5cf2a2SAlex Elder 			 * point.  Treat this the same as a flatten.
40943b5cf2a2SAlex Elder 			 */
40953b5cf2a2SAlex Elder 			rbd_dev_parent_put(rbd_dev);
40963b5cf2a2SAlex Elder 			pr_info("%s: clone image now standalone\n",
40973b5cf2a2SAlex Elder 				rbd_dev->disk->disk_name);
409870cf49cfSAlex Elder 		} else {
40993b5cf2a2SAlex Elder 			/*
41003b5cf2a2SAlex Elder 			 * For the initial probe, if we find the
41013b5cf2a2SAlex Elder 			 * overlap is zero we just pretend there was
41023b5cf2a2SAlex Elder 			 * no parent image.
41033b5cf2a2SAlex Elder 			 */
41043b5cf2a2SAlex Elder 			rbd_warn(rbd_dev, "ignoring parent of "
41053b5cf2a2SAlex Elder 						"clone with overlap 0\n");
41063b5cf2a2SAlex Elder 		}
410770cf49cfSAlex Elder 	}
410886b00e0dSAlex Elder out:
410986b00e0dSAlex Elder 	ret = 0;
411086b00e0dSAlex Elder out_err:
411186b00e0dSAlex Elder 	kfree(reply_buf);
411286b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
411386b00e0dSAlex Elder 
411486b00e0dSAlex Elder 	return ret;
411586b00e0dSAlex Elder }
411686b00e0dSAlex Elder 
4117cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4118cc070d59SAlex Elder {
4119cc070d59SAlex Elder 	struct {
4120cc070d59SAlex Elder 		__le64 stripe_unit;
4121cc070d59SAlex Elder 		__le64 stripe_count;
4122cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4123cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4124cc070d59SAlex Elder 	void *p;
4125cc070d59SAlex Elder 	u64 obj_size;
4126cc070d59SAlex Elder 	u64 stripe_unit;
4127cc070d59SAlex Elder 	u64 stripe_count;
4128cc070d59SAlex Elder 	int ret;
4129cc070d59SAlex Elder 
4130cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4131cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4132e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4133cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4134cc070d59SAlex Elder 	if (ret < 0)
4135cc070d59SAlex Elder 		return ret;
4136cc070d59SAlex Elder 	if (ret < size)
4137cc070d59SAlex Elder 		return -ERANGE;
4138cc070d59SAlex Elder 
4139cc070d59SAlex Elder 	/*
4140cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4141cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4142cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4143cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4144cc070d59SAlex Elder 	 */
4145cc070d59SAlex Elder 	ret = -EINVAL;
4146cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4147cc070d59SAlex Elder 	p = &striping_info_buf;
4148cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4149cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4150cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4151cc070d59SAlex Elder 				"(got %llu want %llu)",
4152cc070d59SAlex Elder 				stripe_unit, obj_size);
4153cc070d59SAlex Elder 		return -EINVAL;
4154cc070d59SAlex Elder 	}
4155cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4156cc070d59SAlex Elder 	if (stripe_count != 1) {
4157cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4158cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4159cc070d59SAlex Elder 		return -EINVAL;
4160cc070d59SAlex Elder 	}
4161500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4162500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4163cc070d59SAlex Elder 
4164cc070d59SAlex Elder 	return 0;
4165cc070d59SAlex Elder }
4166cc070d59SAlex Elder 
41679e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
41689e15b77dSAlex Elder {
41699e15b77dSAlex Elder 	size_t image_id_size;
41709e15b77dSAlex Elder 	char *image_id;
41719e15b77dSAlex Elder 	void *p;
41729e15b77dSAlex Elder 	void *end;
41739e15b77dSAlex Elder 	size_t size;
41749e15b77dSAlex Elder 	void *reply_buf = NULL;
41759e15b77dSAlex Elder 	size_t len = 0;
41769e15b77dSAlex Elder 	char *image_name = NULL;
41779e15b77dSAlex Elder 	int ret;
41789e15b77dSAlex Elder 
41799e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
41809e15b77dSAlex Elder 
418169e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
418269e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
41839e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
41849e15b77dSAlex Elder 	if (!image_id)
41859e15b77dSAlex Elder 		return NULL;
41869e15b77dSAlex Elder 
41879e15b77dSAlex Elder 	p = image_id;
41884157976bSAlex Elder 	end = image_id + image_id_size;
418969e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
41909e15b77dSAlex Elder 
41919e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
41929e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
41939e15b77dSAlex Elder 	if (!reply_buf)
41949e15b77dSAlex Elder 		goto out;
41959e15b77dSAlex Elder 
419636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
41979e15b77dSAlex Elder 				"rbd", "dir_get_name",
41989e15b77dSAlex Elder 				image_id, image_id_size,
4199e2a58ee5SAlex Elder 				reply_buf, size);
42009e15b77dSAlex Elder 	if (ret < 0)
42019e15b77dSAlex Elder 		goto out;
42029e15b77dSAlex Elder 	p = reply_buf;
4203f40eb349SAlex Elder 	end = reply_buf + ret;
4204f40eb349SAlex Elder 
42059e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
42069e15b77dSAlex Elder 	if (IS_ERR(image_name))
42079e15b77dSAlex Elder 		image_name = NULL;
42089e15b77dSAlex Elder 	else
42099e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
42109e15b77dSAlex Elder out:
42119e15b77dSAlex Elder 	kfree(reply_buf);
42129e15b77dSAlex Elder 	kfree(image_id);
42139e15b77dSAlex Elder 
42149e15b77dSAlex Elder 	return image_name;
42159e15b77dSAlex Elder }
42169e15b77dSAlex Elder 
42172ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
42182ad3d716SAlex Elder {
42192ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
42202ad3d716SAlex Elder 	const char *snap_name;
42212ad3d716SAlex Elder 	u32 which = 0;
42222ad3d716SAlex Elder 
42232ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
42242ad3d716SAlex Elder 
42252ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
42262ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
42272ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
42282ad3d716SAlex Elder 			return snapc->snaps[which];
42292ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
42302ad3d716SAlex Elder 		which++;
42312ad3d716SAlex Elder 	}
42322ad3d716SAlex Elder 	return CEPH_NOSNAP;
42332ad3d716SAlex Elder }
42342ad3d716SAlex Elder 
42352ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
42362ad3d716SAlex Elder {
42372ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
42382ad3d716SAlex Elder 	u32 which;
42392ad3d716SAlex Elder 	bool found = false;
42402ad3d716SAlex Elder 	u64 snap_id;
42412ad3d716SAlex Elder 
42422ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
42432ad3d716SAlex Elder 		const char *snap_name;
42442ad3d716SAlex Elder 
42452ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
42462ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4247efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4248efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4249efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4250efadc98aSJosh Durgin 				continue;
4251efadc98aSJosh Durgin 			else
42522ad3d716SAlex Elder 				break;
4253efadc98aSJosh Durgin 		}
42542ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
42552ad3d716SAlex Elder 		kfree(snap_name);
42562ad3d716SAlex Elder 	}
42572ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
42582ad3d716SAlex Elder }
42592ad3d716SAlex Elder 
42602ad3d716SAlex Elder /*
42612ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
42622ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
42632ad3d716SAlex Elder  */
42642ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
42652ad3d716SAlex Elder {
42662ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
42672ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
42682ad3d716SAlex Elder 
42692ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
42702ad3d716SAlex Elder }
42712ad3d716SAlex Elder 
42729e15b77dSAlex Elder /*
42732e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
42742e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
42752e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
42762e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
42772e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
42782e9f7f1cSAlex Elder  * allocated.
4279e1d4213fSAlex Elder  *
4280e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
4281e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
4282e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
42839e15b77dSAlex Elder  */
42842e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
42859e15b77dSAlex Elder {
42862e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
42872e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
42882e9f7f1cSAlex Elder 	const char *pool_name;
42892e9f7f1cSAlex Elder 	const char *image_name;
42902e9f7f1cSAlex Elder 	const char *snap_name;
42919e15b77dSAlex Elder 	int ret;
42929e15b77dSAlex Elder 
4293e1d4213fSAlex Elder 	/*
4294e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
4295e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
4296e1d4213fSAlex Elder 	 */
42972e9f7f1cSAlex Elder 	if (spec->pool_name) {
42982e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
42992ad3d716SAlex Elder 			u64 snap_id;
4300e1d4213fSAlex Elder 
43012ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
43022ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
4303e1d4213fSAlex Elder 				return -ENOENT;
43042ad3d716SAlex Elder 			spec->snap_id = snap_id;
4305e1d4213fSAlex Elder 		} else {
43062e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
4307e1d4213fSAlex Elder 		}
4308e1d4213fSAlex Elder 
4309e1d4213fSAlex Elder 		return 0;
4310e1d4213fSAlex Elder 	}
43119e15b77dSAlex Elder 
43122e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
43139e15b77dSAlex Elder 
43142e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
43152e9f7f1cSAlex Elder 	if (!pool_name) {
43162e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4317935dc89fSAlex Elder 		return -EIO;
4318935dc89fSAlex Elder 	}
43192e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
43202e9f7f1cSAlex Elder 	if (!pool_name)
43219e15b77dSAlex Elder 		return -ENOMEM;
43229e15b77dSAlex Elder 
43239e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
43249e15b77dSAlex Elder 
43252e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
43262e9f7f1cSAlex Elder 	if (!image_name)
432706ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
43289e15b77dSAlex Elder 
43292e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
43309e15b77dSAlex Elder 
43312e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4332da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4333da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
43349e15b77dSAlex Elder 		goto out_err;
43352e9f7f1cSAlex Elder 	}
43362e9f7f1cSAlex Elder 
43372e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
43382e9f7f1cSAlex Elder 	spec->image_name = image_name;
43392e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
43409e15b77dSAlex Elder 
43419e15b77dSAlex Elder 	return 0;
43429e15b77dSAlex Elder out_err:
43432e9f7f1cSAlex Elder 	kfree(image_name);
43442e9f7f1cSAlex Elder 	kfree(pool_name);
43459e15b77dSAlex Elder 
43469e15b77dSAlex Elder 	return ret;
43479e15b77dSAlex Elder }
43489e15b77dSAlex Elder 
4349cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
435035d489f9SAlex Elder {
435135d489f9SAlex Elder 	size_t size;
435235d489f9SAlex Elder 	int ret;
435335d489f9SAlex Elder 	void *reply_buf;
435435d489f9SAlex Elder 	void *p;
435535d489f9SAlex Elder 	void *end;
435635d489f9SAlex Elder 	u64 seq;
435735d489f9SAlex Elder 	u32 snap_count;
435835d489f9SAlex Elder 	struct ceph_snap_context *snapc;
435935d489f9SAlex Elder 	u32 i;
436035d489f9SAlex Elder 
436135d489f9SAlex Elder 	/*
436235d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
436335d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
436435d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
436535d489f9SAlex Elder 	 * prepared to receive.
436635d489f9SAlex Elder 	 */
436735d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
436835d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
436935d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
437035d489f9SAlex Elder 	if (!reply_buf)
437135d489f9SAlex Elder 		return -ENOMEM;
437235d489f9SAlex Elder 
437336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
43744157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4375e2a58ee5SAlex Elder 				reply_buf, size);
437636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
437735d489f9SAlex Elder 	if (ret < 0)
437835d489f9SAlex Elder 		goto out;
437935d489f9SAlex Elder 
438035d489f9SAlex Elder 	p = reply_buf;
438157385b51SAlex Elder 	end = reply_buf + ret;
438257385b51SAlex Elder 	ret = -ERANGE;
438335d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
438435d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
438535d489f9SAlex Elder 
438635d489f9SAlex Elder 	/*
438735d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
438835d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
438935d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
439035d489f9SAlex Elder 	 * allocate is representable in a size_t.
439135d489f9SAlex Elder 	 */
439235d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
439335d489f9SAlex Elder 				 / sizeof (u64)) {
439435d489f9SAlex Elder 		ret = -EINVAL;
439535d489f9SAlex Elder 		goto out;
439635d489f9SAlex Elder 	}
439735d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
439835d489f9SAlex Elder 		goto out;
4399468521c1SAlex Elder 	ret = 0;
440035d489f9SAlex Elder 
4401812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
440235d489f9SAlex Elder 	if (!snapc) {
440335d489f9SAlex Elder 		ret = -ENOMEM;
440435d489f9SAlex Elder 		goto out;
440535d489f9SAlex Elder 	}
440635d489f9SAlex Elder 	snapc->seq = seq;
440735d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
440835d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
440935d489f9SAlex Elder 
441049ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
441135d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
441235d489f9SAlex Elder 
441335d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
441435d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
441535d489f9SAlex Elder out:
441635d489f9SAlex Elder 	kfree(reply_buf);
441735d489f9SAlex Elder 
441857385b51SAlex Elder 	return ret;
441935d489f9SAlex Elder }
442035d489f9SAlex Elder 
442154cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
442254cac61fSAlex Elder 					u64 snap_id)
4423b8b1e2dbSAlex Elder {
4424b8b1e2dbSAlex Elder 	size_t size;
4425b8b1e2dbSAlex Elder 	void *reply_buf;
442654cac61fSAlex Elder 	__le64 snapid;
4427b8b1e2dbSAlex Elder 	int ret;
4428b8b1e2dbSAlex Elder 	void *p;
4429b8b1e2dbSAlex Elder 	void *end;
4430b8b1e2dbSAlex Elder 	char *snap_name;
4431b8b1e2dbSAlex Elder 
4432b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4433b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4434b8b1e2dbSAlex Elder 	if (!reply_buf)
4435b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4436b8b1e2dbSAlex Elder 
443754cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
443836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4439b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
444054cac61fSAlex Elder 				&snapid, sizeof (snapid),
4441e2a58ee5SAlex Elder 				reply_buf, size);
444236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4443f40eb349SAlex Elder 	if (ret < 0) {
4444f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4445b8b1e2dbSAlex Elder 		goto out;
4446f40eb349SAlex Elder 	}
4447b8b1e2dbSAlex Elder 
4448b8b1e2dbSAlex Elder 	p = reply_buf;
4449f40eb349SAlex Elder 	end = reply_buf + ret;
4450e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4451f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4452b8b1e2dbSAlex Elder 		goto out;
4453f40eb349SAlex Elder 
4454b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
445554cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4456b8b1e2dbSAlex Elder out:
4457b8b1e2dbSAlex Elder 	kfree(reply_buf);
4458b8b1e2dbSAlex Elder 
4459f40eb349SAlex Elder 	return snap_name;
4460b8b1e2dbSAlex Elder }
4461b8b1e2dbSAlex Elder 
44622df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4463117973fbSAlex Elder {
44642df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4465117973fbSAlex Elder 	int ret;
4466117973fbSAlex Elder 
44671617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
44681617e40cSJosh Durgin 	if (ret)
4469cfbf6377SAlex Elder 		return ret;
44701617e40cSJosh Durgin 
44712df3fac7SAlex Elder 	if (first_time) {
44722df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
44732df3fac7SAlex Elder 		if (ret)
4474cfbf6377SAlex Elder 			return ret;
44752df3fac7SAlex Elder 	}
44762df3fac7SAlex Elder 
4477642a2537SAlex Elder 	/*
4478642a2537SAlex Elder 	 * If the image supports layering, get the parent info.  We
4479642a2537SAlex Elder 	 * need to probe the first time regardless.  Thereafter we
4480642a2537SAlex Elder 	 * only need to if there's a parent, to see if it has
4481642a2537SAlex Elder 	 * disappeared due to the mapped image getting flattened.
4482642a2537SAlex Elder 	 */
4483642a2537SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4484642a2537SAlex Elder 			(first_time || rbd_dev->parent_spec)) {
4485642a2537SAlex Elder 		bool warn;
4486642a2537SAlex Elder 
4487642a2537SAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
4488642a2537SAlex Elder 		if (ret)
4489cfbf6377SAlex Elder 			return ret;
4490642a2537SAlex Elder 
4491642a2537SAlex Elder 		/*
4492642a2537SAlex Elder 		 * Print a warning if this is the initial probe and
4493642a2537SAlex Elder 		 * the image has a parent.  Don't print it if the
4494642a2537SAlex Elder 		 * image now being probed is itself a parent.  We
4495642a2537SAlex Elder 		 * can tell at this point because we won't know its
4496642a2537SAlex Elder 		 * pool name yet (just its pool id).
4497642a2537SAlex Elder 		 */
4498642a2537SAlex Elder 		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4499642a2537SAlex Elder 		if (first_time && warn)
4500642a2537SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
4501642a2537SAlex Elder 					"is EXPERIMENTAL!");
4502642a2537SAlex Elder 	}
4503642a2537SAlex Elder 
450429334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
450529334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
450629334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4507117973fbSAlex Elder 
4508cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4509117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4510117973fbSAlex Elder 
4511117973fbSAlex Elder 	return ret;
4512117973fbSAlex Elder }
4513117973fbSAlex Elder 
4514dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4515dfc5606dSYehuda Sadeh {
4516dfc5606dSYehuda Sadeh 	struct device *dev;
4517cd789ab9SAlex Elder 	int ret;
4518dfc5606dSYehuda Sadeh 
4519cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4520dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4521dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4522dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4523200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4524de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4525dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4526dfc5606dSYehuda Sadeh 
4527dfc5606dSYehuda Sadeh 	return ret;
4528602adf40SYehuda Sadeh }
4529602adf40SYehuda Sadeh 
4530dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4531dfc5606dSYehuda Sadeh {
4532dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4533dfc5606dSYehuda Sadeh }
4534dfc5606dSYehuda Sadeh 
45351ddbe94eSAlex Elder /*
4536499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4537f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
45381ddbe94eSAlex Elder  */
4539f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4540b7f23c36SAlex Elder {
4541f8a22fc2SIlya Dryomov 	int new_dev_id;
4542f8a22fc2SIlya Dryomov 
45439b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
45449b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
45459b60e70bSIlya Dryomov 				    GFP_KERNEL);
4546f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4547f8a22fc2SIlya Dryomov 		return new_dev_id;
4548f8a22fc2SIlya Dryomov 
4549f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4550499afd5bSAlex Elder 
4551499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4552499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4553499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4554f8a22fc2SIlya Dryomov 
455570eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4556f8a22fc2SIlya Dryomov 
4557f8a22fc2SIlya Dryomov 	return 0;
4558b7f23c36SAlex Elder }
4559b7f23c36SAlex Elder 
45601ddbe94eSAlex Elder /*
4561499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4562499afd5bSAlex Elder  * identifier is no longer in use.
45631ddbe94eSAlex Elder  */
4564e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
45651ddbe94eSAlex Elder {
4566499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4567499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4568499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
45691ddbe94eSAlex Elder 
4570f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4571f8a22fc2SIlya Dryomov 
4572f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4573b7f23c36SAlex Elder }
4574b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the length of the token found there,
 * i.e. the number of consecutive non-white space characters, which
 * is 0 when no token remains.  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *whitespace = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to token start */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
4593e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits (including
 * the terminating '\0') in the token_size bytes at token, copy it
 * there and '\0'-terminate it.  *buf must be '\0'-terminated on
 * entry.
 *
 * Returns the token length, not counting the '\0'.  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and nothing was copied.
 *
 * In every case *buf is advanced to just past the token, even when
 * the destination was too small to receive it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* Too big; leave token untouched */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
4623e28fff26SAlex Elder 
4624e28fff26SAlex Elder /*
4625ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4626ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4627ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4628ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4629ea3352f4SAlex Elder  *
4630ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4631ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4632ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4633ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4634ea3352f4SAlex Elder  *
4635ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4636ea3352f4SAlex Elder  * the end of the found token.
4637ea3352f4SAlex Elder  *
4638ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4639ea3352f4SAlex Elder  */
4640ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4641ea3352f4SAlex Elder {
4642ea3352f4SAlex Elder 	char *dup;
4643ea3352f4SAlex Elder 	size_t len;
4644ea3352f4SAlex Elder 
4645ea3352f4SAlex Elder 	len = next_token(buf);
46464caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4647ea3352f4SAlex Elder 	if (!dup)
4648ea3352f4SAlex Elder 		return NULL;
4649ea3352f4SAlex Elder 	*(dup + len) = '\0';
4650ea3352f4SAlex Elder 	*buf += len;
4651ea3352f4SAlex Elder 
4652ea3352f4SAlex Elder 	if (lenp)
4653ea3352f4SAlex Elder 		*lenp = len;
4654ea3352f4SAlex Elder 
4655ea3352f4SAlex Elder 	return dup;
4656ea3352f4SAlex Elder }
4657ea3352f4SAlex Elder 
4658ea3352f4SAlex Elder /*
4659859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4660859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4661859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4662859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4663d22f76e7SAlex Elder  *
4664859c31dfSAlex Elder  * The information extracted from these options is recorded in
4665859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4666859c31dfSAlex Elder  * structures:
4667859c31dfSAlex Elder  *  ceph_opts
4668859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4669859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4670859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4671859c31dfSAlex Elder  *  rbd_opts
4672859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4673859c31dfSAlex Elder  *	this function; caller must release with kfree().
4674859c31dfSAlex Elder  *  spec
4675859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4676859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4677859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4678859c31dfSAlex Elder  *
4679859c31dfSAlex Elder  * The options passed take this form:
4680859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4681859c31dfSAlex Elder  * where:
4682859c31dfSAlex Elder  *  <mon_addrs>
4683859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4684859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4685859c31dfSAlex Elder  *      by a port number (separated by a colon).
4686859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4687859c31dfSAlex Elder  *  <options>
4688859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4689859c31dfSAlex Elder  *  <pool_name>
4690859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4691859c31dfSAlex Elder  *  <image_name>
4692859c31dfSAlex Elder  *      The name of the image in that pool to map.
4693859c31dfSAlex Elder  *  <snap_id>
4694859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4695859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4696859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4697859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4698a725f65eSAlex Elder  */
4699859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4700dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4701859c31dfSAlex Elder 				struct rbd_options **opts,
4702859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4703a725f65eSAlex Elder {
4704e28fff26SAlex Elder 	size_t len;
4705859c31dfSAlex Elder 	char *options;
47060ddebc0cSAlex Elder 	const char *mon_addrs;
4707ecb4dc22SAlex Elder 	char *snap_name;
47080ddebc0cSAlex Elder 	size_t mon_addrs_size;
4709859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
47104e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4711859c31dfSAlex Elder 	struct ceph_options *copts;
4712dc79b113SAlex Elder 	int ret;
4713e28fff26SAlex Elder 
4714e28fff26SAlex Elder 	/* The first four tokens are required */
4715e28fff26SAlex Elder 
47167ef3214aSAlex Elder 	len = next_token(&buf);
47174fb5d671SAlex Elder 	if (!len) {
47184fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
47194fb5d671SAlex Elder 		return -EINVAL;
47204fb5d671SAlex Elder 	}
47210ddebc0cSAlex Elder 	mon_addrs = buf;
4722f28e565aSAlex Elder 	mon_addrs_size = len + 1;
47237ef3214aSAlex Elder 	buf += len;
4724a725f65eSAlex Elder 
4725dc79b113SAlex Elder 	ret = -EINVAL;
4726f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4727f28e565aSAlex Elder 	if (!options)
4728dc79b113SAlex Elder 		return -ENOMEM;
47294fb5d671SAlex Elder 	if (!*options) {
47304fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
47314fb5d671SAlex Elder 		goto out_err;
47324fb5d671SAlex Elder 	}
4733a725f65eSAlex Elder 
4734859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4735859c31dfSAlex Elder 	if (!spec)
4736f28e565aSAlex Elder 		goto out_mem;
4737859c31dfSAlex Elder 
4738859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4739859c31dfSAlex Elder 	if (!spec->pool_name)
4740859c31dfSAlex Elder 		goto out_mem;
47414fb5d671SAlex Elder 	if (!*spec->pool_name) {
47424fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
47434fb5d671SAlex Elder 		goto out_err;
47444fb5d671SAlex Elder 	}
4745e28fff26SAlex Elder 
474669e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4747859c31dfSAlex Elder 	if (!spec->image_name)
4748f28e565aSAlex Elder 		goto out_mem;
47494fb5d671SAlex Elder 	if (!*spec->image_name) {
47504fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
47514fb5d671SAlex Elder 		goto out_err;
47524fb5d671SAlex Elder 	}
4753e28fff26SAlex Elder 
4754f28e565aSAlex Elder 	/*
4755f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4756f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4757f28e565aSAlex Elder 	 */
47583feeb894SAlex Elder 	len = next_token(&buf);
4759820a5f3eSAlex Elder 	if (!len) {
47603feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
47613feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4762f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4763dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4764f28e565aSAlex Elder 		goto out_err;
4765849b4260SAlex Elder 	}
4766ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4767ecb4dc22SAlex Elder 	if (!snap_name)
4768f28e565aSAlex Elder 		goto out_mem;
4769ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4770ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4771e5c35534SAlex Elder 
47720ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4773e28fff26SAlex Elder 
47744e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
47754e9afebaSAlex Elder 	if (!rbd_opts)
47764e9afebaSAlex Elder 		goto out_mem;
47774e9afebaSAlex Elder 
47784e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4779d22f76e7SAlex Elder 
4780859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
47810ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
47824e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4783859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4784859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4785dc79b113SAlex Elder 		goto out_err;
4786dc79b113SAlex Elder 	}
4787859c31dfSAlex Elder 	kfree(options);
4788859c31dfSAlex Elder 
4789859c31dfSAlex Elder 	*ceph_opts = copts;
47904e9afebaSAlex Elder 	*opts = rbd_opts;
4791859c31dfSAlex Elder 	*rbd_spec = spec;
47920ddebc0cSAlex Elder 
4793dc79b113SAlex Elder 	return 0;
4794f28e565aSAlex Elder out_mem:
4795dc79b113SAlex Elder 	ret = -ENOMEM;
4796d22f76e7SAlex Elder out_err:
4797859c31dfSAlex Elder 	kfree(rbd_opts);
4798859c31dfSAlex Elder 	rbd_spec_put(spec);
4799f28e565aSAlex Elder 	kfree(options);
4800d22f76e7SAlex Elder 
4801dc79b113SAlex Elder 	return ret;
4802a725f65eSAlex Elder }
4803a725f65eSAlex Elder 
4804589d30e0SAlex Elder /*
480530ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
480630ba1f02SIlya Dryomov  */
480730ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
480830ba1f02SIlya Dryomov {
480930ba1f02SIlya Dryomov 	u64 newest_epoch;
481030ba1f02SIlya Dryomov 	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
481130ba1f02SIlya Dryomov 	int tries = 0;
481230ba1f02SIlya Dryomov 	int ret;
481330ba1f02SIlya Dryomov 
481430ba1f02SIlya Dryomov again:
481530ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
481630ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
481730ba1f02SIlya Dryomov 		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
481830ba1f02SIlya Dryomov 					       &newest_epoch);
481930ba1f02SIlya Dryomov 		if (ret < 0)
482030ba1f02SIlya Dryomov 			return ret;
482130ba1f02SIlya Dryomov 
482230ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
482330ba1f02SIlya Dryomov 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
482430ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
482530ba1f02SIlya Dryomov 						     newest_epoch, timeout);
482630ba1f02SIlya Dryomov 			goto again;
482730ba1f02SIlya Dryomov 		} else {
482830ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
482930ba1f02SIlya Dryomov 			return -ENOENT;
483030ba1f02SIlya Dryomov 		}
483130ba1f02SIlya Dryomov 	}
483230ba1f02SIlya Dryomov 
483330ba1f02SIlya Dryomov 	return ret;
483430ba1f02SIlya Dryomov }
483530ba1f02SIlya Dryomov 
483630ba1f02SIlya Dryomov /*
4837589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4838589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4839589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4840589d30e0SAlex Elder  *
4841589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4842589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4843589d30e0SAlex Elder  * with the supplied name.
4844589d30e0SAlex Elder  *
4845589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4846589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4847589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4848589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4849589d30e0SAlex Elder  */
4850589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4851589d30e0SAlex Elder {
4852589d30e0SAlex Elder 	int ret;
4853589d30e0SAlex Elder 	size_t size;
4854589d30e0SAlex Elder 	char *object_name;
4855589d30e0SAlex Elder 	void *response;
4856c0fba368SAlex Elder 	char *image_id;
48572f82ee54SAlex Elder 
4858589d30e0SAlex Elder 	/*
48592c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
48602c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4861c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4862c0fba368SAlex Elder 	 * do still need to set the image format though.
48632c0d0a10SAlex Elder 	 */
4864c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4865c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4866c0fba368SAlex Elder 
48672c0d0a10SAlex Elder 		return 0;
4868c0fba368SAlex Elder 	}
48692c0d0a10SAlex Elder 
48702c0d0a10SAlex Elder 	/*
4871589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4872589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4873589d30e0SAlex Elder 	 */
487469e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4875589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4876589d30e0SAlex Elder 	if (!object_name)
4877589d30e0SAlex Elder 		return -ENOMEM;
48780d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4879589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4880589d30e0SAlex Elder 
4881589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4882589d30e0SAlex Elder 
4883589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4884589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4885589d30e0SAlex Elder 	if (!response) {
4886589d30e0SAlex Elder 		ret = -ENOMEM;
4887589d30e0SAlex Elder 		goto out;
4888589d30e0SAlex Elder 	}
4889589d30e0SAlex Elder 
4890c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4891c0fba368SAlex Elder 
489236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
48934157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4894e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
489536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4896c0fba368SAlex Elder 	if (ret == -ENOENT) {
4897c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4898c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4899c0fba368SAlex Elder 		if (!ret)
4900c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4901c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4902c0fba368SAlex Elder 		void *p = response;
4903589d30e0SAlex Elder 
4904c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4905979ed480SAlex Elder 						NULL, GFP_NOIO);
4906461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
4907c0fba368SAlex Elder 		if (!ret)
4908c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4909589d30e0SAlex Elder 	} else {
4910c0fba368SAlex Elder 		ret = -EINVAL;
4911c0fba368SAlex Elder 	}
4912c0fba368SAlex Elder 
4913c0fba368SAlex Elder 	if (!ret) {
4914c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4915c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4916589d30e0SAlex Elder 	}
4917589d30e0SAlex Elder out:
4918589d30e0SAlex Elder 	kfree(response);
4919589d30e0SAlex Elder 	kfree(object_name);
4920589d30e0SAlex Elder 
4921589d30e0SAlex Elder 	return ret;
4922589d30e0SAlex Elder }
4923589d30e0SAlex Elder 
49243abef3b3SAlex Elder /*
49253abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
49263abef3b3SAlex Elder  * call.
49273abef3b3SAlex Elder  */
49286fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
49296fd48b3bSAlex Elder {
49306fd48b3bSAlex Elder 	struct rbd_image_header	*header;
49316fd48b3bSAlex Elder 
4932392a9dadSAlex Elder 	/* Drop parent reference unless it's already been done (or none) */
4933392a9dadSAlex Elder 
4934392a9dadSAlex Elder 	if (rbd_dev->parent_overlap)
4935a2acd00eSAlex Elder 		rbd_dev_parent_put(rbd_dev);
49366fd48b3bSAlex Elder 
49376fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
49386fd48b3bSAlex Elder 
49396fd48b3bSAlex Elder 	header = &rbd_dev->header;
4940812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
49416fd48b3bSAlex Elder 	kfree(header->snap_sizes);
49426fd48b3bSAlex Elder 	kfree(header->snap_names);
49436fd48b3bSAlex Elder 	kfree(header->object_prefix);
49446fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
49456fd48b3bSAlex Elder }
49466fd48b3bSAlex Elder 
49472df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4948a30b71b9SAlex Elder {
4949a30b71b9SAlex Elder 	int ret;
4950a30b71b9SAlex Elder 
49511e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
495257385b51SAlex Elder 	if (ret)
49531e130199SAlex Elder 		goto out_err;
4954b1b5402aSAlex Elder 
49552df3fac7SAlex Elder 	/*
49562df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
49572df3fac7SAlex Elder 	 * features are assumed to never change.
49582df3fac7SAlex Elder 	 */
4959b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
496057385b51SAlex Elder 	if (ret)
4961b1b5402aSAlex Elder 		goto out_err;
496235d489f9SAlex Elder 
4963cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4964cc070d59SAlex Elder 
4965cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4966cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4967cc070d59SAlex Elder 		if (ret < 0)
4968cc070d59SAlex Elder 			goto out_err;
4969cc070d59SAlex Elder 	}
49702df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
4971a30b71b9SAlex Elder 
497235152979SAlex Elder 	return 0;
49739d475de5SAlex Elder out_err:
4974642a2537SAlex Elder 	rbd_dev->header.features = 0;
49751e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
49761e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
49779d475de5SAlex Elder 
49789d475de5SAlex Elder 	return ret;
4979a30b71b9SAlex Elder }
4980a30b71b9SAlex Elder 
4981124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
498283a06263SAlex Elder {
49832f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4984124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4985124afba2SAlex Elder 	struct rbd_client *rbdc;
4986124afba2SAlex Elder 	int ret;
4987124afba2SAlex Elder 
4988124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4989124afba2SAlex Elder 		return 0;
4990124afba2SAlex Elder 	/*
4991124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4992124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4993124afba2SAlex Elder 	 * parent/child relationships always share both.
4994124afba2SAlex Elder 	 */
4995124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4996124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4997124afba2SAlex Elder 
4998124afba2SAlex Elder 	ret = -ENOMEM;
4999124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
5000124afba2SAlex Elder 	if (!parent)
5001124afba2SAlex Elder 		goto out_err;
5002124afba2SAlex Elder 
50031f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
5004124afba2SAlex Elder 	if (ret < 0)
5005124afba2SAlex Elder 		goto out_err;
5006124afba2SAlex Elder 	rbd_dev->parent = parent;
5007a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5008124afba2SAlex Elder 
5009124afba2SAlex Elder 	return 0;
5010124afba2SAlex Elder out_err:
5011124afba2SAlex Elder 	if (parent) {
5012fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
5013124afba2SAlex Elder 		kfree(rbd_dev->header_name);
5014124afba2SAlex Elder 		rbd_dev_destroy(parent);
5015124afba2SAlex Elder 	} else {
5016124afba2SAlex Elder 		rbd_put_client(rbdc);
5017124afba2SAlex Elder 		rbd_spec_put(parent_spec);
5018124afba2SAlex Elder 	}
5019124afba2SAlex Elder 
5020124afba2SAlex Elder 	return ret;
5021124afba2SAlex Elder }
5022124afba2SAlex Elder 
5023200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5024124afba2SAlex Elder {
502583a06263SAlex Elder 	int ret;
502683a06263SAlex Elder 
5027f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
502883a06263SAlex Elder 
5029f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
5030f8a22fc2SIlya Dryomov 	if (ret)
5031f8a22fc2SIlya Dryomov 		return ret;
5032f8a22fc2SIlya Dryomov 
503383a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
503483a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
503583a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
503683a06263SAlex Elder 
50379b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
503883a06263SAlex Elder 
50399b60e70bSIlya Dryomov 	if (!single_major) {
504083a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
504183a06263SAlex Elder 		if (ret < 0)
504283a06263SAlex Elder 			goto err_out_id;
50439b60e70bSIlya Dryomov 
504483a06263SAlex Elder 		rbd_dev->major = ret;
5045dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
50469b60e70bSIlya Dryomov 	} else {
50479b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
50489b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
50499b60e70bSIlya Dryomov 	}
505083a06263SAlex Elder 
505183a06263SAlex Elder 	/* Set up the blkdev mapping. */
505283a06263SAlex Elder 
505383a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
505483a06263SAlex Elder 	if (ret)
505583a06263SAlex Elder 		goto err_out_blkdev;
505683a06263SAlex Elder 
5057f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
505883a06263SAlex Elder 	if (ret)
505983a06263SAlex Elder 		goto err_out_disk;
5060f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
506122001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5062f35a4deeSAlex Elder 
5063f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
5064f35a4deeSAlex Elder 	if (ret)
5065f35a4deeSAlex Elder 		goto err_out_mapping;
506683a06263SAlex Elder 
506783a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
506883a06263SAlex Elder 
5069129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
507083a06263SAlex Elder 	add_disk(rbd_dev->disk);
507183a06263SAlex Elder 
507283a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
507383a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
507483a06263SAlex Elder 
507583a06263SAlex Elder 	return ret;
50762f82ee54SAlex Elder 
5077f35a4deeSAlex Elder err_out_mapping:
5078f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
507983a06263SAlex Elder err_out_disk:
508083a06263SAlex Elder 	rbd_free_disk(rbd_dev);
508183a06263SAlex Elder err_out_blkdev:
50829b60e70bSIlya Dryomov 	if (!single_major)
508383a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
508483a06263SAlex Elder err_out_id:
508583a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
5086d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
508783a06263SAlex Elder 
508883a06263SAlex Elder 	return ret;
508983a06263SAlex Elder }
509083a06263SAlex Elder 
5091332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5092332bb12dSAlex Elder {
5093332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5094332bb12dSAlex Elder 	size_t size;
5095332bb12dSAlex Elder 
5096332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5097332bb12dSAlex Elder 
5098332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5099332bb12dSAlex Elder 
5100332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5101332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
5102332bb12dSAlex Elder 	else
5103332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
5104332bb12dSAlex Elder 
5105332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
5106332bb12dSAlex Elder 	if (!rbd_dev->header_name)
5107332bb12dSAlex Elder 		return -ENOMEM;
5108332bb12dSAlex Elder 
5109332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5110332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5111332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
5112332bb12dSAlex Elder 	else
5113332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5114332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
5115332bb12dSAlex Elder 	return 0;
5116332bb12dSAlex Elder }
5117332bb12dSAlex Elder 
5118200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5119200a6a8bSAlex Elder {
51206fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5121200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
51226fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
51236fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
51246fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
51256fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
51266fd48b3bSAlex Elder 
5127200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5128200a6a8bSAlex Elder }
5129200a6a8bSAlex Elder 
5130a30b71b9SAlex Elder /*
5131a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
51321f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
51331f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
51341f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5135a30b71b9SAlex Elder  */
51361f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
5137a30b71b9SAlex Elder {
5138a30b71b9SAlex Elder 	int ret;
5139a30b71b9SAlex Elder 
5140a30b71b9SAlex Elder 	/*
51413abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
51423abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
51433abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
51443abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5145a30b71b9SAlex Elder 	 */
5146a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5147a30b71b9SAlex Elder 	if (ret)
5148c0fba368SAlex Elder 		return ret;
5149c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
5150c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5151c0fba368SAlex Elder 
5152332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5153332bb12dSAlex Elder 	if (ret)
5154332bb12dSAlex Elder 		goto err_out_format;
5155332bb12dSAlex Elder 
51561f3ef788SAlex Elder 	if (mapping) {
5157fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
5158b644de2bSAlex Elder 		if (ret)
5159b644de2bSAlex Elder 			goto out_header_name;
51601f3ef788SAlex Elder 	}
5161b644de2bSAlex Elder 
5162c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
516399a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
5164a30b71b9SAlex Elder 	else
51652df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
51665655c4d9SAlex Elder 	if (ret)
5167b644de2bSAlex Elder 		goto err_out_watch;
5168a30b71b9SAlex Elder 
51699bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
51709bb81c9bSAlex Elder 	if (ret)
517133dca39fSAlex Elder 		goto err_out_probe;
51729bb81c9bSAlex Elder 
51739bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
517430d60ba2SAlex Elder 	if (ret)
517530d60ba2SAlex Elder 		goto err_out_probe;
517683a06263SAlex Elder 
517730d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
517830d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
517930d60ba2SAlex Elder 
518030d60ba2SAlex Elder 	return 0;
51816fd48b3bSAlex Elder err_out_probe:
51826fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5183b644de2bSAlex Elder err_out_watch:
5184fca27065SIlya Dryomov 	if (mapping)
5185fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5186332bb12dSAlex Elder out_header_name:
5187332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5188332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5189332bb12dSAlex Elder err_out_format:
5190332bb12dSAlex Elder 	rbd_dev->image_format = 0;
51915655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
51925655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
51935655c4d9SAlex Elder 
51945655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
51955655c4d9SAlex Elder 
51965655c4d9SAlex Elder 	return ret;
519783a06263SAlex Elder }
519883a06263SAlex Elder 
51999b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
520059c2be1eSYehuda Sadeh 			  const char *buf,
520159c2be1eSYehuda Sadeh 			  size_t count)
5202602adf40SYehuda Sadeh {
5203cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5204dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
52054e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5206859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
52079d3997fdSAlex Elder 	struct rbd_client *rbdc;
520851344a38SAlex Elder 	bool read_only;
520927cc2594SAlex Elder 	int rc = -ENOMEM;
5210602adf40SYehuda Sadeh 
5211602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5212602adf40SYehuda Sadeh 		return -ENODEV;
5213602adf40SYehuda Sadeh 
5214a725f65eSAlex Elder 	/* parse add command */
5215859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5216dc79b113SAlex Elder 	if (rc < 0)
5217bd4ba655SAlex Elder 		goto err_out_module;
521851344a38SAlex Elder 	read_only = rbd_opts->read_only;
521951344a38SAlex Elder 	kfree(rbd_opts);
522051344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5221a725f65eSAlex Elder 
52229d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
52239d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
52249d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
52250ddebc0cSAlex Elder 		goto err_out_args;
52269d3997fdSAlex Elder 	}
5227602adf40SYehuda Sadeh 
5228602adf40SYehuda Sadeh 	/* pick the pool */
522930ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
5230602adf40SYehuda Sadeh 	if (rc < 0)
5231602adf40SYehuda Sadeh 		goto err_out_client;
5232859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5233859c31dfSAlex Elder 
52340903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
52350903e875SAlex Elder 
5236c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
5237c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5238c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
52390903e875SAlex Elder 		rc = -EIO;
52400903e875SAlex Elder 		goto err_out_client;
52410903e875SAlex Elder 	}
52420903e875SAlex Elder 
5243c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5244bd4ba655SAlex Elder 	if (!rbd_dev)
5245bd4ba655SAlex Elder 		goto err_out_client;
5246c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5247c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5248602adf40SYehuda Sadeh 
52491f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5250a30b71b9SAlex Elder 	if (rc < 0)
5251c53d5893SAlex Elder 		goto err_out_rbd_dev;
525205fd6f6fSAlex Elder 
52537ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
52547ce4eef7SAlex Elder 
52557ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
52567ce4eef7SAlex Elder 		read_only = true;
52577ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
52587ce4eef7SAlex Elder 
5259b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
52603abef3b3SAlex Elder 	if (rc) {
5261e37180c0SIlya Dryomov 		/*
5262e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5263e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5264e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5265e37180c0SIlya Dryomov 		 */
5266e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
52673abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
52683abef3b3SAlex Elder 		goto err_out_module;
52693abef3b3SAlex Elder 	}
52703abef3b3SAlex Elder 
5271602adf40SYehuda Sadeh 	return count;
5272b536f69aSAlex Elder 
5273c53d5893SAlex Elder err_out_rbd_dev:
5274c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5275bd4ba655SAlex Elder err_out_client:
52769d3997fdSAlex Elder 	rbd_put_client(rbdc);
52770ddebc0cSAlex Elder err_out_args:
5278859c31dfSAlex Elder 	rbd_spec_put(spec);
5279bd4ba655SAlex Elder err_out_module:
5280bd4ba655SAlex Elder 	module_put(THIS_MODULE);
528127cc2594SAlex Elder 
5282602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
528327cc2594SAlex Elder 
528427cc2594SAlex Elder 	return (ssize_t)rc;
5285602adf40SYehuda Sadeh }
5286602adf40SYehuda Sadeh 
52879b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
52889b60e70bSIlya Dryomov 		       const char *buf,
52899b60e70bSIlya Dryomov 		       size_t count)
52909b60e70bSIlya Dryomov {
52919b60e70bSIlya Dryomov 	if (single_major)
52929b60e70bSIlya Dryomov 		return -EINVAL;
52939b60e70bSIlya Dryomov 
52949b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
52959b60e70bSIlya Dryomov }
52969b60e70bSIlya Dryomov 
52979b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
52989b60e70bSIlya Dryomov 				    const char *buf,
52999b60e70bSIlya Dryomov 				    size_t count)
53009b60e70bSIlya Dryomov {
53019b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
53029b60e70bSIlya Dryomov }
53039b60e70bSIlya Dryomov 
5304200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5305602adf40SYehuda Sadeh {
5306593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5307602adf40SYehuda Sadeh 
5308602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5309200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
53106d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
53119b60e70bSIlya Dryomov 	if (!single_major)
5312602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5313e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5314d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5315602adf40SYehuda Sadeh }
5316602adf40SYehuda Sadeh 
531705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
531805a46afdSAlex Elder {
5319ad945fc1SAlex Elder 	while (rbd_dev->parent) {
532005a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
532105a46afdSAlex Elder 		struct rbd_device *second = first->parent;
532205a46afdSAlex Elder 		struct rbd_device *third;
532305a46afdSAlex Elder 
532405a46afdSAlex Elder 		/*
532505a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
532605a46afdSAlex Elder 		 * remove it.
532705a46afdSAlex Elder 		 */
532805a46afdSAlex Elder 		while (second && (third = second->parent)) {
532905a46afdSAlex Elder 			first = second;
533005a46afdSAlex Elder 			second = third;
533105a46afdSAlex Elder 		}
5332ad945fc1SAlex Elder 		rbd_assert(second);
53338ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5334ad945fc1SAlex Elder 		first->parent = NULL;
5335ad945fc1SAlex Elder 		first->parent_overlap = 0;
5336ad945fc1SAlex Elder 
5337ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
533805a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
533905a46afdSAlex Elder 		first->parent_spec = NULL;
534005a46afdSAlex Elder 	}
534105a46afdSAlex Elder }
534205a46afdSAlex Elder 
53439b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5344602adf40SYehuda Sadeh 			     const char *buf,
5345602adf40SYehuda Sadeh 			     size_t count)
5346602adf40SYehuda Sadeh {
5347602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5348751cc0e3SAlex Elder 	struct list_head *tmp;
5349751cc0e3SAlex Elder 	int dev_id;
5350602adf40SYehuda Sadeh 	unsigned long ul;
535182a442d2SAlex Elder 	bool already = false;
53520d8189e1SAlex Elder 	int ret;
5353602adf40SYehuda Sadeh 
5354bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
53550d8189e1SAlex Elder 	if (ret)
53560d8189e1SAlex Elder 		return ret;
5357602adf40SYehuda Sadeh 
5358602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5359751cc0e3SAlex Elder 	dev_id = (int)ul;
5360751cc0e3SAlex Elder 	if (dev_id != ul)
5361602adf40SYehuda Sadeh 		return -EINVAL;
5362602adf40SYehuda Sadeh 
5363602adf40SYehuda Sadeh 	ret = -ENOENT;
5364751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5365751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5366751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5367751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5368751cc0e3SAlex Elder 			ret = 0;
5369751cc0e3SAlex Elder 			break;
5370602adf40SYehuda Sadeh 		}
5371751cc0e3SAlex Elder 	}
5372751cc0e3SAlex Elder 	if (!ret) {
5373a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5374b82d167bSAlex Elder 		if (rbd_dev->open_count)
537542382b70SAlex Elder 			ret = -EBUSY;
5376b82d167bSAlex Elder 		else
537782a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
537882a442d2SAlex Elder 							&rbd_dev->flags);
5379a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5380751cc0e3SAlex Elder 	}
5381751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
538282a442d2SAlex Elder 	if (ret < 0 || already)
53831ba0f1e7SAlex Elder 		return ret;
5384751cc0e3SAlex Elder 
5385fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
53869abc5990SJosh Durgin 	/*
53879abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
53889abc5990SJosh Durgin 	 * before the osd_client is shutdown
53899abc5990SJosh Durgin 	 */
53909abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
53919abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5392fca27065SIlya Dryomov 
53939875201eSJosh Durgin 	/*
53949875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
53959875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
53969875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
53979875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
53989875201eSJosh Durgin 	 */
53999875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
54008ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
540179ab7558SAlex Elder 	module_put(THIS_MODULE);
5402aafb230eSAlex Elder 
54031ba0f1e7SAlex Elder 	return count;
5404602adf40SYehuda Sadeh }
5405602adf40SYehuda Sadeh 
54069b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
54079b60e70bSIlya Dryomov 			  const char *buf,
54089b60e70bSIlya Dryomov 			  size_t count)
54099b60e70bSIlya Dryomov {
54109b60e70bSIlya Dryomov 	if (single_major)
54119b60e70bSIlya Dryomov 		return -EINVAL;
54129b60e70bSIlya Dryomov 
54139b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
54149b60e70bSIlya Dryomov }
54159b60e70bSIlya Dryomov 
54169b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
54179b60e70bSIlya Dryomov 				       const char *buf,
54189b60e70bSIlya Dryomov 				       size_t count)
54199b60e70bSIlya Dryomov {
54209b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
54219b60e70bSIlya Dryomov }
54229b60e70bSIlya Dryomov 
5423602adf40SYehuda Sadeh /*
5424602adf40SYehuda Sadeh  * create control files in sysfs
5425dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5426602adf40SYehuda Sadeh  */
5427602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5428602adf40SYehuda Sadeh {
5429dfc5606dSYehuda Sadeh 	int ret;
5430602adf40SYehuda Sadeh 
5431fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5432dfc5606dSYehuda Sadeh 	if (ret < 0)
5433dfc5606dSYehuda Sadeh 		return ret;
5434602adf40SYehuda Sadeh 
5435fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5436fed4c143SAlex Elder 	if (ret < 0)
5437fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5438602adf40SYehuda Sadeh 
5439602adf40SYehuda Sadeh 	return ret;
5440602adf40SYehuda Sadeh }
5441602adf40SYehuda Sadeh 
5442602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5443602adf40SYehuda Sadeh {
5444dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5445fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5446602adf40SYehuda Sadeh }
5447602adf40SYehuda Sadeh 
54481c2a9dfeSAlex Elder static int rbd_slab_init(void)
54491c2a9dfeSAlex Elder {
54501c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
54511c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
54521c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
54531c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
54541c2a9dfeSAlex Elder 					0, NULL);
5455868311b1SAlex Elder 	if (!rbd_img_request_cache)
5456868311b1SAlex Elder 		return -ENOMEM;
5457868311b1SAlex Elder 
5458868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5459868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5460868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5461868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5462868311b1SAlex Elder 					0, NULL);
546378c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
546478c2a44aSAlex Elder 		goto out_err;
546578c2a44aSAlex Elder 
546678c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
546778c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
54682d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
546978c2a44aSAlex Elder 	if (rbd_segment_name_cache)
54701c2a9dfeSAlex Elder 		return 0;
547178c2a44aSAlex Elder out_err:
547278c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
547378c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
547478c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
547578c2a44aSAlex Elder 	}
54761c2a9dfeSAlex Elder 
5477868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5478868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5479868311b1SAlex Elder 
54801c2a9dfeSAlex Elder 	return -ENOMEM;
54811c2a9dfeSAlex Elder }
54821c2a9dfeSAlex Elder 
54831c2a9dfeSAlex Elder static void rbd_slab_exit(void)
54841c2a9dfeSAlex Elder {
548578c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
548678c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
548778c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
548878c2a44aSAlex Elder 
5489868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5490868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5491868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5492868311b1SAlex Elder 
54931c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
54941c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
54951c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
54961c2a9dfeSAlex Elder }
54971c2a9dfeSAlex Elder 
5498cc344fa1SAlex Elder static int __init rbd_init(void)
5499602adf40SYehuda Sadeh {
5500602adf40SYehuda Sadeh 	int rc;
5501602adf40SYehuda Sadeh 
55021e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
55031e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
55041e32d34cSAlex Elder 		return -EINVAL;
55051e32d34cSAlex Elder 	}
5506e1b4d96dSIlya Dryomov 
55071c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5508602adf40SYehuda Sadeh 	if (rc)
5509602adf40SYehuda Sadeh 		return rc;
5510e1b4d96dSIlya Dryomov 
55119b60e70bSIlya Dryomov 	if (single_major) {
55129b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
55139b60e70bSIlya Dryomov 		if (rbd_major < 0) {
55149b60e70bSIlya Dryomov 			rc = rbd_major;
55159b60e70bSIlya Dryomov 			goto err_out_slab;
55169b60e70bSIlya Dryomov 		}
55179b60e70bSIlya Dryomov 	}
55189b60e70bSIlya Dryomov 
55191c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
55201c2a9dfeSAlex Elder 	if (rc)
55219b60e70bSIlya Dryomov 		goto err_out_blkdev;
55221c2a9dfeSAlex Elder 
55239b60e70bSIlya Dryomov 	if (single_major)
55249b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
55259b60e70bSIlya Dryomov 	else
5526e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
55279b60e70bSIlya Dryomov 
5528e1b4d96dSIlya Dryomov 	return 0;
5529e1b4d96dSIlya Dryomov 
55309b60e70bSIlya Dryomov err_out_blkdev:
55319b60e70bSIlya Dryomov 	if (single_major)
55329b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5533e1b4d96dSIlya Dryomov err_out_slab:
5534e1b4d96dSIlya Dryomov 	rbd_slab_exit();
55351c2a9dfeSAlex Elder 	return rc;
5536602adf40SYehuda Sadeh }
5537602adf40SYehuda Sadeh 
5538cc344fa1SAlex Elder static void __exit rbd_exit(void)
5539602adf40SYehuda Sadeh {
5540ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
5541602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
55429b60e70bSIlya Dryomov 	if (single_major)
55439b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
55441c2a9dfeSAlex Elder 	rbd_slab_exit();
5545602adf40SYehuda Sadeh }
5546602adf40SYehuda Sadeh 
5547602adf40SYehuda Sadeh module_init(rbd_init);
5548602adf40SYehuda Sadeh module_exit(rbd_exit);
5549602adf40SYehuda Sadeh 
5550d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5551602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5552602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5553602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5554602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5555602adf40SYehuda Sadeh 
555690da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5557602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5558