xref: /openbmc/linux/drivers/block/rbd.c (revision 88a25a5f)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
35602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3659c2be1eSYehuda Sadeh #include <linux/parser.h>
3730d1cff8SAlex Elder #include <linux/bsearch.h>
38602adf40SYehuda Sadeh 
39602adf40SYehuda Sadeh #include <linux/kernel.h>
40602adf40SYehuda Sadeh #include <linux/device.h>
41602adf40SYehuda Sadeh #include <linux/module.h>
427ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
43602adf40SYehuda Sadeh #include <linux/fs.h>
44602adf40SYehuda Sadeh #include <linux/blkdev.h>
451c2a9dfeSAlex Elder #include <linux/slab.h>
46f8a22fc2SIlya Dryomov #include <linux/idr.h>
47bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
48602adf40SYehuda Sadeh 
49602adf40SYehuda Sadeh #include "rbd_types.h"
50602adf40SYehuda Sadeh 
51aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
52aafb230eSAlex Elder 
53593a9e7bSAlex Elder /*
54593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
55593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
56593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
57593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
58593a9e7bSAlex Elder  */
59593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
60593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
61593a9e7bSAlex Elder 
62a2acd00eSAlex Elder /*
63a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
64a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
65a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
66a2acd00eSAlex Elder  * -EINVAL without updating it.
67a2acd00eSAlex Elder  */
68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
69a2acd00eSAlex Elder {
70a2acd00eSAlex Elder 	unsigned int counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
73a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
74a2acd00eSAlex Elder 		return (int)counter;
75a2acd00eSAlex Elder 
76a2acd00eSAlex Elder 	atomic_dec(v);
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	return -EINVAL;
79a2acd00eSAlex Elder }
80a2acd00eSAlex Elder 
81a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
83a2acd00eSAlex Elder {
84a2acd00eSAlex Elder 	int counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
87a2acd00eSAlex Elder 	if (counter >= 0)
88a2acd00eSAlex Elder 		return counter;
89a2acd00eSAlex Elder 
90a2acd00eSAlex Elder 	atomic_inc(v);
91a2acd00eSAlex Elder 
92a2acd00eSAlex Elder 	return -EINVAL;
93a2acd00eSAlex Elder }
94a2acd00eSAlex Elder 
95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
96602adf40SYehuda Sadeh 
977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR		256
987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT	4
99602adf40SYehuda Sadeh 
1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN	16
1016d69bb53SIlya Dryomov 
102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
104d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
105d4b125e9SAlex Elder 
10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
107602adf40SYehuda Sadeh 
108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
109602adf40SYehuda Sadeh 
1109682fc6dSAlex Elder #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
1119682fc6dSAlex Elder 
1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
1159e15b77dSAlex Elder 
1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
117589d30e0SAlex Elder 
118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT	5	/* seconds */
11999d16943SIlya Dryomov #define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
12099d16943SIlya Dryomov 
121d889140cSAlex Elder /* Feature bits */
122d889140cSAlex Elder 
1235cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING	(1<<0)
1245cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2	(1<<1)
125ed95b21aSIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2)
126ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
127ed95b21aSIlya Dryomov 				 RBD_FEATURE_STRIPINGV2 |	\
128ed95b21aSIlya Dryomov 				 RBD_FEATURE_EXCLUSIVE_LOCK)
129d889140cSAlex Elder 
130d889140cSAlex Elder /* Features supported by this (client software) implementation. */
131d889140cSAlex Elder 
132770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
133d889140cSAlex Elder 
13481a89793SAlex Elder /*
13581a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
13681a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
13781a89793SAlex Elder  */
138602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
139602adf40SYehuda Sadeh 
140602adf40SYehuda Sadeh /*
141602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
142602adf40SYehuda Sadeh  */
143602adf40SYehuda Sadeh struct rbd_image_header {
144f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
145849b4260SAlex Elder 	char *object_prefix;
146602adf40SYehuda Sadeh 	__u8 obj_order;
147602adf40SYehuda Sadeh 	__u8 crypt_type;
148602adf40SYehuda Sadeh 	__u8 comp_type;
149f35a4deeSAlex Elder 	u64 stripe_unit;
150f35a4deeSAlex Elder 	u64 stripe_count;
151f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
152602adf40SYehuda Sadeh 
153f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
154f84344f3SAlex Elder 	u64 image_size;
155f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
156f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
157f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15859c2be1eSYehuda Sadeh };
15959c2be1eSYehuda Sadeh 
1600d7dbfceSAlex Elder /*
1610d7dbfceSAlex Elder  * An rbd image specification.
1620d7dbfceSAlex Elder  *
1630d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
164c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
165c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
166c66c6e0cSAlex Elder  *
167c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
168c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
169c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
170c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
171c66c6e0cSAlex Elder  *
172c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
173c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
174c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
175c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
176c66c6e0cSAlex Elder  * is shared between the parent and child).
177c66c6e0cSAlex Elder  *
178c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
179c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
180c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
181c66c6e0cSAlex Elder  *
182c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
183c66c6e0cSAlex Elder  * could be a null pointer).
1840d7dbfceSAlex Elder  */
1850d7dbfceSAlex Elder struct rbd_spec {
1860d7dbfceSAlex Elder 	u64		pool_id;
187ecb4dc22SAlex Elder 	const char	*pool_name;
1880d7dbfceSAlex Elder 
189ecb4dc22SAlex Elder 	const char	*image_id;
190ecb4dc22SAlex Elder 	const char	*image_name;
1910d7dbfceSAlex Elder 
1920d7dbfceSAlex Elder 	u64		snap_id;
193ecb4dc22SAlex Elder 	const char	*snap_name;
1940d7dbfceSAlex Elder 
1950d7dbfceSAlex Elder 	struct kref	kref;
1960d7dbfceSAlex Elder };
1970d7dbfceSAlex Elder 
198602adf40SYehuda Sadeh /*
199f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
200602adf40SYehuda Sadeh  */
201602adf40SYehuda Sadeh struct rbd_client {
202602adf40SYehuda Sadeh 	struct ceph_client	*client;
203602adf40SYehuda Sadeh 	struct kref		kref;
204602adf40SYehuda Sadeh 	struct list_head	node;
205602adf40SYehuda Sadeh };
206602adf40SYehuda Sadeh 
207bf0d5f50SAlex Elder struct rbd_img_request;
208bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
209bf0d5f50SAlex Elder 
210bf0d5f50SAlex Elder #define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
211bf0d5f50SAlex Elder 
212bf0d5f50SAlex Elder struct rbd_obj_request;
213bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
214bf0d5f50SAlex Elder 
/* Kind of data buffer attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 0,
	OBJ_REQUEST_BIO = 1,
	OBJ_REQUEST_PAGES = 2,
};
218bf0d5f50SAlex Elder 
/* Operation carried out by an object request */
enum obj_operation_type {
	OBJ_OP_WRITE = 0,
	OBJ_OP_READ = 1,
	OBJ_OP_DISCARD = 2,
};
2246d2940c8SGuangliang Zhao 
/* Flag positions for an object request; each is 0 (clear) or 1 (set) */
enum obj_req_flags {
	OBJ_REQ_DONE = 0,	/* completion: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA = 1,	/* usage: 0 = standalone, 1 = image */
	OBJ_REQ_KNOWN = 2,	/* is the EXISTS flag valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS = 3,	/* does the target exist: 0 = no, 1 = yes */
};
231926f9b3fSAlex Elder 
232bf0d5f50SAlex Elder struct rbd_obj_request {
233bf0d5f50SAlex Elder 	const char		*object_name;
234bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
235bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
236926f9b3fSAlex Elder 	unsigned long		flags;
237bf0d5f50SAlex Elder 
238c5b5ef6cSAlex Elder 	/*
239c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
240c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
241c5b5ef6cSAlex Elder 	 *
242c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
243c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
244c5b5ef6cSAlex Elder 	 *
245c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
246c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
247c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
248c5b5ef6cSAlex Elder 	 *
249c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
250c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
251c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
252c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
253c5b5ef6cSAlex Elder 	 */
254c5b5ef6cSAlex Elder 	union {
255c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
256c5b5ef6cSAlex Elder 		struct {
257bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
258c5b5ef6cSAlex Elder 			u64			img_offset;
259c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
260c5b5ef6cSAlex Elder 			struct list_head	links;
261c5b5ef6cSAlex Elder 		};
262c5b5ef6cSAlex Elder 	};
263bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	enum obj_request_type	type;
266788e2df3SAlex Elder 	union {
267bf0d5f50SAlex Elder 		struct bio	*bio_list;
268788e2df3SAlex Elder 		struct {
269788e2df3SAlex Elder 			struct page	**pages;
270788e2df3SAlex Elder 			u32		page_count;
271788e2df3SAlex Elder 		};
272788e2df3SAlex Elder 	};
2730eefd470SAlex Elder 	struct page		**copyup_pages;
274ebda6408SAlex Elder 	u32			copyup_page_count;
275bf0d5f50SAlex Elder 
276bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
277bf0d5f50SAlex Elder 
278bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2791b83bef2SSage Weil 	int			result;
280bf0d5f50SAlex Elder 
281bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
282788e2df3SAlex Elder 	struct completion	completion;
283bf0d5f50SAlex Elder 
284bf0d5f50SAlex Elder 	struct kref		kref;
285bf0d5f50SAlex Elder };
286bf0d5f50SAlex Elder 
/* Flag positions for an image request; each is 0 (clear) or 1 (set) */
enum img_req_flags {
	IMG_REQ_WRITE = 0,	/* I/O direction: 0 = read, 1 = write */
	IMG_REQ_CHILD = 1,	/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED = 2,	/* ENOENT handling: 0 = normal, 1 = layered */
	IMG_REQ_DISCARD = 3,	/* discard: 0 = normal, 1 = discard request */
};
2930c425248SAlex Elder 
294bf0d5f50SAlex Elder struct rbd_img_request {
295bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
296bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
297bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2980c425248SAlex Elder 	unsigned long		flags;
299bf0d5f50SAlex Elder 	union {
300bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3019849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3029849e986SAlex Elder 	};
3039849e986SAlex Elder 	union {
3049849e986SAlex Elder 		struct request		*rq;		/* block request */
3059849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
306bf0d5f50SAlex Elder 	};
3073d7efd18SAlex Elder 	struct page		**copyup_pages;
308ebda6408SAlex Elder 	u32			copyup_page_count;
309bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
310bf0d5f50SAlex Elder 	u32			next_completion;
311bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
31255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
313a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
314bf0d5f50SAlex Elder 
315bf0d5f50SAlex Elder 	u32			obj_request_count;
316bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
317bf0d5f50SAlex Elder 
318bf0d5f50SAlex Elder 	struct kref		kref;
319bf0d5f50SAlex Elder };
320bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list (through each object request's "links" member).
 *
 *  for_each_obj_request()       - walk the whole list
 *  for_each_obj_request_from()  - continue from the current oreq
 *  for_each_obj_request_safe()  - removal-safe walk; note that this
 *                                 one traverses the list in reverse
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
327bf0d5f50SAlex Elder 
/* Values stored in rbd_device->watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED = 0,
	RBD_WATCH_STATE_REGISTERED = 1,
	RBD_WATCH_STATE_ERROR = 2,
};
33399d16943SIlya Dryomov 
/* Values stored in rbd_device->lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED = 0,
	RBD_LOCK_STATE_LOCKED = 1,
	RBD_LOCK_STATE_RELEASING = 2,
};
339ed95b21aSIlya Dryomov 
340ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
341ed95b21aSIlya Dryomov struct rbd_client_id {
342ed95b21aSIlya Dryomov 	u64 gid;
343ed95b21aSIlya Dryomov 	u64 handle;
344ed95b21aSIlya Dryomov };
345ed95b21aSIlya Dryomov 
346f84344f3SAlex Elder struct rbd_mapping {
34799c1f08fSAlex Elder 	u64                     size;
34834b13184SAlex Elder 	u64                     features;
349f84344f3SAlex Elder 	bool			read_only;
350f84344f3SAlex Elder };
351f84344f3SAlex Elder 
352602adf40SYehuda Sadeh /*
353602adf40SYehuda Sadeh  * a single device
354602adf40SYehuda Sadeh  */
355602adf40SYehuda Sadeh struct rbd_device {
356de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
359dd82fff1SIlya Dryomov 	int			minor;
360602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
361602adf40SYehuda Sadeh 
362a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
363602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
364602adf40SYehuda Sadeh 
365602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
366602adf40SYehuda Sadeh 
367b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
368602adf40SYehuda Sadeh 
369602adf40SYehuda Sadeh 	struct rbd_image_header	header;
370b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3710d7dbfceSAlex Elder 	struct rbd_spec		*spec;
372d147543dSIlya Dryomov 	struct rbd_options	*opts;
3730d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
374602adf40SYehuda Sadeh 
375c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
376922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
377971f839aSAlex Elder 
3781643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3790903e875SAlex Elder 
38099d16943SIlya Dryomov 	struct mutex		watch_mutex;
38199d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
382922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
38399d16943SIlya Dryomov 	u64			watch_cookie;
38499d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
38559c2be1eSYehuda Sadeh 
386ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
387ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
388ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
389ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
390ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
391ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
392ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
393ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
394ed95b21aSIlya Dryomov 
3951643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
3961643dfa4SIlya Dryomov 
39786b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
39886b00e0dSAlex Elder 	u64			parent_overlap;
399a2acd00eSAlex Elder 	atomic_t		parent_ref;
4002f82ee54SAlex Elder 	struct rbd_device	*parent;
40186b00e0dSAlex Elder 
4027ad18afaSChristoph Hellwig 	/* Block layer tags. */
4037ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4047ad18afaSChristoph Hellwig 
405c666601aSJosh Durgin 	/* protects updating the header */
406c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
407f84344f3SAlex Elder 
408f84344f3SAlex Elder 	struct rbd_mapping	mapping;
409602adf40SYehuda Sadeh 
410602adf40SYehuda Sadeh 	struct list_head	node;
411dfc5606dSYehuda Sadeh 
412dfc5606dSYehuda Sadeh 	/* sysfs related */
413dfc5606dSYehuda Sadeh 	struct device		dev;
414b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
415dfc5606dSYehuda Sadeh };
416dfc5606dSYehuda Sadeh 
/*
 * Bit positions within rbd_dev->flags.  Where an update has to be
 * atomic, rbd_dev->lock is held around it.
 *
 * At present that only applies to the "removing" flag, which is
 * tied to the "open_count" field.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS = 0,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING = 1,	/* this mapping is being removed */
};
4286d292906SAlex Elder 
429cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
430e124a82fSAlex Elder 
431602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
432e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
433e124a82fSAlex Elder 
434602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
435432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
436602adf40SYehuda Sadeh 
43778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
43878c2a44aSAlex Elder 
4391c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
440868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
44178c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
4421c2a9dfeSAlex Elder 
4439b60e70bSIlya Dryomov static int rbd_major;
444f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
445f8a22fc2SIlya Dryomov 
446f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
447f5ee37bdSIlya Dryomov 
4489b60e70bSIlya Dryomov /*
4499b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4509b60e70bSIlya Dryomov  * userspace rbd utility.
4519b60e70bSIlya Dryomov  */
4529b60e70bSIlya Dryomov static bool single_major = false;
4539b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4549b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4559b60e70bSIlya Dryomov 
4563d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4573d7efd18SAlex Elder 
458f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
459f0f8cef5SAlex Elder 		       size_t count);
460f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
461f0f8cef5SAlex Elder 			  size_t count);
4629b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4639b60e70bSIlya Dryomov 				    size_t count);
4649b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4659b60e70bSIlya Dryomov 				       size_t count);
4666d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
467a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
468f0f8cef5SAlex Elder 
4699b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4709b60e70bSIlya Dryomov {
4717e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4729b60e70bSIlya Dryomov }
4739b60e70bSIlya Dryomov 
4749b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4759b60e70bSIlya Dryomov {
4767e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4779b60e70bSIlya Dryomov }
4789b60e70bSIlya Dryomov 
479ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
480ed95b21aSIlya Dryomov {
481ed95b21aSIlya Dryomov 	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
482ed95b21aSIlya Dryomov 	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
483ed95b21aSIlya Dryomov 	       !rbd_dev->mapping.read_only;
484ed95b21aSIlya Dryomov }
485ed95b21aSIlya Dryomov 
486ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
487ed95b21aSIlya Dryomov {
488ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
489ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
490ed95b21aSIlya Dryomov }
491ed95b21aSIlya Dryomov 
492ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
493ed95b21aSIlya Dryomov {
494ed95b21aSIlya Dryomov 	bool is_lock_owner;
495ed95b21aSIlya Dryomov 
496ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
497ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
498ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
499ed95b21aSIlya Dryomov 	return is_lock_owner;
500ed95b21aSIlya Dryomov }
501ed95b21aSIlya Dryomov 
502b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
503b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
5049b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
5059b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
506b15a21ddSGreg Kroah-Hartman 
507b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
508b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
509b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5109b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5119b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
512b15a21ddSGreg Kroah-Hartman 	NULL,
513f0f8cef5SAlex Elder };
51492c76dc0SIlya Dryomov 
51592c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
51692c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
51792c76dc0SIlya Dryomov {
5189b60e70bSIlya Dryomov 	if (!single_major &&
5199b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5209b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5219b60e70bSIlya Dryomov 		return 0;
5229b60e70bSIlya Dryomov 
52392c76dc0SIlya Dryomov 	return attr->mode;
52492c76dc0SIlya Dryomov }
52592c76dc0SIlya Dryomov 
52692c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
52792c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
52892c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
52992c76dc0SIlya Dryomov };
53092c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
531f0f8cef5SAlex Elder 
532f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
533f0f8cef5SAlex Elder 	.name		= "rbd",
534b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
535f0f8cef5SAlex Elder };
536f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; intentionally does nothing. */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to release */
}
540f0f8cef5SAlex Elder 
541f0f8cef5SAlex Elder static struct device rbd_root_dev = {
542f0f8cef5SAlex Elder 	.init_name =    "rbd",
543f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
544f0f8cef5SAlex Elder };
545f0f8cef5SAlex Elder 
54606ecc6cbSAlex Elder static __printf(2, 3)
54706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
54806ecc6cbSAlex Elder {
54906ecc6cbSAlex Elder 	struct va_format vaf;
55006ecc6cbSAlex Elder 	va_list args;
55106ecc6cbSAlex Elder 
55206ecc6cbSAlex Elder 	va_start(args, fmt);
55306ecc6cbSAlex Elder 	vaf.fmt = fmt;
55406ecc6cbSAlex Elder 	vaf.va = &args;
55506ecc6cbSAlex Elder 
55606ecc6cbSAlex Elder 	if (!rbd_dev)
55706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
55806ecc6cbSAlex Elder 	else if (rbd_dev->disk)
55906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
56006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
56106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
56206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
56306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
56406ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
56506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
56606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
56706ecc6cbSAlex Elder 	else	/* punt */
56806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
56906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
57006ecc6cbSAlex Elder 	va_end(args);
57106ecc6cbSAlex Elder }
57206ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false expr logs the enclosing function,
 * the line number and the text of the expression at KERN_ERR level
 * and then calls BUG().  Without RBD_DEBUG it expands to a no-op and
 * expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it is a
 * single statement: it can be used as the body of an un-braced
 * if/else without capturing the else or breaking the parse.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
585dfc5606dSYehuda Sadeh 
5862761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
587b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
58805a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
58905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5908b3e1a56SAlex Elder 
591cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5922df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
593a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
594e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
59554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
59654cac61fSAlex Elder 					u64 snap_id);
5972ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5982ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5992ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
6002ad3d716SAlex Elder 		u64 *snap_features);
60159c2be1eSYehuda Sadeh 
602602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
603602adf40SYehuda Sadeh {
604f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
605b82d167bSAlex Elder 	bool removing = false;
606602adf40SYehuda Sadeh 
607f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
608602adf40SYehuda Sadeh 		return -EROFS;
609602adf40SYehuda Sadeh 
610a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
611b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
612b82d167bSAlex Elder 		removing = true;
613b82d167bSAlex Elder 	else
614b82d167bSAlex Elder 		rbd_dev->open_count++;
615a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
616b82d167bSAlex Elder 	if (removing)
617b82d167bSAlex Elder 		return -ENOENT;
618b82d167bSAlex Elder 
619c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
620340c7a2bSAlex Elder 
621602adf40SYehuda Sadeh 	return 0;
622602adf40SYehuda Sadeh }
623602adf40SYehuda Sadeh 
624db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
625dfc5606dSYehuda Sadeh {
626dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
627b82d167bSAlex Elder 	unsigned long open_count_before;
628b82d167bSAlex Elder 
629a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
630b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
631a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
632b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
633dfc5606dSYehuda Sadeh 
634c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
635dfc5606dSYehuda Sadeh }
636dfc5606dSYehuda Sadeh 
637131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
638131fd9f6SGuangliang Zhao {
63977f33c03SJosh Durgin 	int ret = 0;
640131fd9f6SGuangliang Zhao 	int val;
641131fd9f6SGuangliang Zhao 	bool ro;
64277f33c03SJosh Durgin 	bool ro_changed = false;
643131fd9f6SGuangliang Zhao 
64477f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
645131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
646131fd9f6SGuangliang Zhao 		return -EFAULT;
647131fd9f6SGuangliang Zhao 
648131fd9f6SGuangliang Zhao 	ro = val ? true : false;
649131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
650131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
651131fd9f6SGuangliang Zhao 		return -EROFS;
652131fd9f6SGuangliang Zhao 
65377f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
65477f33c03SJosh Durgin 	/* prevent others open this device */
65577f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
65677f33c03SJosh Durgin 		ret = -EBUSY;
65777f33c03SJosh Durgin 		goto out;
658131fd9f6SGuangliang Zhao 	}
659131fd9f6SGuangliang Zhao 
66077f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
66177f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
66277f33c03SJosh Durgin 		ro_changed = true;
66377f33c03SJosh Durgin 	}
66477f33c03SJosh Durgin 
66577f33c03SJosh Durgin out:
66677f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
66777f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
66877f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
66977f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
67077f33c03SJosh Durgin 
67177f33c03SJosh Durgin 	return ret;
672131fd9f6SGuangliang Zhao }
673131fd9f6SGuangliang Zhao 
674131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
675131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
676131fd9f6SGuangliang Zhao {
677131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
678131fd9f6SGuangliang Zhao 	int ret = 0;
679131fd9f6SGuangliang Zhao 
680131fd9f6SGuangliang Zhao 	switch (cmd) {
681131fd9f6SGuangliang Zhao 	case BLKROSET:
682131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
683131fd9f6SGuangliang Zhao 		break;
684131fd9f6SGuangliang Zhao 	default:
685131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
686131fd9f6SGuangliang Zhao 	}
687131fd9f6SGuangliang Zhao 
688131fd9f6SGuangliang Zhao 	return ret;
689131fd9f6SGuangliang Zhao }
690131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards every request unchanged to rbd_ioctl() */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret;

	ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
698131fd9f6SGuangliang Zhao 
699602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
700602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
701602adf40SYehuda Sadeh 	.open			= rbd_open,
702dfc5606dSYehuda Sadeh 	.release		= rbd_release,
703131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
704131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
705131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
706131fd9f6SGuangliang Zhao #endif
707602adf40SYehuda Sadeh };
708602adf40SYehuda Sadeh 
709602adf40SYehuda Sadeh /*
7107262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
711cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
712602adf40SYehuda Sadeh  */
713f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
714602adf40SYehuda Sadeh {
715602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
716602adf40SYehuda Sadeh 	int ret = -ENOMEM;
717602adf40SYehuda Sadeh 
71837206ee5SAlex Elder 	dout("%s:\n", __func__);
719602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
720602adf40SYehuda Sadeh 	if (!rbdc)
721602adf40SYehuda Sadeh 		goto out_opt;
722602adf40SYehuda Sadeh 
723602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
724602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
725602adf40SYehuda Sadeh 
72643ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
727602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
72808f75463SAlex Elder 		goto out_rbdc;
72943ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
730602adf40SYehuda Sadeh 
731602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
732602adf40SYehuda Sadeh 	if (ret < 0)
73308f75463SAlex Elder 		goto out_client;
734602adf40SYehuda Sadeh 
735432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
736602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
737432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
738602adf40SYehuda Sadeh 
73937206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
740bc534d86SAlex Elder 
741602adf40SYehuda Sadeh 	return rbdc;
74208f75463SAlex Elder out_client:
743602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
74408f75463SAlex Elder out_rbdc:
745602adf40SYehuda Sadeh 	kfree(rbdc);
746602adf40SYehuda Sadeh out_opt:
74743ae4701SAlex Elder 	if (ceph_opts)
74843ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
74937206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
75037206ee5SAlex Elder 
75128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
752602adf40SYehuda Sadeh }
753602adf40SYehuda Sadeh 
7542f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7552f82ee54SAlex Elder {
7562f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7572f82ee54SAlex Elder 
7582f82ee54SAlex Elder 	return rbdc;
7592f82ee54SAlex Elder }
7602f82ee54SAlex Elder 
761602adf40SYehuda Sadeh /*
7621f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7631f7ba331SAlex Elder  * found, bump its reference count.
764602adf40SYehuda Sadeh  */
7651f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
766602adf40SYehuda Sadeh {
767602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7681f7ba331SAlex Elder 	bool found = false;
769602adf40SYehuda Sadeh 
77043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
771602adf40SYehuda Sadeh 		return NULL;
772602adf40SYehuda Sadeh 
7731f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7741f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7751f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7762f82ee54SAlex Elder 			__rbd_get_client(client_node);
7772f82ee54SAlex Elder 
7781f7ba331SAlex Elder 			found = true;
7791f7ba331SAlex Elder 			break;
7801f7ba331SAlex Elder 		}
7811f7ba331SAlex Elder 	}
7821f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7831f7ba331SAlex Elder 
7841f7ba331SAlex Elder 	return found ? client_node : NULL;
785602adf40SYehuda Sadeh }
786602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Token ids are grouped by argument type: parse_rbd_opts_token()
 * treats ids below Opt_last_int as taking an int argument and ids
 * between Opt_last_int and Opt_last_string as taking a string.
 */
enum {
	/* options with an int argument */
	Opt_queue_depth,
	Opt_last_int,
	/* options with a string argument (none yet) */
	Opt_last_string,
	/* options without an argument */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_err
};
80159c2be1eSYehuda Sadeh 
80243ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
803b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
80459c2be1eSYehuda Sadeh 	/* int args above */
80559c2be1eSYehuda Sadeh 	/* string args above */
806be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
807cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
808cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
809cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
81080de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
811210c104cSIlya Dryomov 	{Opt_err, NULL}
81259c2be1eSYehuda Sadeh };
81359c2be1eSYehuda Sadeh 
81498571b5aSAlex Elder struct rbd_options {
815b5584180SIlya Dryomov 	int	queue_depth;
81698571b5aSAlex Elder 	bool	read_only;
81780de1912SIlya Dryomov 	bool	lock_on_read;
81898571b5aSAlex Elder };
81998571b5aSAlex Elder 
820b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
82198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
82280de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false
82398571b5aSAlex Elder 
82459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
82559c2be1eSYehuda Sadeh {
82643ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
82759c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
82859c2be1eSYehuda Sadeh 	int token, intval, ret;
82959c2be1eSYehuda Sadeh 
83043ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
83159c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
83259c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
83359c2be1eSYehuda Sadeh 		if (ret < 0) {
834210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
83559c2be1eSYehuda Sadeh 			return ret;
83659c2be1eSYehuda Sadeh 		}
83759c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
83859c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
839210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
84059c2be1eSYehuda Sadeh 	} else {
84159c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
84259c2be1eSYehuda Sadeh 	}
84359c2be1eSYehuda Sadeh 
84459c2be1eSYehuda Sadeh 	switch (token) {
845b5584180SIlya Dryomov 	case Opt_queue_depth:
846b5584180SIlya Dryomov 		if (intval < 1) {
847b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
848b5584180SIlya Dryomov 			return -EINVAL;
849b5584180SIlya Dryomov 		}
850b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
851b5584180SIlya Dryomov 		break;
852cc0538b6SAlex Elder 	case Opt_read_only:
853cc0538b6SAlex Elder 		rbd_opts->read_only = true;
854cc0538b6SAlex Elder 		break;
855cc0538b6SAlex Elder 	case Opt_read_write:
856cc0538b6SAlex Elder 		rbd_opts->read_only = false;
857cc0538b6SAlex Elder 		break;
85880de1912SIlya Dryomov 	case Opt_lock_on_read:
85980de1912SIlya Dryomov 		rbd_opts->lock_on_read = true;
86080de1912SIlya Dryomov 		break;
86159c2be1eSYehuda Sadeh 	default:
862210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
863210c104cSIlya Dryomov 		return -EINVAL;
86459c2be1eSYehuda Sadeh 	}
865210c104cSIlya Dryomov 
86659c2be1eSYehuda Sadeh 	return 0;
86759c2be1eSYehuda Sadeh }
86859c2be1eSYehuda Sadeh 
8696d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8706d2940c8SGuangliang Zhao {
8716d2940c8SGuangliang Zhao 	switch (op_type) {
8726d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8736d2940c8SGuangliang Zhao 		return "read";
8746d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8756d2940c8SGuangliang Zhao 		return "write";
87690e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
87790e98c52SGuangliang Zhao 		return "discard";
8786d2940c8SGuangliang Zhao 	default:
8796d2940c8SGuangliang Zhao 		return "???";
8806d2940c8SGuangliang Zhao 	}
8816d2940c8SGuangliang Zhao }
8826d2940c8SGuangliang Zhao 
88359c2be1eSYehuda Sadeh /*
884602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8857262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8867262cfcaSAlex Elder  * function.
887602adf40SYehuda Sadeh  */
8889d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
889602adf40SYehuda Sadeh {
890f8c38929SAlex Elder 	struct rbd_client *rbdc;
89159c2be1eSYehuda Sadeh 
892cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8931f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8949d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
89543ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8969d3997fdSAlex Elder 	else
897f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
898cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
899d720bcb0SAlex Elder 
9009d3997fdSAlex Elder 	return rbdc;
901602adf40SYehuda Sadeh }
902602adf40SYehuda Sadeh 
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must not hold that lock.
 */
908602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
909602adf40SYehuda Sadeh {
910602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
911602adf40SYehuda Sadeh 
91237206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
913cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
914602adf40SYehuda Sadeh 	list_del(&rbdc->node);
915cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
916602adf40SYehuda Sadeh 
917602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
918602adf40SYehuda Sadeh 	kfree(rbdc);
919602adf40SYehuda Sadeh }
920602adf40SYehuda Sadeh 
921602adf40SYehuda Sadeh /*
922602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
923602adf40SYehuda Sadeh  * it.
924602adf40SYehuda Sadeh  */
9259d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
926602adf40SYehuda Sadeh {
927c53d5893SAlex Elder 	if (rbdc)
9289d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
929602adf40SYehuda Sadeh }
930602adf40SYehuda Sadeh 
931a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
932a30b71b9SAlex Elder {
933a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
934a30b71b9SAlex Elder }
935a30b71b9SAlex Elder 
9368e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9378e94af8eSAlex Elder {
938103a150fSAlex Elder 	size_t size;
939103a150fSAlex Elder 	u32 snap_count;
940103a150fSAlex Elder 
941103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
942103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
943103a150fSAlex Elder 		return false;
944103a150fSAlex Elder 
945db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
946db2388b6SAlex Elder 
947db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
948db2388b6SAlex Elder 		return false;
949db2388b6SAlex Elder 
950db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
951db2388b6SAlex Elder 
952db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
953db2388b6SAlex Elder 		return false;
954db2388b6SAlex Elder 
955103a150fSAlex Elder 	/*
956103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
957103a150fSAlex Elder 	 * that limits the number of snapshots.
958103a150fSAlex Elder 	 */
959103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
960103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
961103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
962103a150fSAlex Elder 		return false;
963103a150fSAlex Elder 
964103a150fSAlex Elder 	/*
965103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
966103a150fSAlex Elder 	 * header must also be representable in a size_t.
967103a150fSAlex Elder 	 */
968103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
969103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
970103a150fSAlex Elder 		return false;
971103a150fSAlex Elder 
972103a150fSAlex Elder 	return true;
9738e94af8eSAlex Elder }
9748e94af8eSAlex Elder 
975602adf40SYehuda Sadeh /*
976bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
977bb23e37aSAlex Elder  * on-disk header.
978602adf40SYehuda Sadeh  */
979662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
9804156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
981602adf40SYehuda Sadeh {
982662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
983bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
984bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
985bb23e37aSAlex Elder 	char *object_prefix = NULL;
986bb23e37aSAlex Elder 	char *snap_names = NULL;
987bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
988ccece235SAlex Elder 	u32 snap_count;
989bb23e37aSAlex Elder 	int ret = -ENOMEM;
990621901d6SAlex Elder 	u32 i;
991602adf40SYehuda Sadeh 
992bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
993103a150fSAlex Elder 
994bb23e37aSAlex Elder 	if (first_time) {
995bb23e37aSAlex Elder 		size_t len;
996bb23e37aSAlex Elder 
997bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
998bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
999bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
1000bb23e37aSAlex Elder 		if (!object_prefix)
1001602adf40SYehuda Sadeh 			return -ENOMEM;
1002bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
1003bb23e37aSAlex Elder 		object_prefix[len] = '\0';
1004bb23e37aSAlex Elder 	}
100500f1f36fSAlex Elder 
1006bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1007d2bb24e5SAlex Elder 
1008602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1009bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1010bb23e37aSAlex Elder 	if (!snapc)
1011bb23e37aSAlex Elder 		goto out_err;
1012bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1013602adf40SYehuda Sadeh 	if (snap_count) {
1014bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1015f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1016f785cc1dSAlex Elder 
1017bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1018621901d6SAlex Elder 
1019f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1020bb23e37aSAlex Elder 			goto out_2big;
1021bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1022bb23e37aSAlex Elder 		if (!snap_names)
1023602adf40SYehuda Sadeh 			goto out_err;
1024bb23e37aSAlex Elder 
1025bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
102688a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
102788a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
102888a25a5fSMarkus Elfring 					   GFP_KERNEL);
1029bb23e37aSAlex Elder 		if (!snap_sizes)
1030bb23e37aSAlex Elder 			goto out_err;
1031bb23e37aSAlex Elder 
1032f785cc1dSAlex Elder 		/*
1033bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1034bb23e37aSAlex Elder 		 * and size.
1035bb23e37aSAlex Elder 		 *
103699a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1037bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1038f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1039f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1040f785cc1dSAlex Elder 		 */
1041bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1042bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1043bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1044bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1045bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1046bb23e37aSAlex Elder 		}
1047602adf40SYehuda Sadeh 	}
1048849b4260SAlex Elder 
1049bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1050bb23e37aSAlex Elder 
1051bb23e37aSAlex Elder 	if (first_time) {
1052bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1053602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1054602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
1055602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
1056bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
1057bb23e37aSAlex Elder 		header->stripe_unit = 0;
1058bb23e37aSAlex Elder 		header->stripe_count = 0;
1059bb23e37aSAlex Elder 		header->features = 0;
1060662518b1SAlex Elder 	} else {
1061662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1062662518b1SAlex Elder 		kfree(header->snap_names);
1063662518b1SAlex Elder 		kfree(header->snap_sizes);
1064bb23e37aSAlex Elder 	}
10656a52325fSAlex Elder 
1066bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1067621901d6SAlex Elder 
1068f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1069bb23e37aSAlex Elder 	header->snapc = snapc;
1070bb23e37aSAlex Elder 	header->snap_names = snap_names;
1071bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1072468521c1SAlex Elder 
1073602adf40SYehuda Sadeh 	return 0;
1074bb23e37aSAlex Elder out_2big:
1075bb23e37aSAlex Elder 	ret = -EIO;
10766a52325fSAlex Elder out_err:
1077bb23e37aSAlex Elder 	kfree(snap_sizes);
1078bb23e37aSAlex Elder 	kfree(snap_names);
1079bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1080bb23e37aSAlex Elder 	kfree(object_prefix);
1081ccece235SAlex Elder 
1082bb23e37aSAlex Elder 	return ret;
1083602adf40SYehuda Sadeh }
1084602adf40SYehuda Sadeh 
10859682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
10869682fc6dSAlex Elder {
10879682fc6dSAlex Elder 	const char *snap_name;
10889682fc6dSAlex Elder 
10899682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
10909682fc6dSAlex Elder 
10919682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
10929682fc6dSAlex Elder 
10939682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10949682fc6dSAlex Elder 	while (which--)
10959682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10969682fc6dSAlex Elder 
10979682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10989682fc6dSAlex Elder }
10999682fc6dSAlex Elder 
110030d1cff8SAlex Elder /*
110130d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
110230d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
110330d1cff8SAlex Elder  */
110430d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
110530d1cff8SAlex Elder {
110630d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
110730d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
110830d1cff8SAlex Elder 
110930d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
111030d1cff8SAlex Elder 		return 1;
111130d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
111230d1cff8SAlex Elder }
111330d1cff8SAlex Elder 
111430d1cff8SAlex Elder /*
111530d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
111630d1cff8SAlex Elder  * present.
111730d1cff8SAlex Elder  *
111830d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
111930d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
112030d1cff8SAlex Elder  *
112130d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
112230d1cff8SAlex Elder  * reverse order, highest snapshot id first.
112330d1cff8SAlex Elder  */
11249682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11259682fc6dSAlex Elder {
11269682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
112730d1cff8SAlex Elder 	u64 *found;
11289682fc6dSAlex Elder 
112930d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
113030d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11319682fc6dSAlex Elder 
113230d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11339682fc6dSAlex Elder }
11349682fc6dSAlex Elder 
11352ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11362ad3d716SAlex Elder 					u64 snap_id)
113754cac61fSAlex Elder {
113854cac61fSAlex Elder 	u32 which;
1139da6a6b63SJosh Durgin 	const char *snap_name;
114054cac61fSAlex Elder 
114154cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
114254cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1143da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
114454cac61fSAlex Elder 
1145da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1146da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
114754cac61fSAlex Elder }
114854cac61fSAlex Elder 
11499e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11509e15b77dSAlex Elder {
11519e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11529e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11539e15b77dSAlex Elder 
115454cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
115554cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
115654cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
11579e15b77dSAlex Elder 
115854cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
11599e15b77dSAlex Elder }
11609e15b77dSAlex Elder 
11612ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
11622ad3d716SAlex Elder 				u64 *snap_size)
1163602adf40SYehuda Sadeh {
11642ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11652ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11662ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11672ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11682ad3d716SAlex Elder 		u32 which;
116900f1f36fSAlex Elder 
11702ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11712ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11722ad3d716SAlex Elder 			return -ENOENT;
117300f1f36fSAlex Elder 
11742ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11752ad3d716SAlex Elder 	} else {
11762ad3d716SAlex Elder 		u64 size = 0;
11772ad3d716SAlex Elder 		int ret;
11782ad3d716SAlex Elder 
11792ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11802ad3d716SAlex Elder 		if (ret)
11812ad3d716SAlex Elder 			return ret;
11822ad3d716SAlex Elder 
11832ad3d716SAlex Elder 		*snap_size = size;
11842ad3d716SAlex Elder 	}
11852ad3d716SAlex Elder 	return 0;
11862ad3d716SAlex Elder }
11872ad3d716SAlex Elder 
11882ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
11892ad3d716SAlex Elder 			u64 *snap_features)
11902ad3d716SAlex Elder {
11912ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11922ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11932ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11942ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11952ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11962ad3d716SAlex Elder 	} else {
11972ad3d716SAlex Elder 		u64 features = 0;
11982ad3d716SAlex Elder 		int ret;
11992ad3d716SAlex Elder 
12002ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12012ad3d716SAlex Elder 		if (ret)
12022ad3d716SAlex Elder 			return ret;
12032ad3d716SAlex Elder 
12042ad3d716SAlex Elder 		*snap_features = features;
12052ad3d716SAlex Elder 	}
12062ad3d716SAlex Elder 	return 0;
120700f1f36fSAlex Elder }
1208602adf40SYehuda Sadeh 
1209d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1210602adf40SYehuda Sadeh {
12118f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12122ad3d716SAlex Elder 	u64 size = 0;
12132ad3d716SAlex Elder 	u64 features = 0;
12142ad3d716SAlex Elder 	int ret;
12158b0241f8SAlex Elder 
12162ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12172ad3d716SAlex Elder 	if (ret)
12182ad3d716SAlex Elder 		return ret;
12192ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12202ad3d716SAlex Elder 	if (ret)
12212ad3d716SAlex Elder 		return ret;
12222ad3d716SAlex Elder 
12232ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12242ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12252ad3d716SAlex Elder 
12268b0241f8SAlex Elder 	return 0;
1227602adf40SYehuda Sadeh }
1228602adf40SYehuda Sadeh 
1229d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1230d1cf5788SAlex Elder {
1231d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1232d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1233200a6a8bSAlex Elder }
1234200a6a8bSAlex Elder 
12357d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name)
12367d5079aaSHimangi Saraogi {
12377d5079aaSHimangi Saraogi 	/* The explicit cast here is needed to drop the const qualifier */
12387d5079aaSHimangi Saraogi 
12397d5079aaSHimangi Saraogi 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
12407d5079aaSHimangi Saraogi }
12417d5079aaSHimangi Saraogi 
124298571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1243602adf40SYehuda Sadeh {
124465ccfe21SAlex Elder 	char *name;
124565ccfe21SAlex Elder 	u64 segment;
124665ccfe21SAlex Elder 	int ret;
12473a96d5cdSJosh Durgin 	char *name_format;
1248602adf40SYehuda Sadeh 
124978c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
125065ccfe21SAlex Elder 	if (!name)
125165ccfe21SAlex Elder 		return NULL;
125265ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
12533a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
12543a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
12553a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
12562d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
125765ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
12582d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
125965ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
126065ccfe21SAlex Elder 			segment, ret);
12617d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
126265ccfe21SAlex Elder 		name = NULL;
126365ccfe21SAlex Elder 	}
1264602adf40SYehuda Sadeh 
126565ccfe21SAlex Elder 	return name;
126665ccfe21SAlex Elder }
1267602adf40SYehuda Sadeh 
126865ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
126965ccfe21SAlex Elder {
127065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1271602adf40SYehuda Sadeh 
127265ccfe21SAlex Elder 	return offset & (segment_size - 1);
127365ccfe21SAlex Elder }
127465ccfe21SAlex Elder 
127565ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
127665ccfe21SAlex Elder 				u64 offset, u64 length)
127765ccfe21SAlex Elder {
127865ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
127965ccfe21SAlex Elder 
128065ccfe21SAlex Elder 	offset &= segment_size - 1;
128165ccfe21SAlex Elder 
1282aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
128365ccfe21SAlex Elder 	if (offset + length > segment_size)
128465ccfe21SAlex Elder 		length = segment_size - offset;
128565ccfe21SAlex Elder 
128665ccfe21SAlex Elder 	return length;
1287602adf40SYehuda Sadeh }
1288602adf40SYehuda Sadeh 
1289602adf40SYehuda Sadeh /*
1290029bcbd8SJosh Durgin  * returns the size of an object in the image
1291029bcbd8SJosh Durgin  */
1292029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1293029bcbd8SJosh Durgin {
1294029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1295029bcbd8SJosh Durgin }
1296029bcbd8SJosh Durgin 
1297029bcbd8SJosh Durgin /*
1298602adf40SYehuda Sadeh  * bio helpers
1299602adf40SYehuda Sadeh  */
1300602adf40SYehuda Sadeh 
1301602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1302602adf40SYehuda Sadeh {
1303602adf40SYehuda Sadeh 	struct bio *tmp;
1304602adf40SYehuda Sadeh 
1305602adf40SYehuda Sadeh 	while (chain) {
1306602adf40SYehuda Sadeh 		tmp = chain;
1307602adf40SYehuda Sadeh 		chain = chain->bi_next;
1308602adf40SYehuda Sadeh 		bio_put(tmp);
1309602adf40SYehuda Sadeh 	}
1310602adf40SYehuda Sadeh }
1311602adf40SYehuda Sadeh 
1312602adf40SYehuda Sadeh /*
1313602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1314602adf40SYehuda Sadeh  */
1315602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1316602adf40SYehuda Sadeh {
13177988613bSKent Overstreet 	struct bio_vec bv;
13187988613bSKent Overstreet 	struct bvec_iter iter;
1319602adf40SYehuda Sadeh 	unsigned long flags;
1320602adf40SYehuda Sadeh 	void *buf;
1321602adf40SYehuda Sadeh 	int pos = 0;
1322602adf40SYehuda Sadeh 
1323602adf40SYehuda Sadeh 	while (chain) {
13247988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
13257988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1326602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
13277988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1328602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
13297988613bSKent Overstreet 				       bv.bv_len - remainder);
13307988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
133185b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1332602adf40SYehuda Sadeh 			}
13337988613bSKent Overstreet 			pos += bv.bv_len;
1334602adf40SYehuda Sadeh 		}
1335602adf40SYehuda Sadeh 
1336602adf40SYehuda Sadeh 		chain = chain->bi_next;
1337602adf40SYehuda Sadeh 	}
1338602adf40SYehuda Sadeh }
1339602adf40SYehuda Sadeh 
1340602adf40SYehuda Sadeh /*
1341b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1342b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1343b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1344b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1345b9434c5bSAlex Elder  */
1346b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1347b9434c5bSAlex Elder {
1348b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1349b9434c5bSAlex Elder 
1350b9434c5bSAlex Elder 	rbd_assert(end > offset);
1351b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1352b9434c5bSAlex Elder 	while (offset < end) {
1353b9434c5bSAlex Elder 		size_t page_offset;
1354b9434c5bSAlex Elder 		size_t length;
1355b9434c5bSAlex Elder 		unsigned long flags;
1356b9434c5bSAlex Elder 		void *kaddr;
1357b9434c5bSAlex Elder 
1358491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1359491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1360b9434c5bSAlex Elder 		local_irq_save(flags);
1361b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1362b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1363e2156054SAlex Elder 		flush_dcache_page(*page);
1364b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1365b9434c5bSAlex Elder 		local_irq_restore(flags);
1366b9434c5bSAlex Elder 
1367b9434c5bSAlex Elder 		offset += length;
1368b9434c5bSAlex Elder 		page++;
1369b9434c5bSAlex Elder 	}
1370b9434c5bSAlex Elder }
1371b9434c5bSAlex Elder 
1372b9434c5bSAlex Elder /*
1373f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1374f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1375602adf40SYehuda Sadeh  */
1376f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1377f7760dadSAlex Elder 					unsigned int offset,
1378f7760dadSAlex Elder 					unsigned int len,
1379f7760dadSAlex Elder 					gfp_t gfpmask)
1380602adf40SYehuda Sadeh {
1381f7760dadSAlex Elder 	struct bio *bio;
1382602adf40SYehuda Sadeh 
13835341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1384f7760dadSAlex Elder 	if (!bio)
1385f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1386f7760dadSAlex Elder 
13875341a627SKent Overstreet 	bio_advance(bio, offset);
13884f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1389602adf40SYehuda Sadeh 
1390f7760dadSAlex Elder 	return bio;
1391602adf40SYehuda Sadeh }
1392602adf40SYehuda Sadeh 
1393f7760dadSAlex Elder /*
1394f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1395f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1396f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1397f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1398f7760dadSAlex Elder  *
1399f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1400f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1401f7760dadSAlex Elder  * the start of data to be cloned is located.
1402f7760dadSAlex Elder  *
1403f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1404f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1405f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1406f7760dadSAlex Elder  */
1407f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1408f7760dadSAlex Elder 					unsigned int *offset,
1409f7760dadSAlex Elder 					unsigned int len,
1410f7760dadSAlex Elder 					gfp_t gfpmask)
1411f7760dadSAlex Elder {
1412f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1413f7760dadSAlex Elder 	unsigned int off = *offset;
1414f7760dadSAlex Elder 	struct bio *chain = NULL;
1415f7760dadSAlex Elder 	struct bio **end;
1416602adf40SYehuda Sadeh 
1417f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1418602adf40SYehuda Sadeh 
14194f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1420f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1421602adf40SYehuda Sadeh 
1422f7760dadSAlex Elder 	end = &chain;
1423f7760dadSAlex Elder 	while (len) {
1424f7760dadSAlex Elder 		unsigned int bi_size;
1425f7760dadSAlex Elder 		struct bio *bio;
1426f7760dadSAlex Elder 
1427f5400b7aSAlex Elder 		if (!bi) {
1428f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1429f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1430f5400b7aSAlex Elder 		}
14314f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1432f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1433f7760dadSAlex Elder 		if (!bio)
1434f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1435f7760dadSAlex Elder 
1436f7760dadSAlex Elder 		*end = bio;
1437f7760dadSAlex Elder 		end = &bio->bi_next;
1438f7760dadSAlex Elder 
1439f7760dadSAlex Elder 		off += bi_size;
14404f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1441f7760dadSAlex Elder 			bi = bi->bi_next;
1442f7760dadSAlex Elder 			off = 0;
1443f7760dadSAlex Elder 		}
1444f7760dadSAlex Elder 		len -= bi_size;
1445f7760dadSAlex Elder 	}
1446f7760dadSAlex Elder 	*bio_src = bi;
1447f7760dadSAlex Elder 	*offset = off;
1448f7760dadSAlex Elder 
1449f7760dadSAlex Elder 	return chain;
1450f7760dadSAlex Elder out_err:
1451f7760dadSAlex Elder 	bio_chain_put(chain);
1452f7760dadSAlex Elder 
1453602adf40SYehuda Sadeh 	return NULL;
1454602adf40SYehuda Sadeh }
1455602adf40SYehuda Sadeh 
1456926f9b3fSAlex Elder /*
1457926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1458926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1459926f9b3fSAlex Elder  * again.
1460926f9b3fSAlex Elder  */
14616365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
14626365d33aSAlex Elder {
14636365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
14646365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14656365d33aSAlex Elder 
146657acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14679584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14686365d33aSAlex Elder 			obj_request);
14696365d33aSAlex Elder 	}
14706365d33aSAlex Elder }
14716365d33aSAlex Elder 
14726365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14736365d33aSAlex Elder {
14746365d33aSAlex Elder 	smp_mb();
14756365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14766365d33aSAlex Elder }
14776365d33aSAlex Elder 
147857acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
147957acbaa7SAlex Elder {
148057acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
148157acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
148257acbaa7SAlex Elder 
148357acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
148457acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14859584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
148657acbaa7SAlex Elder 			obj_request);
148757acbaa7SAlex Elder 	}
148857acbaa7SAlex Elder }
148957acbaa7SAlex Elder 
149057acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
149157acbaa7SAlex Elder {
149257acbaa7SAlex Elder 	smp_mb();
149357acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
149457acbaa7SAlex Elder }
149557acbaa7SAlex Elder 
14965679c59fSAlex Elder /*
14975679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14985679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14995679c59fSAlex Elder  *
15005679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
15015679c59fSAlex Elder  * away again.  It's possible that the response from two existence
15025679c59fSAlex Elder  * checks are separated by the creation of the target object, and
15035679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
15045679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
15055679c59fSAlex Elder  */
15065679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
15075679c59fSAlex Elder 				bool exists)
15085679c59fSAlex Elder {
15095679c59fSAlex Elder 	if (exists)
15105679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
15115679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
15125679c59fSAlex Elder 	smp_mb();
15135679c59fSAlex Elder }
15145679c59fSAlex Elder 
15155679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
15165679c59fSAlex Elder {
15175679c59fSAlex Elder 	smp_mb();
15185679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
15195679c59fSAlex Elder }
15205679c59fSAlex Elder 
15215679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
15225679c59fSAlex Elder {
15235679c59fSAlex Elder 	smp_mb();
15245679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
15255679c59fSAlex Elder }
15265679c59fSAlex Elder 
15279638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
15289638556aSIlya Dryomov {
15299638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
15309638556aSIlya Dryomov 
15319638556aSIlya Dryomov 	return obj_request->img_offset <
15329638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
15339638556aSIlya Dryomov }
15349638556aSIlya Dryomov 
1535bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1536bf0d5f50SAlex Elder {
153737206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
153837206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1539bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1540bf0d5f50SAlex Elder }
1541bf0d5f50SAlex Elder 
1542bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1543bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1544bf0d5f50SAlex Elder {
1545bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
154637206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
154737206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1548bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1549bf0d5f50SAlex Elder }
1550bf0d5f50SAlex Elder 
15510f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
15520f2d5be7SAlex Elder {
15530f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15540f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
15550f2d5be7SAlex Elder 	kref_get(&img_request->kref);
15560f2d5be7SAlex Elder }
15570f2d5be7SAlex Elder 
1558e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1559e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1560bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1561bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1562bf0d5f50SAlex Elder {
1563bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
156437206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
156537206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1566e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1567e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1568e93f3152SAlex Elder 	else
1569bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1570bf0d5f50SAlex Elder }
1571bf0d5f50SAlex Elder 
1572bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1573bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1574bf0d5f50SAlex Elder {
157525dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
157625dcf954SAlex Elder 
1577b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1578bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
157925dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15806365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15816365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1582bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
158325dcf954SAlex Elder 	img_request->obj_request_count++;
158425dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
158537206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
158637206ee5SAlex Elder 		obj_request->which);
1587bf0d5f50SAlex Elder }
1588bf0d5f50SAlex Elder 
1589bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1590bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1591bf0d5f50SAlex Elder {
1592bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
159325dcf954SAlex Elder 
159437206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
159537206ee5SAlex Elder 		obj_request->which);
1596bf0d5f50SAlex Elder 	list_del(&obj_request->links);
159725dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
159825dcf954SAlex Elder 	img_request->obj_request_count--;
159925dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
160025dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
16016365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1602bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1603bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
160425dcf954SAlex Elder 	obj_request->callback = NULL;
1605bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1606bf0d5f50SAlex Elder }
1607bf0d5f50SAlex Elder 
1608bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1609bf0d5f50SAlex Elder {
1610bf0d5f50SAlex Elder 	switch (type) {
16119969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1612bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1613788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1614bf0d5f50SAlex Elder 		return true;
1615bf0d5f50SAlex Elder 	default:
1616bf0d5f50SAlex Elder 		return false;
1617bf0d5f50SAlex Elder 	}
1618bf0d5f50SAlex Elder }
1619bf0d5f50SAlex Elder 
16204a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
16214a17dadcSIlya Dryomov 
1622980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1623bf0d5f50SAlex Elder {
1624980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1625980917fcSIlya Dryomov 
1626980917fcSIlya Dryomov 	dout("%s %p osd_req %p\n", __func__, obj_request, osd_req);
16274a17dadcSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
16284a17dadcSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
16294a17dadcSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
16304a17dadcSIlya Dryomov 	}
1631980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1632bf0d5f50SAlex Elder }
1633bf0d5f50SAlex Elder 
163471c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
163571c20a06SIlya Dryomov {
163671c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
163771c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
163871c20a06SIlya Dryomov }
163971c20a06SIlya Dryomov 
164071c20a06SIlya Dryomov /*
164171c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
164271c20a06SIlya Dryomov  * underlying osd request.
16432894e1d7SIlya Dryomov  *
16442894e1d7SIlya Dryomov  * @timeout: in jiffies, 0 means "wait forever"
164571c20a06SIlya Dryomov  */
16462894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
16472894e1d7SIlya Dryomov 				  unsigned long timeout)
164871c20a06SIlya Dryomov {
16492894e1d7SIlya Dryomov 	long ret;
165071c20a06SIlya Dryomov 
165171c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
16522894e1d7SIlya Dryomov 	ret = wait_for_completion_interruptible_timeout(
16532894e1d7SIlya Dryomov 					&obj_request->completion,
16542894e1d7SIlya Dryomov 					ceph_timeout_jiffies(timeout));
16552894e1d7SIlya Dryomov 	if (ret <= 0) {
16562894e1d7SIlya Dryomov 		if (ret == 0)
16572894e1d7SIlya Dryomov 			ret = -ETIMEDOUT;
165871c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
16592894e1d7SIlya Dryomov 	} else {
16602894e1d7SIlya Dryomov 		ret = 0;
16612894e1d7SIlya Dryomov 	}
16622894e1d7SIlya Dryomov 
16632894e1d7SIlya Dryomov 	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
166471c20a06SIlya Dryomov 	return ret;
166571c20a06SIlya Dryomov }
166671c20a06SIlya Dryomov 
/* Wait for an object request to complete, with no timeout. */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	/* a timeout of 0 means "wait forever" */
	return __rbd_obj_request_wait(obj_request, 0);
}
16712894e1d7SIlya Dryomov 
1672bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1673bf0d5f50SAlex Elder {
167455f27e09SAlex Elder 
167537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
167655f27e09SAlex Elder 
167755f27e09SAlex Elder 	/*
167855f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
167955f27e09SAlex Elder 	 * count for the image request.  We could instead use
168055f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
168155f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
168255f27e09SAlex Elder 	 */
168355f27e09SAlex Elder 	if (!img_request->result) {
168455f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
168555f27e09SAlex Elder 		u64 xferred = 0;
168655f27e09SAlex Elder 
168755f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
168855f27e09SAlex Elder 			xferred += obj_request->xferred;
168955f27e09SAlex Elder 		img_request->xferred = xferred;
169055f27e09SAlex Elder 	}
169155f27e09SAlex Elder 
1692bf0d5f50SAlex Elder 	if (img_request->callback)
1693bf0d5f50SAlex Elder 		img_request->callback(img_request);
1694bf0d5f50SAlex Elder 	else
1695bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1696bf0d5f50SAlex Elder }
1697bf0d5f50SAlex Elder 
16980c425248SAlex Elder /*
16990c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
17000c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
17010c425248SAlex Elder  * and currently never change thereafter.
17020c425248SAlex Elder  */
17030c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
17040c425248SAlex Elder {
17050c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
17060c425248SAlex Elder 	smp_mb();
17070c425248SAlex Elder }
17080c425248SAlex Elder 
17090c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
17100c425248SAlex Elder {
17110c425248SAlex Elder 	smp_mb();
17120c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
17130c425248SAlex Elder }
17140c425248SAlex Elder 
171590e98c52SGuangliang Zhao /*
171690e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
171790e98c52SGuangliang Zhao  */
171890e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
171990e98c52SGuangliang Zhao {
172090e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
172190e98c52SGuangliang Zhao 	smp_mb();
172290e98c52SGuangliang Zhao }
172390e98c52SGuangliang Zhao 
172490e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
172590e98c52SGuangliang Zhao {
172690e98c52SGuangliang Zhao 	smp_mb();
172790e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
172890e98c52SGuangliang Zhao }
172990e98c52SGuangliang Zhao 
17309849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
17319849e986SAlex Elder {
17329849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
17339849e986SAlex Elder 	smp_mb();
17349849e986SAlex Elder }
17359849e986SAlex Elder 
1736e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1737e93f3152SAlex Elder {
1738e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1739e93f3152SAlex Elder 	smp_mb();
1740e93f3152SAlex Elder }
1741e93f3152SAlex Elder 
17429849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
17439849e986SAlex Elder {
17449849e986SAlex Elder 	smp_mb();
17459849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
17469849e986SAlex Elder }
17479849e986SAlex Elder 
1748d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1749d0b2e944SAlex Elder {
1750d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1751d0b2e944SAlex Elder 	smp_mb();
1752d0b2e944SAlex Elder }
1753d0b2e944SAlex Elder 
1754a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1755a2acd00eSAlex Elder {
1756a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1757a2acd00eSAlex Elder 	smp_mb();
1758a2acd00eSAlex Elder }
1759a2acd00eSAlex Elder 
1760d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1761d0b2e944SAlex Elder {
1762d0b2e944SAlex Elder 	smp_mb();
1763d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1764d0b2e944SAlex Elder }
1765d0b2e944SAlex Elder 
17663b434a2aSJosh Durgin static enum obj_operation_type
17673b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17683b434a2aSJosh Durgin {
17693b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17703b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17713b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17723b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17733b434a2aSJosh Durgin 	else
17743b434a2aSJosh Durgin 		return OBJ_OP_READ;
17753b434a2aSJosh Durgin }
17763b434a2aSJosh Durgin 
17776e2a4505SAlex Elder static void
17786e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17796e2a4505SAlex Elder {
1780b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1781b9434c5bSAlex Elder 	u64 length = obj_request->length;
1782b9434c5bSAlex Elder 
17836e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17846e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1785b9434c5bSAlex Elder 		xferred, length);
17866e2a4505SAlex Elder 	/*
178717c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
178817c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
178917c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
179017c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
179117c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
179217c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17936e2a4505SAlex Elder 	 */
1794b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17956e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1796b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17976e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1798b9434c5bSAlex Elder 		else
1799b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
18006e2a4505SAlex Elder 		obj_request->result = 0;
1801b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1802b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1803b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1804b9434c5bSAlex Elder 		else
1805b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
18066e2a4505SAlex Elder 	}
180717c1cc1dSJosh Durgin 	obj_request->xferred = length;
18086e2a4505SAlex Elder 	obj_request_done_set(obj_request);
18096e2a4505SAlex Elder }
18106e2a4505SAlex Elder 
1811bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1812bf0d5f50SAlex Elder {
181337206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
181437206ee5SAlex Elder 		obj_request->callback);
1815bf0d5f50SAlex Elder 	if (obj_request->callback)
1816bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1817788e2df3SAlex Elder 	else
1818788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1819bf0d5f50SAlex Elder }
1820bf0d5f50SAlex Elder 
18210dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
18220dcc685eSIlya Dryomov {
18230dcc685eSIlya Dryomov 	obj_request->result = err;
18240dcc685eSIlya Dryomov 	obj_request->xferred = 0;
18250dcc685eSIlya Dryomov 	/*
18260dcc685eSIlya Dryomov 	 * kludge - mirror rbd_obj_request_submit() to match a put in
18270dcc685eSIlya Dryomov 	 * rbd_img_obj_callback()
18280dcc685eSIlya Dryomov 	 */
18290dcc685eSIlya Dryomov 	if (obj_request_img_data_test(obj_request)) {
18300dcc685eSIlya Dryomov 		WARN_ON(obj_request->callback != rbd_img_obj_callback);
18310dcc685eSIlya Dryomov 		rbd_img_request_get(obj_request->img_request);
18320dcc685eSIlya Dryomov 	}
18330dcc685eSIlya Dryomov 	obj_request_done_set(obj_request);
18340dcc685eSIlya Dryomov 	rbd_obj_request_complete(obj_request);
18350dcc685eSIlya Dryomov }
18360dcc685eSIlya Dryomov 
1837c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1838bf0d5f50SAlex Elder {
183957acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1840a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
184157acbaa7SAlex Elder 	bool layered = false;
184257acbaa7SAlex Elder 
184357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
184457acbaa7SAlex Elder 		img_request = obj_request->img_request;
184557acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1846a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
184757acbaa7SAlex Elder 	}
18488b3e1a56SAlex Elder 
18498b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
18508b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
18518b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1852a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1853a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
18548b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
18558b3e1a56SAlex Elder 	else if (img_request)
18566e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
18576e2a4505SAlex Elder 	else
185807741308SAlex Elder 		obj_request_done_set(obj_request);
1859bf0d5f50SAlex Elder }
1860bf0d5f50SAlex Elder 
1861c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1862bf0d5f50SAlex Elder {
18631b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
18641b83bef2SSage Weil 		obj_request->result, obj_request->length);
18651b83bef2SSage Weil 	/*
18668b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
18678b3e1a56SAlex Elder 	 * it to our originally-requested length.
18681b83bef2SSage Weil 	 */
18691b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
187007741308SAlex Elder 	obj_request_done_set(obj_request);
1871bf0d5f50SAlex Elder }
1872bf0d5f50SAlex Elder 
187390e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
187490e98c52SGuangliang Zhao {
187590e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
187690e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
187790e98c52SGuangliang Zhao 	/*
187890e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
187990e98c52SGuangliang Zhao 	 * it to our originally-requested length.
188090e98c52SGuangliang Zhao 	 */
188190e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1882d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1883d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1884d0265de7SJosh Durgin 		obj_request->result = 0;
188590e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
188690e98c52SGuangliang Zhao }
188790e98c52SGuangliang Zhao 
/*
 * Handle completion of an osd stat.  For a simple stat call there's
 * nothing to do beyond marking the request done.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1897fbfab539SAlex Elder 
/*
 * Completion handling for an osd method call.  A call issued on
 * behalf of an image data request is passed to the copyup handler;
 * any other call just marks the object request done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
19072761713dSIlya Dryomov 
190885e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1909bf0d5f50SAlex Elder {
1910bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1911bf0d5f50SAlex Elder 	u16 opcode;
1912bf0d5f50SAlex Elder 
191385e084feSIlya Dryomov 	dout("%s: osd_req %p\n", __func__, osd_req);
1914bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
191557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
191657acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
191757acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
191857acbaa7SAlex Elder 	} else {
191957acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
192057acbaa7SAlex Elder 	}
1921bf0d5f50SAlex Elder 
19221b83bef2SSage Weil 	if (osd_req->r_result < 0)
19231b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1924bf0d5f50SAlex Elder 
1925c47f9371SAlex Elder 	/*
1926c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
19277ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
19287ad18afaSChristoph Hellwig 	 * length field.
1929c47f9371SAlex Elder 	 */
19307665d85bSYan, Zheng 	obj_request->xferred = osd_req->r_ops[0].outdata_len;
1931c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
19320ccd5926SIlya Dryomov 
193379528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1934bf0d5f50SAlex Elder 	switch (opcode) {
1935bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1936c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1937bf0d5f50SAlex Elder 		break;
19380ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1939e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1940e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
19410ccd5926SIlya Dryomov 		/* fall through */
1942bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1943e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1944c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1945bf0d5f50SAlex Elder 		break;
1946fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1947c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1948fbfab539SAlex Elder 		break;
194990e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
195090e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
195190e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
195290e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
195390e98c52SGuangliang Zhao 		break;
195436be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
19552761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
19562761713dSIlya Dryomov 		break;
1957bf0d5f50SAlex Elder 	default:
19589584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1959bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1960bf0d5f50SAlex Elder 		break;
1961bf0d5f50SAlex Elder 	}
1962bf0d5f50SAlex Elder 
196307741308SAlex Elder 	if (obj_request_done_test(obj_request))
1964bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1965bf0d5f50SAlex Elder }
1966bf0d5f50SAlex Elder 
19679d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1968430c28c3SAlex Elder {
19698c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1970430c28c3SAlex Elder 
19717c84883aSIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
19727c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
19739d4df01fSAlex Elder }
19749d4df01fSAlex Elder 
19759d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19769d4df01fSAlex Elder {
19779d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19789d4df01fSAlex Elder 
1979bb873b53SIlya Dryomov 	osd_req->r_mtime = CURRENT_TIME;
1980bb873b53SIlya Dryomov 	osd_req->r_data_offset = obj_request->offset;
1981430c28c3SAlex Elder }
1982430c28c3SAlex Elder 
19830ccd5926SIlya Dryomov /*
19840ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19850ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19860ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19870ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19880ccd5926SIlya Dryomov  */
1989bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1990bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19916d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1992deb236b3SIlya Dryomov 					unsigned int num_ops,
1993430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1994bf0d5f50SAlex Elder {
1995bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1996bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1997bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1998bf0d5f50SAlex Elder 
199990e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
200090e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
20016365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
200290e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
20036d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
200490e98c52SGuangliang Zhao 		} else {
200590e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
200690e98c52SGuangliang Zhao 		}
2007bf0d5f50SAlex Elder 		snapc = img_request->snapc;
2008bf0d5f50SAlex Elder 	}
2009bf0d5f50SAlex Elder 
20106d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
2011deb236b3SIlya Dryomov 
2012deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
2013bf0d5f50SAlex Elder 
2014bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2015deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
20162224d879SDavid Disseldorp 					  GFP_NOIO);
2017bf0d5f50SAlex Elder 	if (!osd_req)
201813d1ad16SIlya Dryomov 		goto fail;
2019bf0d5f50SAlex Elder 
202090e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
2021bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
2022430c28c3SAlex Elder 	else
2023bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
2024bf0d5f50SAlex Elder 
2025bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
2026bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
2027bf0d5f50SAlex Elder 
20287627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
2029d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
2030d30291b9SIlya Dryomov 			     obj_request->object_name))
2031d30291b9SIlya Dryomov 		goto fail;
2032bf0d5f50SAlex Elder 
203313d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
203413d1ad16SIlya Dryomov 		goto fail;
203513d1ad16SIlya Dryomov 
2036bf0d5f50SAlex Elder 	return osd_req;
203713d1ad16SIlya Dryomov 
203813d1ad16SIlya Dryomov fail:
203913d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
204013d1ad16SIlya Dryomov 	return NULL;
2041bf0d5f50SAlex Elder }
2042bf0d5f50SAlex Elder 
20430eefd470SAlex Elder /*
2044d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
2045d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
2046d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
2047d3246fb0SJosh Durgin  * or zero op.
20480eefd470SAlex Elder  */
20490eefd470SAlex Elder static struct ceph_osd_request *
20500eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
20510eefd470SAlex Elder {
20520eefd470SAlex Elder 	struct rbd_img_request *img_request;
20530eefd470SAlex Elder 	struct ceph_snap_context *snapc;
20540eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20550eefd470SAlex Elder 	struct ceph_osd_client *osdc;
20560eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
2057d3246fb0SJosh Durgin 	int num_osd_ops = 3;
20580eefd470SAlex Elder 
20590eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20600eefd470SAlex Elder 	img_request = obj_request->img_request;
20610eefd470SAlex Elder 	rbd_assert(img_request);
2062d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
2063d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
20640eefd470SAlex Elder 
2065d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
2066d3246fb0SJosh Durgin 		num_osd_ops = 2;
2067d3246fb0SJosh Durgin 
2068d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
20690eefd470SAlex Elder 
20700eefd470SAlex Elder 	snapc = img_request->snapc;
20710eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20720eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2073d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
20742224d879SDavid Disseldorp 						false, GFP_NOIO);
20750eefd470SAlex Elder 	if (!osd_req)
207613d1ad16SIlya Dryomov 		goto fail;
20770eefd470SAlex Elder 
20780eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
20790eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
20800eefd470SAlex Elder 	osd_req->r_priv = obj_request;
20810eefd470SAlex Elder 
20827627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
2083d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
2084d30291b9SIlya Dryomov 			     obj_request->object_name))
2085d30291b9SIlya Dryomov 		goto fail;
20860eefd470SAlex Elder 
208713d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
208813d1ad16SIlya Dryomov 		goto fail;
208913d1ad16SIlya Dryomov 
20900eefd470SAlex Elder 	return osd_req;
209113d1ad16SIlya Dryomov 
209213d1ad16SIlya Dryomov fail:
209313d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
209413d1ad16SIlya Dryomov 	return NULL;
20950eefd470SAlex Elder }
20960eefd470SAlex Elder 
20970eefd470SAlex Elder 
/* Drop a reference to an osd request via ceph_osdc_put_request(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2102bf0d5f50SAlex Elder 
2103bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2104bf0d5f50SAlex Elder 
2105bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2106bf0d5f50SAlex Elder 						u64 offset, u64 length,
2107bf0d5f50SAlex Elder 						enum obj_request_type type)
2108bf0d5f50SAlex Elder {
2109bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2110bf0d5f50SAlex Elder 	size_t size;
2111bf0d5f50SAlex Elder 	char *name;
2112bf0d5f50SAlex Elder 
2113bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2114bf0d5f50SAlex Elder 
2115bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
21165a60e876SIlya Dryomov 	name = kmalloc(size, GFP_NOIO);
2117f907ad55SAlex Elder 	if (!name)
2118bf0d5f50SAlex Elder 		return NULL;
2119bf0d5f50SAlex Elder 
21205a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
2121f907ad55SAlex Elder 	if (!obj_request) {
2122f907ad55SAlex Elder 		kfree(name);
2123f907ad55SAlex Elder 		return NULL;
2124f907ad55SAlex Elder 	}
2125f907ad55SAlex Elder 
2126bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2127bf0d5f50SAlex Elder 	obj_request->offset = offset;
2128bf0d5f50SAlex Elder 	obj_request->length = length;
2129926f9b3fSAlex Elder 	obj_request->flags = 0;
2130bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2131bf0d5f50SAlex Elder 	obj_request->type = type;
2132bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2133788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2134bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2135bf0d5f50SAlex Elder 
213637206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
213737206ee5SAlex Elder 		offset, length, (int)type, obj_request);
213837206ee5SAlex Elder 
2139bf0d5f50SAlex Elder 	return obj_request;
2140bf0d5f50SAlex Elder }
2141bf0d5f50SAlex Elder 
2142bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2143bf0d5f50SAlex Elder {
2144bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2145bf0d5f50SAlex Elder 
2146bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2147bf0d5f50SAlex Elder 
214837206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
214937206ee5SAlex Elder 
2150bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2151bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2152bf0d5f50SAlex Elder 
2153bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2154bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2155bf0d5f50SAlex Elder 
2156bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2157bf0d5f50SAlex Elder 	switch (obj_request->type) {
21589969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
21599969ebc5SAlex Elder 		break;		/* Nothing to do */
2160bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2161bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2162bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2163bf0d5f50SAlex Elder 		break;
2164788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
216504dc923cSIlya Dryomov 		/* img_data requests don't own their page array */
216604dc923cSIlya Dryomov 		if (obj_request->pages &&
216704dc923cSIlya Dryomov 		    !obj_request_img_data_test(obj_request))
2168788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2169788e2df3SAlex Elder 						obj_request->page_count);
2170788e2df3SAlex Elder 		break;
2171bf0d5f50SAlex Elder 	}
2172bf0d5f50SAlex Elder 
2173f907ad55SAlex Elder 	kfree(obj_request->object_name);
2174868311b1SAlex Elder 	obj_request->object_name = NULL;
2175868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2176bf0d5f50SAlex Elder }
2177bf0d5f50SAlex Elder 
2178fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2179fb65d228SAlex Elder 
2180fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2181fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2182fb65d228SAlex Elder {
2183fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2184fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2185fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2186fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2187fb65d228SAlex Elder }
2188fb65d228SAlex Elder 
2189bf0d5f50SAlex Elder /*
2190a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2191a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2192a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2193a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2194a2acd00eSAlex Elder  */
2195a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2196a2acd00eSAlex Elder {
2197a2acd00eSAlex Elder 	int counter;
2198a2acd00eSAlex Elder 
2199a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2200a2acd00eSAlex Elder 		return;
2201a2acd00eSAlex Elder 
2202a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2203a2acd00eSAlex Elder 	if (counter > 0)
2204a2acd00eSAlex Elder 		return;
2205a2acd00eSAlex Elder 
2206a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2207a2acd00eSAlex Elder 
2208a2acd00eSAlex Elder 	if (!counter)
2209a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2210a2acd00eSAlex Elder 	else
22119584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2212a2acd00eSAlex Elder }
2213a2acd00eSAlex Elder 
2214a2acd00eSAlex Elder /*
2215a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2216a2acd00eSAlex Elder  * parent.
2217a2acd00eSAlex Elder  *
2218a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2219a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2220a2acd00eSAlex Elder  * false otherwise.
2221a2acd00eSAlex Elder  */
2222a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2223a2acd00eSAlex Elder {
2224ae43e9d0SIlya Dryomov 	int counter = 0;
2225a2acd00eSAlex Elder 
2226a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2227a2acd00eSAlex Elder 		return false;
2228a2acd00eSAlex Elder 
2229ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2230ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2231a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2232ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2233a2acd00eSAlex Elder 
2234a2acd00eSAlex Elder 	if (counter < 0)
22359584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2236a2acd00eSAlex Elder 
2237ae43e9d0SIlya Dryomov 	return counter > 0;
2238a2acd00eSAlex Elder }
2239a2acd00eSAlex Elder 
2240bf0d5f50SAlex Elder /*
2241bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2242bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2243bf0d5f50SAlex Elder  * (if there is one).
2244bf0d5f50SAlex Elder  */
2245cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2246cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2247bf0d5f50SAlex Elder 					u64 offset, u64 length,
22486d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
22494e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2250bf0d5f50SAlex Elder {
2251bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2252bf0d5f50SAlex Elder 
22537a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2254bf0d5f50SAlex Elder 	if (!img_request)
2255bf0d5f50SAlex Elder 		return NULL;
2256bf0d5f50SAlex Elder 
2257bf0d5f50SAlex Elder 	img_request->rq = NULL;
2258bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2259bf0d5f50SAlex Elder 	img_request->offset = offset;
2260bf0d5f50SAlex Elder 	img_request->length = length;
22610c425248SAlex Elder 	img_request->flags = 0;
226290e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
226390e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
226490e98c52SGuangliang Zhao 		img_request->snapc = snapc;
226590e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
22660c425248SAlex Elder 		img_request_write_set(img_request);
22674e752f0aSJosh Durgin 		img_request->snapc = snapc;
22680c425248SAlex Elder 	} else {
2269bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
22700c425248SAlex Elder 	}
2271a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2272d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2273bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2274bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2275bf0d5f50SAlex Elder 	img_request->callback = NULL;
2276a5a337d4SAlex Elder 	img_request->result = 0;
2277bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2278bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2279bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2280bf0d5f50SAlex Elder 
228137206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
22826d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
228337206ee5SAlex Elder 
2284bf0d5f50SAlex Elder 	return img_request;
2285bf0d5f50SAlex Elder }
2286bf0d5f50SAlex Elder 
2287bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2288bf0d5f50SAlex Elder {
2289bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2290bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2291bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2292bf0d5f50SAlex Elder 
2293bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2294bf0d5f50SAlex Elder 
229537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
229637206ee5SAlex Elder 
2297bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2298bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
229925dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2300bf0d5f50SAlex Elder 
2301a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2302a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2303a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2304a2acd00eSAlex Elder 	}
2305a2acd00eSAlex Elder 
2306bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2307bef95455SJosh Durgin 		img_request_discard_test(img_request))
2308812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2309bf0d5f50SAlex Elder 
23101c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2311bf0d5f50SAlex Elder }
2312bf0d5f50SAlex Elder 
2313e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2314e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2315e93f3152SAlex Elder 					u64 img_offset, u64 length)
2316e93f3152SAlex Elder {
2317e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2318e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2319e93f3152SAlex Elder 
2320e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2321e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2322e93f3152SAlex Elder 
23234e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
23246d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2325e93f3152SAlex Elder 	if (!parent_request)
2326e93f3152SAlex Elder 		return NULL;
2327e93f3152SAlex Elder 
2328e93f3152SAlex Elder 	img_request_child_set(parent_request);
2329e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2330e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2331e93f3152SAlex Elder 
2332e93f3152SAlex Elder 	return parent_request;
2333e93f3152SAlex Elder }
2334e93f3152SAlex Elder 
2335e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2336e93f3152SAlex Elder {
2337e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2338e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2339e93f3152SAlex Elder 
2340e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2341e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2342e93f3152SAlex Elder 
2343e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2344e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2345e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2346e93f3152SAlex Elder 
2347e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2348e93f3152SAlex Elder }
2349e93f3152SAlex Elder 
23501217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
23511217857fSAlex Elder {
23526365d33aSAlex Elder 	struct rbd_img_request *img_request;
23531217857fSAlex Elder 	unsigned int xferred;
23541217857fSAlex Elder 	int result;
23558b3e1a56SAlex Elder 	bool more;
23561217857fSAlex Elder 
23576365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23586365d33aSAlex Elder 	img_request = obj_request->img_request;
23596365d33aSAlex Elder 
23601217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
23611217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
23621217857fSAlex Elder 	result = obj_request->result;
23631217857fSAlex Elder 	if (result) {
23641217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
23656d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
23666d2940c8SGuangliang Zhao 
236790e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
236890e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
236990e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
237090e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
237190e98c52SGuangliang Zhao 		else
237290e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
23731217857fSAlex Elder 
23749584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
23756d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
23766d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
23779584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
23781217857fSAlex Elder 			result, xferred);
23791217857fSAlex Elder 		if (!img_request->result)
23801217857fSAlex Elder 			img_request->result = result;
2381082a75daSIlya Dryomov 		/*
2382082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2383082a75daSIlya Dryomov 		 * bytes in case of error.
2384082a75daSIlya Dryomov 		 */
2385082a75daSIlya Dryomov 		xferred = obj_request->length;
23861217857fSAlex Elder 	}
23871217857fSAlex Elder 
23888b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
23898b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23908b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23918b3e1a56SAlex Elder 	} else {
23928b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23937ad18afaSChristoph Hellwig 
23947ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23957ad18afaSChristoph Hellwig 		if (!more)
23967ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23978b3e1a56SAlex Elder 	}
23988b3e1a56SAlex Elder 
23998b3e1a56SAlex Elder 	return more;
24001217857fSAlex Elder }
24011217857fSAlex Elder 
24022169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
24032169238dSAlex Elder {
24042169238dSAlex Elder 	struct rbd_img_request *img_request;
24052169238dSAlex Elder 	u32 which = obj_request->which;
24062169238dSAlex Elder 	bool more = true;
24072169238dSAlex Elder 
24086365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
24092169238dSAlex Elder 	img_request = obj_request->img_request;
24102169238dSAlex Elder 
24112169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
24122169238dSAlex Elder 	rbd_assert(img_request != NULL);
24132169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
24142169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
24152169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
24162169238dSAlex Elder 
24172169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
24182169238dSAlex Elder 	if (which != img_request->next_completion)
24192169238dSAlex Elder 		goto out;
24202169238dSAlex Elder 
24212169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
24222169238dSAlex Elder 		rbd_assert(more);
24232169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
24242169238dSAlex Elder 
24252169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
24262169238dSAlex Elder 			break;
24271217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
24282169238dSAlex Elder 		which++;
24292169238dSAlex Elder 	}
24302169238dSAlex Elder 
24312169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
24322169238dSAlex Elder 	img_request->next_completion = which;
24332169238dSAlex Elder out:
24342169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
24350f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
24362169238dSAlex Elder 
24372169238dSAlex Elder 	if (!more)
24382169238dSAlex Elder 		rbd_img_request_complete(img_request);
24392169238dSAlex Elder }
24402169238dSAlex Elder 
2441f1a4739fSAlex Elder /*
24423b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
24433b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
24443b434a2aSJosh Durgin  * osd operations already to the object request.
24453b434a2aSJosh Durgin  */
24463b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
24473b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
24483b434a2aSJosh Durgin 				enum obj_operation_type op_type,
24493b434a2aSJosh Durgin 				unsigned int num_ops)
24503b434a2aSJosh Durgin {
24513b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
24523b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
24533b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
24543b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
24553b434a2aSJosh Durgin 	u64 length = obj_request->length;
24563b434a2aSJosh Durgin 	u64 img_end;
24573b434a2aSJosh Durgin 	u16 opcode;
24583b434a2aSJosh Durgin 
24593b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2460d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2461d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2462d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
24633b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
24643b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
24653b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
24663b434a2aSJosh Durgin 		} else {
24673b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
24683b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
24693b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
24703b434a2aSJosh Durgin 
24713b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
24723b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
24733b434a2aSJosh Durgin 			else
24743b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
24753b434a2aSJosh Durgin 		}
24763b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2477e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2478e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2479e30b7577SIlya Dryomov 		else
24803b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
24813b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
24823b434a2aSJosh Durgin 					object_size, object_size);
24833b434a2aSJosh Durgin 		num_ops++;
24843b434a2aSJosh Durgin 	} else {
24853b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
24863b434a2aSJosh Durgin 	}
24873b434a2aSJosh Durgin 
24887e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2489144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
24907e868b6eSIlya Dryomov 	else
24917e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
24927e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24937e868b6eSIlya Dryomov 
24943b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24953b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24963b434a2aSJosh Durgin 					obj_request->bio_list, length);
24973b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24983b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24993b434a2aSJosh Durgin 					obj_request->pages, length,
25003b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
25013b434a2aSJosh Durgin 
25023b434a2aSJosh Durgin 	/* Discards are also writes */
25033b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
25043b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
25053b434a2aSJosh Durgin 	else
25063b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
25073b434a2aSJosh Durgin }
25083b434a2aSJosh Durgin 
25093b434a2aSJosh Durgin /*
2510f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2511f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2512f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2513f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2514f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2515f1a4739fSAlex Elder  * all data described by the image request.
2516f1a4739fSAlex Elder  */
2517f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2518f1a4739fSAlex Elder 					enum obj_request_type type,
2519f1a4739fSAlex Elder 					void *data_desc)
2520bf0d5f50SAlex Elder {
2521bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2522bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2523bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2524a158073cSJingoo Han 	struct bio *bio_list = NULL;
2525f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2526a158073cSJingoo Han 	struct page **pages = NULL;
25276d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
25287da22d29SAlex Elder 	u64 img_offset;
2529bf0d5f50SAlex Elder 	u64 resid;
2530bf0d5f50SAlex Elder 
2531f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2532f1a4739fSAlex Elder 		(int)type, data_desc);
253337206ee5SAlex Elder 
25347da22d29SAlex Elder 	img_offset = img_request->offset;
2535bf0d5f50SAlex Elder 	resid = img_request->length;
25364dda41d3SAlex Elder 	rbd_assert(resid > 0);
25373b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2538f1a4739fSAlex Elder 
2539f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2540f1a4739fSAlex Elder 		bio_list = data_desc;
25414f024f37SKent Overstreet 		rbd_assert(img_offset ==
25424f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
254390e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2544f1a4739fSAlex Elder 		pages = data_desc;
2545f1a4739fSAlex Elder 	}
2546f1a4739fSAlex Elder 
2547bf0d5f50SAlex Elder 	while (resid) {
25482fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2549bf0d5f50SAlex Elder 		const char *object_name;
2550bf0d5f50SAlex Elder 		u64 offset;
2551bf0d5f50SAlex Elder 		u64 length;
2552bf0d5f50SAlex Elder 
25537da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2554bf0d5f50SAlex Elder 		if (!object_name)
2555bf0d5f50SAlex Elder 			goto out_unwind;
25567da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
25577da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2558bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2559f1a4739fSAlex Elder 						offset, length, type);
256078c2a44aSAlex Elder 		/* object request has its own copy of the object name */
256178c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2562bf0d5f50SAlex Elder 		if (!obj_request)
2563bf0d5f50SAlex Elder 			goto out_unwind;
256462054da6SIlya Dryomov 
256503507db6SJosh Durgin 		/*
256603507db6SJosh Durgin 		 * set obj_request->img_request before creating the
256703507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
256803507db6SJosh Durgin 		 */
256903507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2570bf0d5f50SAlex Elder 
2571f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2572f1a4739fSAlex Elder 			unsigned int clone_size;
2573f1a4739fSAlex Elder 
2574bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2575bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2576f1a4739fSAlex Elder 			obj_request->bio_list =
2577f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2578f1a4739fSAlex Elder 								&bio_offset,
2579f1a4739fSAlex Elder 								clone_size,
25802224d879SDavid Disseldorp 								GFP_NOIO);
2581bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
258262054da6SIlya Dryomov 				goto out_unwind;
258390e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2584f1a4739fSAlex Elder 			unsigned int page_count;
2585f1a4739fSAlex Elder 
2586f1a4739fSAlex Elder 			obj_request->pages = pages;
2587f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2588f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2589f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2590f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2591f1a4739fSAlex Elder 			pages += page_count;
2592f1a4739fSAlex Elder 		}
2593bf0d5f50SAlex Elder 
25946d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25956d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25962fa12320SAlex Elder 					obj_request);
25972fa12320SAlex Elder 		if (!osd_req)
259862054da6SIlya Dryomov 			goto out_unwind;
25993b434a2aSJosh Durgin 
26002fa12320SAlex Elder 		obj_request->osd_req = osd_req;
26012169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
26027da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2603bf0d5f50SAlex Elder 
26043b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
26053b434a2aSJosh Durgin 
26067da22d29SAlex Elder 		img_offset += length;
2607bf0d5f50SAlex Elder 		resid -= length;
2608bf0d5f50SAlex Elder 	}
2609bf0d5f50SAlex Elder 
2610bf0d5f50SAlex Elder 	return 0;
2611bf0d5f50SAlex Elder 
2612bf0d5f50SAlex Elder out_unwind:
2613bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
261442dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2615bf0d5f50SAlex Elder 
2616bf0d5f50SAlex Elder 	return -ENOMEM;
2617bf0d5f50SAlex Elder }
2618bf0d5f50SAlex Elder 
26193d7efd18SAlex Elder static void
26202761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
26210eefd470SAlex Elder {
26220eefd470SAlex Elder 	struct rbd_img_request *img_request;
26230eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2624ebda6408SAlex Elder 	struct page **pages;
26250eefd470SAlex Elder 	u32 page_count;
26260eefd470SAlex Elder 
26272761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
26282761713dSIlya Dryomov 
2629d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2630d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
26310eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
26320eefd470SAlex Elder 	img_request = obj_request->img_request;
26330eefd470SAlex Elder 	rbd_assert(img_request);
26340eefd470SAlex Elder 
26350eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
26360eefd470SAlex Elder 	rbd_assert(rbd_dev);
26370eefd470SAlex Elder 
2638ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2639ebda6408SAlex Elder 	rbd_assert(pages != NULL);
26400eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2641ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2642ebda6408SAlex Elder 	rbd_assert(page_count);
2643ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2644ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
26450eefd470SAlex Elder 
26460eefd470SAlex Elder 	/*
26470eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
26480eefd470SAlex Elder 	 * original write request.  There is no such thing as a
26490eefd470SAlex Elder 	 * successful short write, so if the request was successful
26500eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
26510eefd470SAlex Elder 	 */
26520eefd470SAlex Elder 	if (!obj_request->result)
26530eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
26540eefd470SAlex Elder 
26552761713dSIlya Dryomov 	obj_request_done_set(obj_request);
26560eefd470SAlex Elder }
26570eefd470SAlex Elder 
26580eefd470SAlex Elder static void
26593d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
26603d7efd18SAlex Elder {
26613d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
26620eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
26630eefd470SAlex Elder 	struct rbd_device *rbd_dev;
26643d7efd18SAlex Elder 	struct page **pages;
2665d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2666ebda6408SAlex Elder 	u32 page_count;
2667bbea1c1aSAlex Elder 	int img_result;
2668ebda6408SAlex Elder 	u64 parent_length;
26693d7efd18SAlex Elder 
26703d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
26713d7efd18SAlex Elder 
26723d7efd18SAlex Elder 	/* First get what we need from the image request */
26733d7efd18SAlex Elder 
26743d7efd18SAlex Elder 	pages = img_request->copyup_pages;
26753d7efd18SAlex Elder 	rbd_assert(pages != NULL);
26763d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2677ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2678ebda6408SAlex Elder 	rbd_assert(page_count);
2679ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
26803d7efd18SAlex Elder 
26813d7efd18SAlex Elder 	orig_request = img_request->obj_request;
26823d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2683b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2684bbea1c1aSAlex Elder 	img_result = img_request->result;
2685ebda6408SAlex Elder 	parent_length = img_request->length;
2686fa355112SIlya Dryomov 	rbd_assert(img_result || parent_length == img_request->xferred);
26873d7efd18SAlex Elder 	rbd_img_request_put(img_request);
26883d7efd18SAlex Elder 
268991c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
269091c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26913d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26923d7efd18SAlex Elder 
2693bbea1c1aSAlex Elder 	/*
2694bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2695bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2696bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2697bbea1c1aSAlex Elder 	 */
2698bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2699bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2700980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2701bbea1c1aSAlex Elder 		return;
2702bbea1c1aSAlex Elder 	}
2703bbea1c1aSAlex Elder 
2704bbea1c1aSAlex Elder 	if (img_result)
27050eefd470SAlex Elder 		goto out_err;
27063d7efd18SAlex Elder 
27078785b1d4SAlex Elder 	/*
27088785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
27090ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
27108785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
27118785b1d4SAlex Elder 	 * original request, and release the old one.
27128785b1d4SAlex Elder 	 */
2713bbea1c1aSAlex Elder 	img_result = -ENOMEM;
27140eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
27150eefd470SAlex Elder 	if (!osd_req)
27160eefd470SAlex Elder 		goto out_err;
27178785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
27180eefd470SAlex Elder 	orig_request->osd_req = osd_req;
27190eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2720ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
27213d7efd18SAlex Elder 
27220eefd470SAlex Elder 	/* Initialize the copyup op */
27230eefd470SAlex Elder 
27240eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2725ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
27260eefd470SAlex Elder 						false, false);
27270eefd470SAlex Elder 
2728d3246fb0SJosh Durgin 	/* Add the other op(s) */
27290ccd5926SIlya Dryomov 
2730d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2731d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
27320eefd470SAlex Elder 
27330eefd470SAlex Elder 	/* All set, send it off. */
27340eefd470SAlex Elder 
2735980917fcSIlya Dryomov 	rbd_obj_request_submit(orig_request);
27360eefd470SAlex Elder 	return;
2737980917fcSIlya Dryomov 
27380eefd470SAlex Elder out_err:
2739fa355112SIlya Dryomov 	ceph_release_page_vector(pages, page_count);
27400dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, img_result);
27413d7efd18SAlex Elder }
27423d7efd18SAlex Elder 
27433d7efd18SAlex Elder /*
27443d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
27453d7efd18SAlex Elder  * entire target of the given object request.  This is used for
27463d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
27473d7efd18SAlex Elder  * object request from the image request does not exist.
27483d7efd18SAlex Elder  *
27493d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
27503d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
27513d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
27523d7efd18SAlex Elder  * the original object request for the copyup operation.
27533d7efd18SAlex Elder  *
2754c2e82414SIlya Dryomov  * If an error occurs, it is recorded as the result of the original
2755c2e82414SIlya Dryomov  * object request in rbd_img_obj_exists_callback().
27563d7efd18SAlex Elder  */
27573d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
27583d7efd18SAlex Elder {
2759058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
27603d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
27613d7efd18SAlex Elder 	u64 img_offset;
27623d7efd18SAlex Elder 	u64 length;
27633d7efd18SAlex Elder 	struct page **pages = NULL;
27643d7efd18SAlex Elder 	u32 page_count;
27653d7efd18SAlex Elder 	int result;
27663d7efd18SAlex Elder 
27673d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
27683d7efd18SAlex Elder 
27693d7efd18SAlex Elder 	/*
27703d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
27713d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
27723d7efd18SAlex Elder 	 */
27733d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27743d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
27753d7efd18SAlex Elder 
27763d7efd18SAlex Elder 	/*
2777a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2778a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2779a9e8ba2cSAlex Elder 	 * necessary.
2780a9e8ba2cSAlex Elder 	 */
2781a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2782a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2783a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2784a9e8ba2cSAlex Elder 	}
2785a9e8ba2cSAlex Elder 
2786a9e8ba2cSAlex Elder 	/*
27873d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27883d7efd18SAlex Elder 	 * from the parent.
27893d7efd18SAlex Elder 	 */
27903d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27913d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27923d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27933d7efd18SAlex Elder 		result = PTR_ERR(pages);
27943d7efd18SAlex Elder 		pages = NULL;
27953d7efd18SAlex Elder 		goto out_err;
27963d7efd18SAlex Elder 	}
27973d7efd18SAlex Elder 
27983d7efd18SAlex Elder 	result = -ENOMEM;
2799e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2800e93f3152SAlex Elder 						img_offset, length);
28013d7efd18SAlex Elder 	if (!parent_request)
28023d7efd18SAlex Elder 		goto out_err;
28033d7efd18SAlex Elder 
28043d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
28053d7efd18SAlex Elder 	if (result)
28063d7efd18SAlex Elder 		goto out_err;
2807058aa991SIlya Dryomov 
28083d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2809ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
28103d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
2811058aa991SIlya Dryomov 
28123d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
28133d7efd18SAlex Elder 	if (!result)
28143d7efd18SAlex Elder 		return 0;
28153d7efd18SAlex Elder 
28163d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2817ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
28183d7efd18SAlex Elder 	parent_request->obj_request = NULL;
28193d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
28203d7efd18SAlex Elder out_err:
28213d7efd18SAlex Elder 	if (pages)
28223d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
28233d7efd18SAlex Elder 	if (parent_request)
28243d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
28253d7efd18SAlex Elder 	return result;
28263d7efd18SAlex Elder }
28273d7efd18SAlex Elder 
2828c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2829c5b5ef6cSAlex Elder {
2830c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2831638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2832c5b5ef6cSAlex Elder 	int result;
2833c5b5ef6cSAlex Elder 
2834c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2835c5b5ef6cSAlex Elder 
2836c5b5ef6cSAlex Elder 	/*
2837c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2838c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2839c5b5ef6cSAlex Elder 	 * we're done with the request.
2840c5b5ef6cSAlex Elder 	 */
2841c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2842c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2843912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2844c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2845c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2846c5b5ef6cSAlex Elder 
2847c5b5ef6cSAlex Elder 	result = obj_request->result;
2848c5b5ef6cSAlex Elder 	obj_request->result = 0;
2849c5b5ef6cSAlex Elder 
2850c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2851c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2852c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2853c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2854c5b5ef6cSAlex Elder 
2855638f5abeSAlex Elder 	/*
2856638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2857980917fcSIlya Dryomov 	 * image has been flattened) we need to re-submit the
2858980917fcSIlya Dryomov 	 * original request.
2859638f5abeSAlex Elder 	 */
2860638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2861638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2862980917fcSIlya Dryomov 		rbd_obj_request_submit(orig_request);
2863638f5abeSAlex Elder 		return;
2864638f5abeSAlex Elder 	}
2865c5b5ef6cSAlex Elder 
2866c5b5ef6cSAlex Elder 	/*
2867c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2868c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2869c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2870c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2871c5b5ef6cSAlex Elder 	 */
2872c5b5ef6cSAlex Elder 	if (!result) {
2873c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2874c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2875c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2876c2e82414SIlya Dryomov 	} else {
2877c2e82414SIlya Dryomov 		goto fail_orig_request;
2878c5b5ef6cSAlex Elder 	}
2879c5b5ef6cSAlex Elder 
2880c5b5ef6cSAlex Elder 	/*
2881c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2882c5b5ef6cSAlex Elder 	 * whether the target object exists.
2883c5b5ef6cSAlex Elder 	 */
2884c2e82414SIlya Dryomov 	result = rbd_img_obj_request_submit(orig_request);
2885c2e82414SIlya Dryomov 	if (result)
2886c2e82414SIlya Dryomov 		goto fail_orig_request;
2887c2e82414SIlya Dryomov 
2888c2e82414SIlya Dryomov 	return;
2889c2e82414SIlya Dryomov 
2890c2e82414SIlya Dryomov fail_orig_request:
28910dcc685eSIlya Dryomov 	rbd_obj_request_error(orig_request, result);
2892c5b5ef6cSAlex Elder }
2893c5b5ef6cSAlex Elder 
2894c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2895c5b5ef6cSAlex Elder {
2896058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
2897c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2898710214e3SIlya Dryomov 	struct page **pages;
2899c5b5ef6cSAlex Elder 	u32 page_count;
2900c5b5ef6cSAlex Elder 	size_t size;
2901c5b5ef6cSAlex Elder 	int ret;
2902c5b5ef6cSAlex Elder 
2903710214e3SIlya Dryomov 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2904710214e3SIlya Dryomov 					      OBJ_REQUEST_PAGES);
2905710214e3SIlya Dryomov 	if (!stat_request)
2906710214e3SIlya Dryomov 		return -ENOMEM;
2907710214e3SIlya Dryomov 
2908710214e3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2909710214e3SIlya Dryomov 						   stat_request);
2910710214e3SIlya Dryomov 	if (!stat_request->osd_req) {
2911710214e3SIlya Dryomov 		ret = -ENOMEM;
2912710214e3SIlya Dryomov 		goto fail_stat_request;
2913710214e3SIlya Dryomov 	}
2914710214e3SIlya Dryomov 
2915c5b5ef6cSAlex Elder 	/*
2916c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2917c5b5ef6cSAlex Elder 	 *     le64 length;
2918c5b5ef6cSAlex Elder 	 *     struct {
2919c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2920c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2921c5b5ef6cSAlex Elder 	 *     } mtime;
2922c5b5ef6cSAlex Elder 	 */
2923c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2924c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2925c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2926710214e3SIlya Dryomov 	if (IS_ERR(pages)) {
2927710214e3SIlya Dryomov 		ret = PTR_ERR(pages);
2928710214e3SIlya Dryomov 		goto fail_stat_request;
2929710214e3SIlya Dryomov 	}
2930c5b5ef6cSAlex Elder 
2931710214e3SIlya Dryomov 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2932710214e3SIlya Dryomov 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2933710214e3SIlya Dryomov 				     false, false);
2934c5b5ef6cSAlex Elder 
2935c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2936c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2937c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2938c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2939c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2940c5b5ef6cSAlex Elder 
2941980917fcSIlya Dryomov 	rbd_obj_request_submit(stat_request);
2942980917fcSIlya Dryomov 	return 0;
2943980917fcSIlya Dryomov 
2944710214e3SIlya Dryomov fail_stat_request:
2945710214e3SIlya Dryomov 	rbd_obj_request_put(stat_request);
2946c5b5ef6cSAlex Elder 	return ret;
2947c5b5ef6cSAlex Elder }
2948c5b5ef6cSAlex Elder 
294970d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2950b454e36dSAlex Elder {
2951058aa991SIlya Dryomov 	struct rbd_img_request *img_request = obj_request->img_request;
2952058aa991SIlya Dryomov 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2953b454e36dSAlex Elder 
295470d045f6SIlya Dryomov 	/* Reads */
29551c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
29561c220881SJosh Durgin 	    !img_request_discard_test(img_request))
295770d045f6SIlya Dryomov 		return true;
2958b454e36dSAlex Elder 
295970d045f6SIlya Dryomov 	/* Non-layered writes */
296070d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
296170d045f6SIlya Dryomov 		return true;
296270d045f6SIlya Dryomov 
296370d045f6SIlya Dryomov 	/*
296470d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
296570d045f6SIlya Dryomov 	 * share any data with the parent.
296670d045f6SIlya Dryomov 	 */
296770d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
296870d045f6SIlya Dryomov 		return true;
296970d045f6SIlya Dryomov 
297070d045f6SIlya Dryomov 	/*
2971c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2972c622d226SGuangliang Zhao 	 * parent data there is anyway.
2973c622d226SGuangliang Zhao 	 */
2974c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2975c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2976c622d226SGuangliang Zhao 		return true;
2977c622d226SGuangliang Zhao 
2978c622d226SGuangliang Zhao 	/*
297970d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
298070d045f6SIlya Dryomov 	 * already been copied.
298170d045f6SIlya Dryomov 	 */
298270d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
298370d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
298470d045f6SIlya Dryomov 		return true;
298570d045f6SIlya Dryomov 
298670d045f6SIlya Dryomov 	return false;
298770d045f6SIlya Dryomov }
298870d045f6SIlya Dryomov 
298970d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
299070d045f6SIlya Dryomov {
2991058aa991SIlya Dryomov 	rbd_assert(obj_request_img_data_test(obj_request));
2992058aa991SIlya Dryomov 	rbd_assert(obj_request_type_valid(obj_request->type));
2993058aa991SIlya Dryomov 	rbd_assert(obj_request->img_request);
2994058aa991SIlya Dryomov 
299570d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2996980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2997980917fcSIlya Dryomov 		return 0;
2998b454e36dSAlex Elder 	}
2999b454e36dSAlex Elder 
3000b454e36dSAlex Elder 	/*
30013d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
30023d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
30033d7efd18SAlex Elder 	 * start by reading the data for the full target object from
30043d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
3005b454e36dSAlex Elder 	 */
300670d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
30073d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
30083d7efd18SAlex Elder 
30093d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
3010b454e36dSAlex Elder 
3011b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
3012b454e36dSAlex Elder }
3013b454e36dSAlex Elder 
3014bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
3015bf0d5f50SAlex Elder {
3016bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
301746faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
3018663ae2ccSIlya Dryomov 	int ret = 0;
3019bf0d5f50SAlex Elder 
302037206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
3021bf0d5f50SAlex Elder 
3022663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
3023663ae2ccSIlya Dryomov 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
3024b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
3025bf0d5f50SAlex Elder 		if (ret)
3026663ae2ccSIlya Dryomov 			goto out_put_ireq;
3027bf0d5f50SAlex Elder 	}
3028bf0d5f50SAlex Elder 
3029663ae2ccSIlya Dryomov out_put_ireq:
3030663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
3031663ae2ccSIlya Dryomov 	return ret;
3032bf0d5f50SAlex Elder }
3033bf0d5f50SAlex Elder 
30348b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
30358b3e1a56SAlex Elder {
30368b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
3037a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
3038a9e8ba2cSAlex Elder 	u64 obj_end;
303902c74fbaSAlex Elder 	u64 img_xferred;
304002c74fbaSAlex Elder 	int img_result;
30418b3e1a56SAlex Elder 
30428b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
30438b3e1a56SAlex Elder 
304402c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
304502c74fbaSAlex Elder 
30468b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
304702c74fbaSAlex Elder 	img_xferred = img_request->xferred;
304802c74fbaSAlex Elder 	img_result = img_request->result;
304902c74fbaSAlex Elder 	rbd_img_request_put(img_request);
305002c74fbaSAlex Elder 
305102c74fbaSAlex Elder 	/*
305202c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
305302c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
305402c74fbaSAlex Elder 	 * original request.
305502c74fbaSAlex Elder 	 */
3056a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
3057a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
305802c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
305902c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
3060980917fcSIlya Dryomov 		rbd_obj_request_submit(obj_request);
306102c74fbaSAlex Elder 		return;
306202c74fbaSAlex Elder 	}
306302c74fbaSAlex Elder 
306402c74fbaSAlex Elder 	obj_request->result = img_result;
3065a9e8ba2cSAlex Elder 	if (obj_request->result)
3066a9e8ba2cSAlex Elder 		goto out;
3067a9e8ba2cSAlex Elder 
3068a9e8ba2cSAlex Elder 	/*
3069a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3070a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3071a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3072a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3073a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3074a9e8ba2cSAlex Elder 	 */
3075a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3076a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3077a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3078a9e8ba2cSAlex Elder 		u64 xferred = 0;
3079a9e8ba2cSAlex Elder 
3080a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3081a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3082a9e8ba2cSAlex Elder 					obj_request->img_offset;
3083a9e8ba2cSAlex Elder 
308402c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3085a9e8ba2cSAlex Elder 	} else {
308602c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3087a9e8ba2cSAlex Elder 	}
3088a9e8ba2cSAlex Elder out:
30898b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
30908b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
30918b3e1a56SAlex Elder }
30928b3e1a56SAlex Elder 
30938b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
30948b3e1a56SAlex Elder {
30958b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
30968b3e1a56SAlex Elder 	int result;
30978b3e1a56SAlex Elder 
30988b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
30998b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
31008b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
31015b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
31028b3e1a56SAlex Elder 
31038b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3104e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
31058b3e1a56SAlex Elder 						obj_request->img_offset,
3106e93f3152SAlex Elder 						obj_request->length);
31078b3e1a56SAlex Elder 	result = -ENOMEM;
31088b3e1a56SAlex Elder 	if (!img_request)
31098b3e1a56SAlex Elder 		goto out_err;
31108b3e1a56SAlex Elder 
31115b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3112f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3113f1a4739fSAlex Elder 						obj_request->bio_list);
31145b2ab72dSAlex Elder 	else
31155b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
31165b2ab72dSAlex Elder 						obj_request->pages);
31178b3e1a56SAlex Elder 	if (result)
31188b3e1a56SAlex Elder 		goto out_err;
31198b3e1a56SAlex Elder 
31208b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
31218b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
31228b3e1a56SAlex Elder 	if (result)
31238b3e1a56SAlex Elder 		goto out_err;
31248b3e1a56SAlex Elder 
31258b3e1a56SAlex Elder 	return;
31268b3e1a56SAlex Elder out_err:
31278b3e1a56SAlex Elder 	if (img_request)
31288b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
31298b3e1a56SAlex Elder 	obj_request->result = result;
31308b3e1a56SAlex Elder 	obj_request->xferred = 0;
31318b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
31328b3e1a56SAlex Elder }
31338b3e1a56SAlex Elder 
3134ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3135ed95b21aSIlya Dryomov 
3136ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3137ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3138ed95b21aSIlya Dryomov {
3139ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3140ed95b21aSIlya Dryomov }
3141ed95b21aSIlya Dryomov 
3142ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3143ed95b21aSIlya Dryomov {
3144ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3145ed95b21aSIlya Dryomov 
3146ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3147ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3148ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3149ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3150ed95b21aSIlya Dryomov 	return cid;
3151ed95b21aSIlya Dryomov }
3152ed95b21aSIlya Dryomov 
3153ed95b21aSIlya Dryomov /*
3154ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3155ed95b21aSIlya Dryomov  */
3156ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3157ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3158ed95b21aSIlya Dryomov {
3159ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3160ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3161ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3162ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3163ed95b21aSIlya Dryomov }
3164ed95b21aSIlya Dryomov 
3165ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3166ed95b21aSIlya Dryomov {
3167ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3168ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3169ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3170ed95b21aSIlya Dryomov }
3171ed95b21aSIlya Dryomov 
3172ed95b21aSIlya Dryomov /*
3173ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3174ed95b21aSIlya Dryomov  */
3175ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3176ed95b21aSIlya Dryomov {
3177ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3178ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3179ed95b21aSIlya Dryomov 	char cookie[32];
3180ed95b21aSIlya Dryomov 	int ret;
3181ed95b21aSIlya Dryomov 
3182ed95b21aSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev));
3183ed95b21aSIlya Dryomov 
3184ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3185ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3186ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3187ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3188ed95b21aSIlya Dryomov 	if (ret)
3189ed95b21aSIlya Dryomov 		return ret;
3190ed95b21aSIlya Dryomov 
3191ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3192ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &cid);
3193ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3194ed95b21aSIlya Dryomov 	return 0;
3195ed95b21aSIlya Dryomov }
3196ed95b21aSIlya Dryomov 
3197ed95b21aSIlya Dryomov /*
3198ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3199ed95b21aSIlya Dryomov  */
3200ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev)
3201ed95b21aSIlya Dryomov {
3202ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3203ed95b21aSIlya Dryomov 	char cookie[32];
3204ed95b21aSIlya Dryomov 	int ret;
3205ed95b21aSIlya Dryomov 
3206ed95b21aSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev));
3207ed95b21aSIlya Dryomov 
3208ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3209ed95b21aSIlya Dryomov 
3210ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3211ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3212ed95b21aSIlya Dryomov 			      RBD_LOCK_NAME, cookie);
3213ed95b21aSIlya Dryomov 	if (ret && ret != -ENOENT) {
3214ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
3215ed95b21aSIlya Dryomov 		return ret;
3216ed95b21aSIlya Dryomov 	}
3217ed95b21aSIlya Dryomov 
3218ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3219ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3220ed95b21aSIlya Dryomov 	return 0;
3221ed95b21aSIlya Dryomov }
3222ed95b21aSIlya Dryomov 
3223ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3224ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3225ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3226ed95b21aSIlya Dryomov 				size_t *preply_len)
3227ed95b21aSIlya Dryomov {
3228ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3229ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3230ed95b21aSIlya Dryomov 	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3231ed95b21aSIlya Dryomov 	char buf[buf_size];
3232ed95b21aSIlya Dryomov 	void *p = buf;
3233ed95b21aSIlya Dryomov 
3234ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3235ed95b21aSIlya Dryomov 
3236ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3237ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3238ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3239ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3240ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3241ed95b21aSIlya Dryomov 
3242ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3243ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3244ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3245ed95b21aSIlya Dryomov }
3246ed95b21aSIlya Dryomov 
3247ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3248ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3249ed95b21aSIlya Dryomov {
3250ed95b21aSIlya Dryomov 	struct page **reply_pages;
3251ed95b21aSIlya Dryomov 	size_t reply_len;
3252ed95b21aSIlya Dryomov 
3253ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3254ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3255ed95b21aSIlya Dryomov }
3256ed95b21aSIlya Dryomov 
3257ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3258ed95b21aSIlya Dryomov {
3259ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3260ed95b21aSIlya Dryomov 						  acquired_lock_work);
3261ed95b21aSIlya Dryomov 
3262ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3263ed95b21aSIlya Dryomov }
3264ed95b21aSIlya Dryomov 
3265ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3266ed95b21aSIlya Dryomov {
3267ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3268ed95b21aSIlya Dryomov 						  released_lock_work);
3269ed95b21aSIlya Dryomov 
3270ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3271ed95b21aSIlya Dryomov }
3272ed95b21aSIlya Dryomov 
3273ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3274ed95b21aSIlya Dryomov {
3275ed95b21aSIlya Dryomov 	struct page **reply_pages;
3276ed95b21aSIlya Dryomov 	size_t reply_len;
3277ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3278ed95b21aSIlya Dryomov 	int ret;
3279ed95b21aSIlya Dryomov 
3280ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3281ed95b21aSIlya Dryomov 
3282ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3283ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3284ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3285ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3286ed95b21aSIlya Dryomov 		goto out;
3287ed95b21aSIlya Dryomov 	}
3288ed95b21aSIlya Dryomov 
3289ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3290ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3291ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3292ed95b21aSIlya Dryomov 		u32 n;
3293ed95b21aSIlya Dryomov 
3294ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3295ed95b21aSIlya Dryomov 		while (n--) {
3296ed95b21aSIlya Dryomov 			u8 struct_v;
3297ed95b21aSIlya Dryomov 			u32 len;
3298ed95b21aSIlya Dryomov 
3299ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3300ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3301ed95b21aSIlya Dryomov 
3302ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3303ed95b21aSIlya Dryomov 			if (!len)
3304ed95b21aSIlya Dryomov 				continue;
3305ed95b21aSIlya Dryomov 
3306ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3307ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3308ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3309ed95b21aSIlya Dryomov 				ret = -EIO;
3310ed95b21aSIlya Dryomov 				goto out;
3311ed95b21aSIlya Dryomov 			}
3312ed95b21aSIlya Dryomov 
3313ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3314ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3315ed95b21aSIlya Dryomov 						  &struct_v, &len);
3316ed95b21aSIlya Dryomov 			if (ret) {
3317ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3318ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3319ed95b21aSIlya Dryomov 					 ret);
3320ed95b21aSIlya Dryomov 				goto e_inval;
3321ed95b21aSIlya Dryomov 			}
3322ed95b21aSIlya Dryomov 
3323ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3324ed95b21aSIlya Dryomov 		}
3325ed95b21aSIlya Dryomov 	}
3326ed95b21aSIlya Dryomov 
3327ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3328ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3329ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3330ed95b21aSIlya Dryomov 	}
3331ed95b21aSIlya Dryomov 
3332ed95b21aSIlya Dryomov out:
3333ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3334ed95b21aSIlya Dryomov 	return ret;
3335ed95b21aSIlya Dryomov 
3336ed95b21aSIlya Dryomov e_inval:
3337ed95b21aSIlya Dryomov 	ret = -EINVAL;
3338ed95b21aSIlya Dryomov 	goto out;
3339ed95b21aSIlya Dryomov }
3340ed95b21aSIlya Dryomov 
3341ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3342ed95b21aSIlya Dryomov {
3343ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3344ed95b21aSIlya Dryomov 
3345ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3346ed95b21aSIlya Dryomov 	if (wake_all)
3347ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3348ed95b21aSIlya Dryomov 	else
3349ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3350ed95b21aSIlya Dryomov }
3351ed95b21aSIlya Dryomov 
3352ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3353ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3354ed95b21aSIlya Dryomov {
3355ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3356ed95b21aSIlya Dryomov 	u8 lock_type;
3357ed95b21aSIlya Dryomov 	char *lock_tag;
3358ed95b21aSIlya Dryomov 	int ret;
3359ed95b21aSIlya Dryomov 
3360ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3361ed95b21aSIlya Dryomov 
3362ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3363ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3364ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3365ed95b21aSIlya Dryomov 	if (ret)
3366ed95b21aSIlya Dryomov 		return ret;
3367ed95b21aSIlya Dryomov 
3368ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3369ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3370ed95b21aSIlya Dryomov 		goto out;
3371ed95b21aSIlya Dryomov 	}
3372ed95b21aSIlya Dryomov 
3373ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3374ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3375ed95b21aSIlya Dryomov 			 lock_tag);
3376ed95b21aSIlya Dryomov 		ret = -EBUSY;
3377ed95b21aSIlya Dryomov 		goto out;
3378ed95b21aSIlya Dryomov 	}
3379ed95b21aSIlya Dryomov 
3380ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3381ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3382ed95b21aSIlya Dryomov 		ret = -EBUSY;
3383ed95b21aSIlya Dryomov 		goto out;
3384ed95b21aSIlya Dryomov 	}
3385ed95b21aSIlya Dryomov 
3386ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3387ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3388ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3389ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3390ed95b21aSIlya Dryomov 		ret = -EBUSY;
3391ed95b21aSIlya Dryomov 		goto out;
3392ed95b21aSIlya Dryomov 	}
3393ed95b21aSIlya Dryomov 
3394ed95b21aSIlya Dryomov out:
3395ed95b21aSIlya Dryomov 	kfree(lock_tag);
3396ed95b21aSIlya Dryomov 	return ret;
3397ed95b21aSIlya Dryomov }
3398ed95b21aSIlya Dryomov 
3399ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3400ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3401ed95b21aSIlya Dryomov {
3402ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3403ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3404ed95b21aSIlya Dryomov 	u32 num_watchers;
3405ed95b21aSIlya Dryomov 	u64 cookie;
3406ed95b21aSIlya Dryomov 	int i;
3407ed95b21aSIlya Dryomov 	int ret;
3408ed95b21aSIlya Dryomov 
3409ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3410ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3411ed95b21aSIlya Dryomov 				      &num_watchers);
3412ed95b21aSIlya Dryomov 	if (ret)
3413ed95b21aSIlya Dryomov 		return ret;
3414ed95b21aSIlya Dryomov 
3415ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3416ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3417ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3418ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3419ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3420ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3421ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3422ed95b21aSIlya Dryomov 				.handle = cookie,
3423ed95b21aSIlya Dryomov 			};
3424ed95b21aSIlya Dryomov 
3425ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3426ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3427ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3428ed95b21aSIlya Dryomov 			ret = 1;
3429ed95b21aSIlya Dryomov 			goto out;
3430ed95b21aSIlya Dryomov 		}
3431ed95b21aSIlya Dryomov 	}
3432ed95b21aSIlya Dryomov 
3433ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3434ed95b21aSIlya Dryomov 	ret = 0;
3435ed95b21aSIlya Dryomov out:
3436ed95b21aSIlya Dryomov 	kfree(watchers);
3437ed95b21aSIlya Dryomov 	return ret;
3438ed95b21aSIlya Dryomov }
3439ed95b21aSIlya Dryomov 
3440ed95b21aSIlya Dryomov /*
3441ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3442ed95b21aSIlya Dryomov  */
3443ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3444ed95b21aSIlya Dryomov {
3445ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3446ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3447ed95b21aSIlya Dryomov 	u32 num_lockers;
3448ed95b21aSIlya Dryomov 	int ret;
3449ed95b21aSIlya Dryomov 
3450ed95b21aSIlya Dryomov 	for (;;) {
3451ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3452ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3453ed95b21aSIlya Dryomov 			return ret;
3454ed95b21aSIlya Dryomov 
3455ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3456ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3457ed95b21aSIlya Dryomov 		if (ret)
3458ed95b21aSIlya Dryomov 			return ret;
3459ed95b21aSIlya Dryomov 
3460ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3461ed95b21aSIlya Dryomov 			goto again;
3462ed95b21aSIlya Dryomov 
3463ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3464ed95b21aSIlya Dryomov 		if (ret) {
3465ed95b21aSIlya Dryomov 			if (ret > 0)
3466ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3467ed95b21aSIlya Dryomov 			goto out;
3468ed95b21aSIlya Dryomov 		}
3469ed95b21aSIlya Dryomov 
3470ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3471ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3472ed95b21aSIlya Dryomov 
3473ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3474ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3475ed95b21aSIlya Dryomov 		if (ret) {
3476ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3477ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3478ed95b21aSIlya Dryomov 			goto out;
3479ed95b21aSIlya Dryomov 		}
3480ed95b21aSIlya Dryomov 
3481ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3482ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3483ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3484ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3485ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3486ed95b21aSIlya Dryomov 			goto out;
3487ed95b21aSIlya Dryomov 
3488ed95b21aSIlya Dryomov again:
3489ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3490ed95b21aSIlya Dryomov 	}
3491ed95b21aSIlya Dryomov 
3492ed95b21aSIlya Dryomov out:
3493ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3494ed95b21aSIlya Dryomov 	return ret;
3495ed95b21aSIlya Dryomov }
3496ed95b21aSIlya Dryomov 
3497ed95b21aSIlya Dryomov /*
3498ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3499ed95b21aSIlya Dryomov  */
3500ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3501ed95b21aSIlya Dryomov 						int *pret)
3502ed95b21aSIlya Dryomov {
3503ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3504ed95b21aSIlya Dryomov 
3505ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3506ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3507ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3508ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3509ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3510ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3511ed95b21aSIlya Dryomov 		return lock_state;
3512ed95b21aSIlya Dryomov 	}
3513ed95b21aSIlya Dryomov 
3514ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3515ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3516ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3517ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3518ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3519ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3520ed95b21aSIlya Dryomov 		if (*pret)
3521ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3522ed95b21aSIlya Dryomov 	}
3523ed95b21aSIlya Dryomov 
3524ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3525ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3526ed95b21aSIlya Dryomov 	return lock_state;
3527ed95b21aSIlya Dryomov }
3528ed95b21aSIlya Dryomov 
3529ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3530ed95b21aSIlya Dryomov {
3531ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3532ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3533ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3534ed95b21aSIlya Dryomov 	int ret;
3535ed95b21aSIlya Dryomov 
3536ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3537ed95b21aSIlya Dryomov again:
3538ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3539ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3540ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3541ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3542ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3543ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3544ed95b21aSIlya Dryomov 		return;
3545ed95b21aSIlya Dryomov 	}
3546ed95b21aSIlya Dryomov 
3547ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3548ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3549ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3550ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3551ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3552ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3553ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3554ed95b21aSIlya Dryomov 	} else {
3555ed95b21aSIlya Dryomov 		/*
3556ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3557ed95b21aSIlya Dryomov 		 * release the lock
3558ed95b21aSIlya Dryomov 		 */
3559ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3560ed95b21aSIlya Dryomov 		     rbd_dev);
3561ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3562ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3563ed95b21aSIlya Dryomov 	}
3564ed95b21aSIlya Dryomov }
3565ed95b21aSIlya Dryomov 
3566ed95b21aSIlya Dryomov /*
3567ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3568ed95b21aSIlya Dryomov  */
3569ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3570ed95b21aSIlya Dryomov {
3571ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3572ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3573ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3574ed95b21aSIlya Dryomov 		return false;
3575ed95b21aSIlya Dryomov 
3576ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3577ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3578ed95b21aSIlya Dryomov 	/*
3579ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3580ed95b21aSIlya Dryomov 	 *
3581ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3582ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3583ed95b21aSIlya Dryomov 	 */
3584ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3585ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3586ed95b21aSIlya Dryomov 
3587ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3588ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3589ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3590ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3591ed95b21aSIlya Dryomov 		return false;
3592ed95b21aSIlya Dryomov 
3593ed95b21aSIlya Dryomov 	if (!rbd_unlock(rbd_dev))
3594ed95b21aSIlya Dryomov 		/*
3595ed95b21aSIlya Dryomov 		 * Give others a chance to grab the lock - we would re-acquire
3596ed95b21aSIlya Dryomov 		 * almost immediately if we got new IO during ceph_osdc_sync()
3597ed95b21aSIlya Dryomov 		 * otherwise.  We need to ack our own notifications, so this
3598ed95b21aSIlya Dryomov 		 * lock_dwork will be requeued from rbd_wait_state_locked()
3599ed95b21aSIlya Dryomov 		 * after wake_requests() in rbd_handle_released_lock().
3600ed95b21aSIlya Dryomov 		 */
3601ed95b21aSIlya Dryomov 		cancel_delayed_work(&rbd_dev->lock_dwork);
3602ed95b21aSIlya Dryomov 
3603ed95b21aSIlya Dryomov 	return true;
3604ed95b21aSIlya Dryomov }
3605ed95b21aSIlya Dryomov 
3606ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3607ed95b21aSIlya Dryomov {
3608ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3609ed95b21aSIlya Dryomov 						  unlock_work);
3610ed95b21aSIlya Dryomov 
3611ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3612ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3613ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3614ed95b21aSIlya Dryomov }
3615ed95b21aSIlya Dryomov 
3616ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3617ed95b21aSIlya Dryomov 				     void **p)
3618ed95b21aSIlya Dryomov {
3619ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3620ed95b21aSIlya Dryomov 
3621ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3622ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3623ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3624ed95b21aSIlya Dryomov 	}
3625ed95b21aSIlya Dryomov 
3626ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3627ed95b21aSIlya Dryomov 	     cid.handle);
3628ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3629ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3630ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3631ed95b21aSIlya Dryomov 			/*
3632ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3633ed95b21aSIlya Dryomov 			 * the owner
3634ed95b21aSIlya Dryomov 			 */
3635ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3636ed95b21aSIlya Dryomov 			return;
3637ed95b21aSIlya Dryomov 		}
3638ed95b21aSIlya Dryomov 
3639ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3640ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3641ed95b21aSIlya Dryomov 	} else {
3642ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3643ed95b21aSIlya Dryomov 	}
3644ed95b21aSIlya Dryomov 
3645ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3646ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3647ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3648ed95b21aSIlya Dryomov }
3649ed95b21aSIlya Dryomov 
3650ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3651ed95b21aSIlya Dryomov 				     void **p)
3652ed95b21aSIlya Dryomov {
3653ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3654ed95b21aSIlya Dryomov 
3655ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3656ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3657ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3658ed95b21aSIlya Dryomov 	}
3659ed95b21aSIlya Dryomov 
3660ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3661ed95b21aSIlya Dryomov 	     cid.handle);
3662ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3663ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3664ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3665ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3666ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3667ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3668ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3669ed95b21aSIlya Dryomov 			return;
3670ed95b21aSIlya Dryomov 		}
3671ed95b21aSIlya Dryomov 
3672ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3673ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3674ed95b21aSIlya Dryomov 	} else {
3675ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3676ed95b21aSIlya Dryomov 	}
3677ed95b21aSIlya Dryomov 
3678ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3679ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3680ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3681ed95b21aSIlya Dryomov }
3682ed95b21aSIlya Dryomov 
3683ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3684ed95b21aSIlya Dryomov 				    void **p)
3685ed95b21aSIlya Dryomov {
3686ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3687ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3688ed95b21aSIlya Dryomov 	bool need_to_send;
3689ed95b21aSIlya Dryomov 
3690ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3691ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3692ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3693ed95b21aSIlya Dryomov 	}
3694ed95b21aSIlya Dryomov 
3695ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3696ed95b21aSIlya Dryomov 	     cid.handle);
3697ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
3698ed95b21aSIlya Dryomov 		return false;
3699ed95b21aSIlya Dryomov 
3700ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3701ed95b21aSIlya Dryomov 	need_to_send = __rbd_is_lock_owner(rbd_dev);
3702ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3703ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3704ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3705ed95b21aSIlya Dryomov 			     rbd_dev);
3706ed95b21aSIlya Dryomov 			queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3707ed95b21aSIlya Dryomov 		}
3708ed95b21aSIlya Dryomov 	}
3709ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3710ed95b21aSIlya Dryomov 	return need_to_send;
3711ed95b21aSIlya Dryomov }
3712ed95b21aSIlya Dryomov 
3713ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3714ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3715ed95b21aSIlya Dryomov {
3716ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3717ed95b21aSIlya Dryomov 	int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3718ed95b21aSIlya Dryomov 	char buf[buf_size];
3719ed95b21aSIlya Dryomov 	int ret;
3720ed95b21aSIlya Dryomov 
3721ed95b21aSIlya Dryomov 	if (result) {
3722ed95b21aSIlya Dryomov 		void *p = buf;
3723ed95b21aSIlya Dryomov 
3724ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3725ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3726ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3727ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3728ed95b21aSIlya Dryomov 	} else {
3729ed95b21aSIlya Dryomov 		buf_size = 0;
3730ed95b21aSIlya Dryomov 	}
3731ed95b21aSIlya Dryomov 
3732ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3733ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3734ed95b21aSIlya Dryomov 				   buf, buf_size);
3735ed95b21aSIlya Dryomov 	if (ret)
3736ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3737ed95b21aSIlya Dryomov }
3738ed95b21aSIlya Dryomov 
3739ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3740ed95b21aSIlya Dryomov 				   u64 cookie)
3741ed95b21aSIlya Dryomov {
3742ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3743ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3744ed95b21aSIlya Dryomov }
3745ed95b21aSIlya Dryomov 
3746ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3747ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3748ed95b21aSIlya Dryomov {
3749ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3750ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3751ed95b21aSIlya Dryomov }
3752ed95b21aSIlya Dryomov 
3753922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3754922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3755b8d70035SAlex Elder {
3756922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3757ed95b21aSIlya Dryomov 	void *p = data;
3758ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3759ed95b21aSIlya Dryomov 	u8 struct_v;
3760ed95b21aSIlya Dryomov 	u32 len;
3761ed95b21aSIlya Dryomov 	u32 notify_op;
3762b8d70035SAlex Elder 	int ret;
3763b8d70035SAlex Elder 
3764ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3765ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3766ed95b21aSIlya Dryomov 	if (data_len) {
3767ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3768ed95b21aSIlya Dryomov 					  &struct_v, &len);
3769ed95b21aSIlya Dryomov 		if (ret) {
3770ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3771ed95b21aSIlya Dryomov 				 ret);
3772ed95b21aSIlya Dryomov 			return;
3773ed95b21aSIlya Dryomov 		}
377452bb1f9bSIlya Dryomov 
3775ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3776ed95b21aSIlya Dryomov 	} else {
3777ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3778ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3779ed95b21aSIlya Dryomov 		len = 0;
3780ed95b21aSIlya Dryomov 	}
3781ed95b21aSIlya Dryomov 
3782ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3783ed95b21aSIlya Dryomov 	switch (notify_op) {
3784ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3785ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3786ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3787ed95b21aSIlya Dryomov 		break;
3788ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3789ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3790ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3791ed95b21aSIlya Dryomov 		break;
3792ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
3793ed95b21aSIlya Dryomov 		if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
379452bb1f9bSIlya Dryomov 			/*
3795ed95b21aSIlya Dryomov 			 * send ResponseMessage(0) back so the client
3796ed95b21aSIlya Dryomov 			 * can detect a missing owner
379752bb1f9bSIlya Dryomov 			 */
3798ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3799ed95b21aSIlya Dryomov 						      cookie, 0);
3800ed95b21aSIlya Dryomov 		else
3801ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3802ed95b21aSIlya Dryomov 		break;
3803ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3804e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3805e627db08SAlex Elder 		if (ret)
38069584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3807b8d70035SAlex Elder 
3808ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3809ed95b21aSIlya Dryomov 		break;
3810ed95b21aSIlya Dryomov 	default:
3811ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3812ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3813ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3814ed95b21aSIlya Dryomov 		else
3815ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3816ed95b21aSIlya Dryomov 		break;
3817ed95b21aSIlya Dryomov 	}
3818b8d70035SAlex Elder }
3819b8d70035SAlex Elder 
382099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
382199d16943SIlya Dryomov 
3822922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3823bb040aa0SIlya Dryomov {
3824922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3825bb040aa0SIlya Dryomov 
3826922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3827bb040aa0SIlya Dryomov 
3828ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3829ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3830ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3831ed95b21aSIlya Dryomov 
383299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
383399d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
383499d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
383599d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3836bb040aa0SIlya Dryomov 
383799d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3838bb040aa0SIlya Dryomov 	}
383999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3840bb040aa0SIlya Dryomov }
3841bb040aa0SIlya Dryomov 
3842bb040aa0SIlya Dryomov /*
384399d16943SIlya Dryomov  * watch_mutex must be locked
38449969ebc5SAlex Elder  */
384599d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
38469969ebc5SAlex Elder {
38479969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3848922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
38499969ebc5SAlex Elder 
3850922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
385199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
38529969ebc5SAlex Elder 
3853922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3854922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3855922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3856922dab61SIlya Dryomov 	if (IS_ERR(handle))
3857922dab61SIlya Dryomov 		return PTR_ERR(handle);
38589969ebc5SAlex Elder 
3859922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
38608eb87565SAlex Elder 	return 0;
38619969ebc5SAlex Elder }
38629969ebc5SAlex Elder 
386399d16943SIlya Dryomov /*
386499d16943SIlya Dryomov  * watch_mutex must be locked
386599d16943SIlya Dryomov  */
386699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3867fca27065SIlya Dryomov {
3868922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3869922dab61SIlya Dryomov 	int ret;
3870b30a01f2SIlya Dryomov 
387199d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
387299d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3873b30a01f2SIlya Dryomov 
3874922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3875922dab61SIlya Dryomov 	if (ret)
3876922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3877b30a01f2SIlya Dryomov 
3878922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3879c525f036SIlya Dryomov }
3880c525f036SIlya Dryomov 
388199d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3882c525f036SIlya Dryomov {
388399d16943SIlya Dryomov 	int ret;
3884811c6688SIlya Dryomov 
388599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
388699d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
388799d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
388899d16943SIlya Dryomov 	if (ret)
388999d16943SIlya Dryomov 		goto out;
389099d16943SIlya Dryomov 
389199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
389299d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
389399d16943SIlya Dryomov 
389499d16943SIlya Dryomov out:
389599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
389699d16943SIlya Dryomov 	return ret;
389799d16943SIlya Dryomov }
389899d16943SIlya Dryomov 
389999d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
390099d16943SIlya Dryomov {
390199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
390299d16943SIlya Dryomov 
390399d16943SIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3904ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3905ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3906ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3907ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
390899d16943SIlya Dryomov }
390999d16943SIlya Dryomov 
391099d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
391199d16943SIlya Dryomov {
3912ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
391399d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
391499d16943SIlya Dryomov 
391599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
391699d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
391799d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
391899d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
391999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
392099d16943SIlya Dryomov 
3921811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3922fca27065SIlya Dryomov }
3923fca27065SIlya Dryomov 
392499d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
392599d16943SIlya Dryomov {
392699d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
392799d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
3928ed95b21aSIlya Dryomov 	bool was_lock_owner = false;
392999d16943SIlya Dryomov 	int ret;
393099d16943SIlya Dryomov 
393199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
393299d16943SIlya Dryomov 
3933ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3934ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3935ed95b21aSIlya Dryomov 		was_lock_owner = rbd_release_lock(rbd_dev);
3936ed95b21aSIlya Dryomov 
393799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
393899d16943SIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR)
393999d16943SIlya Dryomov 		goto fail_unlock;
394099d16943SIlya Dryomov 
394199d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
394299d16943SIlya Dryomov 	if (ret) {
394399d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
394499d16943SIlya Dryomov 		if (ret != -EBLACKLISTED)
394599d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
394699d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
394799d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
394899d16943SIlya Dryomov 		goto fail_unlock;
394999d16943SIlya Dryomov 	}
395099d16943SIlya Dryomov 
395199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
395299d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
395399d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
395499d16943SIlya Dryomov 
395599d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
395699d16943SIlya Dryomov 	if (ret)
395799d16943SIlya Dryomov 		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
395899d16943SIlya Dryomov 
3959ed95b21aSIlya Dryomov 	if (was_lock_owner) {
3960ed95b21aSIlya Dryomov 		ret = rbd_try_lock(rbd_dev);
3961ed95b21aSIlya Dryomov 		if (ret)
3962ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "reregisteration lock failed: %d",
3963ed95b21aSIlya Dryomov 				 ret);
3964ed95b21aSIlya Dryomov 	}
3965ed95b21aSIlya Dryomov 
3966ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3967ed95b21aSIlya Dryomov 	wake_requests(rbd_dev, true);
396899d16943SIlya Dryomov 	return;
396999d16943SIlya Dryomov 
397099d16943SIlya Dryomov fail_unlock:
397199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3972ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
397399d16943SIlya Dryomov }
397499d16943SIlya Dryomov 
397536be9a76SAlex Elder /*
3976f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3977f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
397836be9a76SAlex Elder  */
397936be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
398036be9a76SAlex Elder 			     const char *object_name,
398136be9a76SAlex Elder 			     const char *class_name,
398236be9a76SAlex Elder 			     const char *method_name,
39834157976bSAlex Elder 			     const void *outbound,
398436be9a76SAlex Elder 			     size_t outbound_size,
39854157976bSAlex Elder 			     void *inbound,
3986e2a58ee5SAlex Elder 			     size_t inbound_size)
398736be9a76SAlex Elder {
398836be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
398936be9a76SAlex Elder 	struct page **pages;
399036be9a76SAlex Elder 	u32 page_count;
399136be9a76SAlex Elder 	int ret;
399236be9a76SAlex Elder 
399336be9a76SAlex Elder 	/*
39946010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
39956010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
39966010a451SAlex Elder 	 * also supply outbound data--parameters for the object
39976010a451SAlex Elder 	 * method.  Currently if this is present it will be a
39986010a451SAlex Elder 	 * snapshot id.
399936be9a76SAlex Elder 	 */
400036be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
400136be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
400236be9a76SAlex Elder 	if (IS_ERR(pages))
400336be9a76SAlex Elder 		return PTR_ERR(pages);
400436be9a76SAlex Elder 
400536be9a76SAlex Elder 	ret = -ENOMEM;
40066010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
400736be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
400836be9a76SAlex Elder 	if (!obj_request)
400936be9a76SAlex Elder 		goto out;
401036be9a76SAlex Elder 
401136be9a76SAlex Elder 	obj_request->pages = pages;
401236be9a76SAlex Elder 	obj_request->page_count = page_count;
401336be9a76SAlex Elder 
40146d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
4015deb236b3SIlya Dryomov 						  obj_request);
401636be9a76SAlex Elder 	if (!obj_request->osd_req)
401736be9a76SAlex Elder 		goto out;
401836be9a76SAlex Elder 
4019c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
402004017e29SAlex Elder 					class_name, method_name);
402104017e29SAlex Elder 	if (outbound_size) {
402204017e29SAlex Elder 		struct ceph_pagelist *pagelist;
402304017e29SAlex Elder 
402404017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
402504017e29SAlex Elder 		if (!pagelist)
402604017e29SAlex Elder 			goto out;
402704017e29SAlex Elder 
402804017e29SAlex Elder 		ceph_pagelist_init(pagelist);
402904017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
403004017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
403104017e29SAlex Elder 						pagelist);
403204017e29SAlex Elder 	}
4033a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
4034a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
403544cd188dSAlex Elder 					0, false, false);
4036430c28c3SAlex Elder 
4037980917fcSIlya Dryomov 	rbd_obj_request_submit(obj_request);
403836be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
403936be9a76SAlex Elder 	if (ret)
404036be9a76SAlex Elder 		goto out;
404136be9a76SAlex Elder 
404236be9a76SAlex Elder 	ret = obj_request->result;
404336be9a76SAlex Elder 	if (ret < 0)
404436be9a76SAlex Elder 		goto out;
404557385b51SAlex Elder 
404657385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
404757385b51SAlex Elder 	ret = (int)obj_request->xferred;
4048903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
404936be9a76SAlex Elder out:
405036be9a76SAlex Elder 	if (obj_request)
405136be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
405236be9a76SAlex Elder 	else
405336be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
405436be9a76SAlex Elder 
405536be9a76SAlex Elder 	return ret;
405636be9a76SAlex Elder }
405736be9a76SAlex Elder 
4058ed95b21aSIlya Dryomov /*
4059ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
4060ed95b21aSIlya Dryomov  */
4061ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
4062ed95b21aSIlya Dryomov {
4063ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
4064ed95b21aSIlya Dryomov 
4065ed95b21aSIlya Dryomov 	do {
4066ed95b21aSIlya Dryomov 		/*
4067ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
4068ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
4069ed95b21aSIlya Dryomov 		 */
4070ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
4071ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4072ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
4073ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
4074ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4075ed95b21aSIlya Dryomov 		schedule();
4076ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4077ed95b21aSIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
4078ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
4079ed95b21aSIlya Dryomov }
4080ed95b21aSIlya Dryomov 
40817ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4082bc1ecc65SIlya Dryomov {
40837ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
40847ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
4085bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
40864e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
4087bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4088bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
40896d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
40904e752f0aSJosh Durgin 	u64 mapping_size;
409180de1912SIlya Dryomov 	bool must_be_locked;
4092bc1ecc65SIlya Dryomov 	int result;
4093bc1ecc65SIlya Dryomov 
40947ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
40957ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
40967ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
40977ad18afaSChristoph Hellwig 		result = -EIO;
40987ad18afaSChristoph Hellwig 		goto err;
40997ad18afaSChristoph Hellwig 	}
41007ad18afaSChristoph Hellwig 
4101c2df40dfSMike Christie 	if (req_op(rq) == REQ_OP_DISCARD)
410290e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
4103c2df40dfSMike Christie 	else if (req_op(rq) == REQ_OP_WRITE)
41046d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
41056d2940c8SGuangliang Zhao 	else
41066d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
41076d2940c8SGuangliang Zhao 
4108bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4109bc1ecc65SIlya Dryomov 
4110bc1ecc65SIlya Dryomov 	if (!length) {
4111bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4112bc1ecc65SIlya Dryomov 		result = 0;
4113bc1ecc65SIlya Dryomov 		goto err_rq;
4114bc1ecc65SIlya Dryomov 	}
4115bc1ecc65SIlya Dryomov 
41166d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
4117bc1ecc65SIlya Dryomov 
41186d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
4119bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
4120bc1ecc65SIlya Dryomov 			result = -EROFS;
4121bc1ecc65SIlya Dryomov 			goto err_rq;
4122bc1ecc65SIlya Dryomov 		}
4123bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
4124bc1ecc65SIlya Dryomov 	}
4125bc1ecc65SIlya Dryomov 
4126bc1ecc65SIlya Dryomov 	/*
4127bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
4128bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
4129bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
4130bc1ecc65SIlya Dryomov 	 * sending it if we already know.
4131bc1ecc65SIlya Dryomov 	 */
4132bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4133bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
4134bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4135bc1ecc65SIlya Dryomov 		result = -ENXIO;
4136bc1ecc65SIlya Dryomov 		goto err_rq;
4137bc1ecc65SIlya Dryomov 	}
4138bc1ecc65SIlya Dryomov 
4139bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
4140bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4141bc1ecc65SIlya Dryomov 			 length);
4142bc1ecc65SIlya Dryomov 		result = -EINVAL;
4143bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
4144bc1ecc65SIlya Dryomov 	}
4145bc1ecc65SIlya Dryomov 
41467ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
41477ad18afaSChristoph Hellwig 
41484e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
41494e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
41506d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
41514e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
41524e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
4153ed95b21aSIlya Dryomov 		must_be_locked = rbd_is_lock_supported(rbd_dev);
415480de1912SIlya Dryomov 	} else {
415580de1912SIlya Dryomov 		must_be_locked = rbd_dev->opts->lock_on_read &&
415680de1912SIlya Dryomov 					rbd_is_lock_supported(rbd_dev);
41574e752f0aSJosh Durgin 	}
41584e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
41594e752f0aSJosh Durgin 
41604e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4161bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
41624e752f0aSJosh Durgin 			 length, mapping_size);
4163bc1ecc65SIlya Dryomov 		result = -EIO;
4164bc1ecc65SIlya Dryomov 		goto err_rq;
4165bc1ecc65SIlya Dryomov 	}
4166bc1ecc65SIlya Dryomov 
4167ed95b21aSIlya Dryomov 	if (must_be_locked) {
4168ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4169ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4170ed95b21aSIlya Dryomov 			rbd_wait_state_locked(rbd_dev);
4171ed95b21aSIlya Dryomov 	}
4172ed95b21aSIlya Dryomov 
41736d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
41744e752f0aSJosh Durgin 					     snapc);
4175bc1ecc65SIlya Dryomov 	if (!img_request) {
4176bc1ecc65SIlya Dryomov 		result = -ENOMEM;
4177ed95b21aSIlya Dryomov 		goto err_unlock;
4178bc1ecc65SIlya Dryomov 	}
4179bc1ecc65SIlya Dryomov 	img_request->rq = rq;
418070b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
4181bc1ecc65SIlya Dryomov 
418290e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
418390e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
418490e98c52SGuangliang Zhao 					      NULL);
418590e98c52SGuangliang Zhao 	else
418690e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
418790e98c52SGuangliang Zhao 					      rq->bio);
4188bc1ecc65SIlya Dryomov 	if (result)
4189bc1ecc65SIlya Dryomov 		goto err_img_request;
4190bc1ecc65SIlya Dryomov 
4191bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
4192bc1ecc65SIlya Dryomov 	if (result)
4193bc1ecc65SIlya Dryomov 		goto err_img_request;
4194bc1ecc65SIlya Dryomov 
4195ed95b21aSIlya Dryomov 	if (must_be_locked)
4196ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4197bc1ecc65SIlya Dryomov 	return;
4198bc1ecc65SIlya Dryomov 
4199bc1ecc65SIlya Dryomov err_img_request:
4200bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
4201ed95b21aSIlya Dryomov err_unlock:
4202ed95b21aSIlya Dryomov 	if (must_be_locked)
4203ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4204bc1ecc65SIlya Dryomov err_rq:
4205bc1ecc65SIlya Dryomov 	if (result)
4206bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
42076d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
42084e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
42097ad18afaSChristoph Hellwig err:
42107ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
4211bc1ecc65SIlya Dryomov }
4212bc1ecc65SIlya Dryomov 
42137ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
42147ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4215bc1ecc65SIlya Dryomov {
42167ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
42177ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4218bc1ecc65SIlya Dryomov 
42197ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
42207ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
4221bf0d5f50SAlex Elder }
4222bf0d5f50SAlex Elder 
4223602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4224602adf40SYehuda Sadeh {
4225602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
4226602adf40SYehuda Sadeh 
4227602adf40SYehuda Sadeh 	if (!disk)
4228602adf40SYehuda Sadeh 		return;
4229602adf40SYehuda Sadeh 
4230a0cab924SAlex Elder 	rbd_dev->disk = NULL;
4231a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
4232602adf40SYehuda Sadeh 		del_gendisk(disk);
4233602adf40SYehuda Sadeh 		if (disk->queue)
4234602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
42357ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
4236a0cab924SAlex Elder 	}
4237602adf40SYehuda Sadeh 	put_disk(disk);
4238602adf40SYehuda Sadeh }
4239602adf40SYehuda Sadeh 
4240788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4241788e2df3SAlex Elder 				const char *object_name,
42427097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
4243788e2df3SAlex Elder 
4244788e2df3SAlex Elder {
4245788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
4246788e2df3SAlex Elder 	struct page **pages = NULL;
4247788e2df3SAlex Elder 	u32 page_count;
42481ceae7efSAlex Elder 	size_t size;
4249788e2df3SAlex Elder 	int ret;
4250788e2df3SAlex Elder 
4251788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
4252788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
4253788e2df3SAlex Elder 	if (IS_ERR(pages))
4254a8d42056SJan Kara 		return PTR_ERR(pages);
4255788e2df3SAlex Elder 
4256788e2df3SAlex Elder 	ret = -ENOMEM;
4257788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
4258788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
4259788e2df3SAlex Elder 	if (!obj_request)
4260788e2df3SAlex Elder 		goto out;
4261788e2df3SAlex Elder 
4262788e2df3SAlex Elder 	obj_request->pages = pages;
4263788e2df3SAlex Elder 	obj_request->page_count = page_count;
4264788e2df3SAlex Elder 
42656d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
4266deb236b3SIlya Dryomov 						  obj_request);
4267788e2df3SAlex Elder 	if (!obj_request->osd_req)
4268788e2df3SAlex Elder 		goto out;
4269788e2df3SAlex Elder 
4270c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
4271c99d2d4aSAlex Elder 					offset, length, 0, 0);
4272406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
4273a4ce40a9SAlex Elder 					obj_request->pages,
427444cd188dSAlex Elder 					obj_request->length,
427544cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
427644cd188dSAlex Elder 					false, false);
4277430c28c3SAlex Elder 
4278980917fcSIlya Dryomov 	rbd_obj_request_submit(obj_request);
4279788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
4280788e2df3SAlex Elder 	if (ret)
4281788e2df3SAlex Elder 		goto out;
4282788e2df3SAlex Elder 
4283788e2df3SAlex Elder 	ret = obj_request->result;
4284788e2df3SAlex Elder 	if (ret < 0)
4285788e2df3SAlex Elder 		goto out;
42861ceae7efSAlex Elder 
42871ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
42881ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
4289903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
429023ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
429123ed6e13SAlex Elder 	ret = (int)size;
4292788e2df3SAlex Elder out:
4293788e2df3SAlex Elder 	if (obj_request)
4294788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
4295788e2df3SAlex Elder 	else
4296788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
4297788e2df3SAlex Elder 
4298788e2df3SAlex Elder 	return ret;
4299788e2df3SAlex Elder }
4300788e2df3SAlex Elder 
4301602adf40SYehuda Sadeh /*
4302662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4303662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4304662518b1SAlex Elder  * information about the image.
43054156d998SAlex Elder  */
430699a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
43074156d998SAlex Elder {
43084156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
43094156d998SAlex Elder 	u32 snap_count = 0;
43104156d998SAlex Elder 	u64 names_size = 0;
43114156d998SAlex Elder 	u32 want_count;
43124156d998SAlex Elder 	int ret;
43134156d998SAlex Elder 
43144156d998SAlex Elder 	/*
43154156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
43164156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
43174156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
43184156d998SAlex Elder 	 * the number of snapshots could change by the time we read
43194156d998SAlex Elder 	 * it in, in which case we re-read it.
43204156d998SAlex Elder 	 */
43214156d998SAlex Elder 	do {
43224156d998SAlex Elder 		size_t size;
43234156d998SAlex Elder 
43244156d998SAlex Elder 		kfree(ondisk);
43254156d998SAlex Elder 
43264156d998SAlex Elder 		size = sizeof (*ondisk);
43274156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
43284156d998SAlex Elder 		size += names_size;
43294156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
43304156d998SAlex Elder 		if (!ondisk)
4331662518b1SAlex Elder 			return -ENOMEM;
43324156d998SAlex Elder 
4333c41d13a3SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
43347097f8dfSAlex Elder 				       0, size, ondisk);
43354156d998SAlex Elder 		if (ret < 0)
4336662518b1SAlex Elder 			goto out;
4337c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
43384156d998SAlex Elder 			ret = -ENXIO;
433906ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
434006ecc6cbSAlex Elder 				size, ret);
4341662518b1SAlex Elder 			goto out;
43424156d998SAlex Elder 		}
43434156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
43444156d998SAlex Elder 			ret = -ENXIO;
434506ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4346662518b1SAlex Elder 			goto out;
43474156d998SAlex Elder 		}
43484156d998SAlex Elder 
43494156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
43504156d998SAlex Elder 		want_count = snap_count;
43514156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
43524156d998SAlex Elder 	} while (snap_count != want_count);
43534156d998SAlex Elder 
4354662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4355662518b1SAlex Elder out:
43564156d998SAlex Elder 	kfree(ondisk);
43574156d998SAlex Elder 
4358dfc5606dSYehuda Sadeh 	return ret;
4359602adf40SYehuda Sadeh }
4360602adf40SYehuda Sadeh 
436115228edeSAlex Elder /*
436215228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
436315228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
436415228edeSAlex Elder  */
436515228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
436615228edeSAlex Elder {
436715228edeSAlex Elder 	u64 snap_id;
436815228edeSAlex Elder 
436915228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
437015228edeSAlex Elder 		return;
437115228edeSAlex Elder 
437215228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
437315228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
437415228edeSAlex Elder 		return;
437515228edeSAlex Elder 
437615228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
437715228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
437815228edeSAlex Elder }
437915228edeSAlex Elder 
43809875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
43819875201eSJosh Durgin {
43829875201eSJosh Durgin 	sector_t size;
43839875201eSJosh Durgin 
43849875201eSJosh Durgin 	/*
4385811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4386811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4387811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
43889875201eSJosh Durgin 	 */
4389811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4390811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
43919875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
43929875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
43939875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
43949875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
43959875201eSJosh Durgin 	}
43969875201eSJosh Durgin }
43979875201eSJosh Durgin 
4398cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
43991fe5e993SAlex Elder {
4400e627db08SAlex Elder 	u64 mapping_size;
44011fe5e993SAlex Elder 	int ret;
44021fe5e993SAlex Elder 
4403cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
44043b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4405a720ae09SIlya Dryomov 
4406a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
440752bb1f9bSIlya Dryomov 	if (ret)
440873e39e4dSIlya Dryomov 		goto out;
440915228edeSAlex Elder 
4410e8f59b59SIlya Dryomov 	/*
4411e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4412e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4413e8f59b59SIlya Dryomov 	 */
4414e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4415e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4416e8f59b59SIlya Dryomov 		if (ret)
441773e39e4dSIlya Dryomov 			goto out;
4418e8f59b59SIlya Dryomov 	}
4419e8f59b59SIlya Dryomov 
44205ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
44215ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
44225ff1108cSIlya Dryomov 	} else {
44235ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
442415228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
44255ff1108cSIlya Dryomov 	}
44265ff1108cSIlya Dryomov 
442773e39e4dSIlya Dryomov out:
4428cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
442973e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
44309875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
44311fe5e993SAlex Elder 
443273e39e4dSIlya Dryomov 	return ret;
44331fe5e993SAlex Elder }
44341fe5e993SAlex Elder 
44357ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
44367ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
44377ad18afaSChristoph Hellwig 		unsigned int numa_node)
44387ad18afaSChristoph Hellwig {
44397ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
44407ad18afaSChristoph Hellwig 
44417ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
44427ad18afaSChristoph Hellwig 	return 0;
44437ad18afaSChristoph Hellwig }
44447ad18afaSChristoph Hellwig 
44457ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
44467ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
44477ad18afaSChristoph Hellwig 	.map_queue	= blk_mq_map_queue,
44487ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
44497ad18afaSChristoph Hellwig };
44507ad18afaSChristoph Hellwig 
4451602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4452602adf40SYehuda Sadeh {
4453602adf40SYehuda Sadeh 	struct gendisk *disk;
4454602adf40SYehuda Sadeh 	struct request_queue *q;
4455593a9e7bSAlex Elder 	u64 segment_size;
44567ad18afaSChristoph Hellwig 	int err;
4457602adf40SYehuda Sadeh 
4458602adf40SYehuda Sadeh 	/* create gendisk info */
44597e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
44607e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
44617e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4462602adf40SYehuda Sadeh 	if (!disk)
44631fcdb8aaSAlex Elder 		return -ENOMEM;
4464602adf40SYehuda Sadeh 
4465f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4466de71a297SAlex Elder 		 rbd_dev->dev_id);
4467602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4468dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
44697e513d43SIlya Dryomov 	if (single_major)
44707e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4471602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4472602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4473602adf40SYehuda Sadeh 
44747ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
44757ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4476b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
44777ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4478b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
44797ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
44807ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
44817ad18afaSChristoph Hellwig 
44827ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
44837ad18afaSChristoph Hellwig 	if (err)
4484602adf40SYehuda Sadeh 		goto out_disk;
4485029bcbd8SJosh Durgin 
44867ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
44877ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
44887ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
44897ad18afaSChristoph Hellwig 		goto out_tag_set;
44907ad18afaSChristoph Hellwig 	}
44917ad18afaSChristoph Hellwig 
4492d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4493d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4494593a9e7bSAlex Elder 
4495029bcbd8SJosh Durgin 	/* set io sizes to object size */
4496593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
4497593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
44980d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
4499d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
4500593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
4501593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
4502593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
4503029bcbd8SJosh Durgin 
450490e98c52SGuangliang Zhao 	/* enable the discard support */
450590e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
450690e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
450790e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
45082bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
4509b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
451090e98c52SGuangliang Zhao 
4511bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4512bae818eeSRonny Hegewald 		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
4513bae818eeSRonny Hegewald 
4514602adf40SYehuda Sadeh 	disk->queue = q;
4515602adf40SYehuda Sadeh 
4516602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4517602adf40SYehuda Sadeh 
4518602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4519602adf40SYehuda Sadeh 
4520602adf40SYehuda Sadeh 	return 0;
45217ad18afaSChristoph Hellwig out_tag_set:
45227ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4523602adf40SYehuda Sadeh out_disk:
4524602adf40SYehuda Sadeh 	put_disk(disk);
45257ad18afaSChristoph Hellwig 	return err;
4526602adf40SYehuda Sadeh }
4527602adf40SYehuda Sadeh 
4528dfc5606dSYehuda Sadeh /*
4529dfc5606dSYehuda Sadeh   sysfs
4530dfc5606dSYehuda Sadeh */
4531602adf40SYehuda Sadeh 
4532593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4533593a9e7bSAlex Elder {
4534593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4535593a9e7bSAlex Elder }
4536593a9e7bSAlex Elder 
4537dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4538dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4539602adf40SYehuda Sadeh {
4540593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4541dfc5606dSYehuda Sadeh 
4542fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4543fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4544602adf40SYehuda Sadeh }
4545602adf40SYehuda Sadeh 
454634b13184SAlex Elder /*
454734b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
454834b13184SAlex Elder  * necessarily the base image.
454934b13184SAlex Elder  */
455034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
455134b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
455234b13184SAlex Elder {
455334b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
455434b13184SAlex Elder 
455534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
455634b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
455734b13184SAlex Elder }
455834b13184SAlex Elder 
4559dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4560dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4561602adf40SYehuda Sadeh {
4562593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4563dfc5606dSYehuda Sadeh 
4564fc71d833SAlex Elder 	if (rbd_dev->major)
4565dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4566fc71d833SAlex Elder 
4567fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4568dd82fff1SIlya Dryomov }
4569fc71d833SAlex Elder 
4570dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4571dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4572dd82fff1SIlya Dryomov {
4573dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4574dd82fff1SIlya Dryomov 
4575dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4576dfc5606dSYehuda Sadeh }
4577dfc5606dSYehuda Sadeh 
4578005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4579005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4580005a07bfSIlya Dryomov {
4581005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4582005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4583005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4584005a07bfSIlya Dryomov 
4585005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4586005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4587005a07bfSIlya Dryomov }
4588005a07bfSIlya Dryomov 
4589dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4590dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4591dfc5606dSYehuda Sadeh {
4592593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4593dfc5606dSYehuda Sadeh 
45941dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4595033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4596dfc5606dSYehuda Sadeh }
4597dfc5606dSYehuda Sadeh 
4598267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4599267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4600267fb90bSMike Christie {
4601267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4602267fb90bSMike Christie 
4603267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4604267fb90bSMike Christie }
4605267fb90bSMike Christie 
46060d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
46070d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
46080d6d1e9cSMike Christie {
46090d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
46100d6d1e9cSMike Christie 
46110d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
46120d6d1e9cSMike Christie }
46130d6d1e9cSMike Christie 
4614dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4615dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4616dfc5606dSYehuda Sadeh {
4617593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4618dfc5606dSYehuda Sadeh 
46190d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4620dfc5606dSYehuda Sadeh }
4621dfc5606dSYehuda Sadeh 
46229bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
46239bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
46249bb2f334SAlex Elder {
46259bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
46269bb2f334SAlex Elder 
46270d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
46280d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
46299bb2f334SAlex Elder }
46309bb2f334SAlex Elder 
4631dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4632dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4633dfc5606dSYehuda Sadeh {
4634593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4635dfc5606dSYehuda Sadeh 
4636a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
46370d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4638a92ffdf8SAlex Elder 
4639a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4640dfc5606dSYehuda Sadeh }
4641dfc5606dSYehuda Sadeh 
4642589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4643589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4644589d30e0SAlex Elder {
4645589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4646589d30e0SAlex Elder 
46470d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4648589d30e0SAlex Elder }
4649589d30e0SAlex Elder 
465034b13184SAlex Elder /*
465134b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
465234b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
465334b13184SAlex Elder  */
4654dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4655dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4656dfc5606dSYehuda Sadeh 			     char *buf)
4657dfc5606dSYehuda Sadeh {
4658593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4659dfc5606dSYehuda Sadeh 
46600d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4661dfc5606dSYehuda Sadeh }
4662dfc5606dSYehuda Sadeh 
466392a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
466492a58671SMike Christie 				struct device_attribute *attr, char *buf)
466592a58671SMike Christie {
466692a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
466792a58671SMike Christie 
466892a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
466992a58671SMike Christie }
467092a58671SMike Christie 
467186b00e0dSAlex Elder /*
4672ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4673ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4674ff96128fSIlya Dryomov  * image)".
467586b00e0dSAlex Elder  */
467686b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
467786b00e0dSAlex Elder 			       struct device_attribute *attr,
467886b00e0dSAlex Elder 			       char *buf)
467986b00e0dSAlex Elder {
468086b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4681ff96128fSIlya Dryomov 	ssize_t count = 0;
468286b00e0dSAlex Elder 
4683ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
468486b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
468586b00e0dSAlex Elder 
4686ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4687ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
468886b00e0dSAlex Elder 
4689ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4690ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4691ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4692ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4693ff96128fSIlya Dryomov 			    "overlap %llu\n",
4694ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4695ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4696ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4697ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4698ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4699ff96128fSIlya Dryomov 	}
470086b00e0dSAlex Elder 
470186b00e0dSAlex Elder 	return count;
470286b00e0dSAlex Elder }
470386b00e0dSAlex Elder 
4704dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4705dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4706dfc5606dSYehuda Sadeh 				 const char *buf,
4707dfc5606dSYehuda Sadeh 				 size_t size)
4708dfc5606dSYehuda Sadeh {
4709593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4710b813623aSAlex Elder 	int ret;
4711602adf40SYehuda Sadeh 
4712cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4713e627db08SAlex Elder 	if (ret)
471452bb1f9bSIlya Dryomov 		return ret;
4715b813623aSAlex Elder 
471652bb1f9bSIlya Dryomov 	return size;
4717dfc5606dSYehuda Sadeh }
4718602adf40SYehuda Sadeh 
4719dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
472034b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
4721dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
4722dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
4723005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
4724dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
4725267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
47260d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
4727dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
47289bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
4729dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
4730589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
4731dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4732dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
473392a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
473486b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
4735dfc5606dSYehuda Sadeh 
4736dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4737dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
473834b13184SAlex Elder 	&dev_attr_features.attr,
4739dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4740dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4741005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4742dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4743267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
47440d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4745dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
47469bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4747dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4748589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4749dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
475092a58671SMike Christie 	&dev_attr_snap_id.attr,
475186b00e0dSAlex Elder 	&dev_attr_parent.attr,
4752dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4753dfc5606dSYehuda Sadeh 	NULL
4754dfc5606dSYehuda Sadeh };
4755dfc5606dSYehuda Sadeh 
4756dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4757dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4758dfc5606dSYehuda Sadeh };
4759dfc5606dSYehuda Sadeh 
4760dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4761dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4762dfc5606dSYehuda Sadeh 	NULL
4763dfc5606dSYehuda Sadeh };
4764dfc5606dSYehuda Sadeh 
47656cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4766dfc5606dSYehuda Sadeh 
4767dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
4768dfc5606dSYehuda Sadeh 	.name		= "rbd",
4769dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
47706cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4771dfc5606dSYehuda Sadeh };
4772dfc5606dSYehuda Sadeh 
47738b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
47748b8fb99cSAlex Elder {
47758b8fb99cSAlex Elder 	kref_get(&spec->kref);
47768b8fb99cSAlex Elder 
47778b8fb99cSAlex Elder 	return spec;
47788b8fb99cSAlex Elder }
47798b8fb99cSAlex Elder 
47808b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
47818b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
47828b8fb99cSAlex Elder {
47838b8fb99cSAlex Elder 	if (spec)
47848b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
47858b8fb99cSAlex Elder }
47868b8fb99cSAlex Elder 
47878b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
47888b8fb99cSAlex Elder {
47898b8fb99cSAlex Elder 	struct rbd_spec *spec;
47908b8fb99cSAlex Elder 
47918b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
47928b8fb99cSAlex Elder 	if (!spec)
47938b8fb99cSAlex Elder 		return NULL;
479404077599SIlya Dryomov 
479504077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
479604077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
47978b8fb99cSAlex Elder 	kref_init(&spec->kref);
47988b8fb99cSAlex Elder 
47998b8fb99cSAlex Elder 	return spec;
48008b8fb99cSAlex Elder }
48018b8fb99cSAlex Elder 
48028b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
48038b8fb99cSAlex Elder {
48048b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
48058b8fb99cSAlex Elder 
48068b8fb99cSAlex Elder 	kfree(spec->pool_name);
48078b8fb99cSAlex Elder 	kfree(spec->image_id);
48088b8fb99cSAlex Elder 	kfree(spec->image_name);
48098b8fb99cSAlex Elder 	kfree(spec->snap_name);
48108b8fb99cSAlex Elder 	kfree(spec);
48118b8fb99cSAlex Elder }
48128b8fb99cSAlex Elder 
48131643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4814dd5ac32dSIlya Dryomov {
481599d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4816ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
481799d16943SIlya Dryomov 
4818c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
48196b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
48200d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4821c41d13a3SIlya Dryomov 
4822dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4823dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4824dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4825dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
48261643dfa4SIlya Dryomov }
48271643dfa4SIlya Dryomov 
48281643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
48291643dfa4SIlya Dryomov {
48301643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
48311643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
48321643dfa4SIlya Dryomov 
48331643dfa4SIlya Dryomov 	if (need_put) {
48341643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
48351643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
48361643dfa4SIlya Dryomov 	}
48371643dfa4SIlya Dryomov 
48381643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4839dd5ac32dSIlya Dryomov 
4840dd5ac32dSIlya Dryomov 	/*
4841dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4842dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4843dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4844dd5ac32dSIlya Dryomov 	 */
4845dd5ac32dSIlya Dryomov 	if (need_put)
4846dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4847dd5ac32dSIlya Dryomov }
4848dd5ac32dSIlya Dryomov 
48491643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
48501643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4851c53d5893SAlex Elder {
4852c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4853c53d5893SAlex Elder 
4854c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4855c53d5893SAlex Elder 	if (!rbd_dev)
4856c53d5893SAlex Elder 		return NULL;
4857c53d5893SAlex Elder 
4858c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4859c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4860c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4861c53d5893SAlex Elder 
4862c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4863922dab61SIlya Dryomov 	ceph_oloc_init(&rbd_dev->header_oloc);
4864c41d13a3SIlya Dryomov 
486599d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
486699d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
486799d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
486899d16943SIlya Dryomov 
4869ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4870ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4871ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4872ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4873ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4874ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4875ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4876ed95b21aSIlya Dryomov 
4877dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4878dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4879dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4880dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4881dd5ac32dSIlya Dryomov 
4882c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4883d147543dSIlya Dryomov 	rbd_dev->spec = spec;
48840903e875SAlex Elder 
48857627151eSYan, Zheng 	rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
48867627151eSYan, Zheng 	rbd_dev->layout.stripe_count = 1;
48877627151eSYan, Zheng 	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
48887627151eSYan, Zheng 	rbd_dev->layout.pool_id = spec->pool_id;
488930c156d9SYan, Zheng 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
48900903e875SAlex Elder 
48911643dfa4SIlya Dryomov 	return rbd_dev;
48921643dfa4SIlya Dryomov }
48931643dfa4SIlya Dryomov 
4894dd5ac32dSIlya Dryomov /*
48951643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4896dd5ac32dSIlya Dryomov  */
48971643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
48981643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
48991643dfa4SIlya Dryomov 					 struct rbd_options *opts)
49001643dfa4SIlya Dryomov {
49011643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
49021643dfa4SIlya Dryomov 
49031643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
49041643dfa4SIlya Dryomov 	if (!rbd_dev)
49051643dfa4SIlya Dryomov 		return NULL;
49061643dfa4SIlya Dryomov 
49071643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
49081643dfa4SIlya Dryomov 
49091643dfa4SIlya Dryomov 	/* get an id and fill in device name */
49101643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
49111643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
49121643dfa4SIlya Dryomov 					 GFP_KERNEL);
49131643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
49141643dfa4SIlya Dryomov 		goto fail_rbd_dev;
49151643dfa4SIlya Dryomov 
49161643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
49171643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
49181643dfa4SIlya Dryomov 						   rbd_dev->name);
49191643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
49201643dfa4SIlya Dryomov 		goto fail_dev_id;
49211643dfa4SIlya Dryomov 
49221643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4923dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4924dd5ac32dSIlya Dryomov 
49251643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4926c53d5893SAlex Elder 	return rbd_dev;
49271643dfa4SIlya Dryomov 
49281643dfa4SIlya Dryomov fail_dev_id:
49291643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
49301643dfa4SIlya Dryomov fail_rbd_dev:
49311643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
49321643dfa4SIlya Dryomov 	return NULL;
4933c53d5893SAlex Elder }
4934c53d5893SAlex Elder 
4935c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4936c53d5893SAlex Elder {
4937dd5ac32dSIlya Dryomov 	if (rbd_dev)
4938dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4939c53d5893SAlex Elder }
4940c53d5893SAlex Elder 
4941dfc5606dSYehuda Sadeh /*
49429d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
49439d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
49449d475de5SAlex Elder  * image.
49459d475de5SAlex Elder  */
49469d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
49479d475de5SAlex Elder 				u8 *order, u64 *snap_size)
49489d475de5SAlex Elder {
49499d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
49509d475de5SAlex Elder 	int ret;
49519d475de5SAlex Elder 	struct {
49529d475de5SAlex Elder 		u8 order;
49539d475de5SAlex Elder 		__le64 size;
49549d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
49559d475de5SAlex Elder 
4956c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
49579d475de5SAlex Elder 				"rbd", "get_size",
49584157976bSAlex Elder 				&snapid, sizeof (snapid),
4959e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
496036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
49619d475de5SAlex Elder 	if (ret < 0)
49629d475de5SAlex Elder 		return ret;
496357385b51SAlex Elder 	if (ret < sizeof (size_buf))
496457385b51SAlex Elder 		return -ERANGE;
49659d475de5SAlex Elder 
4966c3545579SJosh Durgin 	if (order) {
49679d475de5SAlex Elder 		*order = size_buf.order;
4968c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4969c3545579SJosh Durgin 	}
49709d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
49719d475de5SAlex Elder 
4972c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4973c3545579SJosh Durgin 		(unsigned long long)snap_id,
49749d475de5SAlex Elder 		(unsigned long long)*snap_size);
49759d475de5SAlex Elder 
49769d475de5SAlex Elder 	return 0;
49779d475de5SAlex Elder }
49789d475de5SAlex Elder 
49799d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
49809d475de5SAlex Elder {
49819d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
49829d475de5SAlex Elder 					&rbd_dev->header.obj_order,
49839d475de5SAlex Elder 					&rbd_dev->header.image_size);
49849d475de5SAlex Elder }
49859d475de5SAlex Elder 
49861e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
49871e130199SAlex Elder {
49881e130199SAlex Elder 	void *reply_buf;
49891e130199SAlex Elder 	int ret;
49901e130199SAlex Elder 	void *p;
49911e130199SAlex Elder 
49921e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
49931e130199SAlex Elder 	if (!reply_buf)
49941e130199SAlex Elder 		return -ENOMEM;
49951e130199SAlex Elder 
4996c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
49974157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
4998e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
499936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
50001e130199SAlex Elder 	if (ret < 0)
50011e130199SAlex Elder 		goto out;
50021e130199SAlex Elder 
50031e130199SAlex Elder 	p = reply_buf;
50041e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
500557385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
500657385b51SAlex Elder 	ret = 0;
50071e130199SAlex Elder 
50081e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
50091e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
50101e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
50111e130199SAlex Elder 	} else {
50121e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
50131e130199SAlex Elder 	}
50141e130199SAlex Elder out:
50151e130199SAlex Elder 	kfree(reply_buf);
50161e130199SAlex Elder 
50171e130199SAlex Elder 	return ret;
50181e130199SAlex Elder }
50191e130199SAlex Elder 
5020b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5021b1b5402aSAlex Elder 		u64 *snap_features)
5022b1b5402aSAlex Elder {
5023b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
5024b1b5402aSAlex Elder 	struct {
5025b1b5402aSAlex Elder 		__le64 features;
5026b1b5402aSAlex Elder 		__le64 incompat;
50274157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
5028d3767f0fSIlya Dryomov 	u64 unsup;
5029b1b5402aSAlex Elder 	int ret;
5030b1b5402aSAlex Elder 
5031c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
5032b1b5402aSAlex Elder 				"rbd", "get_features",
50334157976bSAlex Elder 				&snapid, sizeof (snapid),
5034e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
503536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5036b1b5402aSAlex Elder 	if (ret < 0)
5037b1b5402aSAlex Elder 		return ret;
503857385b51SAlex Elder 	if (ret < sizeof (features_buf))
503957385b51SAlex Elder 		return -ERANGE;
5040d889140cSAlex Elder 
5041d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5042d3767f0fSIlya Dryomov 	if (unsup) {
5043d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5044d3767f0fSIlya Dryomov 			 unsup);
5045b8f5c6edSAlex Elder 		return -ENXIO;
5046d3767f0fSIlya Dryomov 	}
5047d889140cSAlex Elder 
5048b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
5049b1b5402aSAlex Elder 
5050b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5051b1b5402aSAlex Elder 		(unsigned long long)snap_id,
5052b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
5053b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5054b1b5402aSAlex Elder 
5055b1b5402aSAlex Elder 	return 0;
5056b1b5402aSAlex Elder }
5057b1b5402aSAlex Elder 
5058b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5059b1b5402aSAlex Elder {
5060b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5061b1b5402aSAlex Elder 						&rbd_dev->header.features);
5062b1b5402aSAlex Elder }
5063b1b5402aSAlex Elder 
506486b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
506586b00e0dSAlex Elder {
506686b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
506786b00e0dSAlex Elder 	size_t size;
506886b00e0dSAlex Elder 	void *reply_buf = NULL;
506986b00e0dSAlex Elder 	__le64 snapid;
507086b00e0dSAlex Elder 	void *p;
507186b00e0dSAlex Elder 	void *end;
5072642a2537SAlex Elder 	u64 pool_id;
507386b00e0dSAlex Elder 	char *image_id;
50743b5cf2a2SAlex Elder 	u64 snap_id;
507586b00e0dSAlex Elder 	u64 overlap;
507686b00e0dSAlex Elder 	int ret;
507786b00e0dSAlex Elder 
507886b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
507986b00e0dSAlex Elder 	if (!parent_spec)
508086b00e0dSAlex Elder 		return -ENOMEM;
508186b00e0dSAlex Elder 
508286b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
508386b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
508486b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
508586b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
508686b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
508786b00e0dSAlex Elder 	if (!reply_buf) {
508886b00e0dSAlex Elder 		ret = -ENOMEM;
508986b00e0dSAlex Elder 		goto out_err;
509086b00e0dSAlex Elder 	}
509186b00e0dSAlex Elder 
50924d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5093c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
509486b00e0dSAlex Elder 				"rbd", "get_parent",
50954157976bSAlex Elder 				&snapid, sizeof (snapid),
5096e2a58ee5SAlex Elder 				reply_buf, size);
509736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
509886b00e0dSAlex Elder 	if (ret < 0)
509986b00e0dSAlex Elder 		goto out_err;
510086b00e0dSAlex Elder 
510186b00e0dSAlex Elder 	p = reply_buf;
510257385b51SAlex Elder 	end = reply_buf + ret;
510357385b51SAlex Elder 	ret = -ERANGE;
5104642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
5105392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
5106392a9dadSAlex Elder 		/*
5107392a9dadSAlex Elder 		 * Either the parent never existed, or we have
5108392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
5109392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
5110392a9dadSAlex Elder 		 * layered image disappears we immediately set the
5111392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
5112392a9dadSAlex Elder 		 * requests will be treated as if the image had no
5113392a9dadSAlex Elder 		 * parent.
5114392a9dadSAlex Elder 		 */
5115392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
5116392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
5117392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
5118392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
5119392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
5120392a9dadSAlex Elder 		}
5121392a9dadSAlex Elder 
512286b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
5123392a9dadSAlex Elder 	}
512486b00e0dSAlex Elder 
51250903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
51260903e875SAlex Elder 
51270903e875SAlex Elder 	ret = -EIO;
5128642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
51299584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5130642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
513157385b51SAlex Elder 		goto out_err;
5132c0cd10dbSAlex Elder 	}
51330903e875SAlex Elder 
5134979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
513586b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
513686b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
513786b00e0dSAlex Elder 		goto out_err;
513886b00e0dSAlex Elder 	}
51393b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
514086b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
514186b00e0dSAlex Elder 
51423b5cf2a2SAlex Elder 	/*
51433b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
51443b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
51453b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
51463b5cf2a2SAlex Elder 	 */
51473b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
51483b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
51493b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
51503b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
515186b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
515286b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
5153fbba11b3SIlya Dryomov 	} else {
5154fbba11b3SIlya Dryomov 		kfree(image_id);
51553b5cf2a2SAlex Elder 	}
51563b5cf2a2SAlex Elder 
51573b5cf2a2SAlex Elder 	/*
5158cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5159cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
51603b5cf2a2SAlex Elder 	 */
51613b5cf2a2SAlex Elder 	if (!overlap) {
51623b5cf2a2SAlex Elder 		if (parent_spec) {
5163cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5164cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5165cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5166cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
516770cf49cfSAlex Elder 		} else {
5168cf32bd9cSIlya Dryomov 			/* initial probe */
5169cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
51703b5cf2a2SAlex Elder 		}
517170cf49cfSAlex Elder 	}
5172cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
5173cf32bd9cSIlya Dryomov 
517486b00e0dSAlex Elder out:
517586b00e0dSAlex Elder 	ret = 0;
517686b00e0dSAlex Elder out_err:
517786b00e0dSAlex Elder 	kfree(reply_buf);
517886b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
517986b00e0dSAlex Elder 
518086b00e0dSAlex Elder 	return ret;
518186b00e0dSAlex Elder }
518286b00e0dSAlex Elder 
5183cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5184cc070d59SAlex Elder {
5185cc070d59SAlex Elder 	struct {
5186cc070d59SAlex Elder 		__le64 stripe_unit;
5187cc070d59SAlex Elder 		__le64 stripe_count;
5188cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5189cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5190cc070d59SAlex Elder 	void *p;
5191cc070d59SAlex Elder 	u64 obj_size;
5192cc070d59SAlex Elder 	u64 stripe_unit;
5193cc070d59SAlex Elder 	u64 stripe_count;
5194cc070d59SAlex Elder 	int ret;
5195cc070d59SAlex Elder 
5196c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
5197cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
5198e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
5199cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5200cc070d59SAlex Elder 	if (ret < 0)
5201cc070d59SAlex Elder 		return ret;
5202cc070d59SAlex Elder 	if (ret < size)
5203cc070d59SAlex Elder 		return -ERANGE;
5204cc070d59SAlex Elder 
5205cc070d59SAlex Elder 	/*
5206cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
5207cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
5208cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
5209cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
5210cc070d59SAlex Elder 	 */
5211cc070d59SAlex Elder 	ret = -EINVAL;
5212cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
5213cc070d59SAlex Elder 	p = &striping_info_buf;
5214cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
5215cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
5216cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
5217cc070d59SAlex Elder 				"(got %llu want %llu)",
5218cc070d59SAlex Elder 				stripe_unit, obj_size);
5219cc070d59SAlex Elder 		return -EINVAL;
5220cc070d59SAlex Elder 	}
5221cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
5222cc070d59SAlex Elder 	if (stripe_count != 1) {
5223cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
5224cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
5225cc070d59SAlex Elder 		return -EINVAL;
5226cc070d59SAlex Elder 	}
5227500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
5228500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
5229cc070d59SAlex Elder 
5230cc070d59SAlex Elder 	return 0;
5231cc070d59SAlex Elder }
5232cc070d59SAlex Elder 
52339e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
52349e15b77dSAlex Elder {
52359e15b77dSAlex Elder 	size_t image_id_size;
52369e15b77dSAlex Elder 	char *image_id;
52379e15b77dSAlex Elder 	void *p;
52389e15b77dSAlex Elder 	void *end;
52399e15b77dSAlex Elder 	size_t size;
52409e15b77dSAlex Elder 	void *reply_buf = NULL;
52419e15b77dSAlex Elder 	size_t len = 0;
52429e15b77dSAlex Elder 	char *image_name = NULL;
52439e15b77dSAlex Elder 	int ret;
52449e15b77dSAlex Elder 
52459e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
52469e15b77dSAlex Elder 
524769e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
524869e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
52499e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
52509e15b77dSAlex Elder 	if (!image_id)
52519e15b77dSAlex Elder 		return NULL;
52529e15b77dSAlex Elder 
52539e15b77dSAlex Elder 	p = image_id;
52544157976bSAlex Elder 	end = image_id + image_id_size;
525569e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
52569e15b77dSAlex Elder 
52579e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
52589e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
52599e15b77dSAlex Elder 	if (!reply_buf)
52609e15b77dSAlex Elder 		goto out;
52619e15b77dSAlex Elder 
526236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
52639e15b77dSAlex Elder 				"rbd", "dir_get_name",
52649e15b77dSAlex Elder 				image_id, image_id_size,
5265e2a58ee5SAlex Elder 				reply_buf, size);
52669e15b77dSAlex Elder 	if (ret < 0)
52679e15b77dSAlex Elder 		goto out;
52689e15b77dSAlex Elder 	p = reply_buf;
5269f40eb349SAlex Elder 	end = reply_buf + ret;
5270f40eb349SAlex Elder 
52719e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
52729e15b77dSAlex Elder 	if (IS_ERR(image_name))
52739e15b77dSAlex Elder 		image_name = NULL;
52749e15b77dSAlex Elder 	else
52759e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
52769e15b77dSAlex Elder out:
52779e15b77dSAlex Elder 	kfree(reply_buf);
52789e15b77dSAlex Elder 	kfree(image_id);
52799e15b77dSAlex Elder 
52809e15b77dSAlex Elder 	return image_name;
52819e15b77dSAlex Elder }
52829e15b77dSAlex Elder 
52832ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52842ad3d716SAlex Elder {
52852ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
52862ad3d716SAlex Elder 	const char *snap_name;
52872ad3d716SAlex Elder 	u32 which = 0;
52882ad3d716SAlex Elder 
52892ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
52902ad3d716SAlex Elder 
52912ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
52922ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
52932ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
52942ad3d716SAlex Elder 			return snapc->snaps[which];
52952ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
52962ad3d716SAlex Elder 		which++;
52972ad3d716SAlex Elder 	}
52982ad3d716SAlex Elder 	return CEPH_NOSNAP;
52992ad3d716SAlex Elder }
53002ad3d716SAlex Elder 
53012ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
53022ad3d716SAlex Elder {
53032ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
53042ad3d716SAlex Elder 	u32 which;
53052ad3d716SAlex Elder 	bool found = false;
53062ad3d716SAlex Elder 	u64 snap_id;
53072ad3d716SAlex Elder 
53082ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
53092ad3d716SAlex Elder 		const char *snap_name;
53102ad3d716SAlex Elder 
53112ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
53122ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5313efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5314efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5315efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5316efadc98aSJosh Durgin 				continue;
5317efadc98aSJosh Durgin 			else
53182ad3d716SAlex Elder 				break;
5319efadc98aSJosh Durgin 		}
53202ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
53212ad3d716SAlex Elder 		kfree(snap_name);
53222ad3d716SAlex Elder 	}
53232ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
53242ad3d716SAlex Elder }
53252ad3d716SAlex Elder 
53262ad3d716SAlex Elder /*
53272ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
53282ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
53292ad3d716SAlex Elder  */
53302ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
53312ad3d716SAlex Elder {
53322ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
53332ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
53342ad3d716SAlex Elder 
53352ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
53362ad3d716SAlex Elder }
53372ad3d716SAlex Elder 
53389e15b77dSAlex Elder /*
533904077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
53409e15b77dSAlex Elder  */
534104077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
534204077599SIlya Dryomov {
534304077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
534404077599SIlya Dryomov 
534504077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
534604077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
534704077599SIlya Dryomov 	rbd_assert(spec->snap_name);
534804077599SIlya Dryomov 
534904077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
535004077599SIlya Dryomov 		u64 snap_id;
535104077599SIlya Dryomov 
535204077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
535304077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
535404077599SIlya Dryomov 			return -ENOENT;
535504077599SIlya Dryomov 
535604077599SIlya Dryomov 		spec->snap_id = snap_id;
535704077599SIlya Dryomov 	} else {
535804077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
535904077599SIlya Dryomov 	}
536004077599SIlya Dryomov 
536104077599SIlya Dryomov 	return 0;
536204077599SIlya Dryomov }
536304077599SIlya Dryomov 
536404077599SIlya Dryomov /*
536504077599SIlya Dryomov  * A parent image will have all ids but none of the names.
536604077599SIlya Dryomov  *
536704077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
536804077599SIlya Dryomov  * can't figure out the name for an image id.
536904077599SIlya Dryomov  */
537004077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
53719e15b77dSAlex Elder {
53722e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
53732e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
53742e9f7f1cSAlex Elder 	const char *pool_name;
53752e9f7f1cSAlex Elder 	const char *image_name;
53762e9f7f1cSAlex Elder 	const char *snap_name;
53779e15b77dSAlex Elder 	int ret;
53789e15b77dSAlex Elder 
537904077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
538004077599SIlya Dryomov 	rbd_assert(spec->image_id);
538104077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
53829e15b77dSAlex Elder 
53832e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
53849e15b77dSAlex Elder 
53852e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
53862e9f7f1cSAlex Elder 	if (!pool_name) {
53872e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5388935dc89fSAlex Elder 		return -EIO;
5389935dc89fSAlex Elder 	}
53902e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
53912e9f7f1cSAlex Elder 	if (!pool_name)
53929e15b77dSAlex Elder 		return -ENOMEM;
53939e15b77dSAlex Elder 
53949e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
53959e15b77dSAlex Elder 
53962e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
53972e9f7f1cSAlex Elder 	if (!image_name)
539806ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
53999e15b77dSAlex Elder 
540004077599SIlya Dryomov 	/* Fetch the snapshot name */
54019e15b77dSAlex Elder 
54022e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5403da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5404da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
54059e15b77dSAlex Elder 		goto out_err;
54062e9f7f1cSAlex Elder 	}
54072e9f7f1cSAlex Elder 
54082e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
54092e9f7f1cSAlex Elder 	spec->image_name = image_name;
54102e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
54119e15b77dSAlex Elder 
54129e15b77dSAlex Elder 	return 0;
541304077599SIlya Dryomov 
54149e15b77dSAlex Elder out_err:
54152e9f7f1cSAlex Elder 	kfree(image_name);
54162e9f7f1cSAlex Elder 	kfree(pool_name);
54179e15b77dSAlex Elder 	return ret;
54189e15b77dSAlex Elder }
54199e15b77dSAlex Elder 
5420cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
542135d489f9SAlex Elder {
542235d489f9SAlex Elder 	size_t size;
542335d489f9SAlex Elder 	int ret;
542435d489f9SAlex Elder 	void *reply_buf;
542535d489f9SAlex Elder 	void *p;
542635d489f9SAlex Elder 	void *end;
542735d489f9SAlex Elder 	u64 seq;
542835d489f9SAlex Elder 	u32 snap_count;
542935d489f9SAlex Elder 	struct ceph_snap_context *snapc;
543035d489f9SAlex Elder 	u32 i;
543135d489f9SAlex Elder 
543235d489f9SAlex Elder 	/*
543335d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
543435d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
543535d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
543635d489f9SAlex Elder 	 * prepared to receive.
543735d489f9SAlex Elder 	 */
543835d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
543935d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
544035d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
544135d489f9SAlex Elder 	if (!reply_buf)
544235d489f9SAlex Elder 		return -ENOMEM;
544335d489f9SAlex Elder 
5444c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
54454157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
5446e2a58ee5SAlex Elder 				reply_buf, size);
544736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
544835d489f9SAlex Elder 	if (ret < 0)
544935d489f9SAlex Elder 		goto out;
545035d489f9SAlex Elder 
545135d489f9SAlex Elder 	p = reply_buf;
545257385b51SAlex Elder 	end = reply_buf + ret;
545357385b51SAlex Elder 	ret = -ERANGE;
545435d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
545535d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
545635d489f9SAlex Elder 
545735d489f9SAlex Elder 	/*
545835d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
545935d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
546035d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
546135d489f9SAlex Elder 	 * allocate is representable in a size_t.
546235d489f9SAlex Elder 	 */
546335d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
546435d489f9SAlex Elder 				 / sizeof (u64)) {
546535d489f9SAlex Elder 		ret = -EINVAL;
546635d489f9SAlex Elder 		goto out;
546735d489f9SAlex Elder 	}
546835d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
546935d489f9SAlex Elder 		goto out;
5470468521c1SAlex Elder 	ret = 0;
547135d489f9SAlex Elder 
5472812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
547335d489f9SAlex Elder 	if (!snapc) {
547435d489f9SAlex Elder 		ret = -ENOMEM;
547535d489f9SAlex Elder 		goto out;
547635d489f9SAlex Elder 	}
547735d489f9SAlex Elder 	snapc->seq = seq;
547835d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
547935d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
548035d489f9SAlex Elder 
548149ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
548235d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
548335d489f9SAlex Elder 
548435d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
548535d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
548635d489f9SAlex Elder out:
548735d489f9SAlex Elder 	kfree(reply_buf);
548835d489f9SAlex Elder 
548957385b51SAlex Elder 	return ret;
549035d489f9SAlex Elder }
549135d489f9SAlex Elder 
549254cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
549354cac61fSAlex Elder 					u64 snap_id)
5494b8b1e2dbSAlex Elder {
5495b8b1e2dbSAlex Elder 	size_t size;
5496b8b1e2dbSAlex Elder 	void *reply_buf;
549754cac61fSAlex Elder 	__le64 snapid;
5498b8b1e2dbSAlex Elder 	int ret;
5499b8b1e2dbSAlex Elder 	void *p;
5500b8b1e2dbSAlex Elder 	void *end;
5501b8b1e2dbSAlex Elder 	char *snap_name;
5502b8b1e2dbSAlex Elder 
5503b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5504b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5505b8b1e2dbSAlex Elder 	if (!reply_buf)
5506b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5507b8b1e2dbSAlex Elder 
550854cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5509c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
5510b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
551154cac61fSAlex Elder 				&snapid, sizeof (snapid),
5512e2a58ee5SAlex Elder 				reply_buf, size);
551336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5514f40eb349SAlex Elder 	if (ret < 0) {
5515f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5516b8b1e2dbSAlex Elder 		goto out;
5517f40eb349SAlex Elder 	}
5518b8b1e2dbSAlex Elder 
5519b8b1e2dbSAlex Elder 	p = reply_buf;
5520f40eb349SAlex Elder 	end = reply_buf + ret;
5521e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5522f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5523b8b1e2dbSAlex Elder 		goto out;
5524f40eb349SAlex Elder 
5525b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
552654cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5527b8b1e2dbSAlex Elder out:
5528b8b1e2dbSAlex Elder 	kfree(reply_buf);
5529b8b1e2dbSAlex Elder 
5530f40eb349SAlex Elder 	return snap_name;
5531b8b1e2dbSAlex Elder }
5532b8b1e2dbSAlex Elder 
55332df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5534117973fbSAlex Elder {
55352df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5536117973fbSAlex Elder 	int ret;
5537117973fbSAlex Elder 
55381617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
55391617e40cSJosh Durgin 	if (ret)
5540cfbf6377SAlex Elder 		return ret;
55411617e40cSJosh Durgin 
55422df3fac7SAlex Elder 	if (first_time) {
55432df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
55442df3fac7SAlex Elder 		if (ret)
5545cfbf6377SAlex Elder 			return ret;
55462df3fac7SAlex Elder 	}
55472df3fac7SAlex Elder 
5548cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5549d194cd1dSIlya Dryomov 	if (ret && first_time) {
5550d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5551d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5552d194cd1dSIlya Dryomov 	}
5553117973fbSAlex Elder 
5554117973fbSAlex Elder 	return ret;
5555117973fbSAlex Elder }
5556117973fbSAlex Elder 
5557a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5558a720ae09SIlya Dryomov {
5559a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5560a720ae09SIlya Dryomov 
5561a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5562a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5563a720ae09SIlya Dryomov 
5564a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5565a720ae09SIlya Dryomov }
5566a720ae09SIlya Dryomov 
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none), and returns the length of the token -- the run of
 * non-white space characters -- that starts there.  *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to start of token */
	*buf = start;

	return strcspn(start, whitespace);	/* Length of the token */
}
5585e28fff26SAlex Elder 
5586e28fff26SAlex Elder /*
5587ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5588ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5589ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5590ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5591ea3352f4SAlex Elder  *
5592ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5593ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5594ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5595ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5596ea3352f4SAlex Elder  *
5597ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5598ea3352f4SAlex Elder  * the end of the found token.
5599ea3352f4SAlex Elder  *
5600ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5601ea3352f4SAlex Elder  */
5602ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5603ea3352f4SAlex Elder {
5604ea3352f4SAlex Elder 	char *dup;
5605ea3352f4SAlex Elder 	size_t len;
5606ea3352f4SAlex Elder 
5607ea3352f4SAlex Elder 	len = next_token(buf);
56084caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5609ea3352f4SAlex Elder 	if (!dup)
5610ea3352f4SAlex Elder 		return NULL;
5611ea3352f4SAlex Elder 	*(dup + len) = '\0';
5612ea3352f4SAlex Elder 	*buf += len;
5613ea3352f4SAlex Elder 
5614ea3352f4SAlex Elder 	if (lenp)
5615ea3352f4SAlex Elder 		*lenp = len;
5616ea3352f4SAlex Elder 
5617ea3352f4SAlex Elder 	return dup;
5618ea3352f4SAlex Elder }
5619ea3352f4SAlex Elder 
5620ea3352f4SAlex Elder /*
5621859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5622859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5623859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5624859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5625d22f76e7SAlex Elder  *
5626859c31dfSAlex Elder  * The information extracted from these options is recorded in
5627859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5628859c31dfSAlex Elder  * structures:
5629859c31dfSAlex Elder  *  ceph_opts
5630859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5631859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5632859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5633859c31dfSAlex Elder  *  rbd_opts
5634859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5635859c31dfSAlex Elder  *	this function; caller must release with kfree().
5636859c31dfSAlex Elder  *  spec
5637859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5638859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5639859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5640859c31dfSAlex Elder  *
5641859c31dfSAlex Elder  * The options passed take this form:
5642859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5643859c31dfSAlex Elder  * where:
5644859c31dfSAlex Elder  *  <mon_addrs>
5645859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
5646859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
5647859c31dfSAlex Elder  *      by a port number (separated by a colon).
5648859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
5649859c31dfSAlex Elder  *  <options>
5650859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
5651859c31dfSAlex Elder  *  <pool_name>
5652859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
5653859c31dfSAlex Elder  *  <image_name>
5654859c31dfSAlex Elder  *      The name of the image in that pool to map.
5655859c31dfSAlex Elder  *  <snap_id>
5656859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
5657859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
5658859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
5659859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
5660a725f65eSAlex Elder  */
5661859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5662dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5663859c31dfSAlex Elder 				struct rbd_options **opts,
5664859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5665a725f65eSAlex Elder {
5666e28fff26SAlex Elder 	size_t len;
5667859c31dfSAlex Elder 	char *options;
56680ddebc0cSAlex Elder 	const char *mon_addrs;
5669ecb4dc22SAlex Elder 	char *snap_name;
56700ddebc0cSAlex Elder 	size_t mon_addrs_size;
5671859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
56724e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5673859c31dfSAlex Elder 	struct ceph_options *copts;
5674dc79b113SAlex Elder 	int ret;
5675e28fff26SAlex Elder 
5676e28fff26SAlex Elder 	/* The first four tokens are required */
5677e28fff26SAlex Elder 
56787ef3214aSAlex Elder 	len = next_token(&buf);
56794fb5d671SAlex Elder 	if (!len) {
56804fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
56814fb5d671SAlex Elder 		return -EINVAL;
56824fb5d671SAlex Elder 	}
56830ddebc0cSAlex Elder 	mon_addrs = buf;
5684f28e565aSAlex Elder 	mon_addrs_size = len + 1;
56857ef3214aSAlex Elder 	buf += len;
5686a725f65eSAlex Elder 
5687dc79b113SAlex Elder 	ret = -EINVAL;
5688f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5689f28e565aSAlex Elder 	if (!options)
5690dc79b113SAlex Elder 		return -ENOMEM;
56914fb5d671SAlex Elder 	if (!*options) {
56924fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
56934fb5d671SAlex Elder 		goto out_err;
56944fb5d671SAlex Elder 	}
5695a725f65eSAlex Elder 
5696859c31dfSAlex Elder 	spec = rbd_spec_alloc();
5697859c31dfSAlex Elder 	if (!spec)
5698f28e565aSAlex Elder 		goto out_mem;
5699859c31dfSAlex Elder 
5700859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
5701859c31dfSAlex Elder 	if (!spec->pool_name)
5702859c31dfSAlex Elder 		goto out_mem;
57034fb5d671SAlex Elder 	if (!*spec->pool_name) {
57044fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
57054fb5d671SAlex Elder 		goto out_err;
57064fb5d671SAlex Elder 	}
5707e28fff26SAlex Elder 
570869e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
5709859c31dfSAlex Elder 	if (!spec->image_name)
5710f28e565aSAlex Elder 		goto out_mem;
57114fb5d671SAlex Elder 	if (!*spec->image_name) {
57124fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
57134fb5d671SAlex Elder 		goto out_err;
57144fb5d671SAlex Elder 	}
5715e28fff26SAlex Elder 
5716f28e565aSAlex Elder 	/*
5717f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5718f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5719f28e565aSAlex Elder 	 */
57203feeb894SAlex Elder 	len = next_token(&buf);
5721820a5f3eSAlex Elder 	if (!len) {
57223feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
57233feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5724f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5725dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5726f28e565aSAlex Elder 		goto out_err;
5727849b4260SAlex Elder 	}
5728ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5729ecb4dc22SAlex Elder 	if (!snap_name)
5730f28e565aSAlex Elder 		goto out_mem;
5731ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5732ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
5733e5c35534SAlex Elder 
57340ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5735e28fff26SAlex Elder 
57364e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
57374e9afebaSAlex Elder 	if (!rbd_opts)
57384e9afebaSAlex Elder 		goto out_mem;
57394e9afebaSAlex Elder 
57404e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
5741b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
574280de1912SIlya Dryomov 	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5743d22f76e7SAlex Elder 
5744859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
57450ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
57464e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
5747859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5748859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5749dc79b113SAlex Elder 		goto out_err;
5750dc79b113SAlex Elder 	}
5751859c31dfSAlex Elder 	kfree(options);
5752859c31dfSAlex Elder 
5753859c31dfSAlex Elder 	*ceph_opts = copts;
57544e9afebaSAlex Elder 	*opts = rbd_opts;
5755859c31dfSAlex Elder 	*rbd_spec = spec;
57560ddebc0cSAlex Elder 
5757dc79b113SAlex Elder 	return 0;
5758f28e565aSAlex Elder out_mem:
5759dc79b113SAlex Elder 	ret = -ENOMEM;
5760d22f76e7SAlex Elder out_err:
5761859c31dfSAlex Elder 	kfree(rbd_opts);
5762859c31dfSAlex Elder 	rbd_spec_put(spec);
5763f28e565aSAlex Elder 	kfree(options);
5764d22f76e7SAlex Elder 
5765dc79b113SAlex Elder 	return ret;
5766a725f65eSAlex Elder }
5767a725f65eSAlex Elder 
5768589d30e0SAlex Elder /*
576930ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
577030ba1f02SIlya Dryomov  */
577130ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
577230ba1f02SIlya Dryomov {
5773a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
577430ba1f02SIlya Dryomov 	u64 newest_epoch;
577530ba1f02SIlya Dryomov 	int tries = 0;
577630ba1f02SIlya Dryomov 	int ret;
577730ba1f02SIlya Dryomov 
577830ba1f02SIlya Dryomov again:
577930ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
578030ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
5781d0b19705SIlya Dryomov 		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
578230ba1f02SIlya Dryomov 					    &newest_epoch);
578330ba1f02SIlya Dryomov 		if (ret < 0)
578430ba1f02SIlya Dryomov 			return ret;
578530ba1f02SIlya Dryomov 
578630ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
57877cca78c9SIlya Dryomov 			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
578830ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5789a319bf56SIlya Dryomov 						     newest_epoch,
5790a319bf56SIlya Dryomov 						     opts->mount_timeout);
579130ba1f02SIlya Dryomov 			goto again;
579230ba1f02SIlya Dryomov 		} else {
579330ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
579430ba1f02SIlya Dryomov 			return -ENOENT;
579530ba1f02SIlya Dryomov 		}
579630ba1f02SIlya Dryomov 	}
579730ba1f02SIlya Dryomov 
579830ba1f02SIlya Dryomov 	return ret;
579930ba1f02SIlya Dryomov }
580030ba1f02SIlya Dryomov 
580130ba1f02SIlya Dryomov /*
5802589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5803589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5804589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5805589d30e0SAlex Elder  *
5806589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5807589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5808589d30e0SAlex Elder  * with the supplied name.
5809589d30e0SAlex Elder  *
5810589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5811589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5812589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5813589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5814589d30e0SAlex Elder  */
5815589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5816589d30e0SAlex Elder {
5817589d30e0SAlex Elder 	int ret;
5818589d30e0SAlex Elder 	size_t size;
5819589d30e0SAlex Elder 	char *object_name;
5820589d30e0SAlex Elder 	void *response;
5821c0fba368SAlex Elder 	char *image_id;
58222f82ee54SAlex Elder 
5823589d30e0SAlex Elder 	/*
58242c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
58252c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5826c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5827c0fba368SAlex Elder 	 * do still need to set the image format though.
58282c0d0a10SAlex Elder 	 */
5829c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5830c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5831c0fba368SAlex Elder 
58322c0d0a10SAlex Elder 		return 0;
5833c0fba368SAlex Elder 	}
58342c0d0a10SAlex Elder 
58352c0d0a10SAlex Elder 	/*
5836589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5837589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5838589d30e0SAlex Elder 	 */
583969e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
5840589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
5841589d30e0SAlex Elder 	if (!object_name)
5842589d30e0SAlex Elder 		return -ENOMEM;
58430d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
5844589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
5845589d30e0SAlex Elder 
5846589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5847589d30e0SAlex Elder 
5848589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5849589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5850589d30e0SAlex Elder 	if (!response) {
5851589d30e0SAlex Elder 		ret = -ENOMEM;
5852589d30e0SAlex Elder 		goto out;
5853589d30e0SAlex Elder 	}
5854589d30e0SAlex Elder 
5855c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5856c0fba368SAlex Elder 
585736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
58584157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
5859e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
586036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5861c0fba368SAlex Elder 	if (ret == -ENOENT) {
5862c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5863c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5864c0fba368SAlex Elder 		if (!ret)
5865c0fba368SAlex Elder 			rbd_dev->image_format = 1;
58667dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5867c0fba368SAlex Elder 		void *p = response;
5868589d30e0SAlex Elder 
5869c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5870979ed480SAlex Elder 						NULL, GFP_NOIO);
5871461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5872c0fba368SAlex Elder 		if (!ret)
5873c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5874c0fba368SAlex Elder 	}
5875c0fba368SAlex Elder 
5876c0fba368SAlex Elder 	if (!ret) {
5877c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5878c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5879589d30e0SAlex Elder 	}
5880589d30e0SAlex Elder out:
5881589d30e0SAlex Elder 	kfree(response);
5882589d30e0SAlex Elder 	kfree(object_name);
5883589d30e0SAlex Elder 
5884589d30e0SAlex Elder 	return ret;
5885589d30e0SAlex Elder }
5886589d30e0SAlex Elder 
58873abef3b3SAlex Elder /*
58883abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
58893abef3b3SAlex Elder  * call.
58903abef3b3SAlex Elder  */
58916fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
58926fd48b3bSAlex Elder {
58936fd48b3bSAlex Elder 	struct rbd_image_header	*header;
58946fd48b3bSAlex Elder 
5895a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
58966fd48b3bSAlex Elder 
58976fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
58986fd48b3bSAlex Elder 
58996fd48b3bSAlex Elder 	header = &rbd_dev->header;
5900812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
59016fd48b3bSAlex Elder 	kfree(header->snap_sizes);
59026fd48b3bSAlex Elder 	kfree(header->snap_names);
59036fd48b3bSAlex Elder 	kfree(header->object_prefix);
59046fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
59056fd48b3bSAlex Elder }
59066fd48b3bSAlex Elder 
59072df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5908a30b71b9SAlex Elder {
5909a30b71b9SAlex Elder 	int ret;
5910a30b71b9SAlex Elder 
59111e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
591257385b51SAlex Elder 	if (ret)
59131e130199SAlex Elder 		goto out_err;
5914b1b5402aSAlex Elder 
59152df3fac7SAlex Elder 	/*
59162df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
59172df3fac7SAlex Elder 	 * features are assumed to never change.
59182df3fac7SAlex Elder 	 */
5919b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
592057385b51SAlex Elder 	if (ret)
5921b1b5402aSAlex Elder 		goto out_err;
592235d489f9SAlex Elder 
5923cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5924cc070d59SAlex Elder 
5925cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5926cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5927cc070d59SAlex Elder 		if (ret < 0)
5928cc070d59SAlex Elder 			goto out_err;
5929cc070d59SAlex Elder 	}
59302df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
5931a30b71b9SAlex Elder 
593235152979SAlex Elder 	return 0;
59339d475de5SAlex Elder out_err:
5934642a2537SAlex Elder 	rbd_dev->header.features = 0;
59351e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
59361e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
59379d475de5SAlex Elder 
59389d475de5SAlex Elder 	return ret;
5939a30b71b9SAlex Elder }
5940a30b71b9SAlex Elder 
59416d69bb53SIlya Dryomov /*
59426d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
59436d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
59446d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
59456d69bb53SIlya Dryomov  */
59466d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
594783a06263SAlex Elder {
59482f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5949124afba2SAlex Elder 	int ret;
5950124afba2SAlex Elder 
5951124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5952124afba2SAlex Elder 		return 0;
5953124afba2SAlex Elder 
59546d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
59556d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
59566d69bb53SIlya Dryomov 		ret = -EINVAL;
59576d69bb53SIlya Dryomov 		goto out_err;
59586d69bb53SIlya Dryomov 	}
59596d69bb53SIlya Dryomov 
59601643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
59611f2c6651SIlya Dryomov 	if (!parent) {
5962124afba2SAlex Elder 		ret = -ENOMEM;
5963124afba2SAlex Elder 		goto out_err;
59641f2c6651SIlya Dryomov 	}
59651f2c6651SIlya Dryomov 
59661f2c6651SIlya Dryomov 	/*
59671f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
59681f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
59691f2c6651SIlya Dryomov 	 */
59701f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
59711f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5972124afba2SAlex Elder 
59736d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5974124afba2SAlex Elder 	if (ret < 0)
5975124afba2SAlex Elder 		goto out_err;
59761f2c6651SIlya Dryomov 
5977124afba2SAlex Elder 	rbd_dev->parent = parent;
5978a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5979124afba2SAlex Elder 	return 0;
5980124afba2SAlex Elder 
59811f2c6651SIlya Dryomov out_err:
59821f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
59831f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5984124afba2SAlex Elder 	return ret;
5985124afba2SAlex Elder }
5986124afba2SAlex Elder 
5987811c6688SIlya Dryomov /*
5988811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5989811c6688SIlya Dryomov  * upon return.
5990811c6688SIlya Dryomov  */
5991200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5992124afba2SAlex Elder {
599383a06263SAlex Elder 	int ret;
599483a06263SAlex Elder 
59959b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
599683a06263SAlex Elder 
59979b60e70bSIlya Dryomov 	if (!single_major) {
599883a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
599983a06263SAlex Elder 		if (ret < 0)
60001643dfa4SIlya Dryomov 			goto err_out_unlock;
60019b60e70bSIlya Dryomov 
600283a06263SAlex Elder 		rbd_dev->major = ret;
6003dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
60049b60e70bSIlya Dryomov 	} else {
60059b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
60069b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
60079b60e70bSIlya Dryomov 	}
600883a06263SAlex Elder 
600983a06263SAlex Elder 	/* Set up the blkdev mapping. */
601083a06263SAlex Elder 
601183a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
601283a06263SAlex Elder 	if (ret)
601383a06263SAlex Elder 		goto err_out_blkdev;
601483a06263SAlex Elder 
6015f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
601683a06263SAlex Elder 	if (ret)
601783a06263SAlex Elder 		goto err_out_disk;
6018bc1ecc65SIlya Dryomov 
6019f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
602022001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
6021f35a4deeSAlex Elder 
6022dd5ac32dSIlya Dryomov 	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6023dd5ac32dSIlya Dryomov 	ret = device_add(&rbd_dev->dev);
6024f35a4deeSAlex Elder 	if (ret)
6025f5ee37bdSIlya Dryomov 		goto err_out_mapping;
602683a06263SAlex Elder 
602783a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
602883a06263SAlex Elder 
6029129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6030811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
603183a06263SAlex Elder 
60321643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
60331643dfa4SIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
60341643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
60351643dfa4SIlya Dryomov 
6036811c6688SIlya Dryomov 	add_disk(rbd_dev->disk);
6037ca7909e8SIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6038ca7909e8SIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6039ca7909e8SIlya Dryomov 		rbd_dev->header.features);
604083a06263SAlex Elder 
604183a06263SAlex Elder 	return ret;
60422f82ee54SAlex Elder 
6043f35a4deeSAlex Elder err_out_mapping:
6044f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
604583a06263SAlex Elder err_out_disk:
604683a06263SAlex Elder 	rbd_free_disk(rbd_dev);
604783a06263SAlex Elder err_out_blkdev:
60489b60e70bSIlya Dryomov 	if (!single_major)
604983a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6050811c6688SIlya Dryomov err_out_unlock:
6051811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
605283a06263SAlex Elder 	return ret;
605383a06263SAlex Elder }
605483a06263SAlex Elder 
6055332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6056332bb12dSAlex Elder {
6057332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
6058c41d13a3SIlya Dryomov 	int ret;
6059332bb12dSAlex Elder 
6060332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
6061332bb12dSAlex Elder 
6062332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6063332bb12dSAlex Elder 
60647627151eSYan, Zheng 	rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
6065332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
6066c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6067332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
6068332bb12dSAlex Elder 	else
6069c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6070332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
6071c41d13a3SIlya Dryomov 
6072c41d13a3SIlya Dryomov 	return ret;
6073332bb12dSAlex Elder }
6074332bb12dSAlex Elder 
6075200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6076200a6a8bSAlex Elder {
60776fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
60786fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
60796fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
60806fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
60816fd48b3bSAlex Elder 
6082200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
6083200a6a8bSAlex Elder }
6084200a6a8bSAlex Elder 
6085a30b71b9SAlex Elder /*
6086a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
60871f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
60881f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
60891f3ef788SAlex Elder  * object to get detailed information about the rbd image.
6090a30b71b9SAlex Elder  */
60916d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6092a30b71b9SAlex Elder {
6093a30b71b9SAlex Elder 	int ret;
6094a30b71b9SAlex Elder 
6095a30b71b9SAlex Elder 	/*
60963abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
60973abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
60983abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
60993abef3b3SAlex Elder 	 * will be set to either 1 or 2.
6100a30b71b9SAlex Elder 	 */
6101a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
6102a30b71b9SAlex Elder 	if (ret)
6103c0fba368SAlex Elder 		return ret;
6104c0fba368SAlex Elder 
6105332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
6106332bb12dSAlex Elder 	if (ret)
6107332bb12dSAlex Elder 		goto err_out_format;
6108332bb12dSAlex Elder 
61096d69bb53SIlya Dryomov 	if (!depth) {
611099d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
61111fe48023SIlya Dryomov 		if (ret) {
61121fe48023SIlya Dryomov 			if (ret == -ENOENT)
61131fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
61141fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
61151fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
6116c41d13a3SIlya Dryomov 			goto err_out_format;
61171f3ef788SAlex Elder 		}
61181fe48023SIlya Dryomov 	}
6119b644de2bSAlex Elder 
6120a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
61215655c4d9SAlex Elder 	if (ret)
6122b644de2bSAlex Elder 		goto err_out_watch;
6123a30b71b9SAlex Elder 
612404077599SIlya Dryomov 	/*
612504077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
612604077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
612704077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
612804077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
612904077599SIlya Dryomov 	 */
61306d69bb53SIlya Dryomov 	if (!depth)
613104077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
613204077599SIlya Dryomov 	else
613304077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
61341fe48023SIlya Dryomov 	if (ret) {
61351fe48023SIlya Dryomov 		if (ret == -ENOENT)
61361fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
61371fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
61381fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
61391fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
614033dca39fSAlex Elder 		goto err_out_probe;
61411fe48023SIlya Dryomov 	}
61429bb81c9bSAlex Elder 
6143e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6144e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
6145e8f59b59SIlya Dryomov 		if (ret)
6146e8f59b59SIlya Dryomov 			goto err_out_probe;
6147e8f59b59SIlya Dryomov 
6148e8f59b59SIlya Dryomov 		/*
6149e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
6150e8f59b59SIlya Dryomov 		 * mapped and has a parent.
6151e8f59b59SIlya Dryomov 		 */
61526d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
6153e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
6154e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
6155e8f59b59SIlya Dryomov 	}
6156e8f59b59SIlya Dryomov 
61576d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
615830d60ba2SAlex Elder 	if (ret)
615930d60ba2SAlex Elder 		goto err_out_probe;
616083a06263SAlex Elder 
616130d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6162c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
616330d60ba2SAlex Elder 	return 0;
6164e8f59b59SIlya Dryomov 
61656fd48b3bSAlex Elder err_out_probe:
61666fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6167b644de2bSAlex Elder err_out_watch:
61686d69bb53SIlya Dryomov 	if (!depth)
616999d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6170332bb12dSAlex Elder err_out_format:
6171332bb12dSAlex Elder 	rbd_dev->image_format = 0;
61725655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
61735655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
61745655c4d9SAlex Elder 	return ret;
617583a06263SAlex Elder }
617683a06263SAlex Elder 
61779b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
617859c2be1eSYehuda Sadeh 			  const char *buf,
617959c2be1eSYehuda Sadeh 			  size_t count)
6180602adf40SYehuda Sadeh {
6181cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6182dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
61834e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6184859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
61859d3997fdSAlex Elder 	struct rbd_client *rbdc;
618651344a38SAlex Elder 	bool read_only;
6187b51c83c2SIlya Dryomov 	int rc;
6188602adf40SYehuda Sadeh 
6189602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6190602adf40SYehuda Sadeh 		return -ENODEV;
6191602adf40SYehuda Sadeh 
6192a725f65eSAlex Elder 	/* parse add command */
6193859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6194dc79b113SAlex Elder 	if (rc < 0)
6195dd5ac32dSIlya Dryomov 		goto out;
6196a725f65eSAlex Elder 
61979d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
61989d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
61999d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
62000ddebc0cSAlex Elder 		goto err_out_args;
62019d3997fdSAlex Elder 	}
6202602adf40SYehuda Sadeh 
6203602adf40SYehuda Sadeh 	/* pick the pool */
620430ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
62051fe48023SIlya Dryomov 	if (rc < 0) {
62061fe48023SIlya Dryomov 		if (rc == -ENOENT)
62071fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6208602adf40SYehuda Sadeh 		goto err_out_client;
62091fe48023SIlya Dryomov 	}
6210859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6211859c31dfSAlex Elder 
6212d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6213b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6214b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6215bd4ba655SAlex Elder 		goto err_out_client;
6216b51c83c2SIlya Dryomov 	}
6217c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6218c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6219d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6220602adf40SYehuda Sadeh 
62210d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
62220d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
62230d6d1e9cSMike Christie 		rc = -ENOMEM;
62240d6d1e9cSMike Christie 		goto err_out_rbd_dev;
62250d6d1e9cSMike Christie 	}
62260d6d1e9cSMike Christie 
6227811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
62286d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
62290d6d1e9cSMike Christie 	if (rc < 0) {
62300d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
6231c53d5893SAlex Elder 		goto err_out_rbd_dev;
62320d6d1e9cSMike Christie 	}
623305fd6f6fSAlex Elder 
62347ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
62357ce4eef7SAlex Elder 
6236d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
62377ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
62387ce4eef7SAlex Elder 		read_only = true;
62397ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
62407ce4eef7SAlex Elder 
6241b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
62423abef3b3SAlex Elder 	if (rc) {
6243e37180c0SIlya Dryomov 		/*
624499d16943SIlya Dryomov 		 * rbd_unregister_watch() can't be moved into
6245e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
6246e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
6247e37180c0SIlya Dryomov 		 */
624899d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
62493abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
6250dd5ac32dSIlya Dryomov 		goto out;
62513abef3b3SAlex Elder 	}
62523abef3b3SAlex Elder 
6253dd5ac32dSIlya Dryomov 	rc = count;
6254dd5ac32dSIlya Dryomov out:
6255dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6256dd5ac32dSIlya Dryomov 	return rc;
6257b536f69aSAlex Elder 
6258c53d5893SAlex Elder err_out_rbd_dev:
6259c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6260bd4ba655SAlex Elder err_out_client:
62619d3997fdSAlex Elder 	rbd_put_client(rbdc);
62620ddebc0cSAlex Elder err_out_args:
6263859c31dfSAlex Elder 	rbd_spec_put(spec);
6264d147543dSIlya Dryomov 	kfree(rbd_opts);
6265dd5ac32dSIlya Dryomov 	goto out;
6266602adf40SYehuda Sadeh }
6267602adf40SYehuda Sadeh 
62689b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
62699b60e70bSIlya Dryomov 		       const char *buf,
62709b60e70bSIlya Dryomov 		       size_t count)
62719b60e70bSIlya Dryomov {
62729b60e70bSIlya Dryomov 	if (single_major)
62739b60e70bSIlya Dryomov 		return -EINVAL;
62749b60e70bSIlya Dryomov 
62759b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
62769b60e70bSIlya Dryomov }
62779b60e70bSIlya Dryomov 
/*
 * Bus attribute store handler that forwards to do_rbd_add() without
 * looking at single_major.  Returns whatever do_rbd_add() returns.
 */
static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	ssize_t ret = do_rbd_add(bus, buf, count);

	return ret;
}
62849b60e70bSIlya Dryomov 
6285dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6286602adf40SYehuda Sadeh {
6287602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
62881643dfa4SIlya Dryomov 
62891643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
62901643dfa4SIlya Dryomov 	list_del_init(&rbd_dev->node);
62911643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
62921643dfa4SIlya Dryomov 
6293200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6294dd5ac32dSIlya Dryomov 	device_del(&rbd_dev->dev);
62956d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
62969b60e70bSIlya Dryomov 	if (!single_major)
6297602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6298602adf40SYehuda Sadeh }
6299602adf40SYehuda Sadeh 
630005a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
630105a46afdSAlex Elder {
6302ad945fc1SAlex Elder 	while (rbd_dev->parent) {
630305a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
630405a46afdSAlex Elder 		struct rbd_device *second = first->parent;
630505a46afdSAlex Elder 		struct rbd_device *third;
630605a46afdSAlex Elder 
630705a46afdSAlex Elder 		/*
630805a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
630905a46afdSAlex Elder 		 * remove it.
631005a46afdSAlex Elder 		 */
631105a46afdSAlex Elder 		while (second && (third = second->parent)) {
631205a46afdSAlex Elder 			first = second;
631305a46afdSAlex Elder 			second = third;
631405a46afdSAlex Elder 		}
6315ad945fc1SAlex Elder 		rbd_assert(second);
63168ad42cd0SAlex Elder 		rbd_dev_image_release(second);
6317ad945fc1SAlex Elder 		first->parent = NULL;
6318ad945fc1SAlex Elder 		first->parent_overlap = 0;
6319ad945fc1SAlex Elder 
6320ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
632105a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
632205a46afdSAlex Elder 		first->parent_spec = NULL;
632305a46afdSAlex Elder 	}
632405a46afdSAlex Elder }
632505a46afdSAlex Elder 
63269b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6327602adf40SYehuda Sadeh 			     const char *buf,
6328602adf40SYehuda Sadeh 			     size_t count)
6329602adf40SYehuda Sadeh {
6330602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6331751cc0e3SAlex Elder 	struct list_head *tmp;
6332751cc0e3SAlex Elder 	int dev_id;
63330276dca6SMike Christie 	char opt_buf[6];
633482a442d2SAlex Elder 	bool already = false;
63350276dca6SMike Christie 	bool force = false;
63360d8189e1SAlex Elder 	int ret;
6337602adf40SYehuda Sadeh 
63380276dca6SMike Christie 	dev_id = -1;
63390276dca6SMike Christie 	opt_buf[0] = '\0';
63400276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
63410276dca6SMike Christie 	if (dev_id < 0) {
63420276dca6SMike Christie 		pr_err("dev_id out of range\n");
6343602adf40SYehuda Sadeh 		return -EINVAL;
63440276dca6SMike Christie 	}
63450276dca6SMike Christie 	if (opt_buf[0] != '\0') {
63460276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
63470276dca6SMike Christie 			force = true;
63480276dca6SMike Christie 		} else {
63490276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
63500276dca6SMike Christie 			return -EINVAL;
63510276dca6SMike Christie 		}
63520276dca6SMike Christie 	}
6353602adf40SYehuda Sadeh 
6354602adf40SYehuda Sadeh 	ret = -ENOENT;
6355751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6356751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6357751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6358751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6359751cc0e3SAlex Elder 			ret = 0;
6360751cc0e3SAlex Elder 			break;
6361602adf40SYehuda Sadeh 		}
6362751cc0e3SAlex Elder 	}
6363751cc0e3SAlex Elder 	if (!ret) {
6364a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
63650276dca6SMike Christie 		if (rbd_dev->open_count && !force)
636642382b70SAlex Elder 			ret = -EBUSY;
6367b82d167bSAlex Elder 		else
636882a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
636982a442d2SAlex Elder 							&rbd_dev->flags);
6370a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6371751cc0e3SAlex Elder 	}
6372751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
637382a442d2SAlex Elder 	if (ret < 0 || already)
63741ba0f1e7SAlex Elder 		return ret;
6375751cc0e3SAlex Elder 
63760276dca6SMike Christie 	if (force) {
63770276dca6SMike Christie 		/*
63780276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
63790276dca6SMike Christie 		 * IO to complete/fail.
63800276dca6SMike Christie 		 */
63810276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
63820276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
63830276dca6SMike Christie 	}
63840276dca6SMike Christie 
6385ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6386ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6387ed95b21aSIlya Dryomov 		rbd_unlock(rbd_dev);
6388ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
638999d16943SIlya Dryomov 	rbd_unregister_watch(rbd_dev);
6390fca27065SIlya Dryomov 
63919875201eSJosh Durgin 	/*
63929875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
63939875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
63949875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
63959875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
63969875201eSJosh Durgin 	 */
6397dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
63988ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
6399aafb230eSAlex Elder 
64001ba0f1e7SAlex Elder 	return count;
6401602adf40SYehuda Sadeh }
6402602adf40SYehuda Sadeh 
64039b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
64049b60e70bSIlya Dryomov 			  const char *buf,
64059b60e70bSIlya Dryomov 			  size_t count)
64069b60e70bSIlya Dryomov {
64079b60e70bSIlya Dryomov 	if (single_major)
64089b60e70bSIlya Dryomov 		return -EINVAL;
64099b60e70bSIlya Dryomov 
64109b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
64119b60e70bSIlya Dryomov }
64129b60e70bSIlya Dryomov 
/*
 * Bus attribute store handler that forwards to do_rbd_remove() without
 * looking at single_major.  Returns whatever do_rbd_remove() returns.
 */
static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	ssize_t ret = do_rbd_remove(bus, buf, count);

	return ret;
}
64199b60e70bSIlya Dryomov 
6420602adf40SYehuda Sadeh /*
6421602adf40SYehuda Sadeh  * create control files in sysfs
6422dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6423602adf40SYehuda Sadeh  */
6424602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
6425602adf40SYehuda Sadeh {
6426dfc5606dSYehuda Sadeh 	int ret;
6427602adf40SYehuda Sadeh 
6428fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6429dfc5606dSYehuda Sadeh 	if (ret < 0)
6430dfc5606dSYehuda Sadeh 		return ret;
6431602adf40SYehuda Sadeh 
6432fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6433fed4c143SAlex Elder 	if (ret < 0)
6434fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6435602adf40SYehuda Sadeh 
6436602adf40SYehuda Sadeh 	return ret;
6437602adf40SYehuda Sadeh }
6438602adf40SYehuda Sadeh 
6439602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
6440602adf40SYehuda Sadeh {
6441dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6442fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6443602adf40SYehuda Sadeh }
6444602adf40SYehuda Sadeh 
64451c2a9dfeSAlex Elder static int rbd_slab_init(void)
64461c2a9dfeSAlex Elder {
64471c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
644803d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6449868311b1SAlex Elder 	if (!rbd_img_request_cache)
6450868311b1SAlex Elder 		return -ENOMEM;
6451868311b1SAlex Elder 
6452868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
645303d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
645478c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
645578c2a44aSAlex Elder 		goto out_err;
645678c2a44aSAlex Elder 
645778c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
645878c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
64592d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
646078c2a44aSAlex Elder 	if (rbd_segment_name_cache)
64611c2a9dfeSAlex Elder 		return 0;
646278c2a44aSAlex Elder out_err:
646378c2a44aSAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
646478c2a44aSAlex Elder 	rbd_obj_request_cache = NULL;
64651c2a9dfeSAlex Elder 
6466868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6467868311b1SAlex Elder 	rbd_img_request_cache = NULL;
6468868311b1SAlex Elder 
64691c2a9dfeSAlex Elder 	return -ENOMEM;
64701c2a9dfeSAlex Elder }
64711c2a9dfeSAlex Elder 
64721c2a9dfeSAlex Elder static void rbd_slab_exit(void)
64731c2a9dfeSAlex Elder {
647478c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
647578c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
647678c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
647778c2a44aSAlex Elder 
6478868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6479868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6480868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6481868311b1SAlex Elder 
64821c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
64831c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
64841c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
64851c2a9dfeSAlex Elder }
64861c2a9dfeSAlex Elder 
6487cc344fa1SAlex Elder static int __init rbd_init(void)
6488602adf40SYehuda Sadeh {
6489602adf40SYehuda Sadeh 	int rc;
6490602adf40SYehuda Sadeh 
64911e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
64921e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
64931e32d34cSAlex Elder 		return -EINVAL;
64941e32d34cSAlex Elder 	}
6495e1b4d96dSIlya Dryomov 
64961c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6497602adf40SYehuda Sadeh 	if (rc)
6498602adf40SYehuda Sadeh 		return rc;
6499e1b4d96dSIlya Dryomov 
6500f5ee37bdSIlya Dryomov 	/*
6501f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6502f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6503f5ee37bdSIlya Dryomov 	 */
6504f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6505f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6506f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6507f5ee37bdSIlya Dryomov 		goto err_out_slab;
6508f5ee37bdSIlya Dryomov 	}
6509f5ee37bdSIlya Dryomov 
65109b60e70bSIlya Dryomov 	if (single_major) {
65119b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
65129b60e70bSIlya Dryomov 		if (rbd_major < 0) {
65139b60e70bSIlya Dryomov 			rc = rbd_major;
6514f5ee37bdSIlya Dryomov 			goto err_out_wq;
65159b60e70bSIlya Dryomov 		}
65169b60e70bSIlya Dryomov 	}
65179b60e70bSIlya Dryomov 
65181c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
65191c2a9dfeSAlex Elder 	if (rc)
65209b60e70bSIlya Dryomov 		goto err_out_blkdev;
65211c2a9dfeSAlex Elder 
65229b60e70bSIlya Dryomov 	if (single_major)
65239b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
65249b60e70bSIlya Dryomov 	else
6525e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
65269b60e70bSIlya Dryomov 
6527e1b4d96dSIlya Dryomov 	return 0;
6528e1b4d96dSIlya Dryomov 
65299b60e70bSIlya Dryomov err_out_blkdev:
65309b60e70bSIlya Dryomov 	if (single_major)
65319b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6532f5ee37bdSIlya Dryomov err_out_wq:
6533f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6534e1b4d96dSIlya Dryomov err_out_slab:
6535e1b4d96dSIlya Dryomov 	rbd_slab_exit();
65361c2a9dfeSAlex Elder 	return rc;
6537602adf40SYehuda Sadeh }
6538602adf40SYehuda Sadeh 
6539cc344fa1SAlex Elder static void __exit rbd_exit(void)
6540602adf40SYehuda Sadeh {
6541ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6542602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
65439b60e70bSIlya Dryomov 	if (single_major)
65449b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6545f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
65461c2a9dfeSAlex Elder 	rbd_slab_exit();
6547602adf40SYehuda Sadeh }
6548602adf40SYehuda Sadeh 
6549602adf40SYehuda Sadeh module_init(rbd_init);
6550602adf40SYehuda Sadeh module_exit(rbd_exit);
6551602adf40SYehuda Sadeh 
6552d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6553602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6554602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6555602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6556602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6557602adf40SYehuda Sadeh 
655890da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6559602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6560