xref: /openbmc/linux/drivers/block/rbd.c (revision ed95b21a)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
35602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3659c2be1eSYehuda Sadeh #include <linux/parser.h>
3730d1cff8SAlex Elder #include <linux/bsearch.h>
38602adf40SYehuda Sadeh 
39602adf40SYehuda Sadeh #include <linux/kernel.h>
40602adf40SYehuda Sadeh #include <linux/device.h>
41602adf40SYehuda Sadeh #include <linux/module.h>
427ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
43602adf40SYehuda Sadeh #include <linux/fs.h>
44602adf40SYehuda Sadeh #include <linux/blkdev.h>
451c2a9dfeSAlex Elder #include <linux/slab.h>
46f8a22fc2SIlya Dryomov #include <linux/idr.h>
47bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
48602adf40SYehuda Sadeh 
49602adf40SYehuda Sadeh #include "rbd_types.h"
50602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG turns the rbd_assert() checks on */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Linux uses the notion in several
 * places (blk, bio, genhd) and the default size is 512 bytes in all of
 * them.  The names below merely read better than the raw numbers.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
61593a9e7bSAlex Elder 
62a2acd00eSAlex Elder /*
63a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
64a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
65a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
66a2acd00eSAlex Elder  * -EINVAL without updating it.
67a2acd00eSAlex Elder  */
68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
69a2acd00eSAlex Elder {
70a2acd00eSAlex Elder 	unsigned int counter;
71a2acd00eSAlex Elder 
72a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
73a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
74a2acd00eSAlex Elder 		return (int)counter;
75a2acd00eSAlex Elder 
76a2acd00eSAlex Elder 	atomic_dec(v);
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	return -EINVAL;
79a2acd00eSAlex Elder }
80a2acd00eSAlex Elder 
81a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
83a2acd00eSAlex Elder {
84a2acd00eSAlex Elder 	int counter;
85a2acd00eSAlex Elder 
86a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
87a2acd00eSAlex Elder 	if (counter >= 0)
88a2acd00eSAlex Elder 		return counter;
89a2acd00eSAlex Elder 
90a2acd00eSAlex Elder 	atomic_inc(v);
91a2acd00eSAlex Elder 
92a2acd00eSAlex Elder 	return -EINVAL;
93a2acd00eSAlex Elder }
94a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* a device id is shifted by this much to form its first minor number */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* Snapshot names are limited so that prefix + name still fits NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* Sized so that an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1<<0)
#define RBD_FEATURE_STRIPINGV2		(1<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1<<2)
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK)

/* The subset of features this (client software) implementation handles */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * rbd devices are named "rbd#": "rbd" is RBD_DRV_NAME from above and
 * # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
139602adf40SYehuda Sadeh 
140602adf40SYehuda Sadeh /*
141602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
142602adf40SYehuda Sadeh  */
143602adf40SYehuda Sadeh struct rbd_image_header {
144f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
145849b4260SAlex Elder 	char *object_prefix;
146602adf40SYehuda Sadeh 	__u8 obj_order;
147602adf40SYehuda Sadeh 	__u8 crypt_type;
148602adf40SYehuda Sadeh 	__u8 comp_type;
149f35a4deeSAlex Elder 	u64 stripe_unit;
150f35a4deeSAlex Elder 	u64 stripe_count;
151f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
152602adf40SYehuda Sadeh 
153f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
154f84344f3SAlex Elder 	u64 image_size;
155f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
156f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
157f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15859c2be1eSYehuda Sadeh };
15959c2be1eSYehuda Sadeh 
1600d7dbfceSAlex Elder /*
1610d7dbfceSAlex Elder  * An rbd image specification.
1620d7dbfceSAlex Elder  *
1630d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
164c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
165c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
166c66c6e0cSAlex Elder  *
167c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
168c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
169c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
170c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
171c66c6e0cSAlex Elder  *
172c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
173c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
174c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
175c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
176c66c6e0cSAlex Elder  * is shared between the parent and child).
177c66c6e0cSAlex Elder  *
178c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
179c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
180c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
181c66c6e0cSAlex Elder  *
182c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
183c66c6e0cSAlex Elder  * could be a null pointer).
1840d7dbfceSAlex Elder  */
1850d7dbfceSAlex Elder struct rbd_spec {
1860d7dbfceSAlex Elder 	u64		pool_id;
187ecb4dc22SAlex Elder 	const char	*pool_name;
1880d7dbfceSAlex Elder 
189ecb4dc22SAlex Elder 	const char	*image_id;
190ecb4dc22SAlex Elder 	const char	*image_name;
1910d7dbfceSAlex Elder 
1920d7dbfceSAlex Elder 	u64		snap_id;
193ecb4dc22SAlex Elder 	const char	*snap_name;
1940d7dbfceSAlex Elder 
1950d7dbfceSAlex Elder 	struct kref	kref;
1960d7dbfceSAlex Elder };
1970d7dbfceSAlex Elder 
198602adf40SYehuda Sadeh /*
199f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
200602adf40SYehuda Sadeh  */
201602adf40SYehuda Sadeh struct rbd_client {
202602adf40SYehuda Sadeh 	struct ceph_client	*client;
203602adf40SYehuda Sadeh 	struct kref		kref;
204602adf40SYehuda Sadeh 	struct list_head	node;
205602adf40SYehuda Sadeh };
206602adf40SYehuda Sadeh 
/* Request structures are defined further down; callbacks only need pointers */
struct rbd_img_request;
struct rbd_obj_request;

typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
214bf0d5f50SAlex Elder 
/* Selects what kind of data, if any, an object request carries */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
218bf0d5f50SAlex Elder 
/* The operation an I/O request asks for */
enum obj_operation_type {
	OBJ_OP_WRITE = 0,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};
2246d2940c8SGuangliang Zhao 
/*
 * Flags of an object request.
 * NOTE(review): presumably bit numbers within rbd_obj_request->flags;
 * the code using them is not in view here.
 */
enum obj_req_flags {
	OBJ_REQ_DONE = 0,	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
231926f9b3fSAlex Elder 
232bf0d5f50SAlex Elder struct rbd_obj_request {
233bf0d5f50SAlex Elder 	const char		*object_name;
234bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
235bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
236926f9b3fSAlex Elder 	unsigned long		flags;
237bf0d5f50SAlex Elder 
238c5b5ef6cSAlex Elder 	/*
239c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
240c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
241c5b5ef6cSAlex Elder 	 *
242c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
243c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
244c5b5ef6cSAlex Elder 	 *
245c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
246c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
247c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
248c5b5ef6cSAlex Elder 	 *
249c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
250c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
251c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
252c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
253c5b5ef6cSAlex Elder 	 */
254c5b5ef6cSAlex Elder 	union {
255c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
256c5b5ef6cSAlex Elder 		struct {
257bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
258c5b5ef6cSAlex Elder 			u64			img_offset;
259c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
260c5b5ef6cSAlex Elder 			struct list_head	links;
261c5b5ef6cSAlex Elder 		};
262c5b5ef6cSAlex Elder 	};
263bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	enum obj_request_type	type;
266788e2df3SAlex Elder 	union {
267bf0d5f50SAlex Elder 		struct bio	*bio_list;
268788e2df3SAlex Elder 		struct {
269788e2df3SAlex Elder 			struct page	**pages;
270788e2df3SAlex Elder 			u32		page_count;
271788e2df3SAlex Elder 		};
272788e2df3SAlex Elder 	};
2730eefd470SAlex Elder 	struct page		**copyup_pages;
274ebda6408SAlex Elder 	u32			copyup_page_count;
275bf0d5f50SAlex Elder 
276bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
277bf0d5f50SAlex Elder 
278bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2791b83bef2SSage Weil 	int			result;
280bf0d5f50SAlex Elder 
281bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
282788e2df3SAlex Elder 	struct completion	completion;
283bf0d5f50SAlex Elder 
284bf0d5f50SAlex Elder 	struct kref		kref;
285bf0d5f50SAlex Elder };
286bf0d5f50SAlex Elder 
/*
 * Flags of an image request.
 * NOTE(review): presumably bit numbers within rbd_img_request->flags;
 * the code using them is not in view here.
 */
enum img_req_flags {
	IMG_REQ_WRITE = 0,	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};
2930c425248SAlex Elder 
294bf0d5f50SAlex Elder struct rbd_img_request {
295bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
296bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
297bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2980c425248SAlex Elder 	unsigned long		flags;
299bf0d5f50SAlex Elder 	union {
300bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
3019849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
3029849e986SAlex Elder 	};
3039849e986SAlex Elder 	union {
3049849e986SAlex Elder 		struct request		*rq;		/* block request */
3059849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
306bf0d5f50SAlex Elder 	};
3073d7efd18SAlex Elder 	struct page		**copyup_pages;
308ebda6408SAlex Elder 	u32			copyup_page_count;
309bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
310bf0d5f50SAlex Elder 	u32			next_completion;
311bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
31255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
313a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
314bf0d5f50SAlex Elder 
315bf0d5f50SAlex Elder 	u32			obj_request_count;
316bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
317bf0d5f50SAlex Elder 
318bf0d5f50SAlex Elder 	struct kref		kref;
319bf0d5f50SAlex Elder };
320bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests of an image request, linked
 * through rbd_obj_request->links.
 *
 * for_each_obj_request()      - walk the whole list
 * for_each_obj_request_from() - continue from the current @oreq
 * for_each_obj_request_safe() - walk in REVERSE order; @n holds the
 *                               next entry so @oreq may be unlinked
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
327bf0d5f50SAlex Elder 
/* Values taken by rbd_device->watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED = 0,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
33399d16943SIlya Dryomov 
/* Values taken by rbd_device->lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED = 0,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
339ed95b21aSIlya Dryomov 
340ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
341ed95b21aSIlya Dryomov struct rbd_client_id {
342ed95b21aSIlya Dryomov 	u64 gid;
343ed95b21aSIlya Dryomov 	u64 handle;
344ed95b21aSIlya Dryomov };
345ed95b21aSIlya Dryomov 
346f84344f3SAlex Elder struct rbd_mapping {
34799c1f08fSAlex Elder 	u64                     size;
34834b13184SAlex Elder 	u64                     features;
349f84344f3SAlex Elder 	bool			read_only;
350f84344f3SAlex Elder };
351f84344f3SAlex Elder 
352602adf40SYehuda Sadeh /*
353602adf40SYehuda Sadeh  * a single device
354602adf40SYehuda Sadeh  */
355602adf40SYehuda Sadeh struct rbd_device {
356de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
359dd82fff1SIlya Dryomov 	int			minor;
360602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
361602adf40SYehuda Sadeh 
362a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
363602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
364602adf40SYehuda Sadeh 
365602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
366602adf40SYehuda Sadeh 
367b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
368602adf40SYehuda Sadeh 
369602adf40SYehuda Sadeh 	struct rbd_image_header	header;
370b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3710d7dbfceSAlex Elder 	struct rbd_spec		*spec;
372d147543dSIlya Dryomov 	struct rbd_options	*opts;
373602adf40SYehuda Sadeh 
374c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
375922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
376971f839aSAlex Elder 
3771643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3780903e875SAlex Elder 
37999d16943SIlya Dryomov 	struct mutex		watch_mutex;
38099d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
381922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
38299d16943SIlya Dryomov 	u64			watch_cookie;
38399d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
38459c2be1eSYehuda Sadeh 
385ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
386ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
387ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
388ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
389ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
390ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
391ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
392ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
393ed95b21aSIlya Dryomov 
3941643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
3951643dfa4SIlya Dryomov 
39686b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
39786b00e0dSAlex Elder 	u64			parent_overlap;
398a2acd00eSAlex Elder 	atomic_t		parent_ref;
3992f82ee54SAlex Elder 	struct rbd_device	*parent;
40086b00e0dSAlex Elder 
4017ad18afaSChristoph Hellwig 	/* Block layer tags. */
4027ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
4037ad18afaSChristoph Hellwig 
404c666601aSJosh Durgin 	/* protects updating the header */
405c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
406f84344f3SAlex Elder 
407f84344f3SAlex Elder 	struct rbd_mapping	mapping;
408602adf40SYehuda Sadeh 
409602adf40SYehuda Sadeh 	struct list_head	node;
410dfc5606dSYehuda Sadeh 
411dfc5606dSYehuda Sadeh 	/* sysfs related */
412dfc5606dSYehuda Sadeh 	struct device		dev;
413b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
414dfc5606dSYehuda Sadeh };
415dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomicity is needed, access
 * is protected by rbd_dev->lock.
 *
 * At present only the "removing" flag, which goes hand in hand with
 * the "open_count" field, needs atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS = 0,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,		/* this mapping is being removed */
};
4276d292906SAlex Elder 
428cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
429e124a82fSAlex Elder 
430602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
431e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
432e124a82fSAlex Elder 
433602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
434432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
435602adf40SYehuda Sadeh 
43678c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
43778c2a44aSAlex Elder 
4381c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
439868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
44078c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
4411c2a9dfeSAlex Elder 
4429b60e70bSIlya Dryomov static int rbd_major;
443f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
444f8a22fc2SIlya Dryomov 
445f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
446f5ee37bdSIlya Dryomov 
4479b60e70bSIlya Dryomov /*
4489b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
4499b60e70bSIlya Dryomov  * userspace rbd utility.
4509b60e70bSIlya Dryomov  */
4519b60e70bSIlya Dryomov static bool single_major = false;
4529b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4539b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4549b60e70bSIlya Dryomov 
4553d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4563d7efd18SAlex Elder 
457f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
458f0f8cef5SAlex Elder 		       size_t count);
459f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
460f0f8cef5SAlex Elder 			  size_t count);
4619b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4629b60e70bSIlya Dryomov 				    size_t count);
4639b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4649b60e70bSIlya Dryomov 				       size_t count);
4656d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
466a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
467f0f8cef5SAlex Elder 
4689b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4699b60e70bSIlya Dryomov {
4707e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4719b60e70bSIlya Dryomov }
4729b60e70bSIlya Dryomov 
4739b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4749b60e70bSIlya Dryomov {
4757e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4769b60e70bSIlya Dryomov }
4779b60e70bSIlya Dryomov 
478ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
479ed95b21aSIlya Dryomov {
480ed95b21aSIlya Dryomov 	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
481ed95b21aSIlya Dryomov 	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
482ed95b21aSIlya Dryomov 	       !rbd_dev->mapping.read_only;
483ed95b21aSIlya Dryomov }
484ed95b21aSIlya Dryomov 
485ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
486ed95b21aSIlya Dryomov {
487ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
488ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
489ed95b21aSIlya Dryomov }
490ed95b21aSIlya Dryomov 
491ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
492ed95b21aSIlya Dryomov {
493ed95b21aSIlya Dryomov 	bool is_lock_owner;
494ed95b21aSIlya Dryomov 
495ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
496ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
497ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
498ed95b21aSIlya Dryomov 	return is_lock_owner;
499ed95b21aSIlya Dryomov }
500ed95b21aSIlya Dryomov 
501b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
502b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
5039b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
5049b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
505b15a21ddSGreg Kroah-Hartman 
506b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
507b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
508b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
5099b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
5109b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
511b15a21ddSGreg Kroah-Hartman 	NULL,
512f0f8cef5SAlex Elder };
51392c76dc0SIlya Dryomov 
51492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
51592c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
51692c76dc0SIlya Dryomov {
5179b60e70bSIlya Dryomov 	if (!single_major &&
5189b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5199b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5209b60e70bSIlya Dryomov 		return 0;
5219b60e70bSIlya Dryomov 
52292c76dc0SIlya Dryomov 	return attr->mode;
52392c76dc0SIlya Dryomov }
52492c76dc0SIlya Dryomov 
52592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
52692c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
52792c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
52892c76dc0SIlya Dryomov };
52992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
530f0f8cef5SAlex Elder 
531f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
532f0f8cef5SAlex Elder 	.name		= "rbd",
533b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
534f0f8cef5SAlex Elder };
535f0f8cef5SAlex Elder 
536f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
537f0f8cef5SAlex Elder {
538f0f8cef5SAlex Elder }
539f0f8cef5SAlex Elder 
540f0f8cef5SAlex Elder static struct device rbd_root_dev = {
541f0f8cef5SAlex Elder 	.init_name =    "rbd",
542f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
543f0f8cef5SAlex Elder };
544f0f8cef5SAlex Elder 
54506ecc6cbSAlex Elder static __printf(2, 3)
54606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
54706ecc6cbSAlex Elder {
54806ecc6cbSAlex Elder 	struct va_format vaf;
54906ecc6cbSAlex Elder 	va_list args;
55006ecc6cbSAlex Elder 
55106ecc6cbSAlex Elder 	va_start(args, fmt);
55206ecc6cbSAlex Elder 	vaf.fmt = fmt;
55306ecc6cbSAlex Elder 	vaf.va = &args;
55406ecc6cbSAlex Elder 
55506ecc6cbSAlex Elder 	if (!rbd_dev)
55606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
55706ecc6cbSAlex Elder 	else if (rbd_dev->disk)
55806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
55906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
56006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
56106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
56206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
56306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
56406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
56506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
56606ecc6cbSAlex Elder 	else	/* punt */
56706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
56806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
56906ecc6cbSAlex Elder 	va_end(args);
57006ecc6cbSAlex Elder }
57106ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, log the failed expression
 * together with the function name and line number and then BUG() if
 * @expr is false.  Without RBD_DEBUG it expands to a no-op and @expr
 * is NOT evaluated, so @expr must not have side effects.
 *
 * The debug variant is wrapped in do { } while (0) so that the macro
 * behaves as a single statement: it needs its trailing semicolon in
 * both variants and can safely be used as the body of an if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
584dfc5606dSYehuda Sadeh 
5852761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
586b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
58705a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
58805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5898b3e1a56SAlex Elder 
590cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5912df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
592a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
593e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
59454cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
59554cac61fSAlex Elder 					u64 snap_id);
5962ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5972ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5982ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5992ad3d716SAlex Elder 		u64 *snap_features);
60059c2be1eSYehuda Sadeh 
601602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
602602adf40SYehuda Sadeh {
603f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
604b82d167bSAlex Elder 	bool removing = false;
605602adf40SYehuda Sadeh 
606f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
607602adf40SYehuda Sadeh 		return -EROFS;
608602adf40SYehuda Sadeh 
609a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
610b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
611b82d167bSAlex Elder 		removing = true;
612b82d167bSAlex Elder 	else
613b82d167bSAlex Elder 		rbd_dev->open_count++;
614a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
615b82d167bSAlex Elder 	if (removing)
616b82d167bSAlex Elder 		return -ENOENT;
617b82d167bSAlex Elder 
618c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
619340c7a2bSAlex Elder 
620602adf40SYehuda Sadeh 	return 0;
621602adf40SYehuda Sadeh }
622602adf40SYehuda Sadeh 
623db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
624dfc5606dSYehuda Sadeh {
625dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
626b82d167bSAlex Elder 	unsigned long open_count_before;
627b82d167bSAlex Elder 
628a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
629b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
630a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
631b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
632dfc5606dSYehuda Sadeh 
633c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
634dfc5606dSYehuda Sadeh }
635dfc5606dSYehuda Sadeh 
636131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
637131fd9f6SGuangliang Zhao {
63877f33c03SJosh Durgin 	int ret = 0;
639131fd9f6SGuangliang Zhao 	int val;
640131fd9f6SGuangliang Zhao 	bool ro;
64177f33c03SJosh Durgin 	bool ro_changed = false;
642131fd9f6SGuangliang Zhao 
64377f33c03SJosh Durgin 	/* get_user() may sleep, so call it before taking rbd_dev->lock */
644131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
645131fd9f6SGuangliang Zhao 		return -EFAULT;
646131fd9f6SGuangliang Zhao 
647131fd9f6SGuangliang Zhao 	ro = val ? true : false;
648131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
649131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
650131fd9f6SGuangliang Zhao 		return -EROFS;
651131fd9f6SGuangliang Zhao 
65277f33c03SJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
65377f33c03SJosh Durgin 	/* prevent others open this device */
65477f33c03SJosh Durgin 	if (rbd_dev->open_count > 1) {
65577f33c03SJosh Durgin 		ret = -EBUSY;
65677f33c03SJosh Durgin 		goto out;
657131fd9f6SGuangliang Zhao 	}
658131fd9f6SGuangliang Zhao 
65977f33c03SJosh Durgin 	if (rbd_dev->mapping.read_only != ro) {
66077f33c03SJosh Durgin 		rbd_dev->mapping.read_only = ro;
66177f33c03SJosh Durgin 		ro_changed = true;
66277f33c03SJosh Durgin 	}
66377f33c03SJosh Durgin 
66477f33c03SJosh Durgin out:
66577f33c03SJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
66677f33c03SJosh Durgin 	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
66777f33c03SJosh Durgin 	if (ret == 0 && ro_changed)
66877f33c03SJosh Durgin 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
66977f33c03SJosh Durgin 
67077f33c03SJosh Durgin 	return ret;
671131fd9f6SGuangliang Zhao }
672131fd9f6SGuangliang Zhao 
673131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
674131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
675131fd9f6SGuangliang Zhao {
676131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
677131fd9f6SGuangliang Zhao 	int ret = 0;
678131fd9f6SGuangliang Zhao 
679131fd9f6SGuangliang Zhao 	switch (cmd) {
680131fd9f6SGuangliang Zhao 	case BLKROSET:
681131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
682131fd9f6SGuangliang Zhao 		break;
683131fd9f6SGuangliang Zhao 	default:
684131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
685131fd9f6SGuangliang Zhao 	}
686131fd9f6SGuangliang Zhao 
687131fd9f6SGuangliang Zhao 	return ret;
688131fd9f6SGuangliang Zhao }
689131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards unchanged to rbd_ioctl(). */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret;

	ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
697131fd9f6SGuangliang Zhao 
698602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
699602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
700602adf40SYehuda Sadeh 	.open			= rbd_open,
701dfc5606dSYehuda Sadeh 	.release		= rbd_release,
702131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
703131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
704131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
705131fd9f6SGuangliang Zhao #endif
706602adf40SYehuda Sadeh };
707602adf40SYehuda Sadeh 
708602adf40SYehuda Sadeh /*
7097262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
710cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
711602adf40SYehuda Sadeh  */
712f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
713602adf40SYehuda Sadeh {
714602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
715602adf40SYehuda Sadeh 	int ret = -ENOMEM;
716602adf40SYehuda Sadeh 
71737206ee5SAlex Elder 	dout("%s:\n", __func__);
718602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
719602adf40SYehuda Sadeh 	if (!rbdc)
720602adf40SYehuda Sadeh 		goto out_opt;
721602adf40SYehuda Sadeh 
722602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
723602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
724602adf40SYehuda Sadeh 
72543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
726602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
72708f75463SAlex Elder 		goto out_rbdc;
72843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
729602adf40SYehuda Sadeh 
730602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
731602adf40SYehuda Sadeh 	if (ret < 0)
73208f75463SAlex Elder 		goto out_client;
733602adf40SYehuda Sadeh 
734432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
735602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
736432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
737602adf40SYehuda Sadeh 
73837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
739bc534d86SAlex Elder 
740602adf40SYehuda Sadeh 	return rbdc;
74108f75463SAlex Elder out_client:
742602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
74308f75463SAlex Elder out_rbdc:
744602adf40SYehuda Sadeh 	kfree(rbdc);
745602adf40SYehuda Sadeh out_opt:
74643ae4701SAlex Elder 	if (ceph_opts)
74743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
74837206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
74937206ee5SAlex Elder 
75028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
751602adf40SYehuda Sadeh }
752602adf40SYehuda Sadeh 
7532f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7542f82ee54SAlex Elder {
7552f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7562f82ee54SAlex Elder 
7572f82ee54SAlex Elder 	return rbdc;
7582f82ee54SAlex Elder }
7592f82ee54SAlex Elder 
760602adf40SYehuda Sadeh /*
7611f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7621f7ba331SAlex Elder  * found, bump its reference count.
763602adf40SYehuda Sadeh  */
7641f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
765602adf40SYehuda Sadeh {
766602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7671f7ba331SAlex Elder 	bool found = false;
768602adf40SYehuda Sadeh 
76943ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
770602adf40SYehuda Sadeh 		return NULL;
771602adf40SYehuda Sadeh 
7721f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7731f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7741f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7752f82ee54SAlex Elder 			__rbd_get_client(client_node);
7762f82ee54SAlex Elder 
7771f7ba331SAlex Elder 			found = true;
7781f7ba331SAlex Elder 			break;
7791f7ba331SAlex Elder 		}
7801f7ba331SAlex Elder 	}
7811f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7821f7ba331SAlex Elder 
7831f7ba331SAlex Elder 	return found ? client_node : NULL;
784602adf40SYehuda Sadeh }
785602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * Tokens are grouped by argument kind.  parse_rbd_opts_token() relies
 * on the ordering: tokens below Opt_last_int take an int argument,
 * tokens between Opt_last_int and Opt_last_string take a string
 * argument, and the rest take no argument.
 */
enum {
	/* int args */
	Opt_queue_depth,
	Opt_last_int,		/* end of int args */
	/* string args (none at present) */
	Opt_last_string,	/* end of string args */
	/* flags, no args */
	Opt_read_only,
	Opt_read_write,
	Opt_err
};
79959c2be1eSYehuda Sadeh 
80043ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
801b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
80259c2be1eSYehuda Sadeh 	/* int args above */
80359c2be1eSYehuda Sadeh 	/* string args above */
804be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
805cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
806cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
807cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
808210c104cSIlya Dryomov 	{Opt_err, NULL}
80959c2be1eSYehuda Sadeh };
81059c2be1eSYehuda Sadeh 
81198571b5aSAlex Elder struct rbd_options {
812b5584180SIlya Dryomov 	int	queue_depth;
81398571b5aSAlex Elder 	bool	read_only;
81498571b5aSAlex Elder };
81598571b5aSAlex Elder 
816b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
81798571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
81898571b5aSAlex Elder 
81959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
82059c2be1eSYehuda Sadeh {
82143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
82259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
82359c2be1eSYehuda Sadeh 	int token, intval, ret;
82459c2be1eSYehuda Sadeh 
82543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
82659c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
82759c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
82859c2be1eSYehuda Sadeh 		if (ret < 0) {
829210c104cSIlya Dryomov 			pr_err("bad mount option arg (not int) at '%s'\n", c);
83059c2be1eSYehuda Sadeh 			return ret;
83159c2be1eSYehuda Sadeh 		}
83259c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
83359c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
834210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
83559c2be1eSYehuda Sadeh 	} else {
83659c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
83759c2be1eSYehuda Sadeh 	}
83859c2be1eSYehuda Sadeh 
83959c2be1eSYehuda Sadeh 	switch (token) {
840b5584180SIlya Dryomov 	case Opt_queue_depth:
841b5584180SIlya Dryomov 		if (intval < 1) {
842b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
843b5584180SIlya Dryomov 			return -EINVAL;
844b5584180SIlya Dryomov 		}
845b5584180SIlya Dryomov 		rbd_opts->queue_depth = intval;
846b5584180SIlya Dryomov 		break;
847cc0538b6SAlex Elder 	case Opt_read_only:
848cc0538b6SAlex Elder 		rbd_opts->read_only = true;
849cc0538b6SAlex Elder 		break;
850cc0538b6SAlex Elder 	case Opt_read_write:
851cc0538b6SAlex Elder 		rbd_opts->read_only = false;
852cc0538b6SAlex Elder 		break;
85359c2be1eSYehuda Sadeh 	default:
854210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
855210c104cSIlya Dryomov 		return -EINVAL;
85659c2be1eSYehuda Sadeh 	}
857210c104cSIlya Dryomov 
85859c2be1eSYehuda Sadeh 	return 0;
85959c2be1eSYehuda Sadeh }
86059c2be1eSYehuda Sadeh 
8616d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8626d2940c8SGuangliang Zhao {
8636d2940c8SGuangliang Zhao 	switch (op_type) {
8646d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8656d2940c8SGuangliang Zhao 		return "read";
8666d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8676d2940c8SGuangliang Zhao 		return "write";
86890e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
86990e98c52SGuangliang Zhao 		return "discard";
8706d2940c8SGuangliang Zhao 	default:
8716d2940c8SGuangliang Zhao 		return "???";
8726d2940c8SGuangliang Zhao 	}
8736d2940c8SGuangliang Zhao }
8746d2940c8SGuangliang Zhao 
87559c2be1eSYehuda Sadeh /*
876602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
8777262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
8787262cfcaSAlex Elder  * function.
879602adf40SYehuda Sadeh  */
8809d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
881602adf40SYehuda Sadeh {
882f8c38929SAlex Elder 	struct rbd_client *rbdc;
88359c2be1eSYehuda Sadeh 
884cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
8851f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
8869d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
88743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
8889d3997fdSAlex Elder 	else
889f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
890cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
891d720bcb0SAlex Elder 
8929d3997fdSAlex Elder 	return rbdc;
893602adf40SYehuda Sadeh }
894602adf40SYehuda Sadeh 
895602adf40SYehuda Sadeh /*
896602adf40SYehuda Sadeh  * Destroy ceph client
897d23a4b3fSAlex Elder  *
898432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
899602adf40SYehuda Sadeh  */
900602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
901602adf40SYehuda Sadeh {
902602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
903602adf40SYehuda Sadeh 
90437206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
905cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
906602adf40SYehuda Sadeh 	list_del(&rbdc->node);
907cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
908602adf40SYehuda Sadeh 
909602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
910602adf40SYehuda Sadeh 	kfree(rbdc);
911602adf40SYehuda Sadeh }
912602adf40SYehuda Sadeh 
913602adf40SYehuda Sadeh /*
914602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
915602adf40SYehuda Sadeh  * it.
916602adf40SYehuda Sadeh  */
9179d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
918602adf40SYehuda Sadeh {
919c53d5893SAlex Elder 	if (rbdc)
9209d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
921602adf40SYehuda Sadeh }
922602adf40SYehuda Sadeh 
923a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
924a30b71b9SAlex Elder {
925a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
926a30b71b9SAlex Elder }
927a30b71b9SAlex Elder 
9288e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9298e94af8eSAlex Elder {
930103a150fSAlex Elder 	size_t size;
931103a150fSAlex Elder 	u32 snap_count;
932103a150fSAlex Elder 
933103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
934103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
935103a150fSAlex Elder 		return false;
936103a150fSAlex Elder 
937db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
938db2388b6SAlex Elder 
939db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
940db2388b6SAlex Elder 		return false;
941db2388b6SAlex Elder 
942db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
943db2388b6SAlex Elder 
944db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
945db2388b6SAlex Elder 		return false;
946db2388b6SAlex Elder 
947103a150fSAlex Elder 	/*
948103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
949103a150fSAlex Elder 	 * that limits the number of snapshots.
950103a150fSAlex Elder 	 */
951103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
952103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
953103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
954103a150fSAlex Elder 		return false;
955103a150fSAlex Elder 
956103a150fSAlex Elder 	/*
957103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
958103a150fSAlex Elder 	 * header must also be representable in a size_t.
959103a150fSAlex Elder 	 */
960103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
961103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
962103a150fSAlex Elder 		return false;
963103a150fSAlex Elder 
964103a150fSAlex Elder 	return true;
9658e94af8eSAlex Elder }
9668e94af8eSAlex Elder 
967602adf40SYehuda Sadeh /*
968bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
969bb23e37aSAlex Elder  * on-disk header.
970602adf40SYehuda Sadeh  */
971662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
9724156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
973602adf40SYehuda Sadeh {
974662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
975bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
976bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
977bb23e37aSAlex Elder 	char *object_prefix = NULL;
978bb23e37aSAlex Elder 	char *snap_names = NULL;
979bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
980ccece235SAlex Elder 	u32 snap_count;
981d2bb24e5SAlex Elder 	size_t size;
982bb23e37aSAlex Elder 	int ret = -ENOMEM;
983621901d6SAlex Elder 	u32 i;
984602adf40SYehuda Sadeh 
985bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
986103a150fSAlex Elder 
987bb23e37aSAlex Elder 	if (first_time) {
988bb23e37aSAlex Elder 		size_t len;
989bb23e37aSAlex Elder 
990bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
991bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
992bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
993bb23e37aSAlex Elder 		if (!object_prefix)
994602adf40SYehuda Sadeh 			return -ENOMEM;
995bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
996bb23e37aSAlex Elder 		object_prefix[len] = '\0';
997bb23e37aSAlex Elder 	}
99800f1f36fSAlex Elder 
999bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1000d2bb24e5SAlex Elder 
1001602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1002bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1003bb23e37aSAlex Elder 	if (!snapc)
1004bb23e37aSAlex Elder 		goto out_err;
1005bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1006602adf40SYehuda Sadeh 	if (snap_count) {
1007bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1008f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1009f785cc1dSAlex Elder 
1010bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1011621901d6SAlex Elder 
1012f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1013bb23e37aSAlex Elder 			goto out_2big;
1014bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1015bb23e37aSAlex Elder 		if (!snap_names)
1016602adf40SYehuda Sadeh 			goto out_err;
1017bb23e37aSAlex Elder 
1018bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
1019bb23e37aSAlex Elder 
1020bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
1021bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
1022bb23e37aSAlex Elder 		if (!snap_sizes)
1023bb23e37aSAlex Elder 			goto out_err;
1024bb23e37aSAlex Elder 
1025f785cc1dSAlex Elder 		/*
1026bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1027bb23e37aSAlex Elder 		 * and size.
1028bb23e37aSAlex Elder 		 *
102999a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1030bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1031f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1032f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1033f785cc1dSAlex Elder 		 */
1034bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1035bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1036bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1037bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1038bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1039bb23e37aSAlex Elder 		}
1040602adf40SYehuda Sadeh 	}
1041849b4260SAlex Elder 
1042bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1043bb23e37aSAlex Elder 
1044bb23e37aSAlex Elder 	if (first_time) {
1045bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1046602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1047602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
1048602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
1049bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
1050bb23e37aSAlex Elder 		header->stripe_unit = 0;
1051bb23e37aSAlex Elder 		header->stripe_count = 0;
1052bb23e37aSAlex Elder 		header->features = 0;
1053662518b1SAlex Elder 	} else {
1054662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1055662518b1SAlex Elder 		kfree(header->snap_names);
1056662518b1SAlex Elder 		kfree(header->snap_sizes);
1057bb23e37aSAlex Elder 	}
10586a52325fSAlex Elder 
1059bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1060621901d6SAlex Elder 
1061f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1062bb23e37aSAlex Elder 	header->snapc = snapc;
1063bb23e37aSAlex Elder 	header->snap_names = snap_names;
1064bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1065468521c1SAlex Elder 
1066602adf40SYehuda Sadeh 	return 0;
1067bb23e37aSAlex Elder out_2big:
1068bb23e37aSAlex Elder 	ret = -EIO;
10696a52325fSAlex Elder out_err:
1070bb23e37aSAlex Elder 	kfree(snap_sizes);
1071bb23e37aSAlex Elder 	kfree(snap_names);
1072bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1073bb23e37aSAlex Elder 	kfree(object_prefix);
1074ccece235SAlex Elder 
1075bb23e37aSAlex Elder 	return ret;
1076602adf40SYehuda Sadeh }
1077602adf40SYehuda Sadeh 
10789682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
10799682fc6dSAlex Elder {
10809682fc6dSAlex Elder 	const char *snap_name;
10819682fc6dSAlex Elder 
10829682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
10839682fc6dSAlex Elder 
10849682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
10859682fc6dSAlex Elder 
10869682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
10879682fc6dSAlex Elder 	while (which--)
10889682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
10899682fc6dSAlex Elder 
10909682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
10919682fc6dSAlex Elder }
10929682fc6dSAlex Elder 
109330d1cff8SAlex Elder /*
109430d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
109530d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
109630d1cff8SAlex Elder  */
109730d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
109830d1cff8SAlex Elder {
109930d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
110030d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
110130d1cff8SAlex Elder 
110230d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
110330d1cff8SAlex Elder 		return 1;
110430d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
110530d1cff8SAlex Elder }
110630d1cff8SAlex Elder 
110730d1cff8SAlex Elder /*
110830d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
110930d1cff8SAlex Elder  * present.
111030d1cff8SAlex Elder  *
111130d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
111230d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
111330d1cff8SAlex Elder  *
111430d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
111530d1cff8SAlex Elder  * reverse order, highest snapshot id first.
111630d1cff8SAlex Elder  */
11179682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11189682fc6dSAlex Elder {
11199682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
112030d1cff8SAlex Elder 	u64 *found;
11219682fc6dSAlex Elder 
112230d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
112330d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11249682fc6dSAlex Elder 
112530d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11269682fc6dSAlex Elder }
11279682fc6dSAlex Elder 
11282ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11292ad3d716SAlex Elder 					u64 snap_id)
113054cac61fSAlex Elder {
113154cac61fSAlex Elder 	u32 which;
1132da6a6b63SJosh Durgin 	const char *snap_name;
113354cac61fSAlex Elder 
113454cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
113554cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1136da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
113754cac61fSAlex Elder 
1138da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1139da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
114054cac61fSAlex Elder }
114154cac61fSAlex Elder 
11429e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11439e15b77dSAlex Elder {
11449e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11459e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11469e15b77dSAlex Elder 
114754cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
114854cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
114954cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
11509e15b77dSAlex Elder 
115154cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
11529e15b77dSAlex Elder }
11539e15b77dSAlex Elder 
11542ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
11552ad3d716SAlex Elder 				u64 *snap_size)
1156602adf40SYehuda Sadeh {
11572ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11582ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11592ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
11602ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11612ad3d716SAlex Elder 		u32 which;
116200f1f36fSAlex Elder 
11632ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
11642ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
11652ad3d716SAlex Elder 			return -ENOENT;
116600f1f36fSAlex Elder 
11672ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
11682ad3d716SAlex Elder 	} else {
11692ad3d716SAlex Elder 		u64 size = 0;
11702ad3d716SAlex Elder 		int ret;
11712ad3d716SAlex Elder 
11722ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
11732ad3d716SAlex Elder 		if (ret)
11742ad3d716SAlex Elder 			return ret;
11752ad3d716SAlex Elder 
11762ad3d716SAlex Elder 		*snap_size = size;
11772ad3d716SAlex Elder 	}
11782ad3d716SAlex Elder 	return 0;
11792ad3d716SAlex Elder }
11802ad3d716SAlex Elder 
11812ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
11822ad3d716SAlex Elder 			u64 *snap_features)
11832ad3d716SAlex Elder {
11842ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
11852ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
11862ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
11872ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
11882ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
11892ad3d716SAlex Elder 	} else {
11902ad3d716SAlex Elder 		u64 features = 0;
11912ad3d716SAlex Elder 		int ret;
11922ad3d716SAlex Elder 
11932ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11942ad3d716SAlex Elder 		if (ret)
11952ad3d716SAlex Elder 			return ret;
11962ad3d716SAlex Elder 
11972ad3d716SAlex Elder 		*snap_features = features;
11982ad3d716SAlex Elder 	}
11992ad3d716SAlex Elder 	return 0;
120000f1f36fSAlex Elder }
1201602adf40SYehuda Sadeh 
1202d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1203602adf40SYehuda Sadeh {
12048f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12052ad3d716SAlex Elder 	u64 size = 0;
12062ad3d716SAlex Elder 	u64 features = 0;
12072ad3d716SAlex Elder 	int ret;
12088b0241f8SAlex Elder 
12092ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12102ad3d716SAlex Elder 	if (ret)
12112ad3d716SAlex Elder 		return ret;
12122ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12132ad3d716SAlex Elder 	if (ret)
12142ad3d716SAlex Elder 		return ret;
12152ad3d716SAlex Elder 
12162ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12172ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12182ad3d716SAlex Elder 
12198b0241f8SAlex Elder 	return 0;
1220602adf40SYehuda Sadeh }
1221602adf40SYehuda Sadeh 
1222d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1223d1cf5788SAlex Elder {
1224d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1225d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1226200a6a8bSAlex Elder }
1227200a6a8bSAlex Elder 
12287d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name)
12297d5079aaSHimangi Saraogi {
12307d5079aaSHimangi Saraogi 	/* The explicit cast here is needed to drop the const qualifier */
12317d5079aaSHimangi Saraogi 
12327d5079aaSHimangi Saraogi 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
12337d5079aaSHimangi Saraogi }
12347d5079aaSHimangi Saraogi 
123598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1236602adf40SYehuda Sadeh {
123765ccfe21SAlex Elder 	char *name;
123865ccfe21SAlex Elder 	u64 segment;
123965ccfe21SAlex Elder 	int ret;
12403a96d5cdSJosh Durgin 	char *name_format;
1241602adf40SYehuda Sadeh 
124278c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
124365ccfe21SAlex Elder 	if (!name)
124465ccfe21SAlex Elder 		return NULL;
124565ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
12463a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
12473a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
12483a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
12492d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
125065ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
12512d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
125265ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
125365ccfe21SAlex Elder 			segment, ret);
12547d5079aaSHimangi Saraogi 		rbd_segment_name_free(name);
125565ccfe21SAlex Elder 		name = NULL;
125665ccfe21SAlex Elder 	}
1257602adf40SYehuda Sadeh 
125865ccfe21SAlex Elder 	return name;
125965ccfe21SAlex Elder }
1260602adf40SYehuda Sadeh 
126165ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
126265ccfe21SAlex Elder {
126365ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1264602adf40SYehuda Sadeh 
126565ccfe21SAlex Elder 	return offset & (segment_size - 1);
126665ccfe21SAlex Elder }
126765ccfe21SAlex Elder 
126865ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
126965ccfe21SAlex Elder 				u64 offset, u64 length)
127065ccfe21SAlex Elder {
127165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
127265ccfe21SAlex Elder 
127365ccfe21SAlex Elder 	offset &= segment_size - 1;
127465ccfe21SAlex Elder 
1275aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
127665ccfe21SAlex Elder 	if (offset + length > segment_size)
127765ccfe21SAlex Elder 		length = segment_size - offset;
127865ccfe21SAlex Elder 
127965ccfe21SAlex Elder 	return length;
1280602adf40SYehuda Sadeh }
1281602adf40SYehuda Sadeh 
1282602adf40SYehuda Sadeh /*
1283029bcbd8SJosh Durgin  * returns the size of an object in the image
1284029bcbd8SJosh Durgin  */
1285029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1286029bcbd8SJosh Durgin {
1287029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1288029bcbd8SJosh Durgin }
1289029bcbd8SJosh Durgin 
1290029bcbd8SJosh Durgin /*
1291602adf40SYehuda Sadeh  * bio helpers
1292602adf40SYehuda Sadeh  */
1293602adf40SYehuda Sadeh 
1294602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1295602adf40SYehuda Sadeh {
1296602adf40SYehuda Sadeh 	struct bio *tmp;
1297602adf40SYehuda Sadeh 
1298602adf40SYehuda Sadeh 	while (chain) {
1299602adf40SYehuda Sadeh 		tmp = chain;
1300602adf40SYehuda Sadeh 		chain = chain->bi_next;
1301602adf40SYehuda Sadeh 		bio_put(tmp);
1302602adf40SYehuda Sadeh 	}
1303602adf40SYehuda Sadeh }
1304602adf40SYehuda Sadeh 
1305602adf40SYehuda Sadeh /*
1306602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1307602adf40SYehuda Sadeh  */
1308602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1309602adf40SYehuda Sadeh {
13107988613bSKent Overstreet 	struct bio_vec bv;
13117988613bSKent Overstreet 	struct bvec_iter iter;
1312602adf40SYehuda Sadeh 	unsigned long flags;
1313602adf40SYehuda Sadeh 	void *buf;
1314602adf40SYehuda Sadeh 	int pos = 0;
1315602adf40SYehuda Sadeh 
1316602adf40SYehuda Sadeh 	while (chain) {
13177988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
13187988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1319602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
13207988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1321602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
13227988613bSKent Overstreet 				       bv.bv_len - remainder);
13237988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
132485b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1325602adf40SYehuda Sadeh 			}
13267988613bSKent Overstreet 			pos += bv.bv_len;
1327602adf40SYehuda Sadeh 		}
1328602adf40SYehuda Sadeh 
1329602adf40SYehuda Sadeh 		chain = chain->bi_next;
1330602adf40SYehuda Sadeh 	}
1331602adf40SYehuda Sadeh }
1332602adf40SYehuda Sadeh 
1333602adf40SYehuda Sadeh /*
1334b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1335b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1336b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1337b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1338b9434c5bSAlex Elder  */
1339b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1340b9434c5bSAlex Elder {
1341b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1342b9434c5bSAlex Elder 
1343b9434c5bSAlex Elder 	rbd_assert(end > offset);
1344b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1345b9434c5bSAlex Elder 	while (offset < end) {
1346b9434c5bSAlex Elder 		size_t page_offset;
1347b9434c5bSAlex Elder 		size_t length;
1348b9434c5bSAlex Elder 		unsigned long flags;
1349b9434c5bSAlex Elder 		void *kaddr;
1350b9434c5bSAlex Elder 
1351491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1352491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1353b9434c5bSAlex Elder 		local_irq_save(flags);
1354b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1355b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1356e2156054SAlex Elder 		flush_dcache_page(*page);
1357b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1358b9434c5bSAlex Elder 		local_irq_restore(flags);
1359b9434c5bSAlex Elder 
1360b9434c5bSAlex Elder 		offset += length;
1361b9434c5bSAlex Elder 		page++;
1362b9434c5bSAlex Elder 	}
1363b9434c5bSAlex Elder }
1364b9434c5bSAlex Elder 
1365b9434c5bSAlex Elder /*
1366f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1367f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1368602adf40SYehuda Sadeh  */
1369f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1370f7760dadSAlex Elder 					unsigned int offset,
1371f7760dadSAlex Elder 					unsigned int len,
1372f7760dadSAlex Elder 					gfp_t gfpmask)
1373602adf40SYehuda Sadeh {
1374f7760dadSAlex Elder 	struct bio *bio;
1375602adf40SYehuda Sadeh 
13765341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1377f7760dadSAlex Elder 	if (!bio)
1378f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1379f7760dadSAlex Elder 
13805341a627SKent Overstreet 	bio_advance(bio, offset);
13814f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1382602adf40SYehuda Sadeh 
1383f7760dadSAlex Elder 	return bio;
1384602adf40SYehuda Sadeh }
1385602adf40SYehuda Sadeh 
1386f7760dadSAlex Elder /*
1387f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1388f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1389f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1390f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1391f7760dadSAlex Elder  *
1392f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1393f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1394f7760dadSAlex Elder  * the start of data to be cloned is located.
1395f7760dadSAlex Elder  *
1396f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1397f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1398f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1399f7760dadSAlex Elder  */
1400f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1401f7760dadSAlex Elder 					unsigned int *offset,
1402f7760dadSAlex Elder 					unsigned int len,
1403f7760dadSAlex Elder 					gfp_t gfpmask)
1404f7760dadSAlex Elder {
1405f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1406f7760dadSAlex Elder 	unsigned int off = *offset;
1407f7760dadSAlex Elder 	struct bio *chain = NULL;
1408f7760dadSAlex Elder 	struct bio **end;
1409602adf40SYehuda Sadeh 
1410f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1411602adf40SYehuda Sadeh 
14124f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1413f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1414602adf40SYehuda Sadeh 
1415f7760dadSAlex Elder 	end = &chain;
1416f7760dadSAlex Elder 	while (len) {
1417f7760dadSAlex Elder 		unsigned int bi_size;
1418f7760dadSAlex Elder 		struct bio *bio;
1419f7760dadSAlex Elder 
1420f5400b7aSAlex Elder 		if (!bi) {
1421f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1422f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1423f5400b7aSAlex Elder 		}
14244f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1425f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1426f7760dadSAlex Elder 		if (!bio)
1427f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1428f7760dadSAlex Elder 
1429f7760dadSAlex Elder 		*end = bio;
1430f7760dadSAlex Elder 		end = &bio->bi_next;
1431f7760dadSAlex Elder 
1432f7760dadSAlex Elder 		off += bi_size;
14334f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1434f7760dadSAlex Elder 			bi = bi->bi_next;
1435f7760dadSAlex Elder 			off = 0;
1436f7760dadSAlex Elder 		}
1437f7760dadSAlex Elder 		len -= bi_size;
1438f7760dadSAlex Elder 	}
1439f7760dadSAlex Elder 	*bio_src = bi;
1440f7760dadSAlex Elder 	*offset = off;
1441f7760dadSAlex Elder 
1442f7760dadSAlex Elder 	return chain;
1443f7760dadSAlex Elder out_err:
1444f7760dadSAlex Elder 	bio_chain_put(chain);
1445f7760dadSAlex Elder 
1446602adf40SYehuda Sadeh 	return NULL;
1447602adf40SYehuda Sadeh }
1448602adf40SYehuda Sadeh 
1449926f9b3fSAlex Elder /*
1450926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1451926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1452926f9b3fSAlex Elder  * again.
1453926f9b3fSAlex Elder  */
14546365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
14556365d33aSAlex Elder {
14566365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
14576365d33aSAlex Elder 		struct rbd_device *rbd_dev;
14586365d33aSAlex Elder 
145957acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
14609584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
14616365d33aSAlex Elder 			obj_request);
14626365d33aSAlex Elder 	}
14636365d33aSAlex Elder }
14646365d33aSAlex Elder 
14656365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
14666365d33aSAlex Elder {
14676365d33aSAlex Elder 	smp_mb();
14686365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
14696365d33aSAlex Elder }
14706365d33aSAlex Elder 
147157acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
147257acbaa7SAlex Elder {
147357acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
147457acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
147557acbaa7SAlex Elder 
147657acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
147757acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
14789584d508SIlya Dryomov 		rbd_warn(rbd_dev, "obj_request %p already marked done",
147957acbaa7SAlex Elder 			obj_request);
148057acbaa7SAlex Elder 	}
148157acbaa7SAlex Elder }
148257acbaa7SAlex Elder 
148357acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
148457acbaa7SAlex Elder {
148557acbaa7SAlex Elder 	smp_mb();
148657acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
148757acbaa7SAlex Elder }
148857acbaa7SAlex Elder 
14895679c59fSAlex Elder /*
14905679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
14915679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14925679c59fSAlex Elder  *
14935679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14945679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14955679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14965679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14975679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14985679c59fSAlex Elder  */
14995679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
15005679c59fSAlex Elder 				bool exists)
15015679c59fSAlex Elder {
15025679c59fSAlex Elder 	if (exists)
15035679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
15045679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
15055679c59fSAlex Elder 	smp_mb();
15065679c59fSAlex Elder }
15075679c59fSAlex Elder 
15085679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
15095679c59fSAlex Elder {
15105679c59fSAlex Elder 	smp_mb();
15115679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
15125679c59fSAlex Elder }
15135679c59fSAlex Elder 
15145679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
15155679c59fSAlex Elder {
15165679c59fSAlex Elder 	smp_mb();
15175679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
15185679c59fSAlex Elder }
15195679c59fSAlex Elder 
15209638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
15219638556aSIlya Dryomov {
15229638556aSIlya Dryomov 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
15239638556aSIlya Dryomov 
15249638556aSIlya Dryomov 	return obj_request->img_offset <
15259638556aSIlya Dryomov 	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
15269638556aSIlya Dryomov }
15279638556aSIlya Dryomov 
1528bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1529bf0d5f50SAlex Elder {
153037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
153137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1532bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1533bf0d5f50SAlex Elder }
1534bf0d5f50SAlex Elder 
1535bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1536bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1537bf0d5f50SAlex Elder {
1538bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
153937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
154037206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1541bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1542bf0d5f50SAlex Elder }
1543bf0d5f50SAlex Elder 
15440f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
15450f2d5be7SAlex Elder {
15460f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
15470f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
15480f2d5be7SAlex Elder 	kref_get(&img_request->kref);
15490f2d5be7SAlex Elder }
15500f2d5be7SAlex Elder 
1551e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1552e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1553bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1554bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1555bf0d5f50SAlex Elder {
1556bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
155737206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
155837206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1559e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1560e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1561e93f3152SAlex Elder 	else
1562bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1563bf0d5f50SAlex Elder }
1564bf0d5f50SAlex Elder 
1565bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1566bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1567bf0d5f50SAlex Elder {
156825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
156925dcf954SAlex Elder 
1570b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1571bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
157225dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
15736365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
15746365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1575bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
157625dcf954SAlex Elder 	img_request->obj_request_count++;
157725dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
157837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
157937206ee5SAlex Elder 		obj_request->which);
1580bf0d5f50SAlex Elder }
1581bf0d5f50SAlex Elder 
1582bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1583bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1584bf0d5f50SAlex Elder {
1585bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
158625dcf954SAlex Elder 
158737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
158837206ee5SAlex Elder 		obj_request->which);
1589bf0d5f50SAlex Elder 	list_del(&obj_request->links);
159025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
159125dcf954SAlex Elder 	img_request->obj_request_count--;
159225dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
159325dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
15946365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1595bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1596bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
159725dcf954SAlex Elder 	obj_request->callback = NULL;
1598bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1599bf0d5f50SAlex Elder }
1600bf0d5f50SAlex Elder 
1601bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1602bf0d5f50SAlex Elder {
1603bf0d5f50SAlex Elder 	switch (type) {
16049969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1605bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1606788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1607bf0d5f50SAlex Elder 		return true;
1608bf0d5f50SAlex Elder 	default:
1609bf0d5f50SAlex Elder 		return false;
1610bf0d5f50SAlex Elder 	}
1611bf0d5f50SAlex Elder }
1612bf0d5f50SAlex Elder 
1613bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1614bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1615bf0d5f50SAlex Elder {
161671c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1617bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1618bf0d5f50SAlex Elder }
1619bf0d5f50SAlex Elder 
162071c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
162171c20a06SIlya Dryomov {
162271c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
162371c20a06SIlya Dryomov 	ceph_osdc_cancel_request(obj_request->osd_req);
162471c20a06SIlya Dryomov }
162571c20a06SIlya Dryomov 
162671c20a06SIlya Dryomov /*
162771c20a06SIlya Dryomov  * Wait for an object request to complete.  If interrupted, cancel the
162871c20a06SIlya Dryomov  * underlying osd request.
16292894e1d7SIlya Dryomov  *
16302894e1d7SIlya Dryomov  * @timeout: in jiffies, 0 means "wait forever"
163171c20a06SIlya Dryomov  */
16322894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
16332894e1d7SIlya Dryomov 				  unsigned long timeout)
163471c20a06SIlya Dryomov {
16352894e1d7SIlya Dryomov 	long ret;
163671c20a06SIlya Dryomov 
163771c20a06SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
16382894e1d7SIlya Dryomov 	ret = wait_for_completion_interruptible_timeout(
16392894e1d7SIlya Dryomov 					&obj_request->completion,
16402894e1d7SIlya Dryomov 					ceph_timeout_jiffies(timeout));
16412894e1d7SIlya Dryomov 	if (ret <= 0) {
16422894e1d7SIlya Dryomov 		if (ret == 0)
16432894e1d7SIlya Dryomov 			ret = -ETIMEDOUT;
164471c20a06SIlya Dryomov 		rbd_obj_request_end(obj_request);
16452894e1d7SIlya Dryomov 	} else {
16462894e1d7SIlya Dryomov 		ret = 0;
16472894e1d7SIlya Dryomov 	}
16482894e1d7SIlya Dryomov 
16492894e1d7SIlya Dryomov 	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
165071c20a06SIlya Dryomov 	return ret;
165171c20a06SIlya Dryomov }
165271c20a06SIlya Dryomov 
/* Wait for an object request with a timeout of 0. */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	const unsigned long no_timeout = 0;

	return __rbd_obj_request_wait(obj_request, no_timeout);
}
16572894e1d7SIlya Dryomov 
1658bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1659bf0d5f50SAlex Elder {
166055f27e09SAlex Elder 
166137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
166255f27e09SAlex Elder 
166355f27e09SAlex Elder 	/*
166455f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
166555f27e09SAlex Elder 	 * count for the image request.  We could instead use
166655f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
166755f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
166855f27e09SAlex Elder 	 */
166955f27e09SAlex Elder 	if (!img_request->result) {
167055f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
167155f27e09SAlex Elder 		u64 xferred = 0;
167255f27e09SAlex Elder 
167355f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
167455f27e09SAlex Elder 			xferred += obj_request->xferred;
167555f27e09SAlex Elder 		img_request->xferred = xferred;
167655f27e09SAlex Elder 	}
167755f27e09SAlex Elder 
1678bf0d5f50SAlex Elder 	if (img_request->callback)
1679bf0d5f50SAlex Elder 		img_request->callback(img_request);
1680bf0d5f50SAlex Elder 	else
1681bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1682bf0d5f50SAlex Elder }
1683bf0d5f50SAlex Elder 
16840c425248SAlex Elder /*
16850c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
16860c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
16870c425248SAlex Elder  * and currently never change thereafter.
16880c425248SAlex Elder  */
16890c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
16900c425248SAlex Elder {
16910c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
16920c425248SAlex Elder 	smp_mb();
16930c425248SAlex Elder }
16940c425248SAlex Elder 
16950c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
16960c425248SAlex Elder {
16970c425248SAlex Elder 	smp_mb();
16980c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
16990c425248SAlex Elder }
17000c425248SAlex Elder 
170190e98c52SGuangliang Zhao /*
170290e98c52SGuangliang Zhao  * Set the discard flag when the img_request is an discard request
170390e98c52SGuangliang Zhao  */
170490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request)
170590e98c52SGuangliang Zhao {
170690e98c52SGuangliang Zhao 	set_bit(IMG_REQ_DISCARD, &img_request->flags);
170790e98c52SGuangliang Zhao 	smp_mb();
170890e98c52SGuangliang Zhao }
170990e98c52SGuangliang Zhao 
171090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request)
171190e98c52SGuangliang Zhao {
171290e98c52SGuangliang Zhao 	smp_mb();
171390e98c52SGuangliang Zhao 	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
171490e98c52SGuangliang Zhao }
171590e98c52SGuangliang Zhao 
17169849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
17179849e986SAlex Elder {
17189849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
17199849e986SAlex Elder 	smp_mb();
17209849e986SAlex Elder }
17219849e986SAlex Elder 
1722e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1723e93f3152SAlex Elder {
1724e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1725e93f3152SAlex Elder 	smp_mb();
1726e93f3152SAlex Elder }
1727e93f3152SAlex Elder 
17289849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
17299849e986SAlex Elder {
17309849e986SAlex Elder 	smp_mb();
17319849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
17329849e986SAlex Elder }
17339849e986SAlex Elder 
1734d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1735d0b2e944SAlex Elder {
1736d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1737d0b2e944SAlex Elder 	smp_mb();
1738d0b2e944SAlex Elder }
1739d0b2e944SAlex Elder 
1740a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1741a2acd00eSAlex Elder {
1742a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1743a2acd00eSAlex Elder 	smp_mb();
1744a2acd00eSAlex Elder }
1745a2acd00eSAlex Elder 
1746d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1747d0b2e944SAlex Elder {
1748d0b2e944SAlex Elder 	smp_mb();
1749d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1750d0b2e944SAlex Elder }
1751d0b2e944SAlex Elder 
17523b434a2aSJosh Durgin static enum obj_operation_type
17533b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request)
17543b434a2aSJosh Durgin {
17553b434a2aSJosh Durgin 	if (img_request_write_test(img_request))
17563b434a2aSJosh Durgin 		return OBJ_OP_WRITE;
17573b434a2aSJosh Durgin 	else if (img_request_discard_test(img_request))
17583b434a2aSJosh Durgin 		return OBJ_OP_DISCARD;
17593b434a2aSJosh Durgin 	else
17603b434a2aSJosh Durgin 		return OBJ_OP_READ;
17613b434a2aSJosh Durgin }
17623b434a2aSJosh Durgin 
17636e2a4505SAlex Elder static void
17646e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
17656e2a4505SAlex Elder {
1766b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1767b9434c5bSAlex Elder 	u64 length = obj_request->length;
1768b9434c5bSAlex Elder 
17696e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
17706e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1771b9434c5bSAlex Elder 		xferred, length);
17726e2a4505SAlex Elder 	/*
177317c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
177417c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
177517c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
177617c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
177717c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
177817c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
17796e2a4505SAlex Elder 	 */
1780b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
17816e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1782b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
17836e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1784b9434c5bSAlex Elder 		else
1785b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
17866e2a4505SAlex Elder 		obj_request->result = 0;
1787b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1788b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1789b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1790b9434c5bSAlex Elder 		else
1791b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
17926e2a4505SAlex Elder 	}
179317c1cc1dSJosh Durgin 	obj_request->xferred = length;
17946e2a4505SAlex Elder 	obj_request_done_set(obj_request);
17956e2a4505SAlex Elder }
17966e2a4505SAlex Elder 
1797bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1798bf0d5f50SAlex Elder {
179937206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
180037206ee5SAlex Elder 		obj_request->callback);
1801bf0d5f50SAlex Elder 	if (obj_request->callback)
1802bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1803788e2df3SAlex Elder 	else
1804788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1805bf0d5f50SAlex Elder }
1806bf0d5f50SAlex Elder 
1807c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1808bf0d5f50SAlex Elder {
180957acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1810a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
181157acbaa7SAlex Elder 	bool layered = false;
181257acbaa7SAlex Elder 
181357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
181457acbaa7SAlex Elder 		img_request = obj_request->img_request;
181557acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1816a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
181757acbaa7SAlex Elder 	}
18188b3e1a56SAlex Elder 
18198b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
18208b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
18218b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1822a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1823a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
18248b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
18258b3e1a56SAlex Elder 	else if (img_request)
18266e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
18276e2a4505SAlex Elder 	else
182807741308SAlex Elder 		obj_request_done_set(obj_request);
1829bf0d5f50SAlex Elder }
1830bf0d5f50SAlex Elder 
1831c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1832bf0d5f50SAlex Elder {
18331b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
18341b83bef2SSage Weil 		obj_request->result, obj_request->length);
18351b83bef2SSage Weil 	/*
18368b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
18378b3e1a56SAlex Elder 	 * it to our originally-requested length.
18381b83bef2SSage Weil 	 */
18391b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
184007741308SAlex Elder 	obj_request_done_set(obj_request);
1841bf0d5f50SAlex Elder }
1842bf0d5f50SAlex Elder 
184390e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
184490e98c52SGuangliang Zhao {
184590e98c52SGuangliang Zhao 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
184690e98c52SGuangliang Zhao 		obj_request->result, obj_request->length);
184790e98c52SGuangliang Zhao 	/*
184890e98c52SGuangliang Zhao 	 * There is no such thing as a successful short discard.  Set
184990e98c52SGuangliang Zhao 	 * it to our originally-requested length.
185090e98c52SGuangliang Zhao 	 */
185190e98c52SGuangliang Zhao 	obj_request->xferred = obj_request->length;
1852d0265de7SJosh Durgin 	/* discarding a non-existent object is not a problem */
1853d0265de7SJosh Durgin 	if (obj_request->result == -ENOENT)
1854d0265de7SJosh Durgin 		obj_request->result = 0;
185590e98c52SGuangliang Zhao 	obj_request_done_set(obj_request);
185690e98c52SGuangliang Zhao }
185790e98c52SGuangliang Zhao 
/*
 * Handle completion of an osd STAT op.  A plain stat call needs
 * nothing beyond marking the request done; more is done when the
 * stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1867fbfab539SAlex Elder 
/*
 * Handle completion of an osd CALL op.  A request flagged as image
 * data is passed to the copyup callback; any other is marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
18772761713dSIlya Dryomov 
187885e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1879bf0d5f50SAlex Elder {
1880bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1881bf0d5f50SAlex Elder 	u16 opcode;
1882bf0d5f50SAlex Elder 
188385e084feSIlya Dryomov 	dout("%s: osd_req %p\n", __func__, osd_req);
1884bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
188557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
188657acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
188757acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
188857acbaa7SAlex Elder 	} else {
188957acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
189057acbaa7SAlex Elder 	}
1891bf0d5f50SAlex Elder 
18921b83bef2SSage Weil 	if (osd_req->r_result < 0)
18931b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1894bf0d5f50SAlex Elder 
1895c47f9371SAlex Elder 	/*
1896c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
18977ad18afaSChristoph Hellwig 	 * passed to the block layer, which just supports a 32-bit
18987ad18afaSChristoph Hellwig 	 * length field.
1899c47f9371SAlex Elder 	 */
19007665d85bSYan, Zheng 	obj_request->xferred = osd_req->r_ops[0].outdata_len;
1901c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
19020ccd5926SIlya Dryomov 
190379528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1904bf0d5f50SAlex Elder 	switch (opcode) {
1905bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1906c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1907bf0d5f50SAlex Elder 		break;
19080ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
1909e30b7577SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
1910e30b7577SIlya Dryomov 			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
19110ccd5926SIlya Dryomov 		/* fall through */
1912bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1913e30b7577SIlya Dryomov 	case CEPH_OSD_OP_WRITEFULL:
1914c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1915bf0d5f50SAlex Elder 		break;
1916fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1917c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1918fbfab539SAlex Elder 		break;
191990e98c52SGuangliang Zhao 	case CEPH_OSD_OP_DELETE:
192090e98c52SGuangliang Zhao 	case CEPH_OSD_OP_TRUNCATE:
192190e98c52SGuangliang Zhao 	case CEPH_OSD_OP_ZERO:
192290e98c52SGuangliang Zhao 		rbd_osd_discard_callback(obj_request);
192390e98c52SGuangliang Zhao 		break;
192436be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
19252761713dSIlya Dryomov 		rbd_osd_call_callback(obj_request);
19262761713dSIlya Dryomov 		break;
1927bf0d5f50SAlex Elder 	default:
19289584d508SIlya Dryomov 		rbd_warn(NULL, "%s: unsupported op %hu",
1929bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1930bf0d5f50SAlex Elder 		break;
1931bf0d5f50SAlex Elder 	}
1932bf0d5f50SAlex Elder 
193307741308SAlex Elder 	if (obj_request_done_test(obj_request))
1934bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1935bf0d5f50SAlex Elder }
1936bf0d5f50SAlex Elder 
19379d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1938430c28c3SAlex Elder {
1939430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
19408c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1941430c28c3SAlex Elder 
1942bb873b53SIlya Dryomov 	if (img_request)
1943bb873b53SIlya Dryomov 		osd_req->r_snapid = img_request->snap_id;
19449d4df01fSAlex Elder }
19459d4df01fSAlex Elder 
19469d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
19479d4df01fSAlex Elder {
19489d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
19499d4df01fSAlex Elder 
1950bb873b53SIlya Dryomov 	osd_req->r_mtime = CURRENT_TIME;
1951bb873b53SIlya Dryomov 	osd_req->r_data_offset = obj_request->offset;
1952430c28c3SAlex Elder }
1953430c28c3SAlex Elder 
19540ccd5926SIlya Dryomov /*
19550ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
19560ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
19570ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
19580ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
19590ccd5926SIlya Dryomov  */
1960bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1961bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
19626d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
1963deb236b3SIlya Dryomov 					unsigned int num_ops,
1964430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1965bf0d5f50SAlex Elder {
1966bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1967bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1968bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1969bf0d5f50SAlex Elder 
197090e98c52SGuangliang Zhao 	if (obj_request_img_data_test(obj_request) &&
197190e98c52SGuangliang Zhao 		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
19726365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
197390e98c52SGuangliang Zhao 		if (op_type == OBJ_OP_WRITE) {
19746d2940c8SGuangliang Zhao 			rbd_assert(img_request_write_test(img_request));
197590e98c52SGuangliang Zhao 		} else {
197690e98c52SGuangliang Zhao 			rbd_assert(img_request_discard_test(img_request));
197790e98c52SGuangliang Zhao 		}
1978bf0d5f50SAlex Elder 		snapc = img_request->snapc;
1979bf0d5f50SAlex Elder 	}
1980bf0d5f50SAlex Elder 
19816d2940c8SGuangliang Zhao 	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1982deb236b3SIlya Dryomov 
1983deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1984bf0d5f50SAlex Elder 
1985bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1986deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
19872224d879SDavid Disseldorp 					  GFP_NOIO);
1988bf0d5f50SAlex Elder 	if (!osd_req)
198913d1ad16SIlya Dryomov 		goto fail;
1990bf0d5f50SAlex Elder 
199190e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1992bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1993430c28c3SAlex Elder 	else
1994bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1995bf0d5f50SAlex Elder 
1996bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1997bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1998bf0d5f50SAlex Elder 
19997627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
2000d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
2001d30291b9SIlya Dryomov 			     obj_request->object_name))
2002d30291b9SIlya Dryomov 		goto fail;
2003bf0d5f50SAlex Elder 
200413d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
200513d1ad16SIlya Dryomov 		goto fail;
200613d1ad16SIlya Dryomov 
2007bf0d5f50SAlex Elder 	return osd_req;
200813d1ad16SIlya Dryomov 
200913d1ad16SIlya Dryomov fail:
201013d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
201113d1ad16SIlya Dryomov 	return NULL;
2012bf0d5f50SAlex Elder }
2013bf0d5f50SAlex Elder 
20140eefd470SAlex Elder /*
2015d3246fb0SJosh Durgin  * Create a copyup osd request based on the information in the object
2016d3246fb0SJosh Durgin  * request supplied.  A copyup request has two or three osd ops, a
2017d3246fb0SJosh Durgin  * copyup method call, potentially a hint op, and a write or truncate
2018d3246fb0SJosh Durgin  * or zero op.
20190eefd470SAlex Elder  */
20200eefd470SAlex Elder static struct ceph_osd_request *
20210eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
20220eefd470SAlex Elder {
20230eefd470SAlex Elder 	struct rbd_img_request *img_request;
20240eefd470SAlex Elder 	struct ceph_snap_context *snapc;
20250eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20260eefd470SAlex Elder 	struct ceph_osd_client *osdc;
20270eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
2028d3246fb0SJosh Durgin 	int num_osd_ops = 3;
20290eefd470SAlex Elder 
20300eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20310eefd470SAlex Elder 	img_request = obj_request->img_request;
20320eefd470SAlex Elder 	rbd_assert(img_request);
2033d3246fb0SJosh Durgin 	rbd_assert(img_request_write_test(img_request) ||
2034d3246fb0SJosh Durgin 			img_request_discard_test(img_request));
20350eefd470SAlex Elder 
2036d3246fb0SJosh Durgin 	if (img_request_discard_test(img_request))
2037d3246fb0SJosh Durgin 		num_osd_ops = 2;
2038d3246fb0SJosh Durgin 
2039d3246fb0SJosh Durgin 	/* Allocate and initialize the request, for all the ops */
20400eefd470SAlex Elder 
20410eefd470SAlex Elder 	snapc = img_request->snapc;
20420eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20430eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2044d3246fb0SJosh Durgin 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
20452224d879SDavid Disseldorp 						false, GFP_NOIO);
20460eefd470SAlex Elder 	if (!osd_req)
204713d1ad16SIlya Dryomov 		goto fail;
20480eefd470SAlex Elder 
20490eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
20500eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
20510eefd470SAlex Elder 	osd_req->r_priv = obj_request;
20520eefd470SAlex Elder 
20537627151eSYan, Zheng 	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
2054d30291b9SIlya Dryomov 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
2055d30291b9SIlya Dryomov 			     obj_request->object_name))
2056d30291b9SIlya Dryomov 		goto fail;
20570eefd470SAlex Elder 
205813d1ad16SIlya Dryomov 	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
205913d1ad16SIlya Dryomov 		goto fail;
206013d1ad16SIlya Dryomov 
20610eefd470SAlex Elder 	return osd_req;
206213d1ad16SIlya Dryomov 
206313d1ad16SIlya Dryomov fail:
206413d1ad16SIlya Dryomov 	ceph_osdc_put_request(osd_req);
206513d1ad16SIlya Dryomov 	return NULL;
20660eefd470SAlex Elder }
20670eefd470SAlex Elder 
20680eefd470SAlex Elder 
/*
 * Release an osd request owned by an object request by dropping a
 * reference on it.
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
2073bf0d5f50SAlex Elder 
2074bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
2075bf0d5f50SAlex Elder 
2076bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
2077bf0d5f50SAlex Elder 						u64 offset, u64 length,
2078bf0d5f50SAlex Elder 						enum obj_request_type type)
2079bf0d5f50SAlex Elder {
2080bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2081bf0d5f50SAlex Elder 	size_t size;
2082bf0d5f50SAlex Elder 	char *name;
2083bf0d5f50SAlex Elder 
2084bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
2085bf0d5f50SAlex Elder 
2086bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
20875a60e876SIlya Dryomov 	name = kmalloc(size, GFP_NOIO);
2088f907ad55SAlex Elder 	if (!name)
2089bf0d5f50SAlex Elder 		return NULL;
2090bf0d5f50SAlex Elder 
20915a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
2092f907ad55SAlex Elder 	if (!obj_request) {
2093f907ad55SAlex Elder 		kfree(name);
2094f907ad55SAlex Elder 		return NULL;
2095f907ad55SAlex Elder 	}
2096f907ad55SAlex Elder 
2097bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
2098bf0d5f50SAlex Elder 	obj_request->offset = offset;
2099bf0d5f50SAlex Elder 	obj_request->length = length;
2100926f9b3fSAlex Elder 	obj_request->flags = 0;
2101bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
2102bf0d5f50SAlex Elder 	obj_request->type = type;
2103bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
2104788e2df3SAlex Elder 	init_completion(&obj_request->completion);
2105bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
2106bf0d5f50SAlex Elder 
210737206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
210837206ee5SAlex Elder 		offset, length, (int)type, obj_request);
210937206ee5SAlex Elder 
2110bf0d5f50SAlex Elder 	return obj_request;
2111bf0d5f50SAlex Elder }
2112bf0d5f50SAlex Elder 
2113bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
2114bf0d5f50SAlex Elder {
2115bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2116bf0d5f50SAlex Elder 
2117bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
2118bf0d5f50SAlex Elder 
211937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
212037206ee5SAlex Elder 
2121bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
2122bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
2123bf0d5f50SAlex Elder 
2124bf0d5f50SAlex Elder 	if (obj_request->osd_req)
2125bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
2126bf0d5f50SAlex Elder 
2127bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
2128bf0d5f50SAlex Elder 	switch (obj_request->type) {
21299969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
21309969ebc5SAlex Elder 		break;		/* Nothing to do */
2131bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
2132bf0d5f50SAlex Elder 		if (obj_request->bio_list)
2133bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
2134bf0d5f50SAlex Elder 		break;
2135788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
2136788e2df3SAlex Elder 		if (obj_request->pages)
2137788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
2138788e2df3SAlex Elder 						obj_request->page_count);
2139788e2df3SAlex Elder 		break;
2140bf0d5f50SAlex Elder 	}
2141bf0d5f50SAlex Elder 
2142f907ad55SAlex Elder 	kfree(obj_request->object_name);
2143868311b1SAlex Elder 	obj_request->object_name = NULL;
2144868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
2145bf0d5f50SAlex Elder }
2146bf0d5f50SAlex Elder 
2147fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
2148fb65d228SAlex Elder 
2149fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
2150fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
2151fb65d228SAlex Elder {
2152fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
2153fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2154fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
2155fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
2156fb65d228SAlex Elder }
2157fb65d228SAlex Elder 
2158bf0d5f50SAlex Elder /*
2159a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
2160a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
2161a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
2162a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
2163a2acd00eSAlex Elder  */
2164a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
2165a2acd00eSAlex Elder {
2166a2acd00eSAlex Elder 	int counter;
2167a2acd00eSAlex Elder 
2168a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2169a2acd00eSAlex Elder 		return;
2170a2acd00eSAlex Elder 
2171a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
2172a2acd00eSAlex Elder 	if (counter > 0)
2173a2acd00eSAlex Elder 		return;
2174a2acd00eSAlex Elder 
2175a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
2176a2acd00eSAlex Elder 
2177a2acd00eSAlex Elder 	if (!counter)
2178a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
2179a2acd00eSAlex Elder 	else
21809584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
2181a2acd00eSAlex Elder }
2182a2acd00eSAlex Elder 
2183a2acd00eSAlex Elder /*
2184a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
2185a2acd00eSAlex Elder  * parent.
2186a2acd00eSAlex Elder  *
2187a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2188a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2189a2acd00eSAlex Elder  * false otherwise.
2190a2acd00eSAlex Elder  */
2191a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2192a2acd00eSAlex Elder {
2193ae43e9d0SIlya Dryomov 	int counter = 0;
2194a2acd00eSAlex Elder 
2195a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2196a2acd00eSAlex Elder 		return false;
2197a2acd00eSAlex Elder 
2198ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
2199ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
2200a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2201ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
2202a2acd00eSAlex Elder 
2203a2acd00eSAlex Elder 	if (counter < 0)
22049584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
2205a2acd00eSAlex Elder 
2206ae43e9d0SIlya Dryomov 	return counter > 0;
2207a2acd00eSAlex Elder }
2208a2acd00eSAlex Elder 
2209bf0d5f50SAlex Elder /*
2210bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2211bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2212bf0d5f50SAlex Elder  * (if there is one).
2213bf0d5f50SAlex Elder  */
2214cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2215cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2216bf0d5f50SAlex Elder 					u64 offset, u64 length,
22176d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
22184e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
2219bf0d5f50SAlex Elder {
2220bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2221bf0d5f50SAlex Elder 
22227a716aacSIlya Dryomov 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2223bf0d5f50SAlex Elder 	if (!img_request)
2224bf0d5f50SAlex Elder 		return NULL;
2225bf0d5f50SAlex Elder 
2226bf0d5f50SAlex Elder 	img_request->rq = NULL;
2227bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2228bf0d5f50SAlex Elder 	img_request->offset = offset;
2229bf0d5f50SAlex Elder 	img_request->length = length;
22300c425248SAlex Elder 	img_request->flags = 0;
223190e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD) {
223290e98c52SGuangliang Zhao 		img_request_discard_set(img_request);
223390e98c52SGuangliang Zhao 		img_request->snapc = snapc;
223490e98c52SGuangliang Zhao 	} else if (op_type == OBJ_OP_WRITE) {
22350c425248SAlex Elder 		img_request_write_set(img_request);
22364e752f0aSJosh Durgin 		img_request->snapc = snapc;
22370c425248SAlex Elder 	} else {
2238bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
22390c425248SAlex Elder 	}
2240a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2241d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2242bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2243bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2244bf0d5f50SAlex Elder 	img_request->callback = NULL;
2245a5a337d4SAlex Elder 	img_request->result = 0;
2246bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2247bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2248bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2249bf0d5f50SAlex Elder 
225037206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
22516d2940c8SGuangliang Zhao 		obj_op_name(op_type), offset, length, img_request);
225237206ee5SAlex Elder 
2253bf0d5f50SAlex Elder 	return img_request;
2254bf0d5f50SAlex Elder }
2255bf0d5f50SAlex Elder 
2256bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2257bf0d5f50SAlex Elder {
2258bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2259bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2260bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2261bf0d5f50SAlex Elder 
2262bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2263bf0d5f50SAlex Elder 
226437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
226537206ee5SAlex Elder 
2266bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2267bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
226825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2269bf0d5f50SAlex Elder 
2270a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2271a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2272a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2273a2acd00eSAlex Elder 	}
2274a2acd00eSAlex Elder 
2275bef95455SJosh Durgin 	if (img_request_write_test(img_request) ||
2276bef95455SJosh Durgin 		img_request_discard_test(img_request))
2277812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2278bf0d5f50SAlex Elder 
22791c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2280bf0d5f50SAlex Elder }
2281bf0d5f50SAlex Elder 
2282e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2283e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2284e93f3152SAlex Elder 					u64 img_offset, u64 length)
2285e93f3152SAlex Elder {
2286e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2287e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2288e93f3152SAlex Elder 
2289e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2290e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2291e93f3152SAlex Elder 
22924e752f0aSJosh Durgin 	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
22936d2940c8SGuangliang Zhao 						length, OBJ_OP_READ, NULL);
2294e93f3152SAlex Elder 	if (!parent_request)
2295e93f3152SAlex Elder 		return NULL;
2296e93f3152SAlex Elder 
2297e93f3152SAlex Elder 	img_request_child_set(parent_request);
2298e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2299e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2300e93f3152SAlex Elder 
2301e93f3152SAlex Elder 	return parent_request;
2302e93f3152SAlex Elder }
2303e93f3152SAlex Elder 
2304e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2305e93f3152SAlex Elder {
2306e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2307e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2308e93f3152SAlex Elder 
2309e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2310e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2311e93f3152SAlex Elder 
2312e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2313e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2314e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2315e93f3152SAlex Elder 
2316e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2317e93f3152SAlex Elder }
2318e93f3152SAlex Elder 
23191217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
23201217857fSAlex Elder {
23216365d33aSAlex Elder 	struct rbd_img_request *img_request;
23221217857fSAlex Elder 	unsigned int xferred;
23231217857fSAlex Elder 	int result;
23248b3e1a56SAlex Elder 	bool more;
23251217857fSAlex Elder 
23266365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23276365d33aSAlex Elder 	img_request = obj_request->img_request;
23286365d33aSAlex Elder 
23291217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
23301217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
23311217857fSAlex Elder 	result = obj_request->result;
23321217857fSAlex Elder 	if (result) {
23331217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
23346d2940c8SGuangliang Zhao 		enum obj_operation_type op_type;
23356d2940c8SGuangliang Zhao 
233690e98c52SGuangliang Zhao 		if (img_request_discard_test(img_request))
233790e98c52SGuangliang Zhao 			op_type = OBJ_OP_DISCARD;
233890e98c52SGuangliang Zhao 		else if (img_request_write_test(img_request))
233990e98c52SGuangliang Zhao 			op_type = OBJ_OP_WRITE;
234090e98c52SGuangliang Zhao 		else
234190e98c52SGuangliang Zhao 			op_type = OBJ_OP_READ;
23421217857fSAlex Elder 
23439584d508SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
23446d2940c8SGuangliang Zhao 			obj_op_name(op_type), obj_request->length,
23456d2940c8SGuangliang Zhao 			obj_request->img_offset, obj_request->offset);
23469584d508SIlya Dryomov 		rbd_warn(rbd_dev, "  result %d xferred %x",
23471217857fSAlex Elder 			result, xferred);
23481217857fSAlex Elder 		if (!img_request->result)
23491217857fSAlex Elder 			img_request->result = result;
2350082a75daSIlya Dryomov 		/*
2351082a75daSIlya Dryomov 		 * Need to end I/O on the entire obj_request worth of
2352082a75daSIlya Dryomov 		 * bytes in case of error.
2353082a75daSIlya Dryomov 		 */
2354082a75daSIlya Dryomov 		xferred = obj_request->length;
23551217857fSAlex Elder 	}
23561217857fSAlex Elder 
2357f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2358f1a4739fSAlex Elder 
2359f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2360f1a4739fSAlex Elder 		obj_request->pages = NULL;
2361f1a4739fSAlex Elder 		obj_request->page_count = 0;
2362f1a4739fSAlex Elder 	}
2363f1a4739fSAlex Elder 
23648b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
23658b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
23668b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
23678b3e1a56SAlex Elder 	} else {
23688b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
23697ad18afaSChristoph Hellwig 
23707ad18afaSChristoph Hellwig 		more = blk_update_request(img_request->rq, result, xferred);
23717ad18afaSChristoph Hellwig 		if (!more)
23727ad18afaSChristoph Hellwig 			__blk_mq_end_request(img_request->rq, result);
23738b3e1a56SAlex Elder 	}
23748b3e1a56SAlex Elder 
23758b3e1a56SAlex Elder 	return more;
23761217857fSAlex Elder }
23771217857fSAlex Elder 
23782169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
23792169238dSAlex Elder {
23802169238dSAlex Elder 	struct rbd_img_request *img_request;
23812169238dSAlex Elder 	u32 which = obj_request->which;
23822169238dSAlex Elder 	bool more = true;
23832169238dSAlex Elder 
23846365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23852169238dSAlex Elder 	img_request = obj_request->img_request;
23862169238dSAlex Elder 
23872169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
23882169238dSAlex Elder 	rbd_assert(img_request != NULL);
23892169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
23902169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
23912169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
23922169238dSAlex Elder 
23932169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
23942169238dSAlex Elder 	if (which != img_request->next_completion)
23952169238dSAlex Elder 		goto out;
23962169238dSAlex Elder 
23972169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
23982169238dSAlex Elder 		rbd_assert(more);
23992169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
24002169238dSAlex Elder 
24012169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
24022169238dSAlex Elder 			break;
24031217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
24042169238dSAlex Elder 		which++;
24052169238dSAlex Elder 	}
24062169238dSAlex Elder 
24072169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
24082169238dSAlex Elder 	img_request->next_completion = which;
24092169238dSAlex Elder out:
24102169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
24110f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
24122169238dSAlex Elder 
24132169238dSAlex Elder 	if (!more)
24142169238dSAlex Elder 		rbd_img_request_complete(img_request);
24152169238dSAlex Elder }
24162169238dSAlex Elder 
2417f1a4739fSAlex Elder /*
24183b434a2aSJosh Durgin  * Add individual osd ops to the given ceph_osd_request and prepare
24193b434a2aSJosh Durgin  * them for submission. num_ops is the current number of
24203b434a2aSJosh Durgin  * osd operations already to the object request.
24213b434a2aSJosh Durgin  */
24223b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
24233b434a2aSJosh Durgin 				struct ceph_osd_request *osd_request,
24243b434a2aSJosh Durgin 				enum obj_operation_type op_type,
24253b434a2aSJosh Durgin 				unsigned int num_ops)
24263b434a2aSJosh Durgin {
24273b434a2aSJosh Durgin 	struct rbd_img_request *img_request = obj_request->img_request;
24283b434a2aSJosh Durgin 	struct rbd_device *rbd_dev = img_request->rbd_dev;
24293b434a2aSJosh Durgin 	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
24303b434a2aSJosh Durgin 	u64 offset = obj_request->offset;
24313b434a2aSJosh Durgin 	u64 length = obj_request->length;
24323b434a2aSJosh Durgin 	u64 img_end;
24333b434a2aSJosh Durgin 	u16 opcode;
24343b434a2aSJosh Durgin 
24353b434a2aSJosh Durgin 	if (op_type == OBJ_OP_DISCARD) {
2436d3246fb0SJosh Durgin 		if (!offset && length == object_size &&
2437d3246fb0SJosh Durgin 		    (!img_request_layered_test(img_request) ||
2438d3246fb0SJosh Durgin 		     !obj_request_overlaps_parent(obj_request))) {
24393b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_DELETE;
24403b434a2aSJosh Durgin 		} else if ((offset + length == object_size)) {
24413b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_TRUNCATE;
24423b434a2aSJosh Durgin 		} else {
24433b434a2aSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
24443b434a2aSJosh Durgin 			img_end = rbd_dev->header.image_size;
24453b434a2aSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
24463b434a2aSJosh Durgin 
24473b434a2aSJosh Durgin 			if (obj_request->img_offset + length == img_end)
24483b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_TRUNCATE;
24493b434a2aSJosh Durgin 			else
24503b434a2aSJosh Durgin 				opcode = CEPH_OSD_OP_ZERO;
24513b434a2aSJosh Durgin 		}
24523b434a2aSJosh Durgin 	} else if (op_type == OBJ_OP_WRITE) {
2453e30b7577SIlya Dryomov 		if (!offset && length == object_size)
2454e30b7577SIlya Dryomov 			opcode = CEPH_OSD_OP_WRITEFULL;
2455e30b7577SIlya Dryomov 		else
24563b434a2aSJosh Durgin 			opcode = CEPH_OSD_OP_WRITE;
24573b434a2aSJosh Durgin 		osd_req_op_alloc_hint_init(osd_request, num_ops,
24583b434a2aSJosh Durgin 					object_size, object_size);
24593b434a2aSJosh Durgin 		num_ops++;
24603b434a2aSJosh Durgin 	} else {
24613b434a2aSJosh Durgin 		opcode = CEPH_OSD_OP_READ;
24623b434a2aSJosh Durgin 	}
24633b434a2aSJosh Durgin 
24647e868b6eSIlya Dryomov 	if (opcode == CEPH_OSD_OP_DELETE)
2465144cba14SYan, Zheng 		osd_req_op_init(osd_request, num_ops, opcode, 0);
24667e868b6eSIlya Dryomov 	else
24677e868b6eSIlya Dryomov 		osd_req_op_extent_init(osd_request, num_ops, opcode,
24687e868b6eSIlya Dryomov 				       offset, length, 0, 0);
24697e868b6eSIlya Dryomov 
24703b434a2aSJosh Durgin 	if (obj_request->type == OBJ_REQUEST_BIO)
24713b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
24723b434a2aSJosh Durgin 					obj_request->bio_list, length);
24733b434a2aSJosh Durgin 	else if (obj_request->type == OBJ_REQUEST_PAGES)
24743b434a2aSJosh Durgin 		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
24753b434a2aSJosh Durgin 					obj_request->pages, length,
24763b434a2aSJosh Durgin 					offset & ~PAGE_MASK, false, false);
24773b434a2aSJosh Durgin 
24783b434a2aSJosh Durgin 	/* Discards are also writes */
24793b434a2aSJosh Durgin 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
24803b434a2aSJosh Durgin 		rbd_osd_req_format_write(obj_request);
24813b434a2aSJosh Durgin 	else
24823b434a2aSJosh Durgin 		rbd_osd_req_format_read(obj_request);
24833b434a2aSJosh Durgin }
24843b434a2aSJosh Durgin 
24853b434a2aSJosh Durgin /*
2486f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2487f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2488f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2489f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2490f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2491f1a4739fSAlex Elder  * all data described by the image request.
2492f1a4739fSAlex Elder  */
2493f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2494f1a4739fSAlex Elder 					enum obj_request_type type,
2495f1a4739fSAlex Elder 					void *data_desc)
2496bf0d5f50SAlex Elder {
2497bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2498bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2499bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2500a158073cSJingoo Han 	struct bio *bio_list = NULL;
2501f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2502a158073cSJingoo Han 	struct page **pages = NULL;
25036d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
25047da22d29SAlex Elder 	u64 img_offset;
2505bf0d5f50SAlex Elder 	u64 resid;
2506bf0d5f50SAlex Elder 
2507f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2508f1a4739fSAlex Elder 		(int)type, data_desc);
250937206ee5SAlex Elder 
25107da22d29SAlex Elder 	img_offset = img_request->offset;
2511bf0d5f50SAlex Elder 	resid = img_request->length;
25124dda41d3SAlex Elder 	rbd_assert(resid > 0);
25133b434a2aSJosh Durgin 	op_type = rbd_img_request_op_type(img_request);
2514f1a4739fSAlex Elder 
2515f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2516f1a4739fSAlex Elder 		bio_list = data_desc;
25174f024f37SKent Overstreet 		rbd_assert(img_offset ==
25184f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
251990e98c52SGuangliang Zhao 	} else if (type == OBJ_REQUEST_PAGES) {
2520f1a4739fSAlex Elder 		pages = data_desc;
2521f1a4739fSAlex Elder 	}
2522f1a4739fSAlex Elder 
2523bf0d5f50SAlex Elder 	while (resid) {
25242fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2525bf0d5f50SAlex Elder 		const char *object_name;
2526bf0d5f50SAlex Elder 		u64 offset;
2527bf0d5f50SAlex Elder 		u64 length;
2528bf0d5f50SAlex Elder 
25297da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2530bf0d5f50SAlex Elder 		if (!object_name)
2531bf0d5f50SAlex Elder 			goto out_unwind;
25327da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
25337da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2534bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2535f1a4739fSAlex Elder 						offset, length, type);
253678c2a44aSAlex Elder 		/* object request has its own copy of the object name */
253778c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2538bf0d5f50SAlex Elder 		if (!obj_request)
2539bf0d5f50SAlex Elder 			goto out_unwind;
254062054da6SIlya Dryomov 
254103507db6SJosh Durgin 		/*
254203507db6SJosh Durgin 		 * set obj_request->img_request before creating the
254303507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
254403507db6SJosh Durgin 		 */
254503507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2546bf0d5f50SAlex Elder 
2547f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2548f1a4739fSAlex Elder 			unsigned int clone_size;
2549f1a4739fSAlex Elder 
2550bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2551bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2552f1a4739fSAlex Elder 			obj_request->bio_list =
2553f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2554f1a4739fSAlex Elder 								&bio_offset,
2555f1a4739fSAlex Elder 								clone_size,
25562224d879SDavid Disseldorp 								GFP_NOIO);
2557bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
255862054da6SIlya Dryomov 				goto out_unwind;
255990e98c52SGuangliang Zhao 		} else if (type == OBJ_REQUEST_PAGES) {
2560f1a4739fSAlex Elder 			unsigned int page_count;
2561f1a4739fSAlex Elder 
2562f1a4739fSAlex Elder 			obj_request->pages = pages;
2563f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2564f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2565f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2566f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2567f1a4739fSAlex Elder 			pages += page_count;
2568f1a4739fSAlex Elder 		}
2569bf0d5f50SAlex Elder 
25706d2940c8SGuangliang Zhao 		osd_req = rbd_osd_req_create(rbd_dev, op_type,
25716d2940c8SGuangliang Zhao 					(op_type == OBJ_OP_WRITE) ? 2 : 1,
25722fa12320SAlex Elder 					obj_request);
25732fa12320SAlex Elder 		if (!osd_req)
257462054da6SIlya Dryomov 			goto out_unwind;
25753b434a2aSJosh Durgin 
25762fa12320SAlex Elder 		obj_request->osd_req = osd_req;
25772169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
25787da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2579bf0d5f50SAlex Elder 
25803b434a2aSJosh Durgin 		rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
25813b434a2aSJosh Durgin 
25823b434a2aSJosh Durgin 		rbd_img_request_get(img_request);
25833b434a2aSJosh Durgin 
25847da22d29SAlex Elder 		img_offset += length;
2585bf0d5f50SAlex Elder 		resid -= length;
2586bf0d5f50SAlex Elder 	}
2587bf0d5f50SAlex Elder 
2588bf0d5f50SAlex Elder 	return 0;
2589bf0d5f50SAlex Elder 
2590bf0d5f50SAlex Elder out_unwind:
2591bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
259242dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2593bf0d5f50SAlex Elder 
2594bf0d5f50SAlex Elder 	return -ENOMEM;
2595bf0d5f50SAlex Elder }
2596bf0d5f50SAlex Elder 
25973d7efd18SAlex Elder static void
25982761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
25990eefd470SAlex Elder {
26000eefd470SAlex Elder 	struct rbd_img_request *img_request;
26010eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2602ebda6408SAlex Elder 	struct page **pages;
26030eefd470SAlex Elder 	u32 page_count;
26040eefd470SAlex Elder 
26052761713dSIlya Dryomov 	dout("%s: obj %p\n", __func__, obj_request);
26062761713dSIlya Dryomov 
2607d3246fb0SJosh Durgin 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2608d3246fb0SJosh Durgin 		obj_request->type == OBJ_REQUEST_NODATA);
26090eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
26100eefd470SAlex Elder 	img_request = obj_request->img_request;
26110eefd470SAlex Elder 	rbd_assert(img_request);
26120eefd470SAlex Elder 
26130eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
26140eefd470SAlex Elder 	rbd_assert(rbd_dev);
26150eefd470SAlex Elder 
2616ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2617ebda6408SAlex Elder 	rbd_assert(pages != NULL);
26180eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2619ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2620ebda6408SAlex Elder 	rbd_assert(page_count);
2621ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2622ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
26230eefd470SAlex Elder 
26240eefd470SAlex Elder 	/*
26250eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
26260eefd470SAlex Elder 	 * original write request.  There is no such thing as a
26270eefd470SAlex Elder 	 * successful short write, so if the request was successful
26280eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
26290eefd470SAlex Elder 	 */
26300eefd470SAlex Elder 	if (!obj_request->result)
26310eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
26320eefd470SAlex Elder 
26332761713dSIlya Dryomov 	obj_request_done_set(obj_request);
26340eefd470SAlex Elder }
26350eefd470SAlex Elder 
26360eefd470SAlex Elder static void
26373d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
26383d7efd18SAlex Elder {
26393d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
26400eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
26410eefd470SAlex Elder 	struct ceph_osd_client *osdc;
26420eefd470SAlex Elder 	struct rbd_device *rbd_dev;
26433d7efd18SAlex Elder 	struct page **pages;
2644d3246fb0SJosh Durgin 	enum obj_operation_type op_type;
2645ebda6408SAlex Elder 	u32 page_count;
2646bbea1c1aSAlex Elder 	int img_result;
2647ebda6408SAlex Elder 	u64 parent_length;
26483d7efd18SAlex Elder 
26493d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
26503d7efd18SAlex Elder 
26513d7efd18SAlex Elder 	/* First get what we need from the image request */
26523d7efd18SAlex Elder 
26533d7efd18SAlex Elder 	pages = img_request->copyup_pages;
26543d7efd18SAlex Elder 	rbd_assert(pages != NULL);
26553d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2656ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2657ebda6408SAlex Elder 	rbd_assert(page_count);
2658ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
26593d7efd18SAlex Elder 
26603d7efd18SAlex Elder 	orig_request = img_request->obj_request;
26613d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2662b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2663bbea1c1aSAlex Elder 	img_result = img_request->result;
2664ebda6408SAlex Elder 	parent_length = img_request->length;
2665ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
26663d7efd18SAlex Elder 	rbd_img_request_put(img_request);
26673d7efd18SAlex Elder 
266891c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
266991c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
26703d7efd18SAlex Elder 	rbd_assert(rbd_dev);
26713d7efd18SAlex Elder 
2672bbea1c1aSAlex Elder 	/*
2673bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2674bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2675bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2676bbea1c1aSAlex Elder 	 */
2677bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2678bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2679bbea1c1aSAlex Elder 
2680bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2681bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2682bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2683bbea1c1aSAlex Elder 		if (!img_result)
2684bbea1c1aSAlex Elder 			return;
2685bbea1c1aSAlex Elder 	}
2686bbea1c1aSAlex Elder 
2687bbea1c1aSAlex Elder 	if (img_result)
26880eefd470SAlex Elder 		goto out_err;
26893d7efd18SAlex Elder 
26908785b1d4SAlex Elder 	/*
26918785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
26920ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
26938785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
26948785b1d4SAlex Elder 	 * original request, and release the old one.
26958785b1d4SAlex Elder 	 */
2696bbea1c1aSAlex Elder 	img_result = -ENOMEM;
26970eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
26980eefd470SAlex Elder 	if (!osd_req)
26990eefd470SAlex Elder 		goto out_err;
27008785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
27010eefd470SAlex Elder 	orig_request->osd_req = osd_req;
27020eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2703ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
27043d7efd18SAlex Elder 
27050eefd470SAlex Elder 	/* Initialize the copyup op */
27060eefd470SAlex Elder 
27070eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2708ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
27090eefd470SAlex Elder 						false, false);
27100eefd470SAlex Elder 
2711d3246fb0SJosh Durgin 	/* Add the other op(s) */
27120ccd5926SIlya Dryomov 
2713d3246fb0SJosh Durgin 	op_type = rbd_img_request_op_type(orig_request->img_request);
2714d3246fb0SJosh Durgin 	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
27150eefd470SAlex Elder 
27160eefd470SAlex Elder 	/* All set, send it off. */
27170eefd470SAlex Elder 
27180eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2719bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2720bbea1c1aSAlex Elder 	if (!img_result)
27210eefd470SAlex Elder 		return;
27220eefd470SAlex Elder out_err:
27230eefd470SAlex Elder 	/* Record the error code and complete the request */
27240eefd470SAlex Elder 
2725bbea1c1aSAlex Elder 	orig_request->result = img_result;
27260eefd470SAlex Elder 	orig_request->xferred = 0;
27273d7efd18SAlex Elder 	obj_request_done_set(orig_request);
27283d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
27293d7efd18SAlex Elder }
27303d7efd18SAlex Elder 
27313d7efd18SAlex Elder /*
27323d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
27333d7efd18SAlex Elder  * entire target of the given object request.  This is used for
27343d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
27353d7efd18SAlex Elder  * object request from the image request does not exist.
27363d7efd18SAlex Elder  *
27373d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
27383d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
27393d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
27403d7efd18SAlex Elder  * the original object request for the copyup operation.
27413d7efd18SAlex Elder  *
27423d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
27433d7efd18SAlex Elder  * object request and mark it done so it gets completed.
27443d7efd18SAlex Elder  */
27453d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
27463d7efd18SAlex Elder {
27473d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
27483d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
27493d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
27503d7efd18SAlex Elder 	u64 img_offset;
27513d7efd18SAlex Elder 	u64 length;
27523d7efd18SAlex Elder 	struct page **pages = NULL;
27533d7efd18SAlex Elder 	u32 page_count;
27543d7efd18SAlex Elder 	int result;
27553d7efd18SAlex Elder 
27563d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2757b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
27583d7efd18SAlex Elder 
27593d7efd18SAlex Elder 	img_request = obj_request->img_request;
27603d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
27613d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
27623d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
27633d7efd18SAlex Elder 
27643d7efd18SAlex Elder 	/*
27653d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
27663d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
27673d7efd18SAlex Elder 	 */
27683d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
27693d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
27703d7efd18SAlex Elder 
27713d7efd18SAlex Elder 	/*
2772a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2773a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2774a9e8ba2cSAlex Elder 	 * necessary.
2775a9e8ba2cSAlex Elder 	 */
2776a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2777a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2778a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2779a9e8ba2cSAlex Elder 	}
2780a9e8ba2cSAlex Elder 
2781a9e8ba2cSAlex Elder 	/*
27823d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
27833d7efd18SAlex Elder 	 * from the parent.
27843d7efd18SAlex Elder 	 */
27853d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
27863d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
27873d7efd18SAlex Elder 	if (IS_ERR(pages)) {
27883d7efd18SAlex Elder 		result = PTR_ERR(pages);
27893d7efd18SAlex Elder 		pages = NULL;
27903d7efd18SAlex Elder 		goto out_err;
27913d7efd18SAlex Elder 	}
27923d7efd18SAlex Elder 
27933d7efd18SAlex Elder 	result = -ENOMEM;
2794e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2795e93f3152SAlex Elder 						img_offset, length);
27963d7efd18SAlex Elder 	if (!parent_request)
27973d7efd18SAlex Elder 		goto out_err;
27983d7efd18SAlex Elder 
27993d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
28003d7efd18SAlex Elder 	if (result)
28013d7efd18SAlex Elder 		goto out_err;
28023d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2803ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
28043d7efd18SAlex Elder 
28053d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
28063d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
28073d7efd18SAlex Elder 	if (!result)
28083d7efd18SAlex Elder 		return 0;
28093d7efd18SAlex Elder 
28103d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2811ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
28123d7efd18SAlex Elder 	parent_request->obj_request = NULL;
28133d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
28143d7efd18SAlex Elder out_err:
28153d7efd18SAlex Elder 	if (pages)
28163d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
28173d7efd18SAlex Elder 	if (parent_request)
28183d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
28193d7efd18SAlex Elder 	obj_request->result = result;
28203d7efd18SAlex Elder 	obj_request->xferred = 0;
28213d7efd18SAlex Elder 	obj_request_done_set(obj_request);
28223d7efd18SAlex Elder 
28233d7efd18SAlex Elder 	return result;
28243d7efd18SAlex Elder }
28253d7efd18SAlex Elder 
2826c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2827c5b5ef6cSAlex Elder {
2828c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2829638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2830c5b5ef6cSAlex Elder 	int result;
2831c5b5ef6cSAlex Elder 
2832c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2833c5b5ef6cSAlex Elder 
2834c5b5ef6cSAlex Elder 	/*
2835c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2836c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2837c5b5ef6cSAlex Elder 	 * we're done with the request.
2838c5b5ef6cSAlex Elder 	 */
2839c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2840c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2841912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2842c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2843c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2844c5b5ef6cSAlex Elder 
2845c5b5ef6cSAlex Elder 	result = obj_request->result;
2846c5b5ef6cSAlex Elder 	obj_request->result = 0;
2847c5b5ef6cSAlex Elder 
2848c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2849c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2850c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2851c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2852c5b5ef6cSAlex Elder 
2853638f5abeSAlex Elder 	/*
2854638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2855638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2856638f5abeSAlex Elder 	 * and re-submit the original write request.
2857638f5abeSAlex Elder 	 */
2858638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2859638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2860638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2861638f5abeSAlex Elder 
2862638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2863638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2864638f5abeSAlex Elder 		if (!result)
2865638f5abeSAlex Elder 			return;
2866638f5abeSAlex Elder 	}
2867c5b5ef6cSAlex Elder 
2868c5b5ef6cSAlex Elder 	/*
2869c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2870c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2871c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2872c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2873c5b5ef6cSAlex Elder 	 */
2874c5b5ef6cSAlex Elder 	if (!result) {
2875c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2876c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2877c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2878c5b5ef6cSAlex Elder 	} else if (result) {
2879c5b5ef6cSAlex Elder 		orig_request->result = result;
28803d7efd18SAlex Elder 		goto out;
2881c5b5ef6cSAlex Elder 	}
2882c5b5ef6cSAlex Elder 
2883c5b5ef6cSAlex Elder 	/*
2884c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2885c5b5ef6cSAlex Elder 	 * whether the target object exists.
2886c5b5ef6cSAlex Elder 	 */
2887b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
28883d7efd18SAlex Elder out:
2889c5b5ef6cSAlex Elder 	if (orig_request->result)
2890c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2891c5b5ef6cSAlex Elder }
2892c5b5ef6cSAlex Elder 
2893c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2894c5b5ef6cSAlex Elder {
2895c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2896c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2897c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2898c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2899c5b5ef6cSAlex Elder 	u32 page_count;
2900c5b5ef6cSAlex Elder 	size_t size;
2901c5b5ef6cSAlex Elder 	int ret;
2902c5b5ef6cSAlex Elder 
2903c5b5ef6cSAlex Elder 	/*
2904c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2905c5b5ef6cSAlex Elder 	 *     le64 length;
2906c5b5ef6cSAlex Elder 	 *     struct {
2907c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2908c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2909c5b5ef6cSAlex Elder 	 *     } mtime;
2910c5b5ef6cSAlex Elder 	 */
2911c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2912c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2913c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2914c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2915c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2916c5b5ef6cSAlex Elder 
2917c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2918c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2919c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2920c5b5ef6cSAlex Elder 	if (!stat_request)
2921c5b5ef6cSAlex Elder 		goto out;
2922c5b5ef6cSAlex Elder 
2923c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2924c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2925c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2926c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2927c5b5ef6cSAlex Elder 
2928c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2929c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
29306d2940c8SGuangliang Zhao 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2931c5b5ef6cSAlex Elder 						   stat_request);
2932c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2933c5b5ef6cSAlex Elder 		goto out;
2934c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2935c5b5ef6cSAlex Elder 
2936144cba14SYan, Zheng 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2937c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2938c5b5ef6cSAlex Elder 					false, false);
29399d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2940c5b5ef6cSAlex Elder 
2941c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2942c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2943c5b5ef6cSAlex Elder out:
2944c5b5ef6cSAlex Elder 	if (ret)
2945c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2946c5b5ef6cSAlex Elder 
2947c5b5ef6cSAlex Elder 	return ret;
2948c5b5ef6cSAlex Elder }
2949c5b5ef6cSAlex Elder 
295070d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2951b454e36dSAlex Elder {
2952b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2953a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2954b454e36dSAlex Elder 
2955b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2956b454e36dSAlex Elder 
2957b454e36dSAlex Elder 	img_request = obj_request->img_request;
2958b454e36dSAlex Elder 	rbd_assert(img_request);
2959a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2960b454e36dSAlex Elder 
296170d045f6SIlya Dryomov 	/* Reads */
29621c220881SJosh Durgin 	if (!img_request_write_test(img_request) &&
29631c220881SJosh Durgin 	    !img_request_discard_test(img_request))
296470d045f6SIlya Dryomov 		return true;
2965b454e36dSAlex Elder 
296670d045f6SIlya Dryomov 	/* Non-layered writes */
296770d045f6SIlya Dryomov 	if (!img_request_layered_test(img_request))
296870d045f6SIlya Dryomov 		return true;
296970d045f6SIlya Dryomov 
297070d045f6SIlya Dryomov 	/*
297170d045f6SIlya Dryomov 	 * Layered writes outside of the parent overlap range don't
297270d045f6SIlya Dryomov 	 * share any data with the parent.
297370d045f6SIlya Dryomov 	 */
297470d045f6SIlya Dryomov 	if (!obj_request_overlaps_parent(obj_request))
297570d045f6SIlya Dryomov 		return true;
297670d045f6SIlya Dryomov 
297770d045f6SIlya Dryomov 	/*
2978c622d226SGuangliang Zhao 	 * Entire-object layered writes - we will overwrite whatever
2979c622d226SGuangliang Zhao 	 * parent data there is anyway.
2980c622d226SGuangliang Zhao 	 */
2981c622d226SGuangliang Zhao 	if (!obj_request->offset &&
2982c622d226SGuangliang Zhao 	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2983c622d226SGuangliang Zhao 		return true;
2984c622d226SGuangliang Zhao 
2985c622d226SGuangliang Zhao 	/*
298670d045f6SIlya Dryomov 	 * If the object is known to already exist, its parent data has
298770d045f6SIlya Dryomov 	 * already been copied.
298870d045f6SIlya Dryomov 	 */
298970d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request) &&
299070d045f6SIlya Dryomov 	    obj_request_exists_test(obj_request))
299170d045f6SIlya Dryomov 		return true;
299270d045f6SIlya Dryomov 
299370d045f6SIlya Dryomov 	return false;
299470d045f6SIlya Dryomov }
299570d045f6SIlya Dryomov 
299670d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
299770d045f6SIlya Dryomov {
299870d045f6SIlya Dryomov 	if (img_obj_request_simple(obj_request)) {
2999b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
3000b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
3001b454e36dSAlex Elder 
3002b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
3003b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
3004b454e36dSAlex Elder 
3005b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
3006b454e36dSAlex Elder 	}
3007b454e36dSAlex Elder 
3008b454e36dSAlex Elder 	/*
30093d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
30103d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
30113d7efd18SAlex Elder 	 * start by reading the data for the full target object from
30123d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
3013b454e36dSAlex Elder 	 */
301470d045f6SIlya Dryomov 	if (obj_request_known_test(obj_request))
30153d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
30163d7efd18SAlex Elder 
30173d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
3018b454e36dSAlex Elder 
3019b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
3020b454e36dSAlex Elder }
3021b454e36dSAlex Elder 
3022bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
3023bf0d5f50SAlex Elder {
3024bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
302546faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
3026663ae2ccSIlya Dryomov 	int ret = 0;
3027bf0d5f50SAlex Elder 
302837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
3029bf0d5f50SAlex Elder 
3030663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
3031663ae2ccSIlya Dryomov 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
3032b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
3033bf0d5f50SAlex Elder 		if (ret)
3034663ae2ccSIlya Dryomov 			goto out_put_ireq;
3035bf0d5f50SAlex Elder 	}
3036bf0d5f50SAlex Elder 
3037663ae2ccSIlya Dryomov out_put_ireq:
3038663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
3039663ae2ccSIlya Dryomov 	return ret;
3040bf0d5f50SAlex Elder }
3041bf0d5f50SAlex Elder 
30428b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
30438b3e1a56SAlex Elder {
30448b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
3045a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
3046a9e8ba2cSAlex Elder 	u64 obj_end;
304702c74fbaSAlex Elder 	u64 img_xferred;
304802c74fbaSAlex Elder 	int img_result;
30498b3e1a56SAlex Elder 
30508b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
30518b3e1a56SAlex Elder 
305202c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
305302c74fbaSAlex Elder 
30548b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
305502c74fbaSAlex Elder 	img_xferred = img_request->xferred;
305602c74fbaSAlex Elder 	img_result = img_request->result;
305702c74fbaSAlex Elder 	rbd_img_request_put(img_request);
305802c74fbaSAlex Elder 
305902c74fbaSAlex Elder 	/*
306002c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
306102c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
306202c74fbaSAlex Elder 	 * original request.
306302c74fbaSAlex Elder 	 */
3064a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
3065a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
306602c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
306702c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
306802c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
30698b3e1a56SAlex Elder 
307002c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
307102c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
307202c74fbaSAlex Elder 		if (!img_result)
307302c74fbaSAlex Elder 			return;
307402c74fbaSAlex Elder 	}
307502c74fbaSAlex Elder 
307602c74fbaSAlex Elder 	obj_request->result = img_result;
3077a9e8ba2cSAlex Elder 	if (obj_request->result)
3078a9e8ba2cSAlex Elder 		goto out;
3079a9e8ba2cSAlex Elder 
3080a9e8ba2cSAlex Elder 	/*
3081a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
3082a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
3083a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
3084a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
3085a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
3086a9e8ba2cSAlex Elder 	 */
3087a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3088a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
3089a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
3090a9e8ba2cSAlex Elder 		u64 xferred = 0;
3091a9e8ba2cSAlex Elder 
3092a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
3093a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
3094a9e8ba2cSAlex Elder 					obj_request->img_offset;
3095a9e8ba2cSAlex Elder 
309602c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
3097a9e8ba2cSAlex Elder 	} else {
309802c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
3099a9e8ba2cSAlex Elder 	}
3100a9e8ba2cSAlex Elder out:
31018b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
31028b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
31038b3e1a56SAlex Elder }
31048b3e1a56SAlex Elder 
31058b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
31068b3e1a56SAlex Elder {
31078b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
31088b3e1a56SAlex Elder 	int result;
31098b3e1a56SAlex Elder 
31108b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
31118b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
31128b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
31135b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
31148b3e1a56SAlex Elder 
31158b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
3116e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
31178b3e1a56SAlex Elder 						obj_request->img_offset,
3118e93f3152SAlex Elder 						obj_request->length);
31198b3e1a56SAlex Elder 	result = -ENOMEM;
31208b3e1a56SAlex Elder 	if (!img_request)
31218b3e1a56SAlex Elder 		goto out_err;
31228b3e1a56SAlex Elder 
31235b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
3124f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3125f1a4739fSAlex Elder 						obj_request->bio_list);
31265b2ab72dSAlex Elder 	else
31275b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
31285b2ab72dSAlex Elder 						obj_request->pages);
31298b3e1a56SAlex Elder 	if (result)
31308b3e1a56SAlex Elder 		goto out_err;
31318b3e1a56SAlex Elder 
31328b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
31338b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
31348b3e1a56SAlex Elder 	if (result)
31358b3e1a56SAlex Elder 		goto out_err;
31368b3e1a56SAlex Elder 
31378b3e1a56SAlex Elder 	return;
31388b3e1a56SAlex Elder out_err:
31398b3e1a56SAlex Elder 	if (img_request)
31408b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
31418b3e1a56SAlex Elder 	obj_request->result = result;
31428b3e1a56SAlex Elder 	obj_request->xferred = 0;
31438b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
31448b3e1a56SAlex Elder }
31458b3e1a56SAlex Elder 
3146ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3147ed95b21aSIlya Dryomov 
3148ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3149ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
3150ed95b21aSIlya Dryomov {
3151ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3152ed95b21aSIlya Dryomov }
3153ed95b21aSIlya Dryomov 
3154ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3155ed95b21aSIlya Dryomov {
3156ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
3157ed95b21aSIlya Dryomov 
3158ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3159ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3160ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
3161ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3162ed95b21aSIlya Dryomov 	return cid;
3163ed95b21aSIlya Dryomov }
3164ed95b21aSIlya Dryomov 
3165ed95b21aSIlya Dryomov /*
3166ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3167ed95b21aSIlya Dryomov  */
3168ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3169ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
3170ed95b21aSIlya Dryomov {
3171ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3172ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3173ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
3174ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
3175ed95b21aSIlya Dryomov }
3176ed95b21aSIlya Dryomov 
3177ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3178ed95b21aSIlya Dryomov {
3179ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
3180ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3181ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3182ed95b21aSIlya Dryomov }
3183ed95b21aSIlya Dryomov 
3184ed95b21aSIlya Dryomov /*
3185ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3186ed95b21aSIlya Dryomov  */
3187ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3188ed95b21aSIlya Dryomov {
3189ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3190ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3191ed95b21aSIlya Dryomov 	char cookie[32];
3192ed95b21aSIlya Dryomov 	int ret;
3193ed95b21aSIlya Dryomov 
3194ed95b21aSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev));
3195ed95b21aSIlya Dryomov 
3196ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3197ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3198ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3199ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
3200ed95b21aSIlya Dryomov 	if (ret)
3201ed95b21aSIlya Dryomov 		return ret;
3202ed95b21aSIlya Dryomov 
3203ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3204ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &cid);
3205ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3206ed95b21aSIlya Dryomov 	return 0;
3207ed95b21aSIlya Dryomov }
3208ed95b21aSIlya Dryomov 
3209ed95b21aSIlya Dryomov /*
3210ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3211ed95b21aSIlya Dryomov  */
3212ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev)
3213ed95b21aSIlya Dryomov {
3214ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3215ed95b21aSIlya Dryomov 	char cookie[32];
3216ed95b21aSIlya Dryomov 	int ret;
3217ed95b21aSIlya Dryomov 
3218ed95b21aSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev));
3219ed95b21aSIlya Dryomov 
3220ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3221ed95b21aSIlya Dryomov 
3222ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
3223ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3224ed95b21aSIlya Dryomov 			      RBD_LOCK_NAME, cookie);
3225ed95b21aSIlya Dryomov 	if (ret && ret != -ENOENT) {
3226ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
3227ed95b21aSIlya Dryomov 		return ret;
3228ed95b21aSIlya Dryomov 	}
3229ed95b21aSIlya Dryomov 
3230ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3231ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3232ed95b21aSIlya Dryomov 	return 0;
3233ed95b21aSIlya Dryomov }
3234ed95b21aSIlya Dryomov 
3235ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3236ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
3237ed95b21aSIlya Dryomov 				struct page ***preply_pages,
3238ed95b21aSIlya Dryomov 				size_t *preply_len)
3239ed95b21aSIlya Dryomov {
3240ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3241ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3242ed95b21aSIlya Dryomov 	int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3243ed95b21aSIlya Dryomov 	char buf[buf_size];
3244ed95b21aSIlya Dryomov 	void *p = buf;
3245ed95b21aSIlya Dryomov 
3246ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3247ed95b21aSIlya Dryomov 
3248ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3249ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3250ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
3251ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
3252ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
3253ed95b21aSIlya Dryomov 
3254ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3255ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
3256ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3257ed95b21aSIlya Dryomov }
3258ed95b21aSIlya Dryomov 
3259ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3260ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
3261ed95b21aSIlya Dryomov {
3262ed95b21aSIlya Dryomov 	struct page **reply_pages;
3263ed95b21aSIlya Dryomov 	size_t reply_len;
3264ed95b21aSIlya Dryomov 
3265ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3266ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3267ed95b21aSIlya Dryomov }
3268ed95b21aSIlya Dryomov 
3269ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3270ed95b21aSIlya Dryomov {
3271ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3272ed95b21aSIlya Dryomov 						  acquired_lock_work);
3273ed95b21aSIlya Dryomov 
3274ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3275ed95b21aSIlya Dryomov }
3276ed95b21aSIlya Dryomov 
3277ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3278ed95b21aSIlya Dryomov {
3279ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3280ed95b21aSIlya Dryomov 						  released_lock_work);
3281ed95b21aSIlya Dryomov 
3282ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3283ed95b21aSIlya Dryomov }
3284ed95b21aSIlya Dryomov 
3285ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3286ed95b21aSIlya Dryomov {
3287ed95b21aSIlya Dryomov 	struct page **reply_pages;
3288ed95b21aSIlya Dryomov 	size_t reply_len;
3289ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
3290ed95b21aSIlya Dryomov 	int ret;
3291ed95b21aSIlya Dryomov 
3292ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3293ed95b21aSIlya Dryomov 
3294ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3295ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
3296ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
3297ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3298ed95b21aSIlya Dryomov 		goto out;
3299ed95b21aSIlya Dryomov 	}
3300ed95b21aSIlya Dryomov 
3301ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3302ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
3303ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
3304ed95b21aSIlya Dryomov 		u32 n;
3305ed95b21aSIlya Dryomov 
3306ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3307ed95b21aSIlya Dryomov 		while (n--) {
3308ed95b21aSIlya Dryomov 			u8 struct_v;
3309ed95b21aSIlya Dryomov 			u32 len;
3310ed95b21aSIlya Dryomov 
3311ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3312ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
3313ed95b21aSIlya Dryomov 
3314ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
3315ed95b21aSIlya Dryomov 			if (!len)
3316ed95b21aSIlya Dryomov 				continue;
3317ed95b21aSIlya Dryomov 
3318ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
3319ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3320ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
3321ed95b21aSIlya Dryomov 				ret = -EIO;
3322ed95b21aSIlya Dryomov 				goto out;
3323ed95b21aSIlya Dryomov 			}
3324ed95b21aSIlya Dryomov 
3325ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3326ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3327ed95b21aSIlya Dryomov 						  &struct_v, &len);
3328ed95b21aSIlya Dryomov 			if (ret) {
3329ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3330ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3331ed95b21aSIlya Dryomov 					 ret);
3332ed95b21aSIlya Dryomov 				goto e_inval;
3333ed95b21aSIlya Dryomov 			}
3334ed95b21aSIlya Dryomov 
3335ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3336ed95b21aSIlya Dryomov 		}
3337ed95b21aSIlya Dryomov 	}
3338ed95b21aSIlya Dryomov 
3339ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3340ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3341ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3342ed95b21aSIlya Dryomov 	}
3343ed95b21aSIlya Dryomov 
3344ed95b21aSIlya Dryomov out:
3345ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3346ed95b21aSIlya Dryomov 	return ret;
3347ed95b21aSIlya Dryomov 
3348ed95b21aSIlya Dryomov e_inval:
3349ed95b21aSIlya Dryomov 	ret = -EINVAL;
3350ed95b21aSIlya Dryomov 	goto out;
3351ed95b21aSIlya Dryomov }
3352ed95b21aSIlya Dryomov 
3353ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3354ed95b21aSIlya Dryomov {
3355ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3356ed95b21aSIlya Dryomov 
3357ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3358ed95b21aSIlya Dryomov 	if (wake_all)
3359ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3360ed95b21aSIlya Dryomov 	else
3361ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3362ed95b21aSIlya Dryomov }
3363ed95b21aSIlya Dryomov 
3364ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3365ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3366ed95b21aSIlya Dryomov {
3367ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3368ed95b21aSIlya Dryomov 	u8 lock_type;
3369ed95b21aSIlya Dryomov 	char *lock_tag;
3370ed95b21aSIlya Dryomov 	int ret;
3371ed95b21aSIlya Dryomov 
3372ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3373ed95b21aSIlya Dryomov 
3374ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3375ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3376ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3377ed95b21aSIlya Dryomov 	if (ret)
3378ed95b21aSIlya Dryomov 		return ret;
3379ed95b21aSIlya Dryomov 
3380ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3381ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3382ed95b21aSIlya Dryomov 		goto out;
3383ed95b21aSIlya Dryomov 	}
3384ed95b21aSIlya Dryomov 
3385ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3386ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3387ed95b21aSIlya Dryomov 			 lock_tag);
3388ed95b21aSIlya Dryomov 		ret = -EBUSY;
3389ed95b21aSIlya Dryomov 		goto out;
3390ed95b21aSIlya Dryomov 	}
3391ed95b21aSIlya Dryomov 
3392ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3393ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3394ed95b21aSIlya Dryomov 		ret = -EBUSY;
3395ed95b21aSIlya Dryomov 		goto out;
3396ed95b21aSIlya Dryomov 	}
3397ed95b21aSIlya Dryomov 
3398ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3399ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3400ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3401ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3402ed95b21aSIlya Dryomov 		ret = -EBUSY;
3403ed95b21aSIlya Dryomov 		goto out;
3404ed95b21aSIlya Dryomov 	}
3405ed95b21aSIlya Dryomov 
3406ed95b21aSIlya Dryomov out:
3407ed95b21aSIlya Dryomov 	kfree(lock_tag);
3408ed95b21aSIlya Dryomov 	return ret;
3409ed95b21aSIlya Dryomov }
3410ed95b21aSIlya Dryomov 
3411ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3412ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3413ed95b21aSIlya Dryomov {
3414ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3415ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3416ed95b21aSIlya Dryomov 	u32 num_watchers;
3417ed95b21aSIlya Dryomov 	u64 cookie;
3418ed95b21aSIlya Dryomov 	int i;
3419ed95b21aSIlya Dryomov 	int ret;
3420ed95b21aSIlya Dryomov 
3421ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3422ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3423ed95b21aSIlya Dryomov 				      &num_watchers);
3424ed95b21aSIlya Dryomov 	if (ret)
3425ed95b21aSIlya Dryomov 		return ret;
3426ed95b21aSIlya Dryomov 
3427ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3428ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3429ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3430ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3431ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3432ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3433ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3434ed95b21aSIlya Dryomov 				.handle = cookie,
3435ed95b21aSIlya Dryomov 			};
3436ed95b21aSIlya Dryomov 
3437ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3438ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3439ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3440ed95b21aSIlya Dryomov 			ret = 1;
3441ed95b21aSIlya Dryomov 			goto out;
3442ed95b21aSIlya Dryomov 		}
3443ed95b21aSIlya Dryomov 	}
3444ed95b21aSIlya Dryomov 
3445ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3446ed95b21aSIlya Dryomov 	ret = 0;
3447ed95b21aSIlya Dryomov out:
3448ed95b21aSIlya Dryomov 	kfree(watchers);
3449ed95b21aSIlya Dryomov 	return ret;
3450ed95b21aSIlya Dryomov }
3451ed95b21aSIlya Dryomov 
3452ed95b21aSIlya Dryomov /*
3453ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3454ed95b21aSIlya Dryomov  */
3455ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3456ed95b21aSIlya Dryomov {
3457ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3458ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3459ed95b21aSIlya Dryomov 	u32 num_lockers;
3460ed95b21aSIlya Dryomov 	int ret;
3461ed95b21aSIlya Dryomov 
3462ed95b21aSIlya Dryomov 	for (;;) {
3463ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3464ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3465ed95b21aSIlya Dryomov 			return ret;
3466ed95b21aSIlya Dryomov 
3467ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3468ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3469ed95b21aSIlya Dryomov 		if (ret)
3470ed95b21aSIlya Dryomov 			return ret;
3471ed95b21aSIlya Dryomov 
3472ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3473ed95b21aSIlya Dryomov 			goto again;
3474ed95b21aSIlya Dryomov 
3475ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3476ed95b21aSIlya Dryomov 		if (ret) {
3477ed95b21aSIlya Dryomov 			if (ret > 0)
3478ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3479ed95b21aSIlya Dryomov 			goto out;
3480ed95b21aSIlya Dryomov 		}
3481ed95b21aSIlya Dryomov 
3482ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3483ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3484ed95b21aSIlya Dryomov 
3485ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3486ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3487ed95b21aSIlya Dryomov 		if (ret) {
3488ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3489ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3490ed95b21aSIlya Dryomov 			goto out;
3491ed95b21aSIlya Dryomov 		}
3492ed95b21aSIlya Dryomov 
3493ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3494ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3495ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3496ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3497ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3498ed95b21aSIlya Dryomov 			goto out;
3499ed95b21aSIlya Dryomov 
3500ed95b21aSIlya Dryomov again:
3501ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3502ed95b21aSIlya Dryomov 	}
3503ed95b21aSIlya Dryomov 
3504ed95b21aSIlya Dryomov out:
3505ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3506ed95b21aSIlya Dryomov 	return ret;
3507ed95b21aSIlya Dryomov }
3508ed95b21aSIlya Dryomov 
3509ed95b21aSIlya Dryomov /*
3510ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3511ed95b21aSIlya Dryomov  */
3512ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3513ed95b21aSIlya Dryomov 						int *pret)
3514ed95b21aSIlya Dryomov {
3515ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3516ed95b21aSIlya Dryomov 
3517ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3518ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3519ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3520ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3521ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3522ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3523ed95b21aSIlya Dryomov 		return lock_state;
3524ed95b21aSIlya Dryomov 	}
3525ed95b21aSIlya Dryomov 
3526ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3527ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3528ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3529ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3530ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3531ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3532ed95b21aSIlya Dryomov 		if (*pret)
3533ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3534ed95b21aSIlya Dryomov 	}
3535ed95b21aSIlya Dryomov 
3536ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3537ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3538ed95b21aSIlya Dryomov 	return lock_state;
3539ed95b21aSIlya Dryomov }
3540ed95b21aSIlya Dryomov 
3541ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3542ed95b21aSIlya Dryomov {
3543ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3544ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3545ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3546ed95b21aSIlya Dryomov 	int ret;
3547ed95b21aSIlya Dryomov 
3548ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3549ed95b21aSIlya Dryomov again:
3550ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3551ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3552ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3553ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3554ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3555ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3556ed95b21aSIlya Dryomov 		return;
3557ed95b21aSIlya Dryomov 	}
3558ed95b21aSIlya Dryomov 
3559ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3560ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3561ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3562ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3563ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3564ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3565ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3566ed95b21aSIlya Dryomov 	} else {
3567ed95b21aSIlya Dryomov 		/*
3568ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3569ed95b21aSIlya Dryomov 		 * release the lock
3570ed95b21aSIlya Dryomov 		 */
3571ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3572ed95b21aSIlya Dryomov 		     rbd_dev);
3573ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3574ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3575ed95b21aSIlya Dryomov 	}
3576ed95b21aSIlya Dryomov }
3577ed95b21aSIlya Dryomov 
3578ed95b21aSIlya Dryomov /*
3579ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3580ed95b21aSIlya Dryomov  */
3581ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3582ed95b21aSIlya Dryomov {
3583ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3584ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3585ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3586ed95b21aSIlya Dryomov 		return false;
3587ed95b21aSIlya Dryomov 
3588ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3589ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3590ed95b21aSIlya Dryomov 	/*
3591ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3592ed95b21aSIlya Dryomov 	 *
3593ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3594ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3595ed95b21aSIlya Dryomov 	 */
3596ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3597ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3598ed95b21aSIlya Dryomov 
3599ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3600ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3601ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3602ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3603ed95b21aSIlya Dryomov 		return false;
3604ed95b21aSIlya Dryomov 
3605ed95b21aSIlya Dryomov 	if (!rbd_unlock(rbd_dev))
3606ed95b21aSIlya Dryomov 		/*
3607ed95b21aSIlya Dryomov 		 * Give others a chance to grab the lock - we would re-acquire
3608ed95b21aSIlya Dryomov 		 * almost immediately if we got new IO during ceph_osdc_sync()
3609ed95b21aSIlya Dryomov 		 * otherwise.  We need to ack our own notifications, so this
3610ed95b21aSIlya Dryomov 		 * lock_dwork will be requeued from rbd_wait_state_locked()
3611ed95b21aSIlya Dryomov 		 * after wake_requests() in rbd_handle_released_lock().
3612ed95b21aSIlya Dryomov 		 */
3613ed95b21aSIlya Dryomov 		cancel_delayed_work(&rbd_dev->lock_dwork);
3614ed95b21aSIlya Dryomov 
3615ed95b21aSIlya Dryomov 	return true;
3616ed95b21aSIlya Dryomov }
3617ed95b21aSIlya Dryomov 
3618ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3619ed95b21aSIlya Dryomov {
3620ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3621ed95b21aSIlya Dryomov 						  unlock_work);
3622ed95b21aSIlya Dryomov 
3623ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3624ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3625ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3626ed95b21aSIlya Dryomov }
3627ed95b21aSIlya Dryomov 
3628ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3629ed95b21aSIlya Dryomov 				     void **p)
3630ed95b21aSIlya Dryomov {
3631ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3632ed95b21aSIlya Dryomov 
3633ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3634ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3635ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3636ed95b21aSIlya Dryomov 	}
3637ed95b21aSIlya Dryomov 
3638ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3639ed95b21aSIlya Dryomov 	     cid.handle);
3640ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3641ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3642ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3643ed95b21aSIlya Dryomov 			/*
3644ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3645ed95b21aSIlya Dryomov 			 * the owner
3646ed95b21aSIlya Dryomov 			 */
3647ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3648ed95b21aSIlya Dryomov 			return;
3649ed95b21aSIlya Dryomov 		}
3650ed95b21aSIlya Dryomov 
3651ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3652ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3653ed95b21aSIlya Dryomov 	} else {
3654ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3655ed95b21aSIlya Dryomov 	}
3656ed95b21aSIlya Dryomov 
3657ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3658ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3659ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3660ed95b21aSIlya Dryomov }
3661ed95b21aSIlya Dryomov 
3662ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3663ed95b21aSIlya Dryomov 				     void **p)
3664ed95b21aSIlya Dryomov {
3665ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3666ed95b21aSIlya Dryomov 
3667ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3668ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3669ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3670ed95b21aSIlya Dryomov 	}
3671ed95b21aSIlya Dryomov 
3672ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3673ed95b21aSIlya Dryomov 	     cid.handle);
3674ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3675ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3676ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3677ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3678ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3679ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3680ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3681ed95b21aSIlya Dryomov 			return;
3682ed95b21aSIlya Dryomov 		}
3683ed95b21aSIlya Dryomov 
3684ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3685ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3686ed95b21aSIlya Dryomov 	} else {
3687ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3688ed95b21aSIlya Dryomov 	}
3689ed95b21aSIlya Dryomov 
3690ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3691ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3692ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3693ed95b21aSIlya Dryomov }
3694ed95b21aSIlya Dryomov 
3695ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3696ed95b21aSIlya Dryomov 				    void **p)
3697ed95b21aSIlya Dryomov {
3698ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3699ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3700ed95b21aSIlya Dryomov 	bool need_to_send;
3701ed95b21aSIlya Dryomov 
3702ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3703ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3704ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3705ed95b21aSIlya Dryomov 	}
3706ed95b21aSIlya Dryomov 
3707ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3708ed95b21aSIlya Dryomov 	     cid.handle);
3709ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
3710ed95b21aSIlya Dryomov 		return false;
3711ed95b21aSIlya Dryomov 
3712ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3713ed95b21aSIlya Dryomov 	need_to_send = __rbd_is_lock_owner(rbd_dev);
3714ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3715ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3716ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3717ed95b21aSIlya Dryomov 			     rbd_dev);
3718ed95b21aSIlya Dryomov 			queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3719ed95b21aSIlya Dryomov 		}
3720ed95b21aSIlya Dryomov 	}
3721ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3722ed95b21aSIlya Dryomov 	return need_to_send;
3723ed95b21aSIlya Dryomov }
3724ed95b21aSIlya Dryomov 
3725ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3726ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3727ed95b21aSIlya Dryomov {
3728ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3729ed95b21aSIlya Dryomov 	int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3730ed95b21aSIlya Dryomov 	char buf[buf_size];
3731ed95b21aSIlya Dryomov 	int ret;
3732ed95b21aSIlya Dryomov 
3733ed95b21aSIlya Dryomov 	if (result) {
3734ed95b21aSIlya Dryomov 		void *p = buf;
3735ed95b21aSIlya Dryomov 
3736ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3737ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3738ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3739ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3740ed95b21aSIlya Dryomov 	} else {
3741ed95b21aSIlya Dryomov 		buf_size = 0;
3742ed95b21aSIlya Dryomov 	}
3743ed95b21aSIlya Dryomov 
3744ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3745ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3746ed95b21aSIlya Dryomov 				   buf, buf_size);
3747ed95b21aSIlya Dryomov 	if (ret)
3748ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3749ed95b21aSIlya Dryomov }
3750ed95b21aSIlya Dryomov 
3751ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3752ed95b21aSIlya Dryomov 				   u64 cookie)
3753ed95b21aSIlya Dryomov {
3754ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3755ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3756ed95b21aSIlya Dryomov }
3757ed95b21aSIlya Dryomov 
3758ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3759ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3760ed95b21aSIlya Dryomov {
3761ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3762ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3763ed95b21aSIlya Dryomov }
3764ed95b21aSIlya Dryomov 
3765922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3766922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3767b8d70035SAlex Elder {
3768922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3769ed95b21aSIlya Dryomov 	void *p = data;
3770ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3771ed95b21aSIlya Dryomov 	u8 struct_v;
3772ed95b21aSIlya Dryomov 	u32 len;
3773ed95b21aSIlya Dryomov 	u32 notify_op;
3774b8d70035SAlex Elder 	int ret;
3775b8d70035SAlex Elder 
3776ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3777ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3778ed95b21aSIlya Dryomov 	if (data_len) {
3779ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3780ed95b21aSIlya Dryomov 					  &struct_v, &len);
3781ed95b21aSIlya Dryomov 		if (ret) {
3782ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3783ed95b21aSIlya Dryomov 				 ret);
3784ed95b21aSIlya Dryomov 			return;
3785ed95b21aSIlya Dryomov 		}
378652bb1f9bSIlya Dryomov 
3787ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3788ed95b21aSIlya Dryomov 	} else {
3789ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3790ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3791ed95b21aSIlya Dryomov 		len = 0;
3792ed95b21aSIlya Dryomov 	}
3793ed95b21aSIlya Dryomov 
3794ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3795ed95b21aSIlya Dryomov 	switch (notify_op) {
3796ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3797ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3798ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3799ed95b21aSIlya Dryomov 		break;
3800ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3801ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3802ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3803ed95b21aSIlya Dryomov 		break;
3804ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
3805ed95b21aSIlya Dryomov 		if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
380652bb1f9bSIlya Dryomov 			/*
3807ed95b21aSIlya Dryomov 			 * send ResponseMessage(0) back so the client
3808ed95b21aSIlya Dryomov 			 * can detect a missing owner
380952bb1f9bSIlya Dryomov 			 */
3810ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3811ed95b21aSIlya Dryomov 						      cookie, 0);
3812ed95b21aSIlya Dryomov 		else
3813ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3814ed95b21aSIlya Dryomov 		break;
3815ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3816e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3817e627db08SAlex Elder 		if (ret)
38189584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3819b8d70035SAlex Elder 
3820ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3821ed95b21aSIlya Dryomov 		break;
3822ed95b21aSIlya Dryomov 	default:
3823ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3824ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3825ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3826ed95b21aSIlya Dryomov 		else
3827ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3828ed95b21aSIlya Dryomov 		break;
3829ed95b21aSIlya Dryomov 	}
3830b8d70035SAlex Elder }
3831b8d70035SAlex Elder 
383299d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
383399d16943SIlya Dryomov 
3834922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3835bb040aa0SIlya Dryomov {
3836922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3837bb040aa0SIlya Dryomov 
3838922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3839bb040aa0SIlya Dryomov 
3840ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3841ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3842ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3843ed95b21aSIlya Dryomov 
384499d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
384599d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
384699d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
384799d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3848bb040aa0SIlya Dryomov 
384999d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3850bb040aa0SIlya Dryomov 	}
385199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3852bb040aa0SIlya Dryomov }
3853bb040aa0SIlya Dryomov 
3854bb040aa0SIlya Dryomov /*
385599d16943SIlya Dryomov  * watch_mutex must be locked
38569969ebc5SAlex Elder  */
385799d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
38589969ebc5SAlex Elder {
38599969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3860922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
38619969ebc5SAlex Elder 
3862922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
386399d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
38649969ebc5SAlex Elder 
3865922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3866922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3867922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3868922dab61SIlya Dryomov 	if (IS_ERR(handle))
3869922dab61SIlya Dryomov 		return PTR_ERR(handle);
38709969ebc5SAlex Elder 
3871922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
38728eb87565SAlex Elder 	return 0;
38739969ebc5SAlex Elder }
38749969ebc5SAlex Elder 
387599d16943SIlya Dryomov /*
387699d16943SIlya Dryomov  * watch_mutex must be locked
387799d16943SIlya Dryomov  */
387899d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3879fca27065SIlya Dryomov {
3880922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3881922dab61SIlya Dryomov 	int ret;
3882b30a01f2SIlya Dryomov 
388399d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
388499d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3885b30a01f2SIlya Dryomov 
3886922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3887922dab61SIlya Dryomov 	if (ret)
3888922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3889b30a01f2SIlya Dryomov 
3890922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3891c525f036SIlya Dryomov }
3892c525f036SIlya Dryomov 
389399d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3894c525f036SIlya Dryomov {
389599d16943SIlya Dryomov 	int ret;
3896811c6688SIlya Dryomov 
389799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
389899d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
389999d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
390099d16943SIlya Dryomov 	if (ret)
390199d16943SIlya Dryomov 		goto out;
390299d16943SIlya Dryomov 
390399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
390499d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
390599d16943SIlya Dryomov 
390699d16943SIlya Dryomov out:
390799d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
390899d16943SIlya Dryomov 	return ret;
390999d16943SIlya Dryomov }
391099d16943SIlya Dryomov 
391199d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
391299d16943SIlya Dryomov {
391399d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
391499d16943SIlya Dryomov 
391599d16943SIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3916ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3917ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3918ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3919ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
392099d16943SIlya Dryomov }
392199d16943SIlya Dryomov 
392299d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
392399d16943SIlya Dryomov {
3924ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
392599d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
392699d16943SIlya Dryomov 
392799d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
392899d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
392999d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
393099d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
393199d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
393299d16943SIlya Dryomov 
3933811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3934fca27065SIlya Dryomov }
3935fca27065SIlya Dryomov 
393699d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
393799d16943SIlya Dryomov {
393899d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
393999d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
3940ed95b21aSIlya Dryomov 	bool was_lock_owner = false;
394199d16943SIlya Dryomov 	int ret;
394299d16943SIlya Dryomov 
394399d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
394499d16943SIlya Dryomov 
3945ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3946ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3947ed95b21aSIlya Dryomov 		was_lock_owner = rbd_release_lock(rbd_dev);
3948ed95b21aSIlya Dryomov 
394999d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
395099d16943SIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR)
395199d16943SIlya Dryomov 		goto fail_unlock;
395299d16943SIlya Dryomov 
395399d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
395499d16943SIlya Dryomov 	if (ret) {
395599d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
395699d16943SIlya Dryomov 		if (ret != -EBLACKLISTED)
395799d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
395899d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
395999d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
396099d16943SIlya Dryomov 		goto fail_unlock;
396199d16943SIlya Dryomov 	}
396299d16943SIlya Dryomov 
396399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
396499d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
396599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
396699d16943SIlya Dryomov 
396799d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
396899d16943SIlya Dryomov 	if (ret)
396999d16943SIlya Dryomov 		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
397099d16943SIlya Dryomov 
3971ed95b21aSIlya Dryomov 	if (was_lock_owner) {
3972ed95b21aSIlya Dryomov 		ret = rbd_try_lock(rbd_dev);
3973ed95b21aSIlya Dryomov 		if (ret)
3974ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "reregisteration lock failed: %d",
3975ed95b21aSIlya Dryomov 				 ret);
3976ed95b21aSIlya Dryomov 	}
3977ed95b21aSIlya Dryomov 
3978ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3979ed95b21aSIlya Dryomov 	wake_requests(rbd_dev, true);
398099d16943SIlya Dryomov 	return;
398199d16943SIlya Dryomov 
398299d16943SIlya Dryomov fail_unlock:
398399d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3984ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
398599d16943SIlya Dryomov }
398699d16943SIlya Dryomov 
398736be9a76SAlex Elder /*
3988f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3989f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
399036be9a76SAlex Elder  */
399136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
399236be9a76SAlex Elder 			     const char *object_name,
399336be9a76SAlex Elder 			     const char *class_name,
399436be9a76SAlex Elder 			     const char *method_name,
39954157976bSAlex Elder 			     const void *outbound,
399636be9a76SAlex Elder 			     size_t outbound_size,
39974157976bSAlex Elder 			     void *inbound,
3998e2a58ee5SAlex Elder 			     size_t inbound_size)
399936be9a76SAlex Elder {
40002169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
400136be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
400236be9a76SAlex Elder 	struct page **pages;
400336be9a76SAlex Elder 	u32 page_count;
400436be9a76SAlex Elder 	int ret;
400536be9a76SAlex Elder 
400636be9a76SAlex Elder 	/*
40076010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
40086010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
40096010a451SAlex Elder 	 * also supply outbound data--parameters for the object
40106010a451SAlex Elder 	 * method.  Currently if this is present it will be a
40116010a451SAlex Elder 	 * snapshot id.
401236be9a76SAlex Elder 	 */
401336be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
401436be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
401536be9a76SAlex Elder 	if (IS_ERR(pages))
401636be9a76SAlex Elder 		return PTR_ERR(pages);
401736be9a76SAlex Elder 
401836be9a76SAlex Elder 	ret = -ENOMEM;
40196010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
402036be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
402136be9a76SAlex Elder 	if (!obj_request)
402236be9a76SAlex Elder 		goto out;
402336be9a76SAlex Elder 
402436be9a76SAlex Elder 	obj_request->pages = pages;
402536be9a76SAlex Elder 	obj_request->page_count = page_count;
402636be9a76SAlex Elder 
40276d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
4028deb236b3SIlya Dryomov 						  obj_request);
402936be9a76SAlex Elder 	if (!obj_request->osd_req)
403036be9a76SAlex Elder 		goto out;
403136be9a76SAlex Elder 
4032c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
403304017e29SAlex Elder 					class_name, method_name);
403404017e29SAlex Elder 	if (outbound_size) {
403504017e29SAlex Elder 		struct ceph_pagelist *pagelist;
403604017e29SAlex Elder 
403704017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
403804017e29SAlex Elder 		if (!pagelist)
403904017e29SAlex Elder 			goto out;
404004017e29SAlex Elder 
404104017e29SAlex Elder 		ceph_pagelist_init(pagelist);
404204017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
404304017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
404404017e29SAlex Elder 						pagelist);
404504017e29SAlex Elder 	}
4046a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
4047a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
404844cd188dSAlex Elder 					0, false, false);
40499d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
4050430c28c3SAlex Elder 
405136be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
405236be9a76SAlex Elder 	if (ret)
405336be9a76SAlex Elder 		goto out;
405436be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
405536be9a76SAlex Elder 	if (ret)
405636be9a76SAlex Elder 		goto out;
405736be9a76SAlex Elder 
405836be9a76SAlex Elder 	ret = obj_request->result;
405936be9a76SAlex Elder 	if (ret < 0)
406036be9a76SAlex Elder 		goto out;
406157385b51SAlex Elder 
406257385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
406357385b51SAlex Elder 	ret = (int)obj_request->xferred;
4064903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
406536be9a76SAlex Elder out:
406636be9a76SAlex Elder 	if (obj_request)
406736be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
406836be9a76SAlex Elder 	else
406936be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
407036be9a76SAlex Elder 
407136be9a76SAlex Elder 	return ret;
407236be9a76SAlex Elder }
407336be9a76SAlex Elder 
4074ed95b21aSIlya Dryomov /*
4075ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
4076ed95b21aSIlya Dryomov  */
4077ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
4078ed95b21aSIlya Dryomov {
4079ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
4080ed95b21aSIlya Dryomov 
4081ed95b21aSIlya Dryomov 	do {
4082ed95b21aSIlya Dryomov 		/*
4083ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
4084ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
4085ed95b21aSIlya Dryomov 		 */
4086ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
4087ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4088ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
4089ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
4090ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4091ed95b21aSIlya Dryomov 		schedule();
4092ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4093ed95b21aSIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
4094ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
4095ed95b21aSIlya Dryomov }
4096ed95b21aSIlya Dryomov 
40977ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4098bc1ecc65SIlya Dryomov {
40997ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
41007ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
4101bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
41024e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
4103bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4104bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
41056d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
41064e752f0aSJosh Durgin 	u64 mapping_size;
4107ed95b21aSIlya Dryomov 	bool must_be_locked = false;
4108bc1ecc65SIlya Dryomov 	int result;
4109bc1ecc65SIlya Dryomov 
41107ad18afaSChristoph Hellwig 	if (rq->cmd_type != REQ_TYPE_FS) {
41117ad18afaSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__,
41127ad18afaSChristoph Hellwig 			(int) rq->cmd_type);
41137ad18afaSChristoph Hellwig 		result = -EIO;
41147ad18afaSChristoph Hellwig 		goto err;
41157ad18afaSChristoph Hellwig 	}
41167ad18afaSChristoph Hellwig 
4117c2df40dfSMike Christie 	if (req_op(rq) == REQ_OP_DISCARD)
411890e98c52SGuangliang Zhao 		op_type = OBJ_OP_DISCARD;
4119c2df40dfSMike Christie 	else if (req_op(rq) == REQ_OP_WRITE)
41206d2940c8SGuangliang Zhao 		op_type = OBJ_OP_WRITE;
41216d2940c8SGuangliang Zhao 	else
41226d2940c8SGuangliang Zhao 		op_type = OBJ_OP_READ;
41236d2940c8SGuangliang Zhao 
4124bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
4125bc1ecc65SIlya Dryomov 
4126bc1ecc65SIlya Dryomov 	if (!length) {
4127bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
4128bc1ecc65SIlya Dryomov 		result = 0;
4129bc1ecc65SIlya Dryomov 		goto err_rq;
4130bc1ecc65SIlya Dryomov 	}
4131bc1ecc65SIlya Dryomov 
41326d2940c8SGuangliang Zhao 	/* Only reads are allowed to a read-only device */
4133bc1ecc65SIlya Dryomov 
41346d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
4135bc1ecc65SIlya Dryomov 		if (rbd_dev->mapping.read_only) {
4136bc1ecc65SIlya Dryomov 			result = -EROFS;
4137bc1ecc65SIlya Dryomov 			goto err_rq;
4138bc1ecc65SIlya Dryomov 		}
4139bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
4140bc1ecc65SIlya Dryomov 	}
4141bc1ecc65SIlya Dryomov 
4142bc1ecc65SIlya Dryomov 	/*
4143bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
4144bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
4145bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
4146bc1ecc65SIlya Dryomov 	 * sending it if we already know.
4147bc1ecc65SIlya Dryomov 	 */
4148bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4149bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
4150bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4151bc1ecc65SIlya Dryomov 		result = -ENXIO;
4152bc1ecc65SIlya Dryomov 		goto err_rq;
4153bc1ecc65SIlya Dryomov 	}
4154bc1ecc65SIlya Dryomov 
4155bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
4156bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4157bc1ecc65SIlya Dryomov 			 length);
4158bc1ecc65SIlya Dryomov 		result = -EINVAL;
4159bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
4160bc1ecc65SIlya Dryomov 	}
4161bc1ecc65SIlya Dryomov 
41627ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
41637ad18afaSChristoph Hellwig 
41644e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
41654e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
41666d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
41674e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
41684e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
4169ed95b21aSIlya Dryomov 		must_be_locked = rbd_is_lock_supported(rbd_dev);
41704e752f0aSJosh Durgin 	}
41714e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
41724e752f0aSJosh Durgin 
41734e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
4174bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
41754e752f0aSJosh Durgin 			 length, mapping_size);
4176bc1ecc65SIlya Dryomov 		result = -EIO;
4177bc1ecc65SIlya Dryomov 		goto err_rq;
4178bc1ecc65SIlya Dryomov 	}
4179bc1ecc65SIlya Dryomov 
4180ed95b21aSIlya Dryomov 	if (must_be_locked) {
4181ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
4182ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4183ed95b21aSIlya Dryomov 			rbd_wait_state_locked(rbd_dev);
4184ed95b21aSIlya Dryomov 	}
4185ed95b21aSIlya Dryomov 
41866d2940c8SGuangliang Zhao 	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
41874e752f0aSJosh Durgin 					     snapc);
4188bc1ecc65SIlya Dryomov 	if (!img_request) {
4189bc1ecc65SIlya Dryomov 		result = -ENOMEM;
4190ed95b21aSIlya Dryomov 		goto err_unlock;
4191bc1ecc65SIlya Dryomov 	}
4192bc1ecc65SIlya Dryomov 	img_request->rq = rq;
419370b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
4194bc1ecc65SIlya Dryomov 
419590e98c52SGuangliang Zhao 	if (op_type == OBJ_OP_DISCARD)
419690e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
419790e98c52SGuangliang Zhao 					      NULL);
419890e98c52SGuangliang Zhao 	else
419990e98c52SGuangliang Zhao 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
420090e98c52SGuangliang Zhao 					      rq->bio);
4201bc1ecc65SIlya Dryomov 	if (result)
4202bc1ecc65SIlya Dryomov 		goto err_img_request;
4203bc1ecc65SIlya Dryomov 
4204bc1ecc65SIlya Dryomov 	result = rbd_img_request_submit(img_request);
4205bc1ecc65SIlya Dryomov 	if (result)
4206bc1ecc65SIlya Dryomov 		goto err_img_request;
4207bc1ecc65SIlya Dryomov 
4208ed95b21aSIlya Dryomov 	if (must_be_locked)
4209ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4210bc1ecc65SIlya Dryomov 	return;
4211bc1ecc65SIlya Dryomov 
4212bc1ecc65SIlya Dryomov err_img_request:
4213bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
4214ed95b21aSIlya Dryomov err_unlock:
4215ed95b21aSIlya Dryomov 	if (must_be_locked)
4216ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
4217bc1ecc65SIlya Dryomov err_rq:
4218bc1ecc65SIlya Dryomov 	if (result)
4219bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
42206d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
42214e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
42227ad18afaSChristoph Hellwig err:
42237ad18afaSChristoph Hellwig 	blk_mq_end_request(rq, result);
4224bc1ecc65SIlya Dryomov }
4225bc1ecc65SIlya Dryomov 
42267ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
42277ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
4228bc1ecc65SIlya Dryomov {
42297ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
42307ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4231bc1ecc65SIlya Dryomov 
42327ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
42337ad18afaSChristoph Hellwig 	return BLK_MQ_RQ_QUEUE_OK;
4234bf0d5f50SAlex Elder }
4235bf0d5f50SAlex Elder 
4236602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4237602adf40SYehuda Sadeh {
4238602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
4239602adf40SYehuda Sadeh 
4240602adf40SYehuda Sadeh 	if (!disk)
4241602adf40SYehuda Sadeh 		return;
4242602adf40SYehuda Sadeh 
4243a0cab924SAlex Elder 	rbd_dev->disk = NULL;
4244a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
4245602adf40SYehuda Sadeh 		del_gendisk(disk);
4246602adf40SYehuda Sadeh 		if (disk->queue)
4247602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
42487ad18afaSChristoph Hellwig 		blk_mq_free_tag_set(&rbd_dev->tag_set);
4249a0cab924SAlex Elder 	}
4250602adf40SYehuda Sadeh 	put_disk(disk);
4251602adf40SYehuda Sadeh }
4252602adf40SYehuda Sadeh 
4253788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4254788e2df3SAlex Elder 				const char *object_name,
42557097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
4256788e2df3SAlex Elder 
4257788e2df3SAlex Elder {
42582169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4259788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
4260788e2df3SAlex Elder 	struct page **pages = NULL;
4261788e2df3SAlex Elder 	u32 page_count;
42621ceae7efSAlex Elder 	size_t size;
4263788e2df3SAlex Elder 	int ret;
4264788e2df3SAlex Elder 
4265788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
4266788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
4267788e2df3SAlex Elder 	if (IS_ERR(pages))
4268a8d42056SJan Kara 		return PTR_ERR(pages);
4269788e2df3SAlex Elder 
4270788e2df3SAlex Elder 	ret = -ENOMEM;
4271788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
4272788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
4273788e2df3SAlex Elder 	if (!obj_request)
4274788e2df3SAlex Elder 		goto out;
4275788e2df3SAlex Elder 
4276788e2df3SAlex Elder 	obj_request->pages = pages;
4277788e2df3SAlex Elder 	obj_request->page_count = page_count;
4278788e2df3SAlex Elder 
42796d2940c8SGuangliang Zhao 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
4280deb236b3SIlya Dryomov 						  obj_request);
4281788e2df3SAlex Elder 	if (!obj_request->osd_req)
4282788e2df3SAlex Elder 		goto out;
4283788e2df3SAlex Elder 
4284c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
4285c99d2d4aSAlex Elder 					offset, length, 0, 0);
4286406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
4287a4ce40a9SAlex Elder 					obj_request->pages,
428844cd188dSAlex Elder 					obj_request->length,
428944cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
429044cd188dSAlex Elder 					false, false);
42919d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
4292430c28c3SAlex Elder 
4293788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
4294788e2df3SAlex Elder 	if (ret)
4295788e2df3SAlex Elder 		goto out;
4296788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
4297788e2df3SAlex Elder 	if (ret)
4298788e2df3SAlex Elder 		goto out;
4299788e2df3SAlex Elder 
4300788e2df3SAlex Elder 	ret = obj_request->result;
4301788e2df3SAlex Elder 	if (ret < 0)
4302788e2df3SAlex Elder 		goto out;
43031ceae7efSAlex Elder 
43041ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
43051ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
4306903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
430723ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
430823ed6e13SAlex Elder 	ret = (int)size;
4309788e2df3SAlex Elder out:
4310788e2df3SAlex Elder 	if (obj_request)
4311788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
4312788e2df3SAlex Elder 	else
4313788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
4314788e2df3SAlex Elder 
4315788e2df3SAlex Elder 	return ret;
4316788e2df3SAlex Elder }
4317788e2df3SAlex Elder 
4318602adf40SYehuda Sadeh /*
4319662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4320662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4321662518b1SAlex Elder  * information about the image.
43224156d998SAlex Elder  */
432399a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
43244156d998SAlex Elder {
43254156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
43264156d998SAlex Elder 	u32 snap_count = 0;
43274156d998SAlex Elder 	u64 names_size = 0;
43284156d998SAlex Elder 	u32 want_count;
43294156d998SAlex Elder 	int ret;
43304156d998SAlex Elder 
43314156d998SAlex Elder 	/*
43324156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
43334156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
43344156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
43354156d998SAlex Elder 	 * the number of snapshots could change by the time we read
43364156d998SAlex Elder 	 * it in, in which case we re-read it.
43374156d998SAlex Elder 	 */
43384156d998SAlex Elder 	do {
43394156d998SAlex Elder 		size_t size;
43404156d998SAlex Elder 
43414156d998SAlex Elder 		kfree(ondisk);
43424156d998SAlex Elder 
43434156d998SAlex Elder 		size = sizeof (*ondisk);
43444156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
43454156d998SAlex Elder 		size += names_size;
43464156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
43474156d998SAlex Elder 		if (!ondisk)
4348662518b1SAlex Elder 			return -ENOMEM;
43494156d998SAlex Elder 
4350c41d13a3SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
43517097f8dfSAlex Elder 				       0, size, ondisk);
43524156d998SAlex Elder 		if (ret < 0)
4353662518b1SAlex Elder 			goto out;
4354c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
43554156d998SAlex Elder 			ret = -ENXIO;
435606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
435706ecc6cbSAlex Elder 				size, ret);
4358662518b1SAlex Elder 			goto out;
43594156d998SAlex Elder 		}
43604156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
43614156d998SAlex Elder 			ret = -ENXIO;
436206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4363662518b1SAlex Elder 			goto out;
43644156d998SAlex Elder 		}
43654156d998SAlex Elder 
43664156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
43674156d998SAlex Elder 		want_count = snap_count;
43684156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
43694156d998SAlex Elder 	} while (snap_count != want_count);
43704156d998SAlex Elder 
4371662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4372662518b1SAlex Elder out:
43734156d998SAlex Elder 	kfree(ondisk);
43744156d998SAlex Elder 
4375dfc5606dSYehuda Sadeh 	return ret;
4376602adf40SYehuda Sadeh }
4377602adf40SYehuda Sadeh 
437815228edeSAlex Elder /*
437915228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
438015228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
438115228edeSAlex Elder  */
438215228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
438315228edeSAlex Elder {
438415228edeSAlex Elder 	u64 snap_id;
438515228edeSAlex Elder 
438615228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
438715228edeSAlex Elder 		return;
438815228edeSAlex Elder 
438915228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
439015228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
439115228edeSAlex Elder 		return;
439215228edeSAlex Elder 
439315228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
439415228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
439515228edeSAlex Elder }
439615228edeSAlex Elder 
43979875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
43989875201eSJosh Durgin {
43999875201eSJosh Durgin 	sector_t size;
44009875201eSJosh Durgin 
44019875201eSJosh Durgin 	/*
4402811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4403811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4404811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
44059875201eSJosh Durgin 	 */
4406811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4407811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
44089875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
44099875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
44109875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
44119875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
44129875201eSJosh Durgin 	}
44139875201eSJosh Durgin }
44149875201eSJosh Durgin 
4415cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
44161fe5e993SAlex Elder {
4417e627db08SAlex Elder 	u64 mapping_size;
44181fe5e993SAlex Elder 	int ret;
44191fe5e993SAlex Elder 
4420cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
44213b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4422a720ae09SIlya Dryomov 
4423a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
442452bb1f9bSIlya Dryomov 	if (ret)
442573e39e4dSIlya Dryomov 		goto out;
442615228edeSAlex Elder 
4427e8f59b59SIlya Dryomov 	/*
4428e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4429e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4430e8f59b59SIlya Dryomov 	 */
4431e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4432e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4433e8f59b59SIlya Dryomov 		if (ret)
443473e39e4dSIlya Dryomov 			goto out;
4435e8f59b59SIlya Dryomov 	}
4436e8f59b59SIlya Dryomov 
44375ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
44385ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
44395ff1108cSIlya Dryomov 	} else {
44405ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
444115228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
44425ff1108cSIlya Dryomov 	}
44435ff1108cSIlya Dryomov 
444473e39e4dSIlya Dryomov out:
4445cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
444673e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
44479875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
44481fe5e993SAlex Elder 
444973e39e4dSIlya Dryomov 	return ret;
44501fe5e993SAlex Elder }
44511fe5e993SAlex Elder 
44527ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq,
44537ad18afaSChristoph Hellwig 		unsigned int hctx_idx, unsigned int request_idx,
44547ad18afaSChristoph Hellwig 		unsigned int numa_node)
44557ad18afaSChristoph Hellwig {
44567ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
44577ad18afaSChristoph Hellwig 
44587ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
44597ad18afaSChristoph Hellwig 	return 0;
44607ad18afaSChristoph Hellwig }
44617ad18afaSChristoph Hellwig 
44627ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = {
44637ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
44647ad18afaSChristoph Hellwig 	.map_queue	= blk_mq_map_queue,
44657ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
44667ad18afaSChristoph Hellwig };
44677ad18afaSChristoph Hellwig 
4468602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4469602adf40SYehuda Sadeh {
4470602adf40SYehuda Sadeh 	struct gendisk *disk;
4471602adf40SYehuda Sadeh 	struct request_queue *q;
4472593a9e7bSAlex Elder 	u64 segment_size;
44737ad18afaSChristoph Hellwig 	int err;
4474602adf40SYehuda Sadeh 
4475602adf40SYehuda Sadeh 	/* create gendisk info */
44767e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
44777e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
44787e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4479602adf40SYehuda Sadeh 	if (!disk)
44801fcdb8aaSAlex Elder 		return -ENOMEM;
4481602adf40SYehuda Sadeh 
4482f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4483de71a297SAlex Elder 		 rbd_dev->dev_id);
4484602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4485dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
44867e513d43SIlya Dryomov 	if (single_major)
44877e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4488602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4489602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4490602adf40SYehuda Sadeh 
44917ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
44927ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4493b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
44947ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4495b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
44967ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
44977ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
44987ad18afaSChristoph Hellwig 
44997ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
45007ad18afaSChristoph Hellwig 	if (err)
4501602adf40SYehuda Sadeh 		goto out_disk;
4502029bcbd8SJosh Durgin 
45037ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
45047ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
45057ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
45067ad18afaSChristoph Hellwig 		goto out_tag_set;
45077ad18afaSChristoph Hellwig 	}
45087ad18afaSChristoph Hellwig 
4509d8a2c89cSIlya Dryomov 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4510d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4511593a9e7bSAlex Elder 
4512029bcbd8SJosh Durgin 	/* set io sizes to object size */
4513593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
4514593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
45150d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
4516d3834fefSIlya Dryomov 	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
4517593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
4518593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
4519593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
4520029bcbd8SJosh Durgin 
452190e98c52SGuangliang Zhao 	/* enable the discard support */
452290e98c52SGuangliang Zhao 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
452390e98c52SGuangliang Zhao 	q->limits.discard_granularity = segment_size;
452490e98c52SGuangliang Zhao 	q->limits.discard_alignment = segment_size;
45252bb4cd5cSJens Axboe 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
4526b76f8239SJosh Durgin 	q->limits.discard_zeroes_data = 1;
452790e98c52SGuangliang Zhao 
4528bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4529bae818eeSRonny Hegewald 		q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
4530bae818eeSRonny Hegewald 
4531602adf40SYehuda Sadeh 	disk->queue = q;
4532602adf40SYehuda Sadeh 
4533602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4534602adf40SYehuda Sadeh 
4535602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4536602adf40SYehuda Sadeh 
4537602adf40SYehuda Sadeh 	return 0;
45387ad18afaSChristoph Hellwig out_tag_set:
45397ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4540602adf40SYehuda Sadeh out_disk:
4541602adf40SYehuda Sadeh 	put_disk(disk);
45427ad18afaSChristoph Hellwig 	return err;
4543602adf40SYehuda Sadeh }
4544602adf40SYehuda Sadeh 
4545dfc5606dSYehuda Sadeh /*
4546dfc5606dSYehuda Sadeh   sysfs
4547dfc5606dSYehuda Sadeh */
4548602adf40SYehuda Sadeh 
4549593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4550593a9e7bSAlex Elder {
4551593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4552593a9e7bSAlex Elder }
4553593a9e7bSAlex Elder 
4554dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4555dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4556602adf40SYehuda Sadeh {
4557593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4558dfc5606dSYehuda Sadeh 
4559fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4560fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4561602adf40SYehuda Sadeh }
4562602adf40SYehuda Sadeh 
456334b13184SAlex Elder /*
456434b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
456534b13184SAlex Elder  * necessarily the base image.
456634b13184SAlex Elder  */
456734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
456834b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
456934b13184SAlex Elder {
457034b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
457134b13184SAlex Elder 
457234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
457334b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
457434b13184SAlex Elder }
457534b13184SAlex Elder 
4576dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4577dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4578602adf40SYehuda Sadeh {
4579593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4580dfc5606dSYehuda Sadeh 
4581fc71d833SAlex Elder 	if (rbd_dev->major)
4582dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4583fc71d833SAlex Elder 
4584fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4585dd82fff1SIlya Dryomov }
4586fc71d833SAlex Elder 
4587dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4588dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4589dd82fff1SIlya Dryomov {
4590dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4591dd82fff1SIlya Dryomov 
4592dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4593dfc5606dSYehuda Sadeh }
4594dfc5606dSYehuda Sadeh 
4595dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4596dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4597dfc5606dSYehuda Sadeh {
4598593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4599dfc5606dSYehuda Sadeh 
46001dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4601033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4602dfc5606dSYehuda Sadeh }
4603dfc5606dSYehuda Sadeh 
4604dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4605dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4606dfc5606dSYehuda Sadeh {
4607593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4608dfc5606dSYehuda Sadeh 
46090d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4610dfc5606dSYehuda Sadeh }
4611dfc5606dSYehuda Sadeh 
46129bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
46139bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
46149bb2f334SAlex Elder {
46159bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
46169bb2f334SAlex Elder 
46170d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
46180d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
46199bb2f334SAlex Elder }
46209bb2f334SAlex Elder 
4621dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4622dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4623dfc5606dSYehuda Sadeh {
4624593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4625dfc5606dSYehuda Sadeh 
4626a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
46270d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4628a92ffdf8SAlex Elder 
4629a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4630dfc5606dSYehuda Sadeh }
4631dfc5606dSYehuda Sadeh 
4632589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4633589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4634589d30e0SAlex Elder {
4635589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4636589d30e0SAlex Elder 
46370d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4638589d30e0SAlex Elder }
4639589d30e0SAlex Elder 
464034b13184SAlex Elder /*
464134b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
464234b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
464334b13184SAlex Elder  */
4644dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4645dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4646dfc5606dSYehuda Sadeh 			     char *buf)
4647dfc5606dSYehuda Sadeh {
4648593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4649dfc5606dSYehuda Sadeh 
46500d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4651dfc5606dSYehuda Sadeh }
4652dfc5606dSYehuda Sadeh 
465386b00e0dSAlex Elder /*
4654ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4655ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4656ff96128fSIlya Dryomov  * image)".
465786b00e0dSAlex Elder  */
465886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
465986b00e0dSAlex Elder 			       struct device_attribute *attr,
466086b00e0dSAlex Elder 			       char *buf)
466186b00e0dSAlex Elder {
466286b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4663ff96128fSIlya Dryomov 	ssize_t count = 0;
466486b00e0dSAlex Elder 
4665ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
466686b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
466786b00e0dSAlex Elder 
4668ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4669ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
467086b00e0dSAlex Elder 
4671ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4672ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4673ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4674ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4675ff96128fSIlya Dryomov 			    "overlap %llu\n",
4676ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4677ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4678ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4679ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4680ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4681ff96128fSIlya Dryomov 	}
468286b00e0dSAlex Elder 
468386b00e0dSAlex Elder 	return count;
468486b00e0dSAlex Elder }
468586b00e0dSAlex Elder 
4686dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4687dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4688dfc5606dSYehuda Sadeh 				 const char *buf,
4689dfc5606dSYehuda Sadeh 				 size_t size)
4690dfc5606dSYehuda Sadeh {
4691593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4692b813623aSAlex Elder 	int ret;
4693602adf40SYehuda Sadeh 
4694cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4695e627db08SAlex Elder 	if (ret)
469652bb1f9bSIlya Dryomov 		return ret;
4697b813623aSAlex Elder 
469852bb1f9bSIlya Dryomov 	return size;
4699dfc5606dSYehuda Sadeh }
4700602adf40SYehuda Sadeh 
4701dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
470234b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
4703dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
4704dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
4705dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
4706dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
47079bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
4708dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
4709589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
4710dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4711dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
471286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
4713dfc5606dSYehuda Sadeh 
4714dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4715dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
471634b13184SAlex Elder 	&dev_attr_features.attr,
4717dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4718dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4719dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4720dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
47219bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4722dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4723589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4724dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
472586b00e0dSAlex Elder 	&dev_attr_parent.attr,
4726dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4727dfc5606dSYehuda Sadeh 	NULL
4728dfc5606dSYehuda Sadeh };
4729dfc5606dSYehuda Sadeh 
4730dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4731dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4732dfc5606dSYehuda Sadeh };
4733dfc5606dSYehuda Sadeh 
4734dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4735dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4736dfc5606dSYehuda Sadeh 	NULL
4737dfc5606dSYehuda Sadeh };
4738dfc5606dSYehuda Sadeh 
47396cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4740dfc5606dSYehuda Sadeh 
4741dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
4742dfc5606dSYehuda Sadeh 	.name		= "rbd",
4743dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
47446cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4745dfc5606dSYehuda Sadeh };
4746dfc5606dSYehuda Sadeh 
47478b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
47488b8fb99cSAlex Elder {
47498b8fb99cSAlex Elder 	kref_get(&spec->kref);
47508b8fb99cSAlex Elder 
47518b8fb99cSAlex Elder 	return spec;
47528b8fb99cSAlex Elder }
47538b8fb99cSAlex Elder 
47548b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
47558b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
47568b8fb99cSAlex Elder {
47578b8fb99cSAlex Elder 	if (spec)
47588b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
47598b8fb99cSAlex Elder }
47608b8fb99cSAlex Elder 
47618b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
47628b8fb99cSAlex Elder {
47638b8fb99cSAlex Elder 	struct rbd_spec *spec;
47648b8fb99cSAlex Elder 
47658b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
47668b8fb99cSAlex Elder 	if (!spec)
47678b8fb99cSAlex Elder 		return NULL;
476804077599SIlya Dryomov 
476904077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
477004077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
47718b8fb99cSAlex Elder 	kref_init(&spec->kref);
47728b8fb99cSAlex Elder 
47738b8fb99cSAlex Elder 	return spec;
47748b8fb99cSAlex Elder }
47758b8fb99cSAlex Elder 
47768b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
47778b8fb99cSAlex Elder {
47788b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
47798b8fb99cSAlex Elder 
47808b8fb99cSAlex Elder 	kfree(spec->pool_name);
47818b8fb99cSAlex Elder 	kfree(spec->image_id);
47828b8fb99cSAlex Elder 	kfree(spec->image_name);
47838b8fb99cSAlex Elder 	kfree(spec->snap_name);
47848b8fb99cSAlex Elder 	kfree(spec);
47858b8fb99cSAlex Elder }
47868b8fb99cSAlex Elder 
47871643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4788dd5ac32dSIlya Dryomov {
478999d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4790ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
479199d16943SIlya Dryomov 
4792c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
47936b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
4794c41d13a3SIlya Dryomov 
4795dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4796dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4797dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4798dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
47991643dfa4SIlya Dryomov }
48001643dfa4SIlya Dryomov 
48011643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
48021643dfa4SIlya Dryomov {
48031643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
48041643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
48051643dfa4SIlya Dryomov 
48061643dfa4SIlya Dryomov 	if (need_put) {
48071643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
48081643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
48091643dfa4SIlya Dryomov 	}
48101643dfa4SIlya Dryomov 
48111643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4812dd5ac32dSIlya Dryomov 
4813dd5ac32dSIlya Dryomov 	/*
4814dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4815dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4816dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4817dd5ac32dSIlya Dryomov 	 */
4818dd5ac32dSIlya Dryomov 	if (need_put)
4819dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4820dd5ac32dSIlya Dryomov }
4821dd5ac32dSIlya Dryomov 
48221643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
48231643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4824c53d5893SAlex Elder {
4825c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4826c53d5893SAlex Elder 
4827c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4828c53d5893SAlex Elder 	if (!rbd_dev)
4829c53d5893SAlex Elder 		return NULL;
4830c53d5893SAlex Elder 
4831c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4832c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4833c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4834c53d5893SAlex Elder 
4835c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4836922dab61SIlya Dryomov 	ceph_oloc_init(&rbd_dev->header_oloc);
4837c41d13a3SIlya Dryomov 
483899d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
483999d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
484099d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
484199d16943SIlya Dryomov 
4842ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4843ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4844ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4845ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4846ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4847ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4848ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4849ed95b21aSIlya Dryomov 
4850dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4851dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4852dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4853dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4854dd5ac32dSIlya Dryomov 
4855c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4856d147543dSIlya Dryomov 	rbd_dev->spec = spec;
48570903e875SAlex Elder 
48587627151eSYan, Zheng 	rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
48597627151eSYan, Zheng 	rbd_dev->layout.stripe_count = 1;
48607627151eSYan, Zheng 	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
48617627151eSYan, Zheng 	rbd_dev->layout.pool_id = spec->pool_id;
486230c156d9SYan, Zheng 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
48630903e875SAlex Elder 
48641643dfa4SIlya Dryomov 	return rbd_dev;
48651643dfa4SIlya Dryomov }
48661643dfa4SIlya Dryomov 
4867dd5ac32dSIlya Dryomov /*
48681643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4869dd5ac32dSIlya Dryomov  */
48701643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
48711643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
48721643dfa4SIlya Dryomov 					 struct rbd_options *opts)
48731643dfa4SIlya Dryomov {
48741643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
48751643dfa4SIlya Dryomov 
48761643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
48771643dfa4SIlya Dryomov 	if (!rbd_dev)
48781643dfa4SIlya Dryomov 		return NULL;
48791643dfa4SIlya Dryomov 
48801643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
48811643dfa4SIlya Dryomov 
48821643dfa4SIlya Dryomov 	/* get an id and fill in device name */
48831643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
48841643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
48851643dfa4SIlya Dryomov 					 GFP_KERNEL);
48861643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
48871643dfa4SIlya Dryomov 		goto fail_rbd_dev;
48881643dfa4SIlya Dryomov 
48891643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
48901643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
48911643dfa4SIlya Dryomov 						   rbd_dev->name);
48921643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
48931643dfa4SIlya Dryomov 		goto fail_dev_id;
48941643dfa4SIlya Dryomov 
48951643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4896dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4897dd5ac32dSIlya Dryomov 
48981643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4899c53d5893SAlex Elder 	return rbd_dev;
49001643dfa4SIlya Dryomov 
49011643dfa4SIlya Dryomov fail_dev_id:
49021643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
49031643dfa4SIlya Dryomov fail_rbd_dev:
49041643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
49051643dfa4SIlya Dryomov 	return NULL;
4906c53d5893SAlex Elder }
4907c53d5893SAlex Elder 
4908c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4909c53d5893SAlex Elder {
4910dd5ac32dSIlya Dryomov 	if (rbd_dev)
4911dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4912c53d5893SAlex Elder }
4913c53d5893SAlex Elder 
4914dfc5606dSYehuda Sadeh /*
49159d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
49169d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
49179d475de5SAlex Elder  * image.
49189d475de5SAlex Elder  */
49199d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
49209d475de5SAlex Elder 				u8 *order, u64 *snap_size)
49219d475de5SAlex Elder {
49229d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
49239d475de5SAlex Elder 	int ret;
49249d475de5SAlex Elder 	struct {
49259d475de5SAlex Elder 		u8 order;
49269d475de5SAlex Elder 		__le64 size;
49279d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
49289d475de5SAlex Elder 
4929c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
49309d475de5SAlex Elder 				"rbd", "get_size",
49314157976bSAlex Elder 				&snapid, sizeof (snapid),
4932e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
493336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
49349d475de5SAlex Elder 	if (ret < 0)
49359d475de5SAlex Elder 		return ret;
493657385b51SAlex Elder 	if (ret < sizeof (size_buf))
493757385b51SAlex Elder 		return -ERANGE;
49389d475de5SAlex Elder 
4939c3545579SJosh Durgin 	if (order) {
49409d475de5SAlex Elder 		*order = size_buf.order;
4941c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4942c3545579SJosh Durgin 	}
49439d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
49449d475de5SAlex Elder 
4945c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4946c3545579SJosh Durgin 		(unsigned long long)snap_id,
49479d475de5SAlex Elder 		(unsigned long long)*snap_size);
49489d475de5SAlex Elder 
49499d475de5SAlex Elder 	return 0;
49509d475de5SAlex Elder }
49519d475de5SAlex Elder 
49529d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
49539d475de5SAlex Elder {
49549d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
49559d475de5SAlex Elder 					&rbd_dev->header.obj_order,
49569d475de5SAlex Elder 					&rbd_dev->header.image_size);
49579d475de5SAlex Elder }
49589d475de5SAlex Elder 
49591e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
49601e130199SAlex Elder {
49611e130199SAlex Elder 	void *reply_buf;
49621e130199SAlex Elder 	int ret;
49631e130199SAlex Elder 	void *p;
49641e130199SAlex Elder 
49651e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
49661e130199SAlex Elder 	if (!reply_buf)
49671e130199SAlex Elder 		return -ENOMEM;
49681e130199SAlex Elder 
4969c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
49704157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
4971e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
497236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
49731e130199SAlex Elder 	if (ret < 0)
49741e130199SAlex Elder 		goto out;
49751e130199SAlex Elder 
49761e130199SAlex Elder 	p = reply_buf;
49771e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
497857385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
497957385b51SAlex Elder 	ret = 0;
49801e130199SAlex Elder 
49811e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
49821e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
49831e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
49841e130199SAlex Elder 	} else {
49851e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
49861e130199SAlex Elder 	}
49871e130199SAlex Elder out:
49881e130199SAlex Elder 	kfree(reply_buf);
49891e130199SAlex Elder 
49901e130199SAlex Elder 	return ret;
49911e130199SAlex Elder }
49921e130199SAlex Elder 
4993b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4994b1b5402aSAlex Elder 		u64 *snap_features)
4995b1b5402aSAlex Elder {
4996b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4997b1b5402aSAlex Elder 	struct {
4998b1b5402aSAlex Elder 		__le64 features;
4999b1b5402aSAlex Elder 		__le64 incompat;
50004157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
5001d3767f0fSIlya Dryomov 	u64 unsup;
5002b1b5402aSAlex Elder 	int ret;
5003b1b5402aSAlex Elder 
5004c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
5005b1b5402aSAlex Elder 				"rbd", "get_features",
50064157976bSAlex Elder 				&snapid, sizeof (snapid),
5007e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
500836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5009b1b5402aSAlex Elder 	if (ret < 0)
5010b1b5402aSAlex Elder 		return ret;
501157385b51SAlex Elder 	if (ret < sizeof (features_buf))
501257385b51SAlex Elder 		return -ERANGE;
5013d889140cSAlex Elder 
5014d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5015d3767f0fSIlya Dryomov 	if (unsup) {
5016d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5017d3767f0fSIlya Dryomov 			 unsup);
5018b8f5c6edSAlex Elder 		return -ENXIO;
5019d3767f0fSIlya Dryomov 	}
5020d889140cSAlex Elder 
5021b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
5022b1b5402aSAlex Elder 
5023b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5024b1b5402aSAlex Elder 		(unsigned long long)snap_id,
5025b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
5026b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5027b1b5402aSAlex Elder 
5028b1b5402aSAlex Elder 	return 0;
5029b1b5402aSAlex Elder }
5030b1b5402aSAlex Elder 
5031b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5032b1b5402aSAlex Elder {
5033b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5034b1b5402aSAlex Elder 						&rbd_dev->header.features);
5035b1b5402aSAlex Elder }
5036b1b5402aSAlex Elder 
503786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
503886b00e0dSAlex Elder {
503986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
504086b00e0dSAlex Elder 	size_t size;
504186b00e0dSAlex Elder 	void *reply_buf = NULL;
504286b00e0dSAlex Elder 	__le64 snapid;
504386b00e0dSAlex Elder 	void *p;
504486b00e0dSAlex Elder 	void *end;
5045642a2537SAlex Elder 	u64 pool_id;
504686b00e0dSAlex Elder 	char *image_id;
50473b5cf2a2SAlex Elder 	u64 snap_id;
504886b00e0dSAlex Elder 	u64 overlap;
504986b00e0dSAlex Elder 	int ret;
505086b00e0dSAlex Elder 
505186b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
505286b00e0dSAlex Elder 	if (!parent_spec)
505386b00e0dSAlex Elder 		return -ENOMEM;
505486b00e0dSAlex Elder 
505586b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
505686b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
505786b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
505886b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
505986b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
506086b00e0dSAlex Elder 	if (!reply_buf) {
506186b00e0dSAlex Elder 		ret = -ENOMEM;
506286b00e0dSAlex Elder 		goto out_err;
506386b00e0dSAlex Elder 	}
506486b00e0dSAlex Elder 
50654d9b67cdSIlya Dryomov 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5066c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
506786b00e0dSAlex Elder 				"rbd", "get_parent",
50684157976bSAlex Elder 				&snapid, sizeof (snapid),
5069e2a58ee5SAlex Elder 				reply_buf, size);
507036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
507186b00e0dSAlex Elder 	if (ret < 0)
507286b00e0dSAlex Elder 		goto out_err;
507386b00e0dSAlex Elder 
507486b00e0dSAlex Elder 	p = reply_buf;
507557385b51SAlex Elder 	end = reply_buf + ret;
507657385b51SAlex Elder 	ret = -ERANGE;
5077642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
5078392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
5079392a9dadSAlex Elder 		/*
5080392a9dadSAlex Elder 		 * Either the parent never existed, or we have
5081392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
5082392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
5083392a9dadSAlex Elder 		 * layered image disappears we immediately set the
5084392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
5085392a9dadSAlex Elder 		 * requests will be treated as if the image had no
5086392a9dadSAlex Elder 		 * parent.
5087392a9dadSAlex Elder 		 */
5088392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
5089392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
5090392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
5091392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
5092392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
5093392a9dadSAlex Elder 		}
5094392a9dadSAlex Elder 
509586b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
5096392a9dadSAlex Elder 	}
509786b00e0dSAlex Elder 
50980903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
50990903e875SAlex Elder 
51000903e875SAlex Elder 	ret = -EIO;
5101642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
51029584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5103642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
510457385b51SAlex Elder 		goto out_err;
5105c0cd10dbSAlex Elder 	}
51060903e875SAlex Elder 
5107979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
510886b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
510986b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
511086b00e0dSAlex Elder 		goto out_err;
511186b00e0dSAlex Elder 	}
51123b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
511386b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
511486b00e0dSAlex Elder 
51153b5cf2a2SAlex Elder 	/*
51163b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
51173b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
51183b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
51193b5cf2a2SAlex Elder 	 */
51203b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
51213b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
51223b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
51233b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
512486b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
512586b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
5126fbba11b3SIlya Dryomov 	} else {
5127fbba11b3SIlya Dryomov 		kfree(image_id);
51283b5cf2a2SAlex Elder 	}
51293b5cf2a2SAlex Elder 
51303b5cf2a2SAlex Elder 	/*
5131cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5132cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
51333b5cf2a2SAlex Elder 	 */
51343b5cf2a2SAlex Elder 	if (!overlap) {
51353b5cf2a2SAlex Elder 		if (parent_spec) {
5136cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5137cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5138cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5139cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
514070cf49cfSAlex Elder 		} else {
5141cf32bd9cSIlya Dryomov 			/* initial probe */
5142cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
51433b5cf2a2SAlex Elder 		}
514470cf49cfSAlex Elder 	}
5145cf32bd9cSIlya Dryomov 	rbd_dev->parent_overlap = overlap;
5146cf32bd9cSIlya Dryomov 
514786b00e0dSAlex Elder out:
514886b00e0dSAlex Elder 	ret = 0;
514986b00e0dSAlex Elder out_err:
515086b00e0dSAlex Elder 	kfree(reply_buf);
515186b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
515286b00e0dSAlex Elder 
515386b00e0dSAlex Elder 	return ret;
515486b00e0dSAlex Elder }
515586b00e0dSAlex Elder 
5156cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5157cc070d59SAlex Elder {
5158cc070d59SAlex Elder 	struct {
5159cc070d59SAlex Elder 		__le64 stripe_unit;
5160cc070d59SAlex Elder 		__le64 stripe_count;
5161cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5162cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5163cc070d59SAlex Elder 	void *p;
5164cc070d59SAlex Elder 	u64 obj_size;
5165cc070d59SAlex Elder 	u64 stripe_unit;
5166cc070d59SAlex Elder 	u64 stripe_count;
5167cc070d59SAlex Elder 	int ret;
5168cc070d59SAlex Elder 
5169c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
5170cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
5171e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
5172cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5173cc070d59SAlex Elder 	if (ret < 0)
5174cc070d59SAlex Elder 		return ret;
5175cc070d59SAlex Elder 	if (ret < size)
5176cc070d59SAlex Elder 		return -ERANGE;
5177cc070d59SAlex Elder 
5178cc070d59SAlex Elder 	/*
5179cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
5180cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
5181cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
5182cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
5183cc070d59SAlex Elder 	 */
5184cc070d59SAlex Elder 	ret = -EINVAL;
5185cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
5186cc070d59SAlex Elder 	p = &striping_info_buf;
5187cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
5188cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
5189cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
5190cc070d59SAlex Elder 				"(got %llu want %llu)",
5191cc070d59SAlex Elder 				stripe_unit, obj_size);
5192cc070d59SAlex Elder 		return -EINVAL;
5193cc070d59SAlex Elder 	}
5194cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
5195cc070d59SAlex Elder 	if (stripe_count != 1) {
5196cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
5197cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
5198cc070d59SAlex Elder 		return -EINVAL;
5199cc070d59SAlex Elder 	}
5200500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
5201500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
5202cc070d59SAlex Elder 
5203cc070d59SAlex Elder 	return 0;
5204cc070d59SAlex Elder }
5205cc070d59SAlex Elder 
52069e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
52079e15b77dSAlex Elder {
52089e15b77dSAlex Elder 	size_t image_id_size;
52099e15b77dSAlex Elder 	char *image_id;
52109e15b77dSAlex Elder 	void *p;
52119e15b77dSAlex Elder 	void *end;
52129e15b77dSAlex Elder 	size_t size;
52139e15b77dSAlex Elder 	void *reply_buf = NULL;
52149e15b77dSAlex Elder 	size_t len = 0;
52159e15b77dSAlex Elder 	char *image_name = NULL;
52169e15b77dSAlex Elder 	int ret;
52179e15b77dSAlex Elder 
52189e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
52199e15b77dSAlex Elder 
522069e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
522169e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
52229e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
52239e15b77dSAlex Elder 	if (!image_id)
52249e15b77dSAlex Elder 		return NULL;
52259e15b77dSAlex Elder 
52269e15b77dSAlex Elder 	p = image_id;
52274157976bSAlex Elder 	end = image_id + image_id_size;
522869e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
52299e15b77dSAlex Elder 
52309e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
52319e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
52329e15b77dSAlex Elder 	if (!reply_buf)
52339e15b77dSAlex Elder 		goto out;
52349e15b77dSAlex Elder 
523536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
52369e15b77dSAlex Elder 				"rbd", "dir_get_name",
52379e15b77dSAlex Elder 				image_id, image_id_size,
5238e2a58ee5SAlex Elder 				reply_buf, size);
52399e15b77dSAlex Elder 	if (ret < 0)
52409e15b77dSAlex Elder 		goto out;
52419e15b77dSAlex Elder 	p = reply_buf;
5242f40eb349SAlex Elder 	end = reply_buf + ret;
5243f40eb349SAlex Elder 
52449e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
52459e15b77dSAlex Elder 	if (IS_ERR(image_name))
52469e15b77dSAlex Elder 		image_name = NULL;
52479e15b77dSAlex Elder 	else
52489e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
52499e15b77dSAlex Elder out:
52509e15b77dSAlex Elder 	kfree(reply_buf);
52519e15b77dSAlex Elder 	kfree(image_id);
52529e15b77dSAlex Elder 
52539e15b77dSAlex Elder 	return image_name;
52549e15b77dSAlex Elder }
52559e15b77dSAlex Elder 
52562ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52572ad3d716SAlex Elder {
52582ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
52592ad3d716SAlex Elder 	const char *snap_name;
52602ad3d716SAlex Elder 	u32 which = 0;
52612ad3d716SAlex Elder 
52622ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
52632ad3d716SAlex Elder 
52642ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
52652ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
52662ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
52672ad3d716SAlex Elder 			return snapc->snaps[which];
52682ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
52692ad3d716SAlex Elder 		which++;
52702ad3d716SAlex Elder 	}
52712ad3d716SAlex Elder 	return CEPH_NOSNAP;
52722ad3d716SAlex Elder }
52732ad3d716SAlex Elder 
52742ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
52752ad3d716SAlex Elder {
52762ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
52772ad3d716SAlex Elder 	u32 which;
52782ad3d716SAlex Elder 	bool found = false;
52792ad3d716SAlex Elder 	u64 snap_id;
52802ad3d716SAlex Elder 
52812ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
52822ad3d716SAlex Elder 		const char *snap_name;
52832ad3d716SAlex Elder 
52842ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
52852ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5286efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5287efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5288efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5289efadc98aSJosh Durgin 				continue;
5290efadc98aSJosh Durgin 			else
52912ad3d716SAlex Elder 				break;
5292efadc98aSJosh Durgin 		}
52932ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
52942ad3d716SAlex Elder 		kfree(snap_name);
52952ad3d716SAlex Elder 	}
52962ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
52972ad3d716SAlex Elder }
52982ad3d716SAlex Elder 
52992ad3d716SAlex Elder /*
53002ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
53012ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
53022ad3d716SAlex Elder  */
53032ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
53042ad3d716SAlex Elder {
53052ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
53062ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
53072ad3d716SAlex Elder 
53082ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
53092ad3d716SAlex Elder }
53102ad3d716SAlex Elder 
53119e15b77dSAlex Elder /*
531204077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
53139e15b77dSAlex Elder  */
531404077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
531504077599SIlya Dryomov {
531604077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
531704077599SIlya Dryomov 
531804077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
531904077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
532004077599SIlya Dryomov 	rbd_assert(spec->snap_name);
532104077599SIlya Dryomov 
532204077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
532304077599SIlya Dryomov 		u64 snap_id;
532404077599SIlya Dryomov 
532504077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
532604077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
532704077599SIlya Dryomov 			return -ENOENT;
532804077599SIlya Dryomov 
532904077599SIlya Dryomov 		spec->snap_id = snap_id;
533004077599SIlya Dryomov 	} else {
533104077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
533204077599SIlya Dryomov 	}
533304077599SIlya Dryomov 
533404077599SIlya Dryomov 	return 0;
533504077599SIlya Dryomov }
533604077599SIlya Dryomov 
533704077599SIlya Dryomov /*
533804077599SIlya Dryomov  * A parent image will have all ids but none of the names.
533904077599SIlya Dryomov  *
534004077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
534104077599SIlya Dryomov  * can't figure out the name for an image id.
534204077599SIlya Dryomov  */
534304077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
53449e15b77dSAlex Elder {
53452e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
53462e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
53472e9f7f1cSAlex Elder 	const char *pool_name;
53482e9f7f1cSAlex Elder 	const char *image_name;
53492e9f7f1cSAlex Elder 	const char *snap_name;
53509e15b77dSAlex Elder 	int ret;
53519e15b77dSAlex Elder 
535204077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
535304077599SIlya Dryomov 	rbd_assert(spec->image_id);
535404077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
53559e15b77dSAlex Elder 
53562e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
53579e15b77dSAlex Elder 
53582e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
53592e9f7f1cSAlex Elder 	if (!pool_name) {
53602e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5361935dc89fSAlex Elder 		return -EIO;
5362935dc89fSAlex Elder 	}
53632e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
53642e9f7f1cSAlex Elder 	if (!pool_name)
53659e15b77dSAlex Elder 		return -ENOMEM;
53669e15b77dSAlex Elder 
53679e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
53689e15b77dSAlex Elder 
53692e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
53702e9f7f1cSAlex Elder 	if (!image_name)
537106ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
53729e15b77dSAlex Elder 
537304077599SIlya Dryomov 	/* Fetch the snapshot name */
53749e15b77dSAlex Elder 
53752e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5376da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5377da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
53789e15b77dSAlex Elder 		goto out_err;
53792e9f7f1cSAlex Elder 	}
53802e9f7f1cSAlex Elder 
53812e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
53822e9f7f1cSAlex Elder 	spec->image_name = image_name;
53832e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
53849e15b77dSAlex Elder 
53859e15b77dSAlex Elder 	return 0;
538604077599SIlya Dryomov 
53879e15b77dSAlex Elder out_err:
53882e9f7f1cSAlex Elder 	kfree(image_name);
53892e9f7f1cSAlex Elder 	kfree(pool_name);
53909e15b77dSAlex Elder 	return ret;
53919e15b77dSAlex Elder }
53929e15b77dSAlex Elder 
5393cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
539435d489f9SAlex Elder {
539535d489f9SAlex Elder 	size_t size;
539635d489f9SAlex Elder 	int ret;
539735d489f9SAlex Elder 	void *reply_buf;
539835d489f9SAlex Elder 	void *p;
539935d489f9SAlex Elder 	void *end;
540035d489f9SAlex Elder 	u64 seq;
540135d489f9SAlex Elder 	u32 snap_count;
540235d489f9SAlex Elder 	struct ceph_snap_context *snapc;
540335d489f9SAlex Elder 	u32 i;
540435d489f9SAlex Elder 
540535d489f9SAlex Elder 	/*
540635d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
540735d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
540835d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
540935d489f9SAlex Elder 	 * prepared to receive.
541035d489f9SAlex Elder 	 */
541135d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
541235d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
541335d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
541435d489f9SAlex Elder 	if (!reply_buf)
541535d489f9SAlex Elder 		return -ENOMEM;
541635d489f9SAlex Elder 
5417c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
54184157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
5419e2a58ee5SAlex Elder 				reply_buf, size);
542036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
542135d489f9SAlex Elder 	if (ret < 0)
542235d489f9SAlex Elder 		goto out;
542335d489f9SAlex Elder 
542435d489f9SAlex Elder 	p = reply_buf;
542557385b51SAlex Elder 	end = reply_buf + ret;
542657385b51SAlex Elder 	ret = -ERANGE;
542735d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
542835d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
542935d489f9SAlex Elder 
543035d489f9SAlex Elder 	/*
543135d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
543235d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
543335d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
543435d489f9SAlex Elder 	 * allocate is representable in a size_t.
543535d489f9SAlex Elder 	 */
543635d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
543735d489f9SAlex Elder 				 / sizeof (u64)) {
543835d489f9SAlex Elder 		ret = -EINVAL;
543935d489f9SAlex Elder 		goto out;
544035d489f9SAlex Elder 	}
544135d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
544235d489f9SAlex Elder 		goto out;
5443468521c1SAlex Elder 	ret = 0;
544435d489f9SAlex Elder 
5445812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
544635d489f9SAlex Elder 	if (!snapc) {
544735d489f9SAlex Elder 		ret = -ENOMEM;
544835d489f9SAlex Elder 		goto out;
544935d489f9SAlex Elder 	}
545035d489f9SAlex Elder 	snapc->seq = seq;
545135d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
545235d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
545335d489f9SAlex Elder 
545449ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
545535d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
545635d489f9SAlex Elder 
545735d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
545835d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
545935d489f9SAlex Elder out:
546035d489f9SAlex Elder 	kfree(reply_buf);
546135d489f9SAlex Elder 
546257385b51SAlex Elder 	return ret;
546335d489f9SAlex Elder }
546435d489f9SAlex Elder 
546554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
546654cac61fSAlex Elder 					u64 snap_id)
5467b8b1e2dbSAlex Elder {
5468b8b1e2dbSAlex Elder 	size_t size;
5469b8b1e2dbSAlex Elder 	void *reply_buf;
547054cac61fSAlex Elder 	__le64 snapid;
5471b8b1e2dbSAlex Elder 	int ret;
5472b8b1e2dbSAlex Elder 	void *p;
5473b8b1e2dbSAlex Elder 	void *end;
5474b8b1e2dbSAlex Elder 	char *snap_name;
5475b8b1e2dbSAlex Elder 
5476b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5477b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5478b8b1e2dbSAlex Elder 	if (!reply_buf)
5479b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5480b8b1e2dbSAlex Elder 
548154cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5482c41d13a3SIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
5483b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
548454cac61fSAlex Elder 				&snapid, sizeof (snapid),
5485e2a58ee5SAlex Elder 				reply_buf, size);
548636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5487f40eb349SAlex Elder 	if (ret < 0) {
5488f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5489b8b1e2dbSAlex Elder 		goto out;
5490f40eb349SAlex Elder 	}
5491b8b1e2dbSAlex Elder 
5492b8b1e2dbSAlex Elder 	p = reply_buf;
5493f40eb349SAlex Elder 	end = reply_buf + ret;
5494e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5495f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5496b8b1e2dbSAlex Elder 		goto out;
5497f40eb349SAlex Elder 
5498b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
549954cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5500b8b1e2dbSAlex Elder out:
5501b8b1e2dbSAlex Elder 	kfree(reply_buf);
5502b8b1e2dbSAlex Elder 
5503f40eb349SAlex Elder 	return snap_name;
5504b8b1e2dbSAlex Elder }
5505b8b1e2dbSAlex Elder 
55062df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5507117973fbSAlex Elder {
55082df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5509117973fbSAlex Elder 	int ret;
5510117973fbSAlex Elder 
55111617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
55121617e40cSJosh Durgin 	if (ret)
5513cfbf6377SAlex Elder 		return ret;
55141617e40cSJosh Durgin 
55152df3fac7SAlex Elder 	if (first_time) {
55162df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
55172df3fac7SAlex Elder 		if (ret)
5518cfbf6377SAlex Elder 			return ret;
55192df3fac7SAlex Elder 	}
55202df3fac7SAlex Elder 
5521cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5522d194cd1dSIlya Dryomov 	if (ret && first_time) {
5523d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5524d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5525d194cd1dSIlya Dryomov 	}
5526117973fbSAlex Elder 
5527117973fbSAlex Elder 	return ret;
5528117973fbSAlex Elder }
5529117973fbSAlex Elder 
5530a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5531a720ae09SIlya Dryomov {
5532a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5533a720ae09SIlya Dryomov 
5534a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5535a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5536a720ae09SIlya Dryomov 
5537a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5538a720ae09SIlya Dryomov }
5539a720ae09SIlya Dryomov 
/*
 * Advances *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none).  Returns the length of the token found there, i.e. the
 * number of consecutive non-white space characters; 0 means no token.
 * Note that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, spaces);	/* Find start of token */
	*buf = start;

	return strcspn(start, spaces);		/* Return token length */
}
5558e28fff26SAlex Elder 
5559e28fff26SAlex Elder /*
5560ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5561ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5562ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5563ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5564ea3352f4SAlex Elder  *
5565ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5566ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5567ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5568ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5569ea3352f4SAlex Elder  *
5570ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5571ea3352f4SAlex Elder  * the end of the found token.
5572ea3352f4SAlex Elder  *
5573ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5574ea3352f4SAlex Elder  */
5575ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5576ea3352f4SAlex Elder {
5577ea3352f4SAlex Elder 	char *dup;
5578ea3352f4SAlex Elder 	size_t len;
5579ea3352f4SAlex Elder 
5580ea3352f4SAlex Elder 	len = next_token(buf);
55814caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5582ea3352f4SAlex Elder 	if (!dup)
5583ea3352f4SAlex Elder 		return NULL;
5584ea3352f4SAlex Elder 	*(dup + len) = '\0';
5585ea3352f4SAlex Elder 	*buf += len;
5586ea3352f4SAlex Elder 
5587ea3352f4SAlex Elder 	if (lenp)
5588ea3352f4SAlex Elder 		*lenp = len;
5589ea3352f4SAlex Elder 
5590ea3352f4SAlex Elder 	return dup;
5591ea3352f4SAlex Elder }
5592ea3352f4SAlex Elder 
5593ea3352f4SAlex Elder /*
5594859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5595859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5596859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5597859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5598d22f76e7SAlex Elder  *
5599859c31dfSAlex Elder  * The information extracted from these options is recorded in
5600859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5601859c31dfSAlex Elder  * structures:
5602859c31dfSAlex Elder  *  ceph_opts
5603859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5604859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5605859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5606859c31dfSAlex Elder  *  opts
5607859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5608859c31dfSAlex Elder  *	this function; caller must release with kfree().
5609859c31dfSAlex Elder  *  rbd_spec
5610859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5611859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5612859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5613859c31dfSAlex Elder  *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
5633a725f65eSAlex Elder  */
5634859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5635dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5636859c31dfSAlex Elder 				struct rbd_options **opts,
5637859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5638a725f65eSAlex Elder {
5639e28fff26SAlex Elder 	size_t len;
5640859c31dfSAlex Elder 	char *options;
56410ddebc0cSAlex Elder 	const char *mon_addrs;
5642ecb4dc22SAlex Elder 	char *snap_name;
56430ddebc0cSAlex Elder 	size_t mon_addrs_size;
5644859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
56454e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5646859c31dfSAlex Elder 	struct ceph_options *copts;
5647dc79b113SAlex Elder 	int ret;
5648e28fff26SAlex Elder 
5649e28fff26SAlex Elder 	/* The first four tokens are required */
5650e28fff26SAlex Elder 
56517ef3214aSAlex Elder 	len = next_token(&buf);
56524fb5d671SAlex Elder 	if (!len) {
56534fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
56544fb5d671SAlex Elder 		return -EINVAL;
56554fb5d671SAlex Elder 	}
56560ddebc0cSAlex Elder 	mon_addrs = buf;
5657f28e565aSAlex Elder 	mon_addrs_size = len + 1;
56587ef3214aSAlex Elder 	buf += len;
5659a725f65eSAlex Elder 
5660dc79b113SAlex Elder 	ret = -EINVAL;
5661f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5662f28e565aSAlex Elder 	if (!options)
5663dc79b113SAlex Elder 		return -ENOMEM;
56644fb5d671SAlex Elder 	if (!*options) {
56654fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
56664fb5d671SAlex Elder 		goto out_err;
56674fb5d671SAlex Elder 	}
5668a725f65eSAlex Elder 
5669859c31dfSAlex Elder 	spec = rbd_spec_alloc();
5670859c31dfSAlex Elder 	if (!spec)
5671f28e565aSAlex Elder 		goto out_mem;
5672859c31dfSAlex Elder 
5673859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
5674859c31dfSAlex Elder 	if (!spec->pool_name)
5675859c31dfSAlex Elder 		goto out_mem;
56764fb5d671SAlex Elder 	if (!*spec->pool_name) {
56774fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
56784fb5d671SAlex Elder 		goto out_err;
56794fb5d671SAlex Elder 	}
5680e28fff26SAlex Elder 
568169e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
5682859c31dfSAlex Elder 	if (!spec->image_name)
5683f28e565aSAlex Elder 		goto out_mem;
56844fb5d671SAlex Elder 	if (!*spec->image_name) {
56854fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
56864fb5d671SAlex Elder 		goto out_err;
56874fb5d671SAlex Elder 	}
5688e28fff26SAlex Elder 
5689f28e565aSAlex Elder 	/*
5690f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5691f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5692f28e565aSAlex Elder 	 */
56933feeb894SAlex Elder 	len = next_token(&buf);
5694820a5f3eSAlex Elder 	if (!len) {
56953feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
56963feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5697f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5698dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5699f28e565aSAlex Elder 		goto out_err;
5700849b4260SAlex Elder 	}
5701ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5702ecb4dc22SAlex Elder 	if (!snap_name)
5703f28e565aSAlex Elder 		goto out_mem;
5704ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5705ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
5706e5c35534SAlex Elder 
57070ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5708e28fff26SAlex Elder 
57094e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
57104e9afebaSAlex Elder 	if (!rbd_opts)
57114e9afebaSAlex Elder 		goto out_mem;
57124e9afebaSAlex Elder 
57134e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
5714b5584180SIlya Dryomov 	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
5715d22f76e7SAlex Elder 
5716859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
57170ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
57184e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
5719859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5720859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5721dc79b113SAlex Elder 		goto out_err;
5722dc79b113SAlex Elder 	}
5723859c31dfSAlex Elder 	kfree(options);
5724859c31dfSAlex Elder 
5725859c31dfSAlex Elder 	*ceph_opts = copts;
57264e9afebaSAlex Elder 	*opts = rbd_opts;
5727859c31dfSAlex Elder 	*rbd_spec = spec;
57280ddebc0cSAlex Elder 
5729dc79b113SAlex Elder 	return 0;
5730f28e565aSAlex Elder out_mem:
5731dc79b113SAlex Elder 	ret = -ENOMEM;
5732d22f76e7SAlex Elder out_err:
5733859c31dfSAlex Elder 	kfree(rbd_opts);
5734859c31dfSAlex Elder 	rbd_spec_put(spec);
5735f28e565aSAlex Elder 	kfree(options);
5736d22f76e7SAlex Elder 
5737dc79b113SAlex Elder 	return ret;
5738a725f65eSAlex Elder }
5739a725f65eSAlex Elder 
5740589d30e0SAlex Elder /*
574130ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
574230ba1f02SIlya Dryomov  */
574330ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
574430ba1f02SIlya Dryomov {
5745a319bf56SIlya Dryomov 	struct ceph_options *opts = rbdc->client->options;
574630ba1f02SIlya Dryomov 	u64 newest_epoch;
574730ba1f02SIlya Dryomov 	int tries = 0;
574830ba1f02SIlya Dryomov 	int ret;
574930ba1f02SIlya Dryomov 
575030ba1f02SIlya Dryomov again:
575130ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
575230ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
5753d0b19705SIlya Dryomov 		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
575430ba1f02SIlya Dryomov 					    &newest_epoch);
575530ba1f02SIlya Dryomov 		if (ret < 0)
575630ba1f02SIlya Dryomov 			return ret;
575730ba1f02SIlya Dryomov 
575830ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
57597cca78c9SIlya Dryomov 			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
576030ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5761a319bf56SIlya Dryomov 						     newest_epoch,
5762a319bf56SIlya Dryomov 						     opts->mount_timeout);
576330ba1f02SIlya Dryomov 			goto again;
576430ba1f02SIlya Dryomov 		} else {
576530ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
576630ba1f02SIlya Dryomov 			return -ENOENT;
576730ba1f02SIlya Dryomov 		}
576830ba1f02SIlya Dryomov 	}
576930ba1f02SIlya Dryomov 
577030ba1f02SIlya Dryomov 	return ret;
577130ba1f02SIlya Dryomov }
577230ba1f02SIlya Dryomov 
577330ba1f02SIlya Dryomov /*
5774589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5775589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5776589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5777589d30e0SAlex Elder  *
5778589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5779589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5780589d30e0SAlex Elder  * with the supplied name.
5781589d30e0SAlex Elder  *
5782589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5783589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5784589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5785589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5786589d30e0SAlex Elder  */
5787589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5788589d30e0SAlex Elder {
5789589d30e0SAlex Elder 	int ret;
5790589d30e0SAlex Elder 	size_t size;
5791589d30e0SAlex Elder 	char *object_name;
5792589d30e0SAlex Elder 	void *response;
5793c0fba368SAlex Elder 	char *image_id;
57942f82ee54SAlex Elder 
5795589d30e0SAlex Elder 	/*
57962c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
57972c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5798c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5799c0fba368SAlex Elder 	 * do still need to set the image format though.
58002c0d0a10SAlex Elder 	 */
5801c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5802c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5803c0fba368SAlex Elder 
58042c0d0a10SAlex Elder 		return 0;
5805c0fba368SAlex Elder 	}
58062c0d0a10SAlex Elder 
58072c0d0a10SAlex Elder 	/*
5808589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5809589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5810589d30e0SAlex Elder 	 */
581169e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
5812589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
5813589d30e0SAlex Elder 	if (!object_name)
5814589d30e0SAlex Elder 		return -ENOMEM;
58150d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
5816589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
5817589d30e0SAlex Elder 
5818589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5819589d30e0SAlex Elder 
5820589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5821589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5822589d30e0SAlex Elder 	if (!response) {
5823589d30e0SAlex Elder 		ret = -ENOMEM;
5824589d30e0SAlex Elder 		goto out;
5825589d30e0SAlex Elder 	}
5826589d30e0SAlex Elder 
5827c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5828c0fba368SAlex Elder 
582936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
58304157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
5831e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
583236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5833c0fba368SAlex Elder 	if (ret == -ENOENT) {
5834c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5835c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5836c0fba368SAlex Elder 		if (!ret)
5837c0fba368SAlex Elder 			rbd_dev->image_format = 1;
58387dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5839c0fba368SAlex Elder 		void *p = response;
5840589d30e0SAlex Elder 
5841c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5842979ed480SAlex Elder 						NULL, GFP_NOIO);
5843461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5844c0fba368SAlex Elder 		if (!ret)
5845c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5846c0fba368SAlex Elder 	}
5847c0fba368SAlex Elder 
5848c0fba368SAlex Elder 	if (!ret) {
5849c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5850c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5851589d30e0SAlex Elder 	}
5852589d30e0SAlex Elder out:
5853589d30e0SAlex Elder 	kfree(response);
5854589d30e0SAlex Elder 	kfree(object_name);
5855589d30e0SAlex Elder 
5856589d30e0SAlex Elder 	return ret;
5857589d30e0SAlex Elder }
5858589d30e0SAlex Elder 
58593abef3b3SAlex Elder /*
58603abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
58613abef3b3SAlex Elder  * call.
58623abef3b3SAlex Elder  */
58636fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
58646fd48b3bSAlex Elder {
58656fd48b3bSAlex Elder 	struct rbd_image_header	*header;
58666fd48b3bSAlex Elder 
5867a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
58686fd48b3bSAlex Elder 
58696fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
58706fd48b3bSAlex Elder 
58716fd48b3bSAlex Elder 	header = &rbd_dev->header;
5872812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
58736fd48b3bSAlex Elder 	kfree(header->snap_sizes);
58746fd48b3bSAlex Elder 	kfree(header->snap_names);
58756fd48b3bSAlex Elder 	kfree(header->object_prefix);
58766fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
58776fd48b3bSAlex Elder }
58786fd48b3bSAlex Elder 
58792df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5880a30b71b9SAlex Elder {
5881a30b71b9SAlex Elder 	int ret;
5882a30b71b9SAlex Elder 
58831e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
588457385b51SAlex Elder 	if (ret)
58851e130199SAlex Elder 		goto out_err;
5886b1b5402aSAlex Elder 
58872df3fac7SAlex Elder 	/*
58882df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
58892df3fac7SAlex Elder 	 * features are assumed to never change.
58902df3fac7SAlex Elder 	 */
5891b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
589257385b51SAlex Elder 	if (ret)
5893b1b5402aSAlex Elder 		goto out_err;
589435d489f9SAlex Elder 
5895cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5896cc070d59SAlex Elder 
5897cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5898cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5899cc070d59SAlex Elder 		if (ret < 0)
5900cc070d59SAlex Elder 			goto out_err;
5901cc070d59SAlex Elder 	}
59022df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
5903a30b71b9SAlex Elder 
590435152979SAlex Elder 	return 0;
59059d475de5SAlex Elder out_err:
5906642a2537SAlex Elder 	rbd_dev->header.features = 0;
59071e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
59081e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
59099d475de5SAlex Elder 
59109d475de5SAlex Elder 	return ret;
5911a30b71b9SAlex Elder }
5912a30b71b9SAlex Elder 
59136d69bb53SIlya Dryomov /*
59146d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
59156d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
59166d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
59176d69bb53SIlya Dryomov  */
59186d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
591983a06263SAlex Elder {
59202f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5921124afba2SAlex Elder 	int ret;
5922124afba2SAlex Elder 
5923124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5924124afba2SAlex Elder 		return 0;
5925124afba2SAlex Elder 
59266d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
59276d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
59286d69bb53SIlya Dryomov 		ret = -EINVAL;
59296d69bb53SIlya Dryomov 		goto out_err;
59306d69bb53SIlya Dryomov 	}
59316d69bb53SIlya Dryomov 
59321643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
59331f2c6651SIlya Dryomov 	if (!parent) {
5934124afba2SAlex Elder 		ret = -ENOMEM;
5935124afba2SAlex Elder 		goto out_err;
59361f2c6651SIlya Dryomov 	}
59371f2c6651SIlya Dryomov 
59381f2c6651SIlya Dryomov 	/*
59391f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
59401f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
59411f2c6651SIlya Dryomov 	 */
59421f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
59431f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5944124afba2SAlex Elder 
59456d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5946124afba2SAlex Elder 	if (ret < 0)
5947124afba2SAlex Elder 		goto out_err;
59481f2c6651SIlya Dryomov 
5949124afba2SAlex Elder 	rbd_dev->parent = parent;
5950a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5951124afba2SAlex Elder 	return 0;
5952124afba2SAlex Elder 
59531f2c6651SIlya Dryomov out_err:
59541f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
59551f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5956124afba2SAlex Elder 	return ret;
5957124afba2SAlex Elder }
5958124afba2SAlex Elder 
5959811c6688SIlya Dryomov /*
5960811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5961811c6688SIlya Dryomov  * upon return.
5962811c6688SIlya Dryomov  */
5963200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5964124afba2SAlex Elder {
596583a06263SAlex Elder 	int ret;
596683a06263SAlex Elder 
59679b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
596883a06263SAlex Elder 
59699b60e70bSIlya Dryomov 	if (!single_major) {
597083a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
597183a06263SAlex Elder 		if (ret < 0)
59721643dfa4SIlya Dryomov 			goto err_out_unlock;
59739b60e70bSIlya Dryomov 
597483a06263SAlex Elder 		rbd_dev->major = ret;
5975dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
59769b60e70bSIlya Dryomov 	} else {
59779b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
59789b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
59799b60e70bSIlya Dryomov 	}
598083a06263SAlex Elder 
598183a06263SAlex Elder 	/* Set up the blkdev mapping. */
598283a06263SAlex Elder 
598383a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
598483a06263SAlex Elder 	if (ret)
598583a06263SAlex Elder 		goto err_out_blkdev;
598683a06263SAlex Elder 
5987f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
598883a06263SAlex Elder 	if (ret)
598983a06263SAlex Elder 		goto err_out_disk;
5990bc1ecc65SIlya Dryomov 
5991f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
599222001f61SJosh Durgin 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5993f35a4deeSAlex Elder 
5994dd5ac32dSIlya Dryomov 	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5995dd5ac32dSIlya Dryomov 	ret = device_add(&rbd_dev->dev);
5996f35a4deeSAlex Elder 	if (ret)
5997f5ee37bdSIlya Dryomov 		goto err_out_mapping;
599883a06263SAlex Elder 
599983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
600083a06263SAlex Elder 
6001129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6002811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
600383a06263SAlex Elder 
60041643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
60051643dfa4SIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
60061643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
60071643dfa4SIlya Dryomov 
6008811c6688SIlya Dryomov 	add_disk(rbd_dev->disk);
600983a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
601083a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
601183a06263SAlex Elder 
601283a06263SAlex Elder 	return ret;
60132f82ee54SAlex Elder 
6014f35a4deeSAlex Elder err_out_mapping:
6015f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
601683a06263SAlex Elder err_out_disk:
601783a06263SAlex Elder 	rbd_free_disk(rbd_dev);
601883a06263SAlex Elder err_out_blkdev:
60199b60e70bSIlya Dryomov 	if (!single_major)
602083a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6021811c6688SIlya Dryomov err_out_unlock:
6022811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
602383a06263SAlex Elder 	return ret;
602483a06263SAlex Elder }
602583a06263SAlex Elder 
6026332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6027332bb12dSAlex Elder {
6028332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
6029c41d13a3SIlya Dryomov 	int ret;
6030332bb12dSAlex Elder 
6031332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
6032332bb12dSAlex Elder 
6033332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6034332bb12dSAlex Elder 
60357627151eSYan, Zheng 	rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
6036332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
6037c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6038332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
6039332bb12dSAlex Elder 	else
6040c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6041332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
6042c41d13a3SIlya Dryomov 
6043c41d13a3SIlya Dryomov 	return ret;
6044332bb12dSAlex Elder }
6045332bb12dSAlex Elder 
6046200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6047200a6a8bSAlex Elder {
60486fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
60496fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
60506fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
60516fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
60526fd48b3bSAlex Elder 
6053200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
6054200a6a8bSAlex Elder }
6055200a6a8bSAlex Elder 
6056a30b71b9SAlex Elder /*
6057a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
60581f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
60591f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
60601f3ef788SAlex Elder  * object to get detailed information about the rbd image.
6061a30b71b9SAlex Elder  */
60626d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6063a30b71b9SAlex Elder {
6064a30b71b9SAlex Elder 	int ret;
6065a30b71b9SAlex Elder 
6066a30b71b9SAlex Elder 	/*
60673abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
60683abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
60693abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
60703abef3b3SAlex Elder 	 * will be set to either 1 or 2.
6071a30b71b9SAlex Elder 	 */
6072a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
6073a30b71b9SAlex Elder 	if (ret)
6074c0fba368SAlex Elder 		return ret;
6075c0fba368SAlex Elder 
6076332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
6077332bb12dSAlex Elder 	if (ret)
6078332bb12dSAlex Elder 		goto err_out_format;
6079332bb12dSAlex Elder 
60806d69bb53SIlya Dryomov 	if (!depth) {
608199d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
60821fe48023SIlya Dryomov 		if (ret) {
60831fe48023SIlya Dryomov 			if (ret == -ENOENT)
60841fe48023SIlya Dryomov 				pr_info("image %s/%s does not exist\n",
60851fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
60861fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
6087c41d13a3SIlya Dryomov 			goto err_out_format;
60881f3ef788SAlex Elder 		}
60891fe48023SIlya Dryomov 	}
6090b644de2bSAlex Elder 
6091a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
60925655c4d9SAlex Elder 	if (ret)
6093b644de2bSAlex Elder 		goto err_out_watch;
6094a30b71b9SAlex Elder 
609504077599SIlya Dryomov 	/*
609604077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
609704077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
609804077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
609904077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
610004077599SIlya Dryomov 	 */
61016d69bb53SIlya Dryomov 	if (!depth)
610204077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
610304077599SIlya Dryomov 	else
610404077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
61051fe48023SIlya Dryomov 	if (ret) {
61061fe48023SIlya Dryomov 		if (ret == -ENOENT)
61071fe48023SIlya Dryomov 			pr_info("snap %s/%s@%s does not exist\n",
61081fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
61091fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
61101fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
611133dca39fSAlex Elder 		goto err_out_probe;
61121fe48023SIlya Dryomov 	}
61139bb81c9bSAlex Elder 
6114e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6115e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
6116e8f59b59SIlya Dryomov 		if (ret)
6117e8f59b59SIlya Dryomov 			goto err_out_probe;
6118e8f59b59SIlya Dryomov 
6119e8f59b59SIlya Dryomov 		/*
6120e8f59b59SIlya Dryomov 		 * Need to warn users if this image is the one being
6121e8f59b59SIlya Dryomov 		 * mapped and has a parent.
6122e8f59b59SIlya Dryomov 		 */
61236d69bb53SIlya Dryomov 		if (!depth && rbd_dev->parent_spec)
6124e8f59b59SIlya Dryomov 			rbd_warn(rbd_dev,
6125e8f59b59SIlya Dryomov 				 "WARNING: kernel layering is EXPERIMENTAL!");
6126e8f59b59SIlya Dryomov 	}
6127e8f59b59SIlya Dryomov 
61286d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
612930d60ba2SAlex Elder 	if (ret)
613030d60ba2SAlex Elder 		goto err_out_probe;
613183a06263SAlex Elder 
613230d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6133c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
613430d60ba2SAlex Elder 	return 0;
6135e8f59b59SIlya Dryomov 
61366fd48b3bSAlex Elder err_out_probe:
61376fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6138b644de2bSAlex Elder err_out_watch:
61396d69bb53SIlya Dryomov 	if (!depth)
614099d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6141332bb12dSAlex Elder err_out_format:
6142332bb12dSAlex Elder 	rbd_dev->image_format = 0;
61435655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
61445655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
61455655c4d9SAlex Elder 	return ret;
614683a06263SAlex Elder }
614783a06263SAlex Elder 
61489b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
614959c2be1eSYehuda Sadeh 			  const char *buf,
615059c2be1eSYehuda Sadeh 			  size_t count)
6151602adf40SYehuda Sadeh {
6152cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6153dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
61544e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6155859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
61569d3997fdSAlex Elder 	struct rbd_client *rbdc;
615751344a38SAlex Elder 	bool read_only;
6158b51c83c2SIlya Dryomov 	int rc;
6159602adf40SYehuda Sadeh 
6160602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6161602adf40SYehuda Sadeh 		return -ENODEV;
6162602adf40SYehuda Sadeh 
6163a725f65eSAlex Elder 	/* parse add command */
6164859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6165dc79b113SAlex Elder 	if (rc < 0)
6166dd5ac32dSIlya Dryomov 		goto out;
6167a725f65eSAlex Elder 
61689d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
61699d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
61709d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
61710ddebc0cSAlex Elder 		goto err_out_args;
61729d3997fdSAlex Elder 	}
6173602adf40SYehuda Sadeh 
6174602adf40SYehuda Sadeh 	/* pick the pool */
617530ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
61761fe48023SIlya Dryomov 	if (rc < 0) {
61771fe48023SIlya Dryomov 		if (rc == -ENOENT)
61781fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6179602adf40SYehuda Sadeh 		goto err_out_client;
61801fe48023SIlya Dryomov 	}
6181859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6182859c31dfSAlex Elder 
6183d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6184b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6185b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6186bd4ba655SAlex Elder 		goto err_out_client;
6187b51c83c2SIlya Dryomov 	}
6188c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6189c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6190d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6191602adf40SYehuda Sadeh 
6192811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
61936d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
6194a30b71b9SAlex Elder 	if (rc < 0)
6195c53d5893SAlex Elder 		goto err_out_rbd_dev;
619605fd6f6fSAlex Elder 
61977ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
61987ce4eef7SAlex Elder 
6199d147543dSIlya Dryomov 	read_only = rbd_dev->opts->read_only;
62007ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
62017ce4eef7SAlex Elder 		read_only = true;
62027ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
62037ce4eef7SAlex Elder 
6204b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
62053abef3b3SAlex Elder 	if (rc) {
6206e37180c0SIlya Dryomov 		/*
620799d16943SIlya Dryomov 		 * rbd_unregister_watch() can't be moved into
6208e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
6209e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
6210e37180c0SIlya Dryomov 		 */
621199d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
62123abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
6213dd5ac32dSIlya Dryomov 		goto out;
62143abef3b3SAlex Elder 	}
62153abef3b3SAlex Elder 
6216dd5ac32dSIlya Dryomov 	rc = count;
6217dd5ac32dSIlya Dryomov out:
6218dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6219dd5ac32dSIlya Dryomov 	return rc;
6220b536f69aSAlex Elder 
6221c53d5893SAlex Elder err_out_rbd_dev:
6222811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
6223c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6224bd4ba655SAlex Elder err_out_client:
62259d3997fdSAlex Elder 	rbd_put_client(rbdc);
62260ddebc0cSAlex Elder err_out_args:
6227859c31dfSAlex Elder 	rbd_spec_put(spec);
6228d147543dSIlya Dryomov 	kfree(rbd_opts);
6229dd5ac32dSIlya Dryomov 	goto out;
6230602adf40SYehuda Sadeh }
6231602adf40SYehuda Sadeh 
62329b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
62339b60e70bSIlya Dryomov 		       const char *buf,
62349b60e70bSIlya Dryomov 		       size_t count)
62359b60e70bSIlya Dryomov {
62369b60e70bSIlya Dryomov 	if (single_major)
62379b60e70bSIlya Dryomov 		return -EINVAL;
62389b60e70bSIlya Dryomov 
62399b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
62409b60e70bSIlya Dryomov }
62419b60e70bSIlya Dryomov 
62429b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
62439b60e70bSIlya Dryomov 				    const char *buf,
62449b60e70bSIlya Dryomov 				    size_t count)
62459b60e70bSIlya Dryomov {
62469b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
62479b60e70bSIlya Dryomov }
62489b60e70bSIlya Dryomov 
6249dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6250602adf40SYehuda Sadeh {
6251602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
62521643dfa4SIlya Dryomov 
62531643dfa4SIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
62541643dfa4SIlya Dryomov 	list_del_init(&rbd_dev->node);
62551643dfa4SIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
62561643dfa4SIlya Dryomov 
6257200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6258dd5ac32dSIlya Dryomov 	device_del(&rbd_dev->dev);
62596d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
62609b60e70bSIlya Dryomov 	if (!single_major)
6261602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6262602adf40SYehuda Sadeh }
6263602adf40SYehuda Sadeh 
626405a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
626505a46afdSAlex Elder {
6266ad945fc1SAlex Elder 	while (rbd_dev->parent) {
626705a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
626805a46afdSAlex Elder 		struct rbd_device *second = first->parent;
626905a46afdSAlex Elder 		struct rbd_device *third;
627005a46afdSAlex Elder 
627105a46afdSAlex Elder 		/*
627205a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
627305a46afdSAlex Elder 		 * remove it.
627405a46afdSAlex Elder 		 */
627505a46afdSAlex Elder 		while (second && (third = second->parent)) {
627605a46afdSAlex Elder 			first = second;
627705a46afdSAlex Elder 			second = third;
627805a46afdSAlex Elder 		}
6279ad945fc1SAlex Elder 		rbd_assert(second);
62808ad42cd0SAlex Elder 		rbd_dev_image_release(second);
6281ad945fc1SAlex Elder 		first->parent = NULL;
6282ad945fc1SAlex Elder 		first->parent_overlap = 0;
6283ad945fc1SAlex Elder 
6284ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
628505a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
628605a46afdSAlex Elder 		first->parent_spec = NULL;
628705a46afdSAlex Elder 	}
628805a46afdSAlex Elder }
628905a46afdSAlex Elder 
62909b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6291602adf40SYehuda Sadeh 			     const char *buf,
6292602adf40SYehuda Sadeh 			     size_t count)
6293602adf40SYehuda Sadeh {
6294602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6295751cc0e3SAlex Elder 	struct list_head *tmp;
6296751cc0e3SAlex Elder 	int dev_id;
6297602adf40SYehuda Sadeh 	unsigned long ul;
629882a442d2SAlex Elder 	bool already = false;
62990d8189e1SAlex Elder 	int ret;
6300602adf40SYehuda Sadeh 
6301bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
63020d8189e1SAlex Elder 	if (ret)
63030d8189e1SAlex Elder 		return ret;
6304602adf40SYehuda Sadeh 
6305602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
6306751cc0e3SAlex Elder 	dev_id = (int)ul;
6307751cc0e3SAlex Elder 	if (dev_id != ul)
6308602adf40SYehuda Sadeh 		return -EINVAL;
6309602adf40SYehuda Sadeh 
6310602adf40SYehuda Sadeh 	ret = -ENOENT;
6311751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6312751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6313751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6314751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6315751cc0e3SAlex Elder 			ret = 0;
6316751cc0e3SAlex Elder 			break;
6317602adf40SYehuda Sadeh 		}
6318751cc0e3SAlex Elder 	}
6319751cc0e3SAlex Elder 	if (!ret) {
6320a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
6321b82d167bSAlex Elder 		if (rbd_dev->open_count)
632242382b70SAlex Elder 			ret = -EBUSY;
6323b82d167bSAlex Elder 		else
632482a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
632582a442d2SAlex Elder 							&rbd_dev->flags);
6326a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6327751cc0e3SAlex Elder 	}
6328751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
632982a442d2SAlex Elder 	if (ret < 0 || already)
63301ba0f1e7SAlex Elder 		return ret;
6331751cc0e3SAlex Elder 
6332ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
6333ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
6334ed95b21aSIlya Dryomov 		rbd_unlock(rbd_dev);
6335ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
633699d16943SIlya Dryomov 	rbd_unregister_watch(rbd_dev);
6337fca27065SIlya Dryomov 
63389875201eSJosh Durgin 	/*
63399875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
63409875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
63419875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
63429875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
63439875201eSJosh Durgin 	 */
6344dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
63458ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
6346aafb230eSAlex Elder 
63471ba0f1e7SAlex Elder 	return count;
6348602adf40SYehuda Sadeh }
6349602adf40SYehuda Sadeh 
63509b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
63519b60e70bSIlya Dryomov 			  const char *buf,
63529b60e70bSIlya Dryomov 			  size_t count)
63539b60e70bSIlya Dryomov {
63549b60e70bSIlya Dryomov 	if (single_major)
63559b60e70bSIlya Dryomov 		return -EINVAL;
63569b60e70bSIlya Dryomov 
63579b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
63589b60e70bSIlya Dryomov }
63599b60e70bSIlya Dryomov 
63609b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
63619b60e70bSIlya Dryomov 				       const char *buf,
63629b60e70bSIlya Dryomov 				       size_t count)
63639b60e70bSIlya Dryomov {
63649b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
63659b60e70bSIlya Dryomov }
63669b60e70bSIlya Dryomov 
6367602adf40SYehuda Sadeh /*
6368602adf40SYehuda Sadeh  * create control files in sysfs
6369dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6370602adf40SYehuda Sadeh  */
6371602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
6372602adf40SYehuda Sadeh {
6373dfc5606dSYehuda Sadeh 	int ret;
6374602adf40SYehuda Sadeh 
6375fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6376dfc5606dSYehuda Sadeh 	if (ret < 0)
6377dfc5606dSYehuda Sadeh 		return ret;
6378602adf40SYehuda Sadeh 
6379fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6380fed4c143SAlex Elder 	if (ret < 0)
6381fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6382602adf40SYehuda Sadeh 
6383602adf40SYehuda Sadeh 	return ret;
6384602adf40SYehuda Sadeh }
6385602adf40SYehuda Sadeh 
6386602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
6387602adf40SYehuda Sadeh {
6388dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6389fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6390602adf40SYehuda Sadeh }
6391602adf40SYehuda Sadeh 
63921c2a9dfeSAlex Elder static int rbd_slab_init(void)
63931c2a9dfeSAlex Elder {
63941c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
639503d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6396868311b1SAlex Elder 	if (!rbd_img_request_cache)
6397868311b1SAlex Elder 		return -ENOMEM;
6398868311b1SAlex Elder 
6399868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
640003d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
640178c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
640278c2a44aSAlex Elder 		goto out_err;
640378c2a44aSAlex Elder 
640478c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
640578c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
64062d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
640778c2a44aSAlex Elder 	if (rbd_segment_name_cache)
64081c2a9dfeSAlex Elder 		return 0;
640978c2a44aSAlex Elder out_err:
641078c2a44aSAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
641178c2a44aSAlex Elder 	rbd_obj_request_cache = NULL;
64121c2a9dfeSAlex Elder 
6413868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6414868311b1SAlex Elder 	rbd_img_request_cache = NULL;
6415868311b1SAlex Elder 
64161c2a9dfeSAlex Elder 	return -ENOMEM;
64171c2a9dfeSAlex Elder }
64181c2a9dfeSAlex Elder 
64191c2a9dfeSAlex Elder static void rbd_slab_exit(void)
64201c2a9dfeSAlex Elder {
642178c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
642278c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
642378c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
642478c2a44aSAlex Elder 
6425868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6426868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6427868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6428868311b1SAlex Elder 
64291c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
64301c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
64311c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
64321c2a9dfeSAlex Elder }
64331c2a9dfeSAlex Elder 
6434cc344fa1SAlex Elder static int __init rbd_init(void)
6435602adf40SYehuda Sadeh {
6436602adf40SYehuda Sadeh 	int rc;
6437602adf40SYehuda Sadeh 
64381e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
64391e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
64401e32d34cSAlex Elder 		return -EINVAL;
64411e32d34cSAlex Elder 	}
6442e1b4d96dSIlya Dryomov 
64431c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6444602adf40SYehuda Sadeh 	if (rc)
6445602adf40SYehuda Sadeh 		return rc;
6446e1b4d96dSIlya Dryomov 
6447f5ee37bdSIlya Dryomov 	/*
6448f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6449f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6450f5ee37bdSIlya Dryomov 	 */
6451f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6452f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6453f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6454f5ee37bdSIlya Dryomov 		goto err_out_slab;
6455f5ee37bdSIlya Dryomov 	}
6456f5ee37bdSIlya Dryomov 
64579b60e70bSIlya Dryomov 	if (single_major) {
64589b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
64599b60e70bSIlya Dryomov 		if (rbd_major < 0) {
64609b60e70bSIlya Dryomov 			rc = rbd_major;
6461f5ee37bdSIlya Dryomov 			goto err_out_wq;
64629b60e70bSIlya Dryomov 		}
64639b60e70bSIlya Dryomov 	}
64649b60e70bSIlya Dryomov 
64651c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
64661c2a9dfeSAlex Elder 	if (rc)
64679b60e70bSIlya Dryomov 		goto err_out_blkdev;
64681c2a9dfeSAlex Elder 
64699b60e70bSIlya Dryomov 	if (single_major)
64709b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
64719b60e70bSIlya Dryomov 	else
6472e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
64739b60e70bSIlya Dryomov 
6474e1b4d96dSIlya Dryomov 	return 0;
6475e1b4d96dSIlya Dryomov 
64769b60e70bSIlya Dryomov err_out_blkdev:
64779b60e70bSIlya Dryomov 	if (single_major)
64789b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6479f5ee37bdSIlya Dryomov err_out_wq:
6480f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6481e1b4d96dSIlya Dryomov err_out_slab:
6482e1b4d96dSIlya Dryomov 	rbd_slab_exit();
64831c2a9dfeSAlex Elder 	return rc;
6484602adf40SYehuda Sadeh }
6485602adf40SYehuda Sadeh 
6486cc344fa1SAlex Elder static void __exit rbd_exit(void)
6487602adf40SYehuda Sadeh {
6488ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6489602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
64909b60e70bSIlya Dryomov 	if (single_major)
64919b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6492f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
64931c2a9dfeSAlex Elder 	rbd_slab_exit();
6494602adf40SYehuda Sadeh }
6495602adf40SYehuda Sadeh 
6496602adf40SYehuda Sadeh module_init(rbd_init);
6497602adf40SYehuda Sadeh module_exit(rbd_exit);
6498602adf40SYehuda Sadeh 
6499d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6500602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6501602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6502602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6503602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6504602adf40SYehuda Sadeh 
650590da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6506602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6507