xref: /openbmc/linux/drivers/block/rbd.c (revision 9d4a227f)
1e2a58ee5SAlex Elder 
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
30602adf40SYehuda Sadeh 
#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"
51602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */
53aafb230eSAlex Elder 
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder  * -EINVAL without updating it.
59a2acd00eSAlex Elder  */
60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder 	unsigned int counter;
63a2acd00eSAlex Elder 
64bfc18e38SMark Rutland 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder 		return (int)counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	atomic_dec(v);
69a2acd00eSAlex Elder 
70a2acd00eSAlex Elder 	return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder 	int counter;
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
79a2acd00eSAlex Elder 	if (counter >= 0)
80a2acd00eSAlex Elder 		return counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	atomic_inc(v);
83a2acd00eSAlex Elder 
84a2acd00eSAlex Elder 	return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder 
/* Driver name; also used as the prefix of rbd_warn() messages. */
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* Shift applied when converting between a device id and a minor number. */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* NAME_MAX less the prefix length (the prefix's trailing NUL not counted). */
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
11299d16943SIlya Dryomov 
/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

/* Union of every feature bit defined above. */
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
132d889140cSAlex Elder 
/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
138602adf40SYehuda Sadeh 
139602adf40SYehuda Sadeh /*
140602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
141602adf40SYehuda Sadeh  */
142602adf40SYehuda Sadeh struct rbd_image_header {
143f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
144849b4260SAlex Elder 	char *object_prefix;
145602adf40SYehuda Sadeh 	__u8 obj_order;
146f35a4deeSAlex Elder 	u64 stripe_unit;
147f35a4deeSAlex Elder 	u64 stripe_count;
1487e97332eSIlya Dryomov 	s64 data_pool_id;
149f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
150602adf40SYehuda Sadeh 
151f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
152f84344f3SAlex Elder 	u64 image_size;
153f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
154f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
155f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15659c2be1eSYehuda Sadeh };
15759c2be1eSYehuda Sadeh 
1580d7dbfceSAlex Elder /*
1590d7dbfceSAlex Elder  * An rbd image specification.
1600d7dbfceSAlex Elder  *
1610d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
162c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
163c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
164c66c6e0cSAlex Elder  *
165c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
166c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
167c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
168c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
169c66c6e0cSAlex Elder  *
170c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
171c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
172c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
173c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
174c66c6e0cSAlex Elder  * is shared between the parent and child).
175c66c6e0cSAlex Elder  *
176c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
177c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
178c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
181c66c6e0cSAlex Elder  * could be a null pointer).
1820d7dbfceSAlex Elder  */
1830d7dbfceSAlex Elder struct rbd_spec {
1840d7dbfceSAlex Elder 	u64		pool_id;
185ecb4dc22SAlex Elder 	const char	*pool_name;
186b26c047bSIlya Dryomov 	const char	*pool_ns;	/* NULL if default, never "" */
1870d7dbfceSAlex Elder 
188ecb4dc22SAlex Elder 	const char	*image_id;
189ecb4dc22SAlex Elder 	const char	*image_name;
1900d7dbfceSAlex Elder 
1910d7dbfceSAlex Elder 	u64		snap_id;
192ecb4dc22SAlex Elder 	const char	*snap_name;
1930d7dbfceSAlex Elder 
1940d7dbfceSAlex Elder 	struct kref	kref;
1950d7dbfceSAlex Elder };
1960d7dbfceSAlex Elder 
197602adf40SYehuda Sadeh /*
198f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
199602adf40SYehuda Sadeh  */
200602adf40SYehuda Sadeh struct rbd_client {
201602adf40SYehuda Sadeh 	struct ceph_client	*client;
202602adf40SYehuda Sadeh 	struct kref		kref;
203602adf40SYehuda Sadeh 	struct list_head	node;
204602adf40SYehuda Sadeh };
205602adf40SYehuda Sadeh 
struct rbd_img_request;

/* How the data for an object request is described. */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};
214bf0d5f50SAlex Elder 
/* Operation carried by an image request (values start at 1). */
enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};
2216d2940c8SGuangliang Zhao 
/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_READ_FROM_PARENT,
	RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
	RBD_OBJ_WRITE_COPYUP_OPS,
};
254926f9b3fSAlex Elder 
255bf0d5f50SAlex Elder struct rbd_obj_request {
25643df3d35SIlya Dryomov 	struct ceph_object_extent ex;
257c5b5ef6cSAlex Elder 	union {
2583da691bfSIlya Dryomov 		bool			tried_parent;	/* for reads */
2593da691bfSIlya Dryomov 		enum rbd_obj_write_state write_state;	/* for writes */
2603da691bfSIlya Dryomov 	};
261bf0d5f50SAlex Elder 
262bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
26386bd7998SIlya Dryomov 	struct ceph_file_extent	*img_extents;
26486bd7998SIlya Dryomov 	u32			num_img_extents;
265bf0d5f50SAlex Elder 
266788e2df3SAlex Elder 	union {
2675359a17dSIlya Dryomov 		struct ceph_bio_iter	bio_pos;
268788e2df3SAlex Elder 		struct {
2697e07efb1SIlya Dryomov 			struct ceph_bvec_iter	bvec_pos;
2707e07efb1SIlya Dryomov 			u32			bvec_count;
271afb97888SIlya Dryomov 			u32			bvec_idx;
272788e2df3SAlex Elder 		};
273788e2df3SAlex Elder 	};
2747e07efb1SIlya Dryomov 	struct bio_vec		*copyup_bvecs;
2757e07efb1SIlya Dryomov 	u32			copyup_bvec_count;
276bf0d5f50SAlex Elder 
277bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
278bf0d5f50SAlex Elder 
279bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2801b83bef2SSage Weil 	int			result;
281bf0d5f50SAlex Elder 
282bf0d5f50SAlex Elder 	struct kref		kref;
283bf0d5f50SAlex Elder };
284bf0d5f50SAlex Elder 
/* Bit numbers for rbd_img_request.flags. */
enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2890c425248SAlex Elder 
290bf0d5f50SAlex Elder struct rbd_img_request {
291bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
2929bb0248dSIlya Dryomov 	enum obj_operation_type	op_type;
293ecc633caSIlya Dryomov 	enum obj_request_type	data_type;
2940c425248SAlex Elder 	unsigned long		flags;
295bf0d5f50SAlex Elder 	union {
296bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2979849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2989849e986SAlex Elder 	};
2999849e986SAlex Elder 	union {
3009849e986SAlex Elder 		struct request		*rq;		/* block request */
3019849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
302bf0d5f50SAlex Elder 	};
30315961b44SIlya Dryomov 	spinlock_t		completion_lock;
30455f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
305a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
306bf0d5f50SAlex Elder 
30743df3d35SIlya Dryomov 	struct list_head	object_extents;	/* obj_req.ex structs */
3087114edacSIlya Dryomov 	u32			pending_count;
309bf0d5f50SAlex Elder 
310bf0d5f50SAlex Elder 	struct kref		kref;
311bf0d5f50SAlex Elder };
312bf0d5f50SAlex Elder 
/*
 * Iterate over the object requests of image request @ireq.  The _safe
 * variant takes an extra cursor @n, as list_for_each_entry_safe() does.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
317bf0d5f50SAlex Elder 
/* Values of rbd_device.watch_state. */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
32399d16943SIlya Dryomov 
/*
 * Values of rbd_device.lock_state.  Both LOCKED and RELEASING count
 * as "lock owner" for __rbd_is_lock_owner().
 */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
329ed95b21aSIlya Dryomov 
330ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
331ed95b21aSIlya Dryomov struct rbd_client_id {
332ed95b21aSIlya Dryomov 	u64 gid;
333ed95b21aSIlya Dryomov 	u64 handle;
334ed95b21aSIlya Dryomov };
335ed95b21aSIlya Dryomov 
336f84344f3SAlex Elder struct rbd_mapping {
33799c1f08fSAlex Elder 	u64                     size;
33834b13184SAlex Elder 	u64                     features;
339f84344f3SAlex Elder };
340f84344f3SAlex Elder 
341602adf40SYehuda Sadeh /*
342602adf40SYehuda Sadeh  * a single device
343602adf40SYehuda Sadeh  */
344602adf40SYehuda Sadeh struct rbd_device {
345de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
346602adf40SYehuda Sadeh 
347602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
348dd82fff1SIlya Dryomov 	int			minor;
349602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
350602adf40SYehuda Sadeh 
351a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
352602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
353602adf40SYehuda Sadeh 
354602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
355602adf40SYehuda Sadeh 
356b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	struct rbd_image_header	header;
359b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3600d7dbfceSAlex Elder 	struct rbd_spec		*spec;
361d147543dSIlya Dryomov 	struct rbd_options	*opts;
3620d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
363602adf40SYehuda Sadeh 
364c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
365922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
366971f839aSAlex Elder 
3671643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3680903e875SAlex Elder 
36999d16943SIlya Dryomov 	struct mutex		watch_mutex;
37099d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
371922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
37299d16943SIlya Dryomov 	u64			watch_cookie;
37399d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
37459c2be1eSYehuda Sadeh 
375ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
376ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
377cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
378ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
379ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
380ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
381ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
382ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
383ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
384ed95b21aSIlya Dryomov 
3851643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
386602adf40SYehuda Sadeh 
38786b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
38886b00e0dSAlex Elder 	u64			parent_overlap;
389a2acd00eSAlex Elder 	atomic_t		parent_ref;
3902f82ee54SAlex Elder 	struct rbd_device	*parent;
39186b00e0dSAlex Elder 
3927ad18afaSChristoph Hellwig 	/* Block layer tags. */
3937ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
3947ad18afaSChristoph Hellwig 
395c666601aSJosh Durgin 	/* protects updating the header */
396c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
397f84344f3SAlex Elder 
398f84344f3SAlex Elder 	struct rbd_mapping	mapping;
399602adf40SYehuda Sadeh 
400602adf40SYehuda Sadeh 	struct list_head	node;
401dfc5606dSYehuda Sadeh 
402dfc5606dSYehuda Sadeh 	/* sysfs related */
403dfc5606dSYehuda Sadeh 	struct device		dev;
404b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
405dfc5606dSYehuda Sadeh };
406dfc5606dSYehuda Sadeh 
/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
4186d292906SAlex Elder 
419cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
420e124a82fSAlex Elder 
421602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
422e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
423e124a82fSAlex Elder 
424602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
425432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
426602adf40SYehuda Sadeh 
42778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
42878c2a44aSAlex Elder 
4291c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
430868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4311c2a9dfeSAlex Elder 
4329b60e70bSIlya Dryomov static int rbd_major;
433f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
434f8a22fc2SIlya Dryomov 
435f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
436f5ee37bdSIlya Dryomov 
43789a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = {
43889a59c1cSIlya Dryomov 	.nref = REFCOUNT_INIT(1),
43989a59c1cSIlya Dryomov };
44089a59c1cSIlya Dryomov 
4419b60e70bSIlya Dryomov /*
4423cfa3b16SIlya Dryomov  * single-major requires >= 0.75 version of userspace rbd utility.
4439b60e70bSIlya Dryomov  */
4443cfa3b16SIlya Dryomov static bool single_major = true;
4455657a819SJoe Perches module_param(single_major, bool, 0444);
4463cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4479b60e70bSIlya Dryomov 
4487e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
4497e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf,
450f0f8cef5SAlex Elder 			    size_t count);
4517e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
452f0f8cef5SAlex Elder 				      size_t count);
4537e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
4549b60e70bSIlya Dryomov 					 size_t count);
4556d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
456f0f8cef5SAlex Elder 
4579b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4589b60e70bSIlya Dryomov {
4597e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4609b60e70bSIlya Dryomov }
4619b60e70bSIlya Dryomov 
4629b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4639b60e70bSIlya Dryomov {
4647e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4659b60e70bSIlya Dryomov }
4669b60e70bSIlya Dryomov 
467ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
468ed95b21aSIlya Dryomov {
469ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
470ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
471ed95b21aSIlya Dryomov }
472ed95b21aSIlya Dryomov 
473ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
474ed95b21aSIlya Dryomov {
475ed95b21aSIlya Dryomov 	bool is_lock_owner;
476ed95b21aSIlya Dryomov 
477ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
478ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
479ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
480ed95b21aSIlya Dryomov 	return is_lock_owner;
481ed95b21aSIlya Dryomov }
482ed95b21aSIlya Dryomov 
4837e9586baSGreg Kroah-Hartman static ssize_t supported_features_show(struct bus_type *bus, char *buf)
4848767b293SIlya Dryomov {
4858767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
4868767b293SIlya Dryomov }
4878767b293SIlya Dryomov 
4887e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add);
4897e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove);
4907e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major);
4917e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major);
4927e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features);
493b15a21ddSGreg Kroah-Hartman 
494b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
495b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
496b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4979b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4989b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
4998767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
500b15a21ddSGreg Kroah-Hartman 	NULL,
501f0f8cef5SAlex Elder };
50292c76dc0SIlya Dryomov 
50392c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
50492c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
50592c76dc0SIlya Dryomov {
5069b60e70bSIlya Dryomov 	if (!single_major &&
5079b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5089b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5099b60e70bSIlya Dryomov 		return 0;
5109b60e70bSIlya Dryomov 
51192c76dc0SIlya Dryomov 	return attr->mode;
51292c76dc0SIlya Dryomov }
51392c76dc0SIlya Dryomov 
51492c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
51592c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
51692c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
51792c76dc0SIlya Dryomov };
51892c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
519f0f8cef5SAlex Elder 
520f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
521f0f8cef5SAlex Elder 	.name		= "rbd",
522b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
523f0f8cef5SAlex Elder };
524f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally empty: that
 * device is a static object, so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
528f0f8cef5SAlex Elder 
529f0f8cef5SAlex Elder static struct device rbd_root_dev = {
530f0f8cef5SAlex Elder 	.init_name =    "rbd",
531f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
532f0f8cef5SAlex Elder };
533f0f8cef5SAlex Elder 
53406ecc6cbSAlex Elder static __printf(2, 3)
53506ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
53606ecc6cbSAlex Elder {
53706ecc6cbSAlex Elder 	struct va_format vaf;
53806ecc6cbSAlex Elder 	va_list args;
53906ecc6cbSAlex Elder 
54006ecc6cbSAlex Elder 	va_start(args, fmt);
54106ecc6cbSAlex Elder 	vaf.fmt = fmt;
54206ecc6cbSAlex Elder 	vaf.va = &args;
54306ecc6cbSAlex Elder 
54406ecc6cbSAlex Elder 	if (!rbd_dev)
54506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
54606ecc6cbSAlex Elder 	else if (rbd_dev->disk)
54706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
54806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
54906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
55006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
55106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
55206ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
55306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
55406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
55506ecc6cbSAlex Elder 	else	/* punt */
55606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
55706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
55806ecc6cbSAlex Elder 	va_end(args);
55906ecc6cbSAlex Elder }
56006ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, log the failing
 * expression with its function and line and then BUG() if @expr is
 * false; otherwise expand to a no-op.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement (safe in an unbraced if/else).  Both variants
 * need a terminating semicolon at the call site.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
573dfc5606dSYehuda Sadeh 
57405a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5758b3e1a56SAlex Elder 
576cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5772df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
578a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
579e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
58054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
58154cac61fSAlex Elder 					u64 snap_id);
5822ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5832ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5842ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5852ad3d716SAlex Elder 		u64 *snap_features);
58659c2be1eSYehuda Sadeh 
587602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
588602adf40SYehuda Sadeh {
589f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
590b82d167bSAlex Elder 	bool removing = false;
591602adf40SYehuda Sadeh 
592a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
593b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
594b82d167bSAlex Elder 		removing = true;
595b82d167bSAlex Elder 	else
596b82d167bSAlex Elder 		rbd_dev->open_count++;
597a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
598b82d167bSAlex Elder 	if (removing)
599b82d167bSAlex Elder 		return -ENOENT;
600b82d167bSAlex Elder 
601c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
602340c7a2bSAlex Elder 
603602adf40SYehuda Sadeh 	return 0;
604602adf40SYehuda Sadeh }
605602adf40SYehuda Sadeh 
606db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
607dfc5606dSYehuda Sadeh {
608dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
609b82d167bSAlex Elder 	unsigned long open_count_before;
610b82d167bSAlex Elder 
611a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
612b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
613a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
614b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
615dfc5606dSYehuda Sadeh 
616c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
617dfc5606dSYehuda Sadeh }
618dfc5606dSYehuda Sadeh 
619131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
620131fd9f6SGuangliang Zhao {
6211de797bbSIlya Dryomov 	int ro;
622131fd9f6SGuangliang Zhao 
6231de797bbSIlya Dryomov 	if (get_user(ro, (int __user *)arg))
624131fd9f6SGuangliang Zhao 		return -EFAULT;
625131fd9f6SGuangliang Zhao 
6261de797bbSIlya Dryomov 	/* Snapshots can't be marked read-write */
627131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
628131fd9f6SGuangliang Zhao 		return -EROFS;
629131fd9f6SGuangliang Zhao 
6301de797bbSIlya Dryomov 	/* Let blkdev_roset() handle it */
6311de797bbSIlya Dryomov 	return -ENOTTY;
632131fd9f6SGuangliang Zhao }
633131fd9f6SGuangliang Zhao 
634131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
635131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
636131fd9f6SGuangliang Zhao {
637131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
6381de797bbSIlya Dryomov 	int ret;
639131fd9f6SGuangliang Zhao 
640131fd9f6SGuangliang Zhao 	switch (cmd) {
641131fd9f6SGuangliang Zhao 	case BLKROSET:
642131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
643131fd9f6SGuangliang Zhao 		break;
644131fd9f6SGuangliang Zhao 	default:
645131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
646131fd9f6SGuangliang Zhao 	}
647131fd9f6SGuangliang Zhao 
648131fd9f6SGuangliang Zhao 	return ret;
649131fd9f6SGuangliang Zhao }
650131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl hook: forwards to rbd_ioctl() with the arguments
 * unchanged.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
			    unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
658131fd9f6SGuangliang Zhao 
659602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
660602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
661602adf40SYehuda Sadeh 	.open			= rbd_open,
662dfc5606dSYehuda Sadeh 	.release		= rbd_release,
663131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
664131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
665131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
666131fd9f6SGuangliang Zhao #endif
667602adf40SYehuda Sadeh };
668602adf40SYehuda Sadeh 
669602adf40SYehuda Sadeh /*
6707262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
671cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
672602adf40SYehuda Sadeh  */
673f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
674602adf40SYehuda Sadeh {
675602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
676602adf40SYehuda Sadeh 	int ret = -ENOMEM;
677602adf40SYehuda Sadeh 
67837206ee5SAlex Elder 	dout("%s:\n", __func__);
679602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
680602adf40SYehuda Sadeh 	if (!rbdc)
681602adf40SYehuda Sadeh 		goto out_opt;
682602adf40SYehuda Sadeh 
683602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
684602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
685602adf40SYehuda Sadeh 
68674da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
687602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
68808f75463SAlex Elder 		goto out_rbdc;
68943ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
690602adf40SYehuda Sadeh 
691602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
692602adf40SYehuda Sadeh 	if (ret < 0)
69308f75463SAlex Elder 		goto out_client;
694602adf40SYehuda Sadeh 
695432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
696602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
697432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
698602adf40SYehuda Sadeh 
69937206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
700bc534d86SAlex Elder 
701602adf40SYehuda Sadeh 	return rbdc;
70208f75463SAlex Elder out_client:
703602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
70408f75463SAlex Elder out_rbdc:
705602adf40SYehuda Sadeh 	kfree(rbdc);
706602adf40SYehuda Sadeh out_opt:
70743ae4701SAlex Elder 	if (ceph_opts)
70843ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
70937206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
71037206ee5SAlex Elder 
71128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
712602adf40SYehuda Sadeh }
713602adf40SYehuda Sadeh 
7142f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7152f82ee54SAlex Elder {
7162f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7172f82ee54SAlex Elder 
7182f82ee54SAlex Elder 	return rbdc;
7192f82ee54SAlex Elder }
7202f82ee54SAlex Elder 
721602adf40SYehuda Sadeh /*
7221f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7231f7ba331SAlex Elder  * found, bump its reference count.
724602adf40SYehuda Sadeh  */
7251f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
726602adf40SYehuda Sadeh {
727602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7281f7ba331SAlex Elder 	bool found = false;
729602adf40SYehuda Sadeh 
73043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
731602adf40SYehuda Sadeh 		return NULL;
732602adf40SYehuda Sadeh 
7331f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7341f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7351f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7362f82ee54SAlex Elder 			__rbd_get_client(client_node);
7372f82ee54SAlex Elder 
7381f7ba331SAlex Elder 			found = true;
7391f7ba331SAlex Elder 			break;
7401f7ba331SAlex Elder 		}
7411f7ba331SAlex Elder 	}
7421f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7431f7ba331SAlex Elder 
7441f7ba331SAlex Elder 	return found ? client_node : NULL;
745602adf40SYehuda Sadeh }
746602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * The Opt_last_int and Opt_last_string markers partition the tokens
 * into int-valued, string-valued and argument-less groups;
 * parse_rbd_opts_token() depends on this ordering.
 */
enum {
	/* options that take an int argument */
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,

	/* options that take a string argument */
	Opt_pool_ns,
	Opt_last_string,

	/* options without an argument */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,

	/* returned for anything that matches no pattern */
	Opt_err
};
76659c2be1eSYehuda Sadeh 
76743ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
768b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
7690c93e1b7SIlya Dryomov 	{Opt_alloc_size, "alloc_size=%d"},
77034f55d0bSDongsheng Yang 	{Opt_lock_timeout, "lock_timeout=%d"},
77159c2be1eSYehuda Sadeh 	/* int args above */
772b26c047bSIlya Dryomov 	{Opt_pool_ns, "_pool_ns=%s"},
77359c2be1eSYehuda Sadeh 	/* string args above */
774be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
775cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
776cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
777cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
77880de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
779e010dd0aSIlya Dryomov 	{Opt_exclusive, "exclusive"},
780d9360540SIlya Dryomov 	{Opt_notrim, "notrim"},
781210c104cSIlya Dryomov 	{Opt_err, NULL}
78259c2be1eSYehuda Sadeh };
78359c2be1eSYehuda Sadeh 
78498571b5aSAlex Elder struct rbd_options {
785b5584180SIlya Dryomov 	int	queue_depth;
7860c93e1b7SIlya Dryomov 	int	alloc_size;
78734f55d0bSDongsheng Yang 	unsigned long	lock_timeout;
78898571b5aSAlex Elder 	bool	read_only;
78980de1912SIlya Dryomov 	bool	lock_on_read;
790e010dd0aSIlya Dryomov 	bool	exclusive;
791d9360540SIlya Dryomov 	bool	trim;
79298571b5aSAlex Elder };
79398571b5aSAlex Elder 
/* Default values, named after the struct rbd_options fields they match */
#define RBD_QUEUE_DEPTH_DEFAULT		BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT		(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT	0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT		false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT		false
#define RBD_TRIM_DEFAULT		true
80198571b5aSAlex Elder 
/*
 * Context passed (as the opaque @private pointer) to
 * parse_rbd_opts_token().
 */
struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;	/* receives pool_ns */
	struct rbd_options	*opts;	/* receives everything else */
};
806c300156bSIlya Dryomov 
80759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
80859c2be1eSYehuda Sadeh {
809c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx *pctx = private;
81059c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
81159c2be1eSYehuda Sadeh 	int token, intval, ret;
81259c2be1eSYehuda Sadeh 
81343ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
81459c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
81559c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
81659c2be1eSYehuda Sadeh 		if (ret < 0) {
8172f56b6baSIlya Dryomov 			pr_err("bad option arg (not int) at '%s'\n", c);
81859c2be1eSYehuda Sadeh 			return ret;
81959c2be1eSYehuda Sadeh 		}
82059c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
82159c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
822210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
82359c2be1eSYehuda Sadeh 	} else {
82459c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
82559c2be1eSYehuda Sadeh 	}
82659c2be1eSYehuda Sadeh 
82759c2be1eSYehuda Sadeh 	switch (token) {
828b5584180SIlya Dryomov 	case Opt_queue_depth:
829b5584180SIlya Dryomov 		if (intval < 1) {
830b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
831b5584180SIlya Dryomov 			return -EINVAL;
832b5584180SIlya Dryomov 		}
833c300156bSIlya Dryomov 		pctx->opts->queue_depth = intval;
834b5584180SIlya Dryomov 		break;
8350c93e1b7SIlya Dryomov 	case Opt_alloc_size:
83616d80c54SIlya Dryomov 		if (intval < SECTOR_SIZE) {
8370c93e1b7SIlya Dryomov 			pr_err("alloc_size out of range\n");
8380c93e1b7SIlya Dryomov 			return -EINVAL;
8390c93e1b7SIlya Dryomov 		}
8400c93e1b7SIlya Dryomov 		if (!is_power_of_2(intval)) {
8410c93e1b7SIlya Dryomov 			pr_err("alloc_size must be a power of 2\n");
8420c93e1b7SIlya Dryomov 			return -EINVAL;
8430c93e1b7SIlya Dryomov 		}
8440c93e1b7SIlya Dryomov 		pctx->opts->alloc_size = intval;
8450c93e1b7SIlya Dryomov 		break;
84634f55d0bSDongsheng Yang 	case Opt_lock_timeout:
84734f55d0bSDongsheng Yang 		/* 0 is "wait forever" (i.e. infinite timeout) */
84834f55d0bSDongsheng Yang 		if (intval < 0 || intval > INT_MAX / 1000) {
84934f55d0bSDongsheng Yang 			pr_err("lock_timeout out of range\n");
85034f55d0bSDongsheng Yang 			return -EINVAL;
85134f55d0bSDongsheng Yang 		}
852c300156bSIlya Dryomov 		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
85334f55d0bSDongsheng Yang 		break;
854b26c047bSIlya Dryomov 	case Opt_pool_ns:
855b26c047bSIlya Dryomov 		kfree(pctx->spec->pool_ns);
856b26c047bSIlya Dryomov 		pctx->spec->pool_ns = match_strdup(argstr);
857b26c047bSIlya Dryomov 		if (!pctx->spec->pool_ns)
858b26c047bSIlya Dryomov 			return -ENOMEM;
85959c2be1eSYehuda Sadeh 		break;
860cc0538b6SAlex Elder 	case Opt_read_only:
861c300156bSIlya Dryomov 		pctx->opts->read_only = true;
862cc0538b6SAlex Elder 		break;
863cc0538b6SAlex Elder 	case Opt_read_write:
864c300156bSIlya Dryomov 		pctx->opts->read_only = false;
865cc0538b6SAlex Elder 		break;
86680de1912SIlya Dryomov 	case Opt_lock_on_read:
867c300156bSIlya Dryomov 		pctx->opts->lock_on_read = true;
86880de1912SIlya Dryomov 		break;
869e010dd0aSIlya Dryomov 	case Opt_exclusive:
870c300156bSIlya Dryomov 		pctx->opts->exclusive = true;
871e010dd0aSIlya Dryomov 		break;
872d9360540SIlya Dryomov 	case Opt_notrim:
873c300156bSIlya Dryomov 		pctx->opts->trim = false;
874d9360540SIlya Dryomov 		break;
87559c2be1eSYehuda Sadeh 	default:
876210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
877210c104cSIlya Dryomov 		return -EINVAL;
87859c2be1eSYehuda Sadeh 	}
879210c104cSIlya Dryomov 
88059c2be1eSYehuda Sadeh 	return 0;
88159c2be1eSYehuda Sadeh }
88259c2be1eSYehuda Sadeh 
8836d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8846d2940c8SGuangliang Zhao {
8856d2940c8SGuangliang Zhao 	switch (op_type) {
8866d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8876d2940c8SGuangliang Zhao 		return "read";
8886d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8896d2940c8SGuangliang Zhao 		return "write";
89090e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
89190e98c52SGuangliang Zhao 		return "discard";
8926484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
8936484cbe9SIlya Dryomov 		return "zeroout";
8946d2940c8SGuangliang Zhao 	default:
8956d2940c8SGuangliang Zhao 		return "???";
8966d2940c8SGuangliang Zhao 	}
8976d2940c8SGuangliang Zhao }
8986d2940c8SGuangliang Zhao 
/*
 * Destroy ceph client.  This is the kref release callback, invoked
 * once the last reference to the client is dropped.
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must not hold that lock.
 */
904602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
905602adf40SYehuda Sadeh {
906602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
907602adf40SYehuda Sadeh 
90837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
909cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
910602adf40SYehuda Sadeh 	list_del(&rbdc->node);
911cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
912602adf40SYehuda Sadeh 
913602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
914602adf40SYehuda Sadeh 	kfree(rbdc);
915602adf40SYehuda Sadeh }
916602adf40SYehuda Sadeh 
917602adf40SYehuda Sadeh /*
918602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
919602adf40SYehuda Sadeh  * it.
920602adf40SYehuda Sadeh  */
9219d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
922602adf40SYehuda Sadeh {
923c53d5893SAlex Elder 	if (rbdc)
9249d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
925602adf40SYehuda Sadeh }
926602adf40SYehuda Sadeh 
9275feb0d8dSIlya Dryomov /*
9285feb0d8dSIlya Dryomov  * Get a ceph client with specific addr and configuration, if one does
9295feb0d8dSIlya Dryomov  * not exist create it.  Either way, ceph_opts is consumed by this
9305feb0d8dSIlya Dryomov  * function.
9315feb0d8dSIlya Dryomov  */
9325feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
9335feb0d8dSIlya Dryomov {
9345feb0d8dSIlya Dryomov 	struct rbd_client *rbdc;
935dd435855SIlya Dryomov 	int ret;
9365feb0d8dSIlya Dryomov 
9375feb0d8dSIlya Dryomov 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
9385feb0d8dSIlya Dryomov 	rbdc = rbd_client_find(ceph_opts);
939dd435855SIlya Dryomov 	if (rbdc) {
9405feb0d8dSIlya Dryomov 		ceph_destroy_options(ceph_opts);
941dd435855SIlya Dryomov 
942dd435855SIlya Dryomov 		/*
943dd435855SIlya Dryomov 		 * Using an existing client.  Make sure ->pg_pools is up to
944dd435855SIlya Dryomov 		 * date before we look up the pool id in do_rbd_add().
945dd435855SIlya Dryomov 		 */
9469d4a227fSIlya Dryomov 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
9479d4a227fSIlya Dryomov 					rbdc->client->options->mount_timeout);
948dd435855SIlya Dryomov 		if (ret) {
949dd435855SIlya Dryomov 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
950dd435855SIlya Dryomov 			rbd_put_client(rbdc);
951dd435855SIlya Dryomov 			rbdc = ERR_PTR(ret);
952dd435855SIlya Dryomov 		}
953dd435855SIlya Dryomov 	} else {
9545feb0d8dSIlya Dryomov 		rbdc = rbd_client_create(ceph_opts);
955dd435855SIlya Dryomov 	}
9565feb0d8dSIlya Dryomov 	mutex_unlock(&client_mutex);
9575feb0d8dSIlya Dryomov 
9585feb0d8dSIlya Dryomov 	return rbdc;
9595feb0d8dSIlya Dryomov }
9605feb0d8dSIlya Dryomov 
961a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
962a30b71b9SAlex Elder {
963a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
964a30b71b9SAlex Elder }
965a30b71b9SAlex Elder 
9668e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9678e94af8eSAlex Elder {
968103a150fSAlex Elder 	size_t size;
969103a150fSAlex Elder 	u32 snap_count;
970103a150fSAlex Elder 
971103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
972103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
973103a150fSAlex Elder 		return false;
974103a150fSAlex Elder 
975db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
976db2388b6SAlex Elder 
977db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
978db2388b6SAlex Elder 		return false;
979db2388b6SAlex Elder 
980db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
981db2388b6SAlex Elder 
982db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
983db2388b6SAlex Elder 		return false;
984db2388b6SAlex Elder 
985103a150fSAlex Elder 	/*
986103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
987103a150fSAlex Elder 	 * that limits the number of snapshots.
988103a150fSAlex Elder 	 */
989103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
990103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
991103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
992103a150fSAlex Elder 		return false;
993103a150fSAlex Elder 
994103a150fSAlex Elder 	/*
995103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
996103a150fSAlex Elder 	 * header must also be representable in a size_t.
997103a150fSAlex Elder 	 */
998103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
999103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1000103a150fSAlex Elder 		return false;
1001103a150fSAlex Elder 
1002103a150fSAlex Elder 	return true;
10038e94af8eSAlex Elder }
10048e94af8eSAlex Elder 
1005602adf40SYehuda Sadeh /*
10065bc3fb17SIlya Dryomov  * returns the size of an object in the image
10075bc3fb17SIlya Dryomov  */
10085bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
10095bc3fb17SIlya Dryomov {
10105bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
10115bc3fb17SIlya Dryomov }
10125bc3fb17SIlya Dryomov 
1013263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
1014263423f8SIlya Dryomov {
1015263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
1016263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
1017263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1018263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
1019263423f8SIlya Dryomov 	}
1020263423f8SIlya Dryomov 
1021263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1022263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1023263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
10247e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
10257e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1026263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1027263423f8SIlya Dryomov }
1028263423f8SIlya Dryomov 
10295bc3fb17SIlya Dryomov /*
1030bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1031bb23e37aSAlex Elder  * on-disk header.
1032602adf40SYehuda Sadeh  */
1033662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
10344156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1035602adf40SYehuda Sadeh {
1036662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1037bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1038bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1039bb23e37aSAlex Elder 	char *object_prefix = NULL;
1040bb23e37aSAlex Elder 	char *snap_names = NULL;
1041bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1042ccece235SAlex Elder 	u32 snap_count;
1043bb23e37aSAlex Elder 	int ret = -ENOMEM;
1044621901d6SAlex Elder 	u32 i;
1045602adf40SYehuda Sadeh 
1046bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1047103a150fSAlex Elder 
1048bb23e37aSAlex Elder 	if (first_time) {
1049848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1050848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1051848d796cSIlya Dryomov 					 GFP_KERNEL);
1052bb23e37aSAlex Elder 		if (!object_prefix)
1053602adf40SYehuda Sadeh 			return -ENOMEM;
1054bb23e37aSAlex Elder 	}
105500f1f36fSAlex Elder 
1056bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1057d2bb24e5SAlex Elder 
1058602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1059bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1060bb23e37aSAlex Elder 	if (!snapc)
1061bb23e37aSAlex Elder 		goto out_err;
1062bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1063602adf40SYehuda Sadeh 	if (snap_count) {
1064bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1065f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1066f785cc1dSAlex Elder 
1067bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1068621901d6SAlex Elder 
1069f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1070bb23e37aSAlex Elder 			goto out_2big;
1071bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1072bb23e37aSAlex Elder 		if (!snap_names)
1073602adf40SYehuda Sadeh 			goto out_err;
1074bb23e37aSAlex Elder 
1075bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
107688a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
107788a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
107888a25a5fSMarkus Elfring 					   GFP_KERNEL);
1079bb23e37aSAlex Elder 		if (!snap_sizes)
1080bb23e37aSAlex Elder 			goto out_err;
1081bb23e37aSAlex Elder 
1082f785cc1dSAlex Elder 		/*
1083bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1084bb23e37aSAlex Elder 		 * and size.
1085bb23e37aSAlex Elder 		 *
108699a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1087bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1088f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1089f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1090f785cc1dSAlex Elder 		 */
1091bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1092bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1093bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1094bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1095bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1096bb23e37aSAlex Elder 		}
1097602adf40SYehuda Sadeh 	}
1098849b4260SAlex Elder 
1099bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1100bb23e37aSAlex Elder 
1101bb23e37aSAlex Elder 	if (first_time) {
1102bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1103602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1104263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1105662518b1SAlex Elder 	} else {
1106662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1107662518b1SAlex Elder 		kfree(header->snap_names);
1108662518b1SAlex Elder 		kfree(header->snap_sizes);
1109bb23e37aSAlex Elder 	}
11106a52325fSAlex Elder 
1111bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1112621901d6SAlex Elder 
1113f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1114bb23e37aSAlex Elder 	header->snapc = snapc;
1115bb23e37aSAlex Elder 	header->snap_names = snap_names;
1116bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1117468521c1SAlex Elder 
1118602adf40SYehuda Sadeh 	return 0;
1119bb23e37aSAlex Elder out_2big:
1120bb23e37aSAlex Elder 	ret = -EIO;
11216a52325fSAlex Elder out_err:
1122bb23e37aSAlex Elder 	kfree(snap_sizes);
1123bb23e37aSAlex Elder 	kfree(snap_names);
1124bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1125bb23e37aSAlex Elder 	kfree(object_prefix);
1126ccece235SAlex Elder 
1127bb23e37aSAlex Elder 	return ret;
1128602adf40SYehuda Sadeh }
1129602adf40SYehuda Sadeh 
11309682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11319682fc6dSAlex Elder {
11329682fc6dSAlex Elder 	const char *snap_name;
11339682fc6dSAlex Elder 
11349682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11359682fc6dSAlex Elder 
11369682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11379682fc6dSAlex Elder 
11389682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11399682fc6dSAlex Elder 	while (which--)
11409682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11419682fc6dSAlex Elder 
11429682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11439682fc6dSAlex Elder }
11449682fc6dSAlex Elder 
114530d1cff8SAlex Elder /*
114630d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
114730d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
114830d1cff8SAlex Elder  */
114930d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
115030d1cff8SAlex Elder {
115130d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
115230d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
115330d1cff8SAlex Elder 
115430d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
115530d1cff8SAlex Elder 		return 1;
115630d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
115730d1cff8SAlex Elder }
115830d1cff8SAlex Elder 
115930d1cff8SAlex Elder /*
116030d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
116130d1cff8SAlex Elder  * present.
116230d1cff8SAlex Elder  *
116330d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
116430d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
116530d1cff8SAlex Elder  *
116630d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
116730d1cff8SAlex Elder  * reverse order, highest snapshot id first.
116830d1cff8SAlex Elder  */
11699682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11709682fc6dSAlex Elder {
11719682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
117230d1cff8SAlex Elder 	u64 *found;
11739682fc6dSAlex Elder 
117430d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
117530d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11769682fc6dSAlex Elder 
117730d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11789682fc6dSAlex Elder }
11799682fc6dSAlex Elder 
11802ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11812ad3d716SAlex Elder 					u64 snap_id)
118254cac61fSAlex Elder {
118354cac61fSAlex Elder 	u32 which;
1184da6a6b63SJosh Durgin 	const char *snap_name;
118554cac61fSAlex Elder 
118654cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
118754cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1188da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
118954cac61fSAlex Elder 
1190da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1191da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
119254cac61fSAlex Elder }
119354cac61fSAlex Elder 
11949e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
11959e15b77dSAlex Elder {
11969e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
11979e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
11989e15b77dSAlex Elder 
119954cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
120054cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
120154cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
12029e15b77dSAlex Elder 
120354cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
12049e15b77dSAlex Elder }
12059e15b77dSAlex Elder 
12062ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
12072ad3d716SAlex Elder 				u64 *snap_size)
1208602adf40SYehuda Sadeh {
12092ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12102ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12112ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
12122ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12132ad3d716SAlex Elder 		u32 which;
121400f1f36fSAlex Elder 
12152ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
12162ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
12172ad3d716SAlex Elder 			return -ENOENT;
121800f1f36fSAlex Elder 
12192ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
12202ad3d716SAlex Elder 	} else {
12212ad3d716SAlex Elder 		u64 size = 0;
12222ad3d716SAlex Elder 		int ret;
12232ad3d716SAlex Elder 
12242ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
12252ad3d716SAlex Elder 		if (ret)
12262ad3d716SAlex Elder 			return ret;
12272ad3d716SAlex Elder 
12282ad3d716SAlex Elder 		*snap_size = size;
12292ad3d716SAlex Elder 	}
12302ad3d716SAlex Elder 	return 0;
12312ad3d716SAlex Elder }
12322ad3d716SAlex Elder 
12332ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
12342ad3d716SAlex Elder 			u64 *snap_features)
12352ad3d716SAlex Elder {
12362ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12372ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12382ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
12392ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12402ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
12412ad3d716SAlex Elder 	} else {
12422ad3d716SAlex Elder 		u64 features = 0;
12432ad3d716SAlex Elder 		int ret;
12442ad3d716SAlex Elder 
12452ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12462ad3d716SAlex Elder 		if (ret)
12472ad3d716SAlex Elder 			return ret;
12482ad3d716SAlex Elder 
12492ad3d716SAlex Elder 		*snap_features = features;
12502ad3d716SAlex Elder 	}
12512ad3d716SAlex Elder 	return 0;
125200f1f36fSAlex Elder }
1253602adf40SYehuda Sadeh 
1254d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1255602adf40SYehuda Sadeh {
12568f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12572ad3d716SAlex Elder 	u64 size = 0;
12582ad3d716SAlex Elder 	u64 features = 0;
12592ad3d716SAlex Elder 	int ret;
12608b0241f8SAlex Elder 
12612ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12622ad3d716SAlex Elder 	if (ret)
12632ad3d716SAlex Elder 		return ret;
12642ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12652ad3d716SAlex Elder 	if (ret)
12662ad3d716SAlex Elder 		return ret;
12672ad3d716SAlex Elder 
12682ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12692ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12702ad3d716SAlex Elder 
12718b0241f8SAlex Elder 	return 0;
1272602adf40SYehuda Sadeh }
1273602adf40SYehuda Sadeh 
1274d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1275d1cf5788SAlex Elder {
1276d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1277d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1278200a6a8bSAlex Elder }
1279200a6a8bSAlex Elder 
12805359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv)
128165ccfe21SAlex Elder {
1282602adf40SYehuda Sadeh 	void *buf;
12835359a17dSIlya Dryomov 	unsigned long flags;
1284602adf40SYehuda Sadeh 
12855359a17dSIlya Dryomov 	buf = bvec_kmap_irq(bv, &flags);
12865359a17dSIlya Dryomov 	memset(buf, 0, bv->bv_len);
12875359a17dSIlya Dryomov 	flush_dcache_page(bv->bv_page);
128885b5aaa6SDan Carpenter 	bvec_kunmap_irq(buf, &flags);
1289602adf40SYehuda Sadeh }
1290602adf40SYehuda Sadeh 
/*
 * Zero @bytes bytes of bio data, starting @off bytes past the position
 * described by @bio_pos.
 *
 * The iterator is copied first and only the copy is advanced, so the
 * caller's @bio_pos is not modified.
 */
static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	/* skip the first @off bytes */
	ceph_bio_iter_advance(&it, off);
	/*
	 * NOTE(review): "bv" is not declared in this function; presumably
	 * ceph_bio_iter_advance_step() provides it for each step -- confirm
	 * against the macro definition.
	 */
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}
1300b9434c5bSAlex Elder 
/*
 * Zero @bytes bytes of bio_vec data, starting @off bytes past the
 * position described by @bvec_pos.
 *
 * The iterator is copied first and only the copy is advanced, so the
 * caller's @bvec_pos is not modified.
 */
static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	/* skip the first @off bytes */
	ceph_bvec_iter_advance(&it, off);
	/*
	 * NOTE(review): "bv" is not declared in this function; presumably
	 * ceph_bvec_iter_advance_step() provides it for each step -- confirm
	 * against the macro definition.
	 */
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}
1310602adf40SYehuda Sadeh 
1311f7760dadSAlex Elder /*
13123da691bfSIlya Dryomov  * Zero a range in @obj_req data buffer defined by a bio (list) or
1313afb97888SIlya Dryomov  * (private) bio_vec array.
1314f7760dadSAlex Elder  *
13153da691bfSIlya Dryomov  * @off is relative to the start of the data buffer.
1316f7760dadSAlex Elder  */
13173da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
13183da691bfSIlya Dryomov 			       u32 bytes)
1319f7760dadSAlex Elder {
1320ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
13213da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
13223da691bfSIlya Dryomov 		zero_bios(&obj_req->bio_pos, off, bytes);
13233da691bfSIlya Dryomov 		break;
13243da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1325afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
13263da691bfSIlya Dryomov 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
13273da691bfSIlya Dryomov 		break;
13283da691bfSIlya Dryomov 	default:
13293da691bfSIlya Dryomov 		rbd_assert(0);
1330f5400b7aSAlex Elder 	}
1331bf0d5f50SAlex Elder }
1332bf0d5f50SAlex Elder 
1333bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1334bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1335bf0d5f50SAlex Elder {
1336bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
133737206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
13382c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1339bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1340bf0d5f50SAlex Elder }
1341bf0d5f50SAlex Elder 
13420f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
13430f2d5be7SAlex Elder {
13440f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
13452c935bc5SPeter Zijlstra 	     kref_read(&img_request->kref));
13460f2d5be7SAlex Elder 	kref_get(&img_request->kref);
13470f2d5be7SAlex Elder }
13480f2d5be7SAlex Elder 
1349bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1350bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1351bf0d5f50SAlex Elder {
1352bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
135337206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
13542c935bc5SPeter Zijlstra 		kref_read(&img_request->kref));
1355bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1356bf0d5f50SAlex Elder }
1357bf0d5f50SAlex Elder 
1358bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1359bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1360bf0d5f50SAlex Elder {
136125dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
136225dcf954SAlex Elder 
1363b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1364bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
13657114edacSIlya Dryomov 	img_request->pending_count++;
136615961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1367bf0d5f50SAlex Elder }
1368bf0d5f50SAlex Elder 
1369bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1370bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1371bf0d5f50SAlex Elder {
137215961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
137343df3d35SIlya Dryomov 	list_del(&obj_request->ex.oe_item);
1374bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1375bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1376bf0d5f50SAlex Elder }
1377bf0d5f50SAlex Elder 
1378980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1379bf0d5f50SAlex Elder {
1380980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1381980917fcSIlya Dryomov 
1382a90bb0c1SIlya Dryomov 	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
138343df3d35SIlya Dryomov 	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
138443df3d35SIlya Dryomov 	     obj_request->ex.oe_len, osd_req);
1385980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1386bf0d5f50SAlex Elder }
1387bf0d5f50SAlex Elder 
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
/* Set the IMG_REQ_LAYERED bit in @img_request's flags. */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	/* full barrier issued after the bit has been set */
	smp_mb();
}
1398d0b2e944SAlex Elder 
/* Clear the IMG_REQ_LAYERED bit in @img_request's flags. */
static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	/* full barrier issued after the bit has been cleared */
	smp_mb();
}
1404a2acd00eSAlex Elder 
1405d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1406d0b2e944SAlex Elder {
1407d0b2e944SAlex Elder 	smp_mb();
1408d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1409d0b2e944SAlex Elder }
1410d0b2e944SAlex Elder 
14113da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
14123b434a2aSJosh Durgin {
14133da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
14143da691bfSIlya Dryomov 
141543df3d35SIlya Dryomov 	return !obj_req->ex.oe_off &&
141643df3d35SIlya Dryomov 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
14173b434a2aSJosh Durgin }
14183b434a2aSJosh Durgin 
14193da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
14206e2a4505SAlex Elder {
14213da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1422b9434c5bSAlex Elder 
142343df3d35SIlya Dryomov 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
14243da691bfSIlya Dryomov 					rbd_dev->layout.object_size;
14256e2a4505SAlex Elder }
14266e2a4505SAlex Elder 
142713488d53SIlya Dryomov /*
142813488d53SIlya Dryomov  * Must be called after rbd_obj_calc_img_extents().
142913488d53SIlya Dryomov  */
143013488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
143113488d53SIlya Dryomov {
143213488d53SIlya Dryomov 	if (!obj_req->num_img_extents ||
14339b17eb2cSIlya Dryomov 	    (rbd_obj_is_entire(obj_req) &&
14349b17eb2cSIlya Dryomov 	     !obj_req->img_request->snapc->num_snaps))
143513488d53SIlya Dryomov 		return false;
143613488d53SIlya Dryomov 
143713488d53SIlya Dryomov 	return true;
143813488d53SIlya Dryomov }
143913488d53SIlya Dryomov 
144086bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1441bf0d5f50SAlex Elder {
144286bd7998SIlya Dryomov 	return ceph_file_extents_bytes(obj_req->img_extents,
144386bd7998SIlya Dryomov 				       obj_req->num_img_extents);
1444bf0d5f50SAlex Elder }
1445bf0d5f50SAlex Elder 
14463da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
14470dcc685eSIlya Dryomov {
14489bb0248dSIlya Dryomov 	switch (img_req->op_type) {
14493da691bfSIlya Dryomov 	case OBJ_OP_READ:
14503da691bfSIlya Dryomov 		return false;
14513da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
14523da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
14536484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
14543da691bfSIlya Dryomov 		return true;
14553da691bfSIlya Dryomov 	default:
1456c6244b3bSArnd Bergmann 		BUG();
14570dcc685eSIlya Dryomov 	}
14580dcc685eSIlya Dryomov }
14590dcc685eSIlya Dryomov 
14603da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
14612761713dSIlya Dryomov 
146285e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1463bf0d5f50SAlex Elder {
14643da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1465bf0d5f50SAlex Elder 
14663da691bfSIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
14673da691bfSIlya Dryomov 	     osd_req->r_result, obj_req);
14683da691bfSIlya Dryomov 	rbd_assert(osd_req == obj_req->osd_req);
1469bf0d5f50SAlex Elder 
14703da691bfSIlya Dryomov 	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
14713da691bfSIlya Dryomov 	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
14723da691bfSIlya Dryomov 		obj_req->xferred = osd_req->r_result;
14733da691bfSIlya Dryomov 	else
1474c47f9371SAlex Elder 		/*
14753da691bfSIlya Dryomov 		 * Writes aren't allowed to return a data payload.  In some
14763da691bfSIlya Dryomov 		 * guarded write cases (e.g. stat + zero on an empty object)
14773da691bfSIlya Dryomov 		 * a stat response makes it through, but we don't care.
1478c47f9371SAlex Elder 		 */
14793da691bfSIlya Dryomov 		obj_req->xferred = 0;
14800ccd5926SIlya Dryomov 
14813da691bfSIlya Dryomov 	rbd_obj_handle_request(obj_req);
1482bf0d5f50SAlex Elder }
1483bf0d5f50SAlex Elder 
14849d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1485430c28c3SAlex Elder {
14868c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1487430c28c3SAlex Elder 
1488a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
14897c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
14909d4df01fSAlex Elder }
14919d4df01fSAlex Elder 
14929d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
14939d4df01fSAlex Elder {
14949d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
14959d4df01fSAlex Elder 
1496a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1497fac02ddfSArnd Bergmann 	ktime_get_real_ts64(&osd_req->r_mtime);
149843df3d35SIlya Dryomov 	osd_req->r_data_offset = obj_request->ex.oe_off;
1499430c28c3SAlex Elder }
1500430c28c3SAlex Elder 
1501bc81207eSIlya Dryomov static struct ceph_osd_request *
1502e28eded5SIlya Dryomov __rbd_osd_req_create(struct rbd_obj_request *obj_req,
1503e28eded5SIlya Dryomov 		     struct ceph_snap_context *snapc, unsigned int num_ops)
1504bc81207eSIlya Dryomov {
1505e28eded5SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1506bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1507bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1508a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1509a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1510bc81207eSIlya Dryomov 
1511e28eded5SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1512bc81207eSIlya Dryomov 	if (!req)
1513bc81207eSIlya Dryomov 		return NULL;
1514bc81207eSIlya Dryomov 
1515bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1516a162b308SIlya Dryomov 	req->r_priv = obj_req;
1517bc81207eSIlya Dryomov 
1518b26c047bSIlya Dryomov 	/*
1519b26c047bSIlya Dryomov 	 * Data objects may be stored in a separate pool, but always in
1520b26c047bSIlya Dryomov 	 * the same namespace in that pool as the header in its pool.
1521b26c047bSIlya Dryomov 	 */
1522b26c047bSIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1523bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1524b26c047bSIlya Dryomov 
1525a90bb0c1SIlya Dryomov 	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
152643df3d35SIlya Dryomov 			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
1527bc81207eSIlya Dryomov 		goto err_req;
1528bc81207eSIlya Dryomov 
1529bc81207eSIlya Dryomov 	return req;
1530bc81207eSIlya Dryomov 
1531bc81207eSIlya Dryomov err_req:
1532bc81207eSIlya Dryomov 	ceph_osdc_put_request(req);
1533bc81207eSIlya Dryomov 	return NULL;
1534bc81207eSIlya Dryomov }
1535bc81207eSIlya Dryomov 
1536e28eded5SIlya Dryomov static struct ceph_osd_request *
1537e28eded5SIlya Dryomov rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
1538e28eded5SIlya Dryomov {
1539e28eded5SIlya Dryomov 	return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
1540e28eded5SIlya Dryomov 				    num_ops);
1541e28eded5SIlya Dryomov }
1542e28eded5SIlya Dryomov 
/* Release @osd_req by dropping a reference via ceph_osdc_put_request(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1547bf0d5f50SAlex Elder 
1548ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1549bf0d5f50SAlex Elder {
1550bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1551bf0d5f50SAlex Elder 
15525a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
15536c696d85SIlya Dryomov 	if (!obj_request)
1554f907ad55SAlex Elder 		return NULL;
1555f907ad55SAlex Elder 
155643df3d35SIlya Dryomov 	ceph_object_extent_init(&obj_request->ex);
1557bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1558bf0d5f50SAlex Elder 
155967e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1560bf0d5f50SAlex Elder 	return obj_request;
1561bf0d5f50SAlex Elder }
1562bf0d5f50SAlex Elder 
1563bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1564bf0d5f50SAlex Elder {
1565bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
15667e07efb1SIlya Dryomov 	u32 i;
1567bf0d5f50SAlex Elder 
1568bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1569bf0d5f50SAlex Elder 
157037206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
157137206ee5SAlex Elder 
1572bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1573bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1574bf0d5f50SAlex Elder 
1575ecc633caSIlya Dryomov 	switch (obj_request->img_request->data_type) {
15769969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1577bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
15787e07efb1SIlya Dryomov 	case OBJ_REQUEST_BVECS:
15795359a17dSIlya Dryomov 		break;		/* Nothing to do */
1580afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
1581afb97888SIlya Dryomov 		kfree(obj_request->bvec_pos.bvecs);
1582bf0d5f50SAlex Elder 		break;
15837e07efb1SIlya Dryomov 	default:
15847e07efb1SIlya Dryomov 		rbd_assert(0);
1585bf0d5f50SAlex Elder 	}
1586bf0d5f50SAlex Elder 
158786bd7998SIlya Dryomov 	kfree(obj_request->img_extents);
15887e07efb1SIlya Dryomov 	if (obj_request->copyup_bvecs) {
15897e07efb1SIlya Dryomov 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
15907e07efb1SIlya Dryomov 			if (obj_request->copyup_bvecs[i].bv_page)
15917e07efb1SIlya Dryomov 				__free_page(obj_request->copyup_bvecs[i].bv_page);
15927e07efb1SIlya Dryomov 		}
15937e07efb1SIlya Dryomov 		kfree(obj_request->copyup_bvecs);
1594bf0d5f50SAlex Elder 	}
1595bf0d5f50SAlex Elder 
1596868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1597bf0d5f50SAlex Elder }
1598bf0d5f50SAlex Elder 
/*
 * Tear down @rbd_dev's parent state: remove the parent device, drop
 * the reference on the parent spec and reset parent_spec and
 * parent_overlap.
 *
 * It's OK to call this for a device with no parent.
 */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	/* forget the spec so later parent_spec checks see "no parent" */
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}
1609fb65d228SAlex Elder 
1610bf0d5f50SAlex Elder /*
1611a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1612a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1613a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1614a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1615a2acd00eSAlex Elder  */
1616a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1617a2acd00eSAlex Elder {
1618a2acd00eSAlex Elder 	int counter;
1619a2acd00eSAlex Elder 
1620a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1621a2acd00eSAlex Elder 		return;
1622a2acd00eSAlex Elder 
1623a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1624a2acd00eSAlex Elder 	if (counter > 0)
1625a2acd00eSAlex Elder 		return;
1626a2acd00eSAlex Elder 
1627a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1628a2acd00eSAlex Elder 
1629a2acd00eSAlex Elder 	if (!counter)
1630a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1631a2acd00eSAlex Elder 	else
16329584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
1633a2acd00eSAlex Elder }
1634a2acd00eSAlex Elder 
1635a2acd00eSAlex Elder /*
1636a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1637a2acd00eSAlex Elder  * parent.
1638a2acd00eSAlex Elder  *
1639a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1640a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1641a2acd00eSAlex Elder  * false otherwise.
1642a2acd00eSAlex Elder  */
1643a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1644a2acd00eSAlex Elder {
1645ae43e9d0SIlya Dryomov 	int counter = 0;
1646a2acd00eSAlex Elder 
1647a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1648a2acd00eSAlex Elder 		return false;
1649a2acd00eSAlex Elder 
1650ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
1651ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
1652a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1653ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
1654a2acd00eSAlex Elder 
1655a2acd00eSAlex Elder 	if (counter < 0)
16569584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
1657a2acd00eSAlex Elder 
1658ae43e9d0SIlya Dryomov 	return counter > 0;
1659a2acd00eSAlex Elder }
1660a2acd00eSAlex Elder 
1661bf0d5f50SAlex Elder /*
1662bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1663bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1664bf0d5f50SAlex Elder  * (if there is one).
1665bf0d5f50SAlex Elder  */
1666cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1667cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
16686d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
16694e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
1670bf0d5f50SAlex Elder {
1671bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1672bf0d5f50SAlex Elder 
1673a0c5895bSIlya Dryomov 	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1674bf0d5f50SAlex Elder 	if (!img_request)
1675bf0d5f50SAlex Elder 		return NULL;
1676bf0d5f50SAlex Elder 
1677bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
16789bb0248dSIlya Dryomov 	img_request->op_type = op_type;
16799bb0248dSIlya Dryomov 	if (!rbd_img_is_write(img_request))
1680bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
16819bb0248dSIlya Dryomov 	else
16829bb0248dSIlya Dryomov 		img_request->snapc = snapc;
16839bb0248dSIlya Dryomov 
1684a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1685d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1686a0c5895bSIlya Dryomov 
1687bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
168843df3d35SIlya Dryomov 	INIT_LIST_HEAD(&img_request->object_extents);
1689bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1690bf0d5f50SAlex Elder 
1691dfd9875fSIlya Dryomov 	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1692dfd9875fSIlya Dryomov 	     obj_op_name(op_type), img_request);
1693bf0d5f50SAlex Elder 	return img_request;
1694bf0d5f50SAlex Elder }
1695bf0d5f50SAlex Elder 
1696bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1697bf0d5f50SAlex Elder {
1698bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1699bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1700bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1701bf0d5f50SAlex Elder 
1702bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1703bf0d5f50SAlex Elder 
170437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
170537206ee5SAlex Elder 
1706bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1707bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
1708bf0d5f50SAlex Elder 
1709a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
1710a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
1711a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
1712a2acd00eSAlex Elder 	}
1713a2acd00eSAlex Elder 
17149bb0248dSIlya Dryomov 	if (rbd_img_is_write(img_request))
1715812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1716bf0d5f50SAlex Elder 
17171c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1718bf0d5f50SAlex Elder }
1719bf0d5f50SAlex Elder 
172086bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents,
172186bd7998SIlya Dryomov 			  u32 *num_img_extents, u64 overlap)
1722e93f3152SAlex Elder {
172386bd7998SIlya Dryomov 	u32 cnt = *num_img_extents;
1724e93f3152SAlex Elder 
172586bd7998SIlya Dryomov 	/* drop extents completely beyond the overlap */
172686bd7998SIlya Dryomov 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
172786bd7998SIlya Dryomov 		cnt--;
1728e93f3152SAlex Elder 
172986bd7998SIlya Dryomov 	if (cnt) {
173086bd7998SIlya Dryomov 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
1731e93f3152SAlex Elder 
173286bd7998SIlya Dryomov 		/* trim final overlapping extent */
173386bd7998SIlya Dryomov 		if (ex->fe_off + ex->fe_len > overlap)
173486bd7998SIlya Dryomov 			ex->fe_len = overlap - ex->fe_off;
1735e93f3152SAlex Elder 	}
1736e93f3152SAlex Elder 
173786bd7998SIlya Dryomov 	*num_img_extents = cnt;
173886bd7998SIlya Dryomov }
173986bd7998SIlya Dryomov 
174086bd7998SIlya Dryomov /*
174186bd7998SIlya Dryomov  * Determine the byte range(s) covered by either just the object extent
174286bd7998SIlya Dryomov  * or the entire object in the parent image.
174386bd7998SIlya Dryomov  */
174486bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
174586bd7998SIlya Dryomov 				    bool entire)
1746e93f3152SAlex Elder {
174786bd7998SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1748c5b5ef6cSAlex Elder 	int ret;
1749c5b5ef6cSAlex Elder 
175086bd7998SIlya Dryomov 	if (!rbd_dev->parent_overlap)
175186bd7998SIlya Dryomov 		return 0;
175286bd7998SIlya Dryomov 
175386bd7998SIlya Dryomov 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
175486bd7998SIlya Dryomov 				  entire ? 0 : obj_req->ex.oe_off,
175586bd7998SIlya Dryomov 				  entire ? rbd_dev->layout.object_size :
175686bd7998SIlya Dryomov 							obj_req->ex.oe_len,
175786bd7998SIlya Dryomov 				  &obj_req->img_extents,
175886bd7998SIlya Dryomov 				  &obj_req->num_img_extents);
175986bd7998SIlya Dryomov 	if (ret)
176086bd7998SIlya Dryomov 		return ret;
176186bd7998SIlya Dryomov 
176286bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
176386bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
176486bd7998SIlya Dryomov 	return 0;
176586bd7998SIlya Dryomov }
176686bd7998SIlya Dryomov 
17673da691bfSIlya Dryomov static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
17683da691bfSIlya Dryomov {
1769ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
17703da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
17713da691bfSIlya Dryomov 		osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
17723da691bfSIlya Dryomov 					       &obj_req->bio_pos,
177343df3d35SIlya Dryomov 					       obj_req->ex.oe_len);
17743da691bfSIlya Dryomov 		break;
17753da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1776afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
17773da691bfSIlya Dryomov 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
177843df3d35SIlya Dryomov 							obj_req->ex.oe_len);
1779afb97888SIlya Dryomov 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
17803da691bfSIlya Dryomov 		osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
17813da691bfSIlya Dryomov 						    &obj_req->bvec_pos);
17823da691bfSIlya Dryomov 		break;
17833da691bfSIlya Dryomov 	default:
17843da691bfSIlya Dryomov 		rbd_assert(0);
17853da691bfSIlya Dryomov 	}
17863da691bfSIlya Dryomov }
17873da691bfSIlya Dryomov 
17883da691bfSIlya Dryomov static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
17893da691bfSIlya Dryomov {
1790e28eded5SIlya Dryomov 	obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
17913da691bfSIlya Dryomov 	if (!obj_req->osd_req)
1792710214e3SIlya Dryomov 		return -ENOMEM;
1793710214e3SIlya Dryomov 
17943da691bfSIlya Dryomov 	osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
179543df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
17963da691bfSIlya Dryomov 	rbd_osd_req_setup_data(obj_req, 0);
1797a90bb0c1SIlya Dryomov 
17983da691bfSIlya Dryomov 	rbd_osd_req_format_read(obj_req);
17993da691bfSIlya Dryomov 	return 0;
1800710214e3SIlya Dryomov }
1801710214e3SIlya Dryomov 
18023da691bfSIlya Dryomov static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
18033da691bfSIlya Dryomov 				unsigned int which)
18043da691bfSIlya Dryomov {
18053da691bfSIlya Dryomov 	struct page **pages;
18063da691bfSIlya Dryomov 
1807c5b5ef6cSAlex Elder 	/*
1808c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
1809c5b5ef6cSAlex Elder 	 *     le64 length;
1810c5b5ef6cSAlex Elder 	 *     struct {
1811c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
1812c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
1813c5b5ef6cSAlex Elder 	 *     } mtime;
1814c5b5ef6cSAlex Elder 	 */
18153da691bfSIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
18163da691bfSIlya Dryomov 	if (IS_ERR(pages))
18173da691bfSIlya Dryomov 		return PTR_ERR(pages);
18183da691bfSIlya Dryomov 
18193da691bfSIlya Dryomov 	osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
18203da691bfSIlya Dryomov 	osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
18213da691bfSIlya Dryomov 				     8 + sizeof(struct ceph_timespec),
18223da691bfSIlya Dryomov 				     0, false, true);
18233da691bfSIlya Dryomov 	return 0;
1824710214e3SIlya Dryomov }
1825c5b5ef6cSAlex Elder 
/*
 * Number of OSD ops used for the write itself: one setallochint op
 * plus one write/writefull op.  @obj_req is currently not consulted.
 */
static int count_write_ops(struct rbd_obj_request *obj_req)
{
	const int num_ops = 2;	/* setallochint + write/writefull */

	return num_ops;
}
183013488d53SIlya Dryomov 
18313da691bfSIlya Dryomov static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
18323da691bfSIlya Dryomov 				  unsigned int which)
18333da691bfSIlya Dryomov {
18343da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
18353da691bfSIlya Dryomov 	u16 opcode;
1836c5b5ef6cSAlex Elder 
18373da691bfSIlya Dryomov 	osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
18383da691bfSIlya Dryomov 				   rbd_dev->layout.object_size,
18393da691bfSIlya Dryomov 				   rbd_dev->layout.object_size);
1840c5b5ef6cSAlex Elder 
18413da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req))
18423da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITEFULL;
18433da691bfSIlya Dryomov 	else
18443da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITE;
1845c5b5ef6cSAlex Elder 
18463da691bfSIlya Dryomov 	osd_req_op_extent_init(obj_req->osd_req, which, opcode,
184743df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
18483da691bfSIlya Dryomov 	rbd_osd_req_setup_data(obj_req, which++);
18493da691bfSIlya Dryomov 
18503da691bfSIlya Dryomov 	rbd_assert(which == obj_req->osd_req->r_num_ops);
18513da691bfSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
18523da691bfSIlya Dryomov }
18533da691bfSIlya Dryomov 
18543da691bfSIlya Dryomov static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
18553da691bfSIlya Dryomov {
18563da691bfSIlya Dryomov 	unsigned int num_osd_ops, which = 0;
185713488d53SIlya Dryomov 	bool need_guard;
18583da691bfSIlya Dryomov 	int ret;
18593da691bfSIlya Dryomov 
186086bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
186186bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
186286bd7998SIlya Dryomov 	if (ret)
186386bd7998SIlya Dryomov 		return ret;
186486bd7998SIlya Dryomov 
186513488d53SIlya Dryomov 	need_guard = rbd_obj_copyup_enabled(obj_req);
186613488d53SIlya Dryomov 	num_osd_ops = need_guard + count_write_ops(obj_req);
18673da691bfSIlya Dryomov 
1868a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
18693da691bfSIlya Dryomov 	if (!obj_req->osd_req)
18703da691bfSIlya Dryomov 		return -ENOMEM;
18713da691bfSIlya Dryomov 
187213488d53SIlya Dryomov 	if (need_guard) {
18733da691bfSIlya Dryomov 		ret = __rbd_obj_setup_stat(obj_req, which++);
18743da691bfSIlya Dryomov 		if (ret)
1875c5b5ef6cSAlex Elder 			return ret;
187613488d53SIlya Dryomov 
187713488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
187813488d53SIlya Dryomov 	} else {
187913488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1880c5b5ef6cSAlex Elder 	}
1881c5b5ef6cSAlex Elder 
18823da691bfSIlya Dryomov 	__rbd_obj_setup_write(obj_req, which);
18833da691bfSIlya Dryomov 	return 0;
188470d045f6SIlya Dryomov }
188570d045f6SIlya Dryomov 
18866484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
18876484cbe9SIlya Dryomov {
18886484cbe9SIlya Dryomov 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
18896484cbe9SIlya Dryomov 					  CEPH_OSD_OP_ZERO;
18906484cbe9SIlya Dryomov }
18916484cbe9SIlya Dryomov 
18926484cbe9SIlya Dryomov static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
18936484cbe9SIlya Dryomov {
18940c93e1b7SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
18950c93e1b7SIlya Dryomov 	u64 off = obj_req->ex.oe_off;
18960c93e1b7SIlya Dryomov 	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
18976484cbe9SIlya Dryomov 	int ret;
18986484cbe9SIlya Dryomov 
18990c93e1b7SIlya Dryomov 	/*
19000c93e1b7SIlya Dryomov 	 * Align the range to alloc_size boundary and punt on discards
19010c93e1b7SIlya Dryomov 	 * that are too small to free up any space.
19020c93e1b7SIlya Dryomov 	 *
19030c93e1b7SIlya Dryomov 	 * alloc_size == object_size && is_tail() is a special case for
19040c93e1b7SIlya Dryomov 	 * filestore with filestore_punch_hole = false, needed to allow
19050c93e1b7SIlya Dryomov 	 * truncate (in addition to delete).
19060c93e1b7SIlya Dryomov 	 */
19070c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
19080c93e1b7SIlya Dryomov 	    !rbd_obj_is_tail(obj_req)) {
19090c93e1b7SIlya Dryomov 		off = round_up(off, rbd_dev->opts->alloc_size);
19100c93e1b7SIlya Dryomov 		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
19110c93e1b7SIlya Dryomov 		if (off >= next_off)
19120c93e1b7SIlya Dryomov 			return 1;
19130c93e1b7SIlya Dryomov 	}
19140c93e1b7SIlya Dryomov 
19156484cbe9SIlya Dryomov 	/* reverse map the entire object onto the parent */
19166484cbe9SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
19176484cbe9SIlya Dryomov 	if (ret)
19186484cbe9SIlya Dryomov 		return ret;
19196484cbe9SIlya Dryomov 
19206484cbe9SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
19216484cbe9SIlya Dryomov 	if (!obj_req->osd_req)
19226484cbe9SIlya Dryomov 		return -ENOMEM;
19236484cbe9SIlya Dryomov 
19246484cbe9SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
19256484cbe9SIlya Dryomov 		osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
19266484cbe9SIlya Dryomov 	} else {
19270c93e1b7SIlya Dryomov 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
19280c93e1b7SIlya Dryomov 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
19290c93e1b7SIlya Dryomov 		     off, next_off - off);
19306484cbe9SIlya Dryomov 		osd_req_op_extent_init(obj_req->osd_req, 0,
19316484cbe9SIlya Dryomov 				       truncate_or_zero_opcode(obj_req),
19320c93e1b7SIlya Dryomov 				       off, next_off - off, 0, 0);
19336484cbe9SIlya Dryomov 	}
19346484cbe9SIlya Dryomov 
19356484cbe9SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_FLAT;
19366484cbe9SIlya Dryomov 	rbd_osd_req_format_write(obj_req);
19376484cbe9SIlya Dryomov 	return 0;
19386484cbe9SIlya Dryomov }
19396484cbe9SIlya Dryomov 
194013488d53SIlya Dryomov static int count_zeroout_ops(struct rbd_obj_request *obj_req)
194113488d53SIlya Dryomov {
194213488d53SIlya Dryomov 	int num_osd_ops;
194313488d53SIlya Dryomov 
19449b17eb2cSIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
19459b17eb2cSIlya Dryomov 	    !rbd_obj_copyup_enabled(obj_req))
194613488d53SIlya Dryomov 		num_osd_ops = 2; /* create + truncate */
194713488d53SIlya Dryomov 	else
194813488d53SIlya Dryomov 		num_osd_ops = 1; /* delete/truncate/zero */
194913488d53SIlya Dryomov 
195013488d53SIlya Dryomov 	return num_osd_ops;
195113488d53SIlya Dryomov }
195213488d53SIlya Dryomov 
19536484cbe9SIlya Dryomov static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
19543da691bfSIlya Dryomov 				    unsigned int which)
195570d045f6SIlya Dryomov {
19563da691bfSIlya Dryomov 	u16 opcode;
1957058aa991SIlya Dryomov 
19583da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req)) {
195986bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
19609b17eb2cSIlya Dryomov 			if (!rbd_obj_copyup_enabled(obj_req))
19612bb1e56eSIlya Dryomov 				osd_req_op_init(obj_req->osd_req, which++,
19622bb1e56eSIlya Dryomov 						CEPH_OSD_OP_CREATE, 0);
19633da691bfSIlya Dryomov 			opcode = CEPH_OSD_OP_TRUNCATE;
19643da691bfSIlya Dryomov 		} else {
19653da691bfSIlya Dryomov 			osd_req_op_init(obj_req->osd_req, which++,
19663da691bfSIlya Dryomov 					CEPH_OSD_OP_DELETE, 0);
19673da691bfSIlya Dryomov 			opcode = 0;
19683da691bfSIlya Dryomov 		}
19693da691bfSIlya Dryomov 	} else {
19706484cbe9SIlya Dryomov 		opcode = truncate_or_zero_opcode(obj_req);
19713da691bfSIlya Dryomov 	}
19723da691bfSIlya Dryomov 
19733da691bfSIlya Dryomov 	if (opcode)
19743da691bfSIlya Dryomov 		osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
197543df3d35SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
19763da691bfSIlya Dryomov 				       0, 0);
19773da691bfSIlya Dryomov 
19783da691bfSIlya Dryomov 	rbd_assert(which == obj_req->osd_req->r_num_ops);
19793da691bfSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
19803da691bfSIlya Dryomov }
19813da691bfSIlya Dryomov 
19826484cbe9SIlya Dryomov static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
19833da691bfSIlya Dryomov {
19843da691bfSIlya Dryomov 	unsigned int num_osd_ops, which = 0;
198513488d53SIlya Dryomov 	bool need_guard;
19863da691bfSIlya Dryomov 	int ret;
19873da691bfSIlya Dryomov 
198886bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
198986bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
199086bd7998SIlya Dryomov 	if (ret)
199186bd7998SIlya Dryomov 		return ret;
199286bd7998SIlya Dryomov 
199313488d53SIlya Dryomov 	need_guard = rbd_obj_copyup_enabled(obj_req);
199413488d53SIlya Dryomov 	num_osd_ops = need_guard + count_zeroout_ops(obj_req);
19953da691bfSIlya Dryomov 
1996a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
19973da691bfSIlya Dryomov 	if (!obj_req->osd_req)
19983da691bfSIlya Dryomov 		return -ENOMEM;
19993da691bfSIlya Dryomov 
200013488d53SIlya Dryomov 	if (need_guard) {
20013da691bfSIlya Dryomov 		ret = __rbd_obj_setup_stat(obj_req, which++);
20023da691bfSIlya Dryomov 		if (ret)
20033da691bfSIlya Dryomov 			return ret;
200413488d53SIlya Dryomov 
200513488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
200613488d53SIlya Dryomov 	} else {
200713488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
20083da691bfSIlya Dryomov 	}
20093da691bfSIlya Dryomov 
20106484cbe9SIlya Dryomov 	__rbd_obj_setup_zeroout(obj_req, which);
2011980917fcSIlya Dryomov 	return 0;
2012b454e36dSAlex Elder }
2013b454e36dSAlex Elder 
2014b454e36dSAlex Elder /*
20153da691bfSIlya Dryomov  * For each object request in @img_req, allocate an OSD request, add
20163da691bfSIlya Dryomov  * individual OSD ops and prepare them for submission.  The number of
20173da691bfSIlya Dryomov  * OSD ops depends on op_type and the overlap point (if any).
2018b454e36dSAlex Elder  */
20193da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
20203da691bfSIlya Dryomov {
20210c93e1b7SIlya Dryomov 	struct rbd_obj_request *obj_req, *next_obj_req;
20223da691bfSIlya Dryomov 	int ret;
20233d7efd18SAlex Elder 
20240c93e1b7SIlya Dryomov 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
20259bb0248dSIlya Dryomov 		switch (img_req->op_type) {
20263da691bfSIlya Dryomov 		case OBJ_OP_READ:
20273da691bfSIlya Dryomov 			ret = rbd_obj_setup_read(obj_req);
20283da691bfSIlya Dryomov 			break;
20293da691bfSIlya Dryomov 		case OBJ_OP_WRITE:
20303da691bfSIlya Dryomov 			ret = rbd_obj_setup_write(obj_req);
20313da691bfSIlya Dryomov 			break;
20323da691bfSIlya Dryomov 		case OBJ_OP_DISCARD:
20333da691bfSIlya Dryomov 			ret = rbd_obj_setup_discard(obj_req);
20343da691bfSIlya Dryomov 			break;
20356484cbe9SIlya Dryomov 		case OBJ_OP_ZEROOUT:
20366484cbe9SIlya Dryomov 			ret = rbd_obj_setup_zeroout(obj_req);
20376484cbe9SIlya Dryomov 			break;
20383da691bfSIlya Dryomov 		default:
20393da691bfSIlya Dryomov 			rbd_assert(0);
20403da691bfSIlya Dryomov 		}
20410c93e1b7SIlya Dryomov 		if (ret < 0)
20423da691bfSIlya Dryomov 			return ret;
20430c93e1b7SIlya Dryomov 		if (ret > 0) {
20440c93e1b7SIlya Dryomov 			img_req->xferred += obj_req->ex.oe_len;
20450c93e1b7SIlya Dryomov 			img_req->pending_count--;
20460c93e1b7SIlya Dryomov 			rbd_img_obj_request_del(img_req, obj_req);
20470c93e1b7SIlya Dryomov 			continue;
20480c93e1b7SIlya Dryomov 		}
204926f887e0SIlya Dryomov 
205026f887e0SIlya Dryomov 		ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
205126f887e0SIlya Dryomov 		if (ret)
205226f887e0SIlya Dryomov 			return ret;
2053b454e36dSAlex Elder 	}
2054b454e36dSAlex Elder 
20553da691bfSIlya Dryomov 	return 0;
20563da691bfSIlya Dryomov }
20573da691bfSIlya Dryomov 
20585a237819SIlya Dryomov union rbd_img_fill_iter {
20595a237819SIlya Dryomov 	struct ceph_bio_iter	bio_iter;
20605a237819SIlya Dryomov 	struct ceph_bvec_iter	bvec_iter;
20615a237819SIlya Dryomov };
20625a237819SIlya Dryomov 
20635a237819SIlya Dryomov struct rbd_img_fill_ctx {
20645a237819SIlya Dryomov 	enum obj_request_type	pos_type;
20655a237819SIlya Dryomov 	union rbd_img_fill_iter	*pos;
20665a237819SIlya Dryomov 	union rbd_img_fill_iter	iter;
20675a237819SIlya Dryomov 	ceph_object_extent_fn_t	set_pos_fn;
2068afb97888SIlya Dryomov 	ceph_object_extent_fn_t	count_fn;
2069afb97888SIlya Dryomov 	ceph_object_extent_fn_t	copy_fn;
20705a237819SIlya Dryomov };
20715a237819SIlya Dryomov 
20725a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
20735a237819SIlya Dryomov {
20745a237819SIlya Dryomov 	struct rbd_img_request *img_req = arg;
20755a237819SIlya Dryomov 	struct rbd_obj_request *obj_req;
20765a237819SIlya Dryomov 
20775a237819SIlya Dryomov 	obj_req = rbd_obj_request_create();
20785a237819SIlya Dryomov 	if (!obj_req)
20795a237819SIlya Dryomov 		return NULL;
20805a237819SIlya Dryomov 
20815a237819SIlya Dryomov 	rbd_img_obj_request_add(img_req, obj_req);
20825a237819SIlya Dryomov 	return &obj_req->ex;
20835a237819SIlya Dryomov }
20845a237819SIlya Dryomov 
20855a237819SIlya Dryomov /*
2086afb97888SIlya Dryomov  * While su != os && sc == 1 is technically not fancy (it's the same
2087afb97888SIlya Dryomov  * layout as su == os && sc == 1), we can't use the nocopy path for it
2088afb97888SIlya Dryomov  * because ->set_pos_fn() should be called only once per object.
2089afb97888SIlya Dryomov  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2090afb97888SIlya Dryomov  * treat su != os && sc == 1 as fancy.
20915a237819SIlya Dryomov  */
2092afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2093afb97888SIlya Dryomov {
2094afb97888SIlya Dryomov 	return l->stripe_unit != l->object_size;
2095afb97888SIlya Dryomov }
2096afb97888SIlya Dryomov 
2097afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
20985a237819SIlya Dryomov 				       struct ceph_file_extent *img_extents,
20995a237819SIlya Dryomov 				       u32 num_img_extents,
21005a237819SIlya Dryomov 				       struct rbd_img_fill_ctx *fctx)
21015a237819SIlya Dryomov {
21025a237819SIlya Dryomov 	u32 i;
21035a237819SIlya Dryomov 	int ret;
21045a237819SIlya Dryomov 
21055a237819SIlya Dryomov 	img_req->data_type = fctx->pos_type;
21065a237819SIlya Dryomov 
21075a237819SIlya Dryomov 	/*
21085a237819SIlya Dryomov 	 * Create object requests and set each object request's starting
21095a237819SIlya Dryomov 	 * position in the provided bio (list) or bio_vec array.
21105a237819SIlya Dryomov 	 */
21115a237819SIlya Dryomov 	fctx->iter = *fctx->pos;
21125a237819SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
21135a237819SIlya Dryomov 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
21145a237819SIlya Dryomov 					   img_extents[i].fe_off,
21155a237819SIlya Dryomov 					   img_extents[i].fe_len,
21165a237819SIlya Dryomov 					   &img_req->object_extents,
21175a237819SIlya Dryomov 					   alloc_object_extent, img_req,
21185a237819SIlya Dryomov 					   fctx->set_pos_fn, &fctx->iter);
21195a237819SIlya Dryomov 		if (ret)
21205a237819SIlya Dryomov 			return ret;
21215a237819SIlya Dryomov 	}
21225a237819SIlya Dryomov 
21235a237819SIlya Dryomov 	return __rbd_img_fill_request(img_req);
21245a237819SIlya Dryomov }
21255a237819SIlya Dryomov 
2126afb97888SIlya Dryomov /*
2127afb97888SIlya Dryomov  * Map a list of image extents to a list of object extents, create the
2128afb97888SIlya Dryomov  * corresponding object requests (normally each to a different object,
2129afb97888SIlya Dryomov  * but not always) and add them to @img_req.  For each object request,
2130afb97888SIlya Dryomov  * set up its data descriptor to point to the corresponding chunk(s) of
2131afb97888SIlya Dryomov  * @fctx->pos data buffer.
2132afb97888SIlya Dryomov  *
2133afb97888SIlya Dryomov  * Because ceph_file_to_extents() will merge adjacent object extents
2134afb97888SIlya Dryomov  * together, each object request's data descriptor may point to multiple
2135afb97888SIlya Dryomov  * different chunks of @fctx->pos data buffer.
2136afb97888SIlya Dryomov  *
2137afb97888SIlya Dryomov  * @fctx->pos data buffer is assumed to be large enough.
2138afb97888SIlya Dryomov  */
2139afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2140afb97888SIlya Dryomov 				struct ceph_file_extent *img_extents,
2141afb97888SIlya Dryomov 				u32 num_img_extents,
2142afb97888SIlya Dryomov 				struct rbd_img_fill_ctx *fctx)
2143afb97888SIlya Dryomov {
2144afb97888SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2145afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req;
2146afb97888SIlya Dryomov 	u32 i;
2147afb97888SIlya Dryomov 	int ret;
2148afb97888SIlya Dryomov 
2149afb97888SIlya Dryomov 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2150afb97888SIlya Dryomov 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2151afb97888SIlya Dryomov 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2152afb97888SIlya Dryomov 						   num_img_extents, fctx);
2153afb97888SIlya Dryomov 
2154afb97888SIlya Dryomov 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2155afb97888SIlya Dryomov 
2156afb97888SIlya Dryomov 	/*
2157afb97888SIlya Dryomov 	 * Create object requests and determine ->bvec_count for each object
2158afb97888SIlya Dryomov 	 * request.  Note that ->bvec_count sum over all object requests may
2159afb97888SIlya Dryomov 	 * be greater than the number of bio_vecs in the provided bio (list)
2160afb97888SIlya Dryomov 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2161afb97888SIlya Dryomov 	 * stripe unit boundaries.
2162afb97888SIlya Dryomov 	 */
2163afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2164afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2165afb97888SIlya Dryomov 		ret = ceph_file_to_extents(&rbd_dev->layout,
2166afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2167afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2168afb97888SIlya Dryomov 					   &img_req->object_extents,
2169afb97888SIlya Dryomov 					   alloc_object_extent, img_req,
2170afb97888SIlya Dryomov 					   fctx->count_fn, &fctx->iter);
2171afb97888SIlya Dryomov 		if (ret)
2172afb97888SIlya Dryomov 			return ret;
2173afb97888SIlya Dryomov 	}
2174afb97888SIlya Dryomov 
2175afb97888SIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
2176afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2177afb97888SIlya Dryomov 					      sizeof(*obj_req->bvec_pos.bvecs),
2178afb97888SIlya Dryomov 					      GFP_NOIO);
2179afb97888SIlya Dryomov 		if (!obj_req->bvec_pos.bvecs)
2180afb97888SIlya Dryomov 			return -ENOMEM;
2181afb97888SIlya Dryomov 	}
2182afb97888SIlya Dryomov 
2183afb97888SIlya Dryomov 	/*
2184afb97888SIlya Dryomov 	 * Fill in each object request's private bio_vec array, splitting and
2185afb97888SIlya Dryomov 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2186afb97888SIlya Dryomov 	 */
2187afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2188afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2189afb97888SIlya Dryomov 		ret = ceph_iterate_extents(&rbd_dev->layout,
2190afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2191afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2192afb97888SIlya Dryomov 					   &img_req->object_extents,
2193afb97888SIlya Dryomov 					   fctx->copy_fn, &fctx->iter);
2194afb97888SIlya Dryomov 		if (ret)
2195afb97888SIlya Dryomov 			return ret;
2196afb97888SIlya Dryomov 	}
2197afb97888SIlya Dryomov 
2198afb97888SIlya Dryomov 	return __rbd_img_fill_request(img_req);
2199afb97888SIlya Dryomov }
2200afb97888SIlya Dryomov 
22015a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
22025a237819SIlya Dryomov 			       u64 off, u64 len)
22035a237819SIlya Dryomov {
22045a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
22055a237819SIlya Dryomov 	union rbd_img_fill_iter dummy;
22065a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
22075a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_NODATA,
22085a237819SIlya Dryomov 		.pos = &dummy,
22095a237819SIlya Dryomov 	};
22105a237819SIlya Dryomov 
22115a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
22125a237819SIlya Dryomov }
22135a237819SIlya Dryomov 
22145a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
22155a237819SIlya Dryomov {
22165a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
22175a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
22185a237819SIlya Dryomov 	struct ceph_bio_iter *it = arg;
22195a237819SIlya Dryomov 
22205a237819SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
22215a237819SIlya Dryomov 	obj_req->bio_pos = *it;
22225a237819SIlya Dryomov 	ceph_bio_iter_advance(it, bytes);
22235a237819SIlya Dryomov }
22245a237819SIlya Dryomov 
2225afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2226afb97888SIlya Dryomov {
2227afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2228afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2229afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2230afb97888SIlya Dryomov 
2231afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2232afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2233afb97888SIlya Dryomov 		obj_req->bvec_count++;
2234afb97888SIlya Dryomov 	}));
2235afb97888SIlya Dryomov 
2236afb97888SIlya Dryomov }
2237afb97888SIlya Dryomov 
2238afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2239afb97888SIlya Dryomov {
2240afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2241afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2242afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2243afb97888SIlya Dryomov 
2244afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2245afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2246afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2247afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2248afb97888SIlya Dryomov 	}));
2249afb97888SIlya Dryomov }
2250afb97888SIlya Dryomov 
22515a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
22525a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
22535a237819SIlya Dryomov 				   u32 num_img_extents,
22545a237819SIlya Dryomov 				   struct ceph_bio_iter *bio_pos)
22555a237819SIlya Dryomov {
22565a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
22575a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BIO,
22585a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bio_pos,
22595a237819SIlya Dryomov 		.set_pos_fn = set_bio_pos,
2260afb97888SIlya Dryomov 		.count_fn = count_bio_bvecs,
2261afb97888SIlya Dryomov 		.copy_fn = copy_bio_bvecs,
22625a237819SIlya Dryomov 	};
22635a237819SIlya Dryomov 
22645a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
22655a237819SIlya Dryomov 				    &fctx);
22665a237819SIlya Dryomov }
22675a237819SIlya Dryomov 
22685a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
22695a237819SIlya Dryomov 				 u64 off, u64 len, struct bio *bio)
22705a237819SIlya Dryomov {
22715a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
22725a237819SIlya Dryomov 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
22735a237819SIlya Dryomov 
22745a237819SIlya Dryomov 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
22755a237819SIlya Dryomov }
22765a237819SIlya Dryomov 
22775a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
22785a237819SIlya Dryomov {
22795a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
22805a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
22815a237819SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
22825a237819SIlya Dryomov 
22835a237819SIlya Dryomov 	obj_req->bvec_pos = *it;
22845a237819SIlya Dryomov 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
22855a237819SIlya Dryomov 	ceph_bvec_iter_advance(it, bytes);
22865a237819SIlya Dryomov }
22875a237819SIlya Dryomov 
2288afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2289afb97888SIlya Dryomov {
2290afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2291afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2292afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2293afb97888SIlya Dryomov 
2294afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2295afb97888SIlya Dryomov 		obj_req->bvec_count++;
2296afb97888SIlya Dryomov 	}));
2297afb97888SIlya Dryomov }
2298afb97888SIlya Dryomov 
2299afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2300afb97888SIlya Dryomov {
2301afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2302afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2303afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2304afb97888SIlya Dryomov 
2305afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2306afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2307afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2308afb97888SIlya Dryomov 	}));
2309afb97888SIlya Dryomov }
2310afb97888SIlya Dryomov 
23115a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
23125a237819SIlya Dryomov 				     struct ceph_file_extent *img_extents,
23135a237819SIlya Dryomov 				     u32 num_img_extents,
23145a237819SIlya Dryomov 				     struct ceph_bvec_iter *bvec_pos)
23155a237819SIlya Dryomov {
23165a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
23175a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BVECS,
23185a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bvec_pos,
23195a237819SIlya Dryomov 		.set_pos_fn = set_bvec_pos,
2320afb97888SIlya Dryomov 		.count_fn = count_bvecs,
2321afb97888SIlya Dryomov 		.copy_fn = copy_bvecs,
23225a237819SIlya Dryomov 	};
23235a237819SIlya Dryomov 
23245a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
23255a237819SIlya Dryomov 				    &fctx);
23265a237819SIlya Dryomov }
23275a237819SIlya Dryomov 
23285a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
23295a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
23305a237819SIlya Dryomov 				   u32 num_img_extents,
23315a237819SIlya Dryomov 				   struct bio_vec *bvecs)
23325a237819SIlya Dryomov {
23335a237819SIlya Dryomov 	struct ceph_bvec_iter it = {
23345a237819SIlya Dryomov 		.bvecs = bvecs,
23355a237819SIlya Dryomov 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
23365a237819SIlya Dryomov 							     num_img_extents) },
23375a237819SIlya Dryomov 	};
23385a237819SIlya Dryomov 
23395a237819SIlya Dryomov 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
23405a237819SIlya Dryomov 					 &it);
23415a237819SIlya Dryomov }
23425a237819SIlya Dryomov 
2343efbd1a11SIlya Dryomov static void rbd_img_request_submit(struct rbd_img_request *img_request)
2344bf0d5f50SAlex Elder {
2345bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2346bf0d5f50SAlex Elder 
234737206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2348bf0d5f50SAlex Elder 
2349663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2350efbd1a11SIlya Dryomov 	for_each_obj_request(img_request, obj_request)
23513da691bfSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2352bf0d5f50SAlex Elder 
2353663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2354bf0d5f50SAlex Elder }
2355bf0d5f50SAlex Elder 
235686bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
23573da691bfSIlya Dryomov {
23583da691bfSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
23593da691bfSIlya Dryomov 	struct rbd_img_request *child_img_req;
23603da691bfSIlya Dryomov 	int ret;
23613da691bfSIlya Dryomov 
2362e93aca0aSIlya Dryomov 	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2363e93aca0aSIlya Dryomov 					       OBJ_OP_READ, NULL);
23643da691bfSIlya Dryomov 	if (!child_img_req)
23653da691bfSIlya Dryomov 		return -ENOMEM;
23663da691bfSIlya Dryomov 
2367e93aca0aSIlya Dryomov 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2368e93aca0aSIlya Dryomov 	child_img_req->obj_request = obj_req;
2369e93aca0aSIlya Dryomov 
23703da691bfSIlya Dryomov 	if (!rbd_img_is_write(img_req)) {
2371ecc633caSIlya Dryomov 		switch (img_req->data_type) {
23723da691bfSIlya Dryomov 		case OBJ_REQUEST_BIO:
23735a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bio(child_img_req,
23745a237819SIlya Dryomov 						      obj_req->img_extents,
23755a237819SIlya Dryomov 						      obj_req->num_img_extents,
23763da691bfSIlya Dryomov 						      &obj_req->bio_pos);
23773da691bfSIlya Dryomov 			break;
23783da691bfSIlya Dryomov 		case OBJ_REQUEST_BVECS:
2379afb97888SIlya Dryomov 		case OBJ_REQUEST_OWN_BVECS:
23805a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bvecs(child_img_req,
23815a237819SIlya Dryomov 						      obj_req->img_extents,
23825a237819SIlya Dryomov 						      obj_req->num_img_extents,
23833da691bfSIlya Dryomov 						      &obj_req->bvec_pos);
23843da691bfSIlya Dryomov 			break;
23853da691bfSIlya Dryomov 		default:
23863da691bfSIlya Dryomov 			rbd_assert(0);
23873da691bfSIlya Dryomov 		}
23883da691bfSIlya Dryomov 	} else {
23895a237819SIlya Dryomov 		ret = rbd_img_fill_from_bvecs(child_img_req,
23905a237819SIlya Dryomov 					      obj_req->img_extents,
23915a237819SIlya Dryomov 					      obj_req->num_img_extents,
23925a237819SIlya Dryomov 					      obj_req->copyup_bvecs);
23933da691bfSIlya Dryomov 	}
23943da691bfSIlya Dryomov 	if (ret) {
23953da691bfSIlya Dryomov 		rbd_img_request_put(child_img_req);
2396663ae2ccSIlya Dryomov 		return ret;
2397bf0d5f50SAlex Elder 	}
2398bf0d5f50SAlex Elder 
23993da691bfSIlya Dryomov 	rbd_img_request_submit(child_img_req);
24003da691bfSIlya Dryomov 	return 0;
24013da691bfSIlya Dryomov }
24023da691bfSIlya Dryomov 
24033da691bfSIlya Dryomov static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
24048b3e1a56SAlex Elder {
24053da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
24063da691bfSIlya Dryomov 	int ret;
24078b3e1a56SAlex Elder 
24083da691bfSIlya Dryomov 	if (obj_req->result == -ENOENT &&
240986bd7998SIlya Dryomov 	    rbd_dev->parent_overlap && !obj_req->tried_parent) {
241086bd7998SIlya Dryomov 		/* reverse map this object extent onto the parent */
241186bd7998SIlya Dryomov 		ret = rbd_obj_calc_img_extents(obj_req, false);
241286bd7998SIlya Dryomov 		if (ret) {
241386bd7998SIlya Dryomov 			obj_req->result = ret;
241486bd7998SIlya Dryomov 			return true;
241586bd7998SIlya Dryomov 		}
24168b3e1a56SAlex Elder 
241786bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
24183da691bfSIlya Dryomov 			obj_req->tried_parent = true;
241986bd7998SIlya Dryomov 			ret = rbd_obj_read_from_parent(obj_req);
24203da691bfSIlya Dryomov 			if (ret) {
24213da691bfSIlya Dryomov 				obj_req->result = ret;
24223da691bfSIlya Dryomov 				return true;
24233da691bfSIlya Dryomov 			}
24243da691bfSIlya Dryomov 			return false;
24253da691bfSIlya Dryomov 		}
242686bd7998SIlya Dryomov 	}
242702c74fbaSAlex Elder 
242802c74fbaSAlex Elder 	/*
24293da691bfSIlya Dryomov 	 * -ENOENT means a hole in the image -- zero-fill the entire
24303da691bfSIlya Dryomov 	 * length of the request.  A short read also implies zero-fill
24313da691bfSIlya Dryomov 	 * to the end of the request.  In both cases we update xferred
24323da691bfSIlya Dryomov 	 * count to indicate the whole request was satisfied.
243302c74fbaSAlex Elder 	 */
24343da691bfSIlya Dryomov 	if (obj_req->result == -ENOENT ||
243543df3d35SIlya Dryomov 	    (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
24363da691bfSIlya Dryomov 		rbd_assert(!obj_req->xferred || !obj_req->result);
24373da691bfSIlya Dryomov 		rbd_obj_zero_range(obj_req, obj_req->xferred,
243843df3d35SIlya Dryomov 				   obj_req->ex.oe_len - obj_req->xferred);
24393da691bfSIlya Dryomov 		obj_req->result = 0;
244043df3d35SIlya Dryomov 		obj_req->xferred = obj_req->ex.oe_len;
24413da691bfSIlya Dryomov 	}
24423da691bfSIlya Dryomov 
24433da691bfSIlya Dryomov 	return true;
24443da691bfSIlya Dryomov }
24453da691bfSIlya Dryomov 
24463da691bfSIlya Dryomov /*
24473da691bfSIlya Dryomov  * copyup_bvecs pages are never highmem pages
24483da691bfSIlya Dryomov  */
24493da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
24503da691bfSIlya Dryomov {
24513da691bfSIlya Dryomov 	struct ceph_bvec_iter it = {
24523da691bfSIlya Dryomov 		.bvecs = bvecs,
24533da691bfSIlya Dryomov 		.iter = { .bi_size = bytes },
24543da691bfSIlya Dryomov 	};
24553da691bfSIlya Dryomov 
24563da691bfSIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
24573da691bfSIlya Dryomov 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
24583da691bfSIlya Dryomov 			       bv.bv_len))
24593da691bfSIlya Dryomov 			return false;
24603da691bfSIlya Dryomov 	}));
24613da691bfSIlya Dryomov 	return true;
24623da691bfSIlya Dryomov }
24633da691bfSIlya Dryomov 
24643a482501SIlya Dryomov #define MODS_ONLY	U32_MAX
24653a482501SIlya Dryomov 
246689a59c1cSIlya Dryomov static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
246789a59c1cSIlya Dryomov 					    u32 bytes)
24683da691bfSIlya Dryomov {
2469fe943d50SChengguang Xu 	int ret;
24703da691bfSIlya Dryomov 
24713da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
24723da691bfSIlya Dryomov 	rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
247389a59c1cSIlya Dryomov 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
24743da691bfSIlya Dryomov 	rbd_osd_req_destroy(obj_req->osd_req);
24753da691bfSIlya Dryomov 
247689a59c1cSIlya Dryomov 	obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1);
24773da691bfSIlya Dryomov 	if (!obj_req->osd_req)
24783da691bfSIlya Dryomov 		return -ENOMEM;
24793da691bfSIlya Dryomov 
248024639ce5SIlya Dryomov 	ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
2481fe943d50SChengguang Xu 	if (ret)
2482fe943d50SChengguang Xu 		return ret;
2483fe943d50SChengguang Xu 
24843da691bfSIlya Dryomov 	osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
24850010f705SIlya Dryomov 					  obj_req->copyup_bvecs,
24860010f705SIlya Dryomov 					  obj_req->copyup_bvec_count,
24870010f705SIlya Dryomov 					  bytes);
248889a59c1cSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
24893da691bfSIlya Dryomov 
249089a59c1cSIlya Dryomov 	ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
249189a59c1cSIlya Dryomov 	if (ret)
249289a59c1cSIlya Dryomov 		return ret;
249389a59c1cSIlya Dryomov 
249489a59c1cSIlya Dryomov 	rbd_obj_request_submit(obj_req);
249589a59c1cSIlya Dryomov 	return 0;
249689a59c1cSIlya Dryomov }
249789a59c1cSIlya Dryomov 
24983a482501SIlya Dryomov static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
24993da691bfSIlya Dryomov {
250013488d53SIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
25013a482501SIlya Dryomov 	unsigned int num_osd_ops = (bytes != MODS_ONLY);
25023a482501SIlya Dryomov 	unsigned int which = 0;
25033da691bfSIlya Dryomov 	int ret;
25043da691bfSIlya Dryomov 
25053da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
250689a59c1cSIlya Dryomov 	rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT ||
250789a59c1cSIlya Dryomov 		   obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL);
25083da691bfSIlya Dryomov 	rbd_osd_req_destroy(obj_req->osd_req);
25093da691bfSIlya Dryomov 
251013488d53SIlya Dryomov 	switch (img_req->op_type) {
25113da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
251213488d53SIlya Dryomov 		num_osd_ops += count_write_ops(obj_req);
25133da691bfSIlya Dryomov 		break;
251413488d53SIlya Dryomov 	case OBJ_OP_ZEROOUT:
251513488d53SIlya Dryomov 		num_osd_ops += count_zeroout_ops(obj_req);
251613488d53SIlya Dryomov 		break;
251713488d53SIlya Dryomov 	default:
251813488d53SIlya Dryomov 		rbd_assert(0);
251913488d53SIlya Dryomov 	}
252013488d53SIlya Dryomov 
25213da691bfSIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
25223da691bfSIlya Dryomov 	if (!obj_req->osd_req)
25233da691bfSIlya Dryomov 		return -ENOMEM;
25243da691bfSIlya Dryomov 
25253a482501SIlya Dryomov 	if (bytes != MODS_ONLY) {
25263a482501SIlya Dryomov 		ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd",
25273a482501SIlya Dryomov 					  "copyup");
25283da691bfSIlya Dryomov 		if (ret)
25293da691bfSIlya Dryomov 			return ret;
25303da691bfSIlya Dryomov 
25313a482501SIlya Dryomov 		osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++,
25323da691bfSIlya Dryomov 						  obj_req->copyup_bvecs,
25333da691bfSIlya Dryomov 						  obj_req->copyup_bvec_count,
25343da691bfSIlya Dryomov 						  bytes);
25353a482501SIlya Dryomov 	}
25363da691bfSIlya Dryomov 
253713488d53SIlya Dryomov 	switch (img_req->op_type) {
25383da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
25393a482501SIlya Dryomov 		__rbd_obj_setup_write(obj_req, which);
25403da691bfSIlya Dryomov 		break;
25416484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
25423a482501SIlya Dryomov 		__rbd_obj_setup_zeroout(obj_req, which);
25433da691bfSIlya Dryomov 		break;
25443da691bfSIlya Dryomov 	default:
25453da691bfSIlya Dryomov 		rbd_assert(0);
25463da691bfSIlya Dryomov 	}
25473da691bfSIlya Dryomov 
254826f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
254926f887e0SIlya Dryomov 	if (ret)
255026f887e0SIlya Dryomov 		return ret;
255126f887e0SIlya Dryomov 
25523da691bfSIlya Dryomov 	rbd_obj_request_submit(obj_req);
25533da691bfSIlya Dryomov 	return 0;
25543da691bfSIlya Dryomov }
25553da691bfSIlya Dryomov 
25563a482501SIlya Dryomov static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
25573a482501SIlya Dryomov {
25583a482501SIlya Dryomov 	/*
25593a482501SIlya Dryomov 	 * Only send non-zero copyup data to save some I/O and network
25603a482501SIlya Dryomov 	 * bandwidth -- zero copyup data is equivalent to the object not
25613a482501SIlya Dryomov 	 * existing.
25623a482501SIlya Dryomov 	 */
25633a482501SIlya Dryomov 	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
25643a482501SIlya Dryomov 		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
25653a482501SIlya Dryomov 		bytes = 0;
25663a482501SIlya Dryomov 	}
25673a482501SIlya Dryomov 
256889a59c1cSIlya Dryomov 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
256989a59c1cSIlya Dryomov 		/*
257089a59c1cSIlya Dryomov 		 * Send a copyup request with an empty snapshot context to
257189a59c1cSIlya Dryomov 		 * deep-copyup the object through all existing snapshots.
257289a59c1cSIlya Dryomov 		 * A second request with the current snapshot context will be
257389a59c1cSIlya Dryomov 		 * sent for the actual modification.
257489a59c1cSIlya Dryomov 		 */
257589a59c1cSIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
257689a59c1cSIlya Dryomov 		return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
257789a59c1cSIlya Dryomov 	}
257889a59c1cSIlya Dryomov 
25793a482501SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
25803a482501SIlya Dryomov 	return rbd_obj_issue_copyup_ops(obj_req, bytes);
25813a482501SIlya Dryomov }
25823a482501SIlya Dryomov 
25837e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
25847e07efb1SIlya Dryomov {
25857e07efb1SIlya Dryomov 	u32 i;
25867e07efb1SIlya Dryomov 
25877e07efb1SIlya Dryomov 	rbd_assert(!obj_req->copyup_bvecs);
25887e07efb1SIlya Dryomov 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
25897e07efb1SIlya Dryomov 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
25907e07efb1SIlya Dryomov 					sizeof(*obj_req->copyup_bvecs),
25917e07efb1SIlya Dryomov 					GFP_NOIO);
25927e07efb1SIlya Dryomov 	if (!obj_req->copyup_bvecs)
25937e07efb1SIlya Dryomov 		return -ENOMEM;
25947e07efb1SIlya Dryomov 
25957e07efb1SIlya Dryomov 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
25967e07efb1SIlya Dryomov 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
25977e07efb1SIlya Dryomov 
25987e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
25997e07efb1SIlya Dryomov 		if (!obj_req->copyup_bvecs[i].bv_page)
26007e07efb1SIlya Dryomov 			return -ENOMEM;
26017e07efb1SIlya Dryomov 
26027e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_offset = 0;
26037e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_len = len;
26047e07efb1SIlya Dryomov 		obj_overlap -= len;
26057e07efb1SIlya Dryomov 	}
26067e07efb1SIlya Dryomov 
26077e07efb1SIlya Dryomov 	rbd_assert(!obj_overlap);
26087e07efb1SIlya Dryomov 	return 0;
26097e07efb1SIlya Dryomov }
26107e07efb1SIlya Dryomov 
26113da691bfSIlya Dryomov static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
26123da691bfSIlya Dryomov {
26133da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
26143da691bfSIlya Dryomov 	int ret;
26153da691bfSIlya Dryomov 
261686bd7998SIlya Dryomov 	rbd_assert(obj_req->num_img_extents);
261786bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
261886bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
261986bd7998SIlya Dryomov 	if (!obj_req->num_img_extents) {
26203da691bfSIlya Dryomov 		/*
26213da691bfSIlya Dryomov 		 * The overlap has become 0 (most likely because the
26223a482501SIlya Dryomov 		 * image has been flattened).  Re-submit the original write
26233a482501SIlya Dryomov 		 * request -- pass MODS_ONLY since the copyup isn't needed
26243a482501SIlya Dryomov 		 * anymore.
26253da691bfSIlya Dryomov 		 */
26263a482501SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
26273a482501SIlya Dryomov 		return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
26283da691bfSIlya Dryomov 	}
26293da691bfSIlya Dryomov 
263086bd7998SIlya Dryomov 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
26313da691bfSIlya Dryomov 	if (ret)
26323da691bfSIlya Dryomov 		return ret;
26333da691bfSIlya Dryomov 
26343a482501SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
263586bd7998SIlya Dryomov 	return rbd_obj_read_from_parent(obj_req);
26363da691bfSIlya Dryomov }
26373da691bfSIlya Dryomov 
26383da691bfSIlya Dryomov static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
26393da691bfSIlya Dryomov {
26403da691bfSIlya Dryomov 	int ret;
26413da691bfSIlya Dryomov 
26423da691bfSIlya Dryomov 	switch (obj_req->write_state) {
26433da691bfSIlya Dryomov 	case RBD_OBJ_WRITE_GUARD:
26443da691bfSIlya Dryomov 		rbd_assert(!obj_req->xferred);
26453da691bfSIlya Dryomov 		if (obj_req->result == -ENOENT) {
26463da691bfSIlya Dryomov 			/*
26473da691bfSIlya Dryomov 			 * The target object doesn't exist.  Read the data for
26483da691bfSIlya Dryomov 			 * the entire target object up to the overlap point (if
26493da691bfSIlya Dryomov 			 * any) from the parent, so we can use it for a copyup.
26503da691bfSIlya Dryomov 			 */
26513da691bfSIlya Dryomov 			ret = rbd_obj_handle_write_guard(obj_req);
26523da691bfSIlya Dryomov 			if (ret) {
26533da691bfSIlya Dryomov 				obj_req->result = ret;
26543da691bfSIlya Dryomov 				return true;
26553da691bfSIlya Dryomov 			}
26563da691bfSIlya Dryomov 			return false;
26573da691bfSIlya Dryomov 		}
26583da691bfSIlya Dryomov 		/* fall through */
26593da691bfSIlya Dryomov 	case RBD_OBJ_WRITE_FLAT:
26603a482501SIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP_OPS:
26613da691bfSIlya Dryomov 		if (!obj_req->result)
26623da691bfSIlya Dryomov 			/*
26633da691bfSIlya Dryomov 			 * There is no such thing as a successful short
26643da691bfSIlya Dryomov 			 * write -- indicate the whole request was satisfied.
26653da691bfSIlya Dryomov 			 */
266643df3d35SIlya Dryomov 			obj_req->xferred = obj_req->ex.oe_len;
26673da691bfSIlya Dryomov 		return true;
26683a482501SIlya Dryomov 	case RBD_OBJ_WRITE_READ_FROM_PARENT:
26693da691bfSIlya Dryomov 		if (obj_req->result)
26703a482501SIlya Dryomov 			return true;
26713da691bfSIlya Dryomov 
26723da691bfSIlya Dryomov 		rbd_assert(obj_req->xferred);
26733da691bfSIlya Dryomov 		ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
26743da691bfSIlya Dryomov 		if (ret) {
26753da691bfSIlya Dryomov 			obj_req->result = ret;
2676356889c4SIlya Dryomov 			obj_req->xferred = 0;
26773da691bfSIlya Dryomov 			return true;
26783da691bfSIlya Dryomov 		}
26793da691bfSIlya Dryomov 		return false;
268089a59c1cSIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
268189a59c1cSIlya Dryomov 		if (obj_req->result)
268289a59c1cSIlya Dryomov 			return true;
268389a59c1cSIlya Dryomov 
268489a59c1cSIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
268589a59c1cSIlya Dryomov 		ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
268689a59c1cSIlya Dryomov 		if (ret) {
268789a59c1cSIlya Dryomov 			obj_req->result = ret;
26883da691bfSIlya Dryomov 			return true;
26893da691bfSIlya Dryomov 		}
26903da691bfSIlya Dryomov 		return false;
26913da691bfSIlya Dryomov 	default:
2692c6244b3bSArnd Bergmann 		BUG();
26933da691bfSIlya Dryomov 	}
26943da691bfSIlya Dryomov }
26953da691bfSIlya Dryomov 
26963da691bfSIlya Dryomov /*
26973da691bfSIlya Dryomov  * Returns true if @obj_req is completed, or false otherwise.
26983da691bfSIlya Dryomov  */
26993da691bfSIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
27003da691bfSIlya Dryomov {
27019bb0248dSIlya Dryomov 	switch (obj_req->img_request->op_type) {
27023da691bfSIlya Dryomov 	case OBJ_OP_READ:
27033da691bfSIlya Dryomov 		return rbd_obj_handle_read(obj_req);
27043da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
27053da691bfSIlya Dryomov 		return rbd_obj_handle_write(obj_req);
27063da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
27076484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
27083da691bfSIlya Dryomov 		if (rbd_obj_handle_write(obj_req)) {
27093da691bfSIlya Dryomov 			/*
27103da691bfSIlya Dryomov 			 * Hide -ENOENT from delete/truncate/zero -- discarding
27113da691bfSIlya Dryomov 			 * a non-existent object is not a problem.
27123da691bfSIlya Dryomov 			 */
27133da691bfSIlya Dryomov 			if (obj_req->result == -ENOENT) {
27143da691bfSIlya Dryomov 				obj_req->result = 0;
271543df3d35SIlya Dryomov 				obj_req->xferred = obj_req->ex.oe_len;
27163da691bfSIlya Dryomov 			}
27173da691bfSIlya Dryomov 			return true;
27183da691bfSIlya Dryomov 		}
27193da691bfSIlya Dryomov 		return false;
27203da691bfSIlya Dryomov 	default:
2721c6244b3bSArnd Bergmann 		BUG();
27223da691bfSIlya Dryomov 	}
27233da691bfSIlya Dryomov }
27243da691bfSIlya Dryomov 
27257114edacSIlya Dryomov static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
27267114edacSIlya Dryomov {
27277114edacSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
27287114edacSIlya Dryomov 
27297114edacSIlya Dryomov 	rbd_assert((!obj_req->result &&
273043df3d35SIlya Dryomov 		    obj_req->xferred == obj_req->ex.oe_len) ||
27317114edacSIlya Dryomov 		   (obj_req->result < 0 && !obj_req->xferred));
27327114edacSIlya Dryomov 	if (!obj_req->result) {
27337114edacSIlya Dryomov 		img_req->xferred += obj_req->xferred;
273402c74fbaSAlex Elder 		return;
273502c74fbaSAlex Elder 	}
273602c74fbaSAlex Elder 
27377114edacSIlya Dryomov 	rbd_warn(img_req->rbd_dev,
27387114edacSIlya Dryomov 		 "%s at objno %llu %llu~%llu result %d xferred %llu",
273943df3d35SIlya Dryomov 		 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
274043df3d35SIlya Dryomov 		 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
27417114edacSIlya Dryomov 		 obj_req->xferred);
27427114edacSIlya Dryomov 	if (!img_req->result) {
27437114edacSIlya Dryomov 		img_req->result = obj_req->result;
27447114edacSIlya Dryomov 		img_req->xferred = 0;
2745a9e8ba2cSAlex Elder 	}
27468b3e1a56SAlex Elder }
27478b3e1a56SAlex Elder 
27483da691bfSIlya Dryomov static void rbd_img_end_child_request(struct rbd_img_request *img_req)
27498b3e1a56SAlex Elder {
27503da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = img_req->obj_request;
27518b3e1a56SAlex Elder 
27523da691bfSIlya Dryomov 	rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
275386bd7998SIlya Dryomov 	rbd_assert((!img_req->result &&
275486bd7998SIlya Dryomov 		    img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
275586bd7998SIlya Dryomov 		   (img_req->result < 0 && !img_req->xferred));
27568b3e1a56SAlex Elder 
27573da691bfSIlya Dryomov 	obj_req->result = img_req->result;
27583da691bfSIlya Dryomov 	obj_req->xferred = img_req->xferred;
27593da691bfSIlya Dryomov 	rbd_img_request_put(img_req);
27607114edacSIlya Dryomov }
27618b3e1a56SAlex Elder 
27627114edacSIlya Dryomov static void rbd_img_end_request(struct rbd_img_request *img_req)
27637114edacSIlya Dryomov {
27647114edacSIlya Dryomov 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
27657114edacSIlya Dryomov 	rbd_assert((!img_req->result &&
27667114edacSIlya Dryomov 		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
27677114edacSIlya Dryomov 		   (img_req->result < 0 && !img_req->xferred));
27688b3e1a56SAlex Elder 
27697114edacSIlya Dryomov 	blk_mq_end_request(img_req->rq,
27707114edacSIlya Dryomov 			   errno_to_blk_status(img_req->result));
27717114edacSIlya Dryomov 	rbd_img_request_put(img_req);
27723da691bfSIlya Dryomov }
27738b3e1a56SAlex Elder 
27743da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
27753da691bfSIlya Dryomov {
27767114edacSIlya Dryomov 	struct rbd_img_request *img_req;
27777114edacSIlya Dryomov 
27787114edacSIlya Dryomov again:
27793da691bfSIlya Dryomov 	if (!__rbd_obj_handle_request(obj_req))
27808b3e1a56SAlex Elder 		return;
27813da691bfSIlya Dryomov 
27827114edacSIlya Dryomov 	img_req = obj_req->img_request;
27837114edacSIlya Dryomov 	spin_lock(&img_req->completion_lock);
27847114edacSIlya Dryomov 	rbd_obj_end_request(obj_req);
27857114edacSIlya Dryomov 	rbd_assert(img_req->pending_count);
27867114edacSIlya Dryomov 	if (--img_req->pending_count) {
27877114edacSIlya Dryomov 		spin_unlock(&img_req->completion_lock);
27887114edacSIlya Dryomov 		return;
27897114edacSIlya Dryomov 	}
27907114edacSIlya Dryomov 
27917114edacSIlya Dryomov 	spin_unlock(&img_req->completion_lock);
27927114edacSIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
27937114edacSIlya Dryomov 		obj_req = img_req->obj_request;
27947114edacSIlya Dryomov 		rbd_img_end_child_request(img_req);
27957114edacSIlya Dryomov 		goto again;
27967114edacSIlya Dryomov 	}
27977114edacSIlya Dryomov 	rbd_img_end_request(img_req);
27988b3e1a56SAlex Elder }
27998b3e1a56SAlex Elder 
2800ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
2801ed95b21aSIlya Dryomov 
2802ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2803ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
2804ed95b21aSIlya Dryomov {
2805ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2806ed95b21aSIlya Dryomov }
2807ed95b21aSIlya Dryomov 
2808ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2809ed95b21aSIlya Dryomov {
2810ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
2811ed95b21aSIlya Dryomov 
2812ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
2813ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2814ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
2815ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
2816ed95b21aSIlya Dryomov 	return cid;
2817ed95b21aSIlya Dryomov }
2818ed95b21aSIlya Dryomov 
2819ed95b21aSIlya Dryomov /*
2820ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2821ed95b21aSIlya Dryomov  */
2822ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2823ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
2824ed95b21aSIlya Dryomov {
2825ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2826ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2827ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
2828ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
2829ed95b21aSIlya Dryomov }
2830ed95b21aSIlya Dryomov 
2831ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2832ed95b21aSIlya Dryomov {
2833ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
2834ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2835ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
2836ed95b21aSIlya Dryomov }
2837ed95b21aSIlya Dryomov 
2838edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2839edd8ca80SFlorian Margaine {
2840edd8ca80SFlorian Margaine 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2841edd8ca80SFlorian Margaine 
2842edd8ca80SFlorian Margaine 	strcpy(rbd_dev->lock_cookie, cookie);
2843edd8ca80SFlorian Margaine 	rbd_set_owner_cid(rbd_dev, &cid);
2844edd8ca80SFlorian Margaine 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2845edd8ca80SFlorian Margaine }
2846edd8ca80SFlorian Margaine 
2847ed95b21aSIlya Dryomov /*
2848ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2849ed95b21aSIlya Dryomov  */
2850ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
2851ed95b21aSIlya Dryomov {
2852ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2853ed95b21aSIlya Dryomov 	char cookie[32];
2854ed95b21aSIlya Dryomov 	int ret;
2855ed95b21aSIlya Dryomov 
2856cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2857cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
2858ed95b21aSIlya Dryomov 
2859ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
2860ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2861ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2862ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
2863ed95b21aSIlya Dryomov 	if (ret)
2864ed95b21aSIlya Dryomov 		return ret;
2865ed95b21aSIlya Dryomov 
2866ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
2867edd8ca80SFlorian Margaine 	__rbd_lock(rbd_dev, cookie);
2868ed95b21aSIlya Dryomov 	return 0;
2869ed95b21aSIlya Dryomov }
2870ed95b21aSIlya Dryomov 
2871ed95b21aSIlya Dryomov /*
2872ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2873ed95b21aSIlya Dryomov  */
2874bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
2875ed95b21aSIlya Dryomov {
2876ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2877ed95b21aSIlya Dryomov 	int ret;
2878ed95b21aSIlya Dryomov 
2879cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2880cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
2881ed95b21aSIlya Dryomov 
2882ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2883cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
2884bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
2885bbead745SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock: %d", ret);
2886ed95b21aSIlya Dryomov 
2887bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
2888bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
2889cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
2890ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2891ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
2892ed95b21aSIlya Dryomov }
2893ed95b21aSIlya Dryomov 
2894ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2895ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
2896ed95b21aSIlya Dryomov 				struct page ***preply_pages,
2897ed95b21aSIlya Dryomov 				size_t *preply_len)
2898ed95b21aSIlya Dryomov {
2899ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2900ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
290108a79102SKyle Spiers 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
290208a79102SKyle Spiers 	int buf_size = sizeof(buf);
2903ed95b21aSIlya Dryomov 	void *p = buf;
2904ed95b21aSIlya Dryomov 
2905ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2906ed95b21aSIlya Dryomov 
2907ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
2908ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2909ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
2910ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
2911ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
2912ed95b21aSIlya Dryomov 
2913ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2914ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
2915ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2916ed95b21aSIlya Dryomov }
2917ed95b21aSIlya Dryomov 
2918ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2919ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
2920ed95b21aSIlya Dryomov {
2921ed95b21aSIlya Dryomov 	struct page **reply_pages;
2922ed95b21aSIlya Dryomov 	size_t reply_len;
2923ed95b21aSIlya Dryomov 
2924ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2925ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2926ed95b21aSIlya Dryomov }
2927ed95b21aSIlya Dryomov 
2928ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
2929ed95b21aSIlya Dryomov {
2930ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2931ed95b21aSIlya Dryomov 						  acquired_lock_work);
2932ed95b21aSIlya Dryomov 
2933ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2934ed95b21aSIlya Dryomov }
2935ed95b21aSIlya Dryomov 
2936ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
2937ed95b21aSIlya Dryomov {
2938ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2939ed95b21aSIlya Dryomov 						  released_lock_work);
2940ed95b21aSIlya Dryomov 
2941ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2942ed95b21aSIlya Dryomov }
2943ed95b21aSIlya Dryomov 
2944ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
2945ed95b21aSIlya Dryomov {
2946ed95b21aSIlya Dryomov 	struct page **reply_pages;
2947ed95b21aSIlya Dryomov 	size_t reply_len;
2948ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
2949ed95b21aSIlya Dryomov 	int ret;
2950ed95b21aSIlya Dryomov 
2951ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
2952ed95b21aSIlya Dryomov 
2953ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2954ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
2955ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
2956ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2957ed95b21aSIlya Dryomov 		goto out;
2958ed95b21aSIlya Dryomov 	}
2959ed95b21aSIlya Dryomov 
2960ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2961ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
2962ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
2963ed95b21aSIlya Dryomov 		u32 n;
2964ed95b21aSIlya Dryomov 
2965ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2966ed95b21aSIlya Dryomov 		while (n--) {
2967ed95b21aSIlya Dryomov 			u8 struct_v;
2968ed95b21aSIlya Dryomov 			u32 len;
2969ed95b21aSIlya Dryomov 
2970ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
2971ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
2972ed95b21aSIlya Dryomov 
2973ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
2974ed95b21aSIlya Dryomov 			if (!len)
2975ed95b21aSIlya Dryomov 				continue;
2976ed95b21aSIlya Dryomov 
2977ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
2978ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
2979ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
2980ed95b21aSIlya Dryomov 				ret = -EIO;
2981ed95b21aSIlya Dryomov 				goto out;
2982ed95b21aSIlya Dryomov 			}
2983ed95b21aSIlya Dryomov 
2984ed95b21aSIlya Dryomov 			lock_owner_responded = true;
2985ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2986ed95b21aSIlya Dryomov 						  &struct_v, &len);
2987ed95b21aSIlya Dryomov 			if (ret) {
2988ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
2989ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
2990ed95b21aSIlya Dryomov 					 ret);
2991ed95b21aSIlya Dryomov 				goto e_inval;
2992ed95b21aSIlya Dryomov 			}
2993ed95b21aSIlya Dryomov 
2994ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
2995ed95b21aSIlya Dryomov 		}
2996ed95b21aSIlya Dryomov 	}
2997ed95b21aSIlya Dryomov 
2998ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
2999ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3000ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3001ed95b21aSIlya Dryomov 	}
3002ed95b21aSIlya Dryomov 
3003ed95b21aSIlya Dryomov out:
3004ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3005ed95b21aSIlya Dryomov 	return ret;
3006ed95b21aSIlya Dryomov 
3007ed95b21aSIlya Dryomov e_inval:
3008ed95b21aSIlya Dryomov 	ret = -EINVAL;
3009ed95b21aSIlya Dryomov 	goto out;
3010ed95b21aSIlya Dryomov }
3011ed95b21aSIlya Dryomov 
3012ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3013ed95b21aSIlya Dryomov {
3014ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3015ed95b21aSIlya Dryomov 
3016ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3017ed95b21aSIlya Dryomov 	if (wake_all)
3018ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3019ed95b21aSIlya Dryomov 	else
3020ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3021ed95b21aSIlya Dryomov }
3022ed95b21aSIlya Dryomov 
3023ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3024ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3025ed95b21aSIlya Dryomov {
3026ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3027ed95b21aSIlya Dryomov 	u8 lock_type;
3028ed95b21aSIlya Dryomov 	char *lock_tag;
3029ed95b21aSIlya Dryomov 	int ret;
3030ed95b21aSIlya Dryomov 
3031ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3032ed95b21aSIlya Dryomov 
3033ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3034ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3035ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3036ed95b21aSIlya Dryomov 	if (ret)
3037ed95b21aSIlya Dryomov 		return ret;
3038ed95b21aSIlya Dryomov 
3039ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3040ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3041ed95b21aSIlya Dryomov 		goto out;
3042ed95b21aSIlya Dryomov 	}
3043ed95b21aSIlya Dryomov 
3044ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3045ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3046ed95b21aSIlya Dryomov 			 lock_tag);
3047ed95b21aSIlya Dryomov 		ret = -EBUSY;
3048ed95b21aSIlya Dryomov 		goto out;
3049ed95b21aSIlya Dryomov 	}
3050ed95b21aSIlya Dryomov 
3051ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3052ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3053ed95b21aSIlya Dryomov 		ret = -EBUSY;
3054ed95b21aSIlya Dryomov 		goto out;
3055ed95b21aSIlya Dryomov 	}
3056ed95b21aSIlya Dryomov 
3057ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3058ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3059ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3060ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3061ed95b21aSIlya Dryomov 		ret = -EBUSY;
3062ed95b21aSIlya Dryomov 		goto out;
3063ed95b21aSIlya Dryomov 	}
3064ed95b21aSIlya Dryomov 
3065ed95b21aSIlya Dryomov out:
3066ed95b21aSIlya Dryomov 	kfree(lock_tag);
3067ed95b21aSIlya Dryomov 	return ret;
3068ed95b21aSIlya Dryomov }
3069ed95b21aSIlya Dryomov 
3070ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3071ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3072ed95b21aSIlya Dryomov {
3073ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3074ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3075ed95b21aSIlya Dryomov 	u32 num_watchers;
3076ed95b21aSIlya Dryomov 	u64 cookie;
3077ed95b21aSIlya Dryomov 	int i;
3078ed95b21aSIlya Dryomov 	int ret;
3079ed95b21aSIlya Dryomov 
3080ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3081ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3082ed95b21aSIlya Dryomov 				      &num_watchers);
3083ed95b21aSIlya Dryomov 	if (ret)
3084ed95b21aSIlya Dryomov 		return ret;
3085ed95b21aSIlya Dryomov 
3086ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3087ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3088ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3089ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3090ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3091ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3092ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3093ed95b21aSIlya Dryomov 				.handle = cookie,
3094ed95b21aSIlya Dryomov 			};
3095ed95b21aSIlya Dryomov 
3096ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3097ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3098ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3099ed95b21aSIlya Dryomov 			ret = 1;
3100ed95b21aSIlya Dryomov 			goto out;
3101ed95b21aSIlya Dryomov 		}
3102ed95b21aSIlya Dryomov 	}
3103ed95b21aSIlya Dryomov 
3104ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3105ed95b21aSIlya Dryomov 	ret = 0;
3106ed95b21aSIlya Dryomov out:
3107ed95b21aSIlya Dryomov 	kfree(watchers);
3108ed95b21aSIlya Dryomov 	return ret;
3109ed95b21aSIlya Dryomov }
3110ed95b21aSIlya Dryomov 
3111ed95b21aSIlya Dryomov /*
3112ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3113ed95b21aSIlya Dryomov  */
3114ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3115ed95b21aSIlya Dryomov {
3116ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3117ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3118ed95b21aSIlya Dryomov 	u32 num_lockers;
3119ed95b21aSIlya Dryomov 	int ret;
3120ed95b21aSIlya Dryomov 
3121ed95b21aSIlya Dryomov 	for (;;) {
3122ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3123ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3124ed95b21aSIlya Dryomov 			return ret;
3125ed95b21aSIlya Dryomov 
3126ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3127ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3128ed95b21aSIlya Dryomov 		if (ret)
3129ed95b21aSIlya Dryomov 			return ret;
3130ed95b21aSIlya Dryomov 
3131ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3132ed95b21aSIlya Dryomov 			goto again;
3133ed95b21aSIlya Dryomov 
3134ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3135ed95b21aSIlya Dryomov 		if (ret) {
3136ed95b21aSIlya Dryomov 			if (ret > 0)
3137ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3138ed95b21aSIlya Dryomov 			goto out;
3139ed95b21aSIlya Dryomov 		}
3140ed95b21aSIlya Dryomov 
3141ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3142ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3143ed95b21aSIlya Dryomov 
3144ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3145ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3146ed95b21aSIlya Dryomov 		if (ret) {
3147ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3148ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3149ed95b21aSIlya Dryomov 			goto out;
3150ed95b21aSIlya Dryomov 		}
3151ed95b21aSIlya Dryomov 
3152ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3153ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3154ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3155ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3156ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3157ed95b21aSIlya Dryomov 			goto out;
3158ed95b21aSIlya Dryomov 
3159ed95b21aSIlya Dryomov again:
3160ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3161ed95b21aSIlya Dryomov 	}
3162ed95b21aSIlya Dryomov 
3163ed95b21aSIlya Dryomov out:
3164ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3165ed95b21aSIlya Dryomov 	return ret;
3166ed95b21aSIlya Dryomov }
3167ed95b21aSIlya Dryomov 
3168ed95b21aSIlya Dryomov /*
3169ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3170ed95b21aSIlya Dryomov  */
3171ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3172ed95b21aSIlya Dryomov 						int *pret)
3173ed95b21aSIlya Dryomov {
3174ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3175ed95b21aSIlya Dryomov 
3176ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3177ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3178ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3179ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3180ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3181ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3182ed95b21aSIlya Dryomov 		return lock_state;
3183ed95b21aSIlya Dryomov 	}
3184ed95b21aSIlya Dryomov 
3185ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3186ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3187ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3188ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3189ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3190ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3191ed95b21aSIlya Dryomov 		if (*pret)
3192ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3193ed95b21aSIlya Dryomov 	}
3194ed95b21aSIlya Dryomov 
3195ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3196ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3197ed95b21aSIlya Dryomov 	return lock_state;
3198ed95b21aSIlya Dryomov }
3199ed95b21aSIlya Dryomov 
3200ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3201ed95b21aSIlya Dryomov {
3202ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3203ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3204ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
320537f13252SKefeng Wang 	int ret = 0;
3206ed95b21aSIlya Dryomov 
3207ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3208ed95b21aSIlya Dryomov again:
3209ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3210ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3211ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3212ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3213ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3214ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3215ed95b21aSIlya Dryomov 		return;
3216ed95b21aSIlya Dryomov 	}
3217ed95b21aSIlya Dryomov 
3218ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3219ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3220ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3221e010dd0aSIlya Dryomov 	} else if (ret == -EROFS) {
3222e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "peer will not release lock");
3223e010dd0aSIlya Dryomov 		/*
3224e010dd0aSIlya Dryomov 		 * If this is rbd_add_acquire_lock(), we want to fail
3225e010dd0aSIlya Dryomov 		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
3226e010dd0aSIlya Dryomov 		 * want to block.
3227e010dd0aSIlya Dryomov 		 */
3228e010dd0aSIlya Dryomov 		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3229e010dd0aSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3230e010dd0aSIlya Dryomov 			/* wake "rbd map --exclusive" process */
3231e010dd0aSIlya Dryomov 			wake_requests(rbd_dev, false);
3232e010dd0aSIlya Dryomov 		}
3233ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3234ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3235ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3236ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3237ed95b21aSIlya Dryomov 	} else {
3238ed95b21aSIlya Dryomov 		/*
3239ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3240ed95b21aSIlya Dryomov 		 * release the lock
3241ed95b21aSIlya Dryomov 		 */
3242ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3243ed95b21aSIlya Dryomov 		     rbd_dev);
3244ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3245ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3246ed95b21aSIlya Dryomov 	}
3247ed95b21aSIlya Dryomov }
3248ed95b21aSIlya Dryomov 
3249ed95b21aSIlya Dryomov /*
3250ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3251ed95b21aSIlya Dryomov  */
3252ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3253ed95b21aSIlya Dryomov {
3254ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3255ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3256ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3257ed95b21aSIlya Dryomov 		return false;
3258ed95b21aSIlya Dryomov 
3259ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3260ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3261ed95b21aSIlya Dryomov 	/*
3262ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3263ed95b21aSIlya Dryomov 	 *
3264ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3265ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3266ed95b21aSIlya Dryomov 	 */
3267ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3268ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3269ed95b21aSIlya Dryomov 
3270ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3271ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3272ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3273ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3274ed95b21aSIlya Dryomov 		return false;
3275ed95b21aSIlya Dryomov 
3276bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
3277ed95b21aSIlya Dryomov 	/*
3278ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
3279ed95b21aSIlya Dryomov 	 * almost immediately if we got new IO during ceph_osdc_sync()
3280ed95b21aSIlya Dryomov 	 * otherwise.  We need to ack our own notifications, so this
3281ed95b21aSIlya Dryomov 	 * lock_dwork will be requeued from rbd_wait_state_locked()
3282ed95b21aSIlya Dryomov 	 * after wake_requests() in rbd_handle_released_lock().
3283ed95b21aSIlya Dryomov 	 */
3284ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3285ed95b21aSIlya Dryomov 	return true;
3286ed95b21aSIlya Dryomov }
3287ed95b21aSIlya Dryomov 
3288ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3289ed95b21aSIlya Dryomov {
3290ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3291ed95b21aSIlya Dryomov 						  unlock_work);
3292ed95b21aSIlya Dryomov 
3293ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3294ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3295ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3296ed95b21aSIlya Dryomov }
3297ed95b21aSIlya Dryomov 
3298ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3299ed95b21aSIlya Dryomov 				     void **p)
3300ed95b21aSIlya Dryomov {
3301ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3302ed95b21aSIlya Dryomov 
3303ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3304ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3305ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3306ed95b21aSIlya Dryomov 	}
3307ed95b21aSIlya Dryomov 
3308ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3309ed95b21aSIlya Dryomov 	     cid.handle);
3310ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3311ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3312ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3313ed95b21aSIlya Dryomov 			/*
3314ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3315ed95b21aSIlya Dryomov 			 * the owner
3316ed95b21aSIlya Dryomov 			 */
3317ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3318ed95b21aSIlya Dryomov 			return;
3319ed95b21aSIlya Dryomov 		}
3320ed95b21aSIlya Dryomov 
3321ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3322ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3323ed95b21aSIlya Dryomov 	} else {
3324ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3325ed95b21aSIlya Dryomov 	}
3326ed95b21aSIlya Dryomov 
3327ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3328ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3329ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3330ed95b21aSIlya Dryomov }
3331ed95b21aSIlya Dryomov 
3332ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3333ed95b21aSIlya Dryomov 				     void **p)
3334ed95b21aSIlya Dryomov {
3335ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3336ed95b21aSIlya Dryomov 
3337ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3338ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3339ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3340ed95b21aSIlya Dryomov 	}
3341ed95b21aSIlya Dryomov 
3342ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3343ed95b21aSIlya Dryomov 	     cid.handle);
3344ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3345ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3346ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3347ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3348ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3349ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3350ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3351ed95b21aSIlya Dryomov 			return;
3352ed95b21aSIlya Dryomov 		}
3353ed95b21aSIlya Dryomov 
3354ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3355ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3356ed95b21aSIlya Dryomov 	} else {
3357ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3358ed95b21aSIlya Dryomov 	}
3359ed95b21aSIlya Dryomov 
3360ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3361ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3362ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3363ed95b21aSIlya Dryomov }
3364ed95b21aSIlya Dryomov 
33653b77faa0SIlya Dryomov /*
33663b77faa0SIlya Dryomov  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
33673b77faa0SIlya Dryomov  * ResponseMessage is needed.
33683b77faa0SIlya Dryomov  */
33693b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3370ed95b21aSIlya Dryomov 				   void **p)
3371ed95b21aSIlya Dryomov {
3372ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3373ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
33743b77faa0SIlya Dryomov 	int result = 1;
3375ed95b21aSIlya Dryomov 
3376ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3377ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3378ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3379ed95b21aSIlya Dryomov 	}
3380ed95b21aSIlya Dryomov 
3381ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3382ed95b21aSIlya Dryomov 	     cid.handle);
3383ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
33843b77faa0SIlya Dryomov 		return result;
3385ed95b21aSIlya Dryomov 
3386ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
33873b77faa0SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
33883b77faa0SIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
33893b77faa0SIlya Dryomov 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
33903b77faa0SIlya Dryomov 			goto out_unlock;
33913b77faa0SIlya Dryomov 
33923b77faa0SIlya Dryomov 		/*
33933b77faa0SIlya Dryomov 		 * encode ResponseMessage(0) so the peer can detect
33943b77faa0SIlya Dryomov 		 * a missing owner
33953b77faa0SIlya Dryomov 		 */
33963b77faa0SIlya Dryomov 		result = 0;
33973b77faa0SIlya Dryomov 
3398ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3399e010dd0aSIlya Dryomov 			if (!rbd_dev->opts->exclusive) {
3400e010dd0aSIlya Dryomov 				dout("%s rbd_dev %p queueing unlock_work\n",
3401e010dd0aSIlya Dryomov 				     __func__, rbd_dev);
3402e010dd0aSIlya Dryomov 				queue_work(rbd_dev->task_wq,
3403e010dd0aSIlya Dryomov 					   &rbd_dev->unlock_work);
3404e010dd0aSIlya Dryomov 			} else {
3405e010dd0aSIlya Dryomov 				/* refuse to release the lock */
3406e010dd0aSIlya Dryomov 				result = -EROFS;
3407ed95b21aSIlya Dryomov 			}
3408ed95b21aSIlya Dryomov 		}
3409ed95b21aSIlya Dryomov 	}
34103b77faa0SIlya Dryomov 
34113b77faa0SIlya Dryomov out_unlock:
3412ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
34133b77faa0SIlya Dryomov 	return result;
3414ed95b21aSIlya Dryomov }
3415ed95b21aSIlya Dryomov 
3416ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3417ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3418ed95b21aSIlya Dryomov {
3419ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
342008a79102SKyle Spiers 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
342108a79102SKyle Spiers 	int buf_size = sizeof(buf);
3422ed95b21aSIlya Dryomov 	int ret;
3423ed95b21aSIlya Dryomov 
3424ed95b21aSIlya Dryomov 	if (result) {
3425ed95b21aSIlya Dryomov 		void *p = buf;
3426ed95b21aSIlya Dryomov 
3427ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3428ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3429ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3430ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3431ed95b21aSIlya Dryomov 	} else {
3432ed95b21aSIlya Dryomov 		buf_size = 0;
3433ed95b21aSIlya Dryomov 	}
3434ed95b21aSIlya Dryomov 
3435ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3436ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3437ed95b21aSIlya Dryomov 				   buf, buf_size);
3438ed95b21aSIlya Dryomov 	if (ret)
3439ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3440ed95b21aSIlya Dryomov }
3441ed95b21aSIlya Dryomov 
3442ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3443ed95b21aSIlya Dryomov 				   u64 cookie)
3444ed95b21aSIlya Dryomov {
3445ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3446ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3447ed95b21aSIlya Dryomov }
3448ed95b21aSIlya Dryomov 
3449ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3450ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3451ed95b21aSIlya Dryomov {
3452ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3453ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3454ed95b21aSIlya Dryomov }
3455922dab61SIlya Dryomov 
3456922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3457922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3458b8d70035SAlex Elder {
3459922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3460ed95b21aSIlya Dryomov 	void *p = data;
3461ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3462d4c2269bSIlya Dryomov 	u8 struct_v = 0;
3463ed95b21aSIlya Dryomov 	u32 len;
3464ed95b21aSIlya Dryomov 	u32 notify_op;
3465b8d70035SAlex Elder 	int ret;
3466b8d70035SAlex Elder 
3467ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3468ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3469ed95b21aSIlya Dryomov 	if (data_len) {
3470ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3471ed95b21aSIlya Dryomov 					  &struct_v, &len);
3472ed95b21aSIlya Dryomov 		if (ret) {
3473ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3474ed95b21aSIlya Dryomov 				 ret);
3475ed95b21aSIlya Dryomov 			return;
3476ed95b21aSIlya Dryomov 		}
347752bb1f9bSIlya Dryomov 
3478ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3479ed95b21aSIlya Dryomov 	} else {
3480ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3481ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3482ed95b21aSIlya Dryomov 		len = 0;
3483ed95b21aSIlya Dryomov 	}
3484ed95b21aSIlya Dryomov 
3485ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3486ed95b21aSIlya Dryomov 	switch (notify_op) {
3487ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3488ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3489ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3490ed95b21aSIlya Dryomov 		break;
3491ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3492ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3493ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3494ed95b21aSIlya Dryomov 		break;
3495ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
34963b77faa0SIlya Dryomov 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
34973b77faa0SIlya Dryomov 		if (ret <= 0)
3498ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
34993b77faa0SIlya Dryomov 						      cookie, ret);
3500ed95b21aSIlya Dryomov 		else
3501ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3502ed95b21aSIlya Dryomov 		break;
3503ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3504e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3505e627db08SAlex Elder 		if (ret)
35069584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3507b8d70035SAlex Elder 
3508ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3509ed95b21aSIlya Dryomov 		break;
3510ed95b21aSIlya Dryomov 	default:
3511ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3512ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3513ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3514ed95b21aSIlya Dryomov 		else
3515ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3516ed95b21aSIlya Dryomov 		break;
3517b8d70035SAlex Elder 	}
3518b8d70035SAlex Elder }
3519b8d70035SAlex Elder 
352099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
35219969ebc5SAlex Elder 
3522922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3523bb040aa0SIlya Dryomov {
3524922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3525bb040aa0SIlya Dryomov 
3526922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3527bb040aa0SIlya Dryomov 
3528ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3529ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3530ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3531bb040aa0SIlya Dryomov 
353299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
353399d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
353499d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
353599d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3536bb040aa0SIlya Dryomov 
353799d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3538bb040aa0SIlya Dryomov 	}
353999d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3540bb040aa0SIlya Dryomov }
3541bb040aa0SIlya Dryomov 
3542bb040aa0SIlya Dryomov /*
354399d16943SIlya Dryomov  * watch_mutex must be locked
35449969ebc5SAlex Elder  */
354599d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
35469969ebc5SAlex Elder {
35479969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3548922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
35499969ebc5SAlex Elder 
3550922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
355199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
35529969ebc5SAlex Elder 
3553922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3554922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3555922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3556922dab61SIlya Dryomov 	if (IS_ERR(handle))
3557922dab61SIlya Dryomov 		return PTR_ERR(handle);
35589969ebc5SAlex Elder 
3559922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
35608eb87565SAlex Elder 	return 0;
35619969ebc5SAlex Elder }
35629969ebc5SAlex Elder 
356399d16943SIlya Dryomov /*
356499d16943SIlya Dryomov  * watch_mutex must be locked
356599d16943SIlya Dryomov  */
356699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3567fca27065SIlya Dryomov {
3568922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3569922dab61SIlya Dryomov 	int ret;
3570b30a01f2SIlya Dryomov 
357199d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
357299d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3573b30a01f2SIlya Dryomov 
3574922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3575922dab61SIlya Dryomov 	if (ret)
3576922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3577b30a01f2SIlya Dryomov 
3578922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3579c525f036SIlya Dryomov }
3580c525f036SIlya Dryomov 
358199d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3582c525f036SIlya Dryomov {
358399d16943SIlya Dryomov 	int ret;
3584811c6688SIlya Dryomov 
358599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
358699d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
358799d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
358899d16943SIlya Dryomov 	if (ret)
358999d16943SIlya Dryomov 		goto out;
359099d16943SIlya Dryomov 
359199d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
359299d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
359399d16943SIlya Dryomov 
359499d16943SIlya Dryomov out:
359599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
359699d16943SIlya Dryomov 	return ret;
359799d16943SIlya Dryomov }
359899d16943SIlya Dryomov 
359999d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
360099d16943SIlya Dryomov {
360199d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
360299d16943SIlya Dryomov 
3603ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3604ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3605ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3606ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
360799d16943SIlya Dryomov }
360899d16943SIlya Dryomov 
360999d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
361099d16943SIlya Dryomov {
3611ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
361299d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
361399d16943SIlya Dryomov 
361499d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
361599d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
361699d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
361799d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
361899d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
361999d16943SIlya Dryomov 
362023edca86SDongsheng Yang 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3621811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3622fca27065SIlya Dryomov }
3623fca27065SIlya Dryomov 
362414bb211dSIlya Dryomov /*
362514bb211dSIlya Dryomov  * lock_rwsem must be held for write
362614bb211dSIlya Dryomov  */
362714bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
362814bb211dSIlya Dryomov {
362914bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
363014bb211dSIlya Dryomov 	char cookie[32];
363114bb211dSIlya Dryomov 	int ret;
363214bb211dSIlya Dryomov 
363314bb211dSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
363414bb211dSIlya Dryomov 
363514bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
363614bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
363714bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
363814bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
363914bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
364014bb211dSIlya Dryomov 	if (ret) {
364114bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
364214bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
364314bb211dSIlya Dryomov 				 ret);
364414bb211dSIlya Dryomov 
364514bb211dSIlya Dryomov 		/*
364614bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
364714bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
364814bb211dSIlya Dryomov 		 */
364914bb211dSIlya Dryomov 		if (rbd_release_lock(rbd_dev))
365014bb211dSIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
365114bb211dSIlya Dryomov 					   &rbd_dev->lock_dwork, 0);
365214bb211dSIlya Dryomov 	} else {
3653edd8ca80SFlorian Margaine 		__rbd_lock(rbd_dev, cookie);
365414bb211dSIlya Dryomov 	}
365514bb211dSIlya Dryomov }
365614bb211dSIlya Dryomov 
365799d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
365899d16943SIlya Dryomov {
365999d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
366099d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
366199d16943SIlya Dryomov 	int ret;
366299d16943SIlya Dryomov 
366399d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
366499d16943SIlya Dryomov 
366599d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
366687c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
366787c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
366814bb211dSIlya Dryomov 		return;
366987c0fdedSIlya Dryomov 	}
367099d16943SIlya Dryomov 
367199d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
367299d16943SIlya Dryomov 	if (ret) {
367399d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
36744d73644bSIlya Dryomov 		if (ret == -EBLACKLISTED || ret == -ENOENT) {
367587c0fdedSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
367614bb211dSIlya Dryomov 			wake_requests(rbd_dev, true);
367787c0fdedSIlya Dryomov 		} else {
367899d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
367999d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
368099d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
368187c0fdedSIlya Dryomov 		}
368287c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
368314bb211dSIlya Dryomov 		return;
368499d16943SIlya Dryomov 	}
368599d16943SIlya Dryomov 
368699d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
368799d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
368899d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
368999d16943SIlya Dryomov 
369014bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
369114bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
369214bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
369314bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
369414bb211dSIlya Dryomov 
369599d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
369699d16943SIlya Dryomov 	if (ret)
3697f6870cc9SColin Ian King 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
369899d16943SIlya Dryomov }
369999d16943SIlya Dryomov 
370036be9a76SAlex Elder /*
3701f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3702f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
370336be9a76SAlex Elder  */
370436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3705ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
3706ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
370736be9a76SAlex Elder 			     const char *method_name,
37084157976bSAlex Elder 			     const void *outbound,
370936be9a76SAlex Elder 			     size_t outbound_size,
37104157976bSAlex Elder 			     void *inbound,
3711e2a58ee5SAlex Elder 			     size_t inbound_size)
371236be9a76SAlex Elder {
3713ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3714ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
3715ecd4a68aSIlya Dryomov 	struct page *reply_page;
371636be9a76SAlex Elder 	int ret;
371736be9a76SAlex Elder 
371836be9a76SAlex Elder 	/*
37196010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
37206010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
37216010a451SAlex Elder 	 * also supply outbound data--parameters for the object
37226010a451SAlex Elder 	 * method.  Currently if this is present it will be a
37236010a451SAlex Elder 	 * snapshot id.
372436be9a76SAlex Elder 	 */
3725ecd4a68aSIlya Dryomov 	if (outbound) {
3726ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
3727ecd4a68aSIlya Dryomov 			return -E2BIG;
372836be9a76SAlex Elder 
3729ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
3730ecd4a68aSIlya Dryomov 		if (!req_page)
3731ecd4a68aSIlya Dryomov 			return -ENOMEM;
373236be9a76SAlex Elder 
3733ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
373404017e29SAlex Elder 	}
3735430c28c3SAlex Elder 
3736ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
3737ecd4a68aSIlya Dryomov 	if (!reply_page) {
3738ecd4a68aSIlya Dryomov 		if (req_page)
3739ecd4a68aSIlya Dryomov 			__free_page(req_page);
3740ecd4a68aSIlya Dryomov 		return -ENOMEM;
3741ecd4a68aSIlya Dryomov 	}
374236be9a76SAlex Elder 
3743ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3744ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
3745ecd4a68aSIlya Dryomov 			     reply_page, &inbound_size);
3746ecd4a68aSIlya Dryomov 	if (!ret) {
3747ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
3748ecd4a68aSIlya Dryomov 		ret = inbound_size;
3749ecd4a68aSIlya Dryomov 	}
375057385b51SAlex Elder 
3751ecd4a68aSIlya Dryomov 	if (req_page)
3752ecd4a68aSIlya Dryomov 		__free_page(req_page);
3753ecd4a68aSIlya Dryomov 	__free_page(reply_page);
375436be9a76SAlex Elder 	return ret;
375536be9a76SAlex Elder }
375636be9a76SAlex Elder 
3757ed95b21aSIlya Dryomov /*
3758ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
3759ed95b21aSIlya Dryomov  */
37602f18d466SIlya Dryomov static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
3761ed95b21aSIlya Dryomov {
3762ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
376334f55d0bSDongsheng Yang 	unsigned long timeout;
37642f18d466SIlya Dryomov 	int ret = 0;
37652f18d466SIlya Dryomov 
37662f18d466SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
37672f18d466SIlya Dryomov 		return -EBLACKLISTED;
37682f18d466SIlya Dryomov 
37692f18d466SIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
37702f18d466SIlya Dryomov 		return 0;
37712f18d466SIlya Dryomov 
37722f18d466SIlya Dryomov 	if (!may_acquire) {
37732f18d466SIlya Dryomov 		rbd_warn(rbd_dev, "exclusive lock required");
37742f18d466SIlya Dryomov 		return -EROFS;
37752f18d466SIlya Dryomov 	}
3776ed95b21aSIlya Dryomov 
3777ed95b21aSIlya Dryomov 	do {
3778ed95b21aSIlya Dryomov 		/*
3779ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3780ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
3781ed95b21aSIlya Dryomov 		 */
3782ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3783ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3784ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3785ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
3786ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
378734f55d0bSDongsheng Yang 		timeout = schedule_timeout(ceph_timeout_jiffies(
378834f55d0bSDongsheng Yang 						rbd_dev->opts->lock_timeout));
3789ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
37902f18d466SIlya Dryomov 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
37912f18d466SIlya Dryomov 			ret = -EBLACKLISTED;
37922f18d466SIlya Dryomov 			break;
37932f18d466SIlya Dryomov 		}
379434f55d0bSDongsheng Yang 		if (!timeout) {
379534f55d0bSDongsheng Yang 			rbd_warn(rbd_dev, "timed out waiting for lock");
379634f55d0bSDongsheng Yang 			ret = -ETIMEDOUT;
379734f55d0bSDongsheng Yang 			break;
379834f55d0bSDongsheng Yang 		}
37992f18d466SIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
380087c0fdedSIlya Dryomov 
3801ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
38022f18d466SIlya Dryomov 	return ret;
3803ed95b21aSIlya Dryomov }
3804ed95b21aSIlya Dryomov 
38057ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3806bc1ecc65SIlya Dryomov {
38077ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
38087ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3809bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
38104e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3811bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3812bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
38136d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
38144e752f0aSJosh Durgin 	u64 mapping_size;
381580de1912SIlya Dryomov 	bool must_be_locked;
3816bc1ecc65SIlya Dryomov 	int result;
3817bc1ecc65SIlya Dryomov 
3818aebf526bSChristoph Hellwig 	switch (req_op(rq)) {
3819aebf526bSChristoph Hellwig 	case REQ_OP_DISCARD:
3820aebf526bSChristoph Hellwig 		op_type = OBJ_OP_DISCARD;
3821aebf526bSChristoph Hellwig 		break;
38226484cbe9SIlya Dryomov 	case REQ_OP_WRITE_ZEROES:
38236484cbe9SIlya Dryomov 		op_type = OBJ_OP_ZEROOUT;
38246484cbe9SIlya Dryomov 		break;
3825aebf526bSChristoph Hellwig 	case REQ_OP_WRITE:
3826aebf526bSChristoph Hellwig 		op_type = OBJ_OP_WRITE;
3827aebf526bSChristoph Hellwig 		break;
3828aebf526bSChristoph Hellwig 	case REQ_OP_READ:
3829aebf526bSChristoph Hellwig 		op_type = OBJ_OP_READ;
3830aebf526bSChristoph Hellwig 		break;
3831aebf526bSChristoph Hellwig 	default:
3832aebf526bSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
38337ad18afaSChristoph Hellwig 		result = -EIO;
38347ad18afaSChristoph Hellwig 		goto err;
38357ad18afaSChristoph Hellwig 	}
38367ad18afaSChristoph Hellwig 
3837bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3838bc1ecc65SIlya Dryomov 
3839bc1ecc65SIlya Dryomov 	if (!length) {
3840bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3841bc1ecc65SIlya Dryomov 		result = 0;
3842bc1ecc65SIlya Dryomov 		goto err_rq;
3843bc1ecc65SIlya Dryomov 	}
3844bc1ecc65SIlya Dryomov 
38459568c93eSIlya Dryomov 	rbd_assert(op_type == OBJ_OP_READ ||
38469568c93eSIlya Dryomov 		   rbd_dev->spec->snap_id == CEPH_NOSNAP);
3847bc1ecc65SIlya Dryomov 
3848bc1ecc65SIlya Dryomov 	/*
3849bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3850bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3851bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3852bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3853bc1ecc65SIlya Dryomov 	 */
3854bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3855bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3856bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3857bc1ecc65SIlya Dryomov 		result = -ENXIO;
3858bc1ecc65SIlya Dryomov 		goto err_rq;
3859bc1ecc65SIlya Dryomov 	}
3860bc1ecc65SIlya Dryomov 
3861bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3862bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3863bc1ecc65SIlya Dryomov 			 length);
3864bc1ecc65SIlya Dryomov 		result = -EINVAL;
3865bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3866bc1ecc65SIlya Dryomov 	}
3867bc1ecc65SIlya Dryomov 
38687ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
38697ad18afaSChristoph Hellwig 
38704e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
38714e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
38726d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
38734e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
38744e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
38754e752f0aSJosh Durgin 	}
38764e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
38774e752f0aSJosh Durgin 
38784e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3879bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
38804e752f0aSJosh Durgin 			 length, mapping_size);
3881bc1ecc65SIlya Dryomov 		result = -EIO;
3882bc1ecc65SIlya Dryomov 		goto err_rq;
3883bc1ecc65SIlya Dryomov 	}
3884bc1ecc65SIlya Dryomov 
3885f9bebd58SIlya Dryomov 	must_be_locked =
3886f9bebd58SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3887f9bebd58SIlya Dryomov 	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
3888ed95b21aSIlya Dryomov 	if (must_be_locked) {
3889ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
38902f18d466SIlya Dryomov 		result = rbd_wait_state_locked(rbd_dev,
38912f18d466SIlya Dryomov 					       !rbd_dev->opts->exclusive);
38922f18d466SIlya Dryomov 		if (result)
3893e010dd0aSIlya Dryomov 			goto err_unlock;
3894e010dd0aSIlya Dryomov 	}
3895ed95b21aSIlya Dryomov 
3896dfd9875fSIlya Dryomov 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
3897bc1ecc65SIlya Dryomov 	if (!img_request) {
3898bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3899ed95b21aSIlya Dryomov 		goto err_unlock;
3900bc1ecc65SIlya Dryomov 	}
3901bc1ecc65SIlya Dryomov 	img_request->rq = rq;
390270b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
3903bc1ecc65SIlya Dryomov 
39046484cbe9SIlya Dryomov 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
39055a237819SIlya Dryomov 		result = rbd_img_fill_nodata(img_request, offset, length);
390690e98c52SGuangliang Zhao 	else
39075a237819SIlya Dryomov 		result = rbd_img_fill_from_bio(img_request, offset, length,
390890e98c52SGuangliang Zhao 					       rq->bio);
39090c93e1b7SIlya Dryomov 	if (result || !img_request->pending_count)
3910bc1ecc65SIlya Dryomov 		goto err_img_request;
3911bc1ecc65SIlya Dryomov 
3912efbd1a11SIlya Dryomov 	rbd_img_request_submit(img_request);
3913ed95b21aSIlya Dryomov 	if (must_be_locked)
3914ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3915bc1ecc65SIlya Dryomov 	return;
3916bc1ecc65SIlya Dryomov 
3917bc1ecc65SIlya Dryomov err_img_request:
3918bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3919ed95b21aSIlya Dryomov err_unlock:
3920ed95b21aSIlya Dryomov 	if (must_be_locked)
3921ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3922bc1ecc65SIlya Dryomov err_rq:
3923bc1ecc65SIlya Dryomov 	if (result)
3924bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
39256d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
39264e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
39277ad18afaSChristoph Hellwig err:
39282a842acaSChristoph Hellwig 	blk_mq_end_request(rq, errno_to_blk_status(result));
3929bc1ecc65SIlya Dryomov }
3930bc1ecc65SIlya Dryomov 
3931fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
39327ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3933bc1ecc65SIlya Dryomov {
39347ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
39357ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3936bc1ecc65SIlya Dryomov 
39377ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
3938fc17b653SChristoph Hellwig 	return BLK_STS_OK;
3939bf0d5f50SAlex Elder }
3940bf0d5f50SAlex Elder 
3941602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3942602adf40SYehuda Sadeh {
39435769ed0cSIlya Dryomov 	blk_cleanup_queue(rbd_dev->disk->queue);
39447ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
39455769ed0cSIlya Dryomov 	put_disk(rbd_dev->disk);
39465769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
3947602adf40SYehuda Sadeh }
3948602adf40SYehuda Sadeh 
3949788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3950fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
3951fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
3952fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
3953788e2df3SAlex Elder 
3954788e2df3SAlex Elder {
3955fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3956fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
3957fe5478e0SIlya Dryomov 	struct page **pages;
3958fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
3959788e2df3SAlex Elder 	int ret;
3960788e2df3SAlex Elder 
3961fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3962fe5478e0SIlya Dryomov 	if (!req)
3963fe5478e0SIlya Dryomov 		return -ENOMEM;
3964788e2df3SAlex Elder 
3965fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
3966fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
3967fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
3968788e2df3SAlex Elder 
3969fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3970fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
3971fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
3972fe5478e0SIlya Dryomov 		goto out_req;
3973fe5478e0SIlya Dryomov 	}
39741ceae7efSAlex Elder 
3975fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3976fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3977fe5478e0SIlya Dryomov 					 true);
3978788e2df3SAlex Elder 
397926f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
398026f887e0SIlya Dryomov 	if (ret)
398126f887e0SIlya Dryomov 		goto out_req;
398226f887e0SIlya Dryomov 
3983fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
3984fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
3985fe5478e0SIlya Dryomov 	if (ret >= 0)
3986fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
3987fe5478e0SIlya Dryomov 
3988fe5478e0SIlya Dryomov out_req:
3989fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
3990788e2df3SAlex Elder 	return ret;
3991788e2df3SAlex Elder }
3992788e2df3SAlex Elder 
3993602adf40SYehuda Sadeh /*
3994662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3995662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3996662518b1SAlex Elder  * information about the image.
39974156d998SAlex Elder  */
399899a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
39994156d998SAlex Elder {
40004156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
40014156d998SAlex Elder 	u32 snap_count = 0;
40024156d998SAlex Elder 	u64 names_size = 0;
40034156d998SAlex Elder 	u32 want_count;
40044156d998SAlex Elder 	int ret;
40054156d998SAlex Elder 
40064156d998SAlex Elder 	/*
40074156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
40084156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
40094156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
40104156d998SAlex Elder 	 * the number of snapshots could change by the time we read
40114156d998SAlex Elder 	 * it in, in which case we re-read it.
40124156d998SAlex Elder 	 */
40134156d998SAlex Elder 	do {
40144156d998SAlex Elder 		size_t size;
40154156d998SAlex Elder 
40164156d998SAlex Elder 		kfree(ondisk);
40174156d998SAlex Elder 
40184156d998SAlex Elder 		size = sizeof (*ondisk);
40194156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
40204156d998SAlex Elder 		size += names_size;
40214156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
40224156d998SAlex Elder 		if (!ondisk)
4023662518b1SAlex Elder 			return -ENOMEM;
40244156d998SAlex Elder 
4025fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4026fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
40274156d998SAlex Elder 		if (ret < 0)
4028662518b1SAlex Elder 			goto out;
4029c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
40304156d998SAlex Elder 			ret = -ENXIO;
403106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
403206ecc6cbSAlex Elder 				size, ret);
4033662518b1SAlex Elder 			goto out;
40344156d998SAlex Elder 		}
40354156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
40364156d998SAlex Elder 			ret = -ENXIO;
403706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4038662518b1SAlex Elder 			goto out;
40394156d998SAlex Elder 		}
40404156d998SAlex Elder 
40414156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
40424156d998SAlex Elder 		want_count = snap_count;
40434156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
40444156d998SAlex Elder 	} while (snap_count != want_count);
40454156d998SAlex Elder 
4046662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4047662518b1SAlex Elder out:
40484156d998SAlex Elder 	kfree(ondisk);
40494156d998SAlex Elder 
4050dfc5606dSYehuda Sadeh 	return ret;
4051602adf40SYehuda Sadeh }
4052602adf40SYehuda Sadeh 
405315228edeSAlex Elder /*
405415228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
405515228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
405615228edeSAlex Elder  */
405715228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
405815228edeSAlex Elder {
405915228edeSAlex Elder 	u64 snap_id;
406015228edeSAlex Elder 
406115228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
406215228edeSAlex Elder 		return;
406315228edeSAlex Elder 
406415228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
406515228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
406615228edeSAlex Elder 		return;
406715228edeSAlex Elder 
406815228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
406915228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
407015228edeSAlex Elder }
407115228edeSAlex Elder 
40729875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
40739875201eSJosh Durgin {
40749875201eSJosh Durgin 	sector_t size;
40759875201eSJosh Durgin 
40769875201eSJosh Durgin 	/*
4077811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4078811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4079811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
40809875201eSJosh Durgin 	 */
4081811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4082811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
40839875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
40849875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
40859875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
40869875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
40879875201eSJosh Durgin 	}
40889875201eSJosh Durgin }
40899875201eSJosh Durgin 
4090cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
40911fe5e993SAlex Elder {
4092e627db08SAlex Elder 	u64 mapping_size;
40931fe5e993SAlex Elder 	int ret;
40941fe5e993SAlex Elder 
4095cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
40963b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4097a720ae09SIlya Dryomov 
4098a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
409952bb1f9bSIlya Dryomov 	if (ret)
410073e39e4dSIlya Dryomov 		goto out;
410115228edeSAlex Elder 
4102e8f59b59SIlya Dryomov 	/*
4103e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4104e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4105e8f59b59SIlya Dryomov 	 */
4106e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4107e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4108e8f59b59SIlya Dryomov 		if (ret)
410973e39e4dSIlya Dryomov 			goto out;
4110e8f59b59SIlya Dryomov 	}
4111e8f59b59SIlya Dryomov 
41125ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
41135ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
41145ff1108cSIlya Dryomov 	} else {
41155ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
411615228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
41175ff1108cSIlya Dryomov 	}
41185ff1108cSIlya Dryomov 
411973e39e4dSIlya Dryomov out:
4120cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
412173e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
41229875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
41231fe5e993SAlex Elder 
412473e39e4dSIlya Dryomov 	return ret;
41251fe5e993SAlex Elder }
41261fe5e993SAlex Elder 
4127d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4128d6296d39SChristoph Hellwig 		unsigned int hctx_idx, unsigned int numa_node)
41297ad18afaSChristoph Hellwig {
41307ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
41317ad18afaSChristoph Hellwig 
41327ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
41337ad18afaSChristoph Hellwig 	return 0;
41347ad18afaSChristoph Hellwig }
41357ad18afaSChristoph Hellwig 
4136f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = {
41377ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
41387ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
41397ad18afaSChristoph Hellwig };
41407ad18afaSChristoph Hellwig 
4141602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4142602adf40SYehuda Sadeh {
4143602adf40SYehuda Sadeh 	struct gendisk *disk;
4144602adf40SYehuda Sadeh 	struct request_queue *q;
4145420efbdfSIlya Dryomov 	unsigned int objset_bytes =
4146420efbdfSIlya Dryomov 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
41477ad18afaSChristoph Hellwig 	int err;
4148602adf40SYehuda Sadeh 
4149602adf40SYehuda Sadeh 	/* create gendisk info */
41507e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
41517e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
41527e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4153602adf40SYehuda Sadeh 	if (!disk)
41541fcdb8aaSAlex Elder 		return -ENOMEM;
4155602adf40SYehuda Sadeh 
4156f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4157de71a297SAlex Elder 		 rbd_dev->dev_id);
4158602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4159dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
41607e513d43SIlya Dryomov 	if (single_major)
41617e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4162602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4163602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4164602adf40SYehuda Sadeh 
41657ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
41667ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4167b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
41687ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
416956d18f62SMing Lei 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
41707ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
41717ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
41727ad18afaSChristoph Hellwig 
41737ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
41747ad18afaSChristoph Hellwig 	if (err)
4175602adf40SYehuda Sadeh 		goto out_disk;
4176029bcbd8SJosh Durgin 
41777ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
41787ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
41797ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
41807ad18afaSChristoph Hellwig 		goto out_tag_set;
41817ad18afaSChristoph Hellwig 	}
41827ad18afaSChristoph Hellwig 
41838b904b5bSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
4184d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4185593a9e7bSAlex Elder 
4186420efbdfSIlya Dryomov 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
41870d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
418821acdf45SIlya Dryomov 	blk_queue_max_segments(q, USHRT_MAX);
418924f1df60SIlya Dryomov 	blk_queue_max_segment_size(q, UINT_MAX);
419016d80c54SIlya Dryomov 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
419116d80c54SIlya Dryomov 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
4192029bcbd8SJosh Durgin 
4193d9360540SIlya Dryomov 	if (rbd_dev->opts->trim) {
41948b904b5bSBart Van Assche 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
419516d80c54SIlya Dryomov 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
4196420efbdfSIlya Dryomov 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4197420efbdfSIlya Dryomov 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4198d9360540SIlya Dryomov 	}
419990e98c52SGuangliang Zhao 
4200bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4201dc3b17ccSJan Kara 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
4202bae818eeSRonny Hegewald 
42035769ed0cSIlya Dryomov 	/*
42045769ed0cSIlya Dryomov 	 * disk_release() expects a queue ref from add_disk() and will
42055769ed0cSIlya Dryomov 	 * put it.  Hold an extra ref until add_disk() is called.
42065769ed0cSIlya Dryomov 	 */
42075769ed0cSIlya Dryomov 	WARN_ON(!blk_get_queue(q));
4208602adf40SYehuda Sadeh 	disk->queue = q;
4209602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4210602adf40SYehuda Sadeh 
4211602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4212602adf40SYehuda Sadeh 
4213602adf40SYehuda Sadeh 	return 0;
42147ad18afaSChristoph Hellwig out_tag_set:
42157ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4216602adf40SYehuda Sadeh out_disk:
4217602adf40SYehuda Sadeh 	put_disk(disk);
42187ad18afaSChristoph Hellwig 	return err;
4219602adf40SYehuda Sadeh }
4220602adf40SYehuda Sadeh 
4221dfc5606dSYehuda Sadeh /*
4222dfc5606dSYehuda Sadeh   sysfs
4223dfc5606dSYehuda Sadeh */
4224602adf40SYehuda Sadeh 
4225593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4226593a9e7bSAlex Elder {
4227593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4228593a9e7bSAlex Elder }
4229593a9e7bSAlex Elder 
4230dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4231dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4232602adf40SYehuda Sadeh {
4233593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4234dfc5606dSYehuda Sadeh 
4235fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4236fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4237602adf40SYehuda Sadeh }
4238602adf40SYehuda Sadeh 
423934b13184SAlex Elder /*
424034b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
424134b13184SAlex Elder  * necessarily the base image.
424234b13184SAlex Elder  */
424334b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
424434b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
424534b13184SAlex Elder {
424634b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
424734b13184SAlex Elder 
424834b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
424934b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
425034b13184SAlex Elder }
425134b13184SAlex Elder 
4252dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4253dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4254602adf40SYehuda Sadeh {
4255593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4256dfc5606dSYehuda Sadeh 
4257fc71d833SAlex Elder 	if (rbd_dev->major)
4258dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4259fc71d833SAlex Elder 
4260fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4261dd82fff1SIlya Dryomov }
4262fc71d833SAlex Elder 
4263dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4264dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4265dd82fff1SIlya Dryomov {
4266dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4267dd82fff1SIlya Dryomov 
4268dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4269dfc5606dSYehuda Sadeh }
4270dfc5606dSYehuda Sadeh 
4271005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4272005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4273005a07bfSIlya Dryomov {
4274005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4275005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4276005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4277005a07bfSIlya Dryomov 
4278005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4279005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4280005a07bfSIlya Dryomov }
4281005a07bfSIlya Dryomov 
4282dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4283dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4284dfc5606dSYehuda Sadeh {
4285593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4286dfc5606dSYehuda Sadeh 
42871dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4288033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4289dfc5606dSYehuda Sadeh }
4290dfc5606dSYehuda Sadeh 
4291267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4292267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4293267fb90bSMike Christie {
4294267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4295267fb90bSMike Christie 
4296267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4297267fb90bSMike Christie }
4298267fb90bSMike Christie 
42990d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
43000d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
43010d6d1e9cSMike Christie {
43020d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
43030d6d1e9cSMike Christie 
43040d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
4305dfc5606dSYehuda Sadeh }
4306dfc5606dSYehuda Sadeh 
4307dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4308dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4309dfc5606dSYehuda Sadeh {
4310593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4311dfc5606dSYehuda Sadeh 
43120d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4313dfc5606dSYehuda Sadeh }
4314dfc5606dSYehuda Sadeh 
43159bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
43169bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
43179bb2f334SAlex Elder {
43189bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
43199bb2f334SAlex Elder 
43200d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
43210d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
43229bb2f334SAlex Elder }
43239bb2f334SAlex Elder 
4324b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
4325b26c047bSIlya Dryomov 				struct device_attribute *attr, char *buf)
4326b26c047bSIlya Dryomov {
4327b26c047bSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4328b26c047bSIlya Dryomov 
4329b26c047bSIlya Dryomov 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4330b26c047bSIlya Dryomov }
4331b26c047bSIlya Dryomov 
4332dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4333dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4334dfc5606dSYehuda Sadeh {
4335593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4336dfc5606dSYehuda Sadeh 
4337a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
43380d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4339a92ffdf8SAlex Elder 
4340a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4341dfc5606dSYehuda Sadeh }
4342dfc5606dSYehuda Sadeh 
4343589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4344589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4345589d30e0SAlex Elder {
4346589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4347589d30e0SAlex Elder 
43480d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4349589d30e0SAlex Elder }
4350589d30e0SAlex Elder 
435134b13184SAlex Elder /*
435234b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
435334b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
435434b13184SAlex Elder  */
4355dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4356dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4357dfc5606dSYehuda Sadeh 			     char *buf)
4358dfc5606dSYehuda Sadeh {
4359593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4360dfc5606dSYehuda Sadeh 
43610d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4362dfc5606dSYehuda Sadeh }
4363dfc5606dSYehuda Sadeh 
436492a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
436592a58671SMike Christie 				struct device_attribute *attr, char *buf)
436692a58671SMike Christie {
436792a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
436892a58671SMike Christie 
436992a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
437092a58671SMike Christie }
437192a58671SMike Christie 
437286b00e0dSAlex Elder /*
4373ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4374ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4375ff96128fSIlya Dryomov  * image)".
437686b00e0dSAlex Elder  */
437786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
437886b00e0dSAlex Elder 			       struct device_attribute *attr,
437986b00e0dSAlex Elder 			       char *buf)
438086b00e0dSAlex Elder {
438186b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4382ff96128fSIlya Dryomov 	ssize_t count = 0;
438386b00e0dSAlex Elder 
4384ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
438586b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
438686b00e0dSAlex Elder 
4387ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4388ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
438986b00e0dSAlex Elder 
4390ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4391ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4392e92c0eafSIlya Dryomov 			    "pool_ns %s\n"
4393ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4394ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4395ff96128fSIlya Dryomov 			    "overlap %llu\n",
4396ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4397ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4398e92c0eafSIlya Dryomov 			    spec->pool_ns ?: "",
4399ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4400ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4401ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4402ff96128fSIlya Dryomov 	}
440386b00e0dSAlex Elder 
440486b00e0dSAlex Elder 	return count;
440586b00e0dSAlex Elder }
440686b00e0dSAlex Elder 
4407dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4408dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4409dfc5606dSYehuda Sadeh 				 const char *buf,
4410dfc5606dSYehuda Sadeh 				 size_t size)
4411dfc5606dSYehuda Sadeh {
4412593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4413b813623aSAlex Elder 	int ret;
4414602adf40SYehuda Sadeh 
4415cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4416e627db08SAlex Elder 	if (ret)
441752bb1f9bSIlya Dryomov 		return ret;
4418b813623aSAlex Elder 
441952bb1f9bSIlya Dryomov 	return size;
4420dfc5606dSYehuda Sadeh }
4421602adf40SYehuda Sadeh 
44225657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
44235657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
44245657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
44255657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
44265657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
44275657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
44285657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
44295657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
44305657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
44315657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
4432b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
44335657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
44345657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
44355657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
44365657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
44375657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
44385657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
4439dfc5606dSYehuda Sadeh 
4440dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4441dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
444234b13184SAlex Elder 	&dev_attr_features.attr,
4443dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4444dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4445005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4446dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4447267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
44480d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4449dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
44509bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4451b26c047bSIlya Dryomov 	&dev_attr_pool_ns.attr,
4452dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4453589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4454dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
445592a58671SMike Christie 	&dev_attr_snap_id.attr,
445686b00e0dSAlex Elder 	&dev_attr_parent.attr,
4457dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4458dfc5606dSYehuda Sadeh 	NULL
4459dfc5606dSYehuda Sadeh };
4460dfc5606dSYehuda Sadeh 
4461dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4462dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4463dfc5606dSYehuda Sadeh };
4464dfc5606dSYehuda Sadeh 
4465dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4466dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4467dfc5606dSYehuda Sadeh 	NULL
4468dfc5606dSYehuda Sadeh };
4469dfc5606dSYehuda Sadeh 
44706cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4471dfc5606dSYehuda Sadeh 
4472b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
4473dfc5606dSYehuda Sadeh 	.name		= "rbd",
4474dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
44756cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4476dfc5606dSYehuda Sadeh };
4477dfc5606dSYehuda Sadeh 
44788b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
44798b8fb99cSAlex Elder {
44808b8fb99cSAlex Elder 	kref_get(&spec->kref);
44818b8fb99cSAlex Elder 
44828b8fb99cSAlex Elder 	return spec;
44838b8fb99cSAlex Elder }
44848b8fb99cSAlex Elder 
44858b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
44868b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
44878b8fb99cSAlex Elder {
44888b8fb99cSAlex Elder 	if (spec)
44898b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
44908b8fb99cSAlex Elder }
44918b8fb99cSAlex Elder 
44928b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
44938b8fb99cSAlex Elder {
44948b8fb99cSAlex Elder 	struct rbd_spec *spec;
44958b8fb99cSAlex Elder 
44968b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
44978b8fb99cSAlex Elder 	if (!spec)
44988b8fb99cSAlex Elder 		return NULL;
449904077599SIlya Dryomov 
450004077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
450104077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
45028b8fb99cSAlex Elder 	kref_init(&spec->kref);
45038b8fb99cSAlex Elder 
45048b8fb99cSAlex Elder 	return spec;
45058b8fb99cSAlex Elder }
45068b8fb99cSAlex Elder 
45078b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
45088b8fb99cSAlex Elder {
45098b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
45108b8fb99cSAlex Elder 
45118b8fb99cSAlex Elder 	kfree(spec->pool_name);
4512b26c047bSIlya Dryomov 	kfree(spec->pool_ns);
45138b8fb99cSAlex Elder 	kfree(spec->image_id);
45148b8fb99cSAlex Elder 	kfree(spec->image_name);
45158b8fb99cSAlex Elder 	kfree(spec->snap_name);
45168b8fb99cSAlex Elder 	kfree(spec);
45178b8fb99cSAlex Elder }
45188b8fb99cSAlex Elder 
45191643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4520dd5ac32dSIlya Dryomov {
452199d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4522ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4523dd5ac32dSIlya Dryomov 
4524c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
45256b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
45260d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4527c41d13a3SIlya Dryomov 
4528dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4529dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4530dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4531dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
45321643dfa4SIlya Dryomov }
45331643dfa4SIlya Dryomov 
45341643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
45351643dfa4SIlya Dryomov {
45361643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
45371643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
45381643dfa4SIlya Dryomov 
45391643dfa4SIlya Dryomov 	if (need_put) {
45401643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
45411643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
45421643dfa4SIlya Dryomov 	}
45431643dfa4SIlya Dryomov 
45441643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4545dd5ac32dSIlya Dryomov 
4546dd5ac32dSIlya Dryomov 	/*
4547dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4548dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4549dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4550dd5ac32dSIlya Dryomov 	 */
4551dd5ac32dSIlya Dryomov 	if (need_put)
4552dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4553dd5ac32dSIlya Dryomov }
4554dd5ac32dSIlya Dryomov 
45551643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
45561643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4557c53d5893SAlex Elder {
4558c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4559c53d5893SAlex Elder 
4560c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4561c53d5893SAlex Elder 	if (!rbd_dev)
4562c53d5893SAlex Elder 		return NULL;
4563c53d5893SAlex Elder 
4564c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4565c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4566c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4567c53d5893SAlex Elder 
45687e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4569c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4570431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
4571b26c047bSIlya Dryomov 	if (spec->pool_ns) {
4572b26c047bSIlya Dryomov 		WARN_ON(!*spec->pool_ns);
4573b26c047bSIlya Dryomov 		rbd_dev->header_oloc.pool_ns =
4574b26c047bSIlya Dryomov 		    ceph_find_or_create_string(spec->pool_ns,
4575b26c047bSIlya Dryomov 					       strlen(spec->pool_ns));
4576b26c047bSIlya Dryomov 	}
4577c41d13a3SIlya Dryomov 
457899d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
457999d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
458099d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
458199d16943SIlya Dryomov 
4582ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4583ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4584ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4585ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4586ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4587ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4588ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4589ed95b21aSIlya Dryomov 
4590dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4591dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4592dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4593dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4594dd5ac32dSIlya Dryomov 
4595c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4596d147543dSIlya Dryomov 	rbd_dev->spec = spec;
45970903e875SAlex Elder 
45981643dfa4SIlya Dryomov 	return rbd_dev;
45991643dfa4SIlya Dryomov }
46001643dfa4SIlya Dryomov 
4601dd5ac32dSIlya Dryomov /*
46021643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4603dd5ac32dSIlya Dryomov  */
46041643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
46051643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
46061643dfa4SIlya Dryomov 					 struct rbd_options *opts)
46071643dfa4SIlya Dryomov {
46081643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
46091643dfa4SIlya Dryomov 
46101643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
46111643dfa4SIlya Dryomov 	if (!rbd_dev)
46121643dfa4SIlya Dryomov 		return NULL;
46131643dfa4SIlya Dryomov 
46141643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
46151643dfa4SIlya Dryomov 
46161643dfa4SIlya Dryomov 	/* get an id and fill in device name */
46171643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
46181643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
46191643dfa4SIlya Dryomov 					 GFP_KERNEL);
46201643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
46211643dfa4SIlya Dryomov 		goto fail_rbd_dev;
46221643dfa4SIlya Dryomov 
46231643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
46241643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
46251643dfa4SIlya Dryomov 						   rbd_dev->name);
46261643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
46271643dfa4SIlya Dryomov 		goto fail_dev_id;
46281643dfa4SIlya Dryomov 
46291643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4630dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4631dd5ac32dSIlya Dryomov 
46321643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4633c53d5893SAlex Elder 	return rbd_dev;
46341643dfa4SIlya Dryomov 
46351643dfa4SIlya Dryomov fail_dev_id:
46361643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
46371643dfa4SIlya Dryomov fail_rbd_dev:
46381643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
46391643dfa4SIlya Dryomov 	return NULL;
4640c53d5893SAlex Elder }
4641c53d5893SAlex Elder 
4642c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4643c53d5893SAlex Elder {
4644dd5ac32dSIlya Dryomov 	if (rbd_dev)
4645dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4646c53d5893SAlex Elder }
4647c53d5893SAlex Elder 
4648dfc5606dSYehuda Sadeh /*
46499d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
46509d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
46519d475de5SAlex Elder  * image.
46529d475de5SAlex Elder  */
46539d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
46549d475de5SAlex Elder 				u8 *order, u64 *snap_size)
46559d475de5SAlex Elder {
46569d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
46579d475de5SAlex Elder 	int ret;
46589d475de5SAlex Elder 	struct {
46599d475de5SAlex Elder 		u8 order;
46609d475de5SAlex Elder 		__le64 size;
46619d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
46629d475de5SAlex Elder 
4663ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4664ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
46654157976bSAlex Elder 				  &snapid, sizeof(snapid),
4666e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
466736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
46689d475de5SAlex Elder 	if (ret < 0)
46699d475de5SAlex Elder 		return ret;
467057385b51SAlex Elder 	if (ret < sizeof (size_buf))
467157385b51SAlex Elder 		return -ERANGE;
46729d475de5SAlex Elder 
4673c3545579SJosh Durgin 	if (order) {
46749d475de5SAlex Elder 		*order = size_buf.order;
4675c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4676c3545579SJosh Durgin 	}
46779d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
46789d475de5SAlex Elder 
4679c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4680c3545579SJosh Durgin 		(unsigned long long)snap_id,
46819d475de5SAlex Elder 		(unsigned long long)*snap_size);
46829d475de5SAlex Elder 
46839d475de5SAlex Elder 	return 0;
46849d475de5SAlex Elder }
46859d475de5SAlex Elder 
46869d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
46879d475de5SAlex Elder {
46889d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
46899d475de5SAlex Elder 					&rbd_dev->header.obj_order,
46909d475de5SAlex Elder 					&rbd_dev->header.image_size);
46919d475de5SAlex Elder }
46929d475de5SAlex Elder 
46931e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
46941e130199SAlex Elder {
46951e130199SAlex Elder 	void *reply_buf;
46961e130199SAlex Elder 	int ret;
46971e130199SAlex Elder 	void *p;
46981e130199SAlex Elder 
46991e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
47001e130199SAlex Elder 	if (!reply_buf)
47011e130199SAlex Elder 		return -ENOMEM;
47021e130199SAlex Elder 
4703ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4704ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
4705ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
470636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
47071e130199SAlex Elder 	if (ret < 0)
47081e130199SAlex Elder 		goto out;
47091e130199SAlex Elder 
47101e130199SAlex Elder 	p = reply_buf;
47111e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
471257385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
471357385b51SAlex Elder 	ret = 0;
47141e130199SAlex Elder 
47151e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
47161e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
47171e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
47181e130199SAlex Elder 	} else {
47191e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
47201e130199SAlex Elder 	}
47211e130199SAlex Elder out:
47221e130199SAlex Elder 	kfree(reply_buf);
47231e130199SAlex Elder 
47241e130199SAlex Elder 	return ret;
47251e130199SAlex Elder }
47261e130199SAlex Elder 
4727b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4728b1b5402aSAlex Elder 		u64 *snap_features)
4729b1b5402aSAlex Elder {
4730b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4731b1b5402aSAlex Elder 	struct {
4732b1b5402aSAlex Elder 		__le64 features;
4733b1b5402aSAlex Elder 		__le64 incompat;
47344157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4735d3767f0fSIlya Dryomov 	u64 unsup;
4736b1b5402aSAlex Elder 	int ret;
4737b1b5402aSAlex Elder 
4738ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4739ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
47404157976bSAlex Elder 				  &snapid, sizeof(snapid),
4741e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
474236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4743b1b5402aSAlex Elder 	if (ret < 0)
4744b1b5402aSAlex Elder 		return ret;
474557385b51SAlex Elder 	if (ret < sizeof (features_buf))
474657385b51SAlex Elder 		return -ERANGE;
4747d889140cSAlex Elder 
4748d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4749d3767f0fSIlya Dryomov 	if (unsup) {
4750d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4751d3767f0fSIlya Dryomov 			 unsup);
4752b8f5c6edSAlex Elder 		return -ENXIO;
4753d3767f0fSIlya Dryomov 	}
4754d889140cSAlex Elder 
4755b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4756b1b5402aSAlex Elder 
4757b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4758b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4759b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4760b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4761b1b5402aSAlex Elder 
4762b1b5402aSAlex Elder 	return 0;
4763b1b5402aSAlex Elder }
4764b1b5402aSAlex Elder 
4765b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4766b1b5402aSAlex Elder {
4767b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4768b1b5402aSAlex Elder 						&rbd_dev->header.features);
4769b1b5402aSAlex Elder }
4770b1b5402aSAlex Elder 
4771eb3b2d6bSIlya Dryomov struct parent_image_info {
4772eb3b2d6bSIlya Dryomov 	u64		pool_id;
4773e92c0eafSIlya Dryomov 	const char	*pool_ns;
4774eb3b2d6bSIlya Dryomov 	const char	*image_id;
4775eb3b2d6bSIlya Dryomov 	u64		snap_id;
4776eb3b2d6bSIlya Dryomov 
4777e92c0eafSIlya Dryomov 	bool		has_overlap;
4778eb3b2d6bSIlya Dryomov 	u64		overlap;
4779eb3b2d6bSIlya Dryomov };
4780eb3b2d6bSIlya Dryomov 
4781eb3b2d6bSIlya Dryomov /*
4782eb3b2d6bSIlya Dryomov  * The caller is responsible for @pii.
4783eb3b2d6bSIlya Dryomov  */
4784e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
4785e92c0eafSIlya Dryomov 				    struct parent_image_info *pii)
4786e92c0eafSIlya Dryomov {
4787e92c0eafSIlya Dryomov 	u8 struct_v;
4788e92c0eafSIlya Dryomov 	u32 struct_len;
4789e92c0eafSIlya Dryomov 	int ret;
4790e92c0eafSIlya Dryomov 
4791e92c0eafSIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4792e92c0eafSIlya Dryomov 				  &struct_v, &struct_len);
4793e92c0eafSIlya Dryomov 	if (ret)
4794e92c0eafSIlya Dryomov 		return ret;
4795e92c0eafSIlya Dryomov 
4796e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4797e92c0eafSIlya Dryomov 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4798e92c0eafSIlya Dryomov 	if (IS_ERR(pii->pool_ns)) {
4799e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->pool_ns);
4800e92c0eafSIlya Dryomov 		pii->pool_ns = NULL;
4801e92c0eafSIlya Dryomov 		return ret;
4802e92c0eafSIlya Dryomov 	}
4803e92c0eafSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4804e92c0eafSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
4805e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
4806e92c0eafSIlya Dryomov 		pii->image_id = NULL;
4807e92c0eafSIlya Dryomov 		return ret;
4808e92c0eafSIlya Dryomov 	}
4809e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4810e92c0eafSIlya Dryomov 	return 0;
4811e92c0eafSIlya Dryomov 
4812e92c0eafSIlya Dryomov e_inval:
4813e92c0eafSIlya Dryomov 	return -EINVAL;
4814e92c0eafSIlya Dryomov }
4815e92c0eafSIlya Dryomov 
4816e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
4817e92c0eafSIlya Dryomov 			     struct page *req_page,
4818e92c0eafSIlya Dryomov 			     struct page *reply_page,
4819e92c0eafSIlya Dryomov 			     struct parent_image_info *pii)
4820e92c0eafSIlya Dryomov {
4821e92c0eafSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4822e92c0eafSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
4823e92c0eafSIlya Dryomov 	void *p, *end;
4824e92c0eafSIlya Dryomov 	int ret;
4825e92c0eafSIlya Dryomov 
4826e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4827e92c0eafSIlya Dryomov 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4828e92c0eafSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4829e92c0eafSIlya Dryomov 	if (ret)
4830e92c0eafSIlya Dryomov 		return ret == -EOPNOTSUPP ? 1 : ret;
4831e92c0eafSIlya Dryomov 
4832e92c0eafSIlya Dryomov 	p = page_address(reply_page);
4833e92c0eafSIlya Dryomov 	end = p + reply_len;
4834e92c0eafSIlya Dryomov 	ret = decode_parent_image_spec(&p, end, pii);
4835e92c0eafSIlya Dryomov 	if (ret)
4836e92c0eafSIlya Dryomov 		return ret;
4837e92c0eafSIlya Dryomov 
4838e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4839e92c0eafSIlya Dryomov 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4840e92c0eafSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4841e92c0eafSIlya Dryomov 	if (ret)
4842e92c0eafSIlya Dryomov 		return ret;
4843e92c0eafSIlya Dryomov 
4844e92c0eafSIlya Dryomov 	p = page_address(reply_page);
4845e92c0eafSIlya Dryomov 	end = p + reply_len;
4846e92c0eafSIlya Dryomov 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4847e92c0eafSIlya Dryomov 	if (pii->has_overlap)
4848e92c0eafSIlya Dryomov 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4849e92c0eafSIlya Dryomov 
4850e92c0eafSIlya Dryomov 	return 0;
4851e92c0eafSIlya Dryomov 
4852e92c0eafSIlya Dryomov e_inval:
4853e92c0eafSIlya Dryomov 	return -EINVAL;
4854e92c0eafSIlya Dryomov }
4855e92c0eafSIlya Dryomov 
4856e92c0eafSIlya Dryomov /*
4857e92c0eafSIlya Dryomov  * The caller is responsible for @pii.
4858e92c0eafSIlya Dryomov  */
4859eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4860eb3b2d6bSIlya Dryomov 				    struct page *req_page,
4861eb3b2d6bSIlya Dryomov 				    struct page *reply_page,
4862eb3b2d6bSIlya Dryomov 				    struct parent_image_info *pii)
4863eb3b2d6bSIlya Dryomov {
4864eb3b2d6bSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4865eb3b2d6bSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
4866eb3b2d6bSIlya Dryomov 	void *p, *end;
4867eb3b2d6bSIlya Dryomov 	int ret;
4868eb3b2d6bSIlya Dryomov 
4869eb3b2d6bSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4870eb3b2d6bSIlya Dryomov 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4871eb3b2d6bSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4872eb3b2d6bSIlya Dryomov 	if (ret)
4873eb3b2d6bSIlya Dryomov 		return ret;
4874eb3b2d6bSIlya Dryomov 
4875eb3b2d6bSIlya Dryomov 	p = page_address(reply_page);
4876eb3b2d6bSIlya Dryomov 	end = p + reply_len;
4877eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4878eb3b2d6bSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4879eb3b2d6bSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
4880eb3b2d6bSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
4881eb3b2d6bSIlya Dryomov 		pii->image_id = NULL;
4882eb3b2d6bSIlya Dryomov 		return ret;
4883eb3b2d6bSIlya Dryomov 	}
4884eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
4885e92c0eafSIlya Dryomov 	pii->has_overlap = true;
4886eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4887eb3b2d6bSIlya Dryomov 
4888eb3b2d6bSIlya Dryomov 	return 0;
4889eb3b2d6bSIlya Dryomov 
4890eb3b2d6bSIlya Dryomov e_inval:
4891eb3b2d6bSIlya Dryomov 	return -EINVAL;
4892eb3b2d6bSIlya Dryomov }
4893eb3b2d6bSIlya Dryomov 
4894eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev,
4895eb3b2d6bSIlya Dryomov 			   struct parent_image_info *pii)
4896eb3b2d6bSIlya Dryomov {
4897eb3b2d6bSIlya Dryomov 	struct page *req_page, *reply_page;
4898eb3b2d6bSIlya Dryomov 	void *p;
4899eb3b2d6bSIlya Dryomov 	int ret;
4900eb3b2d6bSIlya Dryomov 
4901eb3b2d6bSIlya Dryomov 	req_page = alloc_page(GFP_KERNEL);
4902eb3b2d6bSIlya Dryomov 	if (!req_page)
4903eb3b2d6bSIlya Dryomov 		return -ENOMEM;
4904eb3b2d6bSIlya Dryomov 
4905eb3b2d6bSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
4906eb3b2d6bSIlya Dryomov 	if (!reply_page) {
4907eb3b2d6bSIlya Dryomov 		__free_page(req_page);
4908eb3b2d6bSIlya Dryomov 		return -ENOMEM;
4909eb3b2d6bSIlya Dryomov 	}
4910eb3b2d6bSIlya Dryomov 
4911eb3b2d6bSIlya Dryomov 	p = page_address(req_page);
4912eb3b2d6bSIlya Dryomov 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
4913e92c0eafSIlya Dryomov 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
4914e92c0eafSIlya Dryomov 	if (ret > 0)
4915e92c0eafSIlya Dryomov 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
4916e92c0eafSIlya Dryomov 					       pii);
4917eb3b2d6bSIlya Dryomov 
4918eb3b2d6bSIlya Dryomov 	__free_page(req_page);
4919eb3b2d6bSIlya Dryomov 	__free_page(reply_page);
4920eb3b2d6bSIlya Dryomov 	return ret;
4921eb3b2d6bSIlya Dryomov }
4922eb3b2d6bSIlya Dryomov 
492386b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
492486b00e0dSAlex Elder {
492586b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
4926eb3b2d6bSIlya Dryomov 	struct parent_image_info pii = { 0 };
492786b00e0dSAlex Elder 	int ret;
492886b00e0dSAlex Elder 
492986b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
493086b00e0dSAlex Elder 	if (!parent_spec)
493186b00e0dSAlex Elder 		return -ENOMEM;
493286b00e0dSAlex Elder 
4933eb3b2d6bSIlya Dryomov 	ret = get_parent_info(rbd_dev, &pii);
4934eb3b2d6bSIlya Dryomov 	if (ret)
493586b00e0dSAlex Elder 		goto out_err;
493686b00e0dSAlex Elder 
4937e92c0eafSIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
4938e92c0eafSIlya Dryomov 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
4939e92c0eafSIlya Dryomov 	     pii.has_overlap, pii.overlap);
4940eb3b2d6bSIlya Dryomov 
4941e92c0eafSIlya Dryomov 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
4942392a9dadSAlex Elder 		/*
4943392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4944392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4945392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4946392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4947392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4948392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4949392a9dadSAlex Elder 		 * parent.
4950e92c0eafSIlya Dryomov 		 *
4951e92c0eafSIlya Dryomov 		 * If !pii.has_overlap, the parent image spec is not
4952e92c0eafSIlya Dryomov 		 * applicable.  It's there to avoid duplication in each
4953e92c0eafSIlya Dryomov 		 * snapshot record.
4954392a9dadSAlex Elder 		 */
4955392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4956392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4957392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4958392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4959392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4960392a9dadSAlex Elder 		}
4961392a9dadSAlex Elder 
496286b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4963392a9dadSAlex Elder 	}
496486b00e0dSAlex Elder 
49650903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
49660903e875SAlex Elder 
49670903e875SAlex Elder 	ret = -EIO;
4968eb3b2d6bSIlya Dryomov 	if (pii.pool_id > (u64)U32_MAX) {
49699584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4970eb3b2d6bSIlya Dryomov 			(unsigned long long)pii.pool_id, U32_MAX);
497157385b51SAlex Elder 		goto out_err;
4972c0cd10dbSAlex Elder 	}
49730903e875SAlex Elder 
49743b5cf2a2SAlex Elder 	/*
49753b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
49763b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
49773b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
49783b5cf2a2SAlex Elder 	 */
49793b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
4980eb3b2d6bSIlya Dryomov 		parent_spec->pool_id = pii.pool_id;
4981e92c0eafSIlya Dryomov 		if (pii.pool_ns && *pii.pool_ns) {
4982e92c0eafSIlya Dryomov 			parent_spec->pool_ns = pii.pool_ns;
4983e92c0eafSIlya Dryomov 			pii.pool_ns = NULL;
4984e92c0eafSIlya Dryomov 		}
4985eb3b2d6bSIlya Dryomov 		parent_spec->image_id = pii.image_id;
4986eb3b2d6bSIlya Dryomov 		pii.image_id = NULL;
4987eb3b2d6bSIlya Dryomov 		parent_spec->snap_id = pii.snap_id;
4988b26c047bSIlya Dryomov 
498986b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
499086b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
49913b5cf2a2SAlex Elder 	}
49923b5cf2a2SAlex Elder 
49933b5cf2a2SAlex Elder 	/*
4994cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
4995cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
49963b5cf2a2SAlex Elder 	 */
4997eb3b2d6bSIlya Dryomov 	if (!pii.overlap) {
49983b5cf2a2SAlex Elder 		if (parent_spec) {
4999cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5000cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5001cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5002cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
500370cf49cfSAlex Elder 		} else {
5004cf32bd9cSIlya Dryomov 			/* initial probe */
5005cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
50063b5cf2a2SAlex Elder 		}
500770cf49cfSAlex Elder 	}
5008eb3b2d6bSIlya Dryomov 	rbd_dev->parent_overlap = pii.overlap;
5009cf32bd9cSIlya Dryomov 
501086b00e0dSAlex Elder out:
501186b00e0dSAlex Elder 	ret = 0;
501286b00e0dSAlex Elder out_err:
5013e92c0eafSIlya Dryomov 	kfree(pii.pool_ns);
5014eb3b2d6bSIlya Dryomov 	kfree(pii.image_id);
501586b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
501686b00e0dSAlex Elder 	return ret;
501786b00e0dSAlex Elder }
501886b00e0dSAlex Elder 
5019cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5020cc070d59SAlex Elder {
5021cc070d59SAlex Elder 	struct {
5022cc070d59SAlex Elder 		__le64 stripe_unit;
5023cc070d59SAlex Elder 		__le64 stripe_count;
5024cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5025cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5026cc070d59SAlex Elder 	void *p;
5027cc070d59SAlex Elder 	int ret;
5028cc070d59SAlex Elder 
5029ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5030ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5031ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
5032cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5033cc070d59SAlex Elder 	if (ret < 0)
5034cc070d59SAlex Elder 		return ret;
5035cc070d59SAlex Elder 	if (ret < size)
5036cc070d59SAlex Elder 		return -ERANGE;
5037cc070d59SAlex Elder 
5038cc070d59SAlex Elder 	p = &striping_info_buf;
5039b1331852SIlya Dryomov 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5040b1331852SIlya Dryomov 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
5041cc070d59SAlex Elder 	return 0;
5042cc070d59SAlex Elder }
5043cc070d59SAlex Elder 
50447e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
50457e97332eSIlya Dryomov {
50467e97332eSIlya Dryomov 	__le64 data_pool_id;
50477e97332eSIlya Dryomov 	int ret;
50487e97332eSIlya Dryomov 
50497e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
50507e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
50517e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
50527e97332eSIlya Dryomov 	if (ret < 0)
50537e97332eSIlya Dryomov 		return ret;
50547e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
50557e97332eSIlya Dryomov 		return -EBADMSG;
50567e97332eSIlya Dryomov 
50577e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
50587e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
50597e97332eSIlya Dryomov 	return 0;
50607e97332eSIlya Dryomov }
50617e97332eSIlya Dryomov 
50629e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
50639e15b77dSAlex Elder {
5064ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
50659e15b77dSAlex Elder 	size_t image_id_size;
50669e15b77dSAlex Elder 	char *image_id;
50679e15b77dSAlex Elder 	void *p;
50689e15b77dSAlex Elder 	void *end;
50699e15b77dSAlex Elder 	size_t size;
50709e15b77dSAlex Elder 	void *reply_buf = NULL;
50719e15b77dSAlex Elder 	size_t len = 0;
50729e15b77dSAlex Elder 	char *image_name = NULL;
50739e15b77dSAlex Elder 	int ret;
50749e15b77dSAlex Elder 
50759e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
50769e15b77dSAlex Elder 
507769e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
507869e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
50799e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
50809e15b77dSAlex Elder 	if (!image_id)
50819e15b77dSAlex Elder 		return NULL;
50829e15b77dSAlex Elder 
50839e15b77dSAlex Elder 	p = image_id;
50844157976bSAlex Elder 	end = image_id + image_id_size;
508569e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
50869e15b77dSAlex Elder 
50879e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
50889e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
50899e15b77dSAlex Elder 	if (!reply_buf)
50909e15b77dSAlex Elder 		goto out;
50919e15b77dSAlex Elder 
5092ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5093ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5094ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5095e2a58ee5SAlex Elder 				  reply_buf, size);
50969e15b77dSAlex Elder 	if (ret < 0)
50979e15b77dSAlex Elder 		goto out;
50989e15b77dSAlex Elder 	p = reply_buf;
5099f40eb349SAlex Elder 	end = reply_buf + ret;
5100f40eb349SAlex Elder 
51019e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
51029e15b77dSAlex Elder 	if (IS_ERR(image_name))
51039e15b77dSAlex Elder 		image_name = NULL;
51049e15b77dSAlex Elder 	else
51059e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
51069e15b77dSAlex Elder out:
51079e15b77dSAlex Elder 	kfree(reply_buf);
51089e15b77dSAlex Elder 	kfree(image_id);
51099e15b77dSAlex Elder 
51109e15b77dSAlex Elder 	return image_name;
51119e15b77dSAlex Elder }
51129e15b77dSAlex Elder 
51132ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51142ad3d716SAlex Elder {
51152ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51162ad3d716SAlex Elder 	const char *snap_name;
51172ad3d716SAlex Elder 	u32 which = 0;
51182ad3d716SAlex Elder 
51192ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
51202ad3d716SAlex Elder 
51212ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
51222ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
51232ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
51242ad3d716SAlex Elder 			return snapc->snaps[which];
51252ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
51262ad3d716SAlex Elder 		which++;
51272ad3d716SAlex Elder 	}
51282ad3d716SAlex Elder 	return CEPH_NOSNAP;
51292ad3d716SAlex Elder }
51302ad3d716SAlex Elder 
51312ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51322ad3d716SAlex Elder {
51332ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51342ad3d716SAlex Elder 	u32 which;
51352ad3d716SAlex Elder 	bool found = false;
51362ad3d716SAlex Elder 	u64 snap_id;
51372ad3d716SAlex Elder 
51382ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
51392ad3d716SAlex Elder 		const char *snap_name;
51402ad3d716SAlex Elder 
51412ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
51422ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5143efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5144efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5145efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5146efadc98aSJosh Durgin 				continue;
5147efadc98aSJosh Durgin 			else
51482ad3d716SAlex Elder 				break;
5149efadc98aSJosh Durgin 		}
51502ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
51512ad3d716SAlex Elder 		kfree(snap_name);
51522ad3d716SAlex Elder 	}
51532ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
51542ad3d716SAlex Elder }
51552ad3d716SAlex Elder 
51562ad3d716SAlex Elder /*
51572ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
51582ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
51592ad3d716SAlex Elder  */
51602ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51612ad3d716SAlex Elder {
51622ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
51632ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
51642ad3d716SAlex Elder 
51652ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
51662ad3d716SAlex Elder }
51672ad3d716SAlex Elder 
51689e15b77dSAlex Elder /*
516904077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
51709e15b77dSAlex Elder  */
517104077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
517204077599SIlya Dryomov {
517304077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
517404077599SIlya Dryomov 
517504077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
517604077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
517704077599SIlya Dryomov 	rbd_assert(spec->snap_name);
517804077599SIlya Dryomov 
517904077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
518004077599SIlya Dryomov 		u64 snap_id;
518104077599SIlya Dryomov 
518204077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
518304077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
518404077599SIlya Dryomov 			return -ENOENT;
518504077599SIlya Dryomov 
518604077599SIlya Dryomov 		spec->snap_id = snap_id;
518704077599SIlya Dryomov 	} else {
518804077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
518904077599SIlya Dryomov 	}
519004077599SIlya Dryomov 
519104077599SIlya Dryomov 	return 0;
519204077599SIlya Dryomov }
519304077599SIlya Dryomov 
519404077599SIlya Dryomov /*
519504077599SIlya Dryomov  * A parent image will have all ids but none of the names.
519604077599SIlya Dryomov  *
519704077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
519804077599SIlya Dryomov  * can't figure out the name for an image id.
519904077599SIlya Dryomov  */
520004077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
52019e15b77dSAlex Elder {
52022e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
52032e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
52042e9f7f1cSAlex Elder 	const char *pool_name;
52052e9f7f1cSAlex Elder 	const char *image_name;
52062e9f7f1cSAlex Elder 	const char *snap_name;
52079e15b77dSAlex Elder 	int ret;
52089e15b77dSAlex Elder 
520904077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
521004077599SIlya Dryomov 	rbd_assert(spec->image_id);
521104077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
52129e15b77dSAlex Elder 
52132e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
52149e15b77dSAlex Elder 
52152e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
52162e9f7f1cSAlex Elder 	if (!pool_name) {
52172e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5218935dc89fSAlex Elder 		return -EIO;
5219935dc89fSAlex Elder 	}
52202e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
52212e9f7f1cSAlex Elder 	if (!pool_name)
52229e15b77dSAlex Elder 		return -ENOMEM;
52239e15b77dSAlex Elder 
52249e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
52259e15b77dSAlex Elder 
52262e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
52272e9f7f1cSAlex Elder 	if (!image_name)
522806ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
52299e15b77dSAlex Elder 
523004077599SIlya Dryomov 	/* Fetch the snapshot name */
52319e15b77dSAlex Elder 
52322e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5233da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5234da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
52359e15b77dSAlex Elder 		goto out_err;
52362e9f7f1cSAlex Elder 	}
52372e9f7f1cSAlex Elder 
52382e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
52392e9f7f1cSAlex Elder 	spec->image_name = image_name;
52402e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
52419e15b77dSAlex Elder 
52429e15b77dSAlex Elder 	return 0;
524304077599SIlya Dryomov 
52449e15b77dSAlex Elder out_err:
52452e9f7f1cSAlex Elder 	kfree(image_name);
52462e9f7f1cSAlex Elder 	kfree(pool_name);
52479e15b77dSAlex Elder 	return ret;
52489e15b77dSAlex Elder }
52499e15b77dSAlex Elder 
/*
 * Fetch the snapshot context of a format 2 image via the
 * "get_snapcontext" class method on the header object, build a new
 * ceph_snap_context from the reply, and install it in
 * rbd_dev->header.snapc (dropping the reference on the previous one).
 *
 * Returns 0 on success; -ENOMEM on allocation failure; the negative
 * error from the method call; -ERANGE if the reply is too short for
 * the values it claims to hold; -EINVAL if the reported snapshot count
 * is too large for the context size to be representable in a size_t.
 * On failure rbd_dev->header.snapc is left untouched.
 */
static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* A non-negative ret is used as the number of valid reply bytes */
	p = reply_buf;
	end = reply_buf + ret;
	/*
	 * Preset the error for a truncated reply: the *_safe decoders
	 * are handed the "out" label (presumably taken when too few
	 * bytes remain before end -- confirm against the macro).
	 */
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	/* ret is still -ERANGE if the id array does not fit in the reply */
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	/* Room for all ids was verified above, so decode unchecked */
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	/* Swap in the new context, releasing our hold on the old one */
	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout("  snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
532135d489f9SAlex Elder 
532254cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
532354cac61fSAlex Elder 					u64 snap_id)
5324b8b1e2dbSAlex Elder {
5325b8b1e2dbSAlex Elder 	size_t size;
5326b8b1e2dbSAlex Elder 	void *reply_buf;
532754cac61fSAlex Elder 	__le64 snapid;
5328b8b1e2dbSAlex Elder 	int ret;
5329b8b1e2dbSAlex Elder 	void *p;
5330b8b1e2dbSAlex Elder 	void *end;
5331b8b1e2dbSAlex Elder 	char *snap_name;
5332b8b1e2dbSAlex Elder 
5333b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5334b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5335b8b1e2dbSAlex Elder 	if (!reply_buf)
5336b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5337b8b1e2dbSAlex Elder 
533854cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5339ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5340ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
5341ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
534236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5343f40eb349SAlex Elder 	if (ret < 0) {
5344f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5345b8b1e2dbSAlex Elder 		goto out;
5346f40eb349SAlex Elder 	}
5347b8b1e2dbSAlex Elder 
5348b8b1e2dbSAlex Elder 	p = reply_buf;
5349f40eb349SAlex Elder 	end = reply_buf + ret;
5350e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5351f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5352b8b1e2dbSAlex Elder 		goto out;
5353f40eb349SAlex Elder 
5354b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
535554cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5356b8b1e2dbSAlex Elder out:
5357b8b1e2dbSAlex Elder 	kfree(reply_buf);
5358b8b1e2dbSAlex Elder 
5359f40eb349SAlex Elder 	return snap_name;
5360b8b1e2dbSAlex Elder }
5361b8b1e2dbSAlex Elder 
53622df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5363117973fbSAlex Elder {
53642df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5365117973fbSAlex Elder 	int ret;
5366117973fbSAlex Elder 
53671617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
53681617e40cSJosh Durgin 	if (ret)
5369cfbf6377SAlex Elder 		return ret;
53701617e40cSJosh Durgin 
53712df3fac7SAlex Elder 	if (first_time) {
53722df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
53732df3fac7SAlex Elder 		if (ret)
5374cfbf6377SAlex Elder 			return ret;
53752df3fac7SAlex Elder 	}
53762df3fac7SAlex Elder 
5377cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5378d194cd1dSIlya Dryomov 	if (ret && first_time) {
5379d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5380d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5381d194cd1dSIlya Dryomov 	}
5382117973fbSAlex Elder 
5383117973fbSAlex Elder 	return ret;
5384117973fbSAlex Elder }
5385117973fbSAlex Elder 
5386a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5387a720ae09SIlya Dryomov {
5388a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5389a720ae09SIlya Dryomov 
5390a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5391a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5392a720ae09SIlya Dryomov 
5393a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5394a720ae09SIlya Dryomov }
5395a720ae09SIlya Dryomov 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token found there, i.e. the
 * number of consecutive non-white-space characters.  A return of 0
 * means no token was found.
 *
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
5414e28fff26SAlex Elder 
5415e28fff26SAlex Elder /*
5416ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5417ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5418ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5419ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5420ea3352f4SAlex Elder  *
5421ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5422ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5423ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5424ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5425ea3352f4SAlex Elder  *
5426ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5427ea3352f4SAlex Elder  * the end of the found token.
5428ea3352f4SAlex Elder  *
5429ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5430ea3352f4SAlex Elder  */
5431ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5432ea3352f4SAlex Elder {
5433ea3352f4SAlex Elder 	char *dup;
5434ea3352f4SAlex Elder 	size_t len;
5435ea3352f4SAlex Elder 
5436ea3352f4SAlex Elder 	len = next_token(buf);
54374caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5438ea3352f4SAlex Elder 	if (!dup)
5439ea3352f4SAlex Elder 		return NULL;
5440ea3352f4SAlex Elder 	*(dup + len) = '\0';
5441ea3352f4SAlex Elder 	*buf += len;
5442ea3352f4SAlex Elder 
5443ea3352f4SAlex Elder 	if (lenp)
5444ea3352f4SAlex Elder 		*lenp = len;
5445ea3352f4SAlex Elder 
5446ea3352f4SAlex Elder 	return dup;
5447ea3352f4SAlex Elder }
5448ea3352f4SAlex Elder 
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or a negative errno otherwise: -EINVAL if
 * a required token is missing, -ENOMEM on allocation failure,
 * -ENAMETOOLONG if the snapshot name exceeds RBD_MAX_SNAP_NAME_LEN,
 * or the error reported by ceph_parse_options().
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures.  They are written only on success; on failure
 * everything allocated here is released before returning.
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  rbd_spec
 *	Address of an rbd image specification pointer.  Its pool
 *	name, image name and snapshot name are filled in by this
 *	function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name (stored in the spec's snap_name).
 *      If provided, the mapping will present data from the image at
 *      the time that snapshot was created.  The image head is used
 *      if no snapshot name is provided.  Snapshot mappings are
 *      always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct parse_rbd_opts_ctx pctx = { 0 };
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* The monitor list is not copied; it is referenced in place */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the "missing token" exits below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copy len characters plus one byte for the terminator set below */
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	/*
	 * The monitor addresses are passed as a [start, end) range
	 * within the original buffer; parse_rbd_opts_token is handed
	 * pctx as its context.
	 */
	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, &pctx);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand ownership of everything to the caller */
	*ceph_opts = copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* pctx was zero-initialized, so unset members are still NULL here */
	kfree(pctx.opts);
	rbd_spec_put(pctx.spec);
	kfree(options);

	return ret;
}
5599a725f65eSAlex Elder 
5600e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5601e010dd0aSIlya Dryomov {
5602e010dd0aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
5603e010dd0aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
5604e010dd0aSIlya Dryomov 		rbd_unlock(rbd_dev);
5605e010dd0aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
5606e010dd0aSIlya Dryomov }
5607e010dd0aSIlya Dryomov 
5608e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5609e010dd0aSIlya Dryomov {
56102f18d466SIlya Dryomov 	int ret;
56112f18d466SIlya Dryomov 
5612e010dd0aSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5613e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5614e010dd0aSIlya Dryomov 		return -EINVAL;
5615e010dd0aSIlya Dryomov 	}
5616e010dd0aSIlya Dryomov 
5617e010dd0aSIlya Dryomov 	/* FIXME: "rbd map --exclusive" should be in interruptible */
5618e010dd0aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
56192f18d466SIlya Dryomov 	ret = rbd_wait_state_locked(rbd_dev, true);
5620e010dd0aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
56212f18d466SIlya Dryomov 	if (ret) {
5622e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5623e010dd0aSIlya Dryomov 		return -EROFS;
5624e010dd0aSIlya Dryomov 	}
5625e010dd0aSIlya Dryomov 
5626e010dd0aSIlya Dryomov 	return 0;
5627e010dd0aSIlya Dryomov }
5628e010dd0aSIlya Dryomov 
562930ba1f02SIlya Dryomov /*
5630589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5631589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5632589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5633589d30e0SAlex Elder  *
5634589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5635589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5636589d30e0SAlex Elder  * with the supplied name.
5637589d30e0SAlex Elder  *
5638589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5639589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5640589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5641589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5642589d30e0SAlex Elder  */
5643589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5644589d30e0SAlex Elder {
5645589d30e0SAlex Elder 	int ret;
5646589d30e0SAlex Elder 	size_t size;
5647ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
5648589d30e0SAlex Elder 	void *response;
5649c0fba368SAlex Elder 	char *image_id;
56502f82ee54SAlex Elder 
5651589d30e0SAlex Elder 	/*
56522c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
56532c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5654c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5655c0fba368SAlex Elder 	 * do still need to set the image format though.
56562c0d0a10SAlex Elder 	 */
5657c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5658c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5659c0fba368SAlex Elder 
56602c0d0a10SAlex Elder 		return 0;
5661c0fba368SAlex Elder 	}
56622c0d0a10SAlex Elder 
56632c0d0a10SAlex Elder 	/*
5664589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5665589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5666589d30e0SAlex Elder 	 */
5667ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5668ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
5669ecd4a68aSIlya Dryomov 	if (ret)
5670ecd4a68aSIlya Dryomov 		return ret;
5671ecd4a68aSIlya Dryomov 
5672ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
5673589d30e0SAlex Elder 
5674589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5675589d30e0SAlex Elder 
5676589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5677589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5678589d30e0SAlex Elder 	if (!response) {
5679589d30e0SAlex Elder 		ret = -ENOMEM;
5680589d30e0SAlex Elder 		goto out;
5681589d30e0SAlex Elder 	}
5682589d30e0SAlex Elder 
5683c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5684c0fba368SAlex Elder 
5685ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5686ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
5687e2a58ee5SAlex Elder 				  response, RBD_IMAGE_ID_LEN_MAX);
568836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5689c0fba368SAlex Elder 	if (ret == -ENOENT) {
5690c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5691c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5692c0fba368SAlex Elder 		if (!ret)
5693c0fba368SAlex Elder 			rbd_dev->image_format = 1;
56947dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5695c0fba368SAlex Elder 		void *p = response;
5696589d30e0SAlex Elder 
5697c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5698979ed480SAlex Elder 						NULL, GFP_NOIO);
5699461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5700c0fba368SAlex Elder 		if (!ret)
5701c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5702c0fba368SAlex Elder 	}
5703c0fba368SAlex Elder 
5704c0fba368SAlex Elder 	if (!ret) {
5705c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5706c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5707589d30e0SAlex Elder 	}
5708589d30e0SAlex Elder out:
5709589d30e0SAlex Elder 	kfree(response);
5710ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
5711589d30e0SAlex Elder 	return ret;
5712589d30e0SAlex Elder }
5713589d30e0SAlex Elder 
57143abef3b3SAlex Elder /*
57153abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
57163abef3b3SAlex Elder  * call.
57173abef3b3SAlex Elder  */
57186fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
57196fd48b3bSAlex Elder {
57206fd48b3bSAlex Elder 	struct rbd_image_header	*header;
57216fd48b3bSAlex Elder 
5722a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
57236fd48b3bSAlex Elder 
57246fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
57256fd48b3bSAlex Elder 
57266fd48b3bSAlex Elder 	header = &rbd_dev->header;
5727812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
57286fd48b3bSAlex Elder 	kfree(header->snap_sizes);
57296fd48b3bSAlex Elder 	kfree(header->snap_names);
57306fd48b3bSAlex Elder 	kfree(header->object_prefix);
57316fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
57326fd48b3bSAlex Elder }
57336fd48b3bSAlex Elder 
57342df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5735a30b71b9SAlex Elder {
5736a30b71b9SAlex Elder 	int ret;
5737a30b71b9SAlex Elder 
57381e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
573957385b51SAlex Elder 	if (ret)
57401e130199SAlex Elder 		goto out_err;
5741b1b5402aSAlex Elder 
57422df3fac7SAlex Elder 	/*
57432df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
57442df3fac7SAlex Elder 	 * features are assumed to never change.
57452df3fac7SAlex Elder 	 */
5746b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
574757385b51SAlex Elder 	if (ret)
5748b1b5402aSAlex Elder 		goto out_err;
574935d489f9SAlex Elder 
5750cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5751cc070d59SAlex Elder 
5752cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5753cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5754cc070d59SAlex Elder 		if (ret < 0)
5755cc070d59SAlex Elder 			goto out_err;
5756cc070d59SAlex Elder 	}
5757a30b71b9SAlex Elder 
57587e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
57597e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
57607e97332eSIlya Dryomov 		if (ret)
57617e97332eSIlya Dryomov 			goto out_err;
57627e97332eSIlya Dryomov 	}
57637e97332eSIlya Dryomov 
5764263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
576535152979SAlex Elder 	return 0;
5766263423f8SIlya Dryomov 
57679d475de5SAlex Elder out_err:
5768642a2537SAlex Elder 	rbd_dev->header.features = 0;
57691e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
57701e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
57719d475de5SAlex Elder 	return ret;
5772a30b71b9SAlex Elder }
5773a30b71b9SAlex Elder 
57746d69bb53SIlya Dryomov /*
57756d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
57766d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
57776d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
57786d69bb53SIlya Dryomov  */
57796d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
578083a06263SAlex Elder {
57812f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5782124afba2SAlex Elder 	int ret;
5783124afba2SAlex Elder 
5784124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5785124afba2SAlex Elder 		return 0;
5786124afba2SAlex Elder 
57876d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
57886d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
57896d69bb53SIlya Dryomov 		ret = -EINVAL;
57906d69bb53SIlya Dryomov 		goto out_err;
57916d69bb53SIlya Dryomov 	}
57926d69bb53SIlya Dryomov 
57931643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
57941f2c6651SIlya Dryomov 	if (!parent) {
5795124afba2SAlex Elder 		ret = -ENOMEM;
5796124afba2SAlex Elder 		goto out_err;
57971f2c6651SIlya Dryomov 	}
57981f2c6651SIlya Dryomov 
57991f2c6651SIlya Dryomov 	/*
58001f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
58011f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
58021f2c6651SIlya Dryomov 	 */
58031f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
58041f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5805124afba2SAlex Elder 
58066d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5807124afba2SAlex Elder 	if (ret < 0)
5808124afba2SAlex Elder 		goto out_err;
58091f2c6651SIlya Dryomov 
5810124afba2SAlex Elder 	rbd_dev->parent = parent;
5811a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5812124afba2SAlex Elder 	return 0;
5813124afba2SAlex Elder 
58141f2c6651SIlya Dryomov out_err:
58151f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
58161f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5817124afba2SAlex Elder 	return ret;
5818124afba2SAlex Elder }
5819124afba2SAlex Elder 
58205769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
58215769ed0cSIlya Dryomov {
58225769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
58235769ed0cSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
58245769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
58255769ed0cSIlya Dryomov 	if (!single_major)
58265769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
58275769ed0cSIlya Dryomov }
58285769ed0cSIlya Dryomov 
5829811c6688SIlya Dryomov /*
5830811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5831811c6688SIlya Dryomov  * upon return.
5832811c6688SIlya Dryomov  */
5833200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5834124afba2SAlex Elder {
583583a06263SAlex Elder 	int ret;
583683a06263SAlex Elder 
58379b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
583883a06263SAlex Elder 
58399b60e70bSIlya Dryomov 	if (!single_major) {
584083a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
584183a06263SAlex Elder 		if (ret < 0)
58421643dfa4SIlya Dryomov 			goto err_out_unlock;
58439b60e70bSIlya Dryomov 
584483a06263SAlex Elder 		rbd_dev->major = ret;
5845dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
58469b60e70bSIlya Dryomov 	} else {
58479b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
58489b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
58499b60e70bSIlya Dryomov 	}
585083a06263SAlex Elder 
585183a06263SAlex Elder 	/* Set up the blkdev mapping. */
585283a06263SAlex Elder 
585383a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
585483a06263SAlex Elder 	if (ret)
585583a06263SAlex Elder 		goto err_out_blkdev;
585683a06263SAlex Elder 
5857f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
585883a06263SAlex Elder 	if (ret)
585983a06263SAlex Elder 		goto err_out_disk;
5860bc1ecc65SIlya Dryomov 
5861f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
58629568c93eSIlya Dryomov 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
5863f35a4deeSAlex Elder 
58645769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5865f35a4deeSAlex Elder 	if (ret)
5866f5ee37bdSIlya Dryomov 		goto err_out_mapping;
586783a06263SAlex Elder 
5868129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5869811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
58705769ed0cSIlya Dryomov 	return 0;
58712f82ee54SAlex Elder 
5872f35a4deeSAlex Elder err_out_mapping:
5873f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
587483a06263SAlex Elder err_out_disk:
587583a06263SAlex Elder 	rbd_free_disk(rbd_dev);
587683a06263SAlex Elder err_out_blkdev:
58779b60e70bSIlya Dryomov 	if (!single_major)
587883a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5879811c6688SIlya Dryomov err_out_unlock:
5880811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
588183a06263SAlex Elder 	return ret;
588283a06263SAlex Elder }
588383a06263SAlex Elder 
5884332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5885332bb12dSAlex Elder {
5886332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5887c41d13a3SIlya Dryomov 	int ret;
5888332bb12dSAlex Elder 
5889332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5890332bb12dSAlex Elder 
5891332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5892332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5893c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5894332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
5895332bb12dSAlex Elder 	else
5896c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5897332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
5898c41d13a3SIlya Dryomov 
5899c41d13a3SIlya Dryomov 	return ret;
5900332bb12dSAlex Elder }
5901332bb12dSAlex Elder 
5902200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5903200a6a8bSAlex Elder {
59046fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5905fd22aef8SIlya Dryomov 	if (rbd_dev->opts)
5906fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
59076fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
59086fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
59096fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
5910200a6a8bSAlex Elder }
5911200a6a8bSAlex Elder 
5912a30b71b9SAlex Elder /*
5913a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
59141f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
59151f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
59161f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5917a30b71b9SAlex Elder  */
59186d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5919a30b71b9SAlex Elder {
5920a30b71b9SAlex Elder 	int ret;
5921a30b71b9SAlex Elder 
5922a30b71b9SAlex Elder 	/*
59233abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
59243abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
59253abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
59263abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5927a30b71b9SAlex Elder 	 */
5928a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5929a30b71b9SAlex Elder 	if (ret)
5930c0fba368SAlex Elder 		return ret;
5931c0fba368SAlex Elder 
5932332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5933332bb12dSAlex Elder 	if (ret)
5934332bb12dSAlex Elder 		goto err_out_format;
5935332bb12dSAlex Elder 
59366d69bb53SIlya Dryomov 	if (!depth) {
593799d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
59381fe48023SIlya Dryomov 		if (ret) {
59391fe48023SIlya Dryomov 			if (ret == -ENOENT)
5940b26c047bSIlya Dryomov 				pr_info("image %s/%s%s%s does not exist\n",
59411fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
5942b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ?: "",
5943b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ? "/" : "",
59441fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5945c41d13a3SIlya Dryomov 			goto err_out_format;
59461f3ef788SAlex Elder 		}
59471fe48023SIlya Dryomov 	}
5948b644de2bSAlex Elder 
5949a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
59505655c4d9SAlex Elder 	if (ret)
5951b644de2bSAlex Elder 		goto err_out_watch;
5952a30b71b9SAlex Elder 
595304077599SIlya Dryomov 	/*
595404077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
595504077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
595604077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
595704077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
595804077599SIlya Dryomov 	 */
59596d69bb53SIlya Dryomov 	if (!depth)
596004077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
596104077599SIlya Dryomov 	else
596204077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
59631fe48023SIlya Dryomov 	if (ret) {
59641fe48023SIlya Dryomov 		if (ret == -ENOENT)
5965b26c047bSIlya Dryomov 			pr_info("snap %s/%s%s%s@%s does not exist\n",
59661fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
5967b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ?: "",
5968b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ? "/" : "",
59691fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
59701fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
597133dca39fSAlex Elder 		goto err_out_probe;
59721fe48023SIlya Dryomov 	}
59739bb81c9bSAlex Elder 
5974e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5975e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5976e8f59b59SIlya Dryomov 		if (ret)
5977e8f59b59SIlya Dryomov 			goto err_out_probe;
5978e8f59b59SIlya Dryomov 	}
5979e8f59b59SIlya Dryomov 
59806d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
598130d60ba2SAlex Elder 	if (ret)
598230d60ba2SAlex Elder 		goto err_out_probe;
598383a06263SAlex Elder 
598430d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
5985c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
598630d60ba2SAlex Elder 	return 0;
5987e8f59b59SIlya Dryomov 
59886fd48b3bSAlex Elder err_out_probe:
59896fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5990b644de2bSAlex Elder err_out_watch:
59916d69bb53SIlya Dryomov 	if (!depth)
599299d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
5993332bb12dSAlex Elder err_out_format:
5994332bb12dSAlex Elder 	rbd_dev->image_format = 0;
59955655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
59965655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
59975655c4d9SAlex Elder 	return ret;
599883a06263SAlex Elder }
599983a06263SAlex Elder 
60009b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
600159c2be1eSYehuda Sadeh 			  const char *buf,
600259c2be1eSYehuda Sadeh 			  size_t count)
6003602adf40SYehuda Sadeh {
6004cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6005dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
60064e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6007859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
60089d3997fdSAlex Elder 	struct rbd_client *rbdc;
6009b51c83c2SIlya Dryomov 	int rc;
6010602adf40SYehuda Sadeh 
6011602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6012602adf40SYehuda Sadeh 		return -ENODEV;
6013602adf40SYehuda Sadeh 
6014a725f65eSAlex Elder 	/* parse add command */
6015859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6016dc79b113SAlex Elder 	if (rc < 0)
6017dd5ac32dSIlya Dryomov 		goto out;
6018a725f65eSAlex Elder 
60199d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
60209d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
60219d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
60220ddebc0cSAlex Elder 		goto err_out_args;
60239d3997fdSAlex Elder 	}
6024602adf40SYehuda Sadeh 
6025602adf40SYehuda Sadeh 	/* pick the pool */
6026dd435855SIlya Dryomov 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
60271fe48023SIlya Dryomov 	if (rc < 0) {
60281fe48023SIlya Dryomov 		if (rc == -ENOENT)
60291fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6030602adf40SYehuda Sadeh 		goto err_out_client;
60311fe48023SIlya Dryomov 	}
6032859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6033859c31dfSAlex Elder 
6034d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6035b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6036b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6037bd4ba655SAlex Elder 		goto err_out_client;
6038b51c83c2SIlya Dryomov 	}
6039c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6040c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6041d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6042602adf40SYehuda Sadeh 
60430d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
60440d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
60450d6d1e9cSMike Christie 		rc = -ENOMEM;
60460d6d1e9cSMike Christie 		goto err_out_rbd_dev;
60470d6d1e9cSMike Christie 	}
60480d6d1e9cSMike Christie 
6049811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
60506d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
60510d6d1e9cSMike Christie 	if (rc < 0) {
60520d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
6053c53d5893SAlex Elder 		goto err_out_rbd_dev;
60540d6d1e9cSMike Christie 	}
605505fd6f6fSAlex Elder 
60567ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
60577ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
60589568c93eSIlya Dryomov 		rbd_dev->opts->read_only = true;
60597ce4eef7SAlex Elder 
60600c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
60610c93e1b7SIlya Dryomov 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
60620c93e1b7SIlya Dryomov 			 rbd_dev->layout.object_size);
60630c93e1b7SIlya Dryomov 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
60640c93e1b7SIlya Dryomov 	}
60650c93e1b7SIlya Dryomov 
6066b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
6067fd22aef8SIlya Dryomov 	if (rc)
60688b679ec5SIlya Dryomov 		goto err_out_image_probe;
60693abef3b3SAlex Elder 
6070e010dd0aSIlya Dryomov 	if (rbd_dev->opts->exclusive) {
6071e010dd0aSIlya Dryomov 		rc = rbd_add_acquire_lock(rbd_dev);
6072e010dd0aSIlya Dryomov 		if (rc)
6073e010dd0aSIlya Dryomov 			goto err_out_device_setup;
6074b536f69aSAlex Elder 	}
6075b536f69aSAlex Elder 
60765769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
60775769ed0cSIlya Dryomov 
60785769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
60795769ed0cSIlya Dryomov 	if (rc)
6080e010dd0aSIlya Dryomov 		goto err_out_image_lock;
60815769ed0cSIlya Dryomov 
60825769ed0cSIlya Dryomov 	add_disk(rbd_dev->disk);
60835769ed0cSIlya Dryomov 	/* see rbd_init_disk() */
60845769ed0cSIlya Dryomov 	blk_put_queue(rbd_dev->disk->queue);
60855769ed0cSIlya Dryomov 
60865769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
60875769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
60885769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
60895769ed0cSIlya Dryomov 
60905769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
60915769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
60925769ed0cSIlya Dryomov 		rbd_dev->header.features);
6093dd5ac32dSIlya Dryomov 	rc = count;
6094dd5ac32dSIlya Dryomov out:
6095dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6096dd5ac32dSIlya Dryomov 	return rc;
6097b536f69aSAlex Elder 
6098e010dd0aSIlya Dryomov err_out_image_lock:
6099e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
61005769ed0cSIlya Dryomov err_out_device_setup:
61015769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
61028b679ec5SIlya Dryomov err_out_image_probe:
61038b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
6104c53d5893SAlex Elder err_out_rbd_dev:
6105c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6106bd4ba655SAlex Elder err_out_client:
61079d3997fdSAlex Elder 	rbd_put_client(rbdc);
61080ddebc0cSAlex Elder err_out_args:
6109859c31dfSAlex Elder 	rbd_spec_put(spec);
6110d147543dSIlya Dryomov 	kfree(rbd_opts);
6111dd5ac32dSIlya Dryomov 	goto out;
6112602adf40SYehuda Sadeh }
6113602adf40SYehuda Sadeh 
61147e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
61159b60e70bSIlya Dryomov {
61169b60e70bSIlya Dryomov 	if (single_major)
61179b60e70bSIlya Dryomov 		return -EINVAL;
61189b60e70bSIlya Dryomov 
61199b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61209b60e70bSIlya Dryomov }
61219b60e70bSIlya Dryomov 
61227e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
61239b60e70bSIlya Dryomov 				      size_t count)
61249b60e70bSIlya Dryomov {
61259b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61269b60e70bSIlya Dryomov }
61279b60e70bSIlya Dryomov 
612805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
612905a46afdSAlex Elder {
6130ad945fc1SAlex Elder 	while (rbd_dev->parent) {
613105a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
613205a46afdSAlex Elder 		struct rbd_device *second = first->parent;
613305a46afdSAlex Elder 		struct rbd_device *third;
613405a46afdSAlex Elder 
613505a46afdSAlex Elder 		/*
613605a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
613705a46afdSAlex Elder 		 * remove it.
613805a46afdSAlex Elder 		 */
613905a46afdSAlex Elder 		while (second && (third = second->parent)) {
614005a46afdSAlex Elder 			first = second;
614105a46afdSAlex Elder 			second = third;
614205a46afdSAlex Elder 		}
6143ad945fc1SAlex Elder 		rbd_assert(second);
61448ad42cd0SAlex Elder 		rbd_dev_image_release(second);
61458b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
6146ad945fc1SAlex Elder 		first->parent = NULL;
6147ad945fc1SAlex Elder 		first->parent_overlap = 0;
6148ad945fc1SAlex Elder 
6149ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
615005a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
615105a46afdSAlex Elder 		first->parent_spec = NULL;
615205a46afdSAlex Elder 	}
615305a46afdSAlex Elder }
615405a46afdSAlex Elder 
61559b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6156602adf40SYehuda Sadeh 			     const char *buf,
6157602adf40SYehuda Sadeh 			     size_t count)
6158602adf40SYehuda Sadeh {
6159602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6160751cc0e3SAlex Elder 	struct list_head *tmp;
6161751cc0e3SAlex Elder 	int dev_id;
61620276dca6SMike Christie 	char opt_buf[6];
61630276dca6SMike Christie 	bool force = false;
61640d8189e1SAlex Elder 	int ret;
6165602adf40SYehuda Sadeh 
61660276dca6SMike Christie 	dev_id = -1;
61670276dca6SMike Christie 	opt_buf[0] = '\0';
61680276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
61690276dca6SMike Christie 	if (dev_id < 0) {
61700276dca6SMike Christie 		pr_err("dev_id out of range\n");
6171602adf40SYehuda Sadeh 		return -EINVAL;
61720276dca6SMike Christie 	}
61730276dca6SMike Christie 	if (opt_buf[0] != '\0') {
61740276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
61750276dca6SMike Christie 			force = true;
61760276dca6SMike Christie 		} else {
61770276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
61780276dca6SMike Christie 			return -EINVAL;
61790276dca6SMike Christie 		}
61800276dca6SMike Christie 	}
6181602adf40SYehuda Sadeh 
6182602adf40SYehuda Sadeh 	ret = -ENOENT;
6183751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6184751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6185751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6186751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6187751cc0e3SAlex Elder 			ret = 0;
6188751cc0e3SAlex Elder 			break;
6189602adf40SYehuda Sadeh 		}
6190751cc0e3SAlex Elder 	}
6191751cc0e3SAlex Elder 	if (!ret) {
6192a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
61930276dca6SMike Christie 		if (rbd_dev->open_count && !force)
619442382b70SAlex Elder 			ret = -EBUSY;
619585f5a4d6SIlya Dryomov 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
619685f5a4d6SIlya Dryomov 					  &rbd_dev->flags))
619785f5a4d6SIlya Dryomov 			ret = -EINPROGRESS;
6198a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6199751cc0e3SAlex Elder 	}
6200751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
620185f5a4d6SIlya Dryomov 	if (ret)
62021ba0f1e7SAlex Elder 		return ret;
6203751cc0e3SAlex Elder 
62040276dca6SMike Christie 	if (force) {
62050276dca6SMike Christie 		/*
62060276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
62070276dca6SMike Christie 		 * IO to complete/fail.
62080276dca6SMike Christie 		 */
62090276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
62100276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
62110276dca6SMike Christie 	}
62120276dca6SMike Christie 
62135769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
62145769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
62155769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
62165769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
62175769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
6218fca27065SIlya Dryomov 
6219e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
6220dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
62218ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
62228b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
62231ba0f1e7SAlex Elder 	return count;
6224602adf40SYehuda Sadeh }
6225602adf40SYehuda Sadeh 
62267e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
62279b60e70bSIlya Dryomov {
62289b60e70bSIlya Dryomov 	if (single_major)
62299b60e70bSIlya Dryomov 		return -EINVAL;
62309b60e70bSIlya Dryomov 
62319b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
62329b60e70bSIlya Dryomov }
62339b60e70bSIlya Dryomov 
62347e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
62359b60e70bSIlya Dryomov 					 size_t count)
62369b60e70bSIlya Dryomov {
62379b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
62389b60e70bSIlya Dryomov }
62399b60e70bSIlya Dryomov 
6240602adf40SYehuda Sadeh /*
6241602adf40SYehuda Sadeh  * create control files in sysfs
6242dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6243602adf40SYehuda Sadeh  */
62447d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
6245602adf40SYehuda Sadeh {
6246dfc5606dSYehuda Sadeh 	int ret;
6247602adf40SYehuda Sadeh 
6248fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6249dfc5606dSYehuda Sadeh 	if (ret < 0)
6250dfc5606dSYehuda Sadeh 		return ret;
6251602adf40SYehuda Sadeh 
6252fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6253fed4c143SAlex Elder 	if (ret < 0)
6254fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6255602adf40SYehuda Sadeh 
6256602adf40SYehuda Sadeh 	return ret;
6257602adf40SYehuda Sadeh }
6258602adf40SYehuda Sadeh 
62597d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
6260602adf40SYehuda Sadeh {
6261dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6262fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6263602adf40SYehuda Sadeh }
6264602adf40SYehuda Sadeh 
62657d8dc534SChengguang Xu static int __init rbd_slab_init(void)
62661c2a9dfeSAlex Elder {
62671c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
626803d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6269868311b1SAlex Elder 	if (!rbd_img_request_cache)
6270868311b1SAlex Elder 		return -ENOMEM;
6271868311b1SAlex Elder 
6272868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
627303d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
627478c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
627578c2a44aSAlex Elder 		goto out_err;
627678c2a44aSAlex Elder 
62771c2a9dfeSAlex Elder 	return 0;
62781c2a9dfeSAlex Elder 
62796c696d85SIlya Dryomov out_err:
6280868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6281868311b1SAlex Elder 	rbd_img_request_cache = NULL;
62821c2a9dfeSAlex Elder 	return -ENOMEM;
62831c2a9dfeSAlex Elder }
62841c2a9dfeSAlex Elder 
/*
 * Destroy both request slab caches and reset their pointers to NULL.
 *
 * Both caches are asserted to exist, so this may only run after
 * rbd_slab_init() has succeeded.  Besides rbd_exit(), it is also called
 * from the error path of rbd_init().
 */
static void rbd_slab_exit(void)
{
	/* Object request cache goes first, image request cache second. */
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}
62951c2a9dfeSAlex Elder 
6296cc344fa1SAlex Elder static int __init rbd_init(void)
6297602adf40SYehuda Sadeh {
6298602adf40SYehuda Sadeh 	int rc;
6299602adf40SYehuda Sadeh 
63001e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
63011e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
63021e32d34cSAlex Elder 		return -EINVAL;
63031e32d34cSAlex Elder 	}
6304e1b4d96dSIlya Dryomov 
63051c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6306602adf40SYehuda Sadeh 	if (rc)
6307602adf40SYehuda Sadeh 		return rc;
6308e1b4d96dSIlya Dryomov 
6309f5ee37bdSIlya Dryomov 	/*
6310f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6311f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6312f5ee37bdSIlya Dryomov 	 */
6313f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6314f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6315f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6316f5ee37bdSIlya Dryomov 		goto err_out_slab;
6317f5ee37bdSIlya Dryomov 	}
6318f5ee37bdSIlya Dryomov 
63199b60e70bSIlya Dryomov 	if (single_major) {
63209b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
63219b60e70bSIlya Dryomov 		if (rbd_major < 0) {
63229b60e70bSIlya Dryomov 			rc = rbd_major;
6323f5ee37bdSIlya Dryomov 			goto err_out_wq;
63249b60e70bSIlya Dryomov 		}
63259b60e70bSIlya Dryomov 	}
63269b60e70bSIlya Dryomov 
63271c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
63281c2a9dfeSAlex Elder 	if (rc)
63299b60e70bSIlya Dryomov 		goto err_out_blkdev;
63301c2a9dfeSAlex Elder 
63319b60e70bSIlya Dryomov 	if (single_major)
63329b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
63339b60e70bSIlya Dryomov 	else
6334e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
63359b60e70bSIlya Dryomov 
6336e1b4d96dSIlya Dryomov 	return 0;
6337e1b4d96dSIlya Dryomov 
63389b60e70bSIlya Dryomov err_out_blkdev:
63399b60e70bSIlya Dryomov 	if (single_major)
63409b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6341f5ee37bdSIlya Dryomov err_out_wq:
6342f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6343e1b4d96dSIlya Dryomov err_out_slab:
6344e1b4d96dSIlya Dryomov 	rbd_slab_exit();
63451c2a9dfeSAlex Elder 	return rc;
6346602adf40SYehuda Sadeh }
6347602adf40SYehuda Sadeh 
/*
 * Module unload handler.
 *
 * Destroys the device id IDA, then releases what rbd_init() set up in
 * the reverse order of its setup: sysfs control files, the block major
 * (only when single_major is set), the workqueue and the slab caches.
 */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	/* The major was only registered in single_major mode. */
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}
6357602adf40SYehuda Sadeh 
/* Hook rbd_init()/rbd_exit() up as the module's load and unload handlers. */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata. */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");
6369