xref: /openbmc/linux/drivers/block/rbd.c (revision b9f6d447)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h>
3543df3d35SIlya Dryomov #include <linux/ceph/striper.h>
36602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3759c2be1eSYehuda Sadeh #include <linux/parser.h>
3830d1cff8SAlex Elder #include <linux/bsearch.h>
39602adf40SYehuda Sadeh 
40602adf40SYehuda Sadeh #include <linux/kernel.h>
41602adf40SYehuda Sadeh #include <linux/device.h>
42602adf40SYehuda Sadeh #include <linux/module.h>
437ad18afaSChristoph Hellwig #include <linux/blk-mq.h>
44602adf40SYehuda Sadeh #include <linux/fs.h>
45602adf40SYehuda Sadeh #include <linux/blkdev.h>
461c2a9dfeSAlex Elder #include <linux/slab.h>
47f8a22fc2SIlya Dryomov #include <linux/idr.h>
48bc1ecc65SIlya Dryomov #include <linux/workqueue.h>
49602adf40SYehuda Sadeh 
50602adf40SYehuda Sadeh #include "rbd_types.h"
51602adf40SYehuda Sadeh 
52aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
53aafb230eSAlex Elder 
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder  * -EINVAL without updating it.
59a2acd00eSAlex Elder  */
60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder 	unsigned int counter;
63a2acd00eSAlex Elder 
64bfc18e38SMark Rutland 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder 		return (int)counter;
67a2acd00eSAlex Elder 
68a2acd00eSAlex Elder 	atomic_dec(v);
69a2acd00eSAlex Elder 
70a2acd00eSAlex Elder 	return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder 	int counter;
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
79a2acd00eSAlex Elder 	if (counter >= 0)
80a2acd00eSAlex Elder 		return counter;
81a2acd00eSAlex Elder 
82a2acd00eSAlex Elder 	atomic_inc(v);
83a2acd00eSAlex Elder 
84a2acd00eSAlex Elder 	return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder 
/* Driver name; used as the prefix of warnings and of "rbd#" device names */
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* Shift applied when converting a device id to/from a minor number */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* Snapshot name limit: NAME_MAX less the "snap_" prefix (NUL excluded) */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* Sized so that an image name sent by the OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
11299d16943SIlya Dryomov 
/* Image feature bits (one bit per feature in a 64-bit mask) */

#define RBD_FEATURE_LAYERING		(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2		(1ULL << 1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL << 2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL << 5)
#define RBD_FEATURE_DATA_POOL		(1ULL << 7)
#define RBD_FEATURE_OPERATIONS		(1ULL << 8)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features this (client software) implementation can handle. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
132d889140cSAlex Elder 
/*
 * RBD device names have the form "rbd#": the RBD_DRV_NAME prefix
 * followed by a unique integer identifier.  This is the size of the
 * buffer that holds such a name.
 */
#define DEV_NAME_LEN		32
138602adf40SYehuda Sadeh 
139602adf40SYehuda Sadeh /*
140602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
141602adf40SYehuda Sadeh  */
142602adf40SYehuda Sadeh struct rbd_image_header {
143f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
144849b4260SAlex Elder 	char *object_prefix;
145602adf40SYehuda Sadeh 	__u8 obj_order;
146f35a4deeSAlex Elder 	u64 stripe_unit;
147f35a4deeSAlex Elder 	u64 stripe_count;
1487e97332eSIlya Dryomov 	s64 data_pool_id;
149f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
150602adf40SYehuda Sadeh 
151f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
152f84344f3SAlex Elder 	u64 image_size;
153f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
154f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
155f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15659c2be1eSYehuda Sadeh };
15759c2be1eSYehuda Sadeh 
1580d7dbfceSAlex Elder /*
1590d7dbfceSAlex Elder  * An rbd image specification.
1600d7dbfceSAlex Elder  *
1610d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
162c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
163c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
164c66c6e0cSAlex Elder  *
165c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
166c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
167c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
168c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
169c66c6e0cSAlex Elder  *
170c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
171c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
172c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
173c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
174c66c6e0cSAlex Elder  * is shared between the parent and child).
175c66c6e0cSAlex Elder  *
176c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
177c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
178c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
179c66c6e0cSAlex Elder  *
180c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
181c66c6e0cSAlex Elder  * could be a null pointer).
1820d7dbfceSAlex Elder  */
1830d7dbfceSAlex Elder struct rbd_spec {
1840d7dbfceSAlex Elder 	u64		pool_id;
185ecb4dc22SAlex Elder 	const char	*pool_name;
186b26c047bSIlya Dryomov 	const char	*pool_ns;	/* NULL if default, never "" */
1870d7dbfceSAlex Elder 
188ecb4dc22SAlex Elder 	const char	*image_id;
189ecb4dc22SAlex Elder 	const char	*image_name;
1900d7dbfceSAlex Elder 
1910d7dbfceSAlex Elder 	u64		snap_id;
192ecb4dc22SAlex Elder 	const char	*snap_name;
1930d7dbfceSAlex Elder 
1940d7dbfceSAlex Elder 	struct kref	kref;
1950d7dbfceSAlex Elder };
1960d7dbfceSAlex Elder 
197602adf40SYehuda Sadeh /*
198f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
199602adf40SYehuda Sadeh  */
200602adf40SYehuda Sadeh struct rbd_client {
201602adf40SYehuda Sadeh 	struct ceph_client	*client;
202602adf40SYehuda Sadeh 	struct kref		kref;
203602adf40SYehuda Sadeh 	struct list_head	node;
204602adf40SYehuda Sadeh };
205602adf40SYehuda Sadeh 
206bf0d5f50SAlex Elder struct rbd_img_request;
207bf0d5f50SAlex Elder 
/* Kind of data buffer attached to a request; values start at 1 */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};
214bf0d5f50SAlex Elder 
/* Operation carried out by an image request; values start at 1 */
enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};
2216d2940c8SGuangliang Zhao 
/*
 * To cope with layering, a write walks through this state machine:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * The initial state is RBD_OBJ_WRITE_GUARD when an assert_exists guard
 * is required and RBD_OBJ_WRITE_FLAT when it is not (the guard can be
 * unnecessary in some cases even though a parent exists).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_READ_FROM_PARENT,
	RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
	RBD_OBJ_WRITE_COPYUP_OPS,
};
254926f9b3fSAlex Elder 
255bf0d5f50SAlex Elder struct rbd_obj_request {
25643df3d35SIlya Dryomov 	struct ceph_object_extent ex;
257c5b5ef6cSAlex Elder 	union {
2583da691bfSIlya Dryomov 		bool			tried_parent;	/* for reads */
2593da691bfSIlya Dryomov 		enum rbd_obj_write_state write_state;	/* for writes */
2603da691bfSIlya Dryomov 	};
261bf0d5f50SAlex Elder 
262bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
26386bd7998SIlya Dryomov 	struct ceph_file_extent	*img_extents;
26486bd7998SIlya Dryomov 	u32			num_img_extents;
265bf0d5f50SAlex Elder 
266788e2df3SAlex Elder 	union {
2675359a17dSIlya Dryomov 		struct ceph_bio_iter	bio_pos;
268788e2df3SAlex Elder 		struct {
2697e07efb1SIlya Dryomov 			struct ceph_bvec_iter	bvec_pos;
2707e07efb1SIlya Dryomov 			u32			bvec_count;
271afb97888SIlya Dryomov 			u32			bvec_idx;
272788e2df3SAlex Elder 		};
273788e2df3SAlex Elder 	};
2747e07efb1SIlya Dryomov 	struct bio_vec		*copyup_bvecs;
2757e07efb1SIlya Dryomov 	u32			copyup_bvec_count;
276bf0d5f50SAlex Elder 
277bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
278bf0d5f50SAlex Elder 
279bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2801b83bef2SSage Weil 	int			result;
281bf0d5f50SAlex Elder 
282bf0d5f50SAlex Elder 	struct kref		kref;
283bf0d5f50SAlex Elder };
284bf0d5f50SAlex Elder 
/* Bit numbers within rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2890c425248SAlex Elder 
290bf0d5f50SAlex Elder struct rbd_img_request {
291bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
2929bb0248dSIlya Dryomov 	enum obj_operation_type	op_type;
293ecc633caSIlya Dryomov 	enum obj_request_type	data_type;
2940c425248SAlex Elder 	unsigned long		flags;
295bf0d5f50SAlex Elder 	union {
296bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2979849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2989849e986SAlex Elder 	};
2999849e986SAlex Elder 	union {
3009849e986SAlex Elder 		struct request		*rq;		/* block request */
3019849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
302bf0d5f50SAlex Elder 	};
30315961b44SIlya Dryomov 	spinlock_t		completion_lock;
30455f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
305a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
306bf0d5f50SAlex Elder 
30743df3d35SIlya Dryomov 	struct list_head	object_extents;	/* obj_req.ex structs */
3087114edacSIlya Dryomov 	u32			pending_count;
309bf0d5f50SAlex Elder 
310bf0d5f50SAlex Elder 	struct kref		kref;
311bf0d5f50SAlex Elder };
312bf0d5f50SAlex Elder 
/*
 * Iterate over the object requests of image request @ireq, with @oreq
 * as the cursor.  Entries are linked through their embedded ex.oe_item.
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)

/* Same walk via list_for_each_entry_safe(); @n is the lookahead cursor */
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
317bf0d5f50SAlex Elder 
/* Values of rbd_device->watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
32399d16943SIlya Dryomov 
/* Values of rbd_device->lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
329ed95b21aSIlya Dryomov 
330ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
331ed95b21aSIlya Dryomov struct rbd_client_id {
332ed95b21aSIlya Dryomov 	u64 gid;
333ed95b21aSIlya Dryomov 	u64 handle;
334ed95b21aSIlya Dryomov };
335ed95b21aSIlya Dryomov 
336f84344f3SAlex Elder struct rbd_mapping {
33799c1f08fSAlex Elder 	u64                     size;
33834b13184SAlex Elder 	u64                     features;
339f84344f3SAlex Elder };
340f84344f3SAlex Elder 
341602adf40SYehuda Sadeh /*
342602adf40SYehuda Sadeh  * a single device
343602adf40SYehuda Sadeh  */
344602adf40SYehuda Sadeh struct rbd_device {
345de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
346602adf40SYehuda Sadeh 
347602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
348dd82fff1SIlya Dryomov 	int			minor;
349602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
350602adf40SYehuda Sadeh 
351a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
352602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
353602adf40SYehuda Sadeh 
354602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
355602adf40SYehuda Sadeh 
356b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	struct rbd_image_header	header;
359b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3600d7dbfceSAlex Elder 	struct rbd_spec		*spec;
361d147543dSIlya Dryomov 	struct rbd_options	*opts;
3620d6d1e9cSMike Christie 	char			*config_info;	/* add{,_single_major} string */
363602adf40SYehuda Sadeh 
364c41d13a3SIlya Dryomov 	struct ceph_object_id	header_oid;
365922dab61SIlya Dryomov 	struct ceph_object_locator header_oloc;
366971f839aSAlex Elder 
3671643dfa4SIlya Dryomov 	struct ceph_file_layout	layout;		/* used for all rbd requests */
3680903e875SAlex Elder 
36999d16943SIlya Dryomov 	struct mutex		watch_mutex;
37099d16943SIlya Dryomov 	enum rbd_watch_state	watch_state;
371922dab61SIlya Dryomov 	struct ceph_osd_linger_request *watch_handle;
37299d16943SIlya Dryomov 	u64			watch_cookie;
37399d16943SIlya Dryomov 	struct delayed_work	watch_dwork;
37459c2be1eSYehuda Sadeh 
375ed95b21aSIlya Dryomov 	struct rw_semaphore	lock_rwsem;
376ed95b21aSIlya Dryomov 	enum rbd_lock_state	lock_state;
377cbbfb0ffSIlya Dryomov 	char			lock_cookie[32];
378ed95b21aSIlya Dryomov 	struct rbd_client_id	owner_cid;
379ed95b21aSIlya Dryomov 	struct work_struct	acquired_lock_work;
380ed95b21aSIlya Dryomov 	struct work_struct	released_lock_work;
381ed95b21aSIlya Dryomov 	struct delayed_work	lock_dwork;
382ed95b21aSIlya Dryomov 	struct work_struct	unlock_work;
383ed95b21aSIlya Dryomov 	wait_queue_head_t	lock_waitq;
384ed95b21aSIlya Dryomov 
3851643dfa4SIlya Dryomov 	struct workqueue_struct	*task_wq;
386602adf40SYehuda Sadeh 
38786b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
38886b00e0dSAlex Elder 	u64			parent_overlap;
389a2acd00eSAlex Elder 	atomic_t		parent_ref;
3902f82ee54SAlex Elder 	struct rbd_device	*parent;
39186b00e0dSAlex Elder 
3927ad18afaSChristoph Hellwig 	/* Block layer tags. */
3937ad18afaSChristoph Hellwig 	struct blk_mq_tag_set	tag_set;
3947ad18afaSChristoph Hellwig 
395c666601aSJosh Durgin 	/* protects updating the header */
396c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
397f84344f3SAlex Elder 
398f84344f3SAlex Elder 	struct rbd_mapping	mapping;
399602adf40SYehuda Sadeh 
400602adf40SYehuda Sadeh 	struct list_head	node;
401dfc5606dSYehuda Sadeh 
402dfc5606dSYehuda Sadeh 	/* sysfs related */
403dfc5606dSYehuda Sadeh 	struct device		dev;
404b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
405dfc5606dSYehuda Sadeh };
406dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Locking:
 * - REMOVING goes together with rbd_dev->open_count and is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
4186d292906SAlex Elder 
419cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
420e124a82fSAlex Elder 
421602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
422e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
423e124a82fSAlex Elder 
424602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
425432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
426602adf40SYehuda Sadeh 
42778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
42878c2a44aSAlex Elder 
4291c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
430868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
4311c2a9dfeSAlex Elder 
4329b60e70bSIlya Dryomov static int rbd_major;
433f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
434f8a22fc2SIlya Dryomov 
435f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
436f5ee37bdSIlya Dryomov 
43789a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = {
43889a59c1cSIlya Dryomov 	.nref = REFCOUNT_INIT(1),
43989a59c1cSIlya Dryomov };
44089a59c1cSIlya Dryomov 
4419b60e70bSIlya Dryomov /*
4423cfa3b16SIlya Dryomov  * single-major requires >= 0.75 version of userspace rbd utility.
4439b60e70bSIlya Dryomov  */
4443cfa3b16SIlya Dryomov static bool single_major = true;
4455657a819SJoe Perches module_param(single_major, bool, 0444);
4463cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4479b60e70bSIlya Dryomov 
448f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
449f0f8cef5SAlex Elder 		       size_t count);
450f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
451f0f8cef5SAlex Elder 			  size_t count);
4529b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4539b60e70bSIlya Dryomov 				    size_t count);
4549b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4559b60e70bSIlya Dryomov 				       size_t count);
4566d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
457f0f8cef5SAlex Elder 
4589b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4599b60e70bSIlya Dryomov {
4607e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4619b60e70bSIlya Dryomov }
4629b60e70bSIlya Dryomov 
4639b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4649b60e70bSIlya Dryomov {
4657e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4669b60e70bSIlya Dryomov }
4679b60e70bSIlya Dryomov 
468ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
469ed95b21aSIlya Dryomov {
470ed95b21aSIlya Dryomov 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
471ed95b21aSIlya Dryomov 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
472ed95b21aSIlya Dryomov }
473ed95b21aSIlya Dryomov 
474ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
475ed95b21aSIlya Dryomov {
476ed95b21aSIlya Dryomov 	bool is_lock_owner;
477ed95b21aSIlya Dryomov 
478ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
479ed95b21aSIlya Dryomov 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
480ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
481ed95b21aSIlya Dryomov 	return is_lock_owner;
482ed95b21aSIlya Dryomov }
483ed95b21aSIlya Dryomov 
4848767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
4858767b293SIlya Dryomov {
4868767b293SIlya Dryomov 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
4878767b293SIlya Dryomov }
4888767b293SIlya Dryomov 
4895657a819SJoe Perches static BUS_ATTR(add, 0200, NULL, rbd_add);
4905657a819SJoe Perches static BUS_ATTR(remove, 0200, NULL, rbd_remove);
4915657a819SJoe Perches static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
4925657a819SJoe Perches static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
4935657a819SJoe Perches static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
494b15a21ddSGreg Kroah-Hartman 
495b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
496b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
497b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4989b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4999b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
5008767b293SIlya Dryomov 	&bus_attr_supported_features.attr,
501b15a21ddSGreg Kroah-Hartman 	NULL,
502f0f8cef5SAlex Elder };
50392c76dc0SIlya Dryomov 
50492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
50592c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
50692c76dc0SIlya Dryomov {
5079b60e70bSIlya Dryomov 	if (!single_major &&
5089b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
5099b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
5109b60e70bSIlya Dryomov 		return 0;
5119b60e70bSIlya Dryomov 
51292c76dc0SIlya Dryomov 	return attr->mode;
51392c76dc0SIlya Dryomov }
51492c76dc0SIlya Dryomov 
51592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
51692c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
51792c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
51892c76dc0SIlya Dryomov };
51992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
520f0f8cef5SAlex Elder 
521f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
522f0f8cef5SAlex Elder 	.name		= "rbd",
523b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
524f0f8cef5SAlex Elder };
525f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally a no-op: the
 * device is a static object, so this callback frees nothing.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to do */
}
529f0f8cef5SAlex Elder 
530f0f8cef5SAlex Elder static struct device rbd_root_dev = {
531f0f8cef5SAlex Elder 	.init_name =    "rbd",
532f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
533f0f8cef5SAlex Elder };
534f0f8cef5SAlex Elder 
53506ecc6cbSAlex Elder static __printf(2, 3)
53606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
53706ecc6cbSAlex Elder {
53806ecc6cbSAlex Elder 	struct va_format vaf;
53906ecc6cbSAlex Elder 	va_list args;
54006ecc6cbSAlex Elder 
54106ecc6cbSAlex Elder 	va_start(args, fmt);
54206ecc6cbSAlex Elder 	vaf.fmt = fmt;
54306ecc6cbSAlex Elder 	vaf.va = &args;
54406ecc6cbSAlex Elder 
54506ecc6cbSAlex Elder 	if (!rbd_dev)
54606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
54706ecc6cbSAlex Elder 	else if (rbd_dev->disk)
54806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
54906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
55006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
55106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
55206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
55306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
55406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
55506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
55606ecc6cbSAlex Elder 	else	/* punt */
55706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
55806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
55906ecc6cbSAlex Elder 	va_end(args);
56006ecc6cbSAlex Elder }
56106ecc6cbSAlex Elder 
#ifdef RBD_DEBUG
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, log the failing
 * expression together with the enclosing function and line number at
 * KERN_ERR level and then BUG() when @expr is false.  Without
 * RBD_DEBUG it expands to a no-op and @expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it behaves as
 * a single statement: a bare "if (...) { ... }" expansion would
 * capture a following "else" when used as the body of an unbraced if.
 * As with the non-debug form, a terminating semicolon is required at
 * the call site.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
573aafb230eSAlex Elder #endif /* !RBD_DEBUG */
574dfc5606dSYehuda Sadeh 
57505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5768b3e1a56SAlex Elder 
577cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5782df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
579a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev);
580e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
58154cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
58254cac61fSAlex Elder 					u64 snap_id);
5832ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5842ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5852ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5862ad3d716SAlex Elder 		u64 *snap_features);
58759c2be1eSYehuda Sadeh 
588602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
589602adf40SYehuda Sadeh {
590f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
591b82d167bSAlex Elder 	bool removing = false;
592602adf40SYehuda Sadeh 
593a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
594b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
595b82d167bSAlex Elder 		removing = true;
596b82d167bSAlex Elder 	else
597b82d167bSAlex Elder 		rbd_dev->open_count++;
598a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
599b82d167bSAlex Elder 	if (removing)
600b82d167bSAlex Elder 		return -ENOENT;
601b82d167bSAlex Elder 
602c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
603340c7a2bSAlex Elder 
604602adf40SYehuda Sadeh 	return 0;
605602adf40SYehuda Sadeh }
606602adf40SYehuda Sadeh 
607db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
608dfc5606dSYehuda Sadeh {
609dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
610b82d167bSAlex Elder 	unsigned long open_count_before;
611b82d167bSAlex Elder 
612a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
613b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
614a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
615b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
616dfc5606dSYehuda Sadeh 
617c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
618dfc5606dSYehuda Sadeh }
619dfc5606dSYehuda Sadeh 
620131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
621131fd9f6SGuangliang Zhao {
6221de797bbSIlya Dryomov 	int ro;
623131fd9f6SGuangliang Zhao 
6241de797bbSIlya Dryomov 	if (get_user(ro, (int __user *)arg))
625131fd9f6SGuangliang Zhao 		return -EFAULT;
626131fd9f6SGuangliang Zhao 
6271de797bbSIlya Dryomov 	/* Snapshots can't be marked read-write */
628131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
629131fd9f6SGuangliang Zhao 		return -EROFS;
630131fd9f6SGuangliang Zhao 
6311de797bbSIlya Dryomov 	/* Let blkdev_roset() handle it */
6321de797bbSIlya Dryomov 	return -ENOTTY;
633131fd9f6SGuangliang Zhao }
634131fd9f6SGuangliang Zhao 
635131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
636131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
637131fd9f6SGuangliang Zhao {
638131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
6391de797bbSIlya Dryomov 	int ret;
640131fd9f6SGuangliang Zhao 
641131fd9f6SGuangliang Zhao 	switch (cmd) {
642131fd9f6SGuangliang Zhao 	case BLKROSET:
643131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
644131fd9f6SGuangliang Zhao 		break;
645131fd9f6SGuangliang Zhao 	default:
646131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
647131fd9f6SGuangliang Zhao 	}
648131fd9f6SGuangliang Zhao 
649131fd9f6SGuangliang Zhao 	return ret;
650131fd9f6SGuangliang Zhao }
651131fd9f6SGuangliang Zhao 
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point: forwards unchanged to rbd_ioctl() */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
			    unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
659131fd9f6SGuangliang Zhao 
660602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
661602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
662602adf40SYehuda Sadeh 	.open			= rbd_open,
663dfc5606dSYehuda Sadeh 	.release		= rbd_release,
664131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
665131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
666131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
667131fd9f6SGuangliang Zhao #endif
668602adf40SYehuda Sadeh };
669602adf40SYehuda Sadeh 
670602adf40SYehuda Sadeh /*
6717262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
672cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
673602adf40SYehuda Sadeh  */
674f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
675602adf40SYehuda Sadeh {
676602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
677602adf40SYehuda Sadeh 	int ret = -ENOMEM;
678602adf40SYehuda Sadeh 
67937206ee5SAlex Elder 	dout("%s:\n", __func__);
680602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
681602adf40SYehuda Sadeh 	if (!rbdc)
682602adf40SYehuda Sadeh 		goto out_opt;
683602adf40SYehuda Sadeh 
684602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
685602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
686602adf40SYehuda Sadeh 
68774da4a0fSIlya Dryomov 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
688602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
68908f75463SAlex Elder 		goto out_rbdc;
69043ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
691602adf40SYehuda Sadeh 
692602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
693602adf40SYehuda Sadeh 	if (ret < 0)
69408f75463SAlex Elder 		goto out_client;
695602adf40SYehuda Sadeh 
696432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
697602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
698432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
699602adf40SYehuda Sadeh 
70037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
701bc534d86SAlex Elder 
702602adf40SYehuda Sadeh 	return rbdc;
70308f75463SAlex Elder out_client:
704602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
70508f75463SAlex Elder out_rbdc:
706602adf40SYehuda Sadeh 	kfree(rbdc);
707602adf40SYehuda Sadeh out_opt:
70843ae4701SAlex Elder 	if (ceph_opts)
70943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
71037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
71137206ee5SAlex Elder 
71228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
713602adf40SYehuda Sadeh }
714602adf40SYehuda Sadeh 
7152f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7162f82ee54SAlex Elder {
7172f82ee54SAlex Elder 	kref_get(&rbdc->kref);
7182f82ee54SAlex Elder 
7192f82ee54SAlex Elder 	return rbdc;
7202f82ee54SAlex Elder }
7212f82ee54SAlex Elder 
722602adf40SYehuda Sadeh /*
7231f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
7241f7ba331SAlex Elder  * found, bump its reference count.
725602adf40SYehuda Sadeh  */
7261f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
727602adf40SYehuda Sadeh {
728602adf40SYehuda Sadeh 	struct rbd_client *client_node;
7291f7ba331SAlex Elder 	bool found = false;
730602adf40SYehuda Sadeh 
73143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
732602adf40SYehuda Sadeh 		return NULL;
733602adf40SYehuda Sadeh 
7341f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
7351f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
7361f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
7372f82ee54SAlex Elder 			__rbd_get_client(client_node);
7382f82ee54SAlex Elder 
7391f7ba331SAlex Elder 			found = true;
7401f7ba331SAlex Elder 			break;
7411f7ba331SAlex Elder 		}
7421f7ba331SAlex Elder 	}
7431f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7441f7ba331SAlex Elder 
7451f7ba331SAlex Elder 	return found ? client_node : NULL;
746602adf40SYehuda Sadeh }
747602adf40SYehuda Sadeh 
/*
 * (Per device) rbd map options
 *
 * The enumerator order matters: parse_rbd_opts_token() classifies a
 * token by comparing it against Opt_last_int and Opt_last_string.
 */
enum {
	/* options taking an int argument */
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,

	/* options taking a string argument */
	Opt_pool_ns,
	Opt_last_string,

	/* options without an argument */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};
76759c2be1eSYehuda Sadeh 
76843ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
769b5584180SIlya Dryomov 	{Opt_queue_depth, "queue_depth=%d"},
7700c93e1b7SIlya Dryomov 	{Opt_alloc_size, "alloc_size=%d"},
77134f55d0bSDongsheng Yang 	{Opt_lock_timeout, "lock_timeout=%d"},
77259c2be1eSYehuda Sadeh 	/* int args above */
773b26c047bSIlya Dryomov 	{Opt_pool_ns, "_pool_ns=%s"},
77459c2be1eSYehuda Sadeh 	/* string args above */
775be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
776cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
777cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
778cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
77980de1912SIlya Dryomov 	{Opt_lock_on_read, "lock_on_read"},
780e010dd0aSIlya Dryomov 	{Opt_exclusive, "exclusive"},
781d9360540SIlya Dryomov 	{Opt_notrim, "notrim"},
782210c104cSIlya Dryomov 	{Opt_err, NULL}
78359c2be1eSYehuda Sadeh };
78459c2be1eSYehuda Sadeh 
78598571b5aSAlex Elder struct rbd_options {
786b5584180SIlya Dryomov 	int	queue_depth;
7870c93e1b7SIlya Dryomov 	int	alloc_size;
78834f55d0bSDongsheng Yang 	unsigned long	lock_timeout;
78998571b5aSAlex Elder 	bool	read_only;
79080de1912SIlya Dryomov 	bool	lock_on_read;
791e010dd0aSIlya Dryomov 	bool	exclusive;
792d9360540SIlya Dryomov 	bool	trim;
79398571b5aSAlex Elder };
79498571b5aSAlex Elder 
/* Default values for the per-device map options */
#define RBD_QUEUE_DEPTH_DEFAULT		BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT		(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT	0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT		false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT		false
#define RBD_TRIM_DEFAULT		true
80298571b5aSAlex Elder 
/* Context passed to parse_rbd_opts_token() through its private pointer */
struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;	/* receives pool_ns */
	struct rbd_options	*opts;	/* receives all other options */
};
807c300156bSIlya Dryomov 
80859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
80959c2be1eSYehuda Sadeh {
810c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx *pctx = private;
81159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
81259c2be1eSYehuda Sadeh 	int token, intval, ret;
81359c2be1eSYehuda Sadeh 
81443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
81559c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
81659c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
81759c2be1eSYehuda Sadeh 		if (ret < 0) {
8182f56b6baSIlya Dryomov 			pr_err("bad option arg (not int) at '%s'\n", c);
81959c2be1eSYehuda Sadeh 			return ret;
82059c2be1eSYehuda Sadeh 		}
82159c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
82259c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
823210c104cSIlya Dryomov 		dout("got string token %d val %s\n", token, argstr[0].from);
82459c2be1eSYehuda Sadeh 	} else {
82559c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
82659c2be1eSYehuda Sadeh 	}
82759c2be1eSYehuda Sadeh 
82859c2be1eSYehuda Sadeh 	switch (token) {
829b5584180SIlya Dryomov 	case Opt_queue_depth:
830b5584180SIlya Dryomov 		if (intval < 1) {
831b5584180SIlya Dryomov 			pr_err("queue_depth out of range\n");
832b5584180SIlya Dryomov 			return -EINVAL;
833b5584180SIlya Dryomov 		}
834c300156bSIlya Dryomov 		pctx->opts->queue_depth = intval;
835b5584180SIlya Dryomov 		break;
8360c93e1b7SIlya Dryomov 	case Opt_alloc_size:
8370c93e1b7SIlya Dryomov 		if (intval < 1) {
8380c93e1b7SIlya Dryomov 			pr_err("alloc_size out of range\n");
8390c93e1b7SIlya Dryomov 			return -EINVAL;
8400c93e1b7SIlya Dryomov 		}
8410c93e1b7SIlya Dryomov 		if (!is_power_of_2(intval)) {
8420c93e1b7SIlya Dryomov 			pr_err("alloc_size must be a power of 2\n");
8430c93e1b7SIlya Dryomov 			return -EINVAL;
8440c93e1b7SIlya Dryomov 		}
8450c93e1b7SIlya Dryomov 		pctx->opts->alloc_size = intval;
8460c93e1b7SIlya Dryomov 		break;
84734f55d0bSDongsheng Yang 	case Opt_lock_timeout:
84834f55d0bSDongsheng Yang 		/* 0 is "wait forever" (i.e. infinite timeout) */
84934f55d0bSDongsheng Yang 		if (intval < 0 || intval > INT_MAX / 1000) {
85034f55d0bSDongsheng Yang 			pr_err("lock_timeout out of range\n");
85134f55d0bSDongsheng Yang 			return -EINVAL;
85234f55d0bSDongsheng Yang 		}
853c300156bSIlya Dryomov 		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
85434f55d0bSDongsheng Yang 		break;
855b26c047bSIlya Dryomov 	case Opt_pool_ns:
856b26c047bSIlya Dryomov 		kfree(pctx->spec->pool_ns);
857b26c047bSIlya Dryomov 		pctx->spec->pool_ns = match_strdup(argstr);
858b26c047bSIlya Dryomov 		if (!pctx->spec->pool_ns)
859b26c047bSIlya Dryomov 			return -ENOMEM;
86059c2be1eSYehuda Sadeh 		break;
861cc0538b6SAlex Elder 	case Opt_read_only:
862c300156bSIlya Dryomov 		pctx->opts->read_only = true;
863cc0538b6SAlex Elder 		break;
864cc0538b6SAlex Elder 	case Opt_read_write:
865c300156bSIlya Dryomov 		pctx->opts->read_only = false;
866cc0538b6SAlex Elder 		break;
86780de1912SIlya Dryomov 	case Opt_lock_on_read:
868c300156bSIlya Dryomov 		pctx->opts->lock_on_read = true;
86980de1912SIlya Dryomov 		break;
870e010dd0aSIlya Dryomov 	case Opt_exclusive:
871c300156bSIlya Dryomov 		pctx->opts->exclusive = true;
872e010dd0aSIlya Dryomov 		break;
873d9360540SIlya Dryomov 	case Opt_notrim:
874c300156bSIlya Dryomov 		pctx->opts->trim = false;
875d9360540SIlya Dryomov 		break;
87659c2be1eSYehuda Sadeh 	default:
877210c104cSIlya Dryomov 		/* libceph prints "bad option" msg */
878210c104cSIlya Dryomov 		return -EINVAL;
87959c2be1eSYehuda Sadeh 	}
880210c104cSIlya Dryomov 
88159c2be1eSYehuda Sadeh 	return 0;
88259c2be1eSYehuda Sadeh }
88359c2be1eSYehuda Sadeh 
8846d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8856d2940c8SGuangliang Zhao {
8866d2940c8SGuangliang Zhao 	switch (op_type) {
8876d2940c8SGuangliang Zhao 	case OBJ_OP_READ:
8886d2940c8SGuangliang Zhao 		return "read";
8896d2940c8SGuangliang Zhao 	case OBJ_OP_WRITE:
8906d2940c8SGuangliang Zhao 		return "write";
89190e98c52SGuangliang Zhao 	case OBJ_OP_DISCARD:
89290e98c52SGuangliang Zhao 		return "discard";
8936484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
8946484cbe9SIlya Dryomov 		return "zeroout";
8956d2940c8SGuangliang Zhao 	default:
8966d2940c8SGuangliang Zhao 		return "???";
8976d2940c8SGuangliang Zhao 	}
8986d2940c8SGuangliang Zhao }
8996d2940c8SGuangliang Zhao 
90059c2be1eSYehuda Sadeh /*
901602adf40SYehuda Sadeh  * Destroy ceph client
902d23a4b3fSAlex Elder  *
903432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
904602adf40SYehuda Sadeh  */
905602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
906602adf40SYehuda Sadeh {
907602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
908602adf40SYehuda Sadeh 
90937206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
910cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
911602adf40SYehuda Sadeh 	list_del(&rbdc->node);
912cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
913602adf40SYehuda Sadeh 
914602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
915602adf40SYehuda Sadeh 	kfree(rbdc);
916602adf40SYehuda Sadeh }
917602adf40SYehuda Sadeh 
918602adf40SYehuda Sadeh /*
919602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
920602adf40SYehuda Sadeh  * it.
921602adf40SYehuda Sadeh  */
9229d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
923602adf40SYehuda Sadeh {
924c53d5893SAlex Elder 	if (rbdc)
9259d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
926602adf40SYehuda Sadeh }
927602adf40SYehuda Sadeh 
928dd435855SIlya Dryomov static int wait_for_latest_osdmap(struct ceph_client *client)
929dd435855SIlya Dryomov {
930dd435855SIlya Dryomov 	u64 newest_epoch;
931dd435855SIlya Dryomov 	int ret;
932dd435855SIlya Dryomov 
933dd435855SIlya Dryomov 	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
934dd435855SIlya Dryomov 	if (ret)
935dd435855SIlya Dryomov 		return ret;
936dd435855SIlya Dryomov 
937dd435855SIlya Dryomov 	if (client->osdc.osdmap->epoch >= newest_epoch)
938dd435855SIlya Dryomov 		return 0;
939dd435855SIlya Dryomov 
940dd435855SIlya Dryomov 	ceph_osdc_maybe_request_map(&client->osdc);
941dd435855SIlya Dryomov 	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
942dd435855SIlya Dryomov 				     client->options->mount_timeout);
943dd435855SIlya Dryomov }
944dd435855SIlya Dryomov 
9455feb0d8dSIlya Dryomov /*
9465feb0d8dSIlya Dryomov  * Get a ceph client with specific addr and configuration, if one does
9475feb0d8dSIlya Dryomov  * not exist create it.  Either way, ceph_opts is consumed by this
9485feb0d8dSIlya Dryomov  * function.
9495feb0d8dSIlya Dryomov  */
9505feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
9515feb0d8dSIlya Dryomov {
9525feb0d8dSIlya Dryomov 	struct rbd_client *rbdc;
953dd435855SIlya Dryomov 	int ret;
9545feb0d8dSIlya Dryomov 
9555feb0d8dSIlya Dryomov 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
9565feb0d8dSIlya Dryomov 	rbdc = rbd_client_find(ceph_opts);
957dd435855SIlya Dryomov 	if (rbdc) {
9585feb0d8dSIlya Dryomov 		ceph_destroy_options(ceph_opts);
959dd435855SIlya Dryomov 
960dd435855SIlya Dryomov 		/*
961dd435855SIlya Dryomov 		 * Using an existing client.  Make sure ->pg_pools is up to
962dd435855SIlya Dryomov 		 * date before we look up the pool id in do_rbd_add().
963dd435855SIlya Dryomov 		 */
964dd435855SIlya Dryomov 		ret = wait_for_latest_osdmap(rbdc->client);
965dd435855SIlya Dryomov 		if (ret) {
966dd435855SIlya Dryomov 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
967dd435855SIlya Dryomov 			rbd_put_client(rbdc);
968dd435855SIlya Dryomov 			rbdc = ERR_PTR(ret);
969dd435855SIlya Dryomov 		}
970dd435855SIlya Dryomov 	} else {
9715feb0d8dSIlya Dryomov 		rbdc = rbd_client_create(ceph_opts);
972dd435855SIlya Dryomov 	}
9735feb0d8dSIlya Dryomov 	mutex_unlock(&client_mutex);
9745feb0d8dSIlya Dryomov 
9755feb0d8dSIlya Dryomov 	return rbdc;
9765feb0d8dSIlya Dryomov }
9775feb0d8dSIlya Dryomov 
978a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
979a30b71b9SAlex Elder {
980a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
981a30b71b9SAlex Elder }
982a30b71b9SAlex Elder 
9838e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9848e94af8eSAlex Elder {
985103a150fSAlex Elder 	size_t size;
986103a150fSAlex Elder 	u32 snap_count;
987103a150fSAlex Elder 
988103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
989103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
990103a150fSAlex Elder 		return false;
991103a150fSAlex Elder 
992db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
993db2388b6SAlex Elder 
994db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
995db2388b6SAlex Elder 		return false;
996db2388b6SAlex Elder 
997db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
998db2388b6SAlex Elder 
999db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
1000db2388b6SAlex Elder 		return false;
1001db2388b6SAlex Elder 
1002103a150fSAlex Elder 	/*
1003103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
1004103a150fSAlex Elder 	 * that limits the number of snapshots.
1005103a150fSAlex Elder 	 */
1006103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
1007103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
1008103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
1009103a150fSAlex Elder 		return false;
1010103a150fSAlex Elder 
1011103a150fSAlex Elder 	/*
1012103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
1013103a150fSAlex Elder 	 * header must also be representable in a size_t.
1014103a150fSAlex Elder 	 */
1015103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
1016103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1017103a150fSAlex Elder 		return false;
1018103a150fSAlex Elder 
1019103a150fSAlex Elder 	return true;
10208e94af8eSAlex Elder }
10218e94af8eSAlex Elder 
1022602adf40SYehuda Sadeh /*
10235bc3fb17SIlya Dryomov  * returns the size of an object in the image
10245bc3fb17SIlya Dryomov  */
10255bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
10265bc3fb17SIlya Dryomov {
10275bc3fb17SIlya Dryomov 	return 1U << header->obj_order;
10285bc3fb17SIlya Dryomov }
10295bc3fb17SIlya Dryomov 
1030263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
1031263423f8SIlya Dryomov {
1032263423f8SIlya Dryomov 	if (rbd_dev->header.stripe_unit == 0 ||
1033263423f8SIlya Dryomov 	    rbd_dev->header.stripe_count == 0) {
1034263423f8SIlya Dryomov 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1035263423f8SIlya Dryomov 		rbd_dev->header.stripe_count = 1;
1036263423f8SIlya Dryomov 	}
1037263423f8SIlya Dryomov 
1038263423f8SIlya Dryomov 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1039263423f8SIlya Dryomov 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1040263423f8SIlya Dryomov 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
10417e97332eSIlya Dryomov 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
10427e97332eSIlya Dryomov 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1043263423f8SIlya Dryomov 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1044263423f8SIlya Dryomov }
1045263423f8SIlya Dryomov 
10465bc3fb17SIlya Dryomov /*
1047bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
1048bb23e37aSAlex Elder  * on-disk header.
1049602adf40SYehuda Sadeh  */
1050662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
10514156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
1052602adf40SYehuda Sadeh {
1053662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
1054bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
1055bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
1056bb23e37aSAlex Elder 	char *object_prefix = NULL;
1057bb23e37aSAlex Elder 	char *snap_names = NULL;
1058bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
1059ccece235SAlex Elder 	u32 snap_count;
1060bb23e37aSAlex Elder 	int ret = -ENOMEM;
1061621901d6SAlex Elder 	u32 i;
1062602adf40SYehuda Sadeh 
1063bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
1064103a150fSAlex Elder 
1065bb23e37aSAlex Elder 	if (first_time) {
1066848d796cSIlya Dryomov 		object_prefix = kstrndup(ondisk->object_prefix,
1067848d796cSIlya Dryomov 					 sizeof(ondisk->object_prefix),
1068848d796cSIlya Dryomov 					 GFP_KERNEL);
1069bb23e37aSAlex Elder 		if (!object_prefix)
1070602adf40SYehuda Sadeh 			return -ENOMEM;
1071bb23e37aSAlex Elder 	}
107200f1f36fSAlex Elder 
1073bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
1074d2bb24e5SAlex Elder 
1075602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
1076bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1077bb23e37aSAlex Elder 	if (!snapc)
1078bb23e37aSAlex Elder 		goto out_err;
1079bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1080602adf40SYehuda Sadeh 	if (snap_count) {
1081bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
1082f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1083f785cc1dSAlex Elder 
1084bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
1085621901d6SAlex Elder 
1086f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
1087bb23e37aSAlex Elder 			goto out_2big;
1088bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1089bb23e37aSAlex Elder 		if (!snap_names)
1090602adf40SYehuda Sadeh 			goto out_err;
1091bb23e37aSAlex Elder 
1092bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
109388a25a5fSMarkus Elfring 		snap_sizes = kmalloc_array(snap_count,
109488a25a5fSMarkus Elfring 					   sizeof(*header->snap_sizes),
109588a25a5fSMarkus Elfring 					   GFP_KERNEL);
1096bb23e37aSAlex Elder 		if (!snap_sizes)
1097bb23e37aSAlex Elder 			goto out_err;
1098bb23e37aSAlex Elder 
1099f785cc1dSAlex Elder 		/*
1100bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
1101bb23e37aSAlex Elder 		 * and size.
1102bb23e37aSAlex Elder 		 *
110399a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
1104bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
1105f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
1106f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
1107f785cc1dSAlex Elder 		 */
1108bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1109bb23e37aSAlex Elder 		snaps = ondisk->snaps;
1110bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
1111bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1112bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1113bb23e37aSAlex Elder 		}
1114602adf40SYehuda Sadeh 	}
1115849b4260SAlex Elder 
1116bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
1117bb23e37aSAlex Elder 
1118bb23e37aSAlex Elder 	if (first_time) {
1119bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
1120602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
1121263423f8SIlya Dryomov 		rbd_init_layout(rbd_dev);
1122662518b1SAlex Elder 	} else {
1123662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
1124662518b1SAlex Elder 		kfree(header->snap_names);
1125662518b1SAlex Elder 		kfree(header->snap_sizes);
1126bb23e37aSAlex Elder 	}
11276a52325fSAlex Elder 
1128bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
1129621901d6SAlex Elder 
1130f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
1131bb23e37aSAlex Elder 	header->snapc = snapc;
1132bb23e37aSAlex Elder 	header->snap_names = snap_names;
1133bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
1134468521c1SAlex Elder 
1135602adf40SYehuda Sadeh 	return 0;
1136bb23e37aSAlex Elder out_2big:
1137bb23e37aSAlex Elder 	ret = -EIO;
11386a52325fSAlex Elder out_err:
1139bb23e37aSAlex Elder 	kfree(snap_sizes);
1140bb23e37aSAlex Elder 	kfree(snap_names);
1141bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
1142bb23e37aSAlex Elder 	kfree(object_prefix);
1143ccece235SAlex Elder 
1144bb23e37aSAlex Elder 	return ret;
1145602adf40SYehuda Sadeh }
1146602adf40SYehuda Sadeh 
11479682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
11489682fc6dSAlex Elder {
11499682fc6dSAlex Elder 	const char *snap_name;
11509682fc6dSAlex Elder 
11519682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
11529682fc6dSAlex Elder 
11539682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
11549682fc6dSAlex Elder 
11559682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
11569682fc6dSAlex Elder 	while (which--)
11579682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
11589682fc6dSAlex Elder 
11599682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
11609682fc6dSAlex Elder }
11619682fc6dSAlex Elder 
116230d1cff8SAlex Elder /*
116330d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
116430d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
116530d1cff8SAlex Elder  */
116630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
116730d1cff8SAlex Elder {
116830d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
116930d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
117030d1cff8SAlex Elder 
117130d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
117230d1cff8SAlex Elder 		return 1;
117330d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
117430d1cff8SAlex Elder }
117530d1cff8SAlex Elder 
117630d1cff8SAlex Elder /*
117730d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
117830d1cff8SAlex Elder  * present.
117930d1cff8SAlex Elder  *
118030d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
118130d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
118230d1cff8SAlex Elder  *
118330d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
118430d1cff8SAlex Elder  * reverse order, highest snapshot id first.
118530d1cff8SAlex Elder  */
11869682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11879682fc6dSAlex Elder {
11889682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
118930d1cff8SAlex Elder 	u64 *found;
11909682fc6dSAlex Elder 
119130d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
119230d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
11939682fc6dSAlex Elder 
119430d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11959682fc6dSAlex Elder }
11969682fc6dSAlex Elder 
11972ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
11982ad3d716SAlex Elder 					u64 snap_id)
119954cac61fSAlex Elder {
120054cac61fSAlex Elder 	u32 which;
1201da6a6b63SJosh Durgin 	const char *snap_name;
120254cac61fSAlex Elder 
120354cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
120454cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1205da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
120654cac61fSAlex Elder 
1207da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1208da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
120954cac61fSAlex Elder }
121054cac61fSAlex Elder 
12119e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
12129e15b77dSAlex Elder {
12139e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
12149e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
12159e15b77dSAlex Elder 
121654cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
121754cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
121854cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
12199e15b77dSAlex Elder 
122054cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
12219e15b77dSAlex Elder }
12229e15b77dSAlex Elder 
12232ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
12242ad3d716SAlex Elder 				u64 *snap_size)
1225602adf40SYehuda Sadeh {
12262ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12272ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12282ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
12292ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12302ad3d716SAlex Elder 		u32 which;
123100f1f36fSAlex Elder 
12322ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
12332ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
12342ad3d716SAlex Elder 			return -ENOENT;
123500f1f36fSAlex Elder 
12362ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
12372ad3d716SAlex Elder 	} else {
12382ad3d716SAlex Elder 		u64 size = 0;
12392ad3d716SAlex Elder 		int ret;
12402ad3d716SAlex Elder 
12412ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
12422ad3d716SAlex Elder 		if (ret)
12432ad3d716SAlex Elder 			return ret;
12442ad3d716SAlex Elder 
12452ad3d716SAlex Elder 		*snap_size = size;
12462ad3d716SAlex Elder 	}
12472ad3d716SAlex Elder 	return 0;
12482ad3d716SAlex Elder }
12492ad3d716SAlex Elder 
12502ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
12512ad3d716SAlex Elder 			u64 *snap_features)
12522ad3d716SAlex Elder {
12532ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
12542ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
12552ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
12562ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
12572ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
12582ad3d716SAlex Elder 	} else {
12592ad3d716SAlex Elder 		u64 features = 0;
12602ad3d716SAlex Elder 		int ret;
12612ad3d716SAlex Elder 
12622ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
12632ad3d716SAlex Elder 		if (ret)
12642ad3d716SAlex Elder 			return ret;
12652ad3d716SAlex Elder 
12662ad3d716SAlex Elder 		*snap_features = features;
12672ad3d716SAlex Elder 	}
12682ad3d716SAlex Elder 	return 0;
126900f1f36fSAlex Elder }
1270602adf40SYehuda Sadeh 
1271d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1272602adf40SYehuda Sadeh {
12738f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
12742ad3d716SAlex Elder 	u64 size = 0;
12752ad3d716SAlex Elder 	u64 features = 0;
12762ad3d716SAlex Elder 	int ret;
12778b0241f8SAlex Elder 
12782ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
12792ad3d716SAlex Elder 	if (ret)
12802ad3d716SAlex Elder 		return ret;
12812ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
12822ad3d716SAlex Elder 	if (ret)
12832ad3d716SAlex Elder 		return ret;
12842ad3d716SAlex Elder 
12852ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
12862ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
12872ad3d716SAlex Elder 
12888b0241f8SAlex Elder 	return 0;
1289602adf40SYehuda Sadeh }
1290602adf40SYehuda Sadeh 
1291d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1292d1cf5788SAlex Elder {
1293d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1294d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1295200a6a8bSAlex Elder }
1296200a6a8bSAlex Elder 
12975359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv)
129865ccfe21SAlex Elder {
1299602adf40SYehuda Sadeh 	void *buf;
13005359a17dSIlya Dryomov 	unsigned long flags;
1301602adf40SYehuda Sadeh 
13025359a17dSIlya Dryomov 	buf = bvec_kmap_irq(bv, &flags);
13035359a17dSIlya Dryomov 	memset(buf, 0, bv->bv_len);
13045359a17dSIlya Dryomov 	flush_dcache_page(bv->bv_page);
130585b5aaa6SDan Carpenter 	bvec_kunmap_irq(buf, &flags);
1306602adf40SYehuda Sadeh }
1307602adf40SYehuda Sadeh 
13085359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1309b9434c5bSAlex Elder {
13105359a17dSIlya Dryomov 	struct ceph_bio_iter it = *bio_pos;
1311b9434c5bSAlex Elder 
13125359a17dSIlya Dryomov 	ceph_bio_iter_advance(&it, off);
13135359a17dSIlya Dryomov 	ceph_bio_iter_advance_step(&it, bytes, ({
13145359a17dSIlya Dryomov 		zero_bvec(&bv);
13155359a17dSIlya Dryomov 	}));
1316b9434c5bSAlex Elder }
1317b9434c5bSAlex Elder 
13187e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1319602adf40SYehuda Sadeh {
13207e07efb1SIlya Dryomov 	struct ceph_bvec_iter it = *bvec_pos;
1321602adf40SYehuda Sadeh 
13227e07efb1SIlya Dryomov 	ceph_bvec_iter_advance(&it, off);
13237e07efb1SIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
13247e07efb1SIlya Dryomov 		zero_bvec(&bv);
13257e07efb1SIlya Dryomov 	}));
1326602adf40SYehuda Sadeh }
1327602adf40SYehuda Sadeh 
1328f7760dadSAlex Elder /*
13293da691bfSIlya Dryomov  * Zero a range in @obj_req data buffer defined by a bio (list) or
1330afb97888SIlya Dryomov  * (private) bio_vec array.
1331f7760dadSAlex Elder  *
13323da691bfSIlya Dryomov  * @off is relative to the start of the data buffer.
1333f7760dadSAlex Elder  */
13343da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
13353da691bfSIlya Dryomov 			       u32 bytes)
1336f7760dadSAlex Elder {
1337ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
13383da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
13393da691bfSIlya Dryomov 		zero_bios(&obj_req->bio_pos, off, bytes);
13403da691bfSIlya Dryomov 		break;
13413da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1342afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
13433da691bfSIlya Dryomov 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
13443da691bfSIlya Dryomov 		break;
13453da691bfSIlya Dryomov 	default:
13463da691bfSIlya Dryomov 		rbd_assert(0);
1347f5400b7aSAlex Elder 	}
1348bf0d5f50SAlex Elder }
1349bf0d5f50SAlex Elder 
1350bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1351bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1352bf0d5f50SAlex Elder {
1353bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
135437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
13552c935bc5SPeter Zijlstra 		kref_read(&obj_request->kref));
1356bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1357bf0d5f50SAlex Elder }
1358bf0d5f50SAlex Elder 
13590f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
13600f2d5be7SAlex Elder {
13610f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
13622c935bc5SPeter Zijlstra 	     kref_read(&img_request->kref));
13630f2d5be7SAlex Elder 	kref_get(&img_request->kref);
13640f2d5be7SAlex Elder }
13650f2d5be7SAlex Elder 
1366bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1367bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1368bf0d5f50SAlex Elder {
1369bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
137037206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
13712c935bc5SPeter Zijlstra 		kref_read(&img_request->kref));
1372bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1373bf0d5f50SAlex Elder }
1374bf0d5f50SAlex Elder 
1375bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1376bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1377bf0d5f50SAlex Elder {
137825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
137925dcf954SAlex Elder 
1380b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1381bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
13827114edacSIlya Dryomov 	img_request->pending_count++;
138315961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1384bf0d5f50SAlex Elder }
1385bf0d5f50SAlex Elder 
1386bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1387bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1388bf0d5f50SAlex Elder {
138915961b44SIlya Dryomov 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
139043df3d35SIlya Dryomov 	list_del(&obj_request->ex.oe_item);
1391bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1392bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1393bf0d5f50SAlex Elder }
1394bf0d5f50SAlex Elder 
1395980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
1396bf0d5f50SAlex Elder {
1397980917fcSIlya Dryomov 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1398980917fcSIlya Dryomov 
1399a90bb0c1SIlya Dryomov 	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
140043df3d35SIlya Dryomov 	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
140143df3d35SIlya Dryomov 	     obj_request->ex.oe_len, osd_req);
1402980917fcSIlya Dryomov 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1403bf0d5f50SAlex Elder }
1404bf0d5f50SAlex Elder 
14050c425248SAlex Elder /*
14060c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
14070c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
14080c425248SAlex Elder  * and currently never change thereafter.
14090c425248SAlex Elder  */
1410d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1411d0b2e944SAlex Elder {
1412d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1413d0b2e944SAlex Elder 	smp_mb();
1414d0b2e944SAlex Elder }
1415d0b2e944SAlex Elder 
1416a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1417a2acd00eSAlex Elder {
1418a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1419a2acd00eSAlex Elder 	smp_mb();
1420a2acd00eSAlex Elder }
1421a2acd00eSAlex Elder 
1422d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1423d0b2e944SAlex Elder {
1424d0b2e944SAlex Elder 	smp_mb();
1425d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1426d0b2e944SAlex Elder }
1427d0b2e944SAlex Elder 
14283da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
14293b434a2aSJosh Durgin {
14303da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
14313da691bfSIlya Dryomov 
143243df3d35SIlya Dryomov 	return !obj_req->ex.oe_off &&
143343df3d35SIlya Dryomov 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
14343b434a2aSJosh Durgin }
14353b434a2aSJosh Durgin 
14363da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
14376e2a4505SAlex Elder {
14383da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1439b9434c5bSAlex Elder 
144043df3d35SIlya Dryomov 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
14413da691bfSIlya Dryomov 					rbd_dev->layout.object_size;
14426e2a4505SAlex Elder }
14436e2a4505SAlex Elder 
144413488d53SIlya Dryomov /*
144513488d53SIlya Dryomov  * Must be called after rbd_obj_calc_img_extents().
144613488d53SIlya Dryomov  */
144713488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
144813488d53SIlya Dryomov {
144913488d53SIlya Dryomov 	if (!obj_req->num_img_extents ||
14509b17eb2cSIlya Dryomov 	    (rbd_obj_is_entire(obj_req) &&
14519b17eb2cSIlya Dryomov 	     !obj_req->img_request->snapc->num_snaps))
145213488d53SIlya Dryomov 		return false;
145313488d53SIlya Dryomov 
145413488d53SIlya Dryomov 	return true;
145513488d53SIlya Dryomov }
145613488d53SIlya Dryomov 
145786bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1458bf0d5f50SAlex Elder {
145986bd7998SIlya Dryomov 	return ceph_file_extents_bytes(obj_req->img_extents,
146086bd7998SIlya Dryomov 				       obj_req->num_img_extents);
1461bf0d5f50SAlex Elder }
1462bf0d5f50SAlex Elder 
14633da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
14640dcc685eSIlya Dryomov {
14659bb0248dSIlya Dryomov 	switch (img_req->op_type) {
14663da691bfSIlya Dryomov 	case OBJ_OP_READ:
14673da691bfSIlya Dryomov 		return false;
14683da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
14693da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
14706484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
14713da691bfSIlya Dryomov 		return true;
14723da691bfSIlya Dryomov 	default:
1473c6244b3bSArnd Bergmann 		BUG();
14740dcc685eSIlya Dryomov 	}
14750dcc685eSIlya Dryomov }
14760dcc685eSIlya Dryomov 
14773da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
14782761713dSIlya Dryomov 
147985e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1480bf0d5f50SAlex Elder {
14813da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1482bf0d5f50SAlex Elder 
14833da691bfSIlya Dryomov 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
14843da691bfSIlya Dryomov 	     osd_req->r_result, obj_req);
14853da691bfSIlya Dryomov 	rbd_assert(osd_req == obj_req->osd_req);
1486bf0d5f50SAlex Elder 
14873da691bfSIlya Dryomov 	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
14883da691bfSIlya Dryomov 	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
14893da691bfSIlya Dryomov 		obj_req->xferred = osd_req->r_result;
14903da691bfSIlya Dryomov 	else
1491c47f9371SAlex Elder 		/*
14923da691bfSIlya Dryomov 		 * Writes aren't allowed to return a data payload.  In some
14933da691bfSIlya Dryomov 		 * guarded write cases (e.g. stat + zero on an empty object)
14943da691bfSIlya Dryomov 		 * a stat response makes it through, but we don't care.
1495c47f9371SAlex Elder 		 */
14963da691bfSIlya Dryomov 		obj_req->xferred = 0;
14970ccd5926SIlya Dryomov 
14983da691bfSIlya Dryomov 	rbd_obj_handle_request(obj_req);
1499bf0d5f50SAlex Elder }
1500bf0d5f50SAlex Elder 
15019d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1502430c28c3SAlex Elder {
15038c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1504430c28c3SAlex Elder 
1505a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
15067c84883aSIlya Dryomov 	osd_req->r_snapid = obj_request->img_request->snap_id;
15079d4df01fSAlex Elder }
15089d4df01fSAlex Elder 
15099d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
15109d4df01fSAlex Elder {
15119d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15129d4df01fSAlex Elder 
1513a162b308SIlya Dryomov 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1514fac02ddfSArnd Bergmann 	ktime_get_real_ts64(&osd_req->r_mtime);
151543df3d35SIlya Dryomov 	osd_req->r_data_offset = obj_request->ex.oe_off;
1516430c28c3SAlex Elder }
1517430c28c3SAlex Elder 
1518bc81207eSIlya Dryomov static struct ceph_osd_request *
1519e28eded5SIlya Dryomov __rbd_osd_req_create(struct rbd_obj_request *obj_req,
1520e28eded5SIlya Dryomov 		     struct ceph_snap_context *snapc, unsigned int num_ops)
1521bc81207eSIlya Dryomov {
1522e28eded5SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1523bc81207eSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1524bc81207eSIlya Dryomov 	struct ceph_osd_request *req;
1525a90bb0c1SIlya Dryomov 	const char *name_format = rbd_dev->image_format == 1 ?
1526a90bb0c1SIlya Dryomov 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1527bc81207eSIlya Dryomov 
1528e28eded5SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1529bc81207eSIlya Dryomov 	if (!req)
1530bc81207eSIlya Dryomov 		return NULL;
1531bc81207eSIlya Dryomov 
1532bc81207eSIlya Dryomov 	req->r_callback = rbd_osd_req_callback;
1533a162b308SIlya Dryomov 	req->r_priv = obj_req;
1534bc81207eSIlya Dryomov 
1535b26c047bSIlya Dryomov 	/*
1536b26c047bSIlya Dryomov 	 * Data objects may be stored in a separate pool, but always in
1537b26c047bSIlya Dryomov 	 * the same namespace in that pool as the header in its pool.
1538b26c047bSIlya Dryomov 	 */
1539b26c047bSIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1540bc81207eSIlya Dryomov 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1541b26c047bSIlya Dryomov 
1542a90bb0c1SIlya Dryomov 	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
154343df3d35SIlya Dryomov 			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
1544bc81207eSIlya Dryomov 		goto err_req;
1545bc81207eSIlya Dryomov 
1546bc81207eSIlya Dryomov 	return req;
1547bc81207eSIlya Dryomov 
1548bc81207eSIlya Dryomov err_req:
1549bc81207eSIlya Dryomov 	ceph_osdc_put_request(req);
1550bc81207eSIlya Dryomov 	return NULL;
1551bc81207eSIlya Dryomov }
1552bc81207eSIlya Dryomov 
1553e28eded5SIlya Dryomov static struct ceph_osd_request *
1554e28eded5SIlya Dryomov rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
1555e28eded5SIlya Dryomov {
1556e28eded5SIlya Dryomov 	return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
1557e28eded5SIlya Dryomov 				    num_ops);
1558e28eded5SIlya Dryomov }
1559e28eded5SIlya Dryomov 
/* Release @osd_req by dropping a reference via ceph_osdc_put_request(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1564bf0d5f50SAlex Elder 
1565ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1566bf0d5f50SAlex Elder {
1567bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1568bf0d5f50SAlex Elder 
15695a60e876SIlya Dryomov 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
15706c696d85SIlya Dryomov 	if (!obj_request)
1571f907ad55SAlex Elder 		return NULL;
1572f907ad55SAlex Elder 
157343df3d35SIlya Dryomov 	ceph_object_extent_init(&obj_request->ex);
1574bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1575bf0d5f50SAlex Elder 
157667e2b652SIlya Dryomov 	dout("%s %p\n", __func__, obj_request);
1577bf0d5f50SAlex Elder 	return obj_request;
1578bf0d5f50SAlex Elder }
1579bf0d5f50SAlex Elder 
1580bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1581bf0d5f50SAlex Elder {
1582bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
15837e07efb1SIlya Dryomov 	u32 i;
1584bf0d5f50SAlex Elder 
1585bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1586bf0d5f50SAlex Elder 
158737206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
158837206ee5SAlex Elder 
1589bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1590bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1591bf0d5f50SAlex Elder 
1592ecc633caSIlya Dryomov 	switch (obj_request->img_request->data_type) {
15939969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1594bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
15957e07efb1SIlya Dryomov 	case OBJ_REQUEST_BVECS:
15965359a17dSIlya Dryomov 		break;		/* Nothing to do */
1597afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
1598afb97888SIlya Dryomov 		kfree(obj_request->bvec_pos.bvecs);
1599bf0d5f50SAlex Elder 		break;
16007e07efb1SIlya Dryomov 	default:
16017e07efb1SIlya Dryomov 		rbd_assert(0);
1602bf0d5f50SAlex Elder 	}
1603bf0d5f50SAlex Elder 
160486bd7998SIlya Dryomov 	kfree(obj_request->img_extents);
16057e07efb1SIlya Dryomov 	if (obj_request->copyup_bvecs) {
16067e07efb1SIlya Dryomov 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
16077e07efb1SIlya Dryomov 			if (obj_request->copyup_bvecs[i].bv_page)
16087e07efb1SIlya Dryomov 				__free_page(obj_request->copyup_bvecs[i].bv_page);
16097e07efb1SIlya Dryomov 		}
16107e07efb1SIlya Dryomov 		kfree(obj_request->copyup_bvecs);
1611bf0d5f50SAlex Elder 	}
1612bf0d5f50SAlex Elder 
1613868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1614bf0d5f50SAlex Elder }
1615bf0d5f50SAlex Elder 
1616fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1617fb65d228SAlex Elder 
1618fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1619fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1620fb65d228SAlex Elder {
1621fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1622fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1623fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1624fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1625fb65d228SAlex Elder }
1626fb65d228SAlex Elder 
1627bf0d5f50SAlex Elder /*
1628a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1629a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1630a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1631a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1632a2acd00eSAlex Elder  */
1633a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1634a2acd00eSAlex Elder {
1635a2acd00eSAlex Elder 	int counter;
1636a2acd00eSAlex Elder 
1637a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1638a2acd00eSAlex Elder 		return;
1639a2acd00eSAlex Elder 
1640a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1641a2acd00eSAlex Elder 	if (counter > 0)
1642a2acd00eSAlex Elder 		return;
1643a2acd00eSAlex Elder 
1644a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1645a2acd00eSAlex Elder 
1646a2acd00eSAlex Elder 	if (!counter)
1647a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1648a2acd00eSAlex Elder 	else
16499584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference underflow");
1650a2acd00eSAlex Elder }
1651a2acd00eSAlex Elder 
1652a2acd00eSAlex Elder /*
1653a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1654a2acd00eSAlex Elder  * parent.
1655a2acd00eSAlex Elder  *
1656a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1657a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1658a2acd00eSAlex Elder  * false otherwise.
1659a2acd00eSAlex Elder  */
1660a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1661a2acd00eSAlex Elder {
1662ae43e9d0SIlya Dryomov 	int counter = 0;
1663a2acd00eSAlex Elder 
1664a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1665a2acd00eSAlex Elder 		return false;
1666a2acd00eSAlex Elder 
1667ae43e9d0SIlya Dryomov 	down_read(&rbd_dev->header_rwsem);
1668ae43e9d0SIlya Dryomov 	if (rbd_dev->parent_overlap)
1669a2acd00eSAlex Elder 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1670ae43e9d0SIlya Dryomov 	up_read(&rbd_dev->header_rwsem);
1671a2acd00eSAlex Elder 
1672a2acd00eSAlex Elder 	if (counter < 0)
16739584d508SIlya Dryomov 		rbd_warn(rbd_dev, "parent reference overflow");
1674a2acd00eSAlex Elder 
1675ae43e9d0SIlya Dryomov 	return counter > 0;
1676a2acd00eSAlex Elder }
1677a2acd00eSAlex Elder 
1678bf0d5f50SAlex Elder /*
1679bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1680bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1681bf0d5f50SAlex Elder  * (if there is one).
1682bf0d5f50SAlex Elder  */
1683cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1684cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
16856d2940c8SGuangliang Zhao 					enum obj_operation_type op_type,
16864e752f0aSJosh Durgin 					struct ceph_snap_context *snapc)
1687bf0d5f50SAlex Elder {
1688bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1689bf0d5f50SAlex Elder 
1690a0c5895bSIlya Dryomov 	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1691bf0d5f50SAlex Elder 	if (!img_request)
1692bf0d5f50SAlex Elder 		return NULL;
1693bf0d5f50SAlex Elder 
1694bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
16959bb0248dSIlya Dryomov 	img_request->op_type = op_type;
16969bb0248dSIlya Dryomov 	if (!rbd_img_is_write(img_request))
1697bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
16989bb0248dSIlya Dryomov 	else
16999bb0248dSIlya Dryomov 		img_request->snapc = snapc;
17009bb0248dSIlya Dryomov 
1701a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1702d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1703a0c5895bSIlya Dryomov 
1704bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
170543df3d35SIlya Dryomov 	INIT_LIST_HEAD(&img_request->object_extents);
1706bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1707bf0d5f50SAlex Elder 
1708dfd9875fSIlya Dryomov 	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1709dfd9875fSIlya Dryomov 	     obj_op_name(op_type), img_request);
1710bf0d5f50SAlex Elder 	return img_request;
1711bf0d5f50SAlex Elder }
1712bf0d5f50SAlex Elder 
1713bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1714bf0d5f50SAlex Elder {
1715bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1716bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1717bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1718bf0d5f50SAlex Elder 
1719bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1720bf0d5f50SAlex Elder 
172137206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
172237206ee5SAlex Elder 
1723bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1724bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
1725bf0d5f50SAlex Elder 
1726a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
1727a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
1728a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
1729a2acd00eSAlex Elder 	}
1730a2acd00eSAlex Elder 
17319bb0248dSIlya Dryomov 	if (rbd_img_is_write(img_request))
1732812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1733bf0d5f50SAlex Elder 
17341c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1735bf0d5f50SAlex Elder }
1736bf0d5f50SAlex Elder 
173786bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents,
173886bd7998SIlya Dryomov 			  u32 *num_img_extents, u64 overlap)
1739e93f3152SAlex Elder {
174086bd7998SIlya Dryomov 	u32 cnt = *num_img_extents;
1741e93f3152SAlex Elder 
174286bd7998SIlya Dryomov 	/* drop extents completely beyond the overlap */
174386bd7998SIlya Dryomov 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
174486bd7998SIlya Dryomov 		cnt--;
1745e93f3152SAlex Elder 
174686bd7998SIlya Dryomov 	if (cnt) {
174786bd7998SIlya Dryomov 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
1748e93f3152SAlex Elder 
174986bd7998SIlya Dryomov 		/* trim final overlapping extent */
175086bd7998SIlya Dryomov 		if (ex->fe_off + ex->fe_len > overlap)
175186bd7998SIlya Dryomov 			ex->fe_len = overlap - ex->fe_off;
1752e93f3152SAlex Elder 	}
1753e93f3152SAlex Elder 
175486bd7998SIlya Dryomov 	*num_img_extents = cnt;
175586bd7998SIlya Dryomov }
175686bd7998SIlya Dryomov 
175786bd7998SIlya Dryomov /*
175886bd7998SIlya Dryomov  * Determine the byte range(s) covered by either just the object extent
175986bd7998SIlya Dryomov  * or the entire object in the parent image.
176086bd7998SIlya Dryomov  */
176186bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
176286bd7998SIlya Dryomov 				    bool entire)
1763e93f3152SAlex Elder {
176486bd7998SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1765c5b5ef6cSAlex Elder 	int ret;
1766c5b5ef6cSAlex Elder 
176786bd7998SIlya Dryomov 	if (!rbd_dev->parent_overlap)
176886bd7998SIlya Dryomov 		return 0;
176986bd7998SIlya Dryomov 
177086bd7998SIlya Dryomov 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
177186bd7998SIlya Dryomov 				  entire ? 0 : obj_req->ex.oe_off,
177286bd7998SIlya Dryomov 				  entire ? rbd_dev->layout.object_size :
177386bd7998SIlya Dryomov 							obj_req->ex.oe_len,
177486bd7998SIlya Dryomov 				  &obj_req->img_extents,
177586bd7998SIlya Dryomov 				  &obj_req->num_img_extents);
177686bd7998SIlya Dryomov 	if (ret)
177786bd7998SIlya Dryomov 		return ret;
177886bd7998SIlya Dryomov 
177986bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
178086bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
178186bd7998SIlya Dryomov 	return 0;
178286bd7998SIlya Dryomov }
178386bd7998SIlya Dryomov 
17843da691bfSIlya Dryomov static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
17853da691bfSIlya Dryomov {
1786ecc633caSIlya Dryomov 	switch (obj_req->img_request->data_type) {
17873da691bfSIlya Dryomov 	case OBJ_REQUEST_BIO:
17883da691bfSIlya Dryomov 		osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
17893da691bfSIlya Dryomov 					       &obj_req->bio_pos,
179043df3d35SIlya Dryomov 					       obj_req->ex.oe_len);
17913da691bfSIlya Dryomov 		break;
17923da691bfSIlya Dryomov 	case OBJ_REQUEST_BVECS:
1793afb97888SIlya Dryomov 	case OBJ_REQUEST_OWN_BVECS:
17943da691bfSIlya Dryomov 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
179543df3d35SIlya Dryomov 							obj_req->ex.oe_len);
1796afb97888SIlya Dryomov 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
17973da691bfSIlya Dryomov 		osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
17983da691bfSIlya Dryomov 						    &obj_req->bvec_pos);
17993da691bfSIlya Dryomov 		break;
18003da691bfSIlya Dryomov 	default:
18013da691bfSIlya Dryomov 		rbd_assert(0);
18023da691bfSIlya Dryomov 	}
18033da691bfSIlya Dryomov }
18043da691bfSIlya Dryomov 
18053da691bfSIlya Dryomov static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
18063da691bfSIlya Dryomov {
1807e28eded5SIlya Dryomov 	obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
18083da691bfSIlya Dryomov 	if (!obj_req->osd_req)
1809710214e3SIlya Dryomov 		return -ENOMEM;
1810710214e3SIlya Dryomov 
18113da691bfSIlya Dryomov 	osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
181243df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
18133da691bfSIlya Dryomov 	rbd_osd_req_setup_data(obj_req, 0);
1814a90bb0c1SIlya Dryomov 
18153da691bfSIlya Dryomov 	rbd_osd_req_format_read(obj_req);
18163da691bfSIlya Dryomov 	return 0;
1817710214e3SIlya Dryomov }
1818710214e3SIlya Dryomov 
18193da691bfSIlya Dryomov static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
18203da691bfSIlya Dryomov 				unsigned int which)
18213da691bfSIlya Dryomov {
18223da691bfSIlya Dryomov 	struct page **pages;
18233da691bfSIlya Dryomov 
1824c5b5ef6cSAlex Elder 	/*
1825c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
1826c5b5ef6cSAlex Elder 	 *     le64 length;
1827c5b5ef6cSAlex Elder 	 *     struct {
1828c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
1829c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
1830c5b5ef6cSAlex Elder 	 *     } mtime;
1831c5b5ef6cSAlex Elder 	 */
18323da691bfSIlya Dryomov 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
18333da691bfSIlya Dryomov 	if (IS_ERR(pages))
18343da691bfSIlya Dryomov 		return PTR_ERR(pages);
18353da691bfSIlya Dryomov 
18363da691bfSIlya Dryomov 	osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
18373da691bfSIlya Dryomov 	osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
18383da691bfSIlya Dryomov 				     8 + sizeof(struct ceph_timespec),
18393da691bfSIlya Dryomov 				     0, false, true);
18403da691bfSIlya Dryomov 	return 0;
1841710214e3SIlya Dryomov }
1842c5b5ef6cSAlex Elder 
/*
 * Number of OSD ops needed for the write itself: one setallochint op
 * plus one write/writefull op.  @obj_req is not consulted.
 */
static int count_write_ops(struct rbd_obj_request *obj_req)
{
	enum { ALLOC_HINT_OPS = 1, DATA_OPS = 1 };

	return ALLOC_HINT_OPS + DATA_OPS;
}
184713488d53SIlya Dryomov 
18483da691bfSIlya Dryomov static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
18493da691bfSIlya Dryomov 				  unsigned int which)
18503da691bfSIlya Dryomov {
18513da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
18523da691bfSIlya Dryomov 	u16 opcode;
1853c5b5ef6cSAlex Elder 
18543da691bfSIlya Dryomov 	osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
18553da691bfSIlya Dryomov 				   rbd_dev->layout.object_size,
18563da691bfSIlya Dryomov 				   rbd_dev->layout.object_size);
1857c5b5ef6cSAlex Elder 
18583da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req))
18593da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITEFULL;
18603da691bfSIlya Dryomov 	else
18613da691bfSIlya Dryomov 		opcode = CEPH_OSD_OP_WRITE;
1862c5b5ef6cSAlex Elder 
18633da691bfSIlya Dryomov 	osd_req_op_extent_init(obj_req->osd_req, which, opcode,
186443df3d35SIlya Dryomov 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
18653da691bfSIlya Dryomov 	rbd_osd_req_setup_data(obj_req, which++);
18663da691bfSIlya Dryomov 
18673da691bfSIlya Dryomov 	rbd_assert(which == obj_req->osd_req->r_num_ops);
18683da691bfSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
18693da691bfSIlya Dryomov }
18703da691bfSIlya Dryomov 
18713da691bfSIlya Dryomov static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
18723da691bfSIlya Dryomov {
18733da691bfSIlya Dryomov 	unsigned int num_osd_ops, which = 0;
187413488d53SIlya Dryomov 	bool need_guard;
18753da691bfSIlya Dryomov 	int ret;
18763da691bfSIlya Dryomov 
187786bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
187886bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
187986bd7998SIlya Dryomov 	if (ret)
188086bd7998SIlya Dryomov 		return ret;
188186bd7998SIlya Dryomov 
188213488d53SIlya Dryomov 	need_guard = rbd_obj_copyup_enabled(obj_req);
188313488d53SIlya Dryomov 	num_osd_ops = need_guard + count_write_ops(obj_req);
18843da691bfSIlya Dryomov 
1885a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
18863da691bfSIlya Dryomov 	if (!obj_req->osd_req)
18873da691bfSIlya Dryomov 		return -ENOMEM;
18883da691bfSIlya Dryomov 
188913488d53SIlya Dryomov 	if (need_guard) {
18903da691bfSIlya Dryomov 		ret = __rbd_obj_setup_stat(obj_req, which++);
18913da691bfSIlya Dryomov 		if (ret)
1892c5b5ef6cSAlex Elder 			return ret;
189313488d53SIlya Dryomov 
189413488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
189513488d53SIlya Dryomov 	} else {
189613488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1897c5b5ef6cSAlex Elder 	}
1898c5b5ef6cSAlex Elder 
18993da691bfSIlya Dryomov 	__rbd_obj_setup_write(obj_req, which);
19003da691bfSIlya Dryomov 	return 0;
190170d045f6SIlya Dryomov }
190270d045f6SIlya Dryomov 
19036484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
19046484cbe9SIlya Dryomov {
19056484cbe9SIlya Dryomov 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
19066484cbe9SIlya Dryomov 					  CEPH_OSD_OP_ZERO;
19076484cbe9SIlya Dryomov }
19086484cbe9SIlya Dryomov 
19096484cbe9SIlya Dryomov static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
19106484cbe9SIlya Dryomov {
19110c93e1b7SIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
19120c93e1b7SIlya Dryomov 	u64 off = obj_req->ex.oe_off;
19130c93e1b7SIlya Dryomov 	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
19146484cbe9SIlya Dryomov 	int ret;
19156484cbe9SIlya Dryomov 
19160c93e1b7SIlya Dryomov 	/*
19170c93e1b7SIlya Dryomov 	 * Align the range to alloc_size boundary and punt on discards
19180c93e1b7SIlya Dryomov 	 * that are too small to free up any space.
19190c93e1b7SIlya Dryomov 	 *
19200c93e1b7SIlya Dryomov 	 * alloc_size == object_size && is_tail() is a special case for
19210c93e1b7SIlya Dryomov 	 * filestore with filestore_punch_hole = false, needed to allow
19220c93e1b7SIlya Dryomov 	 * truncate (in addition to delete).
19230c93e1b7SIlya Dryomov 	 */
19240c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
19250c93e1b7SIlya Dryomov 	    !rbd_obj_is_tail(obj_req)) {
19260c93e1b7SIlya Dryomov 		off = round_up(off, rbd_dev->opts->alloc_size);
19270c93e1b7SIlya Dryomov 		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
19280c93e1b7SIlya Dryomov 		if (off >= next_off)
19290c93e1b7SIlya Dryomov 			return 1;
19300c93e1b7SIlya Dryomov 	}
19310c93e1b7SIlya Dryomov 
19326484cbe9SIlya Dryomov 	/* reverse map the entire object onto the parent */
19336484cbe9SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
19346484cbe9SIlya Dryomov 	if (ret)
19356484cbe9SIlya Dryomov 		return ret;
19366484cbe9SIlya Dryomov 
19376484cbe9SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
19386484cbe9SIlya Dryomov 	if (!obj_req->osd_req)
19396484cbe9SIlya Dryomov 		return -ENOMEM;
19406484cbe9SIlya Dryomov 
19416484cbe9SIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
19426484cbe9SIlya Dryomov 		osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
19436484cbe9SIlya Dryomov 	} else {
19440c93e1b7SIlya Dryomov 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
19450c93e1b7SIlya Dryomov 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
19460c93e1b7SIlya Dryomov 		     off, next_off - off);
19476484cbe9SIlya Dryomov 		osd_req_op_extent_init(obj_req->osd_req, 0,
19486484cbe9SIlya Dryomov 				       truncate_or_zero_opcode(obj_req),
19490c93e1b7SIlya Dryomov 				       off, next_off - off, 0, 0);
19506484cbe9SIlya Dryomov 	}
19516484cbe9SIlya Dryomov 
19526484cbe9SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_FLAT;
19536484cbe9SIlya Dryomov 	rbd_osd_req_format_write(obj_req);
19546484cbe9SIlya Dryomov 	return 0;
19556484cbe9SIlya Dryomov }
19566484cbe9SIlya Dryomov 
195713488d53SIlya Dryomov static int count_zeroout_ops(struct rbd_obj_request *obj_req)
195813488d53SIlya Dryomov {
195913488d53SIlya Dryomov 	int num_osd_ops;
196013488d53SIlya Dryomov 
19619b17eb2cSIlya Dryomov 	if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
19629b17eb2cSIlya Dryomov 	    !rbd_obj_copyup_enabled(obj_req))
196313488d53SIlya Dryomov 		num_osd_ops = 2; /* create + truncate */
196413488d53SIlya Dryomov 	else
196513488d53SIlya Dryomov 		num_osd_ops = 1; /* delete/truncate/zero */
196613488d53SIlya Dryomov 
196713488d53SIlya Dryomov 	return num_osd_ops;
196813488d53SIlya Dryomov }
196913488d53SIlya Dryomov 
19706484cbe9SIlya Dryomov static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
19713da691bfSIlya Dryomov 				    unsigned int which)
197270d045f6SIlya Dryomov {
19733da691bfSIlya Dryomov 	u16 opcode;
1974058aa991SIlya Dryomov 
19753da691bfSIlya Dryomov 	if (rbd_obj_is_entire(obj_req)) {
197686bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
19779b17eb2cSIlya Dryomov 			if (!rbd_obj_copyup_enabled(obj_req))
19782bb1e56eSIlya Dryomov 				osd_req_op_init(obj_req->osd_req, which++,
19792bb1e56eSIlya Dryomov 						CEPH_OSD_OP_CREATE, 0);
19803da691bfSIlya Dryomov 			opcode = CEPH_OSD_OP_TRUNCATE;
19813da691bfSIlya Dryomov 		} else {
19823da691bfSIlya Dryomov 			osd_req_op_init(obj_req->osd_req, which++,
19833da691bfSIlya Dryomov 					CEPH_OSD_OP_DELETE, 0);
19843da691bfSIlya Dryomov 			opcode = 0;
19853da691bfSIlya Dryomov 		}
19863da691bfSIlya Dryomov 	} else {
19876484cbe9SIlya Dryomov 		opcode = truncate_or_zero_opcode(obj_req);
19883da691bfSIlya Dryomov 	}
19893da691bfSIlya Dryomov 
19903da691bfSIlya Dryomov 	if (opcode)
19913da691bfSIlya Dryomov 		osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
199243df3d35SIlya Dryomov 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
19933da691bfSIlya Dryomov 				       0, 0);
19943da691bfSIlya Dryomov 
19953da691bfSIlya Dryomov 	rbd_assert(which == obj_req->osd_req->r_num_ops);
19963da691bfSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
19973da691bfSIlya Dryomov }
19983da691bfSIlya Dryomov 
19996484cbe9SIlya Dryomov static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
20003da691bfSIlya Dryomov {
20013da691bfSIlya Dryomov 	unsigned int num_osd_ops, which = 0;
200213488d53SIlya Dryomov 	bool need_guard;
20033da691bfSIlya Dryomov 	int ret;
20043da691bfSIlya Dryomov 
200586bd7998SIlya Dryomov 	/* reverse map the entire object onto the parent */
200686bd7998SIlya Dryomov 	ret = rbd_obj_calc_img_extents(obj_req, true);
200786bd7998SIlya Dryomov 	if (ret)
200886bd7998SIlya Dryomov 		return ret;
200986bd7998SIlya Dryomov 
201013488d53SIlya Dryomov 	need_guard = rbd_obj_copyup_enabled(obj_req);
201113488d53SIlya Dryomov 	num_osd_ops = need_guard + count_zeroout_ops(obj_req);
20123da691bfSIlya Dryomov 
2013a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
20143da691bfSIlya Dryomov 	if (!obj_req->osd_req)
20153da691bfSIlya Dryomov 		return -ENOMEM;
20163da691bfSIlya Dryomov 
201713488d53SIlya Dryomov 	if (need_guard) {
20183da691bfSIlya Dryomov 		ret = __rbd_obj_setup_stat(obj_req, which++);
20193da691bfSIlya Dryomov 		if (ret)
20203da691bfSIlya Dryomov 			return ret;
202113488d53SIlya Dryomov 
202213488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
202313488d53SIlya Dryomov 	} else {
202413488d53SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
20253da691bfSIlya Dryomov 	}
20263da691bfSIlya Dryomov 
20276484cbe9SIlya Dryomov 	__rbd_obj_setup_zeroout(obj_req, which);
2028980917fcSIlya Dryomov 	return 0;
2029b454e36dSAlex Elder }
2030b454e36dSAlex Elder 
2031b454e36dSAlex Elder /*
20323da691bfSIlya Dryomov  * For each object request in @img_req, allocate an OSD request, add
20333da691bfSIlya Dryomov  * individual OSD ops and prepare them for submission.  The number of
20343da691bfSIlya Dryomov  * OSD ops depends on op_type and the overlap point (if any).
2035b454e36dSAlex Elder  */
20363da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
20373da691bfSIlya Dryomov {
20380c93e1b7SIlya Dryomov 	struct rbd_obj_request *obj_req, *next_obj_req;
20393da691bfSIlya Dryomov 	int ret;
20403d7efd18SAlex Elder 
20410c93e1b7SIlya Dryomov 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
20429bb0248dSIlya Dryomov 		switch (img_req->op_type) {
20433da691bfSIlya Dryomov 		case OBJ_OP_READ:
20443da691bfSIlya Dryomov 			ret = rbd_obj_setup_read(obj_req);
20453da691bfSIlya Dryomov 			break;
20463da691bfSIlya Dryomov 		case OBJ_OP_WRITE:
20473da691bfSIlya Dryomov 			ret = rbd_obj_setup_write(obj_req);
20483da691bfSIlya Dryomov 			break;
20493da691bfSIlya Dryomov 		case OBJ_OP_DISCARD:
20503da691bfSIlya Dryomov 			ret = rbd_obj_setup_discard(obj_req);
20513da691bfSIlya Dryomov 			break;
20526484cbe9SIlya Dryomov 		case OBJ_OP_ZEROOUT:
20536484cbe9SIlya Dryomov 			ret = rbd_obj_setup_zeroout(obj_req);
20546484cbe9SIlya Dryomov 			break;
20553da691bfSIlya Dryomov 		default:
20563da691bfSIlya Dryomov 			rbd_assert(0);
20573da691bfSIlya Dryomov 		}
20580c93e1b7SIlya Dryomov 		if (ret < 0)
20593da691bfSIlya Dryomov 			return ret;
20600c93e1b7SIlya Dryomov 		if (ret > 0) {
20610c93e1b7SIlya Dryomov 			img_req->xferred += obj_req->ex.oe_len;
20620c93e1b7SIlya Dryomov 			img_req->pending_count--;
20630c93e1b7SIlya Dryomov 			rbd_img_obj_request_del(img_req, obj_req);
20640c93e1b7SIlya Dryomov 			continue;
20650c93e1b7SIlya Dryomov 		}
206626f887e0SIlya Dryomov 
206726f887e0SIlya Dryomov 		ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
206826f887e0SIlya Dryomov 		if (ret)
206926f887e0SIlya Dryomov 			return ret;
2070b454e36dSAlex Elder 	}
2071b454e36dSAlex Elder 
20723da691bfSIlya Dryomov 	return 0;
20733da691bfSIlya Dryomov }
20743da691bfSIlya Dryomov 
20755a237819SIlya Dryomov union rbd_img_fill_iter {
20765a237819SIlya Dryomov 	struct ceph_bio_iter	bio_iter;
20775a237819SIlya Dryomov 	struct ceph_bvec_iter	bvec_iter;
20785a237819SIlya Dryomov };
20795a237819SIlya Dryomov 
20805a237819SIlya Dryomov struct rbd_img_fill_ctx {
20815a237819SIlya Dryomov 	enum obj_request_type	pos_type;
20825a237819SIlya Dryomov 	union rbd_img_fill_iter	*pos;
20835a237819SIlya Dryomov 	union rbd_img_fill_iter	iter;
20845a237819SIlya Dryomov 	ceph_object_extent_fn_t	set_pos_fn;
2085afb97888SIlya Dryomov 	ceph_object_extent_fn_t	count_fn;
2086afb97888SIlya Dryomov 	ceph_object_extent_fn_t	copy_fn;
20875a237819SIlya Dryomov };
20885a237819SIlya Dryomov 
20895a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
20905a237819SIlya Dryomov {
20915a237819SIlya Dryomov 	struct rbd_img_request *img_req = arg;
20925a237819SIlya Dryomov 	struct rbd_obj_request *obj_req;
20935a237819SIlya Dryomov 
20945a237819SIlya Dryomov 	obj_req = rbd_obj_request_create();
20955a237819SIlya Dryomov 	if (!obj_req)
20965a237819SIlya Dryomov 		return NULL;
20975a237819SIlya Dryomov 
20985a237819SIlya Dryomov 	rbd_img_obj_request_add(img_req, obj_req);
20995a237819SIlya Dryomov 	return &obj_req->ex;
21005a237819SIlya Dryomov }
21015a237819SIlya Dryomov 
21025a237819SIlya Dryomov /*
2103afb97888SIlya Dryomov  * While su != os && sc == 1 is technically not fancy (it's the same
2104afb97888SIlya Dryomov  * layout as su == os && sc == 1), we can't use the nocopy path for it
2105afb97888SIlya Dryomov  * because ->set_pos_fn() should be called only once per object.
2106afb97888SIlya Dryomov  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2107afb97888SIlya Dryomov  * treat su != os && sc == 1 as fancy.
21085a237819SIlya Dryomov  */
2109afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2110afb97888SIlya Dryomov {
2111afb97888SIlya Dryomov 	return l->stripe_unit != l->object_size;
2112afb97888SIlya Dryomov }
2113afb97888SIlya Dryomov 
2114afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
21155a237819SIlya Dryomov 				       struct ceph_file_extent *img_extents,
21165a237819SIlya Dryomov 				       u32 num_img_extents,
21175a237819SIlya Dryomov 				       struct rbd_img_fill_ctx *fctx)
21185a237819SIlya Dryomov {
21195a237819SIlya Dryomov 	u32 i;
21205a237819SIlya Dryomov 	int ret;
21215a237819SIlya Dryomov 
21225a237819SIlya Dryomov 	img_req->data_type = fctx->pos_type;
21235a237819SIlya Dryomov 
21245a237819SIlya Dryomov 	/*
21255a237819SIlya Dryomov 	 * Create object requests and set each object request's starting
21265a237819SIlya Dryomov 	 * position in the provided bio (list) or bio_vec array.
21275a237819SIlya Dryomov 	 */
21285a237819SIlya Dryomov 	fctx->iter = *fctx->pos;
21295a237819SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
21305a237819SIlya Dryomov 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
21315a237819SIlya Dryomov 					   img_extents[i].fe_off,
21325a237819SIlya Dryomov 					   img_extents[i].fe_len,
21335a237819SIlya Dryomov 					   &img_req->object_extents,
21345a237819SIlya Dryomov 					   alloc_object_extent, img_req,
21355a237819SIlya Dryomov 					   fctx->set_pos_fn, &fctx->iter);
21365a237819SIlya Dryomov 		if (ret)
21375a237819SIlya Dryomov 			return ret;
21385a237819SIlya Dryomov 	}
21395a237819SIlya Dryomov 
21405a237819SIlya Dryomov 	return __rbd_img_fill_request(img_req);
21415a237819SIlya Dryomov }
21425a237819SIlya Dryomov 
2143afb97888SIlya Dryomov /*
2144afb97888SIlya Dryomov  * Map a list of image extents to a list of object extents, create the
2145afb97888SIlya Dryomov  * corresponding object requests (normally each to a different object,
2146afb97888SIlya Dryomov  * but not always) and add them to @img_req.  For each object request,
2147afb97888SIlya Dryomov  * set up its data descriptor to point to the corresponding chunk(s) of
2148afb97888SIlya Dryomov  * @fctx->pos data buffer.
2149afb97888SIlya Dryomov  *
2150afb97888SIlya Dryomov  * Because ceph_file_to_extents() will merge adjacent object extents
2151afb97888SIlya Dryomov  * together, each object request's data descriptor may point to multiple
2152afb97888SIlya Dryomov  * different chunks of @fctx->pos data buffer.
2153afb97888SIlya Dryomov  *
2154afb97888SIlya Dryomov  * @fctx->pos data buffer is assumed to be large enough.
2155afb97888SIlya Dryomov  */
2156afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2157afb97888SIlya Dryomov 				struct ceph_file_extent *img_extents,
2158afb97888SIlya Dryomov 				u32 num_img_extents,
2159afb97888SIlya Dryomov 				struct rbd_img_fill_ctx *fctx)
2160afb97888SIlya Dryomov {
2161afb97888SIlya Dryomov 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2162afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req;
2163afb97888SIlya Dryomov 	u32 i;
2164afb97888SIlya Dryomov 	int ret;
2165afb97888SIlya Dryomov 
2166afb97888SIlya Dryomov 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2167afb97888SIlya Dryomov 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2168afb97888SIlya Dryomov 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2169afb97888SIlya Dryomov 						   num_img_extents, fctx);
2170afb97888SIlya Dryomov 
2171afb97888SIlya Dryomov 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2172afb97888SIlya Dryomov 
2173afb97888SIlya Dryomov 	/*
2174afb97888SIlya Dryomov 	 * Create object requests and determine ->bvec_count for each object
2175afb97888SIlya Dryomov 	 * request.  Note that ->bvec_count sum over all object requests may
2176afb97888SIlya Dryomov 	 * be greater than the number of bio_vecs in the provided bio (list)
2177afb97888SIlya Dryomov 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2178afb97888SIlya Dryomov 	 * stripe unit boundaries.
2179afb97888SIlya Dryomov 	 */
2180afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2181afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2182afb97888SIlya Dryomov 		ret = ceph_file_to_extents(&rbd_dev->layout,
2183afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2184afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2185afb97888SIlya Dryomov 					   &img_req->object_extents,
2186afb97888SIlya Dryomov 					   alloc_object_extent, img_req,
2187afb97888SIlya Dryomov 					   fctx->count_fn, &fctx->iter);
2188afb97888SIlya Dryomov 		if (ret)
2189afb97888SIlya Dryomov 			return ret;
2190afb97888SIlya Dryomov 	}
2191afb97888SIlya Dryomov 
2192afb97888SIlya Dryomov 	for_each_obj_request(img_req, obj_req) {
2193afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2194afb97888SIlya Dryomov 					      sizeof(*obj_req->bvec_pos.bvecs),
2195afb97888SIlya Dryomov 					      GFP_NOIO);
2196afb97888SIlya Dryomov 		if (!obj_req->bvec_pos.bvecs)
2197afb97888SIlya Dryomov 			return -ENOMEM;
2198afb97888SIlya Dryomov 	}
2199afb97888SIlya Dryomov 
2200afb97888SIlya Dryomov 	/*
2201afb97888SIlya Dryomov 	 * Fill in each object request's private bio_vec array, splitting and
2202afb97888SIlya Dryomov 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2203afb97888SIlya Dryomov 	 */
2204afb97888SIlya Dryomov 	fctx->iter = *fctx->pos;
2205afb97888SIlya Dryomov 	for (i = 0; i < num_img_extents; i++) {
2206afb97888SIlya Dryomov 		ret = ceph_iterate_extents(&rbd_dev->layout,
2207afb97888SIlya Dryomov 					   img_extents[i].fe_off,
2208afb97888SIlya Dryomov 					   img_extents[i].fe_len,
2209afb97888SIlya Dryomov 					   &img_req->object_extents,
2210afb97888SIlya Dryomov 					   fctx->copy_fn, &fctx->iter);
2211afb97888SIlya Dryomov 		if (ret)
2212afb97888SIlya Dryomov 			return ret;
2213afb97888SIlya Dryomov 	}
2214afb97888SIlya Dryomov 
2215afb97888SIlya Dryomov 	return __rbd_img_fill_request(img_req);
2216afb97888SIlya Dryomov }
2217afb97888SIlya Dryomov 
22185a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
22195a237819SIlya Dryomov 			       u64 off, u64 len)
22205a237819SIlya Dryomov {
22215a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
22225a237819SIlya Dryomov 	union rbd_img_fill_iter dummy;
22235a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
22245a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_NODATA,
22255a237819SIlya Dryomov 		.pos = &dummy,
22265a237819SIlya Dryomov 	};
22275a237819SIlya Dryomov 
22285a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
22295a237819SIlya Dryomov }
22305a237819SIlya Dryomov 
22315a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
22325a237819SIlya Dryomov {
22335a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
22345a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
22355a237819SIlya Dryomov 	struct ceph_bio_iter *it = arg;
22365a237819SIlya Dryomov 
22375a237819SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
22385a237819SIlya Dryomov 	obj_req->bio_pos = *it;
22395a237819SIlya Dryomov 	ceph_bio_iter_advance(it, bytes);
22405a237819SIlya Dryomov }
22415a237819SIlya Dryomov 
2242afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2243afb97888SIlya Dryomov {
2244afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2245afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2246afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2247afb97888SIlya Dryomov 
2248afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2249afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2250afb97888SIlya Dryomov 		obj_req->bvec_count++;
2251afb97888SIlya Dryomov 	}));
2252afb97888SIlya Dryomov 
2253afb97888SIlya Dryomov }
2254afb97888SIlya Dryomov 
2255afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2256afb97888SIlya Dryomov {
2257afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2258afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2259afb97888SIlya Dryomov 	struct ceph_bio_iter *it = arg;
2260afb97888SIlya Dryomov 
2261afb97888SIlya Dryomov 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2262afb97888SIlya Dryomov 	ceph_bio_iter_advance_step(it, bytes, ({
2263afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2264afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2265afb97888SIlya Dryomov 	}));
2266afb97888SIlya Dryomov }
2267afb97888SIlya Dryomov 
22685a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
22695a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
22705a237819SIlya Dryomov 				   u32 num_img_extents,
22715a237819SIlya Dryomov 				   struct ceph_bio_iter *bio_pos)
22725a237819SIlya Dryomov {
22735a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
22745a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BIO,
22755a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bio_pos,
22765a237819SIlya Dryomov 		.set_pos_fn = set_bio_pos,
2277afb97888SIlya Dryomov 		.count_fn = count_bio_bvecs,
2278afb97888SIlya Dryomov 		.copy_fn = copy_bio_bvecs,
22795a237819SIlya Dryomov 	};
22805a237819SIlya Dryomov 
22815a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
22825a237819SIlya Dryomov 				    &fctx);
22835a237819SIlya Dryomov }
22845a237819SIlya Dryomov 
22855a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
22865a237819SIlya Dryomov 				 u64 off, u64 len, struct bio *bio)
22875a237819SIlya Dryomov {
22885a237819SIlya Dryomov 	struct ceph_file_extent ex = { off, len };
22895a237819SIlya Dryomov 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
22905a237819SIlya Dryomov 
22915a237819SIlya Dryomov 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
22925a237819SIlya Dryomov }
22935a237819SIlya Dryomov 
22945a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
22955a237819SIlya Dryomov {
22965a237819SIlya Dryomov 	struct rbd_obj_request *obj_req =
22975a237819SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
22985a237819SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
22995a237819SIlya Dryomov 
23005a237819SIlya Dryomov 	obj_req->bvec_pos = *it;
23015a237819SIlya Dryomov 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
23025a237819SIlya Dryomov 	ceph_bvec_iter_advance(it, bytes);
23035a237819SIlya Dryomov }
23045a237819SIlya Dryomov 
2305afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2306afb97888SIlya Dryomov {
2307afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2308afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2309afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2310afb97888SIlya Dryomov 
2311afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2312afb97888SIlya Dryomov 		obj_req->bvec_count++;
2313afb97888SIlya Dryomov 	}));
2314afb97888SIlya Dryomov }
2315afb97888SIlya Dryomov 
2316afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2317afb97888SIlya Dryomov {
2318afb97888SIlya Dryomov 	struct rbd_obj_request *obj_req =
2319afb97888SIlya Dryomov 	    container_of(ex, struct rbd_obj_request, ex);
2320afb97888SIlya Dryomov 	struct ceph_bvec_iter *it = arg;
2321afb97888SIlya Dryomov 
2322afb97888SIlya Dryomov 	ceph_bvec_iter_advance_step(it, bytes, ({
2323afb97888SIlya Dryomov 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2324afb97888SIlya Dryomov 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2325afb97888SIlya Dryomov 	}));
2326afb97888SIlya Dryomov }
2327afb97888SIlya Dryomov 
23285a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
23295a237819SIlya Dryomov 				     struct ceph_file_extent *img_extents,
23305a237819SIlya Dryomov 				     u32 num_img_extents,
23315a237819SIlya Dryomov 				     struct ceph_bvec_iter *bvec_pos)
23325a237819SIlya Dryomov {
23335a237819SIlya Dryomov 	struct rbd_img_fill_ctx fctx = {
23345a237819SIlya Dryomov 		.pos_type = OBJ_REQUEST_BVECS,
23355a237819SIlya Dryomov 		.pos = (union rbd_img_fill_iter *)bvec_pos,
23365a237819SIlya Dryomov 		.set_pos_fn = set_bvec_pos,
2337afb97888SIlya Dryomov 		.count_fn = count_bvecs,
2338afb97888SIlya Dryomov 		.copy_fn = copy_bvecs,
23395a237819SIlya Dryomov 	};
23405a237819SIlya Dryomov 
23415a237819SIlya Dryomov 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
23425a237819SIlya Dryomov 				    &fctx);
23435a237819SIlya Dryomov }
23445a237819SIlya Dryomov 
23455a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
23465a237819SIlya Dryomov 				   struct ceph_file_extent *img_extents,
23475a237819SIlya Dryomov 				   u32 num_img_extents,
23485a237819SIlya Dryomov 				   struct bio_vec *bvecs)
23495a237819SIlya Dryomov {
23505a237819SIlya Dryomov 	struct ceph_bvec_iter it = {
23515a237819SIlya Dryomov 		.bvecs = bvecs,
23525a237819SIlya Dryomov 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
23535a237819SIlya Dryomov 							     num_img_extents) },
23545a237819SIlya Dryomov 	};
23555a237819SIlya Dryomov 
23565a237819SIlya Dryomov 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
23575a237819SIlya Dryomov 					 &it);
23585a237819SIlya Dryomov }
23595a237819SIlya Dryomov 
2360efbd1a11SIlya Dryomov static void rbd_img_request_submit(struct rbd_img_request *img_request)
2361bf0d5f50SAlex Elder {
2362bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2363bf0d5f50SAlex Elder 
236437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
2365bf0d5f50SAlex Elder 
2366663ae2ccSIlya Dryomov 	rbd_img_request_get(img_request);
2367efbd1a11SIlya Dryomov 	for_each_obj_request(img_request, obj_request)
23683da691bfSIlya Dryomov 		rbd_obj_request_submit(obj_request);
2369bf0d5f50SAlex Elder 
2370663ae2ccSIlya Dryomov 	rbd_img_request_put(img_request);
2371bf0d5f50SAlex Elder }
2372bf0d5f50SAlex Elder 
237386bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
23743da691bfSIlya Dryomov {
23753da691bfSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
23763da691bfSIlya Dryomov 	struct rbd_img_request *child_img_req;
23773da691bfSIlya Dryomov 	int ret;
23783da691bfSIlya Dryomov 
2379e93aca0aSIlya Dryomov 	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2380e93aca0aSIlya Dryomov 					       OBJ_OP_READ, NULL);
23813da691bfSIlya Dryomov 	if (!child_img_req)
23823da691bfSIlya Dryomov 		return -ENOMEM;
23833da691bfSIlya Dryomov 
2384e93aca0aSIlya Dryomov 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2385e93aca0aSIlya Dryomov 	child_img_req->obj_request = obj_req;
2386e93aca0aSIlya Dryomov 
23873da691bfSIlya Dryomov 	if (!rbd_img_is_write(img_req)) {
2388ecc633caSIlya Dryomov 		switch (img_req->data_type) {
23893da691bfSIlya Dryomov 		case OBJ_REQUEST_BIO:
23905a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bio(child_img_req,
23915a237819SIlya Dryomov 						      obj_req->img_extents,
23925a237819SIlya Dryomov 						      obj_req->num_img_extents,
23933da691bfSIlya Dryomov 						      &obj_req->bio_pos);
23943da691bfSIlya Dryomov 			break;
23953da691bfSIlya Dryomov 		case OBJ_REQUEST_BVECS:
2396afb97888SIlya Dryomov 		case OBJ_REQUEST_OWN_BVECS:
23975a237819SIlya Dryomov 			ret = __rbd_img_fill_from_bvecs(child_img_req,
23985a237819SIlya Dryomov 						      obj_req->img_extents,
23995a237819SIlya Dryomov 						      obj_req->num_img_extents,
24003da691bfSIlya Dryomov 						      &obj_req->bvec_pos);
24013da691bfSIlya Dryomov 			break;
24023da691bfSIlya Dryomov 		default:
24033da691bfSIlya Dryomov 			rbd_assert(0);
24043da691bfSIlya Dryomov 		}
24053da691bfSIlya Dryomov 	} else {
24065a237819SIlya Dryomov 		ret = rbd_img_fill_from_bvecs(child_img_req,
24075a237819SIlya Dryomov 					      obj_req->img_extents,
24085a237819SIlya Dryomov 					      obj_req->num_img_extents,
24095a237819SIlya Dryomov 					      obj_req->copyup_bvecs);
24103da691bfSIlya Dryomov 	}
24113da691bfSIlya Dryomov 	if (ret) {
24123da691bfSIlya Dryomov 		rbd_img_request_put(child_img_req);
2413663ae2ccSIlya Dryomov 		return ret;
2414bf0d5f50SAlex Elder 	}
2415bf0d5f50SAlex Elder 
24163da691bfSIlya Dryomov 	rbd_img_request_submit(child_img_req);
24173da691bfSIlya Dryomov 	return 0;
24183da691bfSIlya Dryomov }
24193da691bfSIlya Dryomov 
24203da691bfSIlya Dryomov static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
24218b3e1a56SAlex Elder {
24223da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
24233da691bfSIlya Dryomov 	int ret;
24248b3e1a56SAlex Elder 
24253da691bfSIlya Dryomov 	if (obj_req->result == -ENOENT &&
242686bd7998SIlya Dryomov 	    rbd_dev->parent_overlap && !obj_req->tried_parent) {
242786bd7998SIlya Dryomov 		/* reverse map this object extent onto the parent */
242886bd7998SIlya Dryomov 		ret = rbd_obj_calc_img_extents(obj_req, false);
242986bd7998SIlya Dryomov 		if (ret) {
243086bd7998SIlya Dryomov 			obj_req->result = ret;
243186bd7998SIlya Dryomov 			return true;
243286bd7998SIlya Dryomov 		}
24338b3e1a56SAlex Elder 
243486bd7998SIlya Dryomov 		if (obj_req->num_img_extents) {
24353da691bfSIlya Dryomov 			obj_req->tried_parent = true;
243686bd7998SIlya Dryomov 			ret = rbd_obj_read_from_parent(obj_req);
24373da691bfSIlya Dryomov 			if (ret) {
24383da691bfSIlya Dryomov 				obj_req->result = ret;
24393da691bfSIlya Dryomov 				return true;
24403da691bfSIlya Dryomov 			}
24413da691bfSIlya Dryomov 			return false;
24423da691bfSIlya Dryomov 		}
244386bd7998SIlya Dryomov 	}
244402c74fbaSAlex Elder 
244502c74fbaSAlex Elder 	/*
24463da691bfSIlya Dryomov 	 * -ENOENT means a hole in the image -- zero-fill the entire
24473da691bfSIlya Dryomov 	 * length of the request.  A short read also implies zero-fill
24483da691bfSIlya Dryomov 	 * to the end of the request.  In both cases we update xferred
24493da691bfSIlya Dryomov 	 * count to indicate the whole request was satisfied.
245002c74fbaSAlex Elder 	 */
24513da691bfSIlya Dryomov 	if (obj_req->result == -ENOENT ||
245243df3d35SIlya Dryomov 	    (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
24533da691bfSIlya Dryomov 		rbd_assert(!obj_req->xferred || !obj_req->result);
24543da691bfSIlya Dryomov 		rbd_obj_zero_range(obj_req, obj_req->xferred,
245543df3d35SIlya Dryomov 				   obj_req->ex.oe_len - obj_req->xferred);
24563da691bfSIlya Dryomov 		obj_req->result = 0;
245743df3d35SIlya Dryomov 		obj_req->xferred = obj_req->ex.oe_len;
24583da691bfSIlya Dryomov 	}
24593da691bfSIlya Dryomov 
24603da691bfSIlya Dryomov 	return true;
24613da691bfSIlya Dryomov }
24623da691bfSIlya Dryomov 
24633da691bfSIlya Dryomov /*
24643da691bfSIlya Dryomov  * copyup_bvecs pages are never highmem pages
24653da691bfSIlya Dryomov  */
24663da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
24673da691bfSIlya Dryomov {
24683da691bfSIlya Dryomov 	struct ceph_bvec_iter it = {
24693da691bfSIlya Dryomov 		.bvecs = bvecs,
24703da691bfSIlya Dryomov 		.iter = { .bi_size = bytes },
24713da691bfSIlya Dryomov 	};
24723da691bfSIlya Dryomov 
24733da691bfSIlya Dryomov 	ceph_bvec_iter_advance_step(&it, bytes, ({
24743da691bfSIlya Dryomov 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
24753da691bfSIlya Dryomov 			       bv.bv_len))
24763da691bfSIlya Dryomov 			return false;
24773da691bfSIlya Dryomov 	}));
24783da691bfSIlya Dryomov 	return true;
24793da691bfSIlya Dryomov }
24803da691bfSIlya Dryomov 
/*
 * Special "bytes" value for the copyup helpers: send only the
 * modification ops, without an "rbd" "copyup" class call.
 */
#define MODS_ONLY	U32_MAX
24823a482501SIlya Dryomov 
248389a59c1cSIlya Dryomov static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
248489a59c1cSIlya Dryomov 					    u32 bytes)
248589a59c1cSIlya Dryomov {
248689a59c1cSIlya Dryomov 	int ret;
248789a59c1cSIlya Dryomov 
248889a59c1cSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
248989a59c1cSIlya Dryomov 	rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
249089a59c1cSIlya Dryomov 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
249189a59c1cSIlya Dryomov 	rbd_osd_req_destroy(obj_req->osd_req);
249289a59c1cSIlya Dryomov 
249389a59c1cSIlya Dryomov 	obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1);
249489a59c1cSIlya Dryomov 	if (!obj_req->osd_req)
249589a59c1cSIlya Dryomov 		return -ENOMEM;
249689a59c1cSIlya Dryomov 
249789a59c1cSIlya Dryomov 	ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
249889a59c1cSIlya Dryomov 	if (ret)
249989a59c1cSIlya Dryomov 		return ret;
250089a59c1cSIlya Dryomov 
250189a59c1cSIlya Dryomov 	osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
250289a59c1cSIlya Dryomov 					  obj_req->copyup_bvecs,
250389a59c1cSIlya Dryomov 					  obj_req->copyup_bvec_count,
250489a59c1cSIlya Dryomov 					  bytes);
250589a59c1cSIlya Dryomov 	rbd_osd_req_format_write(obj_req);
250689a59c1cSIlya Dryomov 
250789a59c1cSIlya Dryomov 	ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
250889a59c1cSIlya Dryomov 	if (ret)
250989a59c1cSIlya Dryomov 		return ret;
251089a59c1cSIlya Dryomov 
251189a59c1cSIlya Dryomov 	rbd_obj_request_submit(obj_req);
251289a59c1cSIlya Dryomov 	return 0;
251389a59c1cSIlya Dryomov }
251489a59c1cSIlya Dryomov 
25153a482501SIlya Dryomov static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
25163da691bfSIlya Dryomov {
251713488d53SIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
25183a482501SIlya Dryomov 	unsigned int num_osd_ops = (bytes != MODS_ONLY);
25193a482501SIlya Dryomov 	unsigned int which = 0;
2520fe943d50SChengguang Xu 	int ret;
25213da691bfSIlya Dryomov 
25223da691bfSIlya Dryomov 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
252389a59c1cSIlya Dryomov 	rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT ||
252489a59c1cSIlya Dryomov 		   obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL);
25253da691bfSIlya Dryomov 	rbd_osd_req_destroy(obj_req->osd_req);
25263da691bfSIlya Dryomov 
252713488d53SIlya Dryomov 	switch (img_req->op_type) {
252813488d53SIlya Dryomov 	case OBJ_OP_WRITE:
252913488d53SIlya Dryomov 		num_osd_ops += count_write_ops(obj_req);
253013488d53SIlya Dryomov 		break;
253113488d53SIlya Dryomov 	case OBJ_OP_ZEROOUT:
253213488d53SIlya Dryomov 		num_osd_ops += count_zeroout_ops(obj_req);
253313488d53SIlya Dryomov 		break;
253413488d53SIlya Dryomov 	default:
253513488d53SIlya Dryomov 		rbd_assert(0);
253613488d53SIlya Dryomov 	}
253713488d53SIlya Dryomov 
2538a162b308SIlya Dryomov 	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
25393da691bfSIlya Dryomov 	if (!obj_req->osd_req)
25403da691bfSIlya Dryomov 		return -ENOMEM;
25413da691bfSIlya Dryomov 
25423a482501SIlya Dryomov 	if (bytes != MODS_ONLY) {
25433a482501SIlya Dryomov 		ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd",
25443a482501SIlya Dryomov 					  "copyup");
2545fe943d50SChengguang Xu 		if (ret)
2546fe943d50SChengguang Xu 			return ret;
2547fe943d50SChengguang Xu 
25483a482501SIlya Dryomov 		osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++,
25490010f705SIlya Dryomov 						  obj_req->copyup_bvecs,
25500010f705SIlya Dryomov 						  obj_req->copyup_bvec_count,
25510010f705SIlya Dryomov 						  bytes);
25523a482501SIlya Dryomov 	}
25533da691bfSIlya Dryomov 
255413488d53SIlya Dryomov 	switch (img_req->op_type) {
25553da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
25563a482501SIlya Dryomov 		__rbd_obj_setup_write(obj_req, which);
25573da691bfSIlya Dryomov 		break;
25586484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
25593a482501SIlya Dryomov 		__rbd_obj_setup_zeroout(obj_req, which);
25603da691bfSIlya Dryomov 		break;
25613da691bfSIlya Dryomov 	default:
25623da691bfSIlya Dryomov 		rbd_assert(0);
25633da691bfSIlya Dryomov 	}
25643da691bfSIlya Dryomov 
256526f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
256626f887e0SIlya Dryomov 	if (ret)
256726f887e0SIlya Dryomov 		return ret;
256826f887e0SIlya Dryomov 
25693da691bfSIlya Dryomov 	rbd_obj_request_submit(obj_req);
25703da691bfSIlya Dryomov 	return 0;
25713da691bfSIlya Dryomov }
25723da691bfSIlya Dryomov 
25733a482501SIlya Dryomov static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
25743a482501SIlya Dryomov {
25753a482501SIlya Dryomov 	/*
25763a482501SIlya Dryomov 	 * Only send non-zero copyup data to save some I/O and network
25773a482501SIlya Dryomov 	 * bandwidth -- zero copyup data is equivalent to the object not
25783a482501SIlya Dryomov 	 * existing.
25793a482501SIlya Dryomov 	 */
25803a482501SIlya Dryomov 	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
25813a482501SIlya Dryomov 		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
25823a482501SIlya Dryomov 		bytes = 0;
25833a482501SIlya Dryomov 	}
25843a482501SIlya Dryomov 
258589a59c1cSIlya Dryomov 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
258689a59c1cSIlya Dryomov 		/*
258789a59c1cSIlya Dryomov 		 * Send a copyup request with an empty snapshot context to
258889a59c1cSIlya Dryomov 		 * deep-copyup the object through all existing snapshots.
258989a59c1cSIlya Dryomov 		 * A second request with the current snapshot context will be
259089a59c1cSIlya Dryomov 		 * sent for the actual modification.
259189a59c1cSIlya Dryomov 		 */
259289a59c1cSIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
259389a59c1cSIlya Dryomov 		return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
259489a59c1cSIlya Dryomov 	}
259589a59c1cSIlya Dryomov 
25963a482501SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
25973a482501SIlya Dryomov 	return rbd_obj_issue_copyup_ops(obj_req, bytes);
25983a482501SIlya Dryomov }
25993a482501SIlya Dryomov 
26007e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
26017e07efb1SIlya Dryomov {
26027e07efb1SIlya Dryomov 	u32 i;
26037e07efb1SIlya Dryomov 
26047e07efb1SIlya Dryomov 	rbd_assert(!obj_req->copyup_bvecs);
26057e07efb1SIlya Dryomov 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
26067e07efb1SIlya Dryomov 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
26077e07efb1SIlya Dryomov 					sizeof(*obj_req->copyup_bvecs),
26087e07efb1SIlya Dryomov 					GFP_NOIO);
26097e07efb1SIlya Dryomov 	if (!obj_req->copyup_bvecs)
26107e07efb1SIlya Dryomov 		return -ENOMEM;
26117e07efb1SIlya Dryomov 
26127e07efb1SIlya Dryomov 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
26137e07efb1SIlya Dryomov 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
26147e07efb1SIlya Dryomov 
26157e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
26167e07efb1SIlya Dryomov 		if (!obj_req->copyup_bvecs[i].bv_page)
26177e07efb1SIlya Dryomov 			return -ENOMEM;
26187e07efb1SIlya Dryomov 
26197e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_offset = 0;
26207e07efb1SIlya Dryomov 		obj_req->copyup_bvecs[i].bv_len = len;
26217e07efb1SIlya Dryomov 		obj_overlap -= len;
26227e07efb1SIlya Dryomov 	}
26237e07efb1SIlya Dryomov 
26247e07efb1SIlya Dryomov 	rbd_assert(!obj_overlap);
26257e07efb1SIlya Dryomov 	return 0;
26267e07efb1SIlya Dryomov }
26277e07efb1SIlya Dryomov 
26283da691bfSIlya Dryomov static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
26293da691bfSIlya Dryomov {
26303da691bfSIlya Dryomov 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
26313da691bfSIlya Dryomov 	int ret;
26323da691bfSIlya Dryomov 
263386bd7998SIlya Dryomov 	rbd_assert(obj_req->num_img_extents);
263486bd7998SIlya Dryomov 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
263586bd7998SIlya Dryomov 		      rbd_dev->parent_overlap);
263686bd7998SIlya Dryomov 	if (!obj_req->num_img_extents) {
26373da691bfSIlya Dryomov 		/*
26383da691bfSIlya Dryomov 		 * The overlap has become 0 (most likely because the
26393a482501SIlya Dryomov 		 * image has been flattened).  Re-submit the original write
26403a482501SIlya Dryomov 		 * request -- pass MODS_ONLY since the copyup isn't needed
26413a482501SIlya Dryomov 		 * anymore.
26423da691bfSIlya Dryomov 		 */
26433a482501SIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
26443a482501SIlya Dryomov 		return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
26453da691bfSIlya Dryomov 	}
26463da691bfSIlya Dryomov 
264786bd7998SIlya Dryomov 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
26483da691bfSIlya Dryomov 	if (ret)
26493da691bfSIlya Dryomov 		return ret;
26503da691bfSIlya Dryomov 
26513a482501SIlya Dryomov 	obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
265286bd7998SIlya Dryomov 	return rbd_obj_read_from_parent(obj_req);
26533da691bfSIlya Dryomov }
26543da691bfSIlya Dryomov 
26553da691bfSIlya Dryomov static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
26563da691bfSIlya Dryomov {
26573da691bfSIlya Dryomov 	int ret;
26583da691bfSIlya Dryomov 
26593da691bfSIlya Dryomov 	switch (obj_req->write_state) {
26603da691bfSIlya Dryomov 	case RBD_OBJ_WRITE_GUARD:
26613da691bfSIlya Dryomov 		rbd_assert(!obj_req->xferred);
26623da691bfSIlya Dryomov 		if (obj_req->result == -ENOENT) {
26633da691bfSIlya Dryomov 			/*
26643da691bfSIlya Dryomov 			 * The target object doesn't exist.  Read the data for
26653da691bfSIlya Dryomov 			 * the entire target object up to the overlap point (if
26663da691bfSIlya Dryomov 			 * any) from the parent, so we can use it for a copyup.
26673da691bfSIlya Dryomov 			 */
26683da691bfSIlya Dryomov 			ret = rbd_obj_handle_write_guard(obj_req);
26693da691bfSIlya Dryomov 			if (ret) {
26703da691bfSIlya Dryomov 				obj_req->result = ret;
26713da691bfSIlya Dryomov 				return true;
26723da691bfSIlya Dryomov 			}
26733da691bfSIlya Dryomov 			return false;
26743da691bfSIlya Dryomov 		}
26753da691bfSIlya Dryomov 		/* fall through */
26763da691bfSIlya Dryomov 	case RBD_OBJ_WRITE_FLAT:
26773a482501SIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP_OPS:
26783da691bfSIlya Dryomov 		if (!obj_req->result)
26793da691bfSIlya Dryomov 			/*
26803da691bfSIlya Dryomov 			 * There is no such thing as a successful short
26813da691bfSIlya Dryomov 			 * write -- indicate the whole request was satisfied.
26823da691bfSIlya Dryomov 			 */
268343df3d35SIlya Dryomov 			obj_req->xferred = obj_req->ex.oe_len;
26843da691bfSIlya Dryomov 		return true;
26853a482501SIlya Dryomov 	case RBD_OBJ_WRITE_READ_FROM_PARENT:
26863da691bfSIlya Dryomov 		if (obj_req->result)
26873a482501SIlya Dryomov 			return true;
26883da691bfSIlya Dryomov 
26893da691bfSIlya Dryomov 		rbd_assert(obj_req->xferred);
26903da691bfSIlya Dryomov 		ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
26913da691bfSIlya Dryomov 		if (ret) {
26923da691bfSIlya Dryomov 			obj_req->result = ret;
2693356889c4SIlya Dryomov 			obj_req->xferred = 0;
26943da691bfSIlya Dryomov 			return true;
26953da691bfSIlya Dryomov 		}
26963da691bfSIlya Dryomov 		return false;
269789a59c1cSIlya Dryomov 	case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
269889a59c1cSIlya Dryomov 		if (obj_req->result)
269989a59c1cSIlya Dryomov 			return true;
270089a59c1cSIlya Dryomov 
270189a59c1cSIlya Dryomov 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
270289a59c1cSIlya Dryomov 		ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
270389a59c1cSIlya Dryomov 		if (ret) {
270489a59c1cSIlya Dryomov 			obj_req->result = ret;
270589a59c1cSIlya Dryomov 			return true;
270689a59c1cSIlya Dryomov 		}
270789a59c1cSIlya Dryomov 		return false;
27083da691bfSIlya Dryomov 	default:
2709c6244b3bSArnd Bergmann 		BUG();
27103da691bfSIlya Dryomov 	}
27113da691bfSIlya Dryomov }
27123da691bfSIlya Dryomov 
27133da691bfSIlya Dryomov /*
27143da691bfSIlya Dryomov  * Returns true if @obj_req is completed, or false otherwise.
27153da691bfSIlya Dryomov  */
27163da691bfSIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
27173da691bfSIlya Dryomov {
27189bb0248dSIlya Dryomov 	switch (obj_req->img_request->op_type) {
27193da691bfSIlya Dryomov 	case OBJ_OP_READ:
27203da691bfSIlya Dryomov 		return rbd_obj_handle_read(obj_req);
27213da691bfSIlya Dryomov 	case OBJ_OP_WRITE:
27223da691bfSIlya Dryomov 		return rbd_obj_handle_write(obj_req);
27233da691bfSIlya Dryomov 	case OBJ_OP_DISCARD:
27246484cbe9SIlya Dryomov 	case OBJ_OP_ZEROOUT:
27253da691bfSIlya Dryomov 		if (rbd_obj_handle_write(obj_req)) {
27263da691bfSIlya Dryomov 			/*
27273da691bfSIlya Dryomov 			 * Hide -ENOENT from delete/truncate/zero -- discarding
27283da691bfSIlya Dryomov 			 * a non-existent object is not a problem.
27293da691bfSIlya Dryomov 			 */
27303da691bfSIlya Dryomov 			if (obj_req->result == -ENOENT) {
27313da691bfSIlya Dryomov 				obj_req->result = 0;
273243df3d35SIlya Dryomov 				obj_req->xferred = obj_req->ex.oe_len;
27333da691bfSIlya Dryomov 			}
27343da691bfSIlya Dryomov 			return true;
27353da691bfSIlya Dryomov 		}
27363da691bfSIlya Dryomov 		return false;
27373da691bfSIlya Dryomov 	default:
2738c6244b3bSArnd Bergmann 		BUG();
27393da691bfSIlya Dryomov 	}
27403da691bfSIlya Dryomov }
27413da691bfSIlya Dryomov 
27427114edacSIlya Dryomov static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
27437114edacSIlya Dryomov {
27447114edacSIlya Dryomov 	struct rbd_img_request *img_req = obj_req->img_request;
27457114edacSIlya Dryomov 
27467114edacSIlya Dryomov 	rbd_assert((!obj_req->result &&
274743df3d35SIlya Dryomov 		    obj_req->xferred == obj_req->ex.oe_len) ||
27487114edacSIlya Dryomov 		   (obj_req->result < 0 && !obj_req->xferred));
27497114edacSIlya Dryomov 	if (!obj_req->result) {
27507114edacSIlya Dryomov 		img_req->xferred += obj_req->xferred;
275102c74fbaSAlex Elder 		return;
275202c74fbaSAlex Elder 	}
275302c74fbaSAlex Elder 
27547114edacSIlya Dryomov 	rbd_warn(img_req->rbd_dev,
27557114edacSIlya Dryomov 		 "%s at objno %llu %llu~%llu result %d xferred %llu",
275643df3d35SIlya Dryomov 		 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
275743df3d35SIlya Dryomov 		 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
27587114edacSIlya Dryomov 		 obj_req->xferred);
27597114edacSIlya Dryomov 	if (!img_req->result) {
27607114edacSIlya Dryomov 		img_req->result = obj_req->result;
27617114edacSIlya Dryomov 		img_req->xferred = 0;
2762a9e8ba2cSAlex Elder 	}
27638b3e1a56SAlex Elder }
27648b3e1a56SAlex Elder 
27653da691bfSIlya Dryomov static void rbd_img_end_child_request(struct rbd_img_request *img_req)
27668b3e1a56SAlex Elder {
27673da691bfSIlya Dryomov 	struct rbd_obj_request *obj_req = img_req->obj_request;
27688b3e1a56SAlex Elder 
27693da691bfSIlya Dryomov 	rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
277086bd7998SIlya Dryomov 	rbd_assert((!img_req->result &&
277186bd7998SIlya Dryomov 		    img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
277286bd7998SIlya Dryomov 		   (img_req->result < 0 && !img_req->xferred));
27738b3e1a56SAlex Elder 
27743da691bfSIlya Dryomov 	obj_req->result = img_req->result;
27753da691bfSIlya Dryomov 	obj_req->xferred = img_req->xferred;
27763da691bfSIlya Dryomov 	rbd_img_request_put(img_req);
27777114edacSIlya Dryomov }
27788b3e1a56SAlex Elder 
27797114edacSIlya Dryomov static void rbd_img_end_request(struct rbd_img_request *img_req)
27807114edacSIlya Dryomov {
27817114edacSIlya Dryomov 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
27827114edacSIlya Dryomov 	rbd_assert((!img_req->result &&
27837114edacSIlya Dryomov 		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
27847114edacSIlya Dryomov 		   (img_req->result < 0 && !img_req->xferred));
27858b3e1a56SAlex Elder 
27867114edacSIlya Dryomov 	blk_mq_end_request(img_req->rq,
27877114edacSIlya Dryomov 			   errno_to_blk_status(img_req->result));
27887114edacSIlya Dryomov 	rbd_img_request_put(img_req);
27893da691bfSIlya Dryomov }
27908b3e1a56SAlex Elder 
27913da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
27923da691bfSIlya Dryomov {
27937114edacSIlya Dryomov 	struct rbd_img_request *img_req;
27947114edacSIlya Dryomov 
27957114edacSIlya Dryomov again:
27963da691bfSIlya Dryomov 	if (!__rbd_obj_handle_request(obj_req))
27978b3e1a56SAlex Elder 		return;
27983da691bfSIlya Dryomov 
27997114edacSIlya Dryomov 	img_req = obj_req->img_request;
28007114edacSIlya Dryomov 	spin_lock(&img_req->completion_lock);
28017114edacSIlya Dryomov 	rbd_obj_end_request(obj_req);
28027114edacSIlya Dryomov 	rbd_assert(img_req->pending_count);
28037114edacSIlya Dryomov 	if (--img_req->pending_count) {
28047114edacSIlya Dryomov 		spin_unlock(&img_req->completion_lock);
28057114edacSIlya Dryomov 		return;
28067114edacSIlya Dryomov 	}
28077114edacSIlya Dryomov 
28087114edacSIlya Dryomov 	spin_unlock(&img_req->completion_lock);
28097114edacSIlya Dryomov 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
28107114edacSIlya Dryomov 		obj_req = img_req->obj_request;
28117114edacSIlya Dryomov 		rbd_img_end_child_request(img_req);
28127114edacSIlya Dryomov 		goto again;
28137114edacSIlya Dryomov 	}
28147114edacSIlya Dryomov 	rbd_img_end_request(img_req);
28158b3e1a56SAlex Elder }
28168b3e1a56SAlex Elder 
2817ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
2818ed95b21aSIlya Dryomov 
2819ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2820ed95b21aSIlya Dryomov 			  const struct rbd_client_id *rhs)
2821ed95b21aSIlya Dryomov {
2822ed95b21aSIlya Dryomov 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2823ed95b21aSIlya Dryomov }
2824ed95b21aSIlya Dryomov 
2825ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2826ed95b21aSIlya Dryomov {
2827ed95b21aSIlya Dryomov 	struct rbd_client_id cid;
2828ed95b21aSIlya Dryomov 
2829ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
2830ed95b21aSIlya Dryomov 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2831ed95b21aSIlya Dryomov 	cid.handle = rbd_dev->watch_cookie;
2832ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
2833ed95b21aSIlya Dryomov 	return cid;
2834ed95b21aSIlya Dryomov }
2835ed95b21aSIlya Dryomov 
2836ed95b21aSIlya Dryomov /*
2837ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2838ed95b21aSIlya Dryomov  */
2839ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2840ed95b21aSIlya Dryomov 			      const struct rbd_client_id *cid)
2841ed95b21aSIlya Dryomov {
2842ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2843ed95b21aSIlya Dryomov 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2844ed95b21aSIlya Dryomov 	     cid->gid, cid->handle);
2845ed95b21aSIlya Dryomov 	rbd_dev->owner_cid = *cid; /* struct */
2846ed95b21aSIlya Dryomov }
2847ed95b21aSIlya Dryomov 
2848ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2849ed95b21aSIlya Dryomov {
2850ed95b21aSIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
2851ed95b21aSIlya Dryomov 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2852ed95b21aSIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
2853ed95b21aSIlya Dryomov }
2854ed95b21aSIlya Dryomov 
2855edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2856edd8ca80SFlorian Margaine {
2857edd8ca80SFlorian Margaine 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2858edd8ca80SFlorian Margaine 
2859edd8ca80SFlorian Margaine 	strcpy(rbd_dev->lock_cookie, cookie);
2860edd8ca80SFlorian Margaine 	rbd_set_owner_cid(rbd_dev, &cid);
2861edd8ca80SFlorian Margaine 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2862edd8ca80SFlorian Margaine }
2863edd8ca80SFlorian Margaine 
2864ed95b21aSIlya Dryomov /*
2865ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2866ed95b21aSIlya Dryomov  */
2867ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
2868ed95b21aSIlya Dryomov {
2869ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2870ed95b21aSIlya Dryomov 	char cookie[32];
2871ed95b21aSIlya Dryomov 	int ret;
2872ed95b21aSIlya Dryomov 
2873cbbfb0ffSIlya Dryomov 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2874cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] != '\0');
2875ed95b21aSIlya Dryomov 
2876ed95b21aSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
2877ed95b21aSIlya Dryomov 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2878ed95b21aSIlya Dryomov 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2879ed95b21aSIlya Dryomov 			    RBD_LOCK_TAG, "", 0);
2880ed95b21aSIlya Dryomov 	if (ret)
2881ed95b21aSIlya Dryomov 		return ret;
2882ed95b21aSIlya Dryomov 
2883ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
2884edd8ca80SFlorian Margaine 	__rbd_lock(rbd_dev, cookie);
2885ed95b21aSIlya Dryomov 	return 0;
2886ed95b21aSIlya Dryomov }
2887ed95b21aSIlya Dryomov 
2888ed95b21aSIlya Dryomov /*
2889ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
2890ed95b21aSIlya Dryomov  */
2891bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
2892ed95b21aSIlya Dryomov {
2893ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2894ed95b21aSIlya Dryomov 	int ret;
2895ed95b21aSIlya Dryomov 
2896cbbfb0ffSIlya Dryomov 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2897cbbfb0ffSIlya Dryomov 		rbd_dev->lock_cookie[0] == '\0');
2898ed95b21aSIlya Dryomov 
2899ed95b21aSIlya Dryomov 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2900cbbfb0ffSIlya Dryomov 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
2901bbead745SIlya Dryomov 	if (ret && ret != -ENOENT)
2902bbead745SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unlock: %d", ret);
2903ed95b21aSIlya Dryomov 
2904bbead745SIlya Dryomov 	/* treat errors as the image is unlocked */
2905bbead745SIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
2906cbbfb0ffSIlya Dryomov 	rbd_dev->lock_cookie[0] = '\0';
2907ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2908ed95b21aSIlya Dryomov 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
2909ed95b21aSIlya Dryomov }
2910ed95b21aSIlya Dryomov 
2911ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2912ed95b21aSIlya Dryomov 				enum rbd_notify_op notify_op,
2913ed95b21aSIlya Dryomov 				struct page ***preply_pages,
2914ed95b21aSIlya Dryomov 				size_t *preply_len)
2915ed95b21aSIlya Dryomov {
2916ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2917ed95b21aSIlya Dryomov 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
291808a79102SKyle Spiers 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
291908a79102SKyle Spiers 	int buf_size = sizeof(buf);
2920ed95b21aSIlya Dryomov 	void *p = buf;
2921ed95b21aSIlya Dryomov 
2922ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2923ed95b21aSIlya Dryomov 
2924ed95b21aSIlya Dryomov 	/* encode *LockPayload NotifyMessage (op + ClientId) */
2925ed95b21aSIlya Dryomov 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2926ed95b21aSIlya Dryomov 	ceph_encode_32(&p, notify_op);
2927ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.gid);
2928ed95b21aSIlya Dryomov 	ceph_encode_64(&p, cid.handle);
2929ed95b21aSIlya Dryomov 
2930ed95b21aSIlya Dryomov 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2931ed95b21aSIlya Dryomov 				&rbd_dev->header_oloc, buf, buf_size,
2932ed95b21aSIlya Dryomov 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2933ed95b21aSIlya Dryomov }
2934ed95b21aSIlya Dryomov 
2935ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2936ed95b21aSIlya Dryomov 			       enum rbd_notify_op notify_op)
2937ed95b21aSIlya Dryomov {
2938ed95b21aSIlya Dryomov 	struct page **reply_pages;
2939ed95b21aSIlya Dryomov 	size_t reply_len;
2940ed95b21aSIlya Dryomov 
2941ed95b21aSIlya Dryomov 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2942ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2943ed95b21aSIlya Dryomov }
2944ed95b21aSIlya Dryomov 
2945ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
2946ed95b21aSIlya Dryomov {
2947ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2948ed95b21aSIlya Dryomov 						  acquired_lock_work);
2949ed95b21aSIlya Dryomov 
2950ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2951ed95b21aSIlya Dryomov }
2952ed95b21aSIlya Dryomov 
2953ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
2954ed95b21aSIlya Dryomov {
2955ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2956ed95b21aSIlya Dryomov 						  released_lock_work);
2957ed95b21aSIlya Dryomov 
2958ed95b21aSIlya Dryomov 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2959ed95b21aSIlya Dryomov }
2960ed95b21aSIlya Dryomov 
2961ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
2962ed95b21aSIlya Dryomov {
2963ed95b21aSIlya Dryomov 	struct page **reply_pages;
2964ed95b21aSIlya Dryomov 	size_t reply_len;
2965ed95b21aSIlya Dryomov 	bool lock_owner_responded = false;
2966ed95b21aSIlya Dryomov 	int ret;
2967ed95b21aSIlya Dryomov 
2968ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
2969ed95b21aSIlya Dryomov 
2970ed95b21aSIlya Dryomov 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2971ed95b21aSIlya Dryomov 				   &reply_pages, &reply_len);
2972ed95b21aSIlya Dryomov 	if (ret && ret != -ETIMEDOUT) {
2973ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2974ed95b21aSIlya Dryomov 		goto out;
2975ed95b21aSIlya Dryomov 	}
2976ed95b21aSIlya Dryomov 
2977ed95b21aSIlya Dryomov 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2978ed95b21aSIlya Dryomov 		void *p = page_address(reply_pages[0]);
2979ed95b21aSIlya Dryomov 		void *const end = p + reply_len;
2980ed95b21aSIlya Dryomov 		u32 n;
2981ed95b21aSIlya Dryomov 
2982ed95b21aSIlya Dryomov 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2983ed95b21aSIlya Dryomov 		while (n--) {
2984ed95b21aSIlya Dryomov 			u8 struct_v;
2985ed95b21aSIlya Dryomov 			u32 len;
2986ed95b21aSIlya Dryomov 
2987ed95b21aSIlya Dryomov 			ceph_decode_need(&p, end, 8 + 8, e_inval);
2988ed95b21aSIlya Dryomov 			p += 8 + 8; /* skip gid and cookie */
2989ed95b21aSIlya Dryomov 
2990ed95b21aSIlya Dryomov 			ceph_decode_32_safe(&p, end, len, e_inval);
2991ed95b21aSIlya Dryomov 			if (!len)
2992ed95b21aSIlya Dryomov 				continue;
2993ed95b21aSIlya Dryomov 
2994ed95b21aSIlya Dryomov 			if (lock_owner_responded) {
2995ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
2996ed95b21aSIlya Dryomov 					 "duplicate lock owners detected");
2997ed95b21aSIlya Dryomov 				ret = -EIO;
2998ed95b21aSIlya Dryomov 				goto out;
2999ed95b21aSIlya Dryomov 			}
3000ed95b21aSIlya Dryomov 
3001ed95b21aSIlya Dryomov 			lock_owner_responded = true;
3002ed95b21aSIlya Dryomov 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3003ed95b21aSIlya Dryomov 						  &struct_v, &len);
3004ed95b21aSIlya Dryomov 			if (ret) {
3005ed95b21aSIlya Dryomov 				rbd_warn(rbd_dev,
3006ed95b21aSIlya Dryomov 					 "failed to decode ResponseMessage: %d",
3007ed95b21aSIlya Dryomov 					 ret);
3008ed95b21aSIlya Dryomov 				goto e_inval;
3009ed95b21aSIlya Dryomov 			}
3010ed95b21aSIlya Dryomov 
3011ed95b21aSIlya Dryomov 			ret = ceph_decode_32(&p);
3012ed95b21aSIlya Dryomov 		}
3013ed95b21aSIlya Dryomov 	}
3014ed95b21aSIlya Dryomov 
3015ed95b21aSIlya Dryomov 	if (!lock_owner_responded) {
3016ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "no lock owners detected");
3017ed95b21aSIlya Dryomov 		ret = -ETIMEDOUT;
3018ed95b21aSIlya Dryomov 	}
3019ed95b21aSIlya Dryomov 
3020ed95b21aSIlya Dryomov out:
3021ed95b21aSIlya Dryomov 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3022ed95b21aSIlya Dryomov 	return ret;
3023ed95b21aSIlya Dryomov 
3024ed95b21aSIlya Dryomov e_inval:
3025ed95b21aSIlya Dryomov 	ret = -EINVAL;
3026ed95b21aSIlya Dryomov 	goto out;
3027ed95b21aSIlya Dryomov }
3028ed95b21aSIlya Dryomov 
3029ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3030ed95b21aSIlya Dryomov {
3031ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3032ed95b21aSIlya Dryomov 
3033ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3034ed95b21aSIlya Dryomov 	if (wake_all)
3035ed95b21aSIlya Dryomov 		wake_up_all(&rbd_dev->lock_waitq);
3036ed95b21aSIlya Dryomov 	else
3037ed95b21aSIlya Dryomov 		wake_up(&rbd_dev->lock_waitq);
3038ed95b21aSIlya Dryomov }
3039ed95b21aSIlya Dryomov 
3040ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev,
3041ed95b21aSIlya Dryomov 			       struct ceph_locker **lockers, u32 *num_lockers)
3042ed95b21aSIlya Dryomov {
3043ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3044ed95b21aSIlya Dryomov 	u8 lock_type;
3045ed95b21aSIlya Dryomov 	char *lock_tag;
3046ed95b21aSIlya Dryomov 	int ret;
3047ed95b21aSIlya Dryomov 
3048ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3049ed95b21aSIlya Dryomov 
3050ed95b21aSIlya Dryomov 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3051ed95b21aSIlya Dryomov 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3052ed95b21aSIlya Dryomov 				 &lock_type, &lock_tag, lockers, num_lockers);
3053ed95b21aSIlya Dryomov 	if (ret)
3054ed95b21aSIlya Dryomov 		return ret;
3055ed95b21aSIlya Dryomov 
3056ed95b21aSIlya Dryomov 	if (*num_lockers == 0) {
3057ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3058ed95b21aSIlya Dryomov 		goto out;
3059ed95b21aSIlya Dryomov 	}
3060ed95b21aSIlya Dryomov 
3061ed95b21aSIlya Dryomov 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3062ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3063ed95b21aSIlya Dryomov 			 lock_tag);
3064ed95b21aSIlya Dryomov 		ret = -EBUSY;
3065ed95b21aSIlya Dryomov 		goto out;
3066ed95b21aSIlya Dryomov 	}
3067ed95b21aSIlya Dryomov 
3068ed95b21aSIlya Dryomov 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3069ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "shared lock type detected");
3070ed95b21aSIlya Dryomov 		ret = -EBUSY;
3071ed95b21aSIlya Dryomov 		goto out;
3072ed95b21aSIlya Dryomov 	}
3073ed95b21aSIlya Dryomov 
3074ed95b21aSIlya Dryomov 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3075ed95b21aSIlya Dryomov 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3076ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3077ed95b21aSIlya Dryomov 			 (*lockers)[0].id.cookie);
3078ed95b21aSIlya Dryomov 		ret = -EBUSY;
3079ed95b21aSIlya Dryomov 		goto out;
3080ed95b21aSIlya Dryomov 	}
3081ed95b21aSIlya Dryomov 
3082ed95b21aSIlya Dryomov out:
3083ed95b21aSIlya Dryomov 	kfree(lock_tag);
3084ed95b21aSIlya Dryomov 	return ret;
3085ed95b21aSIlya Dryomov }
3086ed95b21aSIlya Dryomov 
3087ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3088ed95b21aSIlya Dryomov 			const struct ceph_locker *locker)
3089ed95b21aSIlya Dryomov {
3090ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3091ed95b21aSIlya Dryomov 	struct ceph_watch_item *watchers;
3092ed95b21aSIlya Dryomov 	u32 num_watchers;
3093ed95b21aSIlya Dryomov 	u64 cookie;
3094ed95b21aSIlya Dryomov 	int i;
3095ed95b21aSIlya Dryomov 	int ret;
3096ed95b21aSIlya Dryomov 
3097ed95b21aSIlya Dryomov 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3098ed95b21aSIlya Dryomov 				      &rbd_dev->header_oloc, &watchers,
3099ed95b21aSIlya Dryomov 				      &num_watchers);
3100ed95b21aSIlya Dryomov 	if (ret)
3101ed95b21aSIlya Dryomov 		return ret;
3102ed95b21aSIlya Dryomov 
3103ed95b21aSIlya Dryomov 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3104ed95b21aSIlya Dryomov 	for (i = 0; i < num_watchers; i++) {
3105ed95b21aSIlya Dryomov 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3106ed95b21aSIlya Dryomov 			    sizeof(locker->info.addr)) &&
3107ed95b21aSIlya Dryomov 		    watchers[i].cookie == cookie) {
3108ed95b21aSIlya Dryomov 			struct rbd_client_id cid = {
3109ed95b21aSIlya Dryomov 				.gid = le64_to_cpu(watchers[i].name.num),
3110ed95b21aSIlya Dryomov 				.handle = cookie,
3111ed95b21aSIlya Dryomov 			};
3112ed95b21aSIlya Dryomov 
3113ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3114ed95b21aSIlya Dryomov 			     rbd_dev, cid.gid, cid.handle);
3115ed95b21aSIlya Dryomov 			rbd_set_owner_cid(rbd_dev, &cid);
3116ed95b21aSIlya Dryomov 			ret = 1;
3117ed95b21aSIlya Dryomov 			goto out;
3118ed95b21aSIlya Dryomov 		}
3119ed95b21aSIlya Dryomov 	}
3120ed95b21aSIlya Dryomov 
3121ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3122ed95b21aSIlya Dryomov 	ret = 0;
3123ed95b21aSIlya Dryomov out:
3124ed95b21aSIlya Dryomov 	kfree(watchers);
3125ed95b21aSIlya Dryomov 	return ret;
3126ed95b21aSIlya Dryomov }
3127ed95b21aSIlya Dryomov 
3128ed95b21aSIlya Dryomov /*
3129ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3130ed95b21aSIlya Dryomov  */
3131ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3132ed95b21aSIlya Dryomov {
3133ed95b21aSIlya Dryomov 	struct ceph_client *client = rbd_dev->rbd_client->client;
3134ed95b21aSIlya Dryomov 	struct ceph_locker *lockers;
3135ed95b21aSIlya Dryomov 	u32 num_lockers;
3136ed95b21aSIlya Dryomov 	int ret;
3137ed95b21aSIlya Dryomov 
3138ed95b21aSIlya Dryomov 	for (;;) {
3139ed95b21aSIlya Dryomov 		ret = rbd_lock(rbd_dev);
3140ed95b21aSIlya Dryomov 		if (ret != -EBUSY)
3141ed95b21aSIlya Dryomov 			return ret;
3142ed95b21aSIlya Dryomov 
3143ed95b21aSIlya Dryomov 		/* determine if the current lock holder is still alive */
3144ed95b21aSIlya Dryomov 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3145ed95b21aSIlya Dryomov 		if (ret)
3146ed95b21aSIlya Dryomov 			return ret;
3147ed95b21aSIlya Dryomov 
3148ed95b21aSIlya Dryomov 		if (num_lockers == 0)
3149ed95b21aSIlya Dryomov 			goto again;
3150ed95b21aSIlya Dryomov 
3151ed95b21aSIlya Dryomov 		ret = find_watcher(rbd_dev, lockers);
3152ed95b21aSIlya Dryomov 		if (ret) {
3153ed95b21aSIlya Dryomov 			if (ret > 0)
3154ed95b21aSIlya Dryomov 				ret = 0; /* have to request lock */
3155ed95b21aSIlya Dryomov 			goto out;
3156ed95b21aSIlya Dryomov 		}
3157ed95b21aSIlya Dryomov 
3158ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3159ed95b21aSIlya Dryomov 			 ENTITY_NAME(lockers[0].id.name));
3160ed95b21aSIlya Dryomov 
3161ed95b21aSIlya Dryomov 		ret = ceph_monc_blacklist_add(&client->monc,
3162ed95b21aSIlya Dryomov 					      &lockers[0].info.addr);
3163ed95b21aSIlya Dryomov 		if (ret) {
3164ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3165ed95b21aSIlya Dryomov 				 ENTITY_NAME(lockers[0].id.name), ret);
3166ed95b21aSIlya Dryomov 			goto out;
3167ed95b21aSIlya Dryomov 		}
3168ed95b21aSIlya Dryomov 
3169ed95b21aSIlya Dryomov 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3170ed95b21aSIlya Dryomov 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
3171ed95b21aSIlya Dryomov 					  lockers[0].id.cookie,
3172ed95b21aSIlya Dryomov 					  &lockers[0].id.name);
3173ed95b21aSIlya Dryomov 		if (ret && ret != -ENOENT)
3174ed95b21aSIlya Dryomov 			goto out;
3175ed95b21aSIlya Dryomov 
3176ed95b21aSIlya Dryomov again:
3177ed95b21aSIlya Dryomov 		ceph_free_lockers(lockers, num_lockers);
3178ed95b21aSIlya Dryomov 	}
3179ed95b21aSIlya Dryomov 
3180ed95b21aSIlya Dryomov out:
3181ed95b21aSIlya Dryomov 	ceph_free_lockers(lockers, num_lockers);
3182ed95b21aSIlya Dryomov 	return ret;
3183ed95b21aSIlya Dryomov }
3184ed95b21aSIlya Dryomov 
3185ed95b21aSIlya Dryomov /*
3186ed95b21aSIlya Dryomov  * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3187ed95b21aSIlya Dryomov  */
3188ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3189ed95b21aSIlya Dryomov 						int *pret)
3190ed95b21aSIlya Dryomov {
3191ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
3192ed95b21aSIlya Dryomov 
3193ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
3194ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3195ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3196ed95b21aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
3197ed95b21aSIlya Dryomov 		lock_state = rbd_dev->lock_state;
3198ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3199ed95b21aSIlya Dryomov 		return lock_state;
3200ed95b21aSIlya Dryomov 	}
3201ed95b21aSIlya Dryomov 
3202ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3203ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3204ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3205ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3206ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev)) {
3207ed95b21aSIlya Dryomov 		*pret = rbd_try_lock(rbd_dev);
3208ed95b21aSIlya Dryomov 		if (*pret)
3209ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3210ed95b21aSIlya Dryomov 	}
3211ed95b21aSIlya Dryomov 
3212ed95b21aSIlya Dryomov 	lock_state = rbd_dev->lock_state;
3213ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3214ed95b21aSIlya Dryomov 	return lock_state;
3215ed95b21aSIlya Dryomov }
3216ed95b21aSIlya Dryomov 
3217ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
3218ed95b21aSIlya Dryomov {
3219ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3220ed95b21aSIlya Dryomov 					    struct rbd_device, lock_dwork);
3221ed95b21aSIlya Dryomov 	enum rbd_lock_state lock_state;
322237f13252SKefeng Wang 	int ret = 0;
3223ed95b21aSIlya Dryomov 
3224ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3225ed95b21aSIlya Dryomov again:
3226ed95b21aSIlya Dryomov 	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3227ed95b21aSIlya Dryomov 	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3228ed95b21aSIlya Dryomov 		if (lock_state == RBD_LOCK_STATE_LOCKED)
3229ed95b21aSIlya Dryomov 			wake_requests(rbd_dev, true);
3230ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3231ed95b21aSIlya Dryomov 		     rbd_dev, lock_state, ret);
3232ed95b21aSIlya Dryomov 		return;
3233ed95b21aSIlya Dryomov 	}
3234ed95b21aSIlya Dryomov 
3235ed95b21aSIlya Dryomov 	ret = rbd_request_lock(rbd_dev);
3236ed95b21aSIlya Dryomov 	if (ret == -ETIMEDOUT) {
3237ed95b21aSIlya Dryomov 		goto again; /* treat this as a dead client */
3238e010dd0aSIlya Dryomov 	} else if (ret == -EROFS) {
3239e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "peer will not release lock");
3240e010dd0aSIlya Dryomov 		/*
3241e010dd0aSIlya Dryomov 		 * If this is rbd_add_acquire_lock(), we want to fail
3242e010dd0aSIlya Dryomov 		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
3243e010dd0aSIlya Dryomov 		 * want to block.
3244e010dd0aSIlya Dryomov 		 */
3245e010dd0aSIlya Dryomov 		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3246e010dd0aSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3247e010dd0aSIlya Dryomov 			/* wake "rbd map --exclusive" process */
3248e010dd0aSIlya Dryomov 			wake_requests(rbd_dev, false);
3249e010dd0aSIlya Dryomov 		}
3250ed95b21aSIlya Dryomov 	} else if (ret < 0) {
3251ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3252ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3253ed95b21aSIlya Dryomov 				 RBD_RETRY_DELAY);
3254ed95b21aSIlya Dryomov 	} else {
3255ed95b21aSIlya Dryomov 		/*
3256ed95b21aSIlya Dryomov 		 * lock owner acked, but resend if we don't see them
3257ed95b21aSIlya Dryomov 		 * release the lock
3258ed95b21aSIlya Dryomov 		 */
3259ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3260ed95b21aSIlya Dryomov 		     rbd_dev);
3261ed95b21aSIlya Dryomov 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3262ed95b21aSIlya Dryomov 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3263ed95b21aSIlya Dryomov 	}
3264ed95b21aSIlya Dryomov }
3265ed95b21aSIlya Dryomov 
3266ed95b21aSIlya Dryomov /*
3267ed95b21aSIlya Dryomov  * lock_rwsem must be held for write
3268ed95b21aSIlya Dryomov  */
3269ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev)
3270ed95b21aSIlya Dryomov {
3271ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3272ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3273ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3274ed95b21aSIlya Dryomov 		return false;
3275ed95b21aSIlya Dryomov 
3276ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3277ed95b21aSIlya Dryomov 	downgrade_write(&rbd_dev->lock_rwsem);
3278ed95b21aSIlya Dryomov 	/*
3279ed95b21aSIlya Dryomov 	 * Ensure that all in-flight IO is flushed.
3280ed95b21aSIlya Dryomov 	 *
3281ed95b21aSIlya Dryomov 	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3282ed95b21aSIlya Dryomov 	 * may be shared with other devices.
3283ed95b21aSIlya Dryomov 	 */
3284ed95b21aSIlya Dryomov 	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3285ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3286ed95b21aSIlya Dryomov 
3287ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3288ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3289ed95b21aSIlya Dryomov 	     rbd_dev->lock_state);
3290ed95b21aSIlya Dryomov 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3291ed95b21aSIlya Dryomov 		return false;
3292ed95b21aSIlya Dryomov 
3293bbead745SIlya Dryomov 	rbd_unlock(rbd_dev);
3294ed95b21aSIlya Dryomov 	/*
3295ed95b21aSIlya Dryomov 	 * Give others a chance to grab the lock - we would re-acquire
3296ed95b21aSIlya Dryomov 	 * almost immediately if we got new IO during ceph_osdc_sync()
3297ed95b21aSIlya Dryomov 	 * otherwise.  We need to ack our own notifications, so this
3298ed95b21aSIlya Dryomov 	 * lock_dwork will be requeued from rbd_wait_state_locked()
3299ed95b21aSIlya Dryomov 	 * after wake_requests() in rbd_handle_released_lock().
3300ed95b21aSIlya Dryomov 	 */
3301ed95b21aSIlya Dryomov 	cancel_delayed_work(&rbd_dev->lock_dwork);
3302ed95b21aSIlya Dryomov 	return true;
3303ed95b21aSIlya Dryomov }
3304ed95b21aSIlya Dryomov 
3305ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
3306ed95b21aSIlya Dryomov {
3307ed95b21aSIlya Dryomov 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3308ed95b21aSIlya Dryomov 						  unlock_work);
3309ed95b21aSIlya Dryomov 
3310ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3311ed95b21aSIlya Dryomov 	rbd_release_lock(rbd_dev);
3312ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3313ed95b21aSIlya Dryomov }
3314ed95b21aSIlya Dryomov 
3315ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3316ed95b21aSIlya Dryomov 				     void **p)
3317ed95b21aSIlya Dryomov {
3318ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3319ed95b21aSIlya Dryomov 
3320ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3321ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3322ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3323ed95b21aSIlya Dryomov 	}
3324ed95b21aSIlya Dryomov 
3325ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3326ed95b21aSIlya Dryomov 	     cid.handle);
3327ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3328ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3329ed95b21aSIlya Dryomov 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3330ed95b21aSIlya Dryomov 			/*
3331ed95b21aSIlya Dryomov 			 * we already know that the remote client is
3332ed95b21aSIlya Dryomov 			 * the owner
3333ed95b21aSIlya Dryomov 			 */
3334ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3335ed95b21aSIlya Dryomov 			return;
3336ed95b21aSIlya Dryomov 		}
3337ed95b21aSIlya Dryomov 
3338ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &cid);
3339ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3340ed95b21aSIlya Dryomov 	} else {
3341ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3342ed95b21aSIlya Dryomov 	}
3343ed95b21aSIlya Dryomov 
3344ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3345ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3346ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3347ed95b21aSIlya Dryomov }
3348ed95b21aSIlya Dryomov 
3349ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3350ed95b21aSIlya Dryomov 				     void **p)
3351ed95b21aSIlya Dryomov {
3352ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
3353ed95b21aSIlya Dryomov 
3354ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3355ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3356ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3357ed95b21aSIlya Dryomov 	}
3358ed95b21aSIlya Dryomov 
3359ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3360ed95b21aSIlya Dryomov 	     cid.handle);
3361ed95b21aSIlya Dryomov 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3362ed95b21aSIlya Dryomov 		down_write(&rbd_dev->lock_rwsem);
3363ed95b21aSIlya Dryomov 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3364ed95b21aSIlya Dryomov 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3365ed95b21aSIlya Dryomov 			     __func__, rbd_dev, cid.gid, cid.handle,
3366ed95b21aSIlya Dryomov 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3367ed95b21aSIlya Dryomov 			up_write(&rbd_dev->lock_rwsem);
3368ed95b21aSIlya Dryomov 			return;
3369ed95b21aSIlya Dryomov 		}
3370ed95b21aSIlya Dryomov 
3371ed95b21aSIlya Dryomov 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3372ed95b21aSIlya Dryomov 		downgrade_write(&rbd_dev->lock_rwsem);
3373ed95b21aSIlya Dryomov 	} else {
3374ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
3375ed95b21aSIlya Dryomov 	}
3376ed95b21aSIlya Dryomov 
3377ed95b21aSIlya Dryomov 	if (!__rbd_is_lock_owner(rbd_dev))
3378ed95b21aSIlya Dryomov 		wake_requests(rbd_dev, false);
3379ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
3380ed95b21aSIlya Dryomov }
3381ed95b21aSIlya Dryomov 
33823b77faa0SIlya Dryomov /*
33833b77faa0SIlya Dryomov  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
33843b77faa0SIlya Dryomov  * ResponseMessage is needed.
33853b77faa0SIlya Dryomov  */
33863b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3387ed95b21aSIlya Dryomov 				   void **p)
3388ed95b21aSIlya Dryomov {
3389ed95b21aSIlya Dryomov 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3390ed95b21aSIlya Dryomov 	struct rbd_client_id cid = { 0 };
33913b77faa0SIlya Dryomov 	int result = 1;
3392ed95b21aSIlya Dryomov 
3393ed95b21aSIlya Dryomov 	if (struct_v >= 2) {
3394ed95b21aSIlya Dryomov 		cid.gid = ceph_decode_64(p);
3395ed95b21aSIlya Dryomov 		cid.handle = ceph_decode_64(p);
3396ed95b21aSIlya Dryomov 	}
3397ed95b21aSIlya Dryomov 
3398ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3399ed95b21aSIlya Dryomov 	     cid.handle);
3400ed95b21aSIlya Dryomov 	if (rbd_cid_equal(&cid, &my_cid))
34013b77faa0SIlya Dryomov 		return result;
3402ed95b21aSIlya Dryomov 
3403ed95b21aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
34043b77faa0SIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev)) {
34053b77faa0SIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
34063b77faa0SIlya Dryomov 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
34073b77faa0SIlya Dryomov 			goto out_unlock;
34083b77faa0SIlya Dryomov 
34093b77faa0SIlya Dryomov 		/*
34103b77faa0SIlya Dryomov 		 * encode ResponseMessage(0) so the peer can detect
34113b77faa0SIlya Dryomov 		 * a missing owner
34123b77faa0SIlya Dryomov 		 */
34133b77faa0SIlya Dryomov 		result = 0;
34143b77faa0SIlya Dryomov 
3415ed95b21aSIlya Dryomov 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3416e010dd0aSIlya Dryomov 			if (!rbd_dev->opts->exclusive) {
3417e010dd0aSIlya Dryomov 				dout("%s rbd_dev %p queueing unlock_work\n",
3418e010dd0aSIlya Dryomov 				     __func__, rbd_dev);
3419e010dd0aSIlya Dryomov 				queue_work(rbd_dev->task_wq,
3420e010dd0aSIlya Dryomov 					   &rbd_dev->unlock_work);
3421e010dd0aSIlya Dryomov 			} else {
3422e010dd0aSIlya Dryomov 				/* refuse to release the lock */
3423e010dd0aSIlya Dryomov 				result = -EROFS;
3424ed95b21aSIlya Dryomov 			}
3425ed95b21aSIlya Dryomov 		}
3426ed95b21aSIlya Dryomov 	}
34273b77faa0SIlya Dryomov 
34283b77faa0SIlya Dryomov out_unlock:
3429ed95b21aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
34303b77faa0SIlya Dryomov 	return result;
3431ed95b21aSIlya Dryomov }
3432ed95b21aSIlya Dryomov 
3433ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3434ed95b21aSIlya Dryomov 				     u64 notify_id, u64 cookie, s32 *result)
3435ed95b21aSIlya Dryomov {
3436ed95b21aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
343708a79102SKyle Spiers 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
343808a79102SKyle Spiers 	int buf_size = sizeof(buf);
3439ed95b21aSIlya Dryomov 	int ret;
3440ed95b21aSIlya Dryomov 
3441ed95b21aSIlya Dryomov 	if (result) {
3442ed95b21aSIlya Dryomov 		void *p = buf;
3443ed95b21aSIlya Dryomov 
3444ed95b21aSIlya Dryomov 		/* encode ResponseMessage */
3445ed95b21aSIlya Dryomov 		ceph_start_encoding(&p, 1, 1,
3446ed95b21aSIlya Dryomov 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
3447ed95b21aSIlya Dryomov 		ceph_encode_32(&p, *result);
3448ed95b21aSIlya Dryomov 	} else {
3449ed95b21aSIlya Dryomov 		buf_size = 0;
3450ed95b21aSIlya Dryomov 	}
3451ed95b21aSIlya Dryomov 
3452ed95b21aSIlya Dryomov 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3453ed95b21aSIlya Dryomov 				   &rbd_dev->header_oloc, notify_id, cookie,
3454ed95b21aSIlya Dryomov 				   buf, buf_size);
3455ed95b21aSIlya Dryomov 	if (ret)
3456ed95b21aSIlya Dryomov 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3457ed95b21aSIlya Dryomov }
3458ed95b21aSIlya Dryomov 
3459ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3460ed95b21aSIlya Dryomov 				   u64 cookie)
3461ed95b21aSIlya Dryomov {
3462ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3463ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3464ed95b21aSIlya Dryomov }
3465ed95b21aSIlya Dryomov 
3466ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3467ed95b21aSIlya Dryomov 					  u64 notify_id, u64 cookie, s32 result)
3468ed95b21aSIlya Dryomov {
3469ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3470ed95b21aSIlya Dryomov 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3471ed95b21aSIlya Dryomov }
3472922dab61SIlya Dryomov 
3473922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3474922dab61SIlya Dryomov 			 u64 notifier_id, void *data, size_t data_len)
3475b8d70035SAlex Elder {
3476922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3477ed95b21aSIlya Dryomov 	void *p = data;
3478ed95b21aSIlya Dryomov 	void *const end = p + data_len;
3479d4c2269bSIlya Dryomov 	u8 struct_v = 0;
3480ed95b21aSIlya Dryomov 	u32 len;
3481ed95b21aSIlya Dryomov 	u32 notify_op;
3482b8d70035SAlex Elder 	int ret;
3483b8d70035SAlex Elder 
3484ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3485ed95b21aSIlya Dryomov 	     __func__, rbd_dev, cookie, notify_id, data_len);
3486ed95b21aSIlya Dryomov 	if (data_len) {
3487ed95b21aSIlya Dryomov 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3488ed95b21aSIlya Dryomov 					  &struct_v, &len);
3489ed95b21aSIlya Dryomov 		if (ret) {
3490ed95b21aSIlya Dryomov 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3491ed95b21aSIlya Dryomov 				 ret);
3492ed95b21aSIlya Dryomov 			return;
3493ed95b21aSIlya Dryomov 		}
349452bb1f9bSIlya Dryomov 
3495ed95b21aSIlya Dryomov 		notify_op = ceph_decode_32(&p);
3496ed95b21aSIlya Dryomov 	} else {
3497ed95b21aSIlya Dryomov 		/* legacy notification for header updates */
3498ed95b21aSIlya Dryomov 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3499ed95b21aSIlya Dryomov 		len = 0;
3500ed95b21aSIlya Dryomov 	}
3501ed95b21aSIlya Dryomov 
3502ed95b21aSIlya Dryomov 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3503ed95b21aSIlya Dryomov 	switch (notify_op) {
3504ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3505ed95b21aSIlya Dryomov 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3506ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3507ed95b21aSIlya Dryomov 		break;
3508ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_RELEASED_LOCK:
3509ed95b21aSIlya Dryomov 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
3510ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3511ed95b21aSIlya Dryomov 		break;
3512ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_REQUEST_LOCK:
35133b77faa0SIlya Dryomov 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
35143b77faa0SIlya Dryomov 		if (ret <= 0)
3515ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
35163b77faa0SIlya Dryomov 						      cookie, ret);
3517ed95b21aSIlya Dryomov 		else
3518ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3519ed95b21aSIlya Dryomov 		break;
3520ed95b21aSIlya Dryomov 	case RBD_NOTIFY_OP_HEADER_UPDATE:
3521e627db08SAlex Elder 		ret = rbd_dev_refresh(rbd_dev);
3522e627db08SAlex Elder 		if (ret)
35239584d508SIlya Dryomov 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
3524b8d70035SAlex Elder 
3525ed95b21aSIlya Dryomov 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3526ed95b21aSIlya Dryomov 		break;
3527ed95b21aSIlya Dryomov 	default:
3528ed95b21aSIlya Dryomov 		if (rbd_is_lock_owner(rbd_dev))
3529ed95b21aSIlya Dryomov 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
3530ed95b21aSIlya Dryomov 						      cookie, -EOPNOTSUPP);
3531ed95b21aSIlya Dryomov 		else
3532ed95b21aSIlya Dryomov 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3533ed95b21aSIlya Dryomov 		break;
3534b8d70035SAlex Elder 	}
3535b8d70035SAlex Elder }
3536b8d70035SAlex Elder 
353799d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
35389969ebc5SAlex Elder 
3539922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3540bb040aa0SIlya Dryomov {
3541922dab61SIlya Dryomov 	struct rbd_device *rbd_dev = arg;
3542bb040aa0SIlya Dryomov 
3543922dab61SIlya Dryomov 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
3544bb040aa0SIlya Dryomov 
3545ed95b21aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
3546ed95b21aSIlya Dryomov 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3547ed95b21aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
3548bb040aa0SIlya Dryomov 
354999d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
355099d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
355199d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
355299d16943SIlya Dryomov 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3553bb040aa0SIlya Dryomov 
355499d16943SIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3555bb040aa0SIlya Dryomov 	}
355699d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
3557bb040aa0SIlya Dryomov }
3558bb040aa0SIlya Dryomov 
3559bb040aa0SIlya Dryomov /*
356099d16943SIlya Dryomov  * watch_mutex must be locked
35619969ebc5SAlex Elder  */
356299d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
35639969ebc5SAlex Elder {
35649969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3565922dab61SIlya Dryomov 	struct ceph_osd_linger_request *handle;
35669969ebc5SAlex Elder 
3567922dab61SIlya Dryomov 	rbd_assert(!rbd_dev->watch_handle);
356899d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
35699969ebc5SAlex Elder 
3570922dab61SIlya Dryomov 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3571922dab61SIlya Dryomov 				 &rbd_dev->header_oloc, rbd_watch_cb,
3572922dab61SIlya Dryomov 				 rbd_watch_errcb, rbd_dev);
3573922dab61SIlya Dryomov 	if (IS_ERR(handle))
3574922dab61SIlya Dryomov 		return PTR_ERR(handle);
35759969ebc5SAlex Elder 
3576922dab61SIlya Dryomov 	rbd_dev->watch_handle = handle;
35778eb87565SAlex Elder 	return 0;
35789969ebc5SAlex Elder }
35799969ebc5SAlex Elder 
358099d16943SIlya Dryomov /*
358199d16943SIlya Dryomov  * watch_mutex must be locked
358299d16943SIlya Dryomov  */
358399d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3584fca27065SIlya Dryomov {
3585922dab61SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3586922dab61SIlya Dryomov 	int ret;
3587b30a01f2SIlya Dryomov 
358899d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_handle);
358999d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3590b30a01f2SIlya Dryomov 
3591922dab61SIlya Dryomov 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3592922dab61SIlya Dryomov 	if (ret)
3593922dab61SIlya Dryomov 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3594b30a01f2SIlya Dryomov 
3595922dab61SIlya Dryomov 	rbd_dev->watch_handle = NULL;
3596c525f036SIlya Dryomov }
3597c525f036SIlya Dryomov 
359899d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
3599c525f036SIlya Dryomov {
360099d16943SIlya Dryomov 	int ret;
3601811c6688SIlya Dryomov 
360299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
360399d16943SIlya Dryomov 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
360499d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
360599d16943SIlya Dryomov 	if (ret)
360699d16943SIlya Dryomov 		goto out;
360799d16943SIlya Dryomov 
360899d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
360999d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
361099d16943SIlya Dryomov 
361199d16943SIlya Dryomov out:
361299d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
361399d16943SIlya Dryomov 	return ret;
361499d16943SIlya Dryomov }
361599d16943SIlya Dryomov 
361699d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
361799d16943SIlya Dryomov {
361899d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
361999d16943SIlya Dryomov 
3620ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->acquired_lock_work);
3621ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->released_lock_work);
3622ed95b21aSIlya Dryomov 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3623ed95b21aSIlya Dryomov 	cancel_work_sync(&rbd_dev->unlock_work);
362499d16943SIlya Dryomov }
362599d16943SIlya Dryomov 
362699d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
362799d16943SIlya Dryomov {
3628ed95b21aSIlya Dryomov 	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
362999d16943SIlya Dryomov 	cancel_tasks_sync(rbd_dev);
363099d16943SIlya Dryomov 
363199d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
363299d16943SIlya Dryomov 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
363399d16943SIlya Dryomov 		__rbd_unregister_watch(rbd_dev);
363499d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
363599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
363699d16943SIlya Dryomov 
363723edca86SDongsheng Yang 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3638811c6688SIlya Dryomov 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3639fca27065SIlya Dryomov }
3640fca27065SIlya Dryomov 
364114bb211dSIlya Dryomov /*
364214bb211dSIlya Dryomov  * lock_rwsem must be held for write
364314bb211dSIlya Dryomov  */
364414bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
364514bb211dSIlya Dryomov {
364614bb211dSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
364714bb211dSIlya Dryomov 	char cookie[32];
364814bb211dSIlya Dryomov 	int ret;
364914bb211dSIlya Dryomov 
365014bb211dSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
365114bb211dSIlya Dryomov 
365214bb211dSIlya Dryomov 	format_lock_cookie(rbd_dev, cookie);
365314bb211dSIlya Dryomov 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
365414bb211dSIlya Dryomov 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
365514bb211dSIlya Dryomov 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
365614bb211dSIlya Dryomov 				  RBD_LOCK_TAG, cookie);
365714bb211dSIlya Dryomov 	if (ret) {
365814bb211dSIlya Dryomov 		if (ret != -EOPNOTSUPP)
365914bb211dSIlya Dryomov 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
366014bb211dSIlya Dryomov 				 ret);
366114bb211dSIlya Dryomov 
366214bb211dSIlya Dryomov 		/*
366314bb211dSIlya Dryomov 		 * Lock cookie cannot be updated on older OSDs, so do
366414bb211dSIlya Dryomov 		 * a manual release and queue an acquire.
366514bb211dSIlya Dryomov 		 */
366614bb211dSIlya Dryomov 		if (rbd_release_lock(rbd_dev))
366714bb211dSIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
366814bb211dSIlya Dryomov 					   &rbd_dev->lock_dwork, 0);
366914bb211dSIlya Dryomov 	} else {
3670edd8ca80SFlorian Margaine 		__rbd_lock(rbd_dev, cookie);
367114bb211dSIlya Dryomov 	}
367214bb211dSIlya Dryomov }
367314bb211dSIlya Dryomov 
367499d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
367599d16943SIlya Dryomov {
367699d16943SIlya Dryomov 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
367799d16943SIlya Dryomov 					    struct rbd_device, watch_dwork);
367899d16943SIlya Dryomov 	int ret;
367999d16943SIlya Dryomov 
368099d16943SIlya Dryomov 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
368199d16943SIlya Dryomov 
368299d16943SIlya Dryomov 	mutex_lock(&rbd_dev->watch_mutex);
368387c0fdedSIlya Dryomov 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
368487c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
368514bb211dSIlya Dryomov 		return;
368687c0fdedSIlya Dryomov 	}
368799d16943SIlya Dryomov 
368899d16943SIlya Dryomov 	ret = __rbd_register_watch(rbd_dev);
368999d16943SIlya Dryomov 	if (ret) {
369099d16943SIlya Dryomov 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
36914d73644bSIlya Dryomov 		if (ret == -EBLACKLISTED || ret == -ENOENT) {
369287c0fdedSIlya Dryomov 			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
369314bb211dSIlya Dryomov 			wake_requests(rbd_dev, true);
369487c0fdedSIlya Dryomov 		} else {
369599d16943SIlya Dryomov 			queue_delayed_work(rbd_dev->task_wq,
369699d16943SIlya Dryomov 					   &rbd_dev->watch_dwork,
369799d16943SIlya Dryomov 					   RBD_RETRY_DELAY);
369887c0fdedSIlya Dryomov 		}
369987c0fdedSIlya Dryomov 		mutex_unlock(&rbd_dev->watch_mutex);
370014bb211dSIlya Dryomov 		return;
370199d16943SIlya Dryomov 	}
370299d16943SIlya Dryomov 
370399d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
370499d16943SIlya Dryomov 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
370599d16943SIlya Dryomov 	mutex_unlock(&rbd_dev->watch_mutex);
370699d16943SIlya Dryomov 
370714bb211dSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
370814bb211dSIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
370914bb211dSIlya Dryomov 		rbd_reacquire_lock(rbd_dev);
371014bb211dSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
371114bb211dSIlya Dryomov 
371299d16943SIlya Dryomov 	ret = rbd_dev_refresh(rbd_dev);
371399d16943SIlya Dryomov 	if (ret)
3714f6870cc9SColin Ian King 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
371599d16943SIlya Dryomov }
371699d16943SIlya Dryomov 
371736be9a76SAlex Elder /*
3718f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3719f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
372036be9a76SAlex Elder  */
372136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3722ecd4a68aSIlya Dryomov 			     struct ceph_object_id *oid,
3723ecd4a68aSIlya Dryomov 			     struct ceph_object_locator *oloc,
372436be9a76SAlex Elder 			     const char *method_name,
37254157976bSAlex Elder 			     const void *outbound,
372636be9a76SAlex Elder 			     size_t outbound_size,
37274157976bSAlex Elder 			     void *inbound,
3728e2a58ee5SAlex Elder 			     size_t inbound_size)
372936be9a76SAlex Elder {
3730ecd4a68aSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3731ecd4a68aSIlya Dryomov 	struct page *req_page = NULL;
3732ecd4a68aSIlya Dryomov 	struct page *reply_page;
373336be9a76SAlex Elder 	int ret;
373436be9a76SAlex Elder 
373536be9a76SAlex Elder 	/*
37366010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
37376010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
37386010a451SAlex Elder 	 * also supply outbound data--parameters for the object
37396010a451SAlex Elder 	 * method.  Currently if this is present it will be a
37406010a451SAlex Elder 	 * snapshot id.
374136be9a76SAlex Elder 	 */
3742ecd4a68aSIlya Dryomov 	if (outbound) {
3743ecd4a68aSIlya Dryomov 		if (outbound_size > PAGE_SIZE)
3744ecd4a68aSIlya Dryomov 			return -E2BIG;
374536be9a76SAlex Elder 
3746ecd4a68aSIlya Dryomov 		req_page = alloc_page(GFP_KERNEL);
3747ecd4a68aSIlya Dryomov 		if (!req_page)
3748ecd4a68aSIlya Dryomov 			return -ENOMEM;
374936be9a76SAlex Elder 
3750ecd4a68aSIlya Dryomov 		memcpy(page_address(req_page), outbound, outbound_size);
375104017e29SAlex Elder 	}
3752430c28c3SAlex Elder 
3753ecd4a68aSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
3754ecd4a68aSIlya Dryomov 	if (!reply_page) {
3755ecd4a68aSIlya Dryomov 		if (req_page)
3756ecd4a68aSIlya Dryomov 			__free_page(req_page);
3757ecd4a68aSIlya Dryomov 		return -ENOMEM;
3758ecd4a68aSIlya Dryomov 	}
375936be9a76SAlex Elder 
3760ecd4a68aSIlya Dryomov 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3761ecd4a68aSIlya Dryomov 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
3762ecd4a68aSIlya Dryomov 			     reply_page, &inbound_size);
3763ecd4a68aSIlya Dryomov 	if (!ret) {
3764ecd4a68aSIlya Dryomov 		memcpy(inbound, page_address(reply_page), inbound_size);
3765ecd4a68aSIlya Dryomov 		ret = inbound_size;
3766ecd4a68aSIlya Dryomov 	}
376757385b51SAlex Elder 
3768ecd4a68aSIlya Dryomov 	if (req_page)
3769ecd4a68aSIlya Dryomov 		__free_page(req_page);
3770ecd4a68aSIlya Dryomov 	__free_page(reply_page);
377136be9a76SAlex Elder 	return ret;
377236be9a76SAlex Elder }
377336be9a76SAlex Elder 
3774ed95b21aSIlya Dryomov /*
3775ed95b21aSIlya Dryomov  * lock_rwsem must be held for read
3776ed95b21aSIlya Dryomov  */
37772f18d466SIlya Dryomov static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
3778ed95b21aSIlya Dryomov {
3779ed95b21aSIlya Dryomov 	DEFINE_WAIT(wait);
378034f55d0bSDongsheng Yang 	unsigned long timeout;
37812f18d466SIlya Dryomov 	int ret = 0;
37822f18d466SIlya Dryomov 
37832f18d466SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
37842f18d466SIlya Dryomov 		return -EBLACKLISTED;
37852f18d466SIlya Dryomov 
37862f18d466SIlya Dryomov 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
37872f18d466SIlya Dryomov 		return 0;
37882f18d466SIlya Dryomov 
37892f18d466SIlya Dryomov 	if (!may_acquire) {
37902f18d466SIlya Dryomov 		rbd_warn(rbd_dev, "exclusive lock required");
37912f18d466SIlya Dryomov 		return -EROFS;
37922f18d466SIlya Dryomov 	}
3793ed95b21aSIlya Dryomov 
3794ed95b21aSIlya Dryomov 	do {
3795ed95b21aSIlya Dryomov 		/*
3796ed95b21aSIlya Dryomov 		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3797ed95b21aSIlya Dryomov 		 * and cancel_delayed_work() in wake_requests().
3798ed95b21aSIlya Dryomov 		 */
3799ed95b21aSIlya Dryomov 		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3800ed95b21aSIlya Dryomov 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3801ed95b21aSIlya Dryomov 		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3802ed95b21aSIlya Dryomov 					  TASK_UNINTERRUPTIBLE);
3803ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
380434f55d0bSDongsheng Yang 		timeout = schedule_timeout(ceph_timeout_jiffies(
380534f55d0bSDongsheng Yang 						rbd_dev->opts->lock_timeout));
3806ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
38072f18d466SIlya Dryomov 		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
38082f18d466SIlya Dryomov 			ret = -EBLACKLISTED;
38092f18d466SIlya Dryomov 			break;
38102f18d466SIlya Dryomov 		}
381134f55d0bSDongsheng Yang 		if (!timeout) {
381234f55d0bSDongsheng Yang 			rbd_warn(rbd_dev, "timed out waiting for lock");
381334f55d0bSDongsheng Yang 			ret = -ETIMEDOUT;
381434f55d0bSDongsheng Yang 			break;
381534f55d0bSDongsheng Yang 		}
38162f18d466SIlya Dryomov 	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
381787c0fdedSIlya Dryomov 
3818ed95b21aSIlya Dryomov 	finish_wait(&rbd_dev->lock_waitq, &wait);
38192f18d466SIlya Dryomov 	return ret;
3820ed95b21aSIlya Dryomov }
3821ed95b21aSIlya Dryomov 
38227ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
3823bc1ecc65SIlya Dryomov {
38247ad18afaSChristoph Hellwig 	struct request *rq = blk_mq_rq_from_pdu(work);
38257ad18afaSChristoph Hellwig 	struct rbd_device *rbd_dev = rq->q->queuedata;
3826bc1ecc65SIlya Dryomov 	struct rbd_img_request *img_request;
38274e752f0aSJosh Durgin 	struct ceph_snap_context *snapc = NULL;
3828bc1ecc65SIlya Dryomov 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3829bc1ecc65SIlya Dryomov 	u64 length = blk_rq_bytes(rq);
38306d2940c8SGuangliang Zhao 	enum obj_operation_type op_type;
38314e752f0aSJosh Durgin 	u64 mapping_size;
383280de1912SIlya Dryomov 	bool must_be_locked;
3833bc1ecc65SIlya Dryomov 	int result;
3834bc1ecc65SIlya Dryomov 
3835aebf526bSChristoph Hellwig 	switch (req_op(rq)) {
3836aebf526bSChristoph Hellwig 	case REQ_OP_DISCARD:
3837aebf526bSChristoph Hellwig 		op_type = OBJ_OP_DISCARD;
3838aebf526bSChristoph Hellwig 		break;
38396484cbe9SIlya Dryomov 	case REQ_OP_WRITE_ZEROES:
38406484cbe9SIlya Dryomov 		op_type = OBJ_OP_ZEROOUT;
38416484cbe9SIlya Dryomov 		break;
3842aebf526bSChristoph Hellwig 	case REQ_OP_WRITE:
3843aebf526bSChristoph Hellwig 		op_type = OBJ_OP_WRITE;
3844aebf526bSChristoph Hellwig 		break;
3845aebf526bSChristoph Hellwig 	case REQ_OP_READ:
3846aebf526bSChristoph Hellwig 		op_type = OBJ_OP_READ;
3847aebf526bSChristoph Hellwig 		break;
3848aebf526bSChristoph Hellwig 	default:
3849aebf526bSChristoph Hellwig 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
38507ad18afaSChristoph Hellwig 		result = -EIO;
38517ad18afaSChristoph Hellwig 		goto err;
38527ad18afaSChristoph Hellwig 	}
38537ad18afaSChristoph Hellwig 
3854bc1ecc65SIlya Dryomov 	/* Ignore/skip any zero-length requests */
3855bc1ecc65SIlya Dryomov 
3856bc1ecc65SIlya Dryomov 	if (!length) {
3857bc1ecc65SIlya Dryomov 		dout("%s: zero-length request\n", __func__);
3858bc1ecc65SIlya Dryomov 		result = 0;
3859bc1ecc65SIlya Dryomov 		goto err_rq;
3860bc1ecc65SIlya Dryomov 	}
3861bc1ecc65SIlya Dryomov 
38629568c93eSIlya Dryomov 	rbd_assert(op_type == OBJ_OP_READ ||
38639568c93eSIlya Dryomov 		   rbd_dev->spec->snap_id == CEPH_NOSNAP);
3864bc1ecc65SIlya Dryomov 
3865bc1ecc65SIlya Dryomov 	/*
3866bc1ecc65SIlya Dryomov 	 * Quit early if the mapped snapshot no longer exists.  It's
3867bc1ecc65SIlya Dryomov 	 * still possible the snapshot will have disappeared by the
3868bc1ecc65SIlya Dryomov 	 * time our request arrives at the osd, but there's no sense in
3869bc1ecc65SIlya Dryomov 	 * sending it if we already know.
3870bc1ecc65SIlya Dryomov 	 */
3871bc1ecc65SIlya Dryomov 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3872bc1ecc65SIlya Dryomov 		dout("request for non-existent snapshot");
3873bc1ecc65SIlya Dryomov 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3874bc1ecc65SIlya Dryomov 		result = -ENXIO;
3875bc1ecc65SIlya Dryomov 		goto err_rq;
3876bc1ecc65SIlya Dryomov 	}
3877bc1ecc65SIlya Dryomov 
3878bc1ecc65SIlya Dryomov 	if (offset && length > U64_MAX - offset + 1) {
3879bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3880bc1ecc65SIlya Dryomov 			 length);
3881bc1ecc65SIlya Dryomov 		result = -EINVAL;
3882bc1ecc65SIlya Dryomov 		goto err_rq;	/* Shouldn't happen */
3883bc1ecc65SIlya Dryomov 	}
3884bc1ecc65SIlya Dryomov 
38857ad18afaSChristoph Hellwig 	blk_mq_start_request(rq);
38867ad18afaSChristoph Hellwig 
38874e752f0aSJosh Durgin 	down_read(&rbd_dev->header_rwsem);
38884e752f0aSJosh Durgin 	mapping_size = rbd_dev->mapping.size;
38896d2940c8SGuangliang Zhao 	if (op_type != OBJ_OP_READ) {
38904e752f0aSJosh Durgin 		snapc = rbd_dev->header.snapc;
38914e752f0aSJosh Durgin 		ceph_get_snap_context(snapc);
38924e752f0aSJosh Durgin 	}
38934e752f0aSJosh Durgin 	up_read(&rbd_dev->header_rwsem);
38944e752f0aSJosh Durgin 
38954e752f0aSJosh Durgin 	if (offset + length > mapping_size) {
3896bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
38974e752f0aSJosh Durgin 			 length, mapping_size);
3898bc1ecc65SIlya Dryomov 		result = -EIO;
3899bc1ecc65SIlya Dryomov 		goto err_rq;
3900bc1ecc65SIlya Dryomov 	}
3901bc1ecc65SIlya Dryomov 
3902f9bebd58SIlya Dryomov 	must_be_locked =
3903f9bebd58SIlya Dryomov 	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3904f9bebd58SIlya Dryomov 	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
3905ed95b21aSIlya Dryomov 	if (must_be_locked) {
3906ed95b21aSIlya Dryomov 		down_read(&rbd_dev->lock_rwsem);
39072f18d466SIlya Dryomov 		result = rbd_wait_state_locked(rbd_dev,
39082f18d466SIlya Dryomov 					       !rbd_dev->opts->exclusive);
39092f18d466SIlya Dryomov 		if (result)
3910e010dd0aSIlya Dryomov 			goto err_unlock;
3911e010dd0aSIlya Dryomov 	}
3912ed95b21aSIlya Dryomov 
3913dfd9875fSIlya Dryomov 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
3914bc1ecc65SIlya Dryomov 	if (!img_request) {
3915bc1ecc65SIlya Dryomov 		result = -ENOMEM;
3916ed95b21aSIlya Dryomov 		goto err_unlock;
3917bc1ecc65SIlya Dryomov 	}
3918bc1ecc65SIlya Dryomov 	img_request->rq = rq;
391970b16db8SIlya Dryomov 	snapc = NULL; /* img_request consumes a ref */
3920bc1ecc65SIlya Dryomov 
39216484cbe9SIlya Dryomov 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
39225a237819SIlya Dryomov 		result = rbd_img_fill_nodata(img_request, offset, length);
392390e98c52SGuangliang Zhao 	else
39245a237819SIlya Dryomov 		result = rbd_img_fill_from_bio(img_request, offset, length,
392590e98c52SGuangliang Zhao 					       rq->bio);
39260c93e1b7SIlya Dryomov 	if (result || !img_request->pending_count)
3927bc1ecc65SIlya Dryomov 		goto err_img_request;
3928bc1ecc65SIlya Dryomov 
3929efbd1a11SIlya Dryomov 	rbd_img_request_submit(img_request);
3930ed95b21aSIlya Dryomov 	if (must_be_locked)
3931ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3932bc1ecc65SIlya Dryomov 	return;
3933bc1ecc65SIlya Dryomov 
3934bc1ecc65SIlya Dryomov err_img_request:
3935bc1ecc65SIlya Dryomov 	rbd_img_request_put(img_request);
3936ed95b21aSIlya Dryomov err_unlock:
3937ed95b21aSIlya Dryomov 	if (must_be_locked)
3938ed95b21aSIlya Dryomov 		up_read(&rbd_dev->lock_rwsem);
3939bc1ecc65SIlya Dryomov err_rq:
3940bc1ecc65SIlya Dryomov 	if (result)
3941bc1ecc65SIlya Dryomov 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
39426d2940c8SGuangliang Zhao 			 obj_op_name(op_type), length, offset, result);
39434e752f0aSJosh Durgin 	ceph_put_snap_context(snapc);
39447ad18afaSChristoph Hellwig err:
39452a842acaSChristoph Hellwig 	blk_mq_end_request(rq, errno_to_blk_status(result));
3946bc1ecc65SIlya Dryomov }
3947bc1ecc65SIlya Dryomov 
3948fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
39497ad18afaSChristoph Hellwig 		const struct blk_mq_queue_data *bd)
3950bc1ecc65SIlya Dryomov {
39517ad18afaSChristoph Hellwig 	struct request *rq = bd->rq;
39527ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
3953bc1ecc65SIlya Dryomov 
39547ad18afaSChristoph Hellwig 	queue_work(rbd_wq, work);
3955fc17b653SChristoph Hellwig 	return BLK_STS_OK;
3956bf0d5f50SAlex Elder }
3957bf0d5f50SAlex Elder 
3958602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3959602adf40SYehuda Sadeh {
39605769ed0cSIlya Dryomov 	blk_cleanup_queue(rbd_dev->disk->queue);
39617ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
39625769ed0cSIlya Dryomov 	put_disk(rbd_dev->disk);
39635769ed0cSIlya Dryomov 	rbd_dev->disk = NULL;
3964602adf40SYehuda Sadeh }
3965602adf40SYehuda Sadeh 
3966788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3967fe5478e0SIlya Dryomov 			     struct ceph_object_id *oid,
3968fe5478e0SIlya Dryomov 			     struct ceph_object_locator *oloc,
3969fe5478e0SIlya Dryomov 			     void *buf, int buf_len)
3970788e2df3SAlex Elder 
3971788e2df3SAlex Elder {
3972fe5478e0SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3973fe5478e0SIlya Dryomov 	struct ceph_osd_request *req;
3974fe5478e0SIlya Dryomov 	struct page **pages;
3975fe5478e0SIlya Dryomov 	int num_pages = calc_pages_for(0, buf_len);
3976788e2df3SAlex Elder 	int ret;
3977788e2df3SAlex Elder 
3978fe5478e0SIlya Dryomov 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3979fe5478e0SIlya Dryomov 	if (!req)
3980fe5478e0SIlya Dryomov 		return -ENOMEM;
3981788e2df3SAlex Elder 
3982fe5478e0SIlya Dryomov 	ceph_oid_copy(&req->r_base_oid, oid);
3983fe5478e0SIlya Dryomov 	ceph_oloc_copy(&req->r_base_oloc, oloc);
3984fe5478e0SIlya Dryomov 	req->r_flags = CEPH_OSD_FLAG_READ;
3985788e2df3SAlex Elder 
3986fe5478e0SIlya Dryomov 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3987fe5478e0SIlya Dryomov 	if (IS_ERR(pages)) {
3988fe5478e0SIlya Dryomov 		ret = PTR_ERR(pages);
3989fe5478e0SIlya Dryomov 		goto out_req;
3990fe5478e0SIlya Dryomov 	}
39911ceae7efSAlex Elder 
3992fe5478e0SIlya Dryomov 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3993fe5478e0SIlya Dryomov 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3994fe5478e0SIlya Dryomov 					 true);
3995788e2df3SAlex Elder 
399626f887e0SIlya Dryomov 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
399726f887e0SIlya Dryomov 	if (ret)
399826f887e0SIlya Dryomov 		goto out_req;
399926f887e0SIlya Dryomov 
4000fe5478e0SIlya Dryomov 	ceph_osdc_start_request(osdc, req, false);
4001fe5478e0SIlya Dryomov 	ret = ceph_osdc_wait_request(osdc, req);
4002fe5478e0SIlya Dryomov 	if (ret >= 0)
4003fe5478e0SIlya Dryomov 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4004fe5478e0SIlya Dryomov 
4005fe5478e0SIlya Dryomov out_req:
4006fe5478e0SIlya Dryomov 	ceph_osdc_put_request(req);
4007788e2df3SAlex Elder 	return ret;
4008788e2df3SAlex Elder }
4009788e2df3SAlex Elder 
4010602adf40SYehuda Sadeh /*
4011662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
4012662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
4013662518b1SAlex Elder  * information about the image.
40144156d998SAlex Elder  */
401599a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
40164156d998SAlex Elder {
40174156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
40184156d998SAlex Elder 	u32 snap_count = 0;
40194156d998SAlex Elder 	u64 names_size = 0;
40204156d998SAlex Elder 	u32 want_count;
40214156d998SAlex Elder 	int ret;
40224156d998SAlex Elder 
40234156d998SAlex Elder 	/*
40244156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
40254156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
40264156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
40274156d998SAlex Elder 	 * the number of snapshots could change by the time we read
40284156d998SAlex Elder 	 * it in, in which case we re-read it.
40294156d998SAlex Elder 	 */
40304156d998SAlex Elder 	do {
40314156d998SAlex Elder 		size_t size;
40324156d998SAlex Elder 
40334156d998SAlex Elder 		kfree(ondisk);
40344156d998SAlex Elder 
40354156d998SAlex Elder 		size = sizeof (*ondisk);
40364156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
40374156d998SAlex Elder 		size += names_size;
40384156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
40394156d998SAlex Elder 		if (!ondisk)
4040662518b1SAlex Elder 			return -ENOMEM;
40414156d998SAlex Elder 
4042fe5478e0SIlya Dryomov 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4043fe5478e0SIlya Dryomov 					&rbd_dev->header_oloc, ondisk, size);
40444156d998SAlex Elder 		if (ret < 0)
4045662518b1SAlex Elder 			goto out;
4046c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
40474156d998SAlex Elder 			ret = -ENXIO;
404806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
404906ecc6cbSAlex Elder 				size, ret);
4050662518b1SAlex Elder 			goto out;
40514156d998SAlex Elder 		}
40524156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
40534156d998SAlex Elder 			ret = -ENXIO;
405406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
4055662518b1SAlex Elder 			goto out;
40564156d998SAlex Elder 		}
40574156d998SAlex Elder 
40584156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
40594156d998SAlex Elder 		want_count = snap_count;
40604156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
40614156d998SAlex Elder 	} while (snap_count != want_count);
40624156d998SAlex Elder 
4063662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4064662518b1SAlex Elder out:
40654156d998SAlex Elder 	kfree(ondisk);
40664156d998SAlex Elder 
4067dfc5606dSYehuda Sadeh 	return ret;
4068602adf40SYehuda Sadeh }
4069602adf40SYehuda Sadeh 
407015228edeSAlex Elder /*
407115228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
407215228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
407315228edeSAlex Elder  */
407415228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
407515228edeSAlex Elder {
407615228edeSAlex Elder 	u64 snap_id;
407715228edeSAlex Elder 
407815228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
407915228edeSAlex Elder 		return;
408015228edeSAlex Elder 
408115228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
408215228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
408315228edeSAlex Elder 		return;
408415228edeSAlex Elder 
408515228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
408615228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
408715228edeSAlex Elder }
408815228edeSAlex Elder 
40899875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
40909875201eSJosh Durgin {
40919875201eSJosh Durgin 	sector_t size;
40929875201eSJosh Durgin 
40939875201eSJosh Durgin 	/*
4094811c6688SIlya Dryomov 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4095811c6688SIlya Dryomov 	 * try to update its size.  If REMOVING is set, updating size
4096811c6688SIlya Dryomov 	 * is just useless work since the device can't be opened.
40979875201eSJosh Durgin 	 */
4098811c6688SIlya Dryomov 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4099811c6688SIlya Dryomov 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
41009875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
41019875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
41029875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
41039875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
41049875201eSJosh Durgin 	}
41059875201eSJosh Durgin }
41069875201eSJosh Durgin 
4107cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
41081fe5e993SAlex Elder {
4109e627db08SAlex Elder 	u64 mapping_size;
41101fe5e993SAlex Elder 	int ret;
41111fe5e993SAlex Elder 
4112cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
41133b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
4114a720ae09SIlya Dryomov 
4115a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
411652bb1f9bSIlya Dryomov 	if (ret)
411773e39e4dSIlya Dryomov 		goto out;
411815228edeSAlex Elder 
4119e8f59b59SIlya Dryomov 	/*
4120e8f59b59SIlya Dryomov 	 * If there is a parent, see if it has disappeared due to the
4121e8f59b59SIlya Dryomov 	 * mapped image getting flattened.
4122e8f59b59SIlya Dryomov 	 */
4123e8f59b59SIlya Dryomov 	if (rbd_dev->parent) {
4124e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
4125e8f59b59SIlya Dryomov 		if (ret)
412673e39e4dSIlya Dryomov 			goto out;
4127e8f59b59SIlya Dryomov 	}
4128e8f59b59SIlya Dryomov 
41295ff1108cSIlya Dryomov 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
41305ff1108cSIlya Dryomov 		rbd_dev->mapping.size = rbd_dev->header.image_size;
41315ff1108cSIlya Dryomov 	} else {
41325ff1108cSIlya Dryomov 		/* validate mapped snapshot's EXISTS flag */
413315228edeSAlex Elder 		rbd_exists_validate(rbd_dev);
41345ff1108cSIlya Dryomov 	}
41355ff1108cSIlya Dryomov 
413673e39e4dSIlya Dryomov out:
4137cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
413873e39e4dSIlya Dryomov 	if (!ret && mapping_size != rbd_dev->mapping.size)
41399875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
41401fe5e993SAlex Elder 
414173e39e4dSIlya Dryomov 	return ret;
41421fe5e993SAlex Elder }
41431fe5e993SAlex Elder 
4144d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4145d6296d39SChristoph Hellwig 		unsigned int hctx_idx, unsigned int numa_node)
41467ad18afaSChristoph Hellwig {
41477ad18afaSChristoph Hellwig 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
41487ad18afaSChristoph Hellwig 
41497ad18afaSChristoph Hellwig 	INIT_WORK(work, rbd_queue_workfn);
41507ad18afaSChristoph Hellwig 	return 0;
41517ad18afaSChristoph Hellwig }
41527ad18afaSChristoph Hellwig 
4153f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = {
41547ad18afaSChristoph Hellwig 	.queue_rq	= rbd_queue_rq,
41557ad18afaSChristoph Hellwig 	.init_request	= rbd_init_request,
41567ad18afaSChristoph Hellwig };
41577ad18afaSChristoph Hellwig 
4158602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4159602adf40SYehuda Sadeh {
4160602adf40SYehuda Sadeh 	struct gendisk *disk;
4161602adf40SYehuda Sadeh 	struct request_queue *q;
4162420efbdfSIlya Dryomov 	unsigned int objset_bytes =
4163420efbdfSIlya Dryomov 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
41647ad18afaSChristoph Hellwig 	int err;
4165602adf40SYehuda Sadeh 
4166602adf40SYehuda Sadeh 	/* create gendisk info */
41677e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
41687e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
41697e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
4170602adf40SYehuda Sadeh 	if (!disk)
41711fcdb8aaSAlex Elder 		return -ENOMEM;
4172602adf40SYehuda Sadeh 
4173f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4174de71a297SAlex Elder 		 rbd_dev->dev_id);
4175602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
4176dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
41777e513d43SIlya Dryomov 	if (single_major)
41787e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
4179602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
4180602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
4181602adf40SYehuda Sadeh 
41827ad18afaSChristoph Hellwig 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
41837ad18afaSChristoph Hellwig 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4184b5584180SIlya Dryomov 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
41857ad18afaSChristoph Hellwig 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4186b5584180SIlya Dryomov 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
41877ad18afaSChristoph Hellwig 	rbd_dev->tag_set.nr_hw_queues = 1;
41887ad18afaSChristoph Hellwig 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
41897ad18afaSChristoph Hellwig 
41907ad18afaSChristoph Hellwig 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
41917ad18afaSChristoph Hellwig 	if (err)
4192602adf40SYehuda Sadeh 		goto out_disk;
4193029bcbd8SJosh Durgin 
41947ad18afaSChristoph Hellwig 	q = blk_mq_init_queue(&rbd_dev->tag_set);
41957ad18afaSChristoph Hellwig 	if (IS_ERR(q)) {
41967ad18afaSChristoph Hellwig 		err = PTR_ERR(q);
41977ad18afaSChristoph Hellwig 		goto out_tag_set;
41987ad18afaSChristoph Hellwig 	}
41997ad18afaSChristoph Hellwig 
42008b904b5bSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
4201d8a2c89cSIlya Dryomov 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4202593a9e7bSAlex Elder 
4203420efbdfSIlya Dryomov 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
42040d9fde4fSIlya Dryomov 	q->limits.max_sectors = queue_max_hw_sectors(q);
420521acdf45SIlya Dryomov 	blk_queue_max_segments(q, USHRT_MAX);
420624f1df60SIlya Dryomov 	blk_queue_max_segment_size(q, UINT_MAX);
4207420efbdfSIlya Dryomov 	blk_queue_io_min(q, objset_bytes);
4208420efbdfSIlya Dryomov 	blk_queue_io_opt(q, objset_bytes);
4209029bcbd8SJosh Durgin 
4210d9360540SIlya Dryomov 	if (rbd_dev->opts->trim) {
42118b904b5bSBart Van Assche 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
4212420efbdfSIlya Dryomov 		q->limits.discard_granularity = objset_bytes;
4213420efbdfSIlya Dryomov 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4214420efbdfSIlya Dryomov 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4215d9360540SIlya Dryomov 	}
421690e98c52SGuangliang Zhao 
4217bae818eeSRonny Hegewald 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4218dc3b17ccSJan Kara 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
4219bae818eeSRonny Hegewald 
42205769ed0cSIlya Dryomov 	/*
42215769ed0cSIlya Dryomov 	 * disk_release() expects a queue ref from add_disk() and will
42225769ed0cSIlya Dryomov 	 * put it.  Hold an extra ref until add_disk() is called.
42235769ed0cSIlya Dryomov 	 */
42245769ed0cSIlya Dryomov 	WARN_ON(!blk_get_queue(q));
4225602adf40SYehuda Sadeh 	disk->queue = q;
4226602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
4227602adf40SYehuda Sadeh 
4228602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
4229602adf40SYehuda Sadeh 
4230602adf40SYehuda Sadeh 	return 0;
42317ad18afaSChristoph Hellwig out_tag_set:
42327ad18afaSChristoph Hellwig 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4233602adf40SYehuda Sadeh out_disk:
4234602adf40SYehuda Sadeh 	put_disk(disk);
42357ad18afaSChristoph Hellwig 	return err;
4236602adf40SYehuda Sadeh }
4237602adf40SYehuda Sadeh 
4238dfc5606dSYehuda Sadeh /*
4239dfc5606dSYehuda Sadeh   sysfs
4240dfc5606dSYehuda Sadeh */
4241602adf40SYehuda Sadeh 
4242593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4243593a9e7bSAlex Elder {
4244593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
4245593a9e7bSAlex Elder }
4246593a9e7bSAlex Elder 
4247dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
4248dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4249602adf40SYehuda Sadeh {
4250593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4251dfc5606dSYehuda Sadeh 
4252fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
4253fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
4254602adf40SYehuda Sadeh }
4255602adf40SYehuda Sadeh 
425634b13184SAlex Elder /*
425734b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
425834b13184SAlex Elder  * necessarily the base image.
425934b13184SAlex Elder  */
426034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
426134b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
426234b13184SAlex Elder {
426334b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
426434b13184SAlex Elder 
426534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
426634b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
426734b13184SAlex Elder }
426834b13184SAlex Elder 
4269dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
4270dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
4271602adf40SYehuda Sadeh {
4272593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4273dfc5606dSYehuda Sadeh 
4274fc71d833SAlex Elder 	if (rbd_dev->major)
4275dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
4276fc71d833SAlex Elder 
4277fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
4278dd82fff1SIlya Dryomov }
4279fc71d833SAlex Elder 
4280dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
4281dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
4282dd82fff1SIlya Dryomov {
4283dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4284dd82fff1SIlya Dryomov 
4285dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
4286dfc5606dSYehuda Sadeh }
4287dfc5606dSYehuda Sadeh 
4288005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
4289005a07bfSIlya Dryomov 				    struct device_attribute *attr, char *buf)
4290005a07bfSIlya Dryomov {
4291005a07bfSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4292005a07bfSIlya Dryomov 	struct ceph_entity_addr *client_addr =
4293005a07bfSIlya Dryomov 	    ceph_client_addr(rbd_dev->rbd_client->client);
4294005a07bfSIlya Dryomov 
4295005a07bfSIlya Dryomov 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4296005a07bfSIlya Dryomov 		       le32_to_cpu(client_addr->nonce));
4297005a07bfSIlya Dryomov }
4298005a07bfSIlya Dryomov 
4299dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
4300dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
4301dfc5606dSYehuda Sadeh {
4302593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4303dfc5606dSYehuda Sadeh 
43041dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
4305033268a5SIlya Dryomov 		       ceph_client_gid(rbd_dev->rbd_client->client));
4306dfc5606dSYehuda Sadeh }
4307dfc5606dSYehuda Sadeh 
4308267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
4309267fb90bSMike Christie 				     struct device_attribute *attr, char *buf)
4310267fb90bSMike Christie {
4311267fb90bSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4312267fb90bSMike Christie 
4313267fb90bSMike Christie 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4314267fb90bSMike Christie }
4315267fb90bSMike Christie 
43160d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
43170d6d1e9cSMike Christie 				    struct device_attribute *attr, char *buf)
43180d6d1e9cSMike Christie {
43190d6d1e9cSMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
43200d6d1e9cSMike Christie 
43210d6d1e9cSMike Christie 	return sprintf(buf, "%s\n", rbd_dev->config_info);
4322dfc5606dSYehuda Sadeh }
4323dfc5606dSYehuda Sadeh 
4324dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
4325dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4326dfc5606dSYehuda Sadeh {
4327593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4328dfc5606dSYehuda Sadeh 
43290d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4330dfc5606dSYehuda Sadeh }
4331dfc5606dSYehuda Sadeh 
43329bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
43339bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
43349bb2f334SAlex Elder {
43359bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
43369bb2f334SAlex Elder 
43370d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
43380d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
43399bb2f334SAlex Elder }
43409bb2f334SAlex Elder 
4341b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
4342b26c047bSIlya Dryomov 				struct device_attribute *attr, char *buf)
4343b26c047bSIlya Dryomov {
4344b26c047bSIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4345b26c047bSIlya Dryomov 
4346b26c047bSIlya Dryomov 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4347b26c047bSIlya Dryomov }
4348b26c047bSIlya Dryomov 
4349dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
4350dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
4351dfc5606dSYehuda Sadeh {
4352593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4353dfc5606dSYehuda Sadeh 
4354a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
43550d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4356a92ffdf8SAlex Elder 
4357a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
4358dfc5606dSYehuda Sadeh }
4359dfc5606dSYehuda Sadeh 
4360589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
4361589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
4362589d30e0SAlex Elder {
4363589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4364589d30e0SAlex Elder 
43650d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4366589d30e0SAlex Elder }
4367589d30e0SAlex Elder 
436834b13184SAlex Elder /*
436934b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
437034b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
437134b13184SAlex Elder  */
4372dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
4373dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
4374dfc5606dSYehuda Sadeh 			     char *buf)
4375dfc5606dSYehuda Sadeh {
4376593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4377dfc5606dSYehuda Sadeh 
43780d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4379dfc5606dSYehuda Sadeh }
4380dfc5606dSYehuda Sadeh 
438192a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
438292a58671SMike Christie 				struct device_attribute *attr, char *buf)
438392a58671SMike Christie {
438492a58671SMike Christie 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
438592a58671SMike Christie 
438692a58671SMike Christie 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
438792a58671SMike Christie }
438892a58671SMike Christie 
438986b00e0dSAlex Elder /*
4390ff96128fSIlya Dryomov  * For a v2 image, shows the chain of parent images, separated by empty
4391ff96128fSIlya Dryomov  * lines.  For v1 images or if there is no parent, shows "(no parent
4392ff96128fSIlya Dryomov  * image)".
439386b00e0dSAlex Elder  */
439486b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
439586b00e0dSAlex Elder 			       struct device_attribute *attr,
439686b00e0dSAlex Elder 			       char *buf)
439786b00e0dSAlex Elder {
439886b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4399ff96128fSIlya Dryomov 	ssize_t count = 0;
440086b00e0dSAlex Elder 
4401ff96128fSIlya Dryomov 	if (!rbd_dev->parent)
440286b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
440386b00e0dSAlex Elder 
4404ff96128fSIlya Dryomov 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4405ff96128fSIlya Dryomov 		struct rbd_spec *spec = rbd_dev->parent_spec;
440686b00e0dSAlex Elder 
4407ff96128fSIlya Dryomov 		count += sprintf(&buf[count], "%s"
4408ff96128fSIlya Dryomov 			    "pool_id %llu\npool_name %s\n"
4409e92c0eafSIlya Dryomov 			    "pool_ns %s\n"
4410ff96128fSIlya Dryomov 			    "image_id %s\nimage_name %s\n"
4411ff96128fSIlya Dryomov 			    "snap_id %llu\nsnap_name %s\n"
4412ff96128fSIlya Dryomov 			    "overlap %llu\n",
4413ff96128fSIlya Dryomov 			    !count ? "" : "\n", /* first? */
4414ff96128fSIlya Dryomov 			    spec->pool_id, spec->pool_name,
4415e92c0eafSIlya Dryomov 			    spec->pool_ns ?: "",
4416ff96128fSIlya Dryomov 			    spec->image_id, spec->image_name ?: "(unknown)",
4417ff96128fSIlya Dryomov 			    spec->snap_id, spec->snap_name,
4418ff96128fSIlya Dryomov 			    rbd_dev->parent_overlap);
4419ff96128fSIlya Dryomov 	}
442086b00e0dSAlex Elder 
442186b00e0dSAlex Elder 	return count;
442286b00e0dSAlex Elder }
442386b00e0dSAlex Elder 
4424dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
4425dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
4426dfc5606dSYehuda Sadeh 				 const char *buf,
4427dfc5606dSYehuda Sadeh 				 size_t size)
4428dfc5606dSYehuda Sadeh {
4429593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4430b813623aSAlex Elder 	int ret;
4431602adf40SYehuda Sadeh 
4432cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
4433e627db08SAlex Elder 	if (ret)
443452bb1f9bSIlya Dryomov 		return ret;
4435b813623aSAlex Elder 
443652bb1f9bSIlya Dryomov 	return size;
4437dfc5606dSYehuda Sadeh }
4438602adf40SYehuda Sadeh 
44395657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
44405657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
44415657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
44425657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
44435657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
44445657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
44455657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
44465657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
44475657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
44485657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
4449b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
44505657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
44515657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
44525657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
44535657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
44545657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
44555657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
4456dfc5606dSYehuda Sadeh 
4457dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
4458dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
445934b13184SAlex Elder 	&dev_attr_features.attr,
4460dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
4461dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
4462005a07bfSIlya Dryomov 	&dev_attr_client_addr.attr,
4463dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
4464267fb90bSMike Christie 	&dev_attr_cluster_fsid.attr,
44650d6d1e9cSMike Christie 	&dev_attr_config_info.attr,
4466dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
44679bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
4468b26c047bSIlya Dryomov 	&dev_attr_pool_ns.attr,
4469dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
4470589d30e0SAlex Elder 	&dev_attr_image_id.attr,
4471dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
447292a58671SMike Christie 	&dev_attr_snap_id.attr,
447386b00e0dSAlex Elder 	&dev_attr_parent.attr,
4474dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
4475dfc5606dSYehuda Sadeh 	NULL
4476dfc5606dSYehuda Sadeh };
4477dfc5606dSYehuda Sadeh 
4478dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
4479dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
4480dfc5606dSYehuda Sadeh };
4481dfc5606dSYehuda Sadeh 
4482dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
4483dfc5606dSYehuda Sadeh 	&rbd_attr_group,
4484dfc5606dSYehuda Sadeh 	NULL
4485dfc5606dSYehuda Sadeh };
4486dfc5606dSYehuda Sadeh 
44876cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
4488dfc5606dSYehuda Sadeh 
4489b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
4490dfc5606dSYehuda Sadeh 	.name		= "rbd",
4491dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
44926cac4695SIlya Dryomov 	.release	= rbd_dev_release,
4493dfc5606dSYehuda Sadeh };
4494dfc5606dSYehuda Sadeh 
44958b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
44968b8fb99cSAlex Elder {
44978b8fb99cSAlex Elder 	kref_get(&spec->kref);
44988b8fb99cSAlex Elder 
44998b8fb99cSAlex Elder 	return spec;
45008b8fb99cSAlex Elder }
45018b8fb99cSAlex Elder 
45028b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
45038b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
45048b8fb99cSAlex Elder {
45058b8fb99cSAlex Elder 	if (spec)
45068b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
45078b8fb99cSAlex Elder }
45088b8fb99cSAlex Elder 
45098b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
45108b8fb99cSAlex Elder {
45118b8fb99cSAlex Elder 	struct rbd_spec *spec;
45128b8fb99cSAlex Elder 
45138b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
45148b8fb99cSAlex Elder 	if (!spec)
45158b8fb99cSAlex Elder 		return NULL;
451604077599SIlya Dryomov 
451704077599SIlya Dryomov 	spec->pool_id = CEPH_NOPOOL;
451804077599SIlya Dryomov 	spec->snap_id = CEPH_NOSNAP;
45198b8fb99cSAlex Elder 	kref_init(&spec->kref);
45208b8fb99cSAlex Elder 
45218b8fb99cSAlex Elder 	return spec;
45228b8fb99cSAlex Elder }
45238b8fb99cSAlex Elder 
45248b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
45258b8fb99cSAlex Elder {
45268b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
45278b8fb99cSAlex Elder 
45288b8fb99cSAlex Elder 	kfree(spec->pool_name);
4529b26c047bSIlya Dryomov 	kfree(spec->pool_ns);
45308b8fb99cSAlex Elder 	kfree(spec->image_id);
45318b8fb99cSAlex Elder 	kfree(spec->image_name);
45328b8fb99cSAlex Elder 	kfree(spec->snap_name);
45338b8fb99cSAlex Elder 	kfree(spec);
45348b8fb99cSAlex Elder }
45358b8fb99cSAlex Elder 
45361643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
4537dd5ac32dSIlya Dryomov {
453899d16943SIlya Dryomov 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4539ed95b21aSIlya Dryomov 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4540dd5ac32dSIlya Dryomov 
4541c41d13a3SIlya Dryomov 	ceph_oid_destroy(&rbd_dev->header_oid);
45426b6dddbeSIlya Dryomov 	ceph_oloc_destroy(&rbd_dev->header_oloc);
45430d6d1e9cSMike Christie 	kfree(rbd_dev->config_info);
4544c41d13a3SIlya Dryomov 
4545dd5ac32dSIlya Dryomov 	rbd_put_client(rbd_dev->rbd_client);
4546dd5ac32dSIlya Dryomov 	rbd_spec_put(rbd_dev->spec);
4547dd5ac32dSIlya Dryomov 	kfree(rbd_dev->opts);
4548dd5ac32dSIlya Dryomov 	kfree(rbd_dev);
45491643dfa4SIlya Dryomov }
45501643dfa4SIlya Dryomov 
45511643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
45521643dfa4SIlya Dryomov {
45531643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
45541643dfa4SIlya Dryomov 	bool need_put = !!rbd_dev->opts;
45551643dfa4SIlya Dryomov 
45561643dfa4SIlya Dryomov 	if (need_put) {
45571643dfa4SIlya Dryomov 		destroy_workqueue(rbd_dev->task_wq);
45581643dfa4SIlya Dryomov 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
45591643dfa4SIlya Dryomov 	}
45601643dfa4SIlya Dryomov 
45611643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
4562dd5ac32dSIlya Dryomov 
4563dd5ac32dSIlya Dryomov 	/*
4564dd5ac32dSIlya Dryomov 	 * This is racy, but way better than putting module outside of
4565dd5ac32dSIlya Dryomov 	 * the release callback.  The race window is pretty small, so
4566dd5ac32dSIlya Dryomov 	 * doing something similar to dm (dm-builtin.c) is overkill.
4567dd5ac32dSIlya Dryomov 	 */
4568dd5ac32dSIlya Dryomov 	if (need_put)
4569dd5ac32dSIlya Dryomov 		module_put(THIS_MODULE);
4570dd5ac32dSIlya Dryomov }
4571dd5ac32dSIlya Dryomov 
45721643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
45731643dfa4SIlya Dryomov 					   struct rbd_spec *spec)
4574c53d5893SAlex Elder {
4575c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
4576c53d5893SAlex Elder 
4577c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4578c53d5893SAlex Elder 	if (!rbd_dev)
4579c53d5893SAlex Elder 		return NULL;
4580c53d5893SAlex Elder 
4581c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
4582c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
4583c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
4584c53d5893SAlex Elder 
45857e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4586c41d13a3SIlya Dryomov 	ceph_oid_init(&rbd_dev->header_oid);
4587431a02cdSIlya Dryomov 	rbd_dev->header_oloc.pool = spec->pool_id;
4588b26c047bSIlya Dryomov 	if (spec->pool_ns) {
4589b26c047bSIlya Dryomov 		WARN_ON(!*spec->pool_ns);
4590b26c047bSIlya Dryomov 		rbd_dev->header_oloc.pool_ns =
4591b26c047bSIlya Dryomov 		    ceph_find_or_create_string(spec->pool_ns,
4592b26c047bSIlya Dryomov 					       strlen(spec->pool_ns));
4593b26c047bSIlya Dryomov 	}
4594c41d13a3SIlya Dryomov 
459599d16943SIlya Dryomov 	mutex_init(&rbd_dev->watch_mutex);
459699d16943SIlya Dryomov 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
459799d16943SIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
459899d16943SIlya Dryomov 
4599ed95b21aSIlya Dryomov 	init_rwsem(&rbd_dev->lock_rwsem);
4600ed95b21aSIlya Dryomov 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4601ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4602ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4603ed95b21aSIlya Dryomov 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4604ed95b21aSIlya Dryomov 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4605ed95b21aSIlya Dryomov 	init_waitqueue_head(&rbd_dev->lock_waitq);
4606ed95b21aSIlya Dryomov 
4607dd5ac32dSIlya Dryomov 	rbd_dev->dev.bus = &rbd_bus_type;
4608dd5ac32dSIlya Dryomov 	rbd_dev->dev.type = &rbd_device_type;
4609dd5ac32dSIlya Dryomov 	rbd_dev->dev.parent = &rbd_root_dev;
4610dd5ac32dSIlya Dryomov 	device_initialize(&rbd_dev->dev);
4611dd5ac32dSIlya Dryomov 
4612c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
4613d147543dSIlya Dryomov 	rbd_dev->spec = spec;
46140903e875SAlex Elder 
46151643dfa4SIlya Dryomov 	return rbd_dev;
46161643dfa4SIlya Dryomov }
46171643dfa4SIlya Dryomov 
4618dd5ac32dSIlya Dryomov /*
46191643dfa4SIlya Dryomov  * Create a mapping rbd_dev.
4620dd5ac32dSIlya Dryomov  */
46211643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
46221643dfa4SIlya Dryomov 					 struct rbd_spec *spec,
46231643dfa4SIlya Dryomov 					 struct rbd_options *opts)
46241643dfa4SIlya Dryomov {
46251643dfa4SIlya Dryomov 	struct rbd_device *rbd_dev;
46261643dfa4SIlya Dryomov 
46271643dfa4SIlya Dryomov 	rbd_dev = __rbd_dev_create(rbdc, spec);
46281643dfa4SIlya Dryomov 	if (!rbd_dev)
46291643dfa4SIlya Dryomov 		return NULL;
46301643dfa4SIlya Dryomov 
46311643dfa4SIlya Dryomov 	rbd_dev->opts = opts;
46321643dfa4SIlya Dryomov 
46331643dfa4SIlya Dryomov 	/* get an id and fill in device name */
46341643dfa4SIlya Dryomov 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
46351643dfa4SIlya Dryomov 					 minor_to_rbd_dev_id(1 << MINORBITS),
46361643dfa4SIlya Dryomov 					 GFP_KERNEL);
46371643dfa4SIlya Dryomov 	if (rbd_dev->dev_id < 0)
46381643dfa4SIlya Dryomov 		goto fail_rbd_dev;
46391643dfa4SIlya Dryomov 
46401643dfa4SIlya Dryomov 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
46411643dfa4SIlya Dryomov 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
46421643dfa4SIlya Dryomov 						   rbd_dev->name);
46431643dfa4SIlya Dryomov 	if (!rbd_dev->task_wq)
46441643dfa4SIlya Dryomov 		goto fail_dev_id;
46451643dfa4SIlya Dryomov 
46461643dfa4SIlya Dryomov 	/* we have a ref from do_rbd_add() */
4647dd5ac32dSIlya Dryomov 	__module_get(THIS_MODULE);
4648dd5ac32dSIlya Dryomov 
46491643dfa4SIlya Dryomov 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4650c53d5893SAlex Elder 	return rbd_dev;
46511643dfa4SIlya Dryomov 
46521643dfa4SIlya Dryomov fail_dev_id:
46531643dfa4SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
46541643dfa4SIlya Dryomov fail_rbd_dev:
46551643dfa4SIlya Dryomov 	rbd_dev_free(rbd_dev);
46561643dfa4SIlya Dryomov 	return NULL;
4657c53d5893SAlex Elder }
4658c53d5893SAlex Elder 
4659c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4660c53d5893SAlex Elder {
4661dd5ac32dSIlya Dryomov 	if (rbd_dev)
4662dd5ac32dSIlya Dryomov 		put_device(&rbd_dev->dev);
4663c53d5893SAlex Elder }
4664c53d5893SAlex Elder 
4665dfc5606dSYehuda Sadeh /*
46669d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
46679d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
46689d475de5SAlex Elder  * image.
46699d475de5SAlex Elder  */
46709d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
46719d475de5SAlex Elder 				u8 *order, u64 *snap_size)
46729d475de5SAlex Elder {
46739d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
46749d475de5SAlex Elder 	int ret;
46759d475de5SAlex Elder 	struct {
46769d475de5SAlex Elder 		u8 order;
46779d475de5SAlex Elder 		__le64 size;
46789d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
46799d475de5SAlex Elder 
4680ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4681ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_size",
46824157976bSAlex Elder 				  &snapid, sizeof(snapid),
4683e2a58ee5SAlex Elder 				  &size_buf, sizeof(size_buf));
468436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
46859d475de5SAlex Elder 	if (ret < 0)
46869d475de5SAlex Elder 		return ret;
468757385b51SAlex Elder 	if (ret < sizeof (size_buf))
468857385b51SAlex Elder 		return -ERANGE;
46899d475de5SAlex Elder 
4690c3545579SJosh Durgin 	if (order) {
46919d475de5SAlex Elder 		*order = size_buf.order;
4692c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
4693c3545579SJosh Durgin 	}
46949d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
46959d475de5SAlex Elder 
4696c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
4697c3545579SJosh Durgin 		(unsigned long long)snap_id,
46989d475de5SAlex Elder 		(unsigned long long)*snap_size);
46999d475de5SAlex Elder 
47009d475de5SAlex Elder 	return 0;
47019d475de5SAlex Elder }
47029d475de5SAlex Elder 
47039d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
47049d475de5SAlex Elder {
47059d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
47069d475de5SAlex Elder 					&rbd_dev->header.obj_order,
47079d475de5SAlex Elder 					&rbd_dev->header.image_size);
47089d475de5SAlex Elder }
47099d475de5SAlex Elder 
47101e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
47111e130199SAlex Elder {
47121e130199SAlex Elder 	void *reply_buf;
47131e130199SAlex Elder 	int ret;
47141e130199SAlex Elder 	void *p;
47151e130199SAlex Elder 
47161e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
47171e130199SAlex Elder 	if (!reply_buf)
47181e130199SAlex Elder 		return -ENOMEM;
47191e130199SAlex Elder 
4720ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4721ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_object_prefix",
4722ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
472336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
47241e130199SAlex Elder 	if (ret < 0)
47251e130199SAlex Elder 		goto out;
47261e130199SAlex Elder 
47271e130199SAlex Elder 	p = reply_buf;
47281e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
472957385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
473057385b51SAlex Elder 	ret = 0;
47311e130199SAlex Elder 
47321e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
47331e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
47341e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
47351e130199SAlex Elder 	} else {
47361e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
47371e130199SAlex Elder 	}
47381e130199SAlex Elder out:
47391e130199SAlex Elder 	kfree(reply_buf);
47401e130199SAlex Elder 
47411e130199SAlex Elder 	return ret;
47421e130199SAlex Elder }
47431e130199SAlex Elder 
4744b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4745b1b5402aSAlex Elder 		u64 *snap_features)
4746b1b5402aSAlex Elder {
4747b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
4748b1b5402aSAlex Elder 	struct {
4749b1b5402aSAlex Elder 		__le64 features;
4750b1b5402aSAlex Elder 		__le64 incompat;
47514157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
4752d3767f0fSIlya Dryomov 	u64 unsup;
4753b1b5402aSAlex Elder 	int ret;
4754b1b5402aSAlex Elder 
4755ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4756ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_features",
47574157976bSAlex Elder 				  &snapid, sizeof(snapid),
4758e2a58ee5SAlex Elder 				  &features_buf, sizeof(features_buf));
475936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4760b1b5402aSAlex Elder 	if (ret < 0)
4761b1b5402aSAlex Elder 		return ret;
476257385b51SAlex Elder 	if (ret < sizeof (features_buf))
476357385b51SAlex Elder 		return -ERANGE;
4764d889140cSAlex Elder 
4765d3767f0fSIlya Dryomov 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4766d3767f0fSIlya Dryomov 	if (unsup) {
4767d3767f0fSIlya Dryomov 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4768d3767f0fSIlya Dryomov 			 unsup);
4769b8f5c6edSAlex Elder 		return -ENXIO;
4770d3767f0fSIlya Dryomov 	}
4771d889140cSAlex Elder 
4772b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
4773b1b5402aSAlex Elder 
4774b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4775b1b5402aSAlex Elder 		(unsigned long long)snap_id,
4776b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
4777b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
4778b1b5402aSAlex Elder 
4779b1b5402aSAlex Elder 	return 0;
4780b1b5402aSAlex Elder }
4781b1b5402aSAlex Elder 
4782b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4783b1b5402aSAlex Elder {
4784b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4785b1b5402aSAlex Elder 						&rbd_dev->header.features);
4786b1b5402aSAlex Elder }
4787b1b5402aSAlex Elder 
4788eb3b2d6bSIlya Dryomov struct parent_image_info {
4789eb3b2d6bSIlya Dryomov 	u64		pool_id;
4790e92c0eafSIlya Dryomov 	const char	*pool_ns;
4791eb3b2d6bSIlya Dryomov 	const char	*image_id;
4792eb3b2d6bSIlya Dryomov 	u64		snap_id;
4793eb3b2d6bSIlya Dryomov 
4794e92c0eafSIlya Dryomov 	bool		has_overlap;
4795eb3b2d6bSIlya Dryomov 	u64		overlap;
4796eb3b2d6bSIlya Dryomov };
4797eb3b2d6bSIlya Dryomov 
4798eb3b2d6bSIlya Dryomov /*
4799eb3b2d6bSIlya Dryomov  * The caller is responsible for @pii.
4800eb3b2d6bSIlya Dryomov  */
4801e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
4802e92c0eafSIlya Dryomov 				    struct parent_image_info *pii)
4803e92c0eafSIlya Dryomov {
4804e92c0eafSIlya Dryomov 	u8 struct_v;
4805e92c0eafSIlya Dryomov 	u32 struct_len;
4806e92c0eafSIlya Dryomov 	int ret;
4807e92c0eafSIlya Dryomov 
4808e92c0eafSIlya Dryomov 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4809e92c0eafSIlya Dryomov 				  &struct_v, &struct_len);
4810e92c0eafSIlya Dryomov 	if (ret)
4811e92c0eafSIlya Dryomov 		return ret;
4812e92c0eafSIlya Dryomov 
4813e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4814e92c0eafSIlya Dryomov 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4815e92c0eafSIlya Dryomov 	if (IS_ERR(pii->pool_ns)) {
4816e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->pool_ns);
4817e92c0eafSIlya Dryomov 		pii->pool_ns = NULL;
4818e92c0eafSIlya Dryomov 		return ret;
4819e92c0eafSIlya Dryomov 	}
4820e92c0eafSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4821e92c0eafSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
4822e92c0eafSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
4823e92c0eafSIlya Dryomov 		pii->image_id = NULL;
4824e92c0eafSIlya Dryomov 		return ret;
4825e92c0eafSIlya Dryomov 	}
4826e92c0eafSIlya Dryomov 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4827e92c0eafSIlya Dryomov 	return 0;
4828e92c0eafSIlya Dryomov 
4829e92c0eafSIlya Dryomov e_inval:
4830e92c0eafSIlya Dryomov 	return -EINVAL;
4831e92c0eafSIlya Dryomov }
4832e92c0eafSIlya Dryomov 
4833e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
4834e92c0eafSIlya Dryomov 			     struct page *req_page,
4835e92c0eafSIlya Dryomov 			     struct page *reply_page,
4836e92c0eafSIlya Dryomov 			     struct parent_image_info *pii)
4837e92c0eafSIlya Dryomov {
4838e92c0eafSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4839e92c0eafSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
4840e92c0eafSIlya Dryomov 	void *p, *end;
4841e92c0eafSIlya Dryomov 	int ret;
4842e92c0eafSIlya Dryomov 
4843e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4844e92c0eafSIlya Dryomov 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4845e92c0eafSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4846e92c0eafSIlya Dryomov 	if (ret)
4847e92c0eafSIlya Dryomov 		return ret == -EOPNOTSUPP ? 1 : ret;
4848e92c0eafSIlya Dryomov 
4849e92c0eafSIlya Dryomov 	p = page_address(reply_page);
4850e92c0eafSIlya Dryomov 	end = p + reply_len;
4851e92c0eafSIlya Dryomov 	ret = decode_parent_image_spec(&p, end, pii);
4852e92c0eafSIlya Dryomov 	if (ret)
4853e92c0eafSIlya Dryomov 		return ret;
4854e92c0eafSIlya Dryomov 
4855e92c0eafSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4856e92c0eafSIlya Dryomov 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4857e92c0eafSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4858e92c0eafSIlya Dryomov 	if (ret)
4859e92c0eafSIlya Dryomov 		return ret;
4860e92c0eafSIlya Dryomov 
4861e92c0eafSIlya Dryomov 	p = page_address(reply_page);
4862e92c0eafSIlya Dryomov 	end = p + reply_len;
4863e92c0eafSIlya Dryomov 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4864e92c0eafSIlya Dryomov 	if (pii->has_overlap)
4865e92c0eafSIlya Dryomov 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4866e92c0eafSIlya Dryomov 
4867e92c0eafSIlya Dryomov 	return 0;
4868e92c0eafSIlya Dryomov 
4869e92c0eafSIlya Dryomov e_inval:
4870e92c0eafSIlya Dryomov 	return -EINVAL;
4871e92c0eafSIlya Dryomov }
4872e92c0eafSIlya Dryomov 
4873e92c0eafSIlya Dryomov /*
4874e92c0eafSIlya Dryomov  * The caller is responsible for @pii.
4875e92c0eafSIlya Dryomov  */
4876eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4877eb3b2d6bSIlya Dryomov 				    struct page *req_page,
4878eb3b2d6bSIlya Dryomov 				    struct page *reply_page,
4879eb3b2d6bSIlya Dryomov 				    struct parent_image_info *pii)
4880eb3b2d6bSIlya Dryomov {
4881eb3b2d6bSIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4882eb3b2d6bSIlya Dryomov 	size_t reply_len = PAGE_SIZE;
4883eb3b2d6bSIlya Dryomov 	void *p, *end;
4884eb3b2d6bSIlya Dryomov 	int ret;
4885eb3b2d6bSIlya Dryomov 
4886eb3b2d6bSIlya Dryomov 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4887eb3b2d6bSIlya Dryomov 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4888eb3b2d6bSIlya Dryomov 			     req_page, sizeof(u64), reply_page, &reply_len);
4889eb3b2d6bSIlya Dryomov 	if (ret)
4890eb3b2d6bSIlya Dryomov 		return ret;
4891eb3b2d6bSIlya Dryomov 
4892eb3b2d6bSIlya Dryomov 	p = page_address(reply_page);
4893eb3b2d6bSIlya Dryomov 	end = p + reply_len;
4894eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4895eb3b2d6bSIlya Dryomov 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4896eb3b2d6bSIlya Dryomov 	if (IS_ERR(pii->image_id)) {
4897eb3b2d6bSIlya Dryomov 		ret = PTR_ERR(pii->image_id);
4898eb3b2d6bSIlya Dryomov 		pii->image_id = NULL;
4899eb3b2d6bSIlya Dryomov 		return ret;
4900eb3b2d6bSIlya Dryomov 	}
4901eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
4902e92c0eafSIlya Dryomov 	pii->has_overlap = true;
4903eb3b2d6bSIlya Dryomov 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4904eb3b2d6bSIlya Dryomov 
4905eb3b2d6bSIlya Dryomov 	return 0;
4906eb3b2d6bSIlya Dryomov 
4907eb3b2d6bSIlya Dryomov e_inval:
4908eb3b2d6bSIlya Dryomov 	return -EINVAL;
4909eb3b2d6bSIlya Dryomov }
4910eb3b2d6bSIlya Dryomov 
4911eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev,
4912eb3b2d6bSIlya Dryomov 			   struct parent_image_info *pii)
4913eb3b2d6bSIlya Dryomov {
4914eb3b2d6bSIlya Dryomov 	struct page *req_page, *reply_page;
4915eb3b2d6bSIlya Dryomov 	void *p;
4916eb3b2d6bSIlya Dryomov 	int ret;
4917eb3b2d6bSIlya Dryomov 
4918eb3b2d6bSIlya Dryomov 	req_page = alloc_page(GFP_KERNEL);
4919eb3b2d6bSIlya Dryomov 	if (!req_page)
4920eb3b2d6bSIlya Dryomov 		return -ENOMEM;
4921eb3b2d6bSIlya Dryomov 
4922eb3b2d6bSIlya Dryomov 	reply_page = alloc_page(GFP_KERNEL);
4923eb3b2d6bSIlya Dryomov 	if (!reply_page) {
4924eb3b2d6bSIlya Dryomov 		__free_page(req_page);
4925eb3b2d6bSIlya Dryomov 		return -ENOMEM;
4926eb3b2d6bSIlya Dryomov 	}
4927eb3b2d6bSIlya Dryomov 
4928eb3b2d6bSIlya Dryomov 	p = page_address(req_page);
4929eb3b2d6bSIlya Dryomov 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
4930e92c0eafSIlya Dryomov 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
4931e92c0eafSIlya Dryomov 	if (ret > 0)
4932e92c0eafSIlya Dryomov 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
4933e92c0eafSIlya Dryomov 					       pii);
4934eb3b2d6bSIlya Dryomov 
4935eb3b2d6bSIlya Dryomov 	__free_page(req_page);
4936eb3b2d6bSIlya Dryomov 	__free_page(reply_page);
4937eb3b2d6bSIlya Dryomov 	return ret;
4938eb3b2d6bSIlya Dryomov }
4939eb3b2d6bSIlya Dryomov 
494086b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
494186b00e0dSAlex Elder {
494286b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
4943eb3b2d6bSIlya Dryomov 	struct parent_image_info pii = { 0 };
494486b00e0dSAlex Elder 	int ret;
494586b00e0dSAlex Elder 
494686b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
494786b00e0dSAlex Elder 	if (!parent_spec)
494886b00e0dSAlex Elder 		return -ENOMEM;
494986b00e0dSAlex Elder 
4950eb3b2d6bSIlya Dryomov 	ret = get_parent_info(rbd_dev, &pii);
4951eb3b2d6bSIlya Dryomov 	if (ret)
495286b00e0dSAlex Elder 		goto out_err;
495386b00e0dSAlex Elder 
4954e92c0eafSIlya Dryomov 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
4955e92c0eafSIlya Dryomov 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
4956e92c0eafSIlya Dryomov 	     pii.has_overlap, pii.overlap);
4957eb3b2d6bSIlya Dryomov 
4958e92c0eafSIlya Dryomov 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
4959392a9dadSAlex Elder 		/*
4960392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4961392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4962392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4963392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4964392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4965392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4966392a9dadSAlex Elder 		 * parent.
4967e92c0eafSIlya Dryomov 		 *
4968e92c0eafSIlya Dryomov 		 * If !pii.has_overlap, the parent image spec is not
4969e92c0eafSIlya Dryomov 		 * applicable.  It's there to avoid duplication in each
4970e92c0eafSIlya Dryomov 		 * snapshot record.
4971392a9dadSAlex Elder 		 */
4972392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4973392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4974392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4975392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4976392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4977392a9dadSAlex Elder 		}
4978392a9dadSAlex Elder 
497986b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4980392a9dadSAlex Elder 	}
498186b00e0dSAlex Elder 
49820903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
49830903e875SAlex Elder 
49840903e875SAlex Elder 	ret = -EIO;
4985eb3b2d6bSIlya Dryomov 	if (pii.pool_id > (u64)U32_MAX) {
49869584d508SIlya Dryomov 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4987eb3b2d6bSIlya Dryomov 			(unsigned long long)pii.pool_id, U32_MAX);
498857385b51SAlex Elder 		goto out_err;
4989c0cd10dbSAlex Elder 	}
49900903e875SAlex Elder 
49913b5cf2a2SAlex Elder 	/*
49923b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
49933b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
49943b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
49953b5cf2a2SAlex Elder 	 */
49963b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
4997eb3b2d6bSIlya Dryomov 		parent_spec->pool_id = pii.pool_id;
4998e92c0eafSIlya Dryomov 		if (pii.pool_ns && *pii.pool_ns) {
4999e92c0eafSIlya Dryomov 			parent_spec->pool_ns = pii.pool_ns;
5000e92c0eafSIlya Dryomov 			pii.pool_ns = NULL;
5001e92c0eafSIlya Dryomov 		}
5002eb3b2d6bSIlya Dryomov 		parent_spec->image_id = pii.image_id;
5003eb3b2d6bSIlya Dryomov 		pii.image_id = NULL;
5004eb3b2d6bSIlya Dryomov 		parent_spec->snap_id = pii.snap_id;
5005b26c047bSIlya Dryomov 
500686b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
500786b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
50083b5cf2a2SAlex Elder 	}
50093b5cf2a2SAlex Elder 
50103b5cf2a2SAlex Elder 	/*
5011cf32bd9cSIlya Dryomov 	 * We always update the parent overlap.  If it's zero we issue
5012cf32bd9cSIlya Dryomov 	 * a warning, as we will proceed as if there was no parent.
50133b5cf2a2SAlex Elder 	 */
5014eb3b2d6bSIlya Dryomov 	if (!pii.overlap) {
50153b5cf2a2SAlex Elder 		if (parent_spec) {
5016cf32bd9cSIlya Dryomov 			/* refresh, careful to warn just once */
5017cf32bd9cSIlya Dryomov 			if (rbd_dev->parent_overlap)
5018cf32bd9cSIlya Dryomov 				rbd_warn(rbd_dev,
5019cf32bd9cSIlya Dryomov 				    "clone now standalone (overlap became 0)");
502070cf49cfSAlex Elder 		} else {
5021cf32bd9cSIlya Dryomov 			/* initial probe */
5022cf32bd9cSIlya Dryomov 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
50233b5cf2a2SAlex Elder 		}
502470cf49cfSAlex Elder 	}
5025eb3b2d6bSIlya Dryomov 	rbd_dev->parent_overlap = pii.overlap;
5026cf32bd9cSIlya Dryomov 
502786b00e0dSAlex Elder out:
502886b00e0dSAlex Elder 	ret = 0;
502986b00e0dSAlex Elder out_err:
5030e92c0eafSIlya Dryomov 	kfree(pii.pool_ns);
5031eb3b2d6bSIlya Dryomov 	kfree(pii.image_id);
503286b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
503386b00e0dSAlex Elder 	return ret;
503486b00e0dSAlex Elder }
503586b00e0dSAlex Elder 
5036cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5037cc070d59SAlex Elder {
5038cc070d59SAlex Elder 	struct {
5039cc070d59SAlex Elder 		__le64 stripe_unit;
5040cc070d59SAlex Elder 		__le64 stripe_count;
5041cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5042cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
5043cc070d59SAlex Elder 	void *p;
5044cc070d59SAlex Elder 	int ret;
5045cc070d59SAlex Elder 
5046ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5047ecd4a68aSIlya Dryomov 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5048ecd4a68aSIlya Dryomov 				NULL, 0, &striping_info_buf, size);
5049cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5050cc070d59SAlex Elder 	if (ret < 0)
5051cc070d59SAlex Elder 		return ret;
5052cc070d59SAlex Elder 	if (ret < size)
5053cc070d59SAlex Elder 		return -ERANGE;
5054cc070d59SAlex Elder 
5055cc070d59SAlex Elder 	p = &striping_info_buf;
5056b1331852SIlya Dryomov 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5057b1331852SIlya Dryomov 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
5058cc070d59SAlex Elder 	return 0;
5059cc070d59SAlex Elder }
5060cc070d59SAlex Elder 
50617e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
50627e97332eSIlya Dryomov {
50637e97332eSIlya Dryomov 	__le64 data_pool_id;
50647e97332eSIlya Dryomov 	int ret;
50657e97332eSIlya Dryomov 
50667e97332eSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
50677e97332eSIlya Dryomov 				  &rbd_dev->header_oloc, "get_data_pool",
50687e97332eSIlya Dryomov 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
50697e97332eSIlya Dryomov 	if (ret < 0)
50707e97332eSIlya Dryomov 		return ret;
50717e97332eSIlya Dryomov 	if (ret < sizeof(data_pool_id))
50727e97332eSIlya Dryomov 		return -EBADMSG;
50737e97332eSIlya Dryomov 
50747e97332eSIlya Dryomov 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
50757e97332eSIlya Dryomov 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
50767e97332eSIlya Dryomov 	return 0;
50777e97332eSIlya Dryomov }
50787e97332eSIlya Dryomov 
50799e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
50809e15b77dSAlex Elder {
5081ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
50829e15b77dSAlex Elder 	size_t image_id_size;
50839e15b77dSAlex Elder 	char *image_id;
50849e15b77dSAlex Elder 	void *p;
50859e15b77dSAlex Elder 	void *end;
50869e15b77dSAlex Elder 	size_t size;
50879e15b77dSAlex Elder 	void *reply_buf = NULL;
50889e15b77dSAlex Elder 	size_t len = 0;
50899e15b77dSAlex Elder 	char *image_name = NULL;
50909e15b77dSAlex Elder 	int ret;
50919e15b77dSAlex Elder 
50929e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
50939e15b77dSAlex Elder 
509469e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
509569e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
50969e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
50979e15b77dSAlex Elder 	if (!image_id)
50989e15b77dSAlex Elder 		return NULL;
50999e15b77dSAlex Elder 
51009e15b77dSAlex Elder 	p = image_id;
51014157976bSAlex Elder 	end = image_id + image_id_size;
510269e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
51039e15b77dSAlex Elder 
51049e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
51059e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
51069e15b77dSAlex Elder 	if (!reply_buf)
51079e15b77dSAlex Elder 		goto out;
51089e15b77dSAlex Elder 
5109ecd4a68aSIlya Dryomov 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5110ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5111ecd4a68aSIlya Dryomov 				  "dir_get_name", image_id, image_id_size,
5112e2a58ee5SAlex Elder 				  reply_buf, size);
51139e15b77dSAlex Elder 	if (ret < 0)
51149e15b77dSAlex Elder 		goto out;
51159e15b77dSAlex Elder 	p = reply_buf;
5116f40eb349SAlex Elder 	end = reply_buf + ret;
5117f40eb349SAlex Elder 
51189e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
51199e15b77dSAlex Elder 	if (IS_ERR(image_name))
51209e15b77dSAlex Elder 		image_name = NULL;
51219e15b77dSAlex Elder 	else
51229e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
51239e15b77dSAlex Elder out:
51249e15b77dSAlex Elder 	kfree(reply_buf);
51259e15b77dSAlex Elder 	kfree(image_id);
51269e15b77dSAlex Elder 
51279e15b77dSAlex Elder 	return image_name;
51289e15b77dSAlex Elder }
51299e15b77dSAlex Elder 
51302ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51312ad3d716SAlex Elder {
51322ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51332ad3d716SAlex Elder 	const char *snap_name;
51342ad3d716SAlex Elder 	u32 which = 0;
51352ad3d716SAlex Elder 
51362ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
51372ad3d716SAlex Elder 
51382ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
51392ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
51402ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
51412ad3d716SAlex Elder 			return snapc->snaps[which];
51422ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
51432ad3d716SAlex Elder 		which++;
51442ad3d716SAlex Elder 	}
51452ad3d716SAlex Elder 	return CEPH_NOSNAP;
51462ad3d716SAlex Elder }
51472ad3d716SAlex Elder 
51482ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51492ad3d716SAlex Elder {
51502ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
51512ad3d716SAlex Elder 	u32 which;
51522ad3d716SAlex Elder 	bool found = false;
51532ad3d716SAlex Elder 	u64 snap_id;
51542ad3d716SAlex Elder 
51552ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
51562ad3d716SAlex Elder 		const char *snap_name;
51572ad3d716SAlex Elder 
51582ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
51592ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5160efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
5161efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
5162efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
5163efadc98aSJosh Durgin 				continue;
5164efadc98aSJosh Durgin 			else
51652ad3d716SAlex Elder 				break;
5166efadc98aSJosh Durgin 		}
51672ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
51682ad3d716SAlex Elder 		kfree(snap_name);
51692ad3d716SAlex Elder 	}
51702ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
51712ad3d716SAlex Elder }
51722ad3d716SAlex Elder 
51732ad3d716SAlex Elder /*
51742ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
51752ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
51762ad3d716SAlex Elder  */
51772ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
51782ad3d716SAlex Elder {
51792ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
51802ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
51812ad3d716SAlex Elder 
51822ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
51832ad3d716SAlex Elder }
51842ad3d716SAlex Elder 
51859e15b77dSAlex Elder /*
518604077599SIlya Dryomov  * An image being mapped will have everything but the snap id.
51879e15b77dSAlex Elder  */
518804077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
518904077599SIlya Dryomov {
519004077599SIlya Dryomov 	struct rbd_spec *spec = rbd_dev->spec;
519104077599SIlya Dryomov 
519204077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
519304077599SIlya Dryomov 	rbd_assert(spec->image_id && spec->image_name);
519404077599SIlya Dryomov 	rbd_assert(spec->snap_name);
519504077599SIlya Dryomov 
519604077599SIlya Dryomov 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
519704077599SIlya Dryomov 		u64 snap_id;
519804077599SIlya Dryomov 
519904077599SIlya Dryomov 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
520004077599SIlya Dryomov 		if (snap_id == CEPH_NOSNAP)
520104077599SIlya Dryomov 			return -ENOENT;
520204077599SIlya Dryomov 
520304077599SIlya Dryomov 		spec->snap_id = snap_id;
520404077599SIlya Dryomov 	} else {
520504077599SIlya Dryomov 		spec->snap_id = CEPH_NOSNAP;
520604077599SIlya Dryomov 	}
520704077599SIlya Dryomov 
520804077599SIlya Dryomov 	return 0;
520904077599SIlya Dryomov }
521004077599SIlya Dryomov 
521104077599SIlya Dryomov /*
521204077599SIlya Dryomov  * A parent image will have all ids but none of the names.
521304077599SIlya Dryomov  *
521404077599SIlya Dryomov  * All names in an rbd spec are dynamically allocated.  It's OK if we
521504077599SIlya Dryomov  * can't figure out the name for an image id.
521604077599SIlya Dryomov  */
521704077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
52189e15b77dSAlex Elder {
52192e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
52202e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
52212e9f7f1cSAlex Elder 	const char *pool_name;
52222e9f7f1cSAlex Elder 	const char *image_name;
52232e9f7f1cSAlex Elder 	const char *snap_name;
52249e15b77dSAlex Elder 	int ret;
52259e15b77dSAlex Elder 
522604077599SIlya Dryomov 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
522704077599SIlya Dryomov 	rbd_assert(spec->image_id);
522804077599SIlya Dryomov 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
52299e15b77dSAlex Elder 
52302e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
52319e15b77dSAlex Elder 
52322e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
52332e9f7f1cSAlex Elder 	if (!pool_name) {
52342e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
5235935dc89fSAlex Elder 		return -EIO;
5236935dc89fSAlex Elder 	}
52372e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
52382e9f7f1cSAlex Elder 	if (!pool_name)
52399e15b77dSAlex Elder 		return -ENOMEM;
52409e15b77dSAlex Elder 
52419e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
52429e15b77dSAlex Elder 
52432e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
52442e9f7f1cSAlex Elder 	if (!image_name)
524506ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
52469e15b77dSAlex Elder 
524704077599SIlya Dryomov 	/* Fetch the snapshot name */
52489e15b77dSAlex Elder 
52492e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
5250da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
5251da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
52529e15b77dSAlex Elder 		goto out_err;
52532e9f7f1cSAlex Elder 	}
52542e9f7f1cSAlex Elder 
52552e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
52562e9f7f1cSAlex Elder 	spec->image_name = image_name;
52572e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
52589e15b77dSAlex Elder 
52599e15b77dSAlex Elder 	return 0;
526004077599SIlya Dryomov 
52619e15b77dSAlex Elder out_err:
52622e9f7f1cSAlex Elder 	kfree(image_name);
52632e9f7f1cSAlex Elder 	kfree(pool_name);
52649e15b77dSAlex Elder 	return ret;
52659e15b77dSAlex Elder }
52669e15b77dSAlex Elder 
5267cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
526835d489f9SAlex Elder {
526935d489f9SAlex Elder 	size_t size;
527035d489f9SAlex Elder 	int ret;
527135d489f9SAlex Elder 	void *reply_buf;
527235d489f9SAlex Elder 	void *p;
527335d489f9SAlex Elder 	void *end;
527435d489f9SAlex Elder 	u64 seq;
527535d489f9SAlex Elder 	u32 snap_count;
527635d489f9SAlex Elder 	struct ceph_snap_context *snapc;
527735d489f9SAlex Elder 	u32 i;
527835d489f9SAlex Elder 
527935d489f9SAlex Elder 	/*
528035d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
528135d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
528235d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
528335d489f9SAlex Elder 	 * prepared to receive.
528435d489f9SAlex Elder 	 */
528535d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
528635d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
528735d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
528835d489f9SAlex Elder 	if (!reply_buf)
528935d489f9SAlex Elder 		return -ENOMEM;
529035d489f9SAlex Elder 
5291ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5292ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapcontext",
5293ecd4a68aSIlya Dryomov 				  NULL, 0, reply_buf, size);
529436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
529535d489f9SAlex Elder 	if (ret < 0)
529635d489f9SAlex Elder 		goto out;
529735d489f9SAlex Elder 
529835d489f9SAlex Elder 	p = reply_buf;
529957385b51SAlex Elder 	end = reply_buf + ret;
530057385b51SAlex Elder 	ret = -ERANGE;
530135d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
530235d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
530335d489f9SAlex Elder 
530435d489f9SAlex Elder 	/*
530535d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
530635d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
530735d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
530835d489f9SAlex Elder 	 * allocate is representable in a size_t.
530935d489f9SAlex Elder 	 */
531035d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
531135d489f9SAlex Elder 				 / sizeof (u64)) {
531235d489f9SAlex Elder 		ret = -EINVAL;
531335d489f9SAlex Elder 		goto out;
531435d489f9SAlex Elder 	}
531535d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
531635d489f9SAlex Elder 		goto out;
5317468521c1SAlex Elder 	ret = 0;
531835d489f9SAlex Elder 
5319812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
532035d489f9SAlex Elder 	if (!snapc) {
532135d489f9SAlex Elder 		ret = -ENOMEM;
532235d489f9SAlex Elder 		goto out;
532335d489f9SAlex Elder 	}
532435d489f9SAlex Elder 	snapc->seq = seq;
532535d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
532635d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
532735d489f9SAlex Elder 
532849ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
532935d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
533035d489f9SAlex Elder 
533135d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
533235d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
533335d489f9SAlex Elder out:
533435d489f9SAlex Elder 	kfree(reply_buf);
533535d489f9SAlex Elder 
533657385b51SAlex Elder 	return ret;
533735d489f9SAlex Elder }
533835d489f9SAlex Elder 
533954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
534054cac61fSAlex Elder 					u64 snap_id)
5341b8b1e2dbSAlex Elder {
5342b8b1e2dbSAlex Elder 	size_t size;
5343b8b1e2dbSAlex Elder 	void *reply_buf;
534454cac61fSAlex Elder 	__le64 snapid;
5345b8b1e2dbSAlex Elder 	int ret;
5346b8b1e2dbSAlex Elder 	void *p;
5347b8b1e2dbSAlex Elder 	void *end;
5348b8b1e2dbSAlex Elder 	char *snap_name;
5349b8b1e2dbSAlex Elder 
5350b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5351b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
5352b8b1e2dbSAlex Elder 	if (!reply_buf)
5353b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
5354b8b1e2dbSAlex Elder 
535554cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
5356ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5357ecd4a68aSIlya Dryomov 				  &rbd_dev->header_oloc, "get_snapshot_name",
5358ecd4a68aSIlya Dryomov 				  &snapid, sizeof(snapid), reply_buf, size);
535936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5360f40eb349SAlex Elder 	if (ret < 0) {
5361f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
5362b8b1e2dbSAlex Elder 		goto out;
5363f40eb349SAlex Elder 	}
5364b8b1e2dbSAlex Elder 
5365b8b1e2dbSAlex Elder 	p = reply_buf;
5366f40eb349SAlex Elder 	end = reply_buf + ret;
5367e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5368f40eb349SAlex Elder 	if (IS_ERR(snap_name))
5369b8b1e2dbSAlex Elder 		goto out;
5370f40eb349SAlex Elder 
5371b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
537254cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
5373b8b1e2dbSAlex Elder out:
5374b8b1e2dbSAlex Elder 	kfree(reply_buf);
5375b8b1e2dbSAlex Elder 
5376f40eb349SAlex Elder 	return snap_name;
5377b8b1e2dbSAlex Elder }
5378b8b1e2dbSAlex Elder 
53792df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5380117973fbSAlex Elder {
53812df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
5382117973fbSAlex Elder 	int ret;
5383117973fbSAlex Elder 
53841617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
53851617e40cSJosh Durgin 	if (ret)
5386cfbf6377SAlex Elder 		return ret;
53871617e40cSJosh Durgin 
53882df3fac7SAlex Elder 	if (first_time) {
53892df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
53902df3fac7SAlex Elder 		if (ret)
5391cfbf6377SAlex Elder 			return ret;
53922df3fac7SAlex Elder 	}
53932df3fac7SAlex Elder 
5394cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
5395d194cd1dSIlya Dryomov 	if (ret && first_time) {
5396d194cd1dSIlya Dryomov 		kfree(rbd_dev->header.object_prefix);
5397d194cd1dSIlya Dryomov 		rbd_dev->header.object_prefix = NULL;
5398d194cd1dSIlya Dryomov 	}
5399117973fbSAlex Elder 
5400117973fbSAlex Elder 	return ret;
5401117973fbSAlex Elder }
5402117973fbSAlex Elder 
5403a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5404a720ae09SIlya Dryomov {
5405a720ae09SIlya Dryomov 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5406a720ae09SIlya Dryomov 
5407a720ae09SIlya Dryomov 	if (rbd_dev->image_format == 1)
5408a720ae09SIlya Dryomov 		return rbd_dev_v1_header_info(rbd_dev);
5409a720ae09SIlya Dryomov 
5410a720ae09SIlya Dryomov 	return rbd_dev_v2_header_info(rbd_dev);
5411a720ae09SIlya Dryomov }
5412a720ae09SIlya Dryomov 
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none), and returns the length of the token - the run of
 * non-white space characters - that starts there.  Returns 0 when no
 * token is left.  Note that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *whitespace = " \f\n\r\t\v";
	const char *token = *buf + strspn(*buf, whitespace);

	*buf = token;			/* Start of token */

	return strcspn(token, whitespace);	/* Token length */
}
5431e28fff26SAlex Elder 
5432e28fff26SAlex Elder /*
5433ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
5434ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
5435ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
5436ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
5437ea3352f4SAlex Elder  *
5438ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
5439ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
5440ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
5441ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
5442ea3352f4SAlex Elder  *
5443ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
5444ea3352f4SAlex Elder  * the end of the found token.
5445ea3352f4SAlex Elder  *
5446ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
5447ea3352f4SAlex Elder  */
5448ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
5449ea3352f4SAlex Elder {
5450ea3352f4SAlex Elder 	char *dup;
5451ea3352f4SAlex Elder 	size_t len;
5452ea3352f4SAlex Elder 
5453ea3352f4SAlex Elder 	len = next_token(buf);
54544caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5455ea3352f4SAlex Elder 	if (!dup)
5456ea3352f4SAlex Elder 		return NULL;
5457ea3352f4SAlex Elder 	*(dup + len) = '\0';
5458ea3352f4SAlex Elder 	*buf += len;
5459ea3352f4SAlex Elder 
5460ea3352f4SAlex Elder 	if (lenp)
5461ea3352f4SAlex Elder 		*lenp = len;
5462ea3352f4SAlex Elder 
5463ea3352f4SAlex Elder 	return dup;
5464ea3352f4SAlex Elder }
5465ea3352f4SAlex Elder 
5466ea3352f4SAlex Elder /*
5467859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
5468859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
5469859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
5470859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
5471d22f76e7SAlex Elder  *
5472859c31dfSAlex Elder  * The information extracted from these options is recorded in
5473859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
5474859c31dfSAlex Elder  * structures:
5475859c31dfSAlex Elder  *  ceph_opts
5476859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
5477859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
5478859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
5479859c31dfSAlex Elder  *  rbd_opts
5480859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
5481859c31dfSAlex Elder  *	this function; caller must release with kfree().
5482859c31dfSAlex Elder  *  spec
5483859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
5484859c31dfSAlex Elder  *	initialized by this function based on parsed options.
5485859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
5486859c31dfSAlex Elder  *
5487859c31dfSAlex Elder  * The options passed take this form:
5488859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5489859c31dfSAlex Elder  * where:
5490859c31dfSAlex Elder  *  <mon_addrs>
5491859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
5492859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
5493859c31dfSAlex Elder  *      by a port number (separated by a colon).
5494859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
5495859c31dfSAlex Elder  *  <options>
5496859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
5497859c31dfSAlex Elder  *  <pool_name>
5498859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
5499859c31dfSAlex Elder  *  <image_name>
5500859c31dfSAlex Elder  *      The name of the image in that pool to map.
5501859c31dfSAlex Elder  *  <snap_id>
5502859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
5503859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
5504859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
5505859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
5506a725f65eSAlex Elder  */
5507859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
5508dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
5509859c31dfSAlex Elder 				struct rbd_options **opts,
5510859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
5511a725f65eSAlex Elder {
5512e28fff26SAlex Elder 	size_t len;
5513859c31dfSAlex Elder 	char *options;
55140ddebc0cSAlex Elder 	const char *mon_addrs;
5515ecb4dc22SAlex Elder 	char *snap_name;
55160ddebc0cSAlex Elder 	size_t mon_addrs_size;
5517c300156bSIlya Dryomov 	struct parse_rbd_opts_ctx pctx = { 0 };
5518859c31dfSAlex Elder 	struct ceph_options *copts;
5519dc79b113SAlex Elder 	int ret;
5520e28fff26SAlex Elder 
5521e28fff26SAlex Elder 	/* The first four tokens are required */
5522e28fff26SAlex Elder 
55237ef3214aSAlex Elder 	len = next_token(&buf);
55244fb5d671SAlex Elder 	if (!len) {
55254fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
55264fb5d671SAlex Elder 		return -EINVAL;
55274fb5d671SAlex Elder 	}
55280ddebc0cSAlex Elder 	mon_addrs = buf;
5529f28e565aSAlex Elder 	mon_addrs_size = len + 1;
55307ef3214aSAlex Elder 	buf += len;
5531a725f65eSAlex Elder 
5532dc79b113SAlex Elder 	ret = -EINVAL;
5533f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
5534f28e565aSAlex Elder 	if (!options)
5535dc79b113SAlex Elder 		return -ENOMEM;
55364fb5d671SAlex Elder 	if (!*options) {
55374fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
55384fb5d671SAlex Elder 		goto out_err;
55394fb5d671SAlex Elder 	}
5540a725f65eSAlex Elder 
5541c300156bSIlya Dryomov 	pctx.spec = rbd_spec_alloc();
5542c300156bSIlya Dryomov 	if (!pctx.spec)
5543f28e565aSAlex Elder 		goto out_mem;
5544859c31dfSAlex Elder 
5545c300156bSIlya Dryomov 	pctx.spec->pool_name = dup_token(&buf, NULL);
5546c300156bSIlya Dryomov 	if (!pctx.spec->pool_name)
5547859c31dfSAlex Elder 		goto out_mem;
5548c300156bSIlya Dryomov 	if (!*pctx.spec->pool_name) {
55494fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
55504fb5d671SAlex Elder 		goto out_err;
55514fb5d671SAlex Elder 	}
5552e28fff26SAlex Elder 
5553c300156bSIlya Dryomov 	pctx.spec->image_name = dup_token(&buf, NULL);
5554c300156bSIlya Dryomov 	if (!pctx.spec->image_name)
5555f28e565aSAlex Elder 		goto out_mem;
5556c300156bSIlya Dryomov 	if (!*pctx.spec->image_name) {
55574fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
55584fb5d671SAlex Elder 		goto out_err;
55594fb5d671SAlex Elder 	}
5560e28fff26SAlex Elder 
5561f28e565aSAlex Elder 	/*
5562f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
5563f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
5564f28e565aSAlex Elder 	 */
55653feeb894SAlex Elder 	len = next_token(&buf);
5566820a5f3eSAlex Elder 	if (!len) {
55673feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
55683feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5569f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
5570dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
5571f28e565aSAlex Elder 		goto out_err;
5572849b4260SAlex Elder 	}
5573ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5574ecb4dc22SAlex Elder 	if (!snap_name)
5575f28e565aSAlex Elder 		goto out_mem;
5576ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
5577c300156bSIlya Dryomov 	pctx.spec->snap_name = snap_name;
5578e5c35534SAlex Elder 
55790ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
5580e28fff26SAlex Elder 
5581c300156bSIlya Dryomov 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5582c300156bSIlya Dryomov 	if (!pctx.opts)
55834e9afebaSAlex Elder 		goto out_mem;
55844e9afebaSAlex Elder 
5585c300156bSIlya Dryomov 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5586c300156bSIlya Dryomov 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
55870c93e1b7SIlya Dryomov 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
5588c300156bSIlya Dryomov 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5589c300156bSIlya Dryomov 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5590c300156bSIlya Dryomov 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5591c300156bSIlya Dryomov 	pctx.opts->trim = RBD_TRIM_DEFAULT;
5592d22f76e7SAlex Elder 
5593859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
55940ddebc0cSAlex Elder 				   mon_addrs + mon_addrs_size - 1,
5595c300156bSIlya Dryomov 				   parse_rbd_opts_token, &pctx);
5596859c31dfSAlex Elder 	if (IS_ERR(copts)) {
5597859c31dfSAlex Elder 		ret = PTR_ERR(copts);
5598dc79b113SAlex Elder 		goto out_err;
5599dc79b113SAlex Elder 	}
5600859c31dfSAlex Elder 	kfree(options);
5601859c31dfSAlex Elder 
5602859c31dfSAlex Elder 	*ceph_opts = copts;
5603c300156bSIlya Dryomov 	*opts = pctx.opts;
5604c300156bSIlya Dryomov 	*rbd_spec = pctx.spec;
56050ddebc0cSAlex Elder 
5606dc79b113SAlex Elder 	return 0;
5607f28e565aSAlex Elder out_mem:
5608dc79b113SAlex Elder 	ret = -ENOMEM;
5609d22f76e7SAlex Elder out_err:
5610c300156bSIlya Dryomov 	kfree(pctx.opts);
5611c300156bSIlya Dryomov 	rbd_spec_put(pctx.spec);
5612f28e565aSAlex Elder 	kfree(options);
5613d22f76e7SAlex Elder 
5614dc79b113SAlex Elder 	return ret;
5615a725f65eSAlex Elder }
5616a725f65eSAlex Elder 
5617e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5618e010dd0aSIlya Dryomov {
5619e010dd0aSIlya Dryomov 	down_write(&rbd_dev->lock_rwsem);
5620e010dd0aSIlya Dryomov 	if (__rbd_is_lock_owner(rbd_dev))
5621e010dd0aSIlya Dryomov 		rbd_unlock(rbd_dev);
5622e010dd0aSIlya Dryomov 	up_write(&rbd_dev->lock_rwsem);
5623e010dd0aSIlya Dryomov }
5624e010dd0aSIlya Dryomov 
5625e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5626e010dd0aSIlya Dryomov {
56272f18d466SIlya Dryomov 	int ret;
56282f18d466SIlya Dryomov 
5629e010dd0aSIlya Dryomov 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5630e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5631e010dd0aSIlya Dryomov 		return -EINVAL;
5632e010dd0aSIlya Dryomov 	}
5633e010dd0aSIlya Dryomov 
5634e010dd0aSIlya Dryomov 	/* FIXME: "rbd map --exclusive" should be in interruptible */
5635e010dd0aSIlya Dryomov 	down_read(&rbd_dev->lock_rwsem);
56362f18d466SIlya Dryomov 	ret = rbd_wait_state_locked(rbd_dev, true);
5637e010dd0aSIlya Dryomov 	up_read(&rbd_dev->lock_rwsem);
56382f18d466SIlya Dryomov 	if (ret) {
5639e010dd0aSIlya Dryomov 		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5640e010dd0aSIlya Dryomov 		return -EROFS;
5641e010dd0aSIlya Dryomov 	}
5642e010dd0aSIlya Dryomov 
5643e010dd0aSIlya Dryomov 	return 0;
5644e010dd0aSIlya Dryomov }
5645e010dd0aSIlya Dryomov 
564630ba1f02SIlya Dryomov /*
5647589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
5648589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
5649589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
5650589d30e0SAlex Elder  *
5651589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
5652589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
5653589d30e0SAlex Elder  * with the supplied name.
5654589d30e0SAlex Elder  *
5655589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
5656589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
5657589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
5658589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
5659589d30e0SAlex Elder  */
5660589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5661589d30e0SAlex Elder {
5662589d30e0SAlex Elder 	int ret;
5663589d30e0SAlex Elder 	size_t size;
5664ecd4a68aSIlya Dryomov 	CEPH_DEFINE_OID_ONSTACK(oid);
5665589d30e0SAlex Elder 	void *response;
5666c0fba368SAlex Elder 	char *image_id;
56672f82ee54SAlex Elder 
5668589d30e0SAlex Elder 	/*
56692c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
56702c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
5671c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
5672c0fba368SAlex Elder 	 * do still need to set the image format though.
56732c0d0a10SAlex Elder 	 */
5674c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
5675c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5676c0fba368SAlex Elder 
56772c0d0a10SAlex Elder 		return 0;
5678c0fba368SAlex Elder 	}
56792c0d0a10SAlex Elder 
56802c0d0a10SAlex Elder 	/*
5681589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
5682589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
5683589d30e0SAlex Elder 	 */
5684ecd4a68aSIlya Dryomov 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5685ecd4a68aSIlya Dryomov 			       rbd_dev->spec->image_name);
5686ecd4a68aSIlya Dryomov 	if (ret)
5687ecd4a68aSIlya Dryomov 		return ret;
5688ecd4a68aSIlya Dryomov 
5689ecd4a68aSIlya Dryomov 	dout("rbd id object name is %s\n", oid.name);
5690589d30e0SAlex Elder 
5691589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
5692589d30e0SAlex Elder 
5693589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5694589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
5695589d30e0SAlex Elder 	if (!response) {
5696589d30e0SAlex Elder 		ret = -ENOMEM;
5697589d30e0SAlex Elder 		goto out;
5698589d30e0SAlex Elder 	}
5699589d30e0SAlex Elder 
5700c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
5701c0fba368SAlex Elder 
5702ecd4a68aSIlya Dryomov 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5703ecd4a68aSIlya Dryomov 				  "get_id", NULL, 0,
5704e2a58ee5SAlex Elder 				  response, RBD_IMAGE_ID_LEN_MAX);
570536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5706c0fba368SAlex Elder 	if (ret == -ENOENT) {
5707c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
5708c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
5709c0fba368SAlex Elder 		if (!ret)
5710c0fba368SAlex Elder 			rbd_dev->image_format = 1;
57117dd440c9SIlya Dryomov 	} else if (ret >= 0) {
5712c0fba368SAlex Elder 		void *p = response;
5713589d30e0SAlex Elder 
5714c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
5715979ed480SAlex Elder 						NULL, GFP_NOIO);
5716461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
5717c0fba368SAlex Elder 		if (!ret)
5718c0fba368SAlex Elder 			rbd_dev->image_format = 2;
5719c0fba368SAlex Elder 	}
5720c0fba368SAlex Elder 
5721c0fba368SAlex Elder 	if (!ret) {
5722c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
5723c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
5724589d30e0SAlex Elder 	}
5725589d30e0SAlex Elder out:
5726589d30e0SAlex Elder 	kfree(response);
5727ecd4a68aSIlya Dryomov 	ceph_oid_destroy(&oid);
5728589d30e0SAlex Elder 	return ret;
5729589d30e0SAlex Elder }
5730589d30e0SAlex Elder 
57313abef3b3SAlex Elder /*
57323abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
57333abef3b3SAlex Elder  * call.
57343abef3b3SAlex Elder  */
57356fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
57366fd48b3bSAlex Elder {
57376fd48b3bSAlex Elder 	struct rbd_image_header	*header;
57386fd48b3bSAlex Elder 
5739a2acd00eSAlex Elder 	rbd_dev_parent_put(rbd_dev);
57406fd48b3bSAlex Elder 
57416fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
57426fd48b3bSAlex Elder 
57436fd48b3bSAlex Elder 	header = &rbd_dev->header;
5744812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
57456fd48b3bSAlex Elder 	kfree(header->snap_sizes);
57466fd48b3bSAlex Elder 	kfree(header->snap_names);
57476fd48b3bSAlex Elder 	kfree(header->object_prefix);
57486fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
57496fd48b3bSAlex Elder }
57506fd48b3bSAlex Elder 
57512df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5752a30b71b9SAlex Elder {
5753a30b71b9SAlex Elder 	int ret;
5754a30b71b9SAlex Elder 
57551e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
575657385b51SAlex Elder 	if (ret)
57571e130199SAlex Elder 		goto out_err;
5758b1b5402aSAlex Elder 
57592df3fac7SAlex Elder 	/*
57602df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
57612df3fac7SAlex Elder 	 * features are assumed to never change.
57622df3fac7SAlex Elder 	 */
5763b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
576457385b51SAlex Elder 	if (ret)
5765b1b5402aSAlex Elder 		goto out_err;
576635d489f9SAlex Elder 
5767cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
5768cc070d59SAlex Elder 
5769cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5770cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
5771cc070d59SAlex Elder 		if (ret < 0)
5772cc070d59SAlex Elder 			goto out_err;
5773cc070d59SAlex Elder 	}
5774a30b71b9SAlex Elder 
57757e97332eSIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
57767e97332eSIlya Dryomov 		ret = rbd_dev_v2_data_pool(rbd_dev);
57777e97332eSIlya Dryomov 		if (ret)
57787e97332eSIlya Dryomov 			goto out_err;
57797e97332eSIlya Dryomov 	}
57807e97332eSIlya Dryomov 
5781263423f8SIlya Dryomov 	rbd_init_layout(rbd_dev);
578235152979SAlex Elder 	return 0;
5783263423f8SIlya Dryomov 
57849d475de5SAlex Elder out_err:
5785642a2537SAlex Elder 	rbd_dev->header.features = 0;
57861e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
57871e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
57889d475de5SAlex Elder 	return ret;
5789a30b71b9SAlex Elder }
5790a30b71b9SAlex Elder 
57916d69bb53SIlya Dryomov /*
57926d69bb53SIlya Dryomov  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
57936d69bb53SIlya Dryomov  * rbd_dev_image_probe() recursion depth, which means it's also the
57946d69bb53SIlya Dryomov  * length of the already discovered part of the parent chain.
57956d69bb53SIlya Dryomov  */
57966d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
579783a06263SAlex Elder {
57982f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
5799124afba2SAlex Elder 	int ret;
5800124afba2SAlex Elder 
5801124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
5802124afba2SAlex Elder 		return 0;
5803124afba2SAlex Elder 
58046d69bb53SIlya Dryomov 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
58056d69bb53SIlya Dryomov 		pr_info("parent chain is too long (%d)\n", depth);
58066d69bb53SIlya Dryomov 		ret = -EINVAL;
58076d69bb53SIlya Dryomov 		goto out_err;
58086d69bb53SIlya Dryomov 	}
58096d69bb53SIlya Dryomov 
58101643dfa4SIlya Dryomov 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
58111f2c6651SIlya Dryomov 	if (!parent) {
5812124afba2SAlex Elder 		ret = -ENOMEM;
5813124afba2SAlex Elder 		goto out_err;
58141f2c6651SIlya Dryomov 	}
58151f2c6651SIlya Dryomov 
58161f2c6651SIlya Dryomov 	/*
58171f2c6651SIlya Dryomov 	 * Images related by parent/child relationships always share
58181f2c6651SIlya Dryomov 	 * rbd_client and spec/parent_spec, so bump their refcounts.
58191f2c6651SIlya Dryomov 	 */
58201f2c6651SIlya Dryomov 	__rbd_get_client(rbd_dev->rbd_client);
58211f2c6651SIlya Dryomov 	rbd_spec_get(rbd_dev->parent_spec);
5822124afba2SAlex Elder 
58236d69bb53SIlya Dryomov 	ret = rbd_dev_image_probe(parent, depth);
5824124afba2SAlex Elder 	if (ret < 0)
5825124afba2SAlex Elder 		goto out_err;
58261f2c6651SIlya Dryomov 
5827124afba2SAlex Elder 	rbd_dev->parent = parent;
5828a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5829124afba2SAlex Elder 	return 0;
5830124afba2SAlex Elder 
58311f2c6651SIlya Dryomov out_err:
58321f2c6651SIlya Dryomov 	rbd_dev_unparent(rbd_dev);
58331f2c6651SIlya Dryomov 	rbd_dev_destroy(parent);
5834124afba2SAlex Elder 	return ret;
5835124afba2SAlex Elder }
5836124afba2SAlex Elder 
58375769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
58385769ed0cSIlya Dryomov {
58395769ed0cSIlya Dryomov 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
58405769ed0cSIlya Dryomov 	rbd_dev_mapping_clear(rbd_dev);
58415769ed0cSIlya Dryomov 	rbd_free_disk(rbd_dev);
58425769ed0cSIlya Dryomov 	if (!single_major)
58435769ed0cSIlya Dryomov 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
58445769ed0cSIlya Dryomov }
58455769ed0cSIlya Dryomov 
5846811c6688SIlya Dryomov /*
5847811c6688SIlya Dryomov  * rbd_dev->header_rwsem must be locked for write and will be unlocked
5848811c6688SIlya Dryomov  * upon return.
5849811c6688SIlya Dryomov  */
5850200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5851124afba2SAlex Elder {
585283a06263SAlex Elder 	int ret;
585383a06263SAlex Elder 
58549b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
585583a06263SAlex Elder 
58569b60e70bSIlya Dryomov 	if (!single_major) {
585783a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
585883a06263SAlex Elder 		if (ret < 0)
58591643dfa4SIlya Dryomov 			goto err_out_unlock;
58609b60e70bSIlya Dryomov 
586183a06263SAlex Elder 		rbd_dev->major = ret;
5862dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
58639b60e70bSIlya Dryomov 	} else {
58649b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
58659b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
58669b60e70bSIlya Dryomov 	}
586783a06263SAlex Elder 
586883a06263SAlex Elder 	/* Set up the blkdev mapping. */
586983a06263SAlex Elder 
587083a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
587183a06263SAlex Elder 	if (ret)
587283a06263SAlex Elder 		goto err_out_blkdev;
587383a06263SAlex Elder 
5874f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
587583a06263SAlex Elder 	if (ret)
587683a06263SAlex Elder 		goto err_out_disk;
5877bc1ecc65SIlya Dryomov 
5878f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
58799568c93eSIlya Dryomov 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
5880f35a4deeSAlex Elder 
58815769ed0cSIlya Dryomov 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5882f35a4deeSAlex Elder 	if (ret)
5883f5ee37bdSIlya Dryomov 		goto err_out_mapping;
588483a06263SAlex Elder 
5885129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5886811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
58875769ed0cSIlya Dryomov 	return 0;
58882f82ee54SAlex Elder 
5889f35a4deeSAlex Elder err_out_mapping:
5890f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
589183a06263SAlex Elder err_out_disk:
589283a06263SAlex Elder 	rbd_free_disk(rbd_dev);
589383a06263SAlex Elder err_out_blkdev:
58949b60e70bSIlya Dryomov 	if (!single_major)
589583a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5896811c6688SIlya Dryomov err_out_unlock:
5897811c6688SIlya Dryomov 	up_write(&rbd_dev->header_rwsem);
589883a06263SAlex Elder 	return ret;
589983a06263SAlex Elder }
590083a06263SAlex Elder 
5901332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5902332bb12dSAlex Elder {
5903332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5904c41d13a3SIlya Dryomov 	int ret;
5905332bb12dSAlex Elder 
5906332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5907332bb12dSAlex Elder 
5908332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5909332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5910c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5911332bb12dSAlex Elder 				       spec->image_name, RBD_SUFFIX);
5912332bb12dSAlex Elder 	else
5913c41d13a3SIlya Dryomov 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5914332bb12dSAlex Elder 				       RBD_HEADER_PREFIX, spec->image_id);
5915c41d13a3SIlya Dryomov 
5916c41d13a3SIlya Dryomov 	return ret;
5917332bb12dSAlex Elder }
5918332bb12dSAlex Elder 
5919200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5920200a6a8bSAlex Elder {
59216fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5922fd22aef8SIlya Dryomov 	if (rbd_dev->opts)
5923fd22aef8SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
59246fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
59256fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
59266fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
5927200a6a8bSAlex Elder }
5928200a6a8bSAlex Elder 
5929a30b71b9SAlex Elder /*
5930a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
59311f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
59321f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
59331f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5934a30b71b9SAlex Elder  */
59356d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5936a30b71b9SAlex Elder {
5937a30b71b9SAlex Elder 	int ret;
5938a30b71b9SAlex Elder 
5939a30b71b9SAlex Elder 	/*
59403abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
59413abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
59423abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
59433abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5944a30b71b9SAlex Elder 	 */
5945a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5946a30b71b9SAlex Elder 	if (ret)
5947c0fba368SAlex Elder 		return ret;
5948c0fba368SAlex Elder 
5949332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5950332bb12dSAlex Elder 	if (ret)
5951332bb12dSAlex Elder 		goto err_out_format;
5952332bb12dSAlex Elder 
59536d69bb53SIlya Dryomov 	if (!depth) {
595499d16943SIlya Dryomov 		ret = rbd_register_watch(rbd_dev);
59551fe48023SIlya Dryomov 		if (ret) {
59561fe48023SIlya Dryomov 			if (ret == -ENOENT)
5957b26c047bSIlya Dryomov 				pr_info("image %s/%s%s%s does not exist\n",
59581fe48023SIlya Dryomov 					rbd_dev->spec->pool_name,
5959b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ?: "",
5960b26c047bSIlya Dryomov 					rbd_dev->spec->pool_ns ? "/" : "",
59611fe48023SIlya Dryomov 					rbd_dev->spec->image_name);
5962c41d13a3SIlya Dryomov 			goto err_out_format;
59631f3ef788SAlex Elder 		}
59641fe48023SIlya Dryomov 	}
5965b644de2bSAlex Elder 
5966a720ae09SIlya Dryomov 	ret = rbd_dev_header_info(rbd_dev);
59675655c4d9SAlex Elder 	if (ret)
5968b644de2bSAlex Elder 		goto err_out_watch;
5969a30b71b9SAlex Elder 
597004077599SIlya Dryomov 	/*
597104077599SIlya Dryomov 	 * If this image is the one being mapped, we have pool name and
597204077599SIlya Dryomov 	 * id, image name and id, and snap name - need to fill snap id.
597304077599SIlya Dryomov 	 * Otherwise this is a parent image, identified by pool, image
597404077599SIlya Dryomov 	 * and snap ids - need to fill in names for those ids.
597504077599SIlya Dryomov 	 */
59766d69bb53SIlya Dryomov 	if (!depth)
597704077599SIlya Dryomov 		ret = rbd_spec_fill_snap_id(rbd_dev);
597804077599SIlya Dryomov 	else
597904077599SIlya Dryomov 		ret = rbd_spec_fill_names(rbd_dev);
59801fe48023SIlya Dryomov 	if (ret) {
59811fe48023SIlya Dryomov 		if (ret == -ENOENT)
5982b26c047bSIlya Dryomov 			pr_info("snap %s/%s%s%s@%s does not exist\n",
59831fe48023SIlya Dryomov 				rbd_dev->spec->pool_name,
5984b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ?: "",
5985b26c047bSIlya Dryomov 				rbd_dev->spec->pool_ns ? "/" : "",
59861fe48023SIlya Dryomov 				rbd_dev->spec->image_name,
59871fe48023SIlya Dryomov 				rbd_dev->spec->snap_name);
598833dca39fSAlex Elder 		goto err_out_probe;
59891fe48023SIlya Dryomov 	}
59909bb81c9bSAlex Elder 
5991e8f59b59SIlya Dryomov 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5992e8f59b59SIlya Dryomov 		ret = rbd_dev_v2_parent_info(rbd_dev);
5993e8f59b59SIlya Dryomov 		if (ret)
5994e8f59b59SIlya Dryomov 			goto err_out_probe;
5995e8f59b59SIlya Dryomov 	}
5996e8f59b59SIlya Dryomov 
59976d69bb53SIlya Dryomov 	ret = rbd_dev_probe_parent(rbd_dev, depth);
599830d60ba2SAlex Elder 	if (ret)
599930d60ba2SAlex Elder 		goto err_out_probe;
600083a06263SAlex Elder 
600130d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
6002c41d13a3SIlya Dryomov 		rbd_dev->image_format, rbd_dev->header_oid.name);
600330d60ba2SAlex Elder 	return 0;
6004e8f59b59SIlya Dryomov 
60056fd48b3bSAlex Elder err_out_probe:
60066fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
6007b644de2bSAlex Elder err_out_watch:
60086d69bb53SIlya Dryomov 	if (!depth)
600999d16943SIlya Dryomov 		rbd_unregister_watch(rbd_dev);
6010332bb12dSAlex Elder err_out_format:
6011332bb12dSAlex Elder 	rbd_dev->image_format = 0;
60125655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
60135655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
60145655c4d9SAlex Elder 	return ret;
601583a06263SAlex Elder }
601683a06263SAlex Elder 
60179b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
601859c2be1eSYehuda Sadeh 			  const char *buf,
601959c2be1eSYehuda Sadeh 			  size_t count)
6020602adf40SYehuda Sadeh {
6021cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
6022dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
60234e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
6024859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
60259d3997fdSAlex Elder 	struct rbd_client *rbdc;
6026b51c83c2SIlya Dryomov 	int rc;
6027602adf40SYehuda Sadeh 
6028602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
6029602adf40SYehuda Sadeh 		return -ENODEV;
6030602adf40SYehuda Sadeh 
6031a725f65eSAlex Elder 	/* parse add command */
6032859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
6033dc79b113SAlex Elder 	if (rc < 0)
6034dd5ac32dSIlya Dryomov 		goto out;
6035a725f65eSAlex Elder 
60369d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
60379d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
60389d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
60390ddebc0cSAlex Elder 		goto err_out_args;
60409d3997fdSAlex Elder 	}
6041602adf40SYehuda Sadeh 
6042602adf40SYehuda Sadeh 	/* pick the pool */
6043dd435855SIlya Dryomov 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
60441fe48023SIlya Dryomov 	if (rc < 0) {
60451fe48023SIlya Dryomov 		if (rc == -ENOENT)
60461fe48023SIlya Dryomov 			pr_info("pool %s does not exist\n", spec->pool_name);
6047602adf40SYehuda Sadeh 		goto err_out_client;
60481fe48023SIlya Dryomov 	}
6049859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
6050859c31dfSAlex Elder 
6051d147543dSIlya Dryomov 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
6052b51c83c2SIlya Dryomov 	if (!rbd_dev) {
6053b51c83c2SIlya Dryomov 		rc = -ENOMEM;
6054bd4ba655SAlex Elder 		goto err_out_client;
6055b51c83c2SIlya Dryomov 	}
6056c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
6057c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
6058d147543dSIlya Dryomov 	rbd_opts = NULL;	/* rbd_dev now owns this */
6059602adf40SYehuda Sadeh 
60600d6d1e9cSMike Christie 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
60610d6d1e9cSMike Christie 	if (!rbd_dev->config_info) {
60620d6d1e9cSMike Christie 		rc = -ENOMEM;
60630d6d1e9cSMike Christie 		goto err_out_rbd_dev;
60640d6d1e9cSMike Christie 	}
60650d6d1e9cSMike Christie 
6066811c6688SIlya Dryomov 	down_write(&rbd_dev->header_rwsem);
60676d69bb53SIlya Dryomov 	rc = rbd_dev_image_probe(rbd_dev, 0);
60680d6d1e9cSMike Christie 	if (rc < 0) {
60690d6d1e9cSMike Christie 		up_write(&rbd_dev->header_rwsem);
6070c53d5893SAlex Elder 		goto err_out_rbd_dev;
60710d6d1e9cSMike Christie 	}
607205fd6f6fSAlex Elder 
60737ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
60747ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
60759568c93eSIlya Dryomov 		rbd_dev->opts->read_only = true;
60767ce4eef7SAlex Elder 
60770c93e1b7SIlya Dryomov 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
60780c93e1b7SIlya Dryomov 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
60790c93e1b7SIlya Dryomov 			 rbd_dev->layout.object_size);
60800c93e1b7SIlya Dryomov 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
60810c93e1b7SIlya Dryomov 	}
60820c93e1b7SIlya Dryomov 
6083b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
6084fd22aef8SIlya Dryomov 	if (rc)
60858b679ec5SIlya Dryomov 		goto err_out_image_probe;
60863abef3b3SAlex Elder 
6087e010dd0aSIlya Dryomov 	if (rbd_dev->opts->exclusive) {
6088e010dd0aSIlya Dryomov 		rc = rbd_add_acquire_lock(rbd_dev);
6089e010dd0aSIlya Dryomov 		if (rc)
6090e010dd0aSIlya Dryomov 			goto err_out_device_setup;
6091b536f69aSAlex Elder 	}
6092b536f69aSAlex Elder 
60935769ed0cSIlya Dryomov 	/* Everything's ready.  Announce the disk to the world. */
60945769ed0cSIlya Dryomov 
60955769ed0cSIlya Dryomov 	rc = device_add(&rbd_dev->dev);
60965769ed0cSIlya Dryomov 	if (rc)
6097e010dd0aSIlya Dryomov 		goto err_out_image_lock;
60985769ed0cSIlya Dryomov 
60995769ed0cSIlya Dryomov 	add_disk(rbd_dev->disk);
61005769ed0cSIlya Dryomov 	/* see rbd_init_disk() */
61015769ed0cSIlya Dryomov 	blk_put_queue(rbd_dev->disk->queue);
61025769ed0cSIlya Dryomov 
61035769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
61045769ed0cSIlya Dryomov 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
61055769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
61065769ed0cSIlya Dryomov 
61075769ed0cSIlya Dryomov 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
61085769ed0cSIlya Dryomov 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
61095769ed0cSIlya Dryomov 		rbd_dev->header.features);
6110dd5ac32dSIlya Dryomov 	rc = count;
6111dd5ac32dSIlya Dryomov out:
6112dd5ac32dSIlya Dryomov 	module_put(THIS_MODULE);
6113dd5ac32dSIlya Dryomov 	return rc;
6114b536f69aSAlex Elder 
6115e010dd0aSIlya Dryomov err_out_image_lock:
6116e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
61175769ed0cSIlya Dryomov err_out_device_setup:
61185769ed0cSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
61198b679ec5SIlya Dryomov err_out_image_probe:
61208b679ec5SIlya Dryomov 	rbd_dev_image_release(rbd_dev);
6121c53d5893SAlex Elder err_out_rbd_dev:
6122c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
6123bd4ba655SAlex Elder err_out_client:
61249d3997fdSAlex Elder 	rbd_put_client(rbdc);
61250ddebc0cSAlex Elder err_out_args:
6126859c31dfSAlex Elder 	rbd_spec_put(spec);
6127d147543dSIlya Dryomov 	kfree(rbd_opts);
6128dd5ac32dSIlya Dryomov 	goto out;
6129602adf40SYehuda Sadeh }
6130602adf40SYehuda Sadeh 
61319b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
61329b60e70bSIlya Dryomov 		       const char *buf,
61339b60e70bSIlya Dryomov 		       size_t count)
61349b60e70bSIlya Dryomov {
61359b60e70bSIlya Dryomov 	if (single_major)
61369b60e70bSIlya Dryomov 		return -EINVAL;
61379b60e70bSIlya Dryomov 
61389b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61399b60e70bSIlya Dryomov }
61409b60e70bSIlya Dryomov 
61419b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
61429b60e70bSIlya Dryomov 				    const char *buf,
61439b60e70bSIlya Dryomov 				    size_t count)
61449b60e70bSIlya Dryomov {
61459b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
61469b60e70bSIlya Dryomov }
61479b60e70bSIlya Dryomov 
614805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
614905a46afdSAlex Elder {
6150ad945fc1SAlex Elder 	while (rbd_dev->parent) {
615105a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
615205a46afdSAlex Elder 		struct rbd_device *second = first->parent;
615305a46afdSAlex Elder 		struct rbd_device *third;
615405a46afdSAlex Elder 
615505a46afdSAlex Elder 		/*
615605a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
615705a46afdSAlex Elder 		 * remove it.
615805a46afdSAlex Elder 		 */
615905a46afdSAlex Elder 		while (second && (third = second->parent)) {
616005a46afdSAlex Elder 			first = second;
616105a46afdSAlex Elder 			second = third;
616205a46afdSAlex Elder 		}
6163ad945fc1SAlex Elder 		rbd_assert(second);
61648ad42cd0SAlex Elder 		rbd_dev_image_release(second);
61658b679ec5SIlya Dryomov 		rbd_dev_destroy(second);
6166ad945fc1SAlex Elder 		first->parent = NULL;
6167ad945fc1SAlex Elder 		first->parent_overlap = 0;
6168ad945fc1SAlex Elder 
6169ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
617005a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
617105a46afdSAlex Elder 		first->parent_spec = NULL;
617205a46afdSAlex Elder 	}
617305a46afdSAlex Elder }
617405a46afdSAlex Elder 
61759b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
6176602adf40SYehuda Sadeh 			     const char *buf,
6177602adf40SYehuda Sadeh 			     size_t count)
6178602adf40SYehuda Sadeh {
6179602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
6180751cc0e3SAlex Elder 	struct list_head *tmp;
6181751cc0e3SAlex Elder 	int dev_id;
61820276dca6SMike Christie 	char opt_buf[6];
61830276dca6SMike Christie 	bool force = false;
61840d8189e1SAlex Elder 	int ret;
6185602adf40SYehuda Sadeh 
61860276dca6SMike Christie 	dev_id = -1;
61870276dca6SMike Christie 	opt_buf[0] = '\0';
61880276dca6SMike Christie 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
61890276dca6SMike Christie 	if (dev_id < 0) {
61900276dca6SMike Christie 		pr_err("dev_id out of range\n");
6191602adf40SYehuda Sadeh 		return -EINVAL;
61920276dca6SMike Christie 	}
61930276dca6SMike Christie 	if (opt_buf[0] != '\0') {
61940276dca6SMike Christie 		if (!strcmp(opt_buf, "force")) {
61950276dca6SMike Christie 			force = true;
61960276dca6SMike Christie 		} else {
61970276dca6SMike Christie 			pr_err("bad remove option at '%s'\n", opt_buf);
61980276dca6SMike Christie 			return -EINVAL;
61990276dca6SMike Christie 		}
62000276dca6SMike Christie 	}
6201602adf40SYehuda Sadeh 
6202602adf40SYehuda Sadeh 	ret = -ENOENT;
6203751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
6204751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
6205751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
6206751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
6207751cc0e3SAlex Elder 			ret = 0;
6208751cc0e3SAlex Elder 			break;
6209602adf40SYehuda Sadeh 		}
6210751cc0e3SAlex Elder 	}
6211751cc0e3SAlex Elder 	if (!ret) {
6212a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
62130276dca6SMike Christie 		if (rbd_dev->open_count && !force)
621442382b70SAlex Elder 			ret = -EBUSY;
621585f5a4d6SIlya Dryomov 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
621685f5a4d6SIlya Dryomov 					  &rbd_dev->flags))
621785f5a4d6SIlya Dryomov 			ret = -EINPROGRESS;
6218a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
6219751cc0e3SAlex Elder 	}
6220751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
622185f5a4d6SIlya Dryomov 	if (ret)
62221ba0f1e7SAlex Elder 		return ret;
6223751cc0e3SAlex Elder 
62240276dca6SMike Christie 	if (force) {
62250276dca6SMike Christie 		/*
62260276dca6SMike Christie 		 * Prevent new IO from being queued and wait for existing
62270276dca6SMike Christie 		 * IO to complete/fail.
62280276dca6SMike Christie 		 */
62290276dca6SMike Christie 		blk_mq_freeze_queue(rbd_dev->disk->queue);
62300276dca6SMike Christie 		blk_set_queue_dying(rbd_dev->disk->queue);
62310276dca6SMike Christie 	}
62320276dca6SMike Christie 
62335769ed0cSIlya Dryomov 	del_gendisk(rbd_dev->disk);
62345769ed0cSIlya Dryomov 	spin_lock(&rbd_dev_list_lock);
62355769ed0cSIlya Dryomov 	list_del_init(&rbd_dev->node);
62365769ed0cSIlya Dryomov 	spin_unlock(&rbd_dev_list_lock);
62375769ed0cSIlya Dryomov 	device_del(&rbd_dev->dev);
6238fca27065SIlya Dryomov 
6239e010dd0aSIlya Dryomov 	rbd_dev_image_unlock(rbd_dev);
6240dd5ac32dSIlya Dryomov 	rbd_dev_device_release(rbd_dev);
62418ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
62428b679ec5SIlya Dryomov 	rbd_dev_destroy(rbd_dev);
62431ba0f1e7SAlex Elder 	return count;
6244602adf40SYehuda Sadeh }
6245602adf40SYehuda Sadeh 
62469b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
62479b60e70bSIlya Dryomov 			  const char *buf,
62489b60e70bSIlya Dryomov 			  size_t count)
62499b60e70bSIlya Dryomov {
62509b60e70bSIlya Dryomov 	if (single_major)
62519b60e70bSIlya Dryomov 		return -EINVAL;
62529b60e70bSIlya Dryomov 
62539b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
62549b60e70bSIlya Dryomov }
62559b60e70bSIlya Dryomov 
62569b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
62579b60e70bSIlya Dryomov 				       const char *buf,
62589b60e70bSIlya Dryomov 				       size_t count)
62599b60e70bSIlya Dryomov {
62609b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
62619b60e70bSIlya Dryomov }
62629b60e70bSIlya Dryomov 
6263602adf40SYehuda Sadeh /*
6264602adf40SYehuda Sadeh  * create control files in sysfs
6265dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
6266602adf40SYehuda Sadeh  */
62677d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
6268602adf40SYehuda Sadeh {
6269dfc5606dSYehuda Sadeh 	int ret;
6270602adf40SYehuda Sadeh 
6271fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
6272dfc5606dSYehuda Sadeh 	if (ret < 0)
6273dfc5606dSYehuda Sadeh 		return ret;
6274602adf40SYehuda Sadeh 
6275fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
6276fed4c143SAlex Elder 	if (ret < 0)
6277fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
6278602adf40SYehuda Sadeh 
6279602adf40SYehuda Sadeh 	return ret;
6280602adf40SYehuda Sadeh }
6281602adf40SYehuda Sadeh 
62827d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
6283602adf40SYehuda Sadeh {
6284dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
6285fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
6286602adf40SYehuda Sadeh }
6287602adf40SYehuda Sadeh 
62887d8dc534SChengguang Xu static int __init rbd_slab_init(void)
62891c2a9dfeSAlex Elder {
62901c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
629103d94406SGeliang Tang 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
6292868311b1SAlex Elder 	if (!rbd_img_request_cache)
6293868311b1SAlex Elder 		return -ENOMEM;
6294868311b1SAlex Elder 
6295868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
629603d94406SGeliang Tang 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
629778c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
629878c2a44aSAlex Elder 		goto out_err;
629978c2a44aSAlex Elder 
63001c2a9dfeSAlex Elder 	return 0;
63011c2a9dfeSAlex Elder 
63026c696d85SIlya Dryomov out_err:
6303868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
6304868311b1SAlex Elder 	rbd_img_request_cache = NULL;
63051c2a9dfeSAlex Elder 	return -ENOMEM;
63061c2a9dfeSAlex Elder }
63071c2a9dfeSAlex Elder 
63081c2a9dfeSAlex Elder static void rbd_slab_exit(void)
63091c2a9dfeSAlex Elder {
6310868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
6311868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
6312868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
6313868311b1SAlex Elder 
63141c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
63151c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
63161c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
63171c2a9dfeSAlex Elder }
63181c2a9dfeSAlex Elder 
6319cc344fa1SAlex Elder static int __init rbd_init(void)
6320602adf40SYehuda Sadeh {
6321602adf40SYehuda Sadeh 	int rc;
6322602adf40SYehuda Sadeh 
63231e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
63241e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
63251e32d34cSAlex Elder 		return -EINVAL;
63261e32d34cSAlex Elder 	}
6327e1b4d96dSIlya Dryomov 
63281c2a9dfeSAlex Elder 	rc = rbd_slab_init();
6329602adf40SYehuda Sadeh 	if (rc)
6330602adf40SYehuda Sadeh 		return rc;
6331e1b4d96dSIlya Dryomov 
6332f5ee37bdSIlya Dryomov 	/*
6333f5ee37bdSIlya Dryomov 	 * The number of active work items is limited by the number of
6334f77303bdSIlya Dryomov 	 * rbd devices * queue depth, so leave @max_active at default.
6335f5ee37bdSIlya Dryomov 	 */
6336f5ee37bdSIlya Dryomov 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6337f5ee37bdSIlya Dryomov 	if (!rbd_wq) {
6338f5ee37bdSIlya Dryomov 		rc = -ENOMEM;
6339f5ee37bdSIlya Dryomov 		goto err_out_slab;
6340f5ee37bdSIlya Dryomov 	}
6341f5ee37bdSIlya Dryomov 
63429b60e70bSIlya Dryomov 	if (single_major) {
63439b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
63449b60e70bSIlya Dryomov 		if (rbd_major < 0) {
63459b60e70bSIlya Dryomov 			rc = rbd_major;
6346f5ee37bdSIlya Dryomov 			goto err_out_wq;
63479b60e70bSIlya Dryomov 		}
63489b60e70bSIlya Dryomov 	}
63499b60e70bSIlya Dryomov 
63501c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
63511c2a9dfeSAlex Elder 	if (rc)
63529b60e70bSIlya Dryomov 		goto err_out_blkdev;
63531c2a9dfeSAlex Elder 
63549b60e70bSIlya Dryomov 	if (single_major)
63559b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
63569b60e70bSIlya Dryomov 	else
6357e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
63589b60e70bSIlya Dryomov 
6359e1b4d96dSIlya Dryomov 	return 0;
6360e1b4d96dSIlya Dryomov 
63619b60e70bSIlya Dryomov err_out_blkdev:
63629b60e70bSIlya Dryomov 	if (single_major)
63639b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6364f5ee37bdSIlya Dryomov err_out_wq:
6365f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
6366e1b4d96dSIlya Dryomov err_out_slab:
6367e1b4d96dSIlya Dryomov 	rbd_slab_exit();
63681c2a9dfeSAlex Elder 	return rc;
6369602adf40SYehuda Sadeh }
6370602adf40SYehuda Sadeh 
6371cc344fa1SAlex Elder static void __exit rbd_exit(void)
6372602adf40SYehuda Sadeh {
6373ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
6374602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
63759b60e70bSIlya Dryomov 	if (single_major)
63769b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
6377f5ee37bdSIlya Dryomov 	destroy_workqueue(rbd_wq);
63781c2a9dfeSAlex Elder 	rbd_slab_exit();
6379602adf40SYehuda Sadeh }
6380602adf40SYehuda Sadeh 
6381602adf40SYehuda Sadeh module_init(rbd_init);
6382602adf40SYehuda Sadeh module_exit(rbd_exit);
6383602adf40SYehuda Sadeh 
6384d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
6385602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6386602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
6387602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
6388602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6389602adf40SYehuda Sadeh 
639090da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
6391602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
6392