xref: /openbmc/linux/drivers/block/rbd.c (revision 30ba1f02)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44f8a22fc2SIlya Dryomov #include <linux/idr.h>
45602adf40SYehuda Sadeh 
46602adf40SYehuda Sadeh #include "rbd_types.h"
47602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 *
 * SECTOR_SIZE is built from 1ULL, so expressions that use it are
 * evaluated as unsigned long long.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
58593a9e7bSAlex Elder 
59a2acd00eSAlex Elder /*
60a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
61a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
62a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
63a2acd00eSAlex Elder  * -EINVAL without updating it.
64a2acd00eSAlex Elder  */
65a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
66a2acd00eSAlex Elder {
67a2acd00eSAlex Elder 	unsigned int counter;
68a2acd00eSAlex Elder 
69a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
70a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
71a2acd00eSAlex Elder 		return (int)counter;
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder 	atomic_dec(v);
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	return -EINVAL;
76a2acd00eSAlex Elder }
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
79a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
80a2acd00eSAlex Elder {
81a2acd00eSAlex Elder 	int counter;
82a2acd00eSAlex Elder 
83a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
84a2acd00eSAlex Elder 	if (counter >= 0)
85a2acd00eSAlex Elder 		return counter;
86a2acd00eSAlex Elder 
87a2acd00eSAlex Elder 	atomic_inc(v);
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	return -EINVAL;
90a2acd00eSAlex Elder }
91a2acd00eSAlex Elder 
/* Driver name; also used as the prefix of rbd_warn() messages */
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
/* Shift used to convert between a device id and a minor number */
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* Longest snapshot name: NAME_MAX less the prefix (excluding its NUL) */
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
132602adf40SYehuda Sadeh 
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;
	u64 stripe_unit;
	u64 stripe_count;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};
15259c2be1eSYehuda Sadeh 
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;	/* may be a null pointer */

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;		/* reference count for this spec */
};
1900d7dbfceSAlex Elder 
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* reference count */
	struct list_head	node;	/* entry in rbd_client_list */
};
199602adf40SYehuda Sadeh 
/* Callback type stored in an image request's "callback" field */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback type stored in an object request's "callback" field */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Selects which member of the data union in rbd_obj_request is in use */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

/* Bit numbers within rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
218926f9b3fSAlex Elder 
/*
 * A request against a single named object.  See the comment on the
 * first union below for how "which" and the union members together
 * describe what the request is being used for.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;		/* enum obj_req_flags bits */

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	/* "type" selects the member of the following union */
	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
273bf0d5f50SAlex Elder 
/* Bit numbers within rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2790c425248SAlex Elder 
/*
 * A request against a byte range of an rbd image.  The object
 * requests that belong to it are linked on the obj_requests list.
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;	/* enum img_req_flags bits */
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
306bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the _safe variant is built on
 * list_for_each_entry_safe_reverse(), so it walks the list from the
 * tail towards the head.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
313bf0d5f50SAlex Elder 
/* Per-mapping properties, embedded in struct rbd_device */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;	/* checked by rbd_open() */
};
319f84344f3SAlex Elder 
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* parent_spec is non-null if this image is a child in a layered image */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
364dfc5606dSYehuda Sadeh 
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3766d292906SAlex Elder 
static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;
static struct kmem_cache	*rbd_segment_name_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

/*
 * Default to false for now, as single-major requires >= 0.75 version of
 * userspace rbd utility.
 */
static bool single_major = false;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");

/* Forward declarations for functions defined later in this file */

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static void rbd_dev_device_release(struct device *dev);

/* Store handlers for the write-only bus attributes declared below */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
static void rbd_spec_put(struct rbd_spec *spec);
416f0f8cef5SAlex Elder 
4179b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4189b60e70bSIlya Dryomov {
4197e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4209b60e70bSIlya Dryomov }
4219b60e70bSIlya Dryomov 
4229b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4239b60e70bSIlya Dryomov {
4247e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4259b60e70bSIlya Dryomov }
4269b60e70bSIlya Dryomov 
/*
 * Bus attributes.  All four are created with mode S_IWUSR and have
 * no show method (NULL), only a store method.
 */
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);

/* NULL-terminated; visibility is filtered by rbd_bus_is_visible() */
static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	NULL,
};
43992c76dc0SIlya Dryomov 
44092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
44192c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
44292c76dc0SIlya Dryomov {
4439b60e70bSIlya Dryomov 	if (!single_major &&
4449b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4459b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4469b60e70bSIlya Dryomov 		return 0;
4479b60e70bSIlya Dryomov 
44892c76dc0SIlya Dryomov 	return attr->mode;
44992c76dc0SIlya Dryomov }
45092c76dc0SIlya Dryomov 
/* Attribute group for the bus; is_visible hides the single-major files */
static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};
461f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated root device, hence the no-op release callback */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
470f0f8cef5SAlex Elder 
47106ecc6cbSAlex Elder static __printf(2, 3)
47206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
47306ecc6cbSAlex Elder {
47406ecc6cbSAlex Elder 	struct va_format vaf;
47506ecc6cbSAlex Elder 	va_list args;
47606ecc6cbSAlex Elder 
47706ecc6cbSAlex Elder 	va_start(args, fmt);
47806ecc6cbSAlex Elder 	vaf.fmt = fmt;
47906ecc6cbSAlex Elder 	vaf.va = &args;
48006ecc6cbSAlex Elder 
48106ecc6cbSAlex Elder 	if (!rbd_dev)
48206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
48306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
48406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
48506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
48606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
48706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
48806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
48906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
49006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
49106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
49206ecc6cbSAlex Elder 	else	/* punt */
49306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
49406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
49506ecc6cbSAlex Elder 	va_end(args);
49606ecc6cbSAlex Elder }
49706ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false expression logs the function name,
 * line number and the expression text, then calls BUG().  Without
 * RBD_DEBUG it expands to a no-op and expr is not evaluated, so expr
 * must not have side effects.
 *
 * The debug form is wrapped in do { } while (0) so that it is a single
 * statement and can safely be used as the body of an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
510dfc5606dSYehuda Sadeh 
/* Forward declarations for functions defined later in this file */

static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
52559c2be1eSYehuda Sadeh 
526602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
527602adf40SYehuda Sadeh {
528f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
529b82d167bSAlex Elder 	bool removing = false;
530602adf40SYehuda Sadeh 
531f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
532602adf40SYehuda Sadeh 		return -EROFS;
533602adf40SYehuda Sadeh 
534a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
535b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
536b82d167bSAlex Elder 		removing = true;
537b82d167bSAlex Elder 	else
538b82d167bSAlex Elder 		rbd_dev->open_count++;
539a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
540b82d167bSAlex Elder 	if (removing)
541b82d167bSAlex Elder 		return -ENOENT;
542b82d167bSAlex Elder 
543c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
544f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
545340c7a2bSAlex Elder 
546602adf40SYehuda Sadeh 	return 0;
547602adf40SYehuda Sadeh }
548602adf40SYehuda Sadeh 
549db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
550dfc5606dSYehuda Sadeh {
551dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
552b82d167bSAlex Elder 	unsigned long open_count_before;
553b82d167bSAlex Elder 
554a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
555b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
556a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
557b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
558dfc5606dSYehuda Sadeh 
559c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
560dfc5606dSYehuda Sadeh }
561dfc5606dSYehuda Sadeh 
/* Block device operations; only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
567602adf40SYehuda Sadeh 
568602adf40SYehuda Sadeh /*
5697262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
570cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
571602adf40SYehuda Sadeh  */
572f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
573602adf40SYehuda Sadeh {
574602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
575602adf40SYehuda Sadeh 	int ret = -ENOMEM;
576602adf40SYehuda Sadeh 
57737206ee5SAlex Elder 	dout("%s:\n", __func__);
578602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
579602adf40SYehuda Sadeh 	if (!rbdc)
580602adf40SYehuda Sadeh 		goto out_opt;
581602adf40SYehuda Sadeh 
582602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
583602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
584602adf40SYehuda Sadeh 
58543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
586602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
58708f75463SAlex Elder 		goto out_rbdc;
58843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
589602adf40SYehuda Sadeh 
590602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
591602adf40SYehuda Sadeh 	if (ret < 0)
59208f75463SAlex Elder 		goto out_client;
593602adf40SYehuda Sadeh 
594432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
595602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
596432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
597602adf40SYehuda Sadeh 
59837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
599bc534d86SAlex Elder 
600602adf40SYehuda Sadeh 	return rbdc;
60108f75463SAlex Elder out_client:
602602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
60308f75463SAlex Elder out_rbdc:
604602adf40SYehuda Sadeh 	kfree(rbdc);
605602adf40SYehuda Sadeh out_opt:
60643ae4701SAlex Elder 	if (ceph_opts)
60743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
60837206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
60937206ee5SAlex Elder 
61028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
611602adf40SYehuda Sadeh }
612602adf40SYehuda Sadeh 
6132f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
6142f82ee54SAlex Elder {
6152f82ee54SAlex Elder 	kref_get(&rbdc->kref);
6162f82ee54SAlex Elder 
6172f82ee54SAlex Elder 	return rbdc;
6182f82ee54SAlex Elder }
6192f82ee54SAlex Elder 
620602adf40SYehuda Sadeh /*
6211f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
6221f7ba331SAlex Elder  * found, bump its reference count.
623602adf40SYehuda Sadeh  */
6241f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
625602adf40SYehuda Sadeh {
626602adf40SYehuda Sadeh 	struct rbd_client *client_node;
6271f7ba331SAlex Elder 	bool found = false;
628602adf40SYehuda Sadeh 
62943ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
630602adf40SYehuda Sadeh 		return NULL;
631602adf40SYehuda Sadeh 
6321f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
6331f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
6341f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
6352f82ee54SAlex Elder 			__rbd_get_client(client_node);
6362f82ee54SAlex Elder 
6371f7ba331SAlex Elder 			found = true;
6381f7ba331SAlex Elder 			break;
6391f7ba331SAlex Elder 		}
6401f7ba331SAlex Elder 	}
6411f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
6421f7ba331SAlex Elder 
6431f7ba331SAlex Elder 	return found ? client_node : NULL;
644602adf40SYehuda Sadeh }
645602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Token identifiers for the rbd-specific options in rbd_opts_tokens.
 * The Opt_last_* entries are range markers rather than real options:
 * parse_rbd_opts_token() compares a matched token against them to
 * decide whether it takes an int argument, a string argument, or is
 * a Boolean flag.  No int or string options are defined here, so
 * Opt_last_int and Opt_last_string are the first two values.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
65959c2be1eSYehuda Sadeh 
/*
 * Pattern table handed to match_token() by parse_rbd_opts_token().
 * Each option has a long and a short spelling that map to the same
 * token.  The table ends with a NULL-pattern sentinel whose token
 * value is -1.
 */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
67059c2be1eSYehuda Sadeh 
/*
 * Parsed rbd-specific options.  parse_rbd_opts_token() sets read_only
 * to true for Opt_read_only and to false for Opt_read_write.
 */
struct rbd_options {
	bool	read_only;
};

/* NOTE(review): presumably the initial read_only value; its use is outside this section */
#define RBD_READ_ONLY_DEFAULT	false
67698571b5aSAlex Elder 
67759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
67859c2be1eSYehuda Sadeh {
67943ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
68059c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
68159c2be1eSYehuda Sadeh 	int token, intval, ret;
68259c2be1eSYehuda Sadeh 
68343ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
68459c2be1eSYehuda Sadeh 	if (token < 0)
68559c2be1eSYehuda Sadeh 		return -EINVAL;
68659c2be1eSYehuda Sadeh 
68759c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
68859c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
68959c2be1eSYehuda Sadeh 		if (ret < 0) {
69059c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
69159c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
69259c2be1eSYehuda Sadeh 			return ret;
69359c2be1eSYehuda Sadeh 		}
69459c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
69559c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
69659c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
69759c2be1eSYehuda Sadeh 		     argstr[0].from);
698cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
699cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
70059c2be1eSYehuda Sadeh 	} else {
70159c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
70259c2be1eSYehuda Sadeh 	}
70359c2be1eSYehuda Sadeh 
70459c2be1eSYehuda Sadeh 	switch (token) {
705cc0538b6SAlex Elder 	case Opt_read_only:
706cc0538b6SAlex Elder 		rbd_opts->read_only = true;
707cc0538b6SAlex Elder 		break;
708cc0538b6SAlex Elder 	case Opt_read_write:
709cc0538b6SAlex Elder 		rbd_opts->read_only = false;
710cc0538b6SAlex Elder 		break;
71159c2be1eSYehuda Sadeh 	default:
712aafb230eSAlex Elder 		rbd_assert(false);
713aafb230eSAlex Elder 		break;
71459c2be1eSYehuda Sadeh 	}
71559c2be1eSYehuda Sadeh 	return 0;
71659c2be1eSYehuda Sadeh }
71759c2be1eSYehuda Sadeh 
71859c2be1eSYehuda Sadeh /*
719602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
7207262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
7217262cfcaSAlex Elder  * function.
722602adf40SYehuda Sadeh  */
7239d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
724602adf40SYehuda Sadeh {
725f8c38929SAlex Elder 	struct rbd_client *rbdc;
72659c2be1eSYehuda Sadeh 
727cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
7281f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
7299d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
73043ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
7319d3997fdSAlex Elder 	else
732f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
733cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
734d720bcb0SAlex Elder 
7359d3997fdSAlex Elder 	return rbdc;
736602adf40SYehuda Sadeh }
737602adf40SYehuda Sadeh 
738602adf40SYehuda Sadeh /*
739602adf40SYehuda Sadeh  * Destroy ceph client
740d23a4b3fSAlex Elder  *
741432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
742602adf40SYehuda Sadeh  */
743602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
744602adf40SYehuda Sadeh {
745602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
746602adf40SYehuda Sadeh 
74737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
748cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
749602adf40SYehuda Sadeh 	list_del(&rbdc->node);
750cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
751602adf40SYehuda Sadeh 
752602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
753602adf40SYehuda Sadeh 	kfree(rbdc);
754602adf40SYehuda Sadeh }
755602adf40SYehuda Sadeh 
756602adf40SYehuda Sadeh /*
757602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
758602adf40SYehuda Sadeh  * it.
759602adf40SYehuda Sadeh  */
7609d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
761602adf40SYehuda Sadeh {
762c53d5893SAlex Elder 	if (rbdc)
7639d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
764602adf40SYehuda Sadeh }
765602adf40SYehuda Sadeh 
766a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
767a30b71b9SAlex Elder {
768a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
769a30b71b9SAlex Elder }
770a30b71b9SAlex Elder 
7718e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
7728e94af8eSAlex Elder {
773103a150fSAlex Elder 	size_t size;
774103a150fSAlex Elder 	u32 snap_count;
775103a150fSAlex Elder 
776103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
777103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
778103a150fSAlex Elder 		return false;
779103a150fSAlex Elder 
780db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
781db2388b6SAlex Elder 
782db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
783db2388b6SAlex Elder 		return false;
784db2388b6SAlex Elder 
785db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
786db2388b6SAlex Elder 
787db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
788db2388b6SAlex Elder 		return false;
789db2388b6SAlex Elder 
790103a150fSAlex Elder 	/*
791103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
792103a150fSAlex Elder 	 * that limits the number of snapshots.
793103a150fSAlex Elder 	 */
794103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
795103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
796103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
797103a150fSAlex Elder 		return false;
798103a150fSAlex Elder 
799103a150fSAlex Elder 	/*
800103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
801103a150fSAlex Elder 	 * header must also be representable in a size_t.
802103a150fSAlex Elder 	 */
803103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
804103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
805103a150fSAlex Elder 		return false;
806103a150fSAlex Elder 
807103a150fSAlex Elder 	return true;
8088e94af8eSAlex Elder }
8098e94af8eSAlex Elder 
810602adf40SYehuda Sadeh /*
811bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
812bb23e37aSAlex Elder  * on-disk header.
813602adf40SYehuda Sadeh  */
814662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
8154156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
816602adf40SYehuda Sadeh {
817662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
818bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
819bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
820bb23e37aSAlex Elder 	char *object_prefix = NULL;
821bb23e37aSAlex Elder 	char *snap_names = NULL;
822bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
823ccece235SAlex Elder 	u32 snap_count;
824d2bb24e5SAlex Elder 	size_t size;
825bb23e37aSAlex Elder 	int ret = -ENOMEM;
826621901d6SAlex Elder 	u32 i;
827602adf40SYehuda Sadeh 
828bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
829103a150fSAlex Elder 
830bb23e37aSAlex Elder 	if (first_time) {
831bb23e37aSAlex Elder 		size_t len;
832bb23e37aSAlex Elder 
833bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
834bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
835bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
836bb23e37aSAlex Elder 		if (!object_prefix)
837602adf40SYehuda Sadeh 			return -ENOMEM;
838bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
839bb23e37aSAlex Elder 		object_prefix[len] = '\0';
840bb23e37aSAlex Elder 	}
84100f1f36fSAlex Elder 
842bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
843d2bb24e5SAlex Elder 
844602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
845bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
846bb23e37aSAlex Elder 	if (!snapc)
847bb23e37aSAlex Elder 		goto out_err;
848bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
849602adf40SYehuda Sadeh 	if (snap_count) {
850bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
851f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
852f785cc1dSAlex Elder 
853bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
854621901d6SAlex Elder 
855f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
856bb23e37aSAlex Elder 			goto out_2big;
857bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
858bb23e37aSAlex Elder 		if (!snap_names)
859602adf40SYehuda Sadeh 			goto out_err;
860bb23e37aSAlex Elder 
861bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
862bb23e37aSAlex Elder 
863bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
864bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
865bb23e37aSAlex Elder 		if (!snap_sizes)
866bb23e37aSAlex Elder 			goto out_err;
867bb23e37aSAlex Elder 
868f785cc1dSAlex Elder 		/*
869bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
870bb23e37aSAlex Elder 		 * and size.
871bb23e37aSAlex Elder 		 *
87299a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
873bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
874f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
875f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
876f785cc1dSAlex Elder 		 */
877bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
878bb23e37aSAlex Elder 		snaps = ondisk->snaps;
879bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
880bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
881bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
882bb23e37aSAlex Elder 		}
883602adf40SYehuda Sadeh 	}
884849b4260SAlex Elder 
885bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
886bb23e37aSAlex Elder 
887bb23e37aSAlex Elder 	if (first_time) {
888bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
889602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
890602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
891602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
892bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
893bb23e37aSAlex Elder 		header->stripe_unit = 0;
894bb23e37aSAlex Elder 		header->stripe_count = 0;
895bb23e37aSAlex Elder 		header->features = 0;
896662518b1SAlex Elder 	} else {
897662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
898662518b1SAlex Elder 		kfree(header->snap_names);
899662518b1SAlex Elder 		kfree(header->snap_sizes);
900bb23e37aSAlex Elder 	}
9016a52325fSAlex Elder 
902bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
903621901d6SAlex Elder 
904f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
905bb23e37aSAlex Elder 	header->snapc = snapc;
906bb23e37aSAlex Elder 	header->snap_names = snap_names;
907bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
908468521c1SAlex Elder 
909662518b1SAlex Elder 	/* Make sure mapping size is consistent with header info */
910662518b1SAlex Elder 
911662518b1SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
912662518b1SAlex Elder 		if (rbd_dev->mapping.size != header->image_size)
913662518b1SAlex Elder 			rbd_dev->mapping.size = header->image_size;
914662518b1SAlex Elder 
915602adf40SYehuda Sadeh 	return 0;
916bb23e37aSAlex Elder out_2big:
917bb23e37aSAlex Elder 	ret = -EIO;
9186a52325fSAlex Elder out_err:
919bb23e37aSAlex Elder 	kfree(snap_sizes);
920bb23e37aSAlex Elder 	kfree(snap_names);
921bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
922bb23e37aSAlex Elder 	kfree(object_prefix);
923ccece235SAlex Elder 
924bb23e37aSAlex Elder 	return ret;
925602adf40SYehuda Sadeh }
926602adf40SYehuda Sadeh 
9279682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
9289682fc6dSAlex Elder {
9299682fc6dSAlex Elder 	const char *snap_name;
9309682fc6dSAlex Elder 
9319682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
9329682fc6dSAlex Elder 
9339682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
9349682fc6dSAlex Elder 
9359682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
9369682fc6dSAlex Elder 	while (which--)
9379682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
9389682fc6dSAlex Elder 
9399682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
9409682fc6dSAlex Elder }
9419682fc6dSAlex Elder 
94230d1cff8SAlex Elder /*
94330d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
94430d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
94530d1cff8SAlex Elder  */
94630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
94730d1cff8SAlex Elder {
94830d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
94930d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
95030d1cff8SAlex Elder 
95130d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
95230d1cff8SAlex Elder 		return 1;
95330d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
95430d1cff8SAlex Elder }
95530d1cff8SAlex Elder 
95630d1cff8SAlex Elder /*
95730d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
95830d1cff8SAlex Elder  * present.
95930d1cff8SAlex Elder  *
96030d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
96130d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
96230d1cff8SAlex Elder  *
96330d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
96430d1cff8SAlex Elder  * reverse order, highest snapshot id first.
96530d1cff8SAlex Elder  */
9669682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
9679682fc6dSAlex Elder {
9689682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
96930d1cff8SAlex Elder 	u64 *found;
9709682fc6dSAlex Elder 
97130d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
97230d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
9739682fc6dSAlex Elder 
97430d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
9759682fc6dSAlex Elder }
9769682fc6dSAlex Elder 
9772ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
9782ad3d716SAlex Elder 					u64 snap_id)
97954cac61fSAlex Elder {
98054cac61fSAlex Elder 	u32 which;
981da6a6b63SJosh Durgin 	const char *snap_name;
98254cac61fSAlex Elder 
98354cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
98454cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
985da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
98654cac61fSAlex Elder 
987da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
988da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
98954cac61fSAlex Elder }
99054cac61fSAlex Elder 
9919e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
9929e15b77dSAlex Elder {
9939e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
9949e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
9959e15b77dSAlex Elder 
99654cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
99754cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
99854cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
9999e15b77dSAlex Elder 
100054cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10019e15b77dSAlex Elder }
10029e15b77dSAlex Elder 
10032ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10042ad3d716SAlex Elder 				u64 *snap_size)
1005602adf40SYehuda Sadeh {
10062ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10072ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10082ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
10092ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10102ad3d716SAlex Elder 		u32 which;
101100f1f36fSAlex Elder 
10122ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
10132ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
10142ad3d716SAlex Elder 			return -ENOENT;
101500f1f36fSAlex Elder 
10162ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
10172ad3d716SAlex Elder 	} else {
10182ad3d716SAlex Elder 		u64 size = 0;
10192ad3d716SAlex Elder 		int ret;
10202ad3d716SAlex Elder 
10212ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
10222ad3d716SAlex Elder 		if (ret)
10232ad3d716SAlex Elder 			return ret;
10242ad3d716SAlex Elder 
10252ad3d716SAlex Elder 		*snap_size = size;
10262ad3d716SAlex Elder 	}
10272ad3d716SAlex Elder 	return 0;
10282ad3d716SAlex Elder }
10292ad3d716SAlex Elder 
10302ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
10312ad3d716SAlex Elder 			u64 *snap_features)
10322ad3d716SAlex Elder {
10332ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10342ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10352ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
10362ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10372ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
10382ad3d716SAlex Elder 	} else {
10392ad3d716SAlex Elder 		u64 features = 0;
10402ad3d716SAlex Elder 		int ret;
10412ad3d716SAlex Elder 
10422ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
10432ad3d716SAlex Elder 		if (ret)
10442ad3d716SAlex Elder 			return ret;
10452ad3d716SAlex Elder 
10462ad3d716SAlex Elder 		*snap_features = features;
10472ad3d716SAlex Elder 	}
10482ad3d716SAlex Elder 	return 0;
104900f1f36fSAlex Elder }
1050602adf40SYehuda Sadeh 
1051d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1052602adf40SYehuda Sadeh {
10538f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
10542ad3d716SAlex Elder 	u64 size = 0;
10552ad3d716SAlex Elder 	u64 features = 0;
10562ad3d716SAlex Elder 	int ret;
10578b0241f8SAlex Elder 
10582ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
10592ad3d716SAlex Elder 	if (ret)
10602ad3d716SAlex Elder 		return ret;
10612ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
10622ad3d716SAlex Elder 	if (ret)
10632ad3d716SAlex Elder 		return ret;
10642ad3d716SAlex Elder 
10652ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
10662ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
10672ad3d716SAlex Elder 
10688b0241f8SAlex Elder 	return 0;
1069602adf40SYehuda Sadeh }
1070602adf40SYehuda Sadeh 
1071d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1072d1cf5788SAlex Elder {
1073d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1074d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1075200a6a8bSAlex Elder }
1076200a6a8bSAlex Elder 
107798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1078602adf40SYehuda Sadeh {
107965ccfe21SAlex Elder 	char *name;
108065ccfe21SAlex Elder 	u64 segment;
108165ccfe21SAlex Elder 	int ret;
10823a96d5cdSJosh Durgin 	char *name_format;
1083602adf40SYehuda Sadeh 
108478c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
108565ccfe21SAlex Elder 	if (!name)
108665ccfe21SAlex Elder 		return NULL;
108765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
10883a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
10893a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
10903a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
10912d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
109265ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
10932d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
109465ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
109565ccfe21SAlex Elder 			segment, ret);
109665ccfe21SAlex Elder 		kfree(name);
109765ccfe21SAlex Elder 		name = NULL;
109865ccfe21SAlex Elder 	}
1099602adf40SYehuda Sadeh 
110065ccfe21SAlex Elder 	return name;
110165ccfe21SAlex Elder }
1102602adf40SYehuda Sadeh 
110378c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
110478c2a44aSAlex Elder {
110578c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
110678c2a44aSAlex Elder 
110778c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
110878c2a44aSAlex Elder }
110978c2a44aSAlex Elder 
111065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
111165ccfe21SAlex Elder {
111265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1113602adf40SYehuda Sadeh 
111465ccfe21SAlex Elder 	return offset & (segment_size - 1);
111565ccfe21SAlex Elder }
111665ccfe21SAlex Elder 
111765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
111865ccfe21SAlex Elder 				u64 offset, u64 length)
111965ccfe21SAlex Elder {
112065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
112165ccfe21SAlex Elder 
112265ccfe21SAlex Elder 	offset &= segment_size - 1;
112365ccfe21SAlex Elder 
1124aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
112565ccfe21SAlex Elder 	if (offset + length > segment_size)
112665ccfe21SAlex Elder 		length = segment_size - offset;
112765ccfe21SAlex Elder 
112865ccfe21SAlex Elder 	return length;
1129602adf40SYehuda Sadeh }
1130602adf40SYehuda Sadeh 
1131602adf40SYehuda Sadeh /*
1132029bcbd8SJosh Durgin  * returns the size of an object in the image
1133029bcbd8SJosh Durgin  */
1134029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1135029bcbd8SJosh Durgin {
1136029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1137029bcbd8SJosh Durgin }
1138029bcbd8SJosh Durgin 
1139029bcbd8SJosh Durgin /*
1140602adf40SYehuda Sadeh  * bio helpers
1141602adf40SYehuda Sadeh  */
1142602adf40SYehuda Sadeh 
1143602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1144602adf40SYehuda Sadeh {
1145602adf40SYehuda Sadeh 	struct bio *tmp;
1146602adf40SYehuda Sadeh 
1147602adf40SYehuda Sadeh 	while (chain) {
1148602adf40SYehuda Sadeh 		tmp = chain;
1149602adf40SYehuda Sadeh 		chain = chain->bi_next;
1150602adf40SYehuda Sadeh 		bio_put(tmp);
1151602adf40SYehuda Sadeh 	}
1152602adf40SYehuda Sadeh }
1153602adf40SYehuda Sadeh 
1154602adf40SYehuda Sadeh /*
1155602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1156602adf40SYehuda Sadeh  */
1157602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1158602adf40SYehuda Sadeh {
11597988613bSKent Overstreet 	struct bio_vec bv;
11607988613bSKent Overstreet 	struct bvec_iter iter;
1161602adf40SYehuda Sadeh 	unsigned long flags;
1162602adf40SYehuda Sadeh 	void *buf;
1163602adf40SYehuda Sadeh 	int pos = 0;
1164602adf40SYehuda Sadeh 
1165602adf40SYehuda Sadeh 	while (chain) {
11667988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
11677988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1168602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
11697988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1170602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
11717988613bSKent Overstreet 				       bv.bv_len - remainder);
11727988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
117385b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1174602adf40SYehuda Sadeh 			}
11757988613bSKent Overstreet 			pos += bv.bv_len;
1176602adf40SYehuda Sadeh 		}
1177602adf40SYehuda Sadeh 
1178602adf40SYehuda Sadeh 		chain = chain->bi_next;
1179602adf40SYehuda Sadeh 	}
1180602adf40SYehuda Sadeh }
1181602adf40SYehuda Sadeh 
1182602adf40SYehuda Sadeh /*
1183b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1184b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1185b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1186b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1187b9434c5bSAlex Elder  */
1188b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1189b9434c5bSAlex Elder {
1190b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1191b9434c5bSAlex Elder 
1192b9434c5bSAlex Elder 	rbd_assert(end > offset);
1193b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1194b9434c5bSAlex Elder 	while (offset < end) {
1195b9434c5bSAlex Elder 		size_t page_offset;
1196b9434c5bSAlex Elder 		size_t length;
1197b9434c5bSAlex Elder 		unsigned long flags;
1198b9434c5bSAlex Elder 		void *kaddr;
1199b9434c5bSAlex Elder 
1200491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1201491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1202b9434c5bSAlex Elder 		local_irq_save(flags);
1203b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1204b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1205e2156054SAlex Elder 		flush_dcache_page(*page);
1206b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1207b9434c5bSAlex Elder 		local_irq_restore(flags);
1208b9434c5bSAlex Elder 
1209b9434c5bSAlex Elder 		offset += length;
1210b9434c5bSAlex Elder 		page++;
1211b9434c5bSAlex Elder 	}
1212b9434c5bSAlex Elder }
1213b9434c5bSAlex Elder 
1214b9434c5bSAlex Elder /*
1215f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1216f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1217602adf40SYehuda Sadeh  */
1218f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1219f7760dadSAlex Elder 					unsigned int offset,
1220f7760dadSAlex Elder 					unsigned int len,
1221f7760dadSAlex Elder 					gfp_t gfpmask)
1222602adf40SYehuda Sadeh {
1223f7760dadSAlex Elder 	struct bio *bio;
1224602adf40SYehuda Sadeh 
12255341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1226f7760dadSAlex Elder 	if (!bio)
1227f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1228f7760dadSAlex Elder 
12295341a627SKent Overstreet 	bio_advance(bio, offset);
12304f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1231602adf40SYehuda Sadeh 
1232f7760dadSAlex Elder 	return bio;
1233602adf40SYehuda Sadeh }
1234602adf40SYehuda Sadeh 
1235f7760dadSAlex Elder /*
1236f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1237f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1238f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1239f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1240f7760dadSAlex Elder  *
1241f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1242f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1243f7760dadSAlex Elder  * the start of data to be cloned is located.
1244f7760dadSAlex Elder  *
1245f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1246f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1247f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1248f7760dadSAlex Elder  */
1249f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1250f7760dadSAlex Elder 					unsigned int *offset,
1251f7760dadSAlex Elder 					unsigned int len,
1252f7760dadSAlex Elder 					gfp_t gfpmask)
1253f7760dadSAlex Elder {
1254f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1255f7760dadSAlex Elder 	unsigned int off = *offset;
1256f7760dadSAlex Elder 	struct bio *chain = NULL;
1257f7760dadSAlex Elder 	struct bio **end;
1258602adf40SYehuda Sadeh 
1259f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1260602adf40SYehuda Sadeh 
12614f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1262f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1263602adf40SYehuda Sadeh 
1264f7760dadSAlex Elder 	end = &chain;
1265f7760dadSAlex Elder 	while (len) {
1266f7760dadSAlex Elder 		unsigned int bi_size;
1267f7760dadSAlex Elder 		struct bio *bio;
1268f7760dadSAlex Elder 
1269f5400b7aSAlex Elder 		if (!bi) {
1270f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1271f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1272f5400b7aSAlex Elder 		}
12734f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1274f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1275f7760dadSAlex Elder 		if (!bio)
1276f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1277f7760dadSAlex Elder 
1278f7760dadSAlex Elder 		*end = bio;
1279f7760dadSAlex Elder 		end = &bio->bi_next;
1280f7760dadSAlex Elder 
1281f7760dadSAlex Elder 		off += bi_size;
12824f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1283f7760dadSAlex Elder 			bi = bi->bi_next;
1284f7760dadSAlex Elder 			off = 0;
1285f7760dadSAlex Elder 		}
1286f7760dadSAlex Elder 		len -= bi_size;
1287f7760dadSAlex Elder 	}
1288f7760dadSAlex Elder 	*bio_src = bi;
1289f7760dadSAlex Elder 	*offset = off;
1290f7760dadSAlex Elder 
1291f7760dadSAlex Elder 	return chain;
1292f7760dadSAlex Elder out_err:
1293f7760dadSAlex Elder 	bio_chain_put(chain);
1294f7760dadSAlex Elder 
1295602adf40SYehuda Sadeh 	return NULL;
1296602adf40SYehuda Sadeh }
1297602adf40SYehuda Sadeh 
1298926f9b3fSAlex Elder /*
1299926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1300926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1301926f9b3fSAlex Elder  * again.
1302926f9b3fSAlex Elder  */
13036365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13046365d33aSAlex Elder {
13056365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13066365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13076365d33aSAlex Elder 
130857acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13096365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
13106365d33aSAlex Elder 			obj_request);
13116365d33aSAlex Elder 	}
13126365d33aSAlex Elder }
13136365d33aSAlex Elder 
13146365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
13156365d33aSAlex Elder {
13166365d33aSAlex Elder 	smp_mb();
13176365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
13186365d33aSAlex Elder }
13196365d33aSAlex Elder 
132057acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
132157acbaa7SAlex Elder {
132257acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
132357acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
132457acbaa7SAlex Elder 
132557acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
132657acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
132757acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
132857acbaa7SAlex Elder 			obj_request);
132957acbaa7SAlex Elder 	}
133057acbaa7SAlex Elder }
133157acbaa7SAlex Elder 
133257acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
133357acbaa7SAlex Elder {
133457acbaa7SAlex Elder 	smp_mb();
133557acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
133657acbaa7SAlex Elder }
133757acbaa7SAlex Elder 
13385679c59fSAlex Elder /*
13395679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
13405679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
13415679c59fSAlex Elder  *
13425679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
13435679c59fSAlex Elder  * away again.  It's possible that the response from two existence
13445679c59fSAlex Elder  * checks are separated by the creation of the target object, and
13455679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
13465679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
13475679c59fSAlex Elder  */
13485679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
13495679c59fSAlex Elder 				bool exists)
13505679c59fSAlex Elder {
13515679c59fSAlex Elder 	if (exists)
13525679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
13535679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
13545679c59fSAlex Elder 	smp_mb();
13555679c59fSAlex Elder }
13565679c59fSAlex Elder 
13575679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
13585679c59fSAlex Elder {
13595679c59fSAlex Elder 	smp_mb();
13605679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
13615679c59fSAlex Elder }
13625679c59fSAlex Elder 
13635679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
13645679c59fSAlex Elder {
13655679c59fSAlex Elder 	smp_mb();
13665679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
13675679c59fSAlex Elder }
13685679c59fSAlex Elder 
1369bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1370bf0d5f50SAlex Elder {
137137206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
137237206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1373bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1374bf0d5f50SAlex Elder }
1375bf0d5f50SAlex Elder 
1376bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1377bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1378bf0d5f50SAlex Elder {
1379bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
138037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
138137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1382bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1383bf0d5f50SAlex Elder }
1384bf0d5f50SAlex Elder 
1385e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1386e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1387bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1388bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1389bf0d5f50SAlex Elder {
1390bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
139137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
139237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1393e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1394e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1395e93f3152SAlex Elder 	else
1396bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1397bf0d5f50SAlex Elder }
1398bf0d5f50SAlex Elder 
1399bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1400bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1401bf0d5f50SAlex Elder {
140225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
140325dcf954SAlex Elder 
1404b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1405bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
140625dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
14076365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
14086365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1409bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
141025dcf954SAlex Elder 	img_request->obj_request_count++;
141125dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
141237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
141337206ee5SAlex Elder 		obj_request->which);
1414bf0d5f50SAlex Elder }
1415bf0d5f50SAlex Elder 
1416bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1417bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1418bf0d5f50SAlex Elder {
1419bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
142025dcf954SAlex Elder 
142137206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
142237206ee5SAlex Elder 		obj_request->which);
1423bf0d5f50SAlex Elder 	list_del(&obj_request->links);
142425dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
142525dcf954SAlex Elder 	img_request->obj_request_count--;
142625dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
142725dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
14286365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1429bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1430bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
143125dcf954SAlex Elder 	obj_request->callback = NULL;
1432bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1433bf0d5f50SAlex Elder }
1434bf0d5f50SAlex Elder 
1435bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1436bf0d5f50SAlex Elder {
1437bf0d5f50SAlex Elder 	switch (type) {
14389969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1439bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1440788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1441bf0d5f50SAlex Elder 		return true;
1442bf0d5f50SAlex Elder 	default:
1443bf0d5f50SAlex Elder 		return false;
1444bf0d5f50SAlex Elder 	}
1445bf0d5f50SAlex Elder }
1446bf0d5f50SAlex Elder 
1447bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1448bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1449bf0d5f50SAlex Elder {
145037206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
145137206ee5SAlex Elder 
1452bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1453bf0d5f50SAlex Elder }
1454bf0d5f50SAlex Elder 
1455bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1456bf0d5f50SAlex Elder {
145755f27e09SAlex Elder 
145837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
145955f27e09SAlex Elder 
146055f27e09SAlex Elder 	/*
146155f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
146255f27e09SAlex Elder 	 * count for the image request.  We could instead use
146355f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
146455f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
146555f27e09SAlex Elder 	 */
146655f27e09SAlex Elder 	if (!img_request->result) {
146755f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
146855f27e09SAlex Elder 		u64 xferred = 0;
146955f27e09SAlex Elder 
147055f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
147155f27e09SAlex Elder 			xferred += obj_request->xferred;
147255f27e09SAlex Elder 		img_request->xferred = xferred;
147355f27e09SAlex Elder 	}
147455f27e09SAlex Elder 
1475bf0d5f50SAlex Elder 	if (img_request->callback)
1476bf0d5f50SAlex Elder 		img_request->callback(img_request);
1477bf0d5f50SAlex Elder 	else
1478bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1479bf0d5f50SAlex Elder }
1480bf0d5f50SAlex Elder 
1481788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1482788e2df3SAlex Elder 
1483788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1484788e2df3SAlex Elder {
148537206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
148637206ee5SAlex Elder 
1487788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1488788e2df3SAlex Elder }
1489788e2df3SAlex Elder 
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time.
 * The child and layered flags can be cleared again afterwards (see
 * img_request_child_clear() and img_request_layered_clear()).
 */
14950c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
14960c425248SAlex Elder {
14970c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
14980c425248SAlex Elder 	smp_mb();
14990c425248SAlex Elder }
15000c425248SAlex Elder 
15010c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
15020c425248SAlex Elder {
15030c425248SAlex Elder 	smp_mb();
15040c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
15050c425248SAlex Elder }
15060c425248SAlex Elder 
15079849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
15089849e986SAlex Elder {
15099849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
15109849e986SAlex Elder 	smp_mb();
15119849e986SAlex Elder }
15129849e986SAlex Elder 
1513e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1514e93f3152SAlex Elder {
1515e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1516e93f3152SAlex Elder 	smp_mb();
1517e93f3152SAlex Elder }
1518e93f3152SAlex Elder 
15199849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
15209849e986SAlex Elder {
15219849e986SAlex Elder 	smp_mb();
15229849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
15239849e986SAlex Elder }
15249849e986SAlex Elder 
1525d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1526d0b2e944SAlex Elder {
1527d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1528d0b2e944SAlex Elder 	smp_mb();
1529d0b2e944SAlex Elder }
1530d0b2e944SAlex Elder 
1531a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1532a2acd00eSAlex Elder {
1533a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1534a2acd00eSAlex Elder 	smp_mb();
1535a2acd00eSAlex Elder }
1536a2acd00eSAlex Elder 
1537d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1538d0b2e944SAlex Elder {
1539d0b2e944SAlex Elder 	smp_mb();
1540d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1541d0b2e944SAlex Elder }
1542d0b2e944SAlex Elder 
15436e2a4505SAlex Elder static void
15446e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
15456e2a4505SAlex Elder {
1546b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1547b9434c5bSAlex Elder 	u64 length = obj_request->length;
1548b9434c5bSAlex Elder 
15496e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15506e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1551b9434c5bSAlex Elder 		xferred, length);
15526e2a4505SAlex Elder 	/*
155317c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
155417c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
155517c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
155617c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
155717c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
155817c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
15596e2a4505SAlex Elder 	 */
1560b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
15616e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1562b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
15636e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1564b9434c5bSAlex Elder 		else
1565b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
15666e2a4505SAlex Elder 		obj_request->result = 0;
1567b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1568b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1569b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1570b9434c5bSAlex Elder 		else
1571b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
15726e2a4505SAlex Elder 	}
157317c1cc1dSJosh Durgin 	obj_request->xferred = length;
15746e2a4505SAlex Elder 	obj_request_done_set(obj_request);
15756e2a4505SAlex Elder }
15766e2a4505SAlex Elder 
1577bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1578bf0d5f50SAlex Elder {
157937206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
158037206ee5SAlex Elder 		obj_request->callback);
1581bf0d5f50SAlex Elder 	if (obj_request->callback)
1582bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1583788e2df3SAlex Elder 	else
1584788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1585bf0d5f50SAlex Elder }
1586bf0d5f50SAlex Elder 
/*
 * Completion handling for osd ops that need no post-processing:
 * just mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
159239bf2c5dSAlex Elder 
1593c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1594bf0d5f50SAlex Elder {
159557acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1596a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
159757acbaa7SAlex Elder 	bool layered = false;
159857acbaa7SAlex Elder 
159957acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
160057acbaa7SAlex Elder 		img_request = obj_request->img_request;
160157acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1602a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
160357acbaa7SAlex Elder 	}
16048b3e1a56SAlex Elder 
16058b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16068b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
16078b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1608a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1609a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
16108b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
16118b3e1a56SAlex Elder 	else if (img_request)
16126e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
16136e2a4505SAlex Elder 	else
161407741308SAlex Elder 		obj_request_done_set(obj_request);
1615bf0d5f50SAlex Elder }
1616bf0d5f50SAlex Elder 
1617c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1618bf0d5f50SAlex Elder {
16191b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
16201b83bef2SSage Weil 		obj_request->result, obj_request->length);
16211b83bef2SSage Weil 	/*
16228b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
16238b3e1a56SAlex Elder 	 * it to our originally-requested length.
16241b83bef2SSage Weil 	 */
16251b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
162607741308SAlex Elder 	obj_request_done_set(obj_request);
1627bf0d5f50SAlex Elder }
1628bf0d5f50SAlex Elder 
/*
 * Completion handling for an osd stat op: the object request is
 * simply marked done.
 *
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1638fbfab539SAlex Elder 
1639bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1640bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1641bf0d5f50SAlex Elder {
1642bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1643bf0d5f50SAlex Elder 	u16 opcode;
1644bf0d5f50SAlex Elder 
164537206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1646bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
164757acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
164857acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
164957acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
165057acbaa7SAlex Elder 	} else {
165157acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
165257acbaa7SAlex Elder 	}
1653bf0d5f50SAlex Elder 
16541b83bef2SSage Weil 	if (osd_req->r_result < 0)
16551b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1656bf0d5f50SAlex Elder 
16577cc69d42SIlya Dryomov 	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1658bf0d5f50SAlex Elder 
1659c47f9371SAlex Elder 	/*
1660c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1661c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1662c47f9371SAlex Elder 	 */
16631b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1664c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
16650ccd5926SIlya Dryomov 
166679528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1667bf0d5f50SAlex Elder 	switch (opcode) {
1668bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1669c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1670bf0d5f50SAlex Elder 		break;
16710ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
16720ccd5926SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
16730ccd5926SIlya Dryomov 		/* fall through */
1674bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1675c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1676bf0d5f50SAlex Elder 		break;
1677fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1678c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1679fbfab539SAlex Elder 		break;
168036be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1681b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
16829969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1683c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
16849969ebc5SAlex Elder 		break;
1685bf0d5f50SAlex Elder 	default:
1686bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1687bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1688bf0d5f50SAlex Elder 		break;
1689bf0d5f50SAlex Elder 	}
1690bf0d5f50SAlex Elder 
169107741308SAlex Elder 	if (obj_request_done_test(obj_request))
1692bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1693bf0d5f50SAlex Elder }
1694bf0d5f50SAlex Elder 
16959d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1696430c28c3SAlex Elder {
1697430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16988c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16999d4df01fSAlex Elder 	u64 snap_id;
1700430c28c3SAlex Elder 
17018c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1702430c28c3SAlex Elder 
17039d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
17048c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17059d4df01fSAlex Elder 			NULL, snap_id, NULL);
17069d4df01fSAlex Elder }
17079d4df01fSAlex Elder 
17089d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
17099d4df01fSAlex Elder {
17109d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17119d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17129d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
17139d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
17149d4df01fSAlex Elder 
17159d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
17169d4df01fSAlex Elder 
17179d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
17189d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17199d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1720430c28c3SAlex Elder }
1721430c28c3SAlex Elder 
17220ccd5926SIlya Dryomov /*
17230ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
17240ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
17250ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
17260ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
17270ccd5926SIlya Dryomov  */
1728bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1729bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1730bf0d5f50SAlex Elder 					bool write_request,
1731deb236b3SIlya Dryomov 					unsigned int num_ops,
1732430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1733bf0d5f50SAlex Elder {
1734bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1735bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1736bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1737bf0d5f50SAlex Elder 
17386365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
17396365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
17406365d33aSAlex Elder 
17410c425248SAlex Elder 		rbd_assert(write_request ==
17420c425248SAlex Elder 				img_request_write_test(img_request));
17430c425248SAlex Elder 		if (write_request)
1744bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1745bf0d5f50SAlex Elder 	}
1746bf0d5f50SAlex Elder 
17470ccd5926SIlya Dryomov 	rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
1748deb236b3SIlya Dryomov 
1749deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1750bf0d5f50SAlex Elder 
1751bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1752deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1753deb236b3SIlya Dryomov 					  GFP_ATOMIC);
1754bf0d5f50SAlex Elder 	if (!osd_req)
1755bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1756bf0d5f50SAlex Elder 
1757430c28c3SAlex Elder 	if (write_request)
1758bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1759430c28c3SAlex Elder 	else
1760bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1761bf0d5f50SAlex Elder 
1762bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1763bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1764bf0d5f50SAlex Elder 
17653c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
17663c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1767bf0d5f50SAlex Elder 
1768bf0d5f50SAlex Elder 	return osd_req;
1769bf0d5f50SAlex Elder }
1770bf0d5f50SAlex Elder 
17710eefd470SAlex Elder /*
17720eefd470SAlex Elder  * Create a copyup osd request based on the information in the
17730ccd5926SIlya Dryomov  * object request supplied.  A copyup request has three osd ops,
17740ccd5926SIlya Dryomov  * a copyup method call, a hint op, and a write op.
17750eefd470SAlex Elder  */
17760eefd470SAlex Elder static struct ceph_osd_request *
17770eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
17780eefd470SAlex Elder {
17790eefd470SAlex Elder 	struct rbd_img_request *img_request;
17800eefd470SAlex Elder 	struct ceph_snap_context *snapc;
17810eefd470SAlex Elder 	struct rbd_device *rbd_dev;
17820eefd470SAlex Elder 	struct ceph_osd_client *osdc;
17830eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
17840eefd470SAlex Elder 
17850eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17860eefd470SAlex Elder 	img_request = obj_request->img_request;
17870eefd470SAlex Elder 	rbd_assert(img_request);
17880eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
17890eefd470SAlex Elder 
17900ccd5926SIlya Dryomov 	/* Allocate and initialize the request, for the three ops */
17910eefd470SAlex Elder 
17920eefd470SAlex Elder 	snapc = img_request->snapc;
17930eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
17940eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
17950ccd5926SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
17960eefd470SAlex Elder 	if (!osd_req)
17970eefd470SAlex Elder 		return NULL;	/* ENOMEM */
17980eefd470SAlex Elder 
17990eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
18000eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
18010eefd470SAlex Elder 	osd_req->r_priv = obj_request;
18020eefd470SAlex Elder 
18033c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
18043c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
18050eefd470SAlex Elder 
18060eefd470SAlex Elder 	return osd_req;
18070eefd470SAlex Elder }
18080eefd470SAlex Elder 
18090eefd470SAlex Elder 
/*
 * Release an osd request by dropping a reference on it with
 * ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);

	return;
}
1814bf0d5f50SAlex Elder 
1815bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1816bf0d5f50SAlex Elder 
1817bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1818bf0d5f50SAlex Elder 						u64 offset, u64 length,
1819bf0d5f50SAlex Elder 						enum obj_request_type type)
1820bf0d5f50SAlex Elder {
1821bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1822bf0d5f50SAlex Elder 	size_t size;
1823bf0d5f50SAlex Elder 	char *name;
1824bf0d5f50SAlex Elder 
1825bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1826bf0d5f50SAlex Elder 
1827bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1828f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1829f907ad55SAlex Elder 	if (!name)
1830bf0d5f50SAlex Elder 		return NULL;
1831bf0d5f50SAlex Elder 
1832868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1833f907ad55SAlex Elder 	if (!obj_request) {
1834f907ad55SAlex Elder 		kfree(name);
1835f907ad55SAlex Elder 		return NULL;
1836f907ad55SAlex Elder 	}
1837f907ad55SAlex Elder 
1838bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1839bf0d5f50SAlex Elder 	obj_request->offset = offset;
1840bf0d5f50SAlex Elder 	obj_request->length = length;
1841926f9b3fSAlex Elder 	obj_request->flags = 0;
1842bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1843bf0d5f50SAlex Elder 	obj_request->type = type;
1844bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1845788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1846bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1847bf0d5f50SAlex Elder 
184837206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
184937206ee5SAlex Elder 		offset, length, (int)type, obj_request);
185037206ee5SAlex Elder 
1851bf0d5f50SAlex Elder 	return obj_request;
1852bf0d5f50SAlex Elder }
1853bf0d5f50SAlex Elder 
1854bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1855bf0d5f50SAlex Elder {
1856bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1857bf0d5f50SAlex Elder 
1858bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1859bf0d5f50SAlex Elder 
186037206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
186137206ee5SAlex Elder 
1862bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1863bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1864bf0d5f50SAlex Elder 
1865bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1866bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1867bf0d5f50SAlex Elder 
1868bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1869bf0d5f50SAlex Elder 	switch (obj_request->type) {
18709969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
18719969ebc5SAlex Elder 		break;		/* Nothing to do */
1872bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1873bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1874bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1875bf0d5f50SAlex Elder 		break;
1876788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1877788e2df3SAlex Elder 		if (obj_request->pages)
1878788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1879788e2df3SAlex Elder 						obj_request->page_count);
1880788e2df3SAlex Elder 		break;
1881bf0d5f50SAlex Elder 	}
1882bf0d5f50SAlex Elder 
1883f907ad55SAlex Elder 	kfree(obj_request->object_name);
1884868311b1SAlex Elder 	obj_request->object_name = NULL;
1885868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1886bf0d5f50SAlex Elder }
1887bf0d5f50SAlex Elder 
1888fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1889fb65d228SAlex Elder 
1890fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1891fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1892fb65d228SAlex Elder {
1893fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1894fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1895fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1896fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1897fb65d228SAlex Elder }
1898fb65d228SAlex Elder 
1899bf0d5f50SAlex Elder /*
1900a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1901a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1902a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1903a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1904a2acd00eSAlex Elder  */
1905a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1906a2acd00eSAlex Elder {
1907a2acd00eSAlex Elder 	int counter;
1908a2acd00eSAlex Elder 
1909a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1910a2acd00eSAlex Elder 		return;
1911a2acd00eSAlex Elder 
1912a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1913a2acd00eSAlex Elder 	if (counter > 0)
1914a2acd00eSAlex Elder 		return;
1915a2acd00eSAlex Elder 
1916a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1917a2acd00eSAlex Elder 
1918a2acd00eSAlex Elder 	if (!counter)
1919a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1920a2acd00eSAlex Elder 	else
1921a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference underflow\n");
1922a2acd00eSAlex Elder }
1923a2acd00eSAlex Elder 
1924a2acd00eSAlex Elder /*
1925a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1926a2acd00eSAlex Elder  * parent.
1927a2acd00eSAlex Elder  *
1928392a9dadSAlex Elder  * We must get the reference before checking for the overlap to
1929392a9dadSAlex Elder  * coordinate properly with zeroing the parent overlap in
1930392a9dadSAlex Elder  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1931392a9dadSAlex Elder  * drop it again if there is no overlap.
1932392a9dadSAlex Elder  *
1933a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
1934a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
1935a2acd00eSAlex Elder  * false otherwise.
1936a2acd00eSAlex Elder  */
1937a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1938a2acd00eSAlex Elder {
1939a2acd00eSAlex Elder 	int counter;
1940a2acd00eSAlex Elder 
1941a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1942a2acd00eSAlex Elder 		return false;
1943a2acd00eSAlex Elder 
1944a2acd00eSAlex Elder 	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1945a2acd00eSAlex Elder 	if (counter > 0 && rbd_dev->parent_overlap)
1946a2acd00eSAlex Elder 		return true;
1947a2acd00eSAlex Elder 
1948a2acd00eSAlex Elder 	/* Image was flattened, but parent is not yet torn down */
1949a2acd00eSAlex Elder 
1950a2acd00eSAlex Elder 	if (counter < 0)
1951a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference overflow\n");
1952a2acd00eSAlex Elder 
1953a2acd00eSAlex Elder 	return false;
1954a2acd00eSAlex Elder }
1955a2acd00eSAlex Elder 
1956bf0d5f50SAlex Elder /*
1957bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1958bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1959bf0d5f50SAlex Elder  * (if there is one).
1960bf0d5f50SAlex Elder  */
1961cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1962cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1963bf0d5f50SAlex Elder 					u64 offset, u64 length,
1964e93f3152SAlex Elder 					bool write_request)
1965bf0d5f50SAlex Elder {
1966bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1967bf0d5f50SAlex Elder 
19681c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1969bf0d5f50SAlex Elder 	if (!img_request)
1970bf0d5f50SAlex Elder 		return NULL;
1971bf0d5f50SAlex Elder 
1972bf0d5f50SAlex Elder 	if (write_request) {
1973bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1974812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1975bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1976bf0d5f50SAlex Elder 	}
1977bf0d5f50SAlex Elder 
1978bf0d5f50SAlex Elder 	img_request->rq = NULL;
1979bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1980bf0d5f50SAlex Elder 	img_request->offset = offset;
1981bf0d5f50SAlex Elder 	img_request->length = length;
19820c425248SAlex Elder 	img_request->flags = 0;
19830c425248SAlex Elder 	if (write_request) {
19840c425248SAlex Elder 		img_request_write_set(img_request);
1985468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
19860c425248SAlex Elder 	} else {
1987bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
19880c425248SAlex Elder 	}
1989a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
1990d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1991bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1992bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1993bf0d5f50SAlex Elder 	img_request->callback = NULL;
1994a5a337d4SAlex Elder 	img_request->result = 0;
1995bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1996bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1997bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1998bf0d5f50SAlex Elder 
199937206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
200037206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
200137206ee5SAlex Elder 		img_request);
200237206ee5SAlex Elder 
2003bf0d5f50SAlex Elder 	return img_request;
2004bf0d5f50SAlex Elder }
2005bf0d5f50SAlex Elder 
2006bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2007bf0d5f50SAlex Elder {
2008bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2009bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2010bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2011bf0d5f50SAlex Elder 
2012bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2013bf0d5f50SAlex Elder 
201437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
201537206ee5SAlex Elder 
2016bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2017bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
201825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2019bf0d5f50SAlex Elder 
2020a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2021a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2022a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2023a2acd00eSAlex Elder 	}
2024a2acd00eSAlex Elder 
20250c425248SAlex Elder 	if (img_request_write_test(img_request))
2026812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2027bf0d5f50SAlex Elder 
20281c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2029bf0d5f50SAlex Elder }
2030bf0d5f50SAlex Elder 
2031e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2032e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2033e93f3152SAlex Elder 					u64 img_offset, u64 length)
2034e93f3152SAlex Elder {
2035e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2036e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2037e93f3152SAlex Elder 
2038e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2039e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2040e93f3152SAlex Elder 
2041e93f3152SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
2042e93f3152SAlex Elder 						img_offset, length, false);
2043e93f3152SAlex Elder 	if (!parent_request)
2044e93f3152SAlex Elder 		return NULL;
2045e93f3152SAlex Elder 
2046e93f3152SAlex Elder 	img_request_child_set(parent_request);
2047e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2048e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2049e93f3152SAlex Elder 
2050e93f3152SAlex Elder 	return parent_request;
2051e93f3152SAlex Elder }
2052e93f3152SAlex Elder 
2053e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2054e93f3152SAlex Elder {
2055e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2056e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2057e93f3152SAlex Elder 
2058e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2059e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2060e93f3152SAlex Elder 
2061e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2062e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2063e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2064e93f3152SAlex Elder 
2065e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2066e93f3152SAlex Elder }
2067e93f3152SAlex Elder 
20681217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
20691217857fSAlex Elder {
20706365d33aSAlex Elder 	struct rbd_img_request *img_request;
20711217857fSAlex Elder 	unsigned int xferred;
20721217857fSAlex Elder 	int result;
20738b3e1a56SAlex Elder 	bool more;
20741217857fSAlex Elder 
20756365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20766365d33aSAlex Elder 	img_request = obj_request->img_request;
20776365d33aSAlex Elder 
20781217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
20791217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
20801217857fSAlex Elder 	result = obj_request->result;
20811217857fSAlex Elder 	if (result) {
20821217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
20831217857fSAlex Elder 
20841217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
20851217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
20861217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
20871217857fSAlex Elder 			obj_request->offset);
20881217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
20891217857fSAlex Elder 			result, xferred);
20901217857fSAlex Elder 		if (!img_request->result)
20911217857fSAlex Elder 			img_request->result = result;
20921217857fSAlex Elder 	}
20931217857fSAlex Elder 
2094f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2095f1a4739fSAlex Elder 
2096f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2097f1a4739fSAlex Elder 		obj_request->pages = NULL;
2098f1a4739fSAlex Elder 		obj_request->page_count = 0;
2099f1a4739fSAlex Elder 	}
2100f1a4739fSAlex Elder 
21018b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
21028b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
21038b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
21048b3e1a56SAlex Elder 	} else {
21058b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
21068b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
21078b3e1a56SAlex Elder 	}
21088b3e1a56SAlex Elder 
21098b3e1a56SAlex Elder 	return more;
21101217857fSAlex Elder }
21111217857fSAlex Elder 
21122169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
21132169238dSAlex Elder {
21142169238dSAlex Elder 	struct rbd_img_request *img_request;
21152169238dSAlex Elder 	u32 which = obj_request->which;
21162169238dSAlex Elder 	bool more = true;
21172169238dSAlex Elder 
21186365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21192169238dSAlex Elder 	img_request = obj_request->img_request;
21202169238dSAlex Elder 
21212169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
21222169238dSAlex Elder 	rbd_assert(img_request != NULL);
21232169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
21242169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
21252169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
21262169238dSAlex Elder 
21272169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
21282169238dSAlex Elder 	if (which != img_request->next_completion)
21292169238dSAlex Elder 		goto out;
21302169238dSAlex Elder 
21312169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
21322169238dSAlex Elder 		rbd_assert(more);
21332169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
21342169238dSAlex Elder 
21352169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
21362169238dSAlex Elder 			break;
21371217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
21382169238dSAlex Elder 		which++;
21392169238dSAlex Elder 	}
21402169238dSAlex Elder 
21412169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
21422169238dSAlex Elder 	img_request->next_completion = which;
21432169238dSAlex Elder out:
21442169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
21452169238dSAlex Elder 
21462169238dSAlex Elder 	if (!more)
21472169238dSAlex Elder 		rbd_img_request_complete(img_request);
21482169238dSAlex Elder }
21492169238dSAlex Elder 
2150f1a4739fSAlex Elder /*
2151f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2152f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2153f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2154f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2155f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2156f1a4739fSAlex Elder  * all data described by the image request.
2157f1a4739fSAlex Elder  */
2158f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2159f1a4739fSAlex Elder 					enum obj_request_type type,
2160f1a4739fSAlex Elder 					void *data_desc)
2161bf0d5f50SAlex Elder {
2162bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2163bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2164bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
21650c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2166a158073cSJingoo Han 	struct bio *bio_list = NULL;
2167f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2168a158073cSJingoo Han 	struct page **pages = NULL;
21697da22d29SAlex Elder 	u64 img_offset;
2170bf0d5f50SAlex Elder 	u64 resid;
2171bf0d5f50SAlex Elder 	u16 opcode;
2172bf0d5f50SAlex Elder 
2173f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2174f1a4739fSAlex Elder 		(int)type, data_desc);
217537206ee5SAlex Elder 
2176430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
21777da22d29SAlex Elder 	img_offset = img_request->offset;
2178bf0d5f50SAlex Elder 	resid = img_request->length;
21794dda41d3SAlex Elder 	rbd_assert(resid > 0);
2180f1a4739fSAlex Elder 
2181f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2182f1a4739fSAlex Elder 		bio_list = data_desc;
21834f024f37SKent Overstreet 		rbd_assert(img_offset ==
21844f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2185f1a4739fSAlex Elder 	} else {
2186f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2187f1a4739fSAlex Elder 		pages = data_desc;
2188f1a4739fSAlex Elder 	}
2189f1a4739fSAlex Elder 
2190bf0d5f50SAlex Elder 	while (resid) {
21912fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2192bf0d5f50SAlex Elder 		const char *object_name;
2193bf0d5f50SAlex Elder 		u64 offset;
2194bf0d5f50SAlex Elder 		u64 length;
21950ccd5926SIlya Dryomov 		unsigned int which = 0;
2196bf0d5f50SAlex Elder 
21977da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2198bf0d5f50SAlex Elder 		if (!object_name)
2199bf0d5f50SAlex Elder 			goto out_unwind;
22007da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
22017da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2202bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2203f1a4739fSAlex Elder 						offset, length, type);
220478c2a44aSAlex Elder 		/* object request has its own copy of the object name */
220578c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2206bf0d5f50SAlex Elder 		if (!obj_request)
2207bf0d5f50SAlex Elder 			goto out_unwind;
220862054da6SIlya Dryomov 
220903507db6SJosh Durgin 		/*
221003507db6SJosh Durgin 		 * set obj_request->img_request before creating the
221103507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
221203507db6SJosh Durgin 		 */
221303507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2214bf0d5f50SAlex Elder 
2215f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2216f1a4739fSAlex Elder 			unsigned int clone_size;
2217f1a4739fSAlex Elder 
2218bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2219bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2220f1a4739fSAlex Elder 			obj_request->bio_list =
2221f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2222f1a4739fSAlex Elder 								&bio_offset,
2223f1a4739fSAlex Elder 								clone_size,
2224bf0d5f50SAlex Elder 								GFP_ATOMIC);
2225bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
222662054da6SIlya Dryomov 				goto out_unwind;
2227f1a4739fSAlex Elder 		} else {
2228f1a4739fSAlex Elder 			unsigned int page_count;
2229f1a4739fSAlex Elder 
2230f1a4739fSAlex Elder 			obj_request->pages = pages;
2231f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2232f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2233f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2234f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2235f1a4739fSAlex Elder 			pages += page_count;
2236f1a4739fSAlex Elder 		}
2237bf0d5f50SAlex Elder 
22380ccd5926SIlya Dryomov 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
22390ccd5926SIlya Dryomov 					     (write_request ? 2 : 1),
22402fa12320SAlex Elder 					     obj_request);
22412fa12320SAlex Elder 		if (!osd_req)
224262054da6SIlya Dryomov 			goto out_unwind;
22432fa12320SAlex Elder 		obj_request->osd_req = osd_req;
22442169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2245430c28c3SAlex Elder 
22460ccd5926SIlya Dryomov 		if (write_request) {
22470ccd5926SIlya Dryomov 			osd_req_op_alloc_hint_init(osd_req, which,
22480ccd5926SIlya Dryomov 					     rbd_obj_bytes(&rbd_dev->header),
22490ccd5926SIlya Dryomov 					     rbd_obj_bytes(&rbd_dev->header));
22500ccd5926SIlya Dryomov 			which++;
22510ccd5926SIlya Dryomov 		}
22520ccd5926SIlya Dryomov 
22530ccd5926SIlya Dryomov 		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
22542fa12320SAlex Elder 				       0, 0);
2255f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
22560ccd5926SIlya Dryomov 			osd_req_op_extent_osd_data_bio(osd_req, which,
2257f1a4739fSAlex Elder 					obj_request->bio_list, length);
2258f1a4739fSAlex Elder 		else
22590ccd5926SIlya Dryomov 			osd_req_op_extent_osd_data_pages(osd_req, which,
2260f1a4739fSAlex Elder 					obj_request->pages, length,
2261f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
22629d4df01fSAlex Elder 
22639d4df01fSAlex Elder 		if (write_request)
22649d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
22659d4df01fSAlex Elder 		else
22669d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2267430c28c3SAlex Elder 
22687da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2269bf0d5f50SAlex Elder 
22707da22d29SAlex Elder 		img_offset += length;
2271bf0d5f50SAlex Elder 		resid -= length;
2272bf0d5f50SAlex Elder 	}
2273bf0d5f50SAlex Elder 
2274bf0d5f50SAlex Elder 	return 0;
2275bf0d5f50SAlex Elder 
2276bf0d5f50SAlex Elder out_unwind:
2277bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
227842dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2279bf0d5f50SAlex Elder 
2280bf0d5f50SAlex Elder 	return -ENOMEM;
2281bf0d5f50SAlex Elder }
2282bf0d5f50SAlex Elder 
22833d7efd18SAlex Elder static void
22840eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
22850eefd470SAlex Elder {
22860eefd470SAlex Elder 	struct rbd_img_request *img_request;
22870eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2288ebda6408SAlex Elder 	struct page **pages;
22890eefd470SAlex Elder 	u32 page_count;
22900eefd470SAlex Elder 
22910eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
22920eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22930eefd470SAlex Elder 	img_request = obj_request->img_request;
22940eefd470SAlex Elder 	rbd_assert(img_request);
22950eefd470SAlex Elder 
22960eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
22970eefd470SAlex Elder 	rbd_assert(rbd_dev);
22980eefd470SAlex Elder 
2299ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2300ebda6408SAlex Elder 	rbd_assert(pages != NULL);
23010eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2302ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2303ebda6408SAlex Elder 	rbd_assert(page_count);
2304ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2305ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
23060eefd470SAlex Elder 
23070eefd470SAlex Elder 	/*
23080eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
23090eefd470SAlex Elder 	 * original write request.  There is no such thing as a
23100eefd470SAlex Elder 	 * successful short write, so if the request was successful
23110eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
23120eefd470SAlex Elder 	 */
23130eefd470SAlex Elder 	if (!obj_request->result)
23140eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
23150eefd470SAlex Elder 
23160eefd470SAlex Elder 	/* Finish up with the normal image object callback */
23170eefd470SAlex Elder 
23180eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
23190eefd470SAlex Elder }
23200eefd470SAlex Elder 
23210eefd470SAlex Elder static void
23223d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
23233d7efd18SAlex Elder {
23243d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
23250eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
23260eefd470SAlex Elder 	struct ceph_osd_client *osdc;
23270eefd470SAlex Elder 	struct rbd_device *rbd_dev;
23283d7efd18SAlex Elder 	struct page **pages;
2329ebda6408SAlex Elder 	u32 page_count;
2330bbea1c1aSAlex Elder 	int img_result;
2331ebda6408SAlex Elder 	u64 parent_length;
2332b91f09f1SAlex Elder 	u64 offset;
2333b91f09f1SAlex Elder 	u64 length;
23343d7efd18SAlex Elder 
23353d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
23363d7efd18SAlex Elder 
23373d7efd18SAlex Elder 	/* First get what we need from the image request */
23383d7efd18SAlex Elder 
23393d7efd18SAlex Elder 	pages = img_request->copyup_pages;
23403d7efd18SAlex Elder 	rbd_assert(pages != NULL);
23413d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2342ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2343ebda6408SAlex Elder 	rbd_assert(page_count);
2344ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
23453d7efd18SAlex Elder 
23463d7efd18SAlex Elder 	orig_request = img_request->obj_request;
23473d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2348b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2349bbea1c1aSAlex Elder 	img_result = img_request->result;
2350ebda6408SAlex Elder 	parent_length = img_request->length;
2351ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
23523d7efd18SAlex Elder 	rbd_img_request_put(img_request);
23533d7efd18SAlex Elder 
235491c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
235591c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
23563d7efd18SAlex Elder 	rbd_assert(rbd_dev);
23573d7efd18SAlex Elder 
2358bbea1c1aSAlex Elder 	/*
2359bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2360bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2361bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2362bbea1c1aSAlex Elder 	 */
2363bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2364bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2365bbea1c1aSAlex Elder 
2366bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2367bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2368bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2369bbea1c1aSAlex Elder 		if (!img_result)
2370bbea1c1aSAlex Elder 			return;
2371bbea1c1aSAlex Elder 	}
2372bbea1c1aSAlex Elder 
2373bbea1c1aSAlex Elder 	if (img_result)
23740eefd470SAlex Elder 		goto out_err;
23753d7efd18SAlex Elder 
23768785b1d4SAlex Elder 	/*
23778785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
23780ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
23798785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
23808785b1d4SAlex Elder 	 * original request, and release the old one.
23818785b1d4SAlex Elder 	 */
2382bbea1c1aSAlex Elder 	img_result = -ENOMEM;
23830eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
23840eefd470SAlex Elder 	if (!osd_req)
23850eefd470SAlex Elder 		goto out_err;
23868785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
23870eefd470SAlex Elder 	orig_request->osd_req = osd_req;
23880eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2389ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
23903d7efd18SAlex Elder 
23910eefd470SAlex Elder 	/* Initialize the copyup op */
23920eefd470SAlex Elder 
23930eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2394ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
23950eefd470SAlex Elder 						false, false);
23960eefd470SAlex Elder 
23970ccd5926SIlya Dryomov 	/* Then the hint op */
23980ccd5926SIlya Dryomov 
23990ccd5926SIlya Dryomov 	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
24000ccd5926SIlya Dryomov 				   rbd_obj_bytes(&rbd_dev->header));
24010ccd5926SIlya Dryomov 
24020ccd5926SIlya Dryomov 	/* And the original write request op */
24030eefd470SAlex Elder 
2404b91f09f1SAlex Elder 	offset = orig_request->offset;
2405b91f09f1SAlex Elder 	length = orig_request->length;
24060ccd5926SIlya Dryomov 	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2407b91f09f1SAlex Elder 					offset, length, 0, 0);
2408b91f09f1SAlex Elder 	if (orig_request->type == OBJ_REQUEST_BIO)
24090ccd5926SIlya Dryomov 		osd_req_op_extent_osd_data_bio(osd_req, 2,
2410b91f09f1SAlex Elder 					orig_request->bio_list, length);
2411b91f09f1SAlex Elder 	else
24120ccd5926SIlya Dryomov 		osd_req_op_extent_osd_data_pages(osd_req, 2,
2413b91f09f1SAlex Elder 					orig_request->pages, length,
2414b91f09f1SAlex Elder 					offset & ~PAGE_MASK, false, false);
24150eefd470SAlex Elder 
24160eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
24170eefd470SAlex Elder 
24180eefd470SAlex Elder 	/* All set, send it off. */
24190eefd470SAlex Elder 
24200eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
24210eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2422bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2423bbea1c1aSAlex Elder 	if (!img_result)
24240eefd470SAlex Elder 		return;
24250eefd470SAlex Elder out_err:
24260eefd470SAlex Elder 	/* Record the error code and complete the request */
24270eefd470SAlex Elder 
2428bbea1c1aSAlex Elder 	orig_request->result = img_result;
24290eefd470SAlex Elder 	orig_request->xferred = 0;
24303d7efd18SAlex Elder 	obj_request_done_set(orig_request);
24313d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
24323d7efd18SAlex Elder }
24333d7efd18SAlex Elder 
24343d7efd18SAlex Elder /*
24353d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
24363d7efd18SAlex Elder  * entire target of the given object request.  This is used for
24373d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
24383d7efd18SAlex Elder  * object request from the image request does not exist.
24393d7efd18SAlex Elder  *
24403d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
24413d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
24423d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
24433d7efd18SAlex Elder  * the original object request for the copyup operation.
24443d7efd18SAlex Elder  *
24453d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
24463d7efd18SAlex Elder  * object request and mark it done so it gets completed.
24473d7efd18SAlex Elder  */
24483d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
24493d7efd18SAlex Elder {
24503d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
24513d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
24523d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
24533d7efd18SAlex Elder 	u64 img_offset;
24543d7efd18SAlex Elder 	u64 length;
24553d7efd18SAlex Elder 	struct page **pages = NULL;
24563d7efd18SAlex Elder 	u32 page_count;
24573d7efd18SAlex Elder 	int result;
24583d7efd18SAlex Elder 
24593d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2460b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
24613d7efd18SAlex Elder 
24623d7efd18SAlex Elder 	img_request = obj_request->img_request;
24633d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
24643d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
24653d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
24663d7efd18SAlex Elder 
24673d7efd18SAlex Elder 	/*
24683d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
24693d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
24703d7efd18SAlex Elder 	 */
24713d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
24723d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
24733d7efd18SAlex Elder 
24743d7efd18SAlex Elder 	/*
2475a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2476a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2477a9e8ba2cSAlex Elder 	 * necessary.
2478a9e8ba2cSAlex Elder 	 */
2479a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2480a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2481a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2482a9e8ba2cSAlex Elder 	}
2483a9e8ba2cSAlex Elder 
2484a9e8ba2cSAlex Elder 	/*
24853d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
24863d7efd18SAlex Elder 	 * from the parent.
24873d7efd18SAlex Elder 	 */
24883d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
24893d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
24903d7efd18SAlex Elder 	if (IS_ERR(pages)) {
24913d7efd18SAlex Elder 		result = PTR_ERR(pages);
24923d7efd18SAlex Elder 		pages = NULL;
24933d7efd18SAlex Elder 		goto out_err;
24943d7efd18SAlex Elder 	}
24953d7efd18SAlex Elder 
24963d7efd18SAlex Elder 	result = -ENOMEM;
2497e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2498e93f3152SAlex Elder 						img_offset, length);
24993d7efd18SAlex Elder 	if (!parent_request)
25003d7efd18SAlex Elder 		goto out_err;
25013d7efd18SAlex Elder 
25023d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
25033d7efd18SAlex Elder 	if (result)
25043d7efd18SAlex Elder 		goto out_err;
25053d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2506ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
25073d7efd18SAlex Elder 
25083d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
25093d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
25103d7efd18SAlex Elder 	if (!result)
25113d7efd18SAlex Elder 		return 0;
25123d7efd18SAlex Elder 
25133d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2514ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
25153d7efd18SAlex Elder 	parent_request->obj_request = NULL;
25163d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
25173d7efd18SAlex Elder out_err:
25183d7efd18SAlex Elder 	if (pages)
25193d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
25203d7efd18SAlex Elder 	if (parent_request)
25213d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
25223d7efd18SAlex Elder 	obj_request->result = result;
25233d7efd18SAlex Elder 	obj_request->xferred = 0;
25243d7efd18SAlex Elder 	obj_request_done_set(obj_request);
25253d7efd18SAlex Elder 
25263d7efd18SAlex Elder 	return result;
25273d7efd18SAlex Elder }
25283d7efd18SAlex Elder 
2529c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2530c5b5ef6cSAlex Elder {
2531c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2532638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2533c5b5ef6cSAlex Elder 	int result;
2534c5b5ef6cSAlex Elder 
2535c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2536c5b5ef6cSAlex Elder 
2537c5b5ef6cSAlex Elder 	/*
2538c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2539c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2540c5b5ef6cSAlex Elder 	 * we're done with the request.
2541c5b5ef6cSAlex Elder 	 */
2542c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2543c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2544912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2545c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2546c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2547c5b5ef6cSAlex Elder 
2548c5b5ef6cSAlex Elder 	result = obj_request->result;
2549c5b5ef6cSAlex Elder 	obj_request->result = 0;
2550c5b5ef6cSAlex Elder 
2551c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2552c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2553c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2554c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2555c5b5ef6cSAlex Elder 
2556638f5abeSAlex Elder 	/*
2557638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2558638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2559638f5abeSAlex Elder 	 * and re-submit the original write request.
2560638f5abeSAlex Elder 	 */
2561638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2562638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2563638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2564638f5abeSAlex Elder 
2565638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2566638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2567638f5abeSAlex Elder 		if (!result)
2568638f5abeSAlex Elder 			return;
2569638f5abeSAlex Elder 	}
2570c5b5ef6cSAlex Elder 
2571c5b5ef6cSAlex Elder 	/*
2572c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2573c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2574c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2575c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2576c5b5ef6cSAlex Elder 	 */
2577c5b5ef6cSAlex Elder 	if (!result) {
2578c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2579c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2580c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2581c5b5ef6cSAlex Elder 	} else if (result) {
2582c5b5ef6cSAlex Elder 		orig_request->result = result;
25833d7efd18SAlex Elder 		goto out;
2584c5b5ef6cSAlex Elder 	}
2585c5b5ef6cSAlex Elder 
2586c5b5ef6cSAlex Elder 	/*
2587c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2588c5b5ef6cSAlex Elder 	 * whether the target object exists.
2589c5b5ef6cSAlex Elder 	 */
2590b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
25913d7efd18SAlex Elder out:
2592c5b5ef6cSAlex Elder 	if (orig_request->result)
2593c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2594c5b5ef6cSAlex Elder }
2595c5b5ef6cSAlex Elder 
2596c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2597c5b5ef6cSAlex Elder {
2598c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2599c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2600c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2601c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2602c5b5ef6cSAlex Elder 	u32 page_count;
2603c5b5ef6cSAlex Elder 	size_t size;
2604c5b5ef6cSAlex Elder 	int ret;
2605c5b5ef6cSAlex Elder 
2606c5b5ef6cSAlex Elder 	/*
2607c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2608c5b5ef6cSAlex Elder 	 *     le64 length;
2609c5b5ef6cSAlex Elder 	 *     struct {
2610c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2611c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2612c5b5ef6cSAlex Elder 	 *     } mtime;
2613c5b5ef6cSAlex Elder 	 */
2614c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2615c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2616c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2617c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2618c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2619c5b5ef6cSAlex Elder 
2620c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2621c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2622c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2623c5b5ef6cSAlex Elder 	if (!stat_request)
2624c5b5ef6cSAlex Elder 		goto out;
2625c5b5ef6cSAlex Elder 
2626c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2627c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2628c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2629c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2630c5b5ef6cSAlex Elder 
2631c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2632c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2633deb236b3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2634c5b5ef6cSAlex Elder 						   stat_request);
2635c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2636c5b5ef6cSAlex Elder 		goto out;
2637c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2638c5b5ef6cSAlex Elder 
2639c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2640c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2641c5b5ef6cSAlex Elder 					false, false);
26429d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2643c5b5ef6cSAlex Elder 
2644c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2645c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2646c5b5ef6cSAlex Elder out:
2647c5b5ef6cSAlex Elder 	if (ret)
2648c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2649c5b5ef6cSAlex Elder 
2650c5b5ef6cSAlex Elder 	return ret;
2651c5b5ef6cSAlex Elder }
2652c5b5ef6cSAlex Elder 
2653b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2654b454e36dSAlex Elder {
2655b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2656a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
26573d7efd18SAlex Elder 	bool known;
2658b454e36dSAlex Elder 
2659b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2660b454e36dSAlex Elder 
2661b454e36dSAlex Elder 	img_request = obj_request->img_request;
2662b454e36dSAlex Elder 	rbd_assert(img_request);
2663a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2664b454e36dSAlex Elder 
2665b454e36dSAlex Elder 	/*
2666a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2667a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2668a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2669a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2670a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2671a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2672a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2673a9e8ba2cSAlex Elder 	 * simple object request.
2674b454e36dSAlex Elder 	 */
2675b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2676b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2677a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
26783d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
26793d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2680b454e36dSAlex Elder 
2681b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2682b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2683b454e36dSAlex Elder 
2684b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2685b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2686b454e36dSAlex Elder 
2687b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2688b454e36dSAlex Elder 	}
2689b454e36dSAlex Elder 
2690b454e36dSAlex Elder 	/*
26913d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
26923d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
26933d7efd18SAlex Elder 	 * start by reading the data for the full target object from
26943d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2695b454e36dSAlex Elder 	 */
26963d7efd18SAlex Elder 	if (known)
26973d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
26983d7efd18SAlex Elder 
26993d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2700b454e36dSAlex Elder 
2701b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2702b454e36dSAlex Elder }
2703b454e36dSAlex Elder 
2704bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2705bf0d5f50SAlex Elder {
2706bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
270746faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2708bf0d5f50SAlex Elder 
270937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
271046faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2711bf0d5f50SAlex Elder 		int ret;
2712bf0d5f50SAlex Elder 
2713b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2714bf0d5f50SAlex Elder 		if (ret)
2715bf0d5f50SAlex Elder 			return ret;
2716bf0d5f50SAlex Elder 	}
2717bf0d5f50SAlex Elder 
2718bf0d5f50SAlex Elder 	return 0;
2719bf0d5f50SAlex Elder }
2720bf0d5f50SAlex Elder 
27218b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
27228b3e1a56SAlex Elder {
27238b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2724a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2725a9e8ba2cSAlex Elder 	u64 obj_end;
272602c74fbaSAlex Elder 	u64 img_xferred;
272702c74fbaSAlex Elder 	int img_result;
27288b3e1a56SAlex Elder 
27298b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
27308b3e1a56SAlex Elder 
273102c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
273202c74fbaSAlex Elder 
27338b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
273402c74fbaSAlex Elder 	img_xferred = img_request->xferred;
273502c74fbaSAlex Elder 	img_result = img_request->result;
273602c74fbaSAlex Elder 	rbd_img_request_put(img_request);
273702c74fbaSAlex Elder 
273802c74fbaSAlex Elder 	/*
273902c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
274002c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
274102c74fbaSAlex Elder 	 * original request.
274202c74fbaSAlex Elder 	 */
2743a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2744a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
274502c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
274602c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
274702c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
27488b3e1a56SAlex Elder 
274902c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
275002c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
275102c74fbaSAlex Elder 		if (!img_result)
275202c74fbaSAlex Elder 			return;
275302c74fbaSAlex Elder 	}
275402c74fbaSAlex Elder 
275502c74fbaSAlex Elder 	obj_request->result = img_result;
2756a9e8ba2cSAlex Elder 	if (obj_request->result)
2757a9e8ba2cSAlex Elder 		goto out;
2758a9e8ba2cSAlex Elder 
2759a9e8ba2cSAlex Elder 	/*
2760a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2761a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2762a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2763a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2764a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2765a9e8ba2cSAlex Elder 	 */
2766a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2767a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2768a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2769a9e8ba2cSAlex Elder 		u64 xferred = 0;
2770a9e8ba2cSAlex Elder 
2771a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2772a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2773a9e8ba2cSAlex Elder 					obj_request->img_offset;
2774a9e8ba2cSAlex Elder 
277502c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2776a9e8ba2cSAlex Elder 	} else {
277702c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2778a9e8ba2cSAlex Elder 	}
2779a9e8ba2cSAlex Elder out:
27808b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
27818b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
27828b3e1a56SAlex Elder }
27838b3e1a56SAlex Elder 
27848b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
27858b3e1a56SAlex Elder {
27868b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
27878b3e1a56SAlex Elder 	int result;
27888b3e1a56SAlex Elder 
27898b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
27908b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
27918b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
27925b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
27938b3e1a56SAlex Elder 
27948b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2795e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
27968b3e1a56SAlex Elder 						obj_request->img_offset,
2797e93f3152SAlex Elder 						obj_request->length);
27988b3e1a56SAlex Elder 	result = -ENOMEM;
27998b3e1a56SAlex Elder 	if (!img_request)
28008b3e1a56SAlex Elder 		goto out_err;
28018b3e1a56SAlex Elder 
28025b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
2803f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2804f1a4739fSAlex Elder 						obj_request->bio_list);
28055b2ab72dSAlex Elder 	else
28065b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
28075b2ab72dSAlex Elder 						obj_request->pages);
28088b3e1a56SAlex Elder 	if (result)
28098b3e1a56SAlex Elder 		goto out_err;
28108b3e1a56SAlex Elder 
28118b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
28128b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
28138b3e1a56SAlex Elder 	if (result)
28148b3e1a56SAlex Elder 		goto out_err;
28158b3e1a56SAlex Elder 
28168b3e1a56SAlex Elder 	return;
28178b3e1a56SAlex Elder out_err:
28188b3e1a56SAlex Elder 	if (img_request)
28198b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
28208b3e1a56SAlex Elder 	obj_request->result = result;
28218b3e1a56SAlex Elder 	obj_request->xferred = 0;
28228b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
28238b3e1a56SAlex Elder }
28248b3e1a56SAlex Elder 
282520e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2826b8d70035SAlex Elder {
2827b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
28282169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2829b8d70035SAlex Elder 	int ret;
2830b8d70035SAlex Elder 
2831b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2832b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2833b8d70035SAlex Elder 	if (!obj_request)
2834b8d70035SAlex Elder 		return -ENOMEM;
2835b8d70035SAlex Elder 
2836b8d70035SAlex Elder 	ret = -ENOMEM;
2837deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2838deb236b3SIlya Dryomov 						  obj_request);
2839b8d70035SAlex Elder 	if (!obj_request->osd_req)
2840b8d70035SAlex Elder 		goto out;
2841b8d70035SAlex Elder 
2842c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2843cc4a38bdSAlex Elder 					notify_id, 0, 0);
28449d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2845430c28c3SAlex Elder 
2846b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2847cf81b60eSAlex Elder 	if (ret)
284820e0af67SJosh Durgin 		goto out;
284920e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
285020e0af67SJosh Durgin out:
2851b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
2852b8d70035SAlex Elder 
2853b8d70035SAlex Elder 	return ret;
2854b8d70035SAlex Elder }
2855b8d70035SAlex Elder 
2856b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2857b8d70035SAlex Elder {
2858b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2859e627db08SAlex Elder 	int ret;
2860b8d70035SAlex Elder 
2861b8d70035SAlex Elder 	if (!rbd_dev)
2862b8d70035SAlex Elder 		return;
2863b8d70035SAlex Elder 
286437206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2865b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2866b8d70035SAlex Elder 		(unsigned int)opcode);
2867e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2868e627db08SAlex Elder 	if (ret)
28693b5cf2a2SAlex Elder 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2870b8d70035SAlex Elder 
287120e0af67SJosh Durgin 	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2872b8d70035SAlex Elder }
2873b8d70035SAlex Elder 
28749969ebc5SAlex Elder /*
28759969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
28769969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
 *
 * Starting: an osd event with rbd_watch_cb() as its callback is
 * created and stored in rbd_dev->watch_event, a WATCH op carrying
 * that event's cookie is sent for the header object, and on success
 * the object request is kept in rbd_dev->watch_request (its
 * reference is retained) and 0 is returned.
 *
 * Tearing down: the existing watch request's osd request is
 * unregistered as a linger request, a WATCH op with its last
 * argument 0 is sent, and on success the reference held through
 * rbd_dev->watch_request is dropped.
 *
 * In both directions the request is submitted and waited for
 * synchronously.  Returns 0 on success; otherwise the error from
 * event creation, -ENOMEM for an allocation failure, or the error
 * from submitting, waiting for, or executing the request.  On any
 * error, and after a successful teardown, the watch event is
 * cancelled and rbd_dev->watch_event is cleared.
 *
 * NOTE(review): when a teardown fails, watch_event is cleared but
 * watch_request is left set, so the two assertions at the top of
 * this function can no longer both hold on a later call -- confirm
 * callers never retry after such a failure.
28779969ebc5SAlex Elder  */
2878fca27065SIlya Dryomov static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
28799969ebc5SAlex Elder {
28809969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
28819969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
28829969ebc5SAlex Elder 	int ret;
28839969ebc5SAlex Elder 
	/*
	 * Starting requires that no event/request exists yet; tearing
	 * down requires that both exist.
	 */
28849969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
28859969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
28869969ebc5SAlex Elder 
28879969ebc5SAlex Elder 	if (start) {
28883c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
28899969ebc5SAlex Elder 						&rbd_dev->watch_event);
28909969ebc5SAlex Elder 		if (ret < 0)
28919969ebc5SAlex Elder 			return ret;
28928eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
28939969ebc5SAlex Elder 	}
28949969ebc5SAlex Elder 
	/* ret stays -ENOMEM for both allocation failures below */
28959969ebc5SAlex Elder 	ret = -ENOMEM;
28969969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
28979969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
28989969ebc5SAlex Elder 	if (!obj_request)
28999969ebc5SAlex Elder 		goto out_cancel;
29009969ebc5SAlex Elder 
2901deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
2902deb236b3SIlya Dryomov 						  obj_request);
2903430c28c3SAlex Elder 	if (!obj_request->osd_req)
2904430c28c3SAlex Elder 		goto out_cancel;
2905430c28c3SAlex Elder 
	/*
	 * A new watch lingers; for a teardown the old watch's osd
	 * request stops lingering before the unwatch op is even
	 * submitted.
	 *
	 * NOTE(review): nothing on the error path below unregisters a
	 * request marked as lingering here -- confirm that dropping
	 * the object request is sufficient if a start fails.
	 */
29068eb87565SAlex Elder 	if (start)
2907975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
29088eb87565SAlex Elder 	else
29096977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2910975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
29112169238dSAlex Elder 
	/* Same op for watch and unwatch; only the final argument differs */
29122169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
29131f3ef788SAlex Elder 				rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
29149d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
29152169238dSAlex Elder 
29169969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
29179969ebc5SAlex Elder 	if (ret)
29189969ebc5SAlex Elder 		goto out_cancel;
29199969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
29209969ebc5SAlex Elder 	if (ret)
29219969ebc5SAlex Elder 		goto out_cancel;
	/* The wait succeeded; now check the result of the op itself */
29229969ebc5SAlex Elder 	ret = obj_request->result;
29239969ebc5SAlex Elder 	if (ret)
29249969ebc5SAlex Elder 		goto out_cancel;
29259969ebc5SAlex Elder 
29268eb87565SAlex Elder 	/*
29278eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
29288eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
29298eb87565SAlex Elder 	 * a pointer to the object request during that time (in
29308eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
29318eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
29328eb87565SAlex Elder 	 * unregistered it.
29338eb87565SAlex Elder 	 */
29348eb87565SAlex Elder 	if (start) {
29358eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
29368eb87565SAlex Elder 
29378eb87565SAlex Elder 		return 0;
29388eb87565SAlex Elder 	}
29398eb87565SAlex Elder 
29408eb87565SAlex Elder 	/* We have successfully torn down the watch request */
29418eb87565SAlex Elder 
29428eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
29438eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
	/* A successful teardown falls through with ret == 0 */
29449969ebc5SAlex Elder out_cancel:
29459969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
29469969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
29479969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
	/* obj_request is NULL only if its allocation failed */
29489969ebc5SAlex Elder 	if (obj_request)
29499969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
29509969ebc5SAlex Elder 
29519969ebc5SAlex Elder 	return ret;
29529969ebc5SAlex Elder }
29539969ebc5SAlex Elder 
2954fca27065SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
2955fca27065SIlya Dryomov {
2956fca27065SIlya Dryomov 	return __rbd_dev_header_watch_sync(rbd_dev, true);
2957fca27065SIlya Dryomov }
2958fca27065SIlya Dryomov 
2959fca27065SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
2960fca27065SIlya Dryomov {
2961fca27065SIlya Dryomov 	int ret;
2962fca27065SIlya Dryomov 
2963fca27065SIlya Dryomov 	ret = __rbd_dev_header_watch_sync(rbd_dev, false);
2964fca27065SIlya Dryomov 	if (ret) {
2965fca27065SIlya Dryomov 		rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
2966fca27065SIlya Dryomov 			 ret);
2967fca27065SIlya Dryomov 	}
2968fca27065SIlya Dryomov }
2969fca27065SIlya Dryomov 
297036be9a76SAlex Elder /*
2971f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2972f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
297336be9a76SAlex Elder  */
297436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
297536be9a76SAlex Elder 			     const char *object_name,
297636be9a76SAlex Elder 			     const char *class_name,
297736be9a76SAlex Elder 			     const char *method_name,
29784157976bSAlex Elder 			     const void *outbound,
297936be9a76SAlex Elder 			     size_t outbound_size,
29804157976bSAlex Elder 			     void *inbound,
2981e2a58ee5SAlex Elder 			     size_t inbound_size)
298236be9a76SAlex Elder {
29832169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
298436be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
298536be9a76SAlex Elder 	struct page **pages;
298636be9a76SAlex Elder 	u32 page_count;
298736be9a76SAlex Elder 	int ret;
298836be9a76SAlex Elder 
298936be9a76SAlex Elder 	/*
29906010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
29916010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
29926010a451SAlex Elder 	 * also supply outbound data--parameters for the object
29936010a451SAlex Elder 	 * method.  Currently if this is present it will be a
29946010a451SAlex Elder 	 * snapshot id.
299536be9a76SAlex Elder 	 */
299636be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
299736be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
299836be9a76SAlex Elder 	if (IS_ERR(pages))
299936be9a76SAlex Elder 		return PTR_ERR(pages);
300036be9a76SAlex Elder 
300136be9a76SAlex Elder 	ret = -ENOMEM;
30026010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
300336be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
300436be9a76SAlex Elder 	if (!obj_request)
300536be9a76SAlex Elder 		goto out;
300636be9a76SAlex Elder 
300736be9a76SAlex Elder 	obj_request->pages = pages;
300836be9a76SAlex Elder 	obj_request->page_count = page_count;
300936be9a76SAlex Elder 
3010deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3011deb236b3SIlya Dryomov 						  obj_request);
301236be9a76SAlex Elder 	if (!obj_request->osd_req)
301336be9a76SAlex Elder 		goto out;
301436be9a76SAlex Elder 
3015c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
301604017e29SAlex Elder 					class_name, method_name);
301704017e29SAlex Elder 	if (outbound_size) {
301804017e29SAlex Elder 		struct ceph_pagelist *pagelist;
301904017e29SAlex Elder 
302004017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
302104017e29SAlex Elder 		if (!pagelist)
302204017e29SAlex Elder 			goto out;
302304017e29SAlex Elder 
302404017e29SAlex Elder 		ceph_pagelist_init(pagelist);
302504017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
302604017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
302704017e29SAlex Elder 						pagelist);
302804017e29SAlex Elder 	}
3029a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3030a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
303144cd188dSAlex Elder 					0, false, false);
30329d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3033430c28c3SAlex Elder 
303436be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
303536be9a76SAlex Elder 	if (ret)
303636be9a76SAlex Elder 		goto out;
303736be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
303836be9a76SAlex Elder 	if (ret)
303936be9a76SAlex Elder 		goto out;
304036be9a76SAlex Elder 
304136be9a76SAlex Elder 	ret = obj_request->result;
304236be9a76SAlex Elder 	if (ret < 0)
304336be9a76SAlex Elder 		goto out;
304457385b51SAlex Elder 
304557385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
304657385b51SAlex Elder 	ret = (int)obj_request->xferred;
3047903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
304836be9a76SAlex Elder out:
304936be9a76SAlex Elder 	if (obj_request)
305036be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
305136be9a76SAlex Elder 	else
305236be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
305336be9a76SAlex Elder 
305436be9a76SAlex Elder 	return ret;
305536be9a76SAlex Elder }
305636be9a76SAlex Elder 
3057bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
3058cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
3059bf0d5f50SAlex Elder {
3060bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
3061bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
3062bf0d5f50SAlex Elder 	struct request *rq;
3063bf0d5f50SAlex Elder 	int result;
3064bf0d5f50SAlex Elder 
3065bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
3066bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
3067bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
3068bf0d5f50SAlex Elder 		u64 offset;
3069bf0d5f50SAlex Elder 		u64 length;
3070bf0d5f50SAlex Elder 
3071bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
3072bf0d5f50SAlex Elder 
3073bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
30744dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
30754dda41d3SAlex Elder 				(int) rq->cmd_type);
30764dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
30774dda41d3SAlex Elder 			continue;
30784dda41d3SAlex Elder 		}
30794dda41d3SAlex Elder 
30804dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
30814dda41d3SAlex Elder 
30824dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
30834dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
30844dda41d3SAlex Elder 
30854dda41d3SAlex Elder 		if (!length) {
30864dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
3087bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
3088bf0d5f50SAlex Elder 			continue;
3089bf0d5f50SAlex Elder 		}
3090bf0d5f50SAlex Elder 
3091bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
3092bf0d5f50SAlex Elder 
3093bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
3094bf0d5f50SAlex Elder 
3095bf0d5f50SAlex Elder 		if (write_request) {
3096bf0d5f50SAlex Elder 			result = -EROFS;
3097bf0d5f50SAlex Elder 			if (read_only)
3098bf0d5f50SAlex Elder 				goto end_request;
3099bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3100bf0d5f50SAlex Elder 		}
3101bf0d5f50SAlex Elder 
31026d292906SAlex Elder 		/*
31036d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
31046d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
31056d292906SAlex Elder 		 * have disappeared by the time our request arrives
31066d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
31076d292906SAlex Elder 		 * we already know.
31086d292906SAlex Elder 		 */
31096d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3110bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
3111bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3112bf0d5f50SAlex Elder 			result = -ENXIO;
3113bf0d5f50SAlex Elder 			goto end_request;
3114bf0d5f50SAlex Elder 		}
3115bf0d5f50SAlex Elder 
3116bf0d5f50SAlex Elder 		result = -EINVAL;
3117c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
3118c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3119c0cd10dbSAlex Elder 				offset, length);
3120bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
3121c0cd10dbSAlex Elder 		}
3122bf0d5f50SAlex Elder 
312300a653e2SAlex Elder 		result = -EIO;
312400a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
312500a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
312600a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
312700a653e2SAlex Elder 			goto end_request;
312800a653e2SAlex Elder 		}
312900a653e2SAlex Elder 
3130bf0d5f50SAlex Elder 		result = -ENOMEM;
3131bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
3132e93f3152SAlex Elder 							write_request);
3133bf0d5f50SAlex Elder 		if (!img_request)
3134bf0d5f50SAlex Elder 			goto end_request;
3135bf0d5f50SAlex Elder 
3136bf0d5f50SAlex Elder 		img_request->rq = rq;
3137bf0d5f50SAlex Elder 
3138f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3139f1a4739fSAlex Elder 						rq->bio);
3140bf0d5f50SAlex Elder 		if (!result)
3141bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
3142bf0d5f50SAlex Elder 		if (result)
3143bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
3144bf0d5f50SAlex Elder end_request:
3145bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
3146bf0d5f50SAlex Elder 		if (result < 0) {
31477da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
31487da22d29SAlex Elder 				write_request ? "write" : "read",
31497da22d29SAlex Elder 				length, offset, result);
31507da22d29SAlex Elder 
3151bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
3152bf0d5f50SAlex Elder 		}
3153bf0d5f50SAlex Elder 	}
3154bf0d5f50SAlex Elder }
3155bf0d5f50SAlex Elder 
3156602adf40SYehuda Sadeh /*
3157602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3158602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3159f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3160602adf40SYehuda Sadeh  */
3161602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3162602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3163602adf40SYehuda Sadeh {
3164602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3165e5cfeed2SAlex Elder 	sector_t sector_offset;
3166e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3167e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3168e5cfeed2SAlex Elder 	int ret;
3169602adf40SYehuda Sadeh 
3170e5cfeed2SAlex Elder 	/*
3171e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3172e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3173e5cfeed2SAlex Elder 	 * device.
3174e5cfeed2SAlex Elder 	 */
3175e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3176e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3177e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3178593a9e7bSAlex Elder 
3179e5cfeed2SAlex Elder 	/*
3180e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3181e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3182e5cfeed2SAlex Elder 	 */
3183e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3184e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3185e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3186e5cfeed2SAlex Elder 	else
3187e5cfeed2SAlex Elder 		ret = 0;
3188e5cfeed2SAlex Elder 
3189e5cfeed2SAlex Elder 	/*
3190e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3191e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3192e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3193e5cfeed2SAlex Elder 	 * added to an empty bio."
3194e5cfeed2SAlex Elder 	 */
3195e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3196e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3197e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3198e5cfeed2SAlex Elder 
3199e5cfeed2SAlex Elder 	return ret;
3200602adf40SYehuda Sadeh }
3201602adf40SYehuda Sadeh 
3202602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3203602adf40SYehuda Sadeh {
3204602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3205602adf40SYehuda Sadeh 
3206602adf40SYehuda Sadeh 	if (!disk)
3207602adf40SYehuda Sadeh 		return;
3208602adf40SYehuda Sadeh 
3209a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3210a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3211602adf40SYehuda Sadeh 		del_gendisk(disk);
3212602adf40SYehuda Sadeh 		if (disk->queue)
3213602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
3214a0cab924SAlex Elder 	}
3215602adf40SYehuda Sadeh 	put_disk(disk);
3216602adf40SYehuda Sadeh }
3217602adf40SYehuda Sadeh 
3218788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3219788e2df3SAlex Elder 				const char *object_name,
32207097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3221788e2df3SAlex Elder 
3222788e2df3SAlex Elder {
32232169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3224788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3225788e2df3SAlex Elder 	struct page **pages = NULL;
3226788e2df3SAlex Elder 	u32 page_count;
32271ceae7efSAlex Elder 	size_t size;
3228788e2df3SAlex Elder 	int ret;
3229788e2df3SAlex Elder 
3230788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3231788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3232788e2df3SAlex Elder 	if (IS_ERR(pages))
3233788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3234788e2df3SAlex Elder 
3235788e2df3SAlex Elder 	ret = -ENOMEM;
3236788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3237788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3238788e2df3SAlex Elder 	if (!obj_request)
3239788e2df3SAlex Elder 		goto out;
3240788e2df3SAlex Elder 
3241788e2df3SAlex Elder 	obj_request->pages = pages;
3242788e2df3SAlex Elder 	obj_request->page_count = page_count;
3243788e2df3SAlex Elder 
3244deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3245deb236b3SIlya Dryomov 						  obj_request);
3246788e2df3SAlex Elder 	if (!obj_request->osd_req)
3247788e2df3SAlex Elder 		goto out;
3248788e2df3SAlex Elder 
3249c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3250c99d2d4aSAlex Elder 					offset, length, 0, 0);
3251406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3252a4ce40a9SAlex Elder 					obj_request->pages,
325344cd188dSAlex Elder 					obj_request->length,
325444cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
325544cd188dSAlex Elder 					false, false);
32569d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3257430c28c3SAlex Elder 
3258788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3259788e2df3SAlex Elder 	if (ret)
3260788e2df3SAlex Elder 		goto out;
3261788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3262788e2df3SAlex Elder 	if (ret)
3263788e2df3SAlex Elder 		goto out;
3264788e2df3SAlex Elder 
3265788e2df3SAlex Elder 	ret = obj_request->result;
3266788e2df3SAlex Elder 	if (ret < 0)
3267788e2df3SAlex Elder 		goto out;
32681ceae7efSAlex Elder 
32691ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
32701ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3271903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
327223ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
327323ed6e13SAlex Elder 	ret = (int)size;
3274788e2df3SAlex Elder out:
3275788e2df3SAlex Elder 	if (obj_request)
3276788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3277788e2df3SAlex Elder 	else
3278788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3279788e2df3SAlex Elder 
3280788e2df3SAlex Elder 	return ret;
3281788e2df3SAlex Elder }
3282788e2df3SAlex Elder 
3283602adf40SYehuda Sadeh /*
3284662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3285662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3286662518b1SAlex Elder  * information about the image.
32874156d998SAlex Elder  */
328899a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
32894156d998SAlex Elder {
32904156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
32914156d998SAlex Elder 	u32 snap_count = 0;
32924156d998SAlex Elder 	u64 names_size = 0;
32934156d998SAlex Elder 	u32 want_count;
32944156d998SAlex Elder 	int ret;
32954156d998SAlex Elder 
32964156d998SAlex Elder 	/*
32974156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
32984156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
32994156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
33004156d998SAlex Elder 	 * the number of snapshots could change by the time we read
33014156d998SAlex Elder 	 * it in, in which case we re-read it.
33024156d998SAlex Elder 	 */
33034156d998SAlex Elder 	do {
33044156d998SAlex Elder 		size_t size;
33054156d998SAlex Elder 
33064156d998SAlex Elder 		kfree(ondisk);
33074156d998SAlex Elder 
33084156d998SAlex Elder 		size = sizeof (*ondisk);
33094156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
33104156d998SAlex Elder 		size += names_size;
33114156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
33124156d998SAlex Elder 		if (!ondisk)
3313662518b1SAlex Elder 			return -ENOMEM;
33144156d998SAlex Elder 
3315788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
33167097f8dfSAlex Elder 				       0, size, ondisk);
33174156d998SAlex Elder 		if (ret < 0)
3318662518b1SAlex Elder 			goto out;
3319c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
33204156d998SAlex Elder 			ret = -ENXIO;
332106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
332206ecc6cbSAlex Elder 				size, ret);
3323662518b1SAlex Elder 			goto out;
33244156d998SAlex Elder 		}
33254156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
33264156d998SAlex Elder 			ret = -ENXIO;
332706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3328662518b1SAlex Elder 			goto out;
33294156d998SAlex Elder 		}
33304156d998SAlex Elder 
33314156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
33324156d998SAlex Elder 		want_count = snap_count;
33334156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
33344156d998SAlex Elder 	} while (snap_count != want_count);
33354156d998SAlex Elder 
3336662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3337662518b1SAlex Elder out:
33384156d998SAlex Elder 	kfree(ondisk);
33394156d998SAlex Elder 
3340dfc5606dSYehuda Sadeh 	return ret;
3341602adf40SYehuda Sadeh }
3342602adf40SYehuda Sadeh 
334315228edeSAlex Elder /*
334415228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
334515228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
334615228edeSAlex Elder  */
334715228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
334815228edeSAlex Elder {
334915228edeSAlex Elder 	u64 snap_id;
335015228edeSAlex Elder 
335115228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
335215228edeSAlex Elder 		return;
335315228edeSAlex Elder 
335415228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
335515228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
335615228edeSAlex Elder 		return;
335715228edeSAlex Elder 
335815228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
335915228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
336015228edeSAlex Elder }
336115228edeSAlex Elder 
33629875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
33639875201eSJosh Durgin {
33649875201eSJosh Durgin 	sector_t size;
33659875201eSJosh Durgin 	bool removing;
33669875201eSJosh Durgin 
33679875201eSJosh Durgin 	/*
33689875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
33699875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
33709875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
33719875201eSJosh Durgin 	 */
33729875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
33739875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
33749875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
33759875201eSJosh Durgin 	/*
33769875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
33779875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
33789875201eSJosh Durgin 	 */
33799875201eSJosh Durgin 	if (!removing) {
33809875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
33819875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
33829875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
33839875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
33849875201eSJosh Durgin 	}
33859875201eSJosh Durgin }
33869875201eSJosh Durgin 
3387cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
33881fe5e993SAlex Elder {
3389e627db08SAlex Elder 	u64 mapping_size;
33901fe5e993SAlex Elder 	int ret;
33911fe5e993SAlex Elder 
3392117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3393cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
33943b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3395117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
339699a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
3397117973fbSAlex Elder 	else
33982df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
339915228edeSAlex Elder 
340015228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
340115228edeSAlex Elder 
340215228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
3403cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
3404cfbf6377SAlex Elder 
340500a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
34069875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
340700a653e2SAlex Elder 	}
34081fe5e993SAlex Elder 
34091fe5e993SAlex Elder 	return ret;
34101fe5e993SAlex Elder }
34111fe5e993SAlex Elder 
3412602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3413602adf40SYehuda Sadeh {
3414602adf40SYehuda Sadeh 	struct gendisk *disk;
3415602adf40SYehuda Sadeh 	struct request_queue *q;
3416593a9e7bSAlex Elder 	u64 segment_size;
3417602adf40SYehuda Sadeh 
3418602adf40SYehuda Sadeh 	/* create gendisk info */
34197e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
34207e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
34217e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3422602adf40SYehuda Sadeh 	if (!disk)
34231fcdb8aaSAlex Elder 		return -ENOMEM;
3424602adf40SYehuda Sadeh 
3425f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3426de71a297SAlex Elder 		 rbd_dev->dev_id);
3427602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3428dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
34297e513d43SIlya Dryomov 	if (single_major)
34307e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3431602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3432602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3433602adf40SYehuda Sadeh 
3434bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3435602adf40SYehuda Sadeh 	if (!q)
3436602adf40SYehuda Sadeh 		goto out_disk;
3437029bcbd8SJosh Durgin 
3438593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3439593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3440593a9e7bSAlex Elder 
3441029bcbd8SJosh Durgin 	/* set io sizes to object size */
3442593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3443593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3444593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3445593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3446593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3447029bcbd8SJosh Durgin 
3448602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3449602adf40SYehuda Sadeh 	disk->queue = q;
3450602adf40SYehuda Sadeh 
3451602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3452602adf40SYehuda Sadeh 
3453602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3454602adf40SYehuda Sadeh 
3455602adf40SYehuda Sadeh 	return 0;
3456602adf40SYehuda Sadeh out_disk:
3457602adf40SYehuda Sadeh 	put_disk(disk);
34581fcdb8aaSAlex Elder 
34591fcdb8aaSAlex Elder 	return -ENOMEM;
3460602adf40SYehuda Sadeh }
3461602adf40SYehuda Sadeh 
3462dfc5606dSYehuda Sadeh /*
3463dfc5606dSYehuda Sadeh   sysfs
3464dfc5606dSYehuda Sadeh */
3465602adf40SYehuda Sadeh 
3466593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3467593a9e7bSAlex Elder {
3468593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3469593a9e7bSAlex Elder }
3470593a9e7bSAlex Elder 
3471dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3472dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3473602adf40SYehuda Sadeh {
3474593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3475dfc5606dSYehuda Sadeh 
3476fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3477fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3478602adf40SYehuda Sadeh }
3479602adf40SYehuda Sadeh 
348034b13184SAlex Elder /*
348134b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
348234b13184SAlex Elder  * necessarily the base image.
348334b13184SAlex Elder  */
348434b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
348534b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
348634b13184SAlex Elder {
348734b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
348834b13184SAlex Elder 
348934b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
349034b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
349134b13184SAlex Elder }
349234b13184SAlex Elder 
3493dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3494dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3495602adf40SYehuda Sadeh {
3496593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3497dfc5606dSYehuda Sadeh 
3498fc71d833SAlex Elder 	if (rbd_dev->major)
3499dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3500fc71d833SAlex Elder 
3501fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3502dd82fff1SIlya Dryomov }
3503fc71d833SAlex Elder 
3504dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3505dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3506dd82fff1SIlya Dryomov {
3507dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3508dd82fff1SIlya Dryomov 
3509dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3510dfc5606dSYehuda Sadeh }
3511dfc5606dSYehuda Sadeh 
3512dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3513dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3514dfc5606dSYehuda Sadeh {
3515593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3516dfc5606dSYehuda Sadeh 
35171dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
35181dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3519dfc5606dSYehuda Sadeh }
3520dfc5606dSYehuda Sadeh 
3521dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3522dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3523dfc5606dSYehuda Sadeh {
3524593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3525dfc5606dSYehuda Sadeh 
35260d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3527dfc5606dSYehuda Sadeh }
3528dfc5606dSYehuda Sadeh 
35299bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
35309bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
35319bb2f334SAlex Elder {
35329bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
35339bb2f334SAlex Elder 
35340d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
35350d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
35369bb2f334SAlex Elder }
35379bb2f334SAlex Elder 
3538dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3539dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3540dfc5606dSYehuda Sadeh {
3541593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3542dfc5606dSYehuda Sadeh 
3543a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
35440d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3545a92ffdf8SAlex Elder 
3546a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3547dfc5606dSYehuda Sadeh }
3548dfc5606dSYehuda Sadeh 
3549589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3550589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3551589d30e0SAlex Elder {
3552589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3553589d30e0SAlex Elder 
35540d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3555589d30e0SAlex Elder }
3556589d30e0SAlex Elder 
355734b13184SAlex Elder /*
355834b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
355934b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
356034b13184SAlex Elder  */
3561dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3562dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3563dfc5606dSYehuda Sadeh 			     char *buf)
3564dfc5606dSYehuda Sadeh {
3565593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3566dfc5606dSYehuda Sadeh 
35670d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3568dfc5606dSYehuda Sadeh }
3569dfc5606dSYehuda Sadeh 
357086b00e0dSAlex Elder /*
357186b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
357286b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
357386b00e0dSAlex Elder  * "(no parent image)".
357486b00e0dSAlex Elder  */
357586b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
357686b00e0dSAlex Elder 			     struct device_attribute *attr,
357786b00e0dSAlex Elder 			     char *buf)
357886b00e0dSAlex Elder {
357986b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
358086b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
358186b00e0dSAlex Elder 	int count;
358286b00e0dSAlex Elder 	char *bufp = buf;
358386b00e0dSAlex Elder 
358486b00e0dSAlex Elder 	if (!spec)
358586b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
358686b00e0dSAlex Elder 
358786b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
358886b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
358986b00e0dSAlex Elder 	if (count < 0)
359086b00e0dSAlex Elder 		return count;
359186b00e0dSAlex Elder 	bufp += count;
359286b00e0dSAlex Elder 
359386b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
359486b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
359586b00e0dSAlex Elder 	if (count < 0)
359686b00e0dSAlex Elder 		return count;
359786b00e0dSAlex Elder 	bufp += count;
359886b00e0dSAlex Elder 
359986b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
360086b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
360186b00e0dSAlex Elder 	if (count < 0)
360286b00e0dSAlex Elder 		return count;
360386b00e0dSAlex Elder 	bufp += count;
360486b00e0dSAlex Elder 
360586b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
360686b00e0dSAlex Elder 	if (count < 0)
360786b00e0dSAlex Elder 		return count;
360886b00e0dSAlex Elder 	bufp += count;
360986b00e0dSAlex Elder 
361086b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
361186b00e0dSAlex Elder }
361286b00e0dSAlex Elder 
3613dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3614dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3615dfc5606dSYehuda Sadeh 				 const char *buf,
3616dfc5606dSYehuda Sadeh 				 size_t size)
3617dfc5606dSYehuda Sadeh {
3618593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3619b813623aSAlex Elder 	int ret;
3620602adf40SYehuda Sadeh 
3621cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3622e627db08SAlex Elder 	if (ret)
3623e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3624b813623aSAlex Elder 
3625b813623aSAlex Elder 	return ret < 0 ? ret : size;
3626dfc5606dSYehuda Sadeh }
3627602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes, declared in the same order in which
 * they are listed in rbd_attrs[].  All are world-readable except
 * "refresh", which is write-only for the owner.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3640dfc5606dSYehuda Sadeh 
3641dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3642dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
364334b13184SAlex Elder 	&dev_attr_features.attr,
3644dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3645dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
3646dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3647dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
36489bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3649dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3650589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3651dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
365286b00e0dSAlex Elder 	&dev_attr_parent.attr,
3653dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3654dfc5606dSYehuda Sadeh 	NULL
3655dfc5606dSYehuda Sadeh };
3656dfc5606dSYehuda Sadeh 
3657dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3658dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3659dfc5606dSYehuda Sadeh };
3660dfc5606dSYehuda Sadeh 
3661dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3662dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3663dfc5606dSYehuda Sadeh 	NULL
3664dfc5606dSYehuda Sadeh };
3665dfc5606dSYehuda Sadeh 
/*
 * Release callback for the rbd device type.  Deliberately does
 * nothing with @dev.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Intentionally empty */
}
3669dfc5606dSYehuda Sadeh 
3670dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3671dfc5606dSYehuda Sadeh 	.name		= "rbd",
3672dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3673dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3674dfc5606dSYehuda Sadeh };
3675dfc5606dSYehuda Sadeh 
36768b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
36778b8fb99cSAlex Elder {
36788b8fb99cSAlex Elder 	kref_get(&spec->kref);
36798b8fb99cSAlex Elder 
36808b8fb99cSAlex Elder 	return spec;
36818b8fb99cSAlex Elder }
36828b8fb99cSAlex Elder 
36838b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
36848b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
36858b8fb99cSAlex Elder {
36868b8fb99cSAlex Elder 	if (spec)
36878b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
36888b8fb99cSAlex Elder }
36898b8fb99cSAlex Elder 
36908b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
36918b8fb99cSAlex Elder {
36928b8fb99cSAlex Elder 	struct rbd_spec *spec;
36938b8fb99cSAlex Elder 
36948b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
36958b8fb99cSAlex Elder 	if (!spec)
36968b8fb99cSAlex Elder 		return NULL;
36978b8fb99cSAlex Elder 	kref_init(&spec->kref);
36988b8fb99cSAlex Elder 
36998b8fb99cSAlex Elder 	return spec;
37008b8fb99cSAlex Elder }
37018b8fb99cSAlex Elder 
37028b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
37038b8fb99cSAlex Elder {
37048b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
37058b8fb99cSAlex Elder 
37068b8fb99cSAlex Elder 	kfree(spec->pool_name);
37078b8fb99cSAlex Elder 	kfree(spec->image_id);
37088b8fb99cSAlex Elder 	kfree(spec->image_name);
37098b8fb99cSAlex Elder 	kfree(spec->snap_name);
37108b8fb99cSAlex Elder 	kfree(spec);
37118b8fb99cSAlex Elder }
37128b8fb99cSAlex Elder 
3713cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3714c53d5893SAlex Elder 				struct rbd_spec *spec)
3715c53d5893SAlex Elder {
3716c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3717c53d5893SAlex Elder 
3718c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3719c53d5893SAlex Elder 	if (!rbd_dev)
3720c53d5893SAlex Elder 		return NULL;
3721c53d5893SAlex Elder 
3722c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
37236d292906SAlex Elder 	rbd_dev->flags = 0;
3724a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
3725c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3726c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3727c53d5893SAlex Elder 
3728c53d5893SAlex Elder 	rbd_dev->spec = spec;
3729c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3730c53d5893SAlex Elder 
37310903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
37320903e875SAlex Elder 
37330903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
37340903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
37350903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
37360903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
37370903e875SAlex Elder 
3738c53d5893SAlex Elder 	return rbd_dev;
3739c53d5893SAlex Elder }
3740c53d5893SAlex Elder 
3741c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3742c53d5893SAlex Elder {
3743c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3744c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3745c53d5893SAlex Elder 	kfree(rbd_dev);
3746c53d5893SAlex Elder }
3747c53d5893SAlex Elder 
3748dfc5606dSYehuda Sadeh /*
37499d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
37509d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
37519d475de5SAlex Elder  * image.
37529d475de5SAlex Elder  */
37539d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
37549d475de5SAlex Elder 				u8 *order, u64 *snap_size)
37559d475de5SAlex Elder {
37569d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
37579d475de5SAlex Elder 	int ret;
37589d475de5SAlex Elder 	struct {
37599d475de5SAlex Elder 		u8 order;
37609d475de5SAlex Elder 		__le64 size;
37619d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
37629d475de5SAlex Elder 
376336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
37649d475de5SAlex Elder 				"rbd", "get_size",
37654157976bSAlex Elder 				&snapid, sizeof (snapid),
3766e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
376736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
37689d475de5SAlex Elder 	if (ret < 0)
37699d475de5SAlex Elder 		return ret;
377057385b51SAlex Elder 	if (ret < sizeof (size_buf))
377157385b51SAlex Elder 		return -ERANGE;
37729d475de5SAlex Elder 
3773c3545579SJosh Durgin 	if (order) {
37749d475de5SAlex Elder 		*order = size_buf.order;
3775c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
3776c3545579SJosh Durgin 	}
37779d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
37789d475de5SAlex Elder 
3779c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
3780c3545579SJosh Durgin 		(unsigned long long)snap_id,
37819d475de5SAlex Elder 		(unsigned long long)*snap_size);
37829d475de5SAlex Elder 
37839d475de5SAlex Elder 	return 0;
37849d475de5SAlex Elder }
37859d475de5SAlex Elder 
37869d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
37879d475de5SAlex Elder {
37889d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
37899d475de5SAlex Elder 					&rbd_dev->header.obj_order,
37909d475de5SAlex Elder 					&rbd_dev->header.image_size);
37919d475de5SAlex Elder }
37929d475de5SAlex Elder 
37931e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
37941e130199SAlex Elder {
37951e130199SAlex Elder 	void *reply_buf;
37961e130199SAlex Elder 	int ret;
37971e130199SAlex Elder 	void *p;
37981e130199SAlex Elder 
37991e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
38001e130199SAlex Elder 	if (!reply_buf)
38011e130199SAlex Elder 		return -ENOMEM;
38021e130199SAlex Elder 
380336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38044157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3805e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
380636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
38071e130199SAlex Elder 	if (ret < 0)
38081e130199SAlex Elder 		goto out;
38091e130199SAlex Elder 
38101e130199SAlex Elder 	p = reply_buf;
38111e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
381257385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
381357385b51SAlex Elder 	ret = 0;
38141e130199SAlex Elder 
38151e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
38161e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
38171e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
38181e130199SAlex Elder 	} else {
38191e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
38201e130199SAlex Elder 	}
38211e130199SAlex Elder out:
38221e130199SAlex Elder 	kfree(reply_buf);
38231e130199SAlex Elder 
38241e130199SAlex Elder 	return ret;
38251e130199SAlex Elder }
38261e130199SAlex Elder 
3827b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3828b1b5402aSAlex Elder 		u64 *snap_features)
3829b1b5402aSAlex Elder {
3830b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3831b1b5402aSAlex Elder 	struct {
3832b1b5402aSAlex Elder 		__le64 features;
3833b1b5402aSAlex Elder 		__le64 incompat;
38344157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3835d889140cSAlex Elder 	u64 incompat;
3836b1b5402aSAlex Elder 	int ret;
3837b1b5402aSAlex Elder 
383836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3839b1b5402aSAlex Elder 				"rbd", "get_features",
38404157976bSAlex Elder 				&snapid, sizeof (snapid),
3841e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
384236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3843b1b5402aSAlex Elder 	if (ret < 0)
3844b1b5402aSAlex Elder 		return ret;
384557385b51SAlex Elder 	if (ret < sizeof (features_buf))
384657385b51SAlex Elder 		return -ERANGE;
3847d889140cSAlex Elder 
3848d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
38495cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3850b8f5c6edSAlex Elder 		return -ENXIO;
3851d889140cSAlex Elder 
3852b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3853b1b5402aSAlex Elder 
3854b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3855b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3856b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3857b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3858b1b5402aSAlex Elder 
3859b1b5402aSAlex Elder 	return 0;
3860b1b5402aSAlex Elder }
3861b1b5402aSAlex Elder 
3862b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3863b1b5402aSAlex Elder {
3864b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3865b1b5402aSAlex Elder 						&rbd_dev->header.features);
3866b1b5402aSAlex Elder }
3867b1b5402aSAlex Elder 
386886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
386986b00e0dSAlex Elder {
387086b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
387186b00e0dSAlex Elder 	size_t size;
387286b00e0dSAlex Elder 	void *reply_buf = NULL;
387386b00e0dSAlex Elder 	__le64 snapid;
387486b00e0dSAlex Elder 	void *p;
387586b00e0dSAlex Elder 	void *end;
3876642a2537SAlex Elder 	u64 pool_id;
387786b00e0dSAlex Elder 	char *image_id;
38783b5cf2a2SAlex Elder 	u64 snap_id;
387986b00e0dSAlex Elder 	u64 overlap;
388086b00e0dSAlex Elder 	int ret;
388186b00e0dSAlex Elder 
388286b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
388386b00e0dSAlex Elder 	if (!parent_spec)
388486b00e0dSAlex Elder 		return -ENOMEM;
388586b00e0dSAlex Elder 
388686b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
388786b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
388886b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
388986b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
389086b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
389186b00e0dSAlex Elder 	if (!reply_buf) {
389286b00e0dSAlex Elder 		ret = -ENOMEM;
389386b00e0dSAlex Elder 		goto out_err;
389486b00e0dSAlex Elder 	}
389586b00e0dSAlex Elder 
389686b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
389736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
389886b00e0dSAlex Elder 				"rbd", "get_parent",
38994157976bSAlex Elder 				&snapid, sizeof (snapid),
3900e2a58ee5SAlex Elder 				reply_buf, size);
390136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
390286b00e0dSAlex Elder 	if (ret < 0)
390386b00e0dSAlex Elder 		goto out_err;
390486b00e0dSAlex Elder 
390586b00e0dSAlex Elder 	p = reply_buf;
390657385b51SAlex Elder 	end = reply_buf + ret;
390757385b51SAlex Elder 	ret = -ERANGE;
3908642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
3909392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
3910392a9dadSAlex Elder 		/*
3911392a9dadSAlex Elder 		 * Either the parent never existed, or we have
3912392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
3913392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
3914392a9dadSAlex Elder 		 * layered image disappears we immediately set the
3915392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
3916392a9dadSAlex Elder 		 * requests will be treated as if the image had no
3917392a9dadSAlex Elder 		 * parent.
3918392a9dadSAlex Elder 		 */
3919392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
3920392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
3921392a9dadSAlex Elder 			smp_mb();
3922392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
3923392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
3924392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
3925392a9dadSAlex Elder 		}
3926392a9dadSAlex Elder 
392786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
3928392a9dadSAlex Elder 	}
392986b00e0dSAlex Elder 
39300903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
39310903e875SAlex Elder 
39320903e875SAlex Elder 	ret = -EIO;
3933642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
3934c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3935642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
393657385b51SAlex Elder 		goto out_err;
3937c0cd10dbSAlex Elder 	}
39380903e875SAlex Elder 
3939979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
394086b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
394186b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
394286b00e0dSAlex Elder 		goto out_err;
394386b00e0dSAlex Elder 	}
39443b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
394586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
394686b00e0dSAlex Elder 
39473b5cf2a2SAlex Elder 	/*
39483b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
39493b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
39503b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
39513b5cf2a2SAlex Elder 	 */
39523b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
39533b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
39543b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
39553b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
395686b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
395786b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
39583b5cf2a2SAlex Elder 	}
39593b5cf2a2SAlex Elder 
39603b5cf2a2SAlex Elder 	/*
39613b5cf2a2SAlex Elder 	 * We always update the parent overlap.  If it's zero we
39623b5cf2a2SAlex Elder 	 * treat it specially.
39633b5cf2a2SAlex Elder 	 */
396470cf49cfSAlex Elder 	rbd_dev->parent_overlap = overlap;
39653b5cf2a2SAlex Elder 	smp_mb();
39663b5cf2a2SAlex Elder 	if (!overlap) {
39673b5cf2a2SAlex Elder 
39683b5cf2a2SAlex Elder 		/* A null parent_spec indicates it's the initial probe */
39693b5cf2a2SAlex Elder 
39703b5cf2a2SAlex Elder 		if (parent_spec) {
39713b5cf2a2SAlex Elder 			/*
39723b5cf2a2SAlex Elder 			 * The overlap has become zero, so the clone
39733b5cf2a2SAlex Elder 			 * must have been resized down to 0 at some
39743b5cf2a2SAlex Elder 			 * point.  Treat this the same as a flatten.
39753b5cf2a2SAlex Elder 			 */
39763b5cf2a2SAlex Elder 			rbd_dev_parent_put(rbd_dev);
39773b5cf2a2SAlex Elder 			pr_info("%s: clone image now standalone\n",
39783b5cf2a2SAlex Elder 				rbd_dev->disk->disk_name);
397970cf49cfSAlex Elder 		} else {
39803b5cf2a2SAlex Elder 			/*
39813b5cf2a2SAlex Elder 			 * For the initial probe, if we find the
39823b5cf2a2SAlex Elder 			 * overlap is zero we just pretend there was
39833b5cf2a2SAlex Elder 			 * no parent image.
39843b5cf2a2SAlex Elder 			 */
39853b5cf2a2SAlex Elder 			rbd_warn(rbd_dev, "ignoring parent of "
39863b5cf2a2SAlex Elder 						"clone with overlap 0\n");
39873b5cf2a2SAlex Elder 		}
398870cf49cfSAlex Elder 	}
398986b00e0dSAlex Elder out:
399086b00e0dSAlex Elder 	ret = 0;
399186b00e0dSAlex Elder out_err:
399286b00e0dSAlex Elder 	kfree(reply_buf);
399386b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
399486b00e0dSAlex Elder 
399586b00e0dSAlex Elder 	return ret;
399686b00e0dSAlex Elder }
399786b00e0dSAlex Elder 
3998cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3999cc070d59SAlex Elder {
4000cc070d59SAlex Elder 	struct {
4001cc070d59SAlex Elder 		__le64 stripe_unit;
4002cc070d59SAlex Elder 		__le64 stripe_count;
4003cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4004cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4005cc070d59SAlex Elder 	void *p;
4006cc070d59SAlex Elder 	u64 obj_size;
4007cc070d59SAlex Elder 	u64 stripe_unit;
4008cc070d59SAlex Elder 	u64 stripe_count;
4009cc070d59SAlex Elder 	int ret;
4010cc070d59SAlex Elder 
4011cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4012cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4013e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4014cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4015cc070d59SAlex Elder 	if (ret < 0)
4016cc070d59SAlex Elder 		return ret;
4017cc070d59SAlex Elder 	if (ret < size)
4018cc070d59SAlex Elder 		return -ERANGE;
4019cc070d59SAlex Elder 
4020cc070d59SAlex Elder 	/*
4021cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4022cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4023cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4024cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4025cc070d59SAlex Elder 	 */
4026cc070d59SAlex Elder 	ret = -EINVAL;
4027cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4028cc070d59SAlex Elder 	p = &striping_info_buf;
4029cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4030cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4031cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4032cc070d59SAlex Elder 				"(got %llu want %llu)",
4033cc070d59SAlex Elder 				stripe_unit, obj_size);
4034cc070d59SAlex Elder 		return -EINVAL;
4035cc070d59SAlex Elder 	}
4036cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4037cc070d59SAlex Elder 	if (stripe_count != 1) {
4038cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4039cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4040cc070d59SAlex Elder 		return -EINVAL;
4041cc070d59SAlex Elder 	}
4042500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4043500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4044cc070d59SAlex Elder 
4045cc070d59SAlex Elder 	return 0;
4046cc070d59SAlex Elder }
4047cc070d59SAlex Elder 
40489e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
40499e15b77dSAlex Elder {
40509e15b77dSAlex Elder 	size_t image_id_size;
40519e15b77dSAlex Elder 	char *image_id;
40529e15b77dSAlex Elder 	void *p;
40539e15b77dSAlex Elder 	void *end;
40549e15b77dSAlex Elder 	size_t size;
40559e15b77dSAlex Elder 	void *reply_buf = NULL;
40569e15b77dSAlex Elder 	size_t len = 0;
40579e15b77dSAlex Elder 	char *image_name = NULL;
40589e15b77dSAlex Elder 	int ret;
40599e15b77dSAlex Elder 
40609e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
40619e15b77dSAlex Elder 
406269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
406369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
40649e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
40659e15b77dSAlex Elder 	if (!image_id)
40669e15b77dSAlex Elder 		return NULL;
40679e15b77dSAlex Elder 
40689e15b77dSAlex Elder 	p = image_id;
40694157976bSAlex Elder 	end = image_id + image_id_size;
407069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
40719e15b77dSAlex Elder 
40729e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
40739e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
40749e15b77dSAlex Elder 	if (!reply_buf)
40759e15b77dSAlex Elder 		goto out;
40769e15b77dSAlex Elder 
407736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
40789e15b77dSAlex Elder 				"rbd", "dir_get_name",
40799e15b77dSAlex Elder 				image_id, image_id_size,
4080e2a58ee5SAlex Elder 				reply_buf, size);
40819e15b77dSAlex Elder 	if (ret < 0)
40829e15b77dSAlex Elder 		goto out;
40839e15b77dSAlex Elder 	p = reply_buf;
4084f40eb349SAlex Elder 	end = reply_buf + ret;
4085f40eb349SAlex Elder 
40869e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
40879e15b77dSAlex Elder 	if (IS_ERR(image_name))
40889e15b77dSAlex Elder 		image_name = NULL;
40899e15b77dSAlex Elder 	else
40909e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
40919e15b77dSAlex Elder out:
40929e15b77dSAlex Elder 	kfree(reply_buf);
40939e15b77dSAlex Elder 	kfree(image_id);
40949e15b77dSAlex Elder 
40959e15b77dSAlex Elder 	return image_name;
40969e15b77dSAlex Elder }
40979e15b77dSAlex Elder 
40982ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
40992ad3d716SAlex Elder {
41002ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
41012ad3d716SAlex Elder 	const char *snap_name;
41022ad3d716SAlex Elder 	u32 which = 0;
41032ad3d716SAlex Elder 
41042ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
41052ad3d716SAlex Elder 
41062ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
41072ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
41082ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
41092ad3d716SAlex Elder 			return snapc->snaps[which];
41102ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
41112ad3d716SAlex Elder 		which++;
41122ad3d716SAlex Elder 	}
41132ad3d716SAlex Elder 	return CEPH_NOSNAP;
41142ad3d716SAlex Elder }
41152ad3d716SAlex Elder 
41162ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
41172ad3d716SAlex Elder {
41182ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
41192ad3d716SAlex Elder 	u32 which;
41202ad3d716SAlex Elder 	bool found = false;
41212ad3d716SAlex Elder 	u64 snap_id;
41222ad3d716SAlex Elder 
41232ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
41242ad3d716SAlex Elder 		const char *snap_name;
41252ad3d716SAlex Elder 
41262ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
41272ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4128efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4129efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4130efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4131efadc98aSJosh Durgin 				continue;
4132efadc98aSJosh Durgin 			else
41332ad3d716SAlex Elder 				break;
4134efadc98aSJosh Durgin 		}
41352ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
41362ad3d716SAlex Elder 		kfree(snap_name);
41372ad3d716SAlex Elder 	}
41382ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
41392ad3d716SAlex Elder }
41402ad3d716SAlex Elder 
41412ad3d716SAlex Elder /*
41422ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
41432ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
41442ad3d716SAlex Elder  */
41452ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
41462ad3d716SAlex Elder {
41472ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
41482ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
41492ad3d716SAlex Elder 
41502ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
41512ad3d716SAlex Elder }
41522ad3d716SAlex Elder 
41539e15b77dSAlex Elder /*
41542e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
41552e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
41562e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
41572e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
41582e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
41592e9f7f1cSAlex Elder  * allocated.
4160e1d4213fSAlex Elder  *
4161e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
4162e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
4163e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
41649e15b77dSAlex Elder  */
41652e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
41669e15b77dSAlex Elder {
41672e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
41682e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
41692e9f7f1cSAlex Elder 	const char *pool_name;
41702e9f7f1cSAlex Elder 	const char *image_name;
41712e9f7f1cSAlex Elder 	const char *snap_name;
41729e15b77dSAlex Elder 	int ret;
41739e15b77dSAlex Elder 
4174e1d4213fSAlex Elder 	/*
4175e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
4176e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
4177e1d4213fSAlex Elder 	 */
41782e9f7f1cSAlex Elder 	if (spec->pool_name) {
41792e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
41802ad3d716SAlex Elder 			u64 snap_id;
4181e1d4213fSAlex Elder 
41822ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
41832ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
4184e1d4213fSAlex Elder 				return -ENOENT;
41852ad3d716SAlex Elder 			spec->snap_id = snap_id;
4186e1d4213fSAlex Elder 		} else {
41872e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
4188e1d4213fSAlex Elder 		}
4189e1d4213fSAlex Elder 
4190e1d4213fSAlex Elder 		return 0;
4191e1d4213fSAlex Elder 	}
41929e15b77dSAlex Elder 
41932e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
41949e15b77dSAlex Elder 
41952e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
41962e9f7f1cSAlex Elder 	if (!pool_name) {
41972e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4198935dc89fSAlex Elder 		return -EIO;
4199935dc89fSAlex Elder 	}
42002e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
42012e9f7f1cSAlex Elder 	if (!pool_name)
42029e15b77dSAlex Elder 		return -ENOMEM;
42039e15b77dSAlex Elder 
42049e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
42059e15b77dSAlex Elder 
42062e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
42072e9f7f1cSAlex Elder 	if (!image_name)
420806ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
42099e15b77dSAlex Elder 
42102e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
42119e15b77dSAlex Elder 
42122e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4213da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4214da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
42159e15b77dSAlex Elder 		goto out_err;
42162e9f7f1cSAlex Elder 	}
42172e9f7f1cSAlex Elder 
42182e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
42192e9f7f1cSAlex Elder 	spec->image_name = image_name;
42202e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
42219e15b77dSAlex Elder 
42229e15b77dSAlex Elder 	return 0;
42239e15b77dSAlex Elder out_err:
42242e9f7f1cSAlex Elder 	kfree(image_name);
42252e9f7f1cSAlex Elder 	kfree(pool_name);
42269e15b77dSAlex Elder 
42279e15b77dSAlex Elder 	return ret;
42289e15b77dSAlex Elder }
42299e15b77dSAlex Elder 
4230cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
423135d489f9SAlex Elder {
423235d489f9SAlex Elder 	size_t size;
423335d489f9SAlex Elder 	int ret;
423435d489f9SAlex Elder 	void *reply_buf;
423535d489f9SAlex Elder 	void *p;
423635d489f9SAlex Elder 	void *end;
423735d489f9SAlex Elder 	u64 seq;
423835d489f9SAlex Elder 	u32 snap_count;
423935d489f9SAlex Elder 	struct ceph_snap_context *snapc;
424035d489f9SAlex Elder 	u32 i;
424135d489f9SAlex Elder 
424235d489f9SAlex Elder 	/*
424335d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
424435d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
424535d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
424635d489f9SAlex Elder 	 * prepared to receive.
424735d489f9SAlex Elder 	 */
424835d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
424935d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
425035d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
425135d489f9SAlex Elder 	if (!reply_buf)
425235d489f9SAlex Elder 		return -ENOMEM;
425335d489f9SAlex Elder 
425436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
42554157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4256e2a58ee5SAlex Elder 				reply_buf, size);
425736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
425835d489f9SAlex Elder 	if (ret < 0)
425935d489f9SAlex Elder 		goto out;
426035d489f9SAlex Elder 
426135d489f9SAlex Elder 	p = reply_buf;
426257385b51SAlex Elder 	end = reply_buf + ret;
426357385b51SAlex Elder 	ret = -ERANGE;
426435d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
426535d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
426635d489f9SAlex Elder 
426735d489f9SAlex Elder 	/*
426835d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
426935d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
427035d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
427135d489f9SAlex Elder 	 * allocate is representable in a size_t.
427235d489f9SAlex Elder 	 */
427335d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
427435d489f9SAlex Elder 				 / sizeof (u64)) {
427535d489f9SAlex Elder 		ret = -EINVAL;
427635d489f9SAlex Elder 		goto out;
427735d489f9SAlex Elder 	}
427835d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
427935d489f9SAlex Elder 		goto out;
4280468521c1SAlex Elder 	ret = 0;
428135d489f9SAlex Elder 
4282812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
428335d489f9SAlex Elder 	if (!snapc) {
428435d489f9SAlex Elder 		ret = -ENOMEM;
428535d489f9SAlex Elder 		goto out;
428635d489f9SAlex Elder 	}
428735d489f9SAlex Elder 	snapc->seq = seq;
428835d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
428935d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
429035d489f9SAlex Elder 
429149ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
429235d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
429335d489f9SAlex Elder 
429435d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
429535d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
429635d489f9SAlex Elder out:
429735d489f9SAlex Elder 	kfree(reply_buf);
429835d489f9SAlex Elder 
429957385b51SAlex Elder 	return ret;
430035d489f9SAlex Elder }
430135d489f9SAlex Elder 
430254cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
430354cac61fSAlex Elder 					u64 snap_id)
4304b8b1e2dbSAlex Elder {
4305b8b1e2dbSAlex Elder 	size_t size;
4306b8b1e2dbSAlex Elder 	void *reply_buf;
430754cac61fSAlex Elder 	__le64 snapid;
4308b8b1e2dbSAlex Elder 	int ret;
4309b8b1e2dbSAlex Elder 	void *p;
4310b8b1e2dbSAlex Elder 	void *end;
4311b8b1e2dbSAlex Elder 	char *snap_name;
4312b8b1e2dbSAlex Elder 
4313b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4314b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4315b8b1e2dbSAlex Elder 	if (!reply_buf)
4316b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4317b8b1e2dbSAlex Elder 
431854cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
431936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4320b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
432154cac61fSAlex Elder 				&snapid, sizeof (snapid),
4322e2a58ee5SAlex Elder 				reply_buf, size);
432336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4324f40eb349SAlex Elder 	if (ret < 0) {
4325f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4326b8b1e2dbSAlex Elder 		goto out;
4327f40eb349SAlex Elder 	}
4328b8b1e2dbSAlex Elder 
4329b8b1e2dbSAlex Elder 	p = reply_buf;
4330f40eb349SAlex Elder 	end = reply_buf + ret;
4331e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4332f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4333b8b1e2dbSAlex Elder 		goto out;
4334f40eb349SAlex Elder 
4335b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
433654cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4337b8b1e2dbSAlex Elder out:
4338b8b1e2dbSAlex Elder 	kfree(reply_buf);
4339b8b1e2dbSAlex Elder 
4340f40eb349SAlex Elder 	return snap_name;
4341b8b1e2dbSAlex Elder }
4342b8b1e2dbSAlex Elder 
43432df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4344117973fbSAlex Elder {
43452df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4346117973fbSAlex Elder 	int ret;
4347117973fbSAlex Elder 
43481617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
43491617e40cSJosh Durgin 	if (ret)
4350cfbf6377SAlex Elder 		return ret;
43511617e40cSJosh Durgin 
43522df3fac7SAlex Elder 	if (first_time) {
43532df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
43542df3fac7SAlex Elder 		if (ret)
4355cfbf6377SAlex Elder 			return ret;
43562df3fac7SAlex Elder 	}
43572df3fac7SAlex Elder 
4358642a2537SAlex Elder 	/*
4359642a2537SAlex Elder 	 * If the image supports layering, get the parent info.  We
4360642a2537SAlex Elder 	 * need to probe the first time regardless.  Thereafter we
4361642a2537SAlex Elder 	 * only need to if there's a parent, to see if it has
4362642a2537SAlex Elder 	 * disappeared due to the mapped image getting flattened.
4363642a2537SAlex Elder 	 */
4364642a2537SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4365642a2537SAlex Elder 			(first_time || rbd_dev->parent_spec)) {
4366642a2537SAlex Elder 		bool warn;
4367642a2537SAlex Elder 
4368642a2537SAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
4369642a2537SAlex Elder 		if (ret)
4370cfbf6377SAlex Elder 			return ret;
4371642a2537SAlex Elder 
4372642a2537SAlex Elder 		/*
4373642a2537SAlex Elder 		 * Print a warning if this is the initial probe and
4374642a2537SAlex Elder 		 * the image has a parent.  Don't print it if the
4375642a2537SAlex Elder 		 * image now being probed is itself a parent.  We
4376642a2537SAlex Elder 		 * can tell at this point because we won't know its
4377642a2537SAlex Elder 		 * pool name yet (just its pool id).
4378642a2537SAlex Elder 		 */
4379642a2537SAlex Elder 		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4380642a2537SAlex Elder 		if (first_time && warn)
4381642a2537SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
4382642a2537SAlex Elder 					"is EXPERIMENTAL!");
4383642a2537SAlex Elder 	}
4384642a2537SAlex Elder 
438529334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
438629334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
438729334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4388117973fbSAlex Elder 
4389cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4390117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4391117973fbSAlex Elder 
4392117973fbSAlex Elder 	return ret;
4393117973fbSAlex Elder }
4394117973fbSAlex Elder 
4395dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4396dfc5606dSYehuda Sadeh {
4397dfc5606dSYehuda Sadeh 	struct device *dev;
4398cd789ab9SAlex Elder 	int ret;
4399dfc5606dSYehuda Sadeh 
4400cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4401dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4402dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4403dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4404200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4405de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4406dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4407dfc5606dSYehuda Sadeh 
4408dfc5606dSYehuda Sadeh 	return ret;
4409602adf40SYehuda Sadeh }
4410602adf40SYehuda Sadeh 
4411dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4412dfc5606dSYehuda Sadeh {
4413dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4414dfc5606dSYehuda Sadeh }
4415dfc5606dSYehuda Sadeh 
44161ddbe94eSAlex Elder /*
4417499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4418f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
44191ddbe94eSAlex Elder  */
4420f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4421b7f23c36SAlex Elder {
4422f8a22fc2SIlya Dryomov 	int new_dev_id;
4423f8a22fc2SIlya Dryomov 
44249b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
44259b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
44269b60e70bSIlya Dryomov 				    GFP_KERNEL);
4427f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4428f8a22fc2SIlya Dryomov 		return new_dev_id;
4429f8a22fc2SIlya Dryomov 
4430f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4431499afd5bSAlex Elder 
4432499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4433499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4434499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4435f8a22fc2SIlya Dryomov 
443670eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4437f8a22fc2SIlya Dryomov 
4438f8a22fc2SIlya Dryomov 	return 0;
4439b7f23c36SAlex Elder }
4440b7f23c36SAlex Elder 
44411ddbe94eSAlex Elder /*
4442499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4443499afd5bSAlex Elder  * identifier is no longer in use.
44441ddbe94eSAlex Elder  */
4445e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
44461ddbe94eSAlex Elder {
4447499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4448499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4449499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
44501ddbe94eSAlex Elder 
4451f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4452f8a22fc2SIlya Dryomov 
4453f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4454b7f23c36SAlex Elder }
4455b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none).  Returns the length of the token that starts there, i.e. the
 * number of consecutive non-white-space characters; 0 means no token
 * was found.  The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip leading white space */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
4474e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer with a terminating '\0'.  The string at
 * *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.  A
 * return of 0 means there was no token; a return >= token_size means
 * the token did not fit and the token buffer was left untouched.
 *
 * In every case *buf is advanced to just past the end of the token
 * found, even when it was too long to be copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* No room for token plus '\0' */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
4504e28fff26SAlex Elder 
4505e28fff26SAlex Elder /*
4506ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4507ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4508ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4509ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4510ea3352f4SAlex Elder  *
4511ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4512ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4513ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4514ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4515ea3352f4SAlex Elder  *
4516ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4517ea3352f4SAlex Elder  * the end of the found token.
4518ea3352f4SAlex Elder  *
4519ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4520ea3352f4SAlex Elder  */
4521ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4522ea3352f4SAlex Elder {
4523ea3352f4SAlex Elder 	char *dup;
4524ea3352f4SAlex Elder 	size_t len;
4525ea3352f4SAlex Elder 
4526ea3352f4SAlex Elder 	len = next_token(buf);
45274caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4528ea3352f4SAlex Elder 	if (!dup)
4529ea3352f4SAlex Elder 		return NULL;
4530ea3352f4SAlex Elder 	*(dup + len) = '\0';
4531ea3352f4SAlex Elder 	*buf += len;
4532ea3352f4SAlex Elder 
4533ea3352f4SAlex Elder 	if (lenp)
4534ea3352f4SAlex Elder 		*lenp = len;
4535ea3352f4SAlex Elder 
4536ea3352f4SAlex Elder 	return dup;
4537ea3352f4SAlex Elder }
4538ea3352f4SAlex Elder 
4539ea3352f4SAlex Elder /*
4540859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4541859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4542859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4543859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4544d22f76e7SAlex Elder  *
4545859c31dfSAlex Elder  * The information extracted from these options is recorded in
4546859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4547859c31dfSAlex Elder  * structures:
4548859c31dfSAlex Elder  *  ceph_opts
4549859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4550859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4551859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4552859c31dfSAlex Elder  *  rbd_opts
4553859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4554859c31dfSAlex Elder  *	this function; caller must release with kfree().
4555859c31dfSAlex Elder  *  spec
4556859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4557859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4558859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4559859c31dfSAlex Elder  *
4560859c31dfSAlex Elder  * The options passed take this form:
4561859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4562859c31dfSAlex Elder  * where:
4563859c31dfSAlex Elder  *  <mon_addrs>
4564859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4565859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4566859c31dfSAlex Elder  *      by a port number (separated by a colon).
4567859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4568859c31dfSAlex Elder  *  <options>
4569859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4570859c31dfSAlex Elder  *  <pool_name>
4571859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4572859c31dfSAlex Elder  *  <image_name>
4573859c31dfSAlex Elder  *      The name of the image in that pool to map.
4574859c31dfSAlex Elder  *  <snap_id>
4575859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4576859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4577859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4578859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4579a725f65eSAlex Elder  */
4580859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4581dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4582859c31dfSAlex Elder 				struct rbd_options **opts,
4583859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4584a725f65eSAlex Elder {
4585e28fff26SAlex Elder 	size_t len;
4586859c31dfSAlex Elder 	char *options;
45870ddebc0cSAlex Elder 	const char *mon_addrs;
4588ecb4dc22SAlex Elder 	char *snap_name;
45890ddebc0cSAlex Elder 	size_t mon_addrs_size;
4590859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
45914e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4592859c31dfSAlex Elder 	struct ceph_options *copts;
4593dc79b113SAlex Elder 	int ret;
4594e28fff26SAlex Elder 
4595e28fff26SAlex Elder 	/* The first four tokens are required */
4596e28fff26SAlex Elder 
45977ef3214aSAlex Elder 	len = next_token(&buf);
45984fb5d671SAlex Elder 	if (!len) {
45994fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
46004fb5d671SAlex Elder 		return -EINVAL;
46014fb5d671SAlex Elder 	}
46020ddebc0cSAlex Elder 	mon_addrs = buf;
4603f28e565aSAlex Elder 	mon_addrs_size = len + 1;
46047ef3214aSAlex Elder 	buf += len;
4605a725f65eSAlex Elder 
4606dc79b113SAlex Elder 	ret = -EINVAL;
4607f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4608f28e565aSAlex Elder 	if (!options)
4609dc79b113SAlex Elder 		return -ENOMEM;
46104fb5d671SAlex Elder 	if (!*options) {
46114fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
46124fb5d671SAlex Elder 		goto out_err;
46134fb5d671SAlex Elder 	}
4614a725f65eSAlex Elder 
4615859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4616859c31dfSAlex Elder 	if (!spec)
4617f28e565aSAlex Elder 		goto out_mem;
4618859c31dfSAlex Elder 
4619859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4620859c31dfSAlex Elder 	if (!spec->pool_name)
4621859c31dfSAlex Elder 		goto out_mem;
46224fb5d671SAlex Elder 	if (!*spec->pool_name) {
46234fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
46244fb5d671SAlex Elder 		goto out_err;
46254fb5d671SAlex Elder 	}
4626e28fff26SAlex Elder 
462769e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4628859c31dfSAlex Elder 	if (!spec->image_name)
4629f28e565aSAlex Elder 		goto out_mem;
46304fb5d671SAlex Elder 	if (!*spec->image_name) {
46314fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
46324fb5d671SAlex Elder 		goto out_err;
46334fb5d671SAlex Elder 	}
4634e28fff26SAlex Elder 
4635f28e565aSAlex Elder 	/*
4636f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4637f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4638f28e565aSAlex Elder 	 */
46393feeb894SAlex Elder 	len = next_token(&buf);
4640820a5f3eSAlex Elder 	if (!len) {
46413feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
46423feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4643f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4644dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4645f28e565aSAlex Elder 		goto out_err;
4646849b4260SAlex Elder 	}
4647ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4648ecb4dc22SAlex Elder 	if (!snap_name)
4649f28e565aSAlex Elder 		goto out_mem;
4650ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4651ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4652e5c35534SAlex Elder 
46530ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4654e28fff26SAlex Elder 
46554e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
46564e9afebaSAlex Elder 	if (!rbd_opts)
46574e9afebaSAlex Elder 		goto out_mem;
46584e9afebaSAlex Elder 
46594e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4660d22f76e7SAlex Elder 
4661859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
46620ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
46634e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4664859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4665859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4666dc79b113SAlex Elder 		goto out_err;
4667dc79b113SAlex Elder 	}
4668859c31dfSAlex Elder 	kfree(options);
4669859c31dfSAlex Elder 
4670859c31dfSAlex Elder 	*ceph_opts = copts;
46714e9afebaSAlex Elder 	*opts = rbd_opts;
4672859c31dfSAlex Elder 	*rbd_spec = spec;
46730ddebc0cSAlex Elder 
4674dc79b113SAlex Elder 	return 0;
4675f28e565aSAlex Elder out_mem:
4676dc79b113SAlex Elder 	ret = -ENOMEM;
4677d22f76e7SAlex Elder out_err:
4678859c31dfSAlex Elder 	kfree(rbd_opts);
4679859c31dfSAlex Elder 	rbd_spec_put(spec);
4680f28e565aSAlex Elder 	kfree(options);
4681d22f76e7SAlex Elder 
4682dc79b113SAlex Elder 	return ret;
4683a725f65eSAlex Elder }
4684a725f65eSAlex Elder 
4685589d30e0SAlex Elder /*
468630ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
468730ba1f02SIlya Dryomov  */
468830ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
468930ba1f02SIlya Dryomov {
469030ba1f02SIlya Dryomov 	u64 newest_epoch;
469130ba1f02SIlya Dryomov 	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
469230ba1f02SIlya Dryomov 	int tries = 0;
469330ba1f02SIlya Dryomov 	int ret;
469430ba1f02SIlya Dryomov 
469530ba1f02SIlya Dryomov again:
469630ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
469730ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
469830ba1f02SIlya Dryomov 		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
469930ba1f02SIlya Dryomov 					       &newest_epoch);
470030ba1f02SIlya Dryomov 		if (ret < 0)
470130ba1f02SIlya Dryomov 			return ret;
470230ba1f02SIlya Dryomov 
470330ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
470430ba1f02SIlya Dryomov 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
470530ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
470630ba1f02SIlya Dryomov 						     newest_epoch, timeout);
470730ba1f02SIlya Dryomov 			goto again;
470830ba1f02SIlya Dryomov 		} else {
470930ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
471030ba1f02SIlya Dryomov 			return -ENOENT;
471130ba1f02SIlya Dryomov 		}
471230ba1f02SIlya Dryomov 	}
471330ba1f02SIlya Dryomov 
471430ba1f02SIlya Dryomov 	return ret;
471530ba1f02SIlya Dryomov }
471630ba1f02SIlya Dryomov 
471730ba1f02SIlya Dryomov /*
4718589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4719589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4720589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4721589d30e0SAlex Elder  *
4722589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4723589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4724589d30e0SAlex Elder  * with the supplied name.
4725589d30e0SAlex Elder  *
4726589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4727589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4728589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4729589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4730589d30e0SAlex Elder  */
4731589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4732589d30e0SAlex Elder {
4733589d30e0SAlex Elder 	int ret;
4734589d30e0SAlex Elder 	size_t size;
4735589d30e0SAlex Elder 	char *object_name;
4736589d30e0SAlex Elder 	void *response;
4737c0fba368SAlex Elder 	char *image_id;
47382f82ee54SAlex Elder 
4739589d30e0SAlex Elder 	/*
47402c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
47412c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4742c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4743c0fba368SAlex Elder 	 * do still need to set the image format though.
47442c0d0a10SAlex Elder 	 */
4745c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4746c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4747c0fba368SAlex Elder 
47482c0d0a10SAlex Elder 		return 0;
4749c0fba368SAlex Elder 	}
47502c0d0a10SAlex Elder 
47512c0d0a10SAlex Elder 	/*
4752589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4753589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4754589d30e0SAlex Elder 	 */
475569e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4756589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4757589d30e0SAlex Elder 	if (!object_name)
4758589d30e0SAlex Elder 		return -ENOMEM;
47590d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4760589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4761589d30e0SAlex Elder 
4762589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4763589d30e0SAlex Elder 
4764589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4765589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4766589d30e0SAlex Elder 	if (!response) {
4767589d30e0SAlex Elder 		ret = -ENOMEM;
4768589d30e0SAlex Elder 		goto out;
4769589d30e0SAlex Elder 	}
4770589d30e0SAlex Elder 
4771c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4772c0fba368SAlex Elder 
477336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
47744157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4775e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
477636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4777c0fba368SAlex Elder 	if (ret == -ENOENT) {
4778c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4779c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4780c0fba368SAlex Elder 		if (!ret)
4781c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4782c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4783c0fba368SAlex Elder 		void *p = response;
4784589d30e0SAlex Elder 
4785c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4786979ed480SAlex Elder 						NULL, GFP_NOIO);
4787461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
4788c0fba368SAlex Elder 		if (!ret)
4789c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4790589d30e0SAlex Elder 	} else {
4791c0fba368SAlex Elder 		ret = -EINVAL;
4792c0fba368SAlex Elder 	}
4793c0fba368SAlex Elder 
4794c0fba368SAlex Elder 	if (!ret) {
4795c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4796c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4797589d30e0SAlex Elder 	}
4798589d30e0SAlex Elder out:
4799589d30e0SAlex Elder 	kfree(response);
4800589d30e0SAlex Elder 	kfree(object_name);
4801589d30e0SAlex Elder 
4802589d30e0SAlex Elder 	return ret;
4803589d30e0SAlex Elder }
4804589d30e0SAlex Elder 
48053abef3b3SAlex Elder /*
48063abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
48073abef3b3SAlex Elder  * call.
48083abef3b3SAlex Elder  */
48096fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
48106fd48b3bSAlex Elder {
48116fd48b3bSAlex Elder 	struct rbd_image_header	*header;
48126fd48b3bSAlex Elder 
4813392a9dadSAlex Elder 	/* Drop parent reference unless it's already been done (or none) */
4814392a9dadSAlex Elder 
4815392a9dadSAlex Elder 	if (rbd_dev->parent_overlap)
4816a2acd00eSAlex Elder 		rbd_dev_parent_put(rbd_dev);
48176fd48b3bSAlex Elder 
48186fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
48196fd48b3bSAlex Elder 
48206fd48b3bSAlex Elder 	header = &rbd_dev->header;
4821812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
48226fd48b3bSAlex Elder 	kfree(header->snap_sizes);
48236fd48b3bSAlex Elder 	kfree(header->snap_names);
48246fd48b3bSAlex Elder 	kfree(header->object_prefix);
48256fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
48266fd48b3bSAlex Elder }
48276fd48b3bSAlex Elder 
48282df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4829a30b71b9SAlex Elder {
4830a30b71b9SAlex Elder 	int ret;
4831a30b71b9SAlex Elder 
48321e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
483357385b51SAlex Elder 	if (ret)
48341e130199SAlex Elder 		goto out_err;
4835b1b5402aSAlex Elder 
48362df3fac7SAlex Elder 	/*
48372df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
48382df3fac7SAlex Elder 	 * features are assumed to never change.
48392df3fac7SAlex Elder 	 */
4840b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
484157385b51SAlex Elder 	if (ret)
4842b1b5402aSAlex Elder 		goto out_err;
484335d489f9SAlex Elder 
4844cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4845cc070d59SAlex Elder 
4846cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4847cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4848cc070d59SAlex Elder 		if (ret < 0)
4849cc070d59SAlex Elder 			goto out_err;
4850cc070d59SAlex Elder 	}
48512df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
4852a30b71b9SAlex Elder 
485335152979SAlex Elder 	return 0;
48549d475de5SAlex Elder out_err:
4855642a2537SAlex Elder 	rbd_dev->header.features = 0;
48561e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
48571e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
48589d475de5SAlex Elder 
48599d475de5SAlex Elder 	return ret;
4860a30b71b9SAlex Elder }
4861a30b71b9SAlex Elder 
4862124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
486383a06263SAlex Elder {
48642f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4865124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4866124afba2SAlex Elder 	struct rbd_client *rbdc;
4867124afba2SAlex Elder 	int ret;
4868124afba2SAlex Elder 
4869124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4870124afba2SAlex Elder 		return 0;
4871124afba2SAlex Elder 	/*
4872124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4873124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4874124afba2SAlex Elder 	 * parent/child relationships always share both.
4875124afba2SAlex Elder 	 */
4876124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4877124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4878124afba2SAlex Elder 
4879124afba2SAlex Elder 	ret = -ENOMEM;
4880124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4881124afba2SAlex Elder 	if (!parent)
4882124afba2SAlex Elder 		goto out_err;
4883124afba2SAlex Elder 
48841f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
4885124afba2SAlex Elder 	if (ret < 0)
4886124afba2SAlex Elder 		goto out_err;
4887124afba2SAlex Elder 	rbd_dev->parent = parent;
4888a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
4889124afba2SAlex Elder 
4890124afba2SAlex Elder 	return 0;
4891124afba2SAlex Elder out_err:
4892124afba2SAlex Elder 	if (parent) {
4893fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
4894124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4895124afba2SAlex Elder 		rbd_dev_destroy(parent);
4896124afba2SAlex Elder 	} else {
4897124afba2SAlex Elder 		rbd_put_client(rbdc);
4898124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4899124afba2SAlex Elder 	}
4900124afba2SAlex Elder 
4901124afba2SAlex Elder 	return ret;
4902124afba2SAlex Elder }
4903124afba2SAlex Elder 
4904200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4905124afba2SAlex Elder {
490683a06263SAlex Elder 	int ret;
490783a06263SAlex Elder 
4908f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
490983a06263SAlex Elder 
4910f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
4911f8a22fc2SIlya Dryomov 	if (ret)
4912f8a22fc2SIlya Dryomov 		return ret;
4913f8a22fc2SIlya Dryomov 
491483a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
491583a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
491683a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
491783a06263SAlex Elder 
49189b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
491983a06263SAlex Elder 
49209b60e70bSIlya Dryomov 	if (!single_major) {
492183a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
492283a06263SAlex Elder 		if (ret < 0)
492383a06263SAlex Elder 			goto err_out_id;
49249b60e70bSIlya Dryomov 
492583a06263SAlex Elder 		rbd_dev->major = ret;
4926dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
49279b60e70bSIlya Dryomov 	} else {
49289b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
49299b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
49309b60e70bSIlya Dryomov 	}
493183a06263SAlex Elder 
493283a06263SAlex Elder 	/* Set up the blkdev mapping. */
493383a06263SAlex Elder 
493483a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
493583a06263SAlex Elder 	if (ret)
493683a06263SAlex Elder 		goto err_out_blkdev;
493783a06263SAlex Elder 
4938f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
493983a06263SAlex Elder 	if (ret)
494083a06263SAlex Elder 		goto err_out_disk;
4941f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4942f35a4deeSAlex Elder 
4943f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
4944f35a4deeSAlex Elder 	if (ret)
4945f35a4deeSAlex Elder 		goto err_out_mapping;
494683a06263SAlex Elder 
494783a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
494883a06263SAlex Elder 
4949129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
495083a06263SAlex Elder 	add_disk(rbd_dev->disk);
495183a06263SAlex Elder 
495283a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
495383a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
495483a06263SAlex Elder 
495583a06263SAlex Elder 	return ret;
49562f82ee54SAlex Elder 
4957f35a4deeSAlex Elder err_out_mapping:
4958f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
495983a06263SAlex Elder err_out_disk:
496083a06263SAlex Elder 	rbd_free_disk(rbd_dev);
496183a06263SAlex Elder err_out_blkdev:
49629b60e70bSIlya Dryomov 	if (!single_major)
496383a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
496483a06263SAlex Elder err_out_id:
496583a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4966d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
496783a06263SAlex Elder 
496883a06263SAlex Elder 	return ret;
496983a06263SAlex Elder }
497083a06263SAlex Elder 
4971332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4972332bb12dSAlex Elder {
4973332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4974332bb12dSAlex Elder 	size_t size;
4975332bb12dSAlex Elder 
4976332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4977332bb12dSAlex Elder 
4978332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4979332bb12dSAlex Elder 
4980332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4981332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4982332bb12dSAlex Elder 	else
4983332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4984332bb12dSAlex Elder 
4985332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4986332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4987332bb12dSAlex Elder 		return -ENOMEM;
4988332bb12dSAlex Elder 
4989332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4990332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4991332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4992332bb12dSAlex Elder 	else
4993332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4994332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4995332bb12dSAlex Elder 	return 0;
4996332bb12dSAlex Elder }
4997332bb12dSAlex Elder 
4998200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4999200a6a8bSAlex Elder {
50006fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5001200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
50026fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
50036fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
50046fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
50056fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
50066fd48b3bSAlex Elder 
5007200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5008200a6a8bSAlex Elder }
5009200a6a8bSAlex Elder 
5010a30b71b9SAlex Elder /*
5011a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
50121f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
50131f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
50141f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5015a30b71b9SAlex Elder  */
50161f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
5017a30b71b9SAlex Elder {
5018a30b71b9SAlex Elder 	int ret;
5019a30b71b9SAlex Elder 
5020a30b71b9SAlex Elder 	/*
50213abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
50223abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
50233abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
50243abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5025a30b71b9SAlex Elder 	 */
5026a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5027a30b71b9SAlex Elder 	if (ret)
5028c0fba368SAlex Elder 		return ret;
5029c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
5030c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5031c0fba368SAlex Elder 
5032332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5033332bb12dSAlex Elder 	if (ret)
5034332bb12dSAlex Elder 		goto err_out_format;
5035332bb12dSAlex Elder 
50361f3ef788SAlex Elder 	if (mapping) {
5037fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
5038b644de2bSAlex Elder 		if (ret)
5039b644de2bSAlex Elder 			goto out_header_name;
50401f3ef788SAlex Elder 	}
5041b644de2bSAlex Elder 
5042c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
504399a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
5044a30b71b9SAlex Elder 	else
50452df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
50465655c4d9SAlex Elder 	if (ret)
5047b644de2bSAlex Elder 		goto err_out_watch;
5048a30b71b9SAlex Elder 
50499bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
50509bb81c9bSAlex Elder 	if (ret)
505133dca39fSAlex Elder 		goto err_out_probe;
50529bb81c9bSAlex Elder 
50539bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
505430d60ba2SAlex Elder 	if (ret)
505530d60ba2SAlex Elder 		goto err_out_probe;
505683a06263SAlex Elder 
505730d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
505830d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
505930d60ba2SAlex Elder 
506030d60ba2SAlex Elder 	return 0;
50616fd48b3bSAlex Elder err_out_probe:
50626fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5063b644de2bSAlex Elder err_out_watch:
5064fca27065SIlya Dryomov 	if (mapping)
5065fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5066332bb12dSAlex Elder out_header_name:
5067332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5068332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5069332bb12dSAlex Elder err_out_format:
5070332bb12dSAlex Elder 	rbd_dev->image_format = 0;
50715655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
50725655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
50735655c4d9SAlex Elder 
50745655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
50755655c4d9SAlex Elder 
50765655c4d9SAlex Elder 	return ret;
507783a06263SAlex Elder }
507883a06263SAlex Elder 
50799b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
508059c2be1eSYehuda Sadeh 			  const char *buf,
508159c2be1eSYehuda Sadeh 			  size_t count)
5082602adf40SYehuda Sadeh {
5083cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5084dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
50854e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5086859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
50879d3997fdSAlex Elder 	struct rbd_client *rbdc;
508851344a38SAlex Elder 	bool read_only;
508927cc2594SAlex Elder 	int rc = -ENOMEM;
5090602adf40SYehuda Sadeh 
5091602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5092602adf40SYehuda Sadeh 		return -ENODEV;
5093602adf40SYehuda Sadeh 
5094a725f65eSAlex Elder 	/* parse add command */
5095859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5096dc79b113SAlex Elder 	if (rc < 0)
5097bd4ba655SAlex Elder 		goto err_out_module;
509851344a38SAlex Elder 	read_only = rbd_opts->read_only;
509951344a38SAlex Elder 	kfree(rbd_opts);
510051344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5101a725f65eSAlex Elder 
51029d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
51039d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
51049d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
51050ddebc0cSAlex Elder 		goto err_out_args;
51069d3997fdSAlex Elder 	}
5107602adf40SYehuda Sadeh 
5108602adf40SYehuda Sadeh 	/* pick the pool */
510930ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
5110602adf40SYehuda Sadeh 	if (rc < 0)
5111602adf40SYehuda Sadeh 		goto err_out_client;
5112859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5113859c31dfSAlex Elder 
51140903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
51150903e875SAlex Elder 
5116c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
5117c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5118c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
51190903e875SAlex Elder 		rc = -EIO;
51200903e875SAlex Elder 		goto err_out_client;
51210903e875SAlex Elder 	}
51220903e875SAlex Elder 
5123c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5124bd4ba655SAlex Elder 	if (!rbd_dev)
5125bd4ba655SAlex Elder 		goto err_out_client;
5126c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5127c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5128602adf40SYehuda Sadeh 
51291f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5130a30b71b9SAlex Elder 	if (rc < 0)
5131c53d5893SAlex Elder 		goto err_out_rbd_dev;
513205fd6f6fSAlex Elder 
51337ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
51347ce4eef7SAlex Elder 
51357ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
51367ce4eef7SAlex Elder 		read_only = true;
51377ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
51387ce4eef7SAlex Elder 
5139b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
51403abef3b3SAlex Elder 	if (rc) {
5141e37180c0SIlya Dryomov 		/*
5142e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5143e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5144e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5145e37180c0SIlya Dryomov 		 */
5146e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
51473abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
51483abef3b3SAlex Elder 		goto err_out_module;
51493abef3b3SAlex Elder 	}
51503abef3b3SAlex Elder 
5151602adf40SYehuda Sadeh 	return count;
5152b536f69aSAlex Elder 
5153c53d5893SAlex Elder err_out_rbd_dev:
5154c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5155bd4ba655SAlex Elder err_out_client:
51569d3997fdSAlex Elder 	rbd_put_client(rbdc);
51570ddebc0cSAlex Elder err_out_args:
5158859c31dfSAlex Elder 	rbd_spec_put(spec);
5159bd4ba655SAlex Elder err_out_module:
5160bd4ba655SAlex Elder 	module_put(THIS_MODULE);
516127cc2594SAlex Elder 
5162602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
516327cc2594SAlex Elder 
516427cc2594SAlex Elder 	return (ssize_t)rc;
5165602adf40SYehuda Sadeh }
5166602adf40SYehuda Sadeh 
51679b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
51689b60e70bSIlya Dryomov 		       const char *buf,
51699b60e70bSIlya Dryomov 		       size_t count)
51709b60e70bSIlya Dryomov {
51719b60e70bSIlya Dryomov 	if (single_major)
51729b60e70bSIlya Dryomov 		return -EINVAL;
51739b60e70bSIlya Dryomov 
51749b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
51759b60e70bSIlya Dryomov }
51769b60e70bSIlya Dryomov 
51779b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
51789b60e70bSIlya Dryomov 				    const char *buf,
51799b60e70bSIlya Dryomov 				    size_t count)
51809b60e70bSIlya Dryomov {
51819b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
51829b60e70bSIlya Dryomov }
51839b60e70bSIlya Dryomov 
5184200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5185602adf40SYehuda Sadeh {
5186593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5187602adf40SYehuda Sadeh 
5188602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5189200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
51906d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
51919b60e70bSIlya Dryomov 	if (!single_major)
5192602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5193e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5194d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5195602adf40SYehuda Sadeh }
5196602adf40SYehuda Sadeh 
519705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
519805a46afdSAlex Elder {
5199ad945fc1SAlex Elder 	while (rbd_dev->parent) {
520005a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
520105a46afdSAlex Elder 		struct rbd_device *second = first->parent;
520205a46afdSAlex Elder 		struct rbd_device *third;
520305a46afdSAlex Elder 
520405a46afdSAlex Elder 		/*
520505a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
520605a46afdSAlex Elder 		 * remove it.
520705a46afdSAlex Elder 		 */
520805a46afdSAlex Elder 		while (second && (third = second->parent)) {
520905a46afdSAlex Elder 			first = second;
521005a46afdSAlex Elder 			second = third;
521105a46afdSAlex Elder 		}
5212ad945fc1SAlex Elder 		rbd_assert(second);
52138ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5214ad945fc1SAlex Elder 		first->parent = NULL;
5215ad945fc1SAlex Elder 		first->parent_overlap = 0;
5216ad945fc1SAlex Elder 
5217ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
521805a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
521905a46afdSAlex Elder 		first->parent_spec = NULL;
522005a46afdSAlex Elder 	}
522105a46afdSAlex Elder }
522205a46afdSAlex Elder 
52239b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5224602adf40SYehuda Sadeh 			     const char *buf,
5225602adf40SYehuda Sadeh 			     size_t count)
5226602adf40SYehuda Sadeh {
5227602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5228751cc0e3SAlex Elder 	struct list_head *tmp;
5229751cc0e3SAlex Elder 	int dev_id;
5230602adf40SYehuda Sadeh 	unsigned long ul;
523182a442d2SAlex Elder 	bool already = false;
52320d8189e1SAlex Elder 	int ret;
5233602adf40SYehuda Sadeh 
5234bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
52350d8189e1SAlex Elder 	if (ret)
52360d8189e1SAlex Elder 		return ret;
5237602adf40SYehuda Sadeh 
5238602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5239751cc0e3SAlex Elder 	dev_id = (int)ul;
5240751cc0e3SAlex Elder 	if (dev_id != ul)
5241602adf40SYehuda Sadeh 		return -EINVAL;
5242602adf40SYehuda Sadeh 
5243602adf40SYehuda Sadeh 	ret = -ENOENT;
5244751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5245751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5246751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5247751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5248751cc0e3SAlex Elder 			ret = 0;
5249751cc0e3SAlex Elder 			break;
5250602adf40SYehuda Sadeh 		}
5251751cc0e3SAlex Elder 	}
5252751cc0e3SAlex Elder 	if (!ret) {
5253a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5254b82d167bSAlex Elder 		if (rbd_dev->open_count)
525542382b70SAlex Elder 			ret = -EBUSY;
5256b82d167bSAlex Elder 		else
525782a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
525882a442d2SAlex Elder 							&rbd_dev->flags);
5259a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5260751cc0e3SAlex Elder 	}
5261751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
526282a442d2SAlex Elder 	if (ret < 0 || already)
52631ba0f1e7SAlex Elder 		return ret;
5264751cc0e3SAlex Elder 
5265fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
52669abc5990SJosh Durgin 	/*
52679abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
52689abc5990SJosh Durgin 	 * before the osd_client is shutdown
52699abc5990SJosh Durgin 	 */
52709abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
52719abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5272fca27065SIlya Dryomov 
52739875201eSJosh Durgin 	/*
52749875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
52759875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
52769875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
52779875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
52789875201eSJosh Durgin 	 */
52799875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
52808ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
528179ab7558SAlex Elder 	module_put(THIS_MODULE);
5282aafb230eSAlex Elder 
52831ba0f1e7SAlex Elder 	return count;
5284602adf40SYehuda Sadeh }
5285602adf40SYehuda Sadeh 
52869b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
52879b60e70bSIlya Dryomov 			  const char *buf,
52889b60e70bSIlya Dryomov 			  size_t count)
52899b60e70bSIlya Dryomov {
52909b60e70bSIlya Dryomov 	if (single_major)
52919b60e70bSIlya Dryomov 		return -EINVAL;
52929b60e70bSIlya Dryomov 
52939b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
52949b60e70bSIlya Dryomov }
52959b60e70bSIlya Dryomov 
52969b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
52979b60e70bSIlya Dryomov 				       const char *buf,
52989b60e70bSIlya Dryomov 				       size_t count)
52999b60e70bSIlya Dryomov {
53009b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
53019b60e70bSIlya Dryomov }
53029b60e70bSIlya Dryomov 
5303602adf40SYehuda Sadeh /*
5304602adf40SYehuda Sadeh  * create control files in sysfs
5305dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5306602adf40SYehuda Sadeh  */
5307602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5308602adf40SYehuda Sadeh {
5309dfc5606dSYehuda Sadeh 	int ret;
5310602adf40SYehuda Sadeh 
5311fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5312dfc5606dSYehuda Sadeh 	if (ret < 0)
5313dfc5606dSYehuda Sadeh 		return ret;
5314602adf40SYehuda Sadeh 
5315fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5316fed4c143SAlex Elder 	if (ret < 0)
5317fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5318602adf40SYehuda Sadeh 
5319602adf40SYehuda Sadeh 	return ret;
5320602adf40SYehuda Sadeh }
5321602adf40SYehuda Sadeh 
5322602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5323602adf40SYehuda Sadeh {
5324dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5325fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5326602adf40SYehuda Sadeh }
5327602adf40SYehuda Sadeh 
53281c2a9dfeSAlex Elder static int rbd_slab_init(void)
53291c2a9dfeSAlex Elder {
53301c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
53311c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
53321c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
53331c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
53341c2a9dfeSAlex Elder 					0, NULL);
5335868311b1SAlex Elder 	if (!rbd_img_request_cache)
5336868311b1SAlex Elder 		return -ENOMEM;
5337868311b1SAlex Elder 
5338868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5339868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5340868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5341868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5342868311b1SAlex Elder 					0, NULL);
534378c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
534478c2a44aSAlex Elder 		goto out_err;
534578c2a44aSAlex Elder 
534678c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
534778c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
53482d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
534978c2a44aSAlex Elder 	if (rbd_segment_name_cache)
53501c2a9dfeSAlex Elder 		return 0;
535178c2a44aSAlex Elder out_err:
535278c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
535378c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
535478c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
535578c2a44aSAlex Elder 	}
53561c2a9dfeSAlex Elder 
5357868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5358868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5359868311b1SAlex Elder 
53601c2a9dfeSAlex Elder 	return -ENOMEM;
53611c2a9dfeSAlex Elder }
53621c2a9dfeSAlex Elder 
53631c2a9dfeSAlex Elder static void rbd_slab_exit(void)
53641c2a9dfeSAlex Elder {
536578c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
536678c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
536778c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
536878c2a44aSAlex Elder 
5369868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5370868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5371868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5372868311b1SAlex Elder 
53731c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
53741c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
53751c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
53761c2a9dfeSAlex Elder }
53771c2a9dfeSAlex Elder 
5378cc344fa1SAlex Elder static int __init rbd_init(void)
5379602adf40SYehuda Sadeh {
5380602adf40SYehuda Sadeh 	int rc;
5381602adf40SYehuda Sadeh 
53821e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
53831e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
53841e32d34cSAlex Elder 		return -EINVAL;
53851e32d34cSAlex Elder 	}
5386e1b4d96dSIlya Dryomov 
53871c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5388602adf40SYehuda Sadeh 	if (rc)
5389602adf40SYehuda Sadeh 		return rc;
5390e1b4d96dSIlya Dryomov 
53919b60e70bSIlya Dryomov 	if (single_major) {
53929b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
53939b60e70bSIlya Dryomov 		if (rbd_major < 0) {
53949b60e70bSIlya Dryomov 			rc = rbd_major;
53959b60e70bSIlya Dryomov 			goto err_out_slab;
53969b60e70bSIlya Dryomov 		}
53979b60e70bSIlya Dryomov 	}
53989b60e70bSIlya Dryomov 
53991c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
54001c2a9dfeSAlex Elder 	if (rc)
54019b60e70bSIlya Dryomov 		goto err_out_blkdev;
54021c2a9dfeSAlex Elder 
54039b60e70bSIlya Dryomov 	if (single_major)
54049b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
54059b60e70bSIlya Dryomov 	else
5406e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
54079b60e70bSIlya Dryomov 
5408e1b4d96dSIlya Dryomov 	return 0;
5409e1b4d96dSIlya Dryomov 
54109b60e70bSIlya Dryomov err_out_blkdev:
54119b60e70bSIlya Dryomov 	if (single_major)
54129b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5413e1b4d96dSIlya Dryomov err_out_slab:
5414e1b4d96dSIlya Dryomov 	rbd_slab_exit();
54151c2a9dfeSAlex Elder 	return rc;
5416602adf40SYehuda Sadeh }
5417602adf40SYehuda Sadeh 
5418cc344fa1SAlex Elder static void __exit rbd_exit(void)
5419602adf40SYehuda Sadeh {
5420602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
54219b60e70bSIlya Dryomov 	if (single_major)
54229b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
54231c2a9dfeSAlex Elder 	rbd_slab_exit();
5424602adf40SYehuda Sadeh }
5425602adf40SYehuda Sadeh 
/* Register rbd_init()/rbd_exit() as the module's entry and exit points. */
5426602adf40SYehuda Sadeh module_init(rbd_init);
5427602adf40SYehuda Sadeh module_exit(rbd_exit);
5428602adf40SYehuda Sadeh 
/* Module metadata: authors, description and license. */
5429d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5430602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5431602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5432602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5433602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5434602adf40SYehuda Sadeh 
543590da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5436602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5437