xref: /openbmc/linux/drivers/block/rbd.c (revision 65ab0dba)
1 
2 /*
3    rbd.c -- Export ceph rados objects as a Linux block device
4 
5 
6    based on drivers/block/osdblk.c:
7 
8    Copyright 2009 Red Hat, Inc.
9 
10    This program is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; see the file COPYING.  If not, write to
21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 
24 
25    For usage instructions, please refer to:
26 
27                  Documentation/ABI/testing/sysfs-bus-rbd
28 
29  */
30 
31 #include <linux/ceph/libceph.h>
32 #include <linux/ceph/osd_client.h>
33 #include <linux/ceph/mon_client.h>
34 #include <linux/ceph/cls_lock_client.h>
35 #include <linux/ceph/striper.h>
36 #include <linux/ceph/decode.h>
37 #include <linux/parser.h>
38 #include <linux/bsearch.h>
39 
40 #include <linux/kernel.h>
41 #include <linux/device.h>
42 #include <linux/module.h>
43 #include <linux/blk-mq.h>
44 #include <linux/fs.h>
45 #include <linux/blkdev.h>
46 #include <linux/slab.h>
47 #include <linux/idr.h>
48 #include <linux/workqueue.h>
49 
50 #include "rbd_types.h"
51 
52 #define RBD_DEBUG	/* Activate rbd_assert() calls */
53 
54 /*
55  * Increment the given counter and return its updated value.
56  * If the counter is already 0 it will not be incremented.
57  * If the counter is already at its maximum value, -EINVAL is
58  * returned without updating it.
59  */
60 static int atomic_inc_return_safe(atomic_t *v)
61 {
62 	unsigned int counter;
63 
64 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65 	if (counter <= (unsigned int)INT_MAX)
66 		return (int)counter;
67 
68 	atomic_dec(v);
69 
70 	return -EINVAL;
71 }
72 
73 /* Decrement the counter.  Return the resulting value, or -EINVAL */
74 static int atomic_dec_return_safe(atomic_t *v)
75 {
76 	int counter;
77 
78 	counter = atomic_dec_return(v);
79 	if (counter >= 0)
80 		return counter;
81 
82 	atomic_inc(v);
83 
84 	return -EINVAL;
85 }
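/*
 * A minimal usage sketch for the two saturating helpers above, modeled
 * on how parent_ref is handled further down in rbd_dev_parent_get() and
 * rbd_dev_parent_put() (the counter name "ref" here is hypothetical):
 *
 *	static atomic_t ref = ATOMIC_INIT(1);
 *
 *	if (atomic_inc_return_safe(&ref) > 0) {
 *		// reference taken; <= 0 means the counter was already
 *		// dead (0) or could not be safely incremented (-EINVAL)
 *		do_work();
 *		if (!atomic_dec_return_safe(&ref))
 *			teardown();	// last reference just went away
 *	}
 */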
86 
87 #define RBD_DRV_NAME "rbd"
88 
89 #define RBD_MINORS_PER_MAJOR		256
90 #define RBD_SINGLE_MAJOR_PART_SHIFT	4
91 
92 #define RBD_MAX_PARENT_CHAIN_LEN	16
93 
94 #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
95 #define RBD_MAX_SNAP_NAME_LEN	\
96 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
97 
98 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
99 
100 #define RBD_SNAP_HEAD_NAME	"-"
101 
102 #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
103 
104 /* This allows a single page to hold an image name sent by OSD */
105 #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
106 #define RBD_IMAGE_ID_LEN_MAX	64
107 
108 #define RBD_OBJ_PREFIX_LEN_MAX	64
109 
110 #define RBD_NOTIFY_TIMEOUT	5	/* seconds */
111 #define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
112 
113 /* Feature bits */
114 
115 #define RBD_FEATURE_LAYERING		(1ULL<<0)
116 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
117 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
118 #define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
119 #define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
120 #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
121 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
122 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
123 
124 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
125 				 RBD_FEATURE_STRIPINGV2 |	\
126 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
127 				 RBD_FEATURE_OBJECT_MAP |	\
128 				 RBD_FEATURE_FAST_DIFF |	\
129 				 RBD_FEATURE_DEEP_FLATTEN |	\
130 				 RBD_FEATURE_DATA_POOL |	\
131 				 RBD_FEATURE_OPERATIONS)
132 
133 /* Features supported by this (client software) implementation. */
134 
135 #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136 
137 /*
138  * An RBD device name will be "rbd#", where the "rbd" comes from
139  * RBD_DRV_NAME above, and # is a unique integer identifier.
140  */
141 #define DEV_NAME_LEN		32
142 
143 /*
144  * block device image metadata (in-memory version)
145  */
146 struct rbd_image_header {
147 	/* These six fields never change for a given rbd image */
148 	char *object_prefix;
149 	__u8 obj_order;
150 	u64 stripe_unit;
151 	u64 stripe_count;
152 	s64 data_pool_id;
153 	u64 features;		/* Might be changeable someday? */
154 
155 	/* The remaining fields need to be updated occasionally */
156 	u64 image_size;
157 	struct ceph_snap_context *snapc;
158 	char *snap_names;	/* format 1 only */
159 	u64 *snap_sizes;	/* format 1 only */
160 };
161 
162 /*
163  * An rbd image specification.
164  *
165  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166  * identify an image.  Each rbd_dev structure includes a pointer to
167  * an rbd_spec structure that encapsulates this identity.
168  *
169  * Each of the id's in an rbd_spec has an associated name.  For a
170  * user-mapped image, the names are supplied and the id's associated
171  * with them are looked up.  For a layered image, a parent image is
172  * defined by the tuple, and the names are looked up.
173  *
174  * An rbd_dev structure contains a parent_spec pointer which is
175  * non-null if the image it represents is a child in a layered
176  * image.  This pointer will refer to the rbd_spec structure used
177  * by the parent rbd_dev for its own identity (i.e., the structure
178  * is shared between the parent and child).
179  *
180  * Since these structures are populated once, during the discovery
181  * phase of image construction, they are effectively immutable so
182  * we make no effort to synchronize access to them.
183  *
184  * Note that code herein does not assume the image name is known (it
185  * could be a null pointer).
186  */
187 struct rbd_spec {
188 	u64		pool_id;
189 	const char	*pool_name;
190 	const char	*pool_ns;	/* NULL if default, never "" */
191 
192 	const char	*image_id;
193 	const char	*image_name;
194 
195 	u64		snap_id;
196 	const char	*snap_name;
197 
198 	struct kref	kref;
199 };
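/*
 * A sketch of the values an rbd_spec might hold (illustrative names and
 * ids only) for a mapping of pool "mypool", image "myimage", snapshot
 * "mysnap":
 *
 *	.pool_id   = 2,			.pool_name  = "mypool",
 *	.pool_ns   = NULL,		// default namespace
 *	.image_id  = "10056b8b4567",	.image_name = "myimage",
 *	.snap_id   = 4,			.snap_name  = "mysnap",
 *
 * For a user-mapped image the names come from the "add" string and the
 * ids are looked up; mapping the image head rather than a snapshot uses
 * snap_id == CEPH_NOSNAP and the RBD_SNAP_HEAD_NAME ("-") snap name.
 */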
200 
201 /*
202  * an instance of the client.  multiple devices may share an rbd client.
203  */
204 struct rbd_client {
205 	struct ceph_client	*client;
206 	struct kref		kref;
207 	struct list_head	node;
208 };
209 
210 struct pending_result {
211 	int			result;		/* first nonzero result */
212 	int			num_pending;
213 };
214 
215 struct rbd_img_request;
216 
217 enum obj_request_type {
218 	OBJ_REQUEST_NODATA = 1,
219 	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
220 	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
221 	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
222 };
223 
224 enum obj_operation_type {
225 	OBJ_OP_READ = 1,
226 	OBJ_OP_WRITE,
227 	OBJ_OP_DISCARD,
228 	OBJ_OP_ZEROOUT,
229 };
230 
231 #define RBD_OBJ_FLAG_DELETION			(1U << 0)
232 #define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
233 #define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
234 #define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
235 #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
236 
237 enum rbd_obj_read_state {
238 	RBD_OBJ_READ_START = 1,
239 	RBD_OBJ_READ_OBJECT,
240 	RBD_OBJ_READ_PARENT,
241 };
242 
243 /*
244  * Writes go through the following state machine to deal with
245  * layering:
246  *
247  *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
248  *            .                 |                                    .
249  *            .                 v                                    .
250  *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
251  *            .                 |                    .               .
252  *            .                 v                    v (deep-copyup  .
253  *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
254  * flattened) v                 |                    .               .
255  *            .                 v                    .               .
256  *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
257  *                              |                        not needed) v
258  *                              v                                    .
259  *                            done . . . . . . . . . . . . . . . . . .
260  *                              ^
261  *                              |
262  *                     RBD_OBJ_WRITE_FLAT
263  *
264  * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
265  * assert_exists guard is needed or not (in some cases it's not needed
266  * even if there is a parent).
267  */
268 enum rbd_obj_write_state {
269 	RBD_OBJ_WRITE_START = 1,
270 	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
271 	RBD_OBJ_WRITE_OBJECT,
272 	__RBD_OBJ_WRITE_COPYUP,
273 	RBD_OBJ_WRITE_COPYUP,
274 	RBD_OBJ_WRITE_POST_OBJECT_MAP,
275 };
276 
277 enum rbd_obj_copyup_state {
278 	RBD_OBJ_COPYUP_START = 1,
279 	RBD_OBJ_COPYUP_READ_PARENT,
280 	__RBD_OBJ_COPYUP_OBJECT_MAPS,
281 	RBD_OBJ_COPYUP_OBJECT_MAPS,
282 	__RBD_OBJ_COPYUP_WRITE_OBJECT,
283 	RBD_OBJ_COPYUP_WRITE_OBJECT,
284 };
285 
286 struct rbd_obj_request {
287 	struct ceph_object_extent ex;
288 	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
289 	union {
290 		enum rbd_obj_read_state	 read_state;	/* for reads */
291 		enum rbd_obj_write_state write_state;	/* for writes */
292 	};
293 
294 	struct rbd_img_request	*img_request;
295 	struct ceph_file_extent	*img_extents;
296 	u32			num_img_extents;
297 
298 	union {
299 		struct ceph_bio_iter	bio_pos;
300 		struct {
301 			struct ceph_bvec_iter	bvec_pos;
302 			u32			bvec_count;
303 			u32			bvec_idx;
304 		};
305 	};
306 
307 	enum rbd_obj_copyup_state copyup_state;
308 	struct bio_vec		*copyup_bvecs;
309 	u32			copyup_bvec_count;
310 
311 	struct list_head	osd_reqs;	/* w/ r_private_item */
312 
313 	struct mutex		state_mutex;
314 	struct pending_result	pending;
315 	struct kref		kref;
316 };
317 
318 enum img_req_flags {
319 	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
320 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
321 };
322 
323 enum rbd_img_state {
324 	RBD_IMG_START = 1,
325 	RBD_IMG_EXCLUSIVE_LOCK,
326 	__RBD_IMG_OBJECT_REQUESTS,
327 	RBD_IMG_OBJECT_REQUESTS,
328 };
329 
330 struct rbd_img_request {
331 	struct rbd_device	*rbd_dev;
332 	enum obj_operation_type	op_type;
333 	enum obj_request_type	data_type;
334 	unsigned long		flags;
335 	enum rbd_img_state	state;
336 	union {
337 		u64			snap_id;	/* for reads */
338 		struct ceph_snap_context *snapc;	/* for writes */
339 	};
340 	union {
341 		struct request		*rq;		/* block request */
342 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
343 	};
344 
345 	struct list_head	lock_item;
346 	struct list_head	object_extents;	/* obj_req.ex structs */
347 
348 	struct mutex		state_mutex;
349 	struct pending_result	pending;
350 	struct work_struct	work;
351 	int			work_result;
352 	struct kref		kref;
353 };
354 
355 #define for_each_obj_request(ireq, oreq) \
356 	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
357 #define for_each_obj_request_safe(ireq, oreq, n) \
358 	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
359 
360 enum rbd_watch_state {
361 	RBD_WATCH_STATE_UNREGISTERED,
362 	RBD_WATCH_STATE_REGISTERED,
363 	RBD_WATCH_STATE_ERROR,
364 };
365 
366 enum rbd_lock_state {
367 	RBD_LOCK_STATE_UNLOCKED,
368 	RBD_LOCK_STATE_LOCKED,
369 	RBD_LOCK_STATE_RELEASING,
370 };
371 
372 /* WatchNotify::ClientId */
373 struct rbd_client_id {
374 	u64 gid;
375 	u64 handle;
376 };
377 
378 struct rbd_mapping {
379 	u64                     size;
380 	u64                     features;
381 };
382 
383 /*
384  * a single device
385  */
386 struct rbd_device {
387 	int			dev_id;		/* blkdev unique id */
388 
389 	int			major;		/* blkdev assigned major */
390 	int			minor;
391 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
392 
393 	u32			image_format;	/* Either 1 or 2 */
394 	struct rbd_client	*rbd_client;
395 
396 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
397 
398 	spinlock_t		lock;		/* queue, flags, open_count */
399 
400 	struct rbd_image_header	header;
401 	unsigned long		flags;		/* possibly lock protected */
402 	struct rbd_spec		*spec;
403 	struct rbd_options	*opts;
404 	char			*config_info;	/* add{,_single_major} string */
405 
406 	struct ceph_object_id	header_oid;
407 	struct ceph_object_locator header_oloc;
408 
409 	struct ceph_file_layout	layout;		/* used for all rbd requests */
410 
411 	struct mutex		watch_mutex;
412 	enum rbd_watch_state	watch_state;
413 	struct ceph_osd_linger_request *watch_handle;
414 	u64			watch_cookie;
415 	struct delayed_work	watch_dwork;
416 
417 	struct rw_semaphore	lock_rwsem;
418 	enum rbd_lock_state	lock_state;
419 	char			lock_cookie[32];
420 	struct rbd_client_id	owner_cid;
421 	struct work_struct	acquired_lock_work;
422 	struct work_struct	released_lock_work;
423 	struct delayed_work	lock_dwork;
424 	struct work_struct	unlock_work;
425 	spinlock_t		lock_lists_lock;
426 	struct list_head	acquiring_list;
427 	struct list_head	running_list;
428 	struct completion	acquire_wait;
429 	int			acquire_err;
430 	struct completion	releasing_wait;
431 
432 	spinlock_t		object_map_lock;
433 	u8			*object_map;
434 	u64			object_map_size;	/* in objects */
435 	u64			object_map_flags;
436 
437 	struct workqueue_struct	*task_wq;
438 
439 	struct rbd_spec		*parent_spec;
440 	u64			parent_overlap;
441 	atomic_t		parent_ref;
442 	struct rbd_device	*parent;
443 
444 	/* Block layer tags. */
445 	struct blk_mq_tag_set	tag_set;
446 
447 	/* protects updating the header */
448 	struct rw_semaphore     header_rwsem;
449 
450 	struct rbd_mapping	mapping;
451 
452 	struct list_head	node;
453 
454 	/* sysfs related */
455 	struct device		dev;
456 	unsigned long		open_count;	/* protected by lock */
457 };
458 
459 /*
460  * Flag bits for rbd_dev->flags:
461  * - REMOVING (which is coupled with rbd_dev->open_count) is protected
462  *   by rbd_dev->lock
463  */
464 enum rbd_dev_flags {
465 	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
466 	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
467 };
468 
469 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
470 
471 static LIST_HEAD(rbd_dev_list);    /* devices */
472 static DEFINE_SPINLOCK(rbd_dev_list_lock);
473 
474 static LIST_HEAD(rbd_client_list);		/* clients */
475 static DEFINE_SPINLOCK(rbd_client_list_lock);
476 
477 /* Slab caches for frequently-allocated structures */
478 
479 static struct kmem_cache	*rbd_img_request_cache;
480 static struct kmem_cache	*rbd_obj_request_cache;
481 
482 static int rbd_major;
483 static DEFINE_IDA(rbd_dev_id_ida);
484 
485 static struct workqueue_struct *rbd_wq;
486 
487 static struct ceph_snap_context rbd_empty_snapc = {
488 	.nref = REFCOUNT_INIT(1),
489 };
490 
491 /*
492  * single-major requires version >= 0.75 of the userspace rbd utility.
493  */
494 static bool single_major = true;
495 module_param(single_major, bool, 0444);
496 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
497 
498 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
499 static ssize_t remove_store(struct bus_type *bus, const char *buf,
500 			    size_t count);
501 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
502 				      size_t count);
503 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
504 					 size_t count);
505 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
506 
507 static int rbd_dev_id_to_minor(int dev_id)
508 {
509 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
510 }
511 
512 static int minor_to_rbd_dev_id(int minor)
513 {
514 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
515 }
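/*
 * With RBD_SINGLE_MAJOR_PART_SHIFT == 4 each device owns a block of 16
 * minors under the shared major: the whole device plus up to 15
 * partitions.  A short worked example:
 *
 *	rbd_dev_id_to_minor(3)	== 3 << 4 == 48	   // /dev/rbd3
 *	minor_to_rbd_dev_id(48)	== 48 >> 4 == 3	   // whole device
 *	minor_to_rbd_dev_id(50)	== 50 >> 4 == 3	   // /dev/rbd3p2
 */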
516 
517 static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
518 {
519 	lockdep_assert_held(&rbd_dev->lock_rwsem);
520 
521 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
522 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
523 }
524 
525 static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
526 {
527 	bool is_lock_owner;
528 
529 	down_read(&rbd_dev->lock_rwsem);
530 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
531 	up_read(&rbd_dev->lock_rwsem);
532 	return is_lock_owner;
533 }
534 
535 static ssize_t supported_features_show(struct bus_type *bus, char *buf)
536 {
537 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
538 }
539 
540 static BUS_ATTR_WO(add);
541 static BUS_ATTR_WO(remove);
542 static BUS_ATTR_WO(add_single_major);
543 static BUS_ATTR_WO(remove_single_major);
544 static BUS_ATTR_RO(supported_features);
545 
546 static struct attribute *rbd_bus_attrs[] = {
547 	&bus_attr_add.attr,
548 	&bus_attr_remove.attr,
549 	&bus_attr_add_single_major.attr,
550 	&bus_attr_remove_single_major.attr,
551 	&bus_attr_supported_features.attr,
552 	NULL,
553 };
554 
555 static umode_t rbd_bus_is_visible(struct kobject *kobj,
556 				  struct attribute *attr, int index)
557 {
558 	if (!single_major &&
559 	    (attr == &bus_attr_add_single_major.attr ||
560 	     attr == &bus_attr_remove_single_major.attr))
561 		return 0;
562 
563 	return attr->mode;
564 }
565 
566 static const struct attribute_group rbd_bus_group = {
567 	.attrs = rbd_bus_attrs,
568 	.is_visible = rbd_bus_is_visible,
569 };
570 __ATTRIBUTE_GROUPS(rbd_bus);
571 
572 static struct bus_type rbd_bus_type = {
573 	.name		= "rbd",
574 	.bus_groups	= rbd_bus_groups,
575 };
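/*
 * The attributes above surface as files under /sys/bus/rbd/ (add,
 * remove, add_single_major, remove_single_major, supported_features;
 * the *_single_major pair is hidden when single_major is off).  A
 * sketch of mapping an image from userspace, with illustrative monitor
 * address, credentials, pool and image names:
 *
 *	# echo "1.2.3.4:6789 name=admin,secret=<key> mypool myimage" \
 *	#	> /sys/bus/rbd/add_single_major
 *
 * The exact string format is documented in
 * Documentation/ABI/testing/sysfs-bus-rbd.
 */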
576 
577 static void rbd_root_dev_release(struct device *dev)
578 {
579 }
580 
581 static struct device rbd_root_dev = {
582 	.init_name =    "rbd",
583 	.release =      rbd_root_dev_release,
584 };
585 
586 static __printf(2, 3)
587 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
588 {
589 	struct va_format vaf;
590 	va_list args;
591 
592 	va_start(args, fmt);
593 	vaf.fmt = fmt;
594 	vaf.va = &args;
595 
596 	if (!rbd_dev)
597 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
598 	else if (rbd_dev->disk)
599 		printk(KERN_WARNING "%s: %s: %pV\n",
600 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
601 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
602 		printk(KERN_WARNING "%s: image %s: %pV\n",
603 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
604 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
605 		printk(KERN_WARNING "%s: id %s: %pV\n",
606 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
607 	else	/* punt */
608 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
609 			RBD_DRV_NAME, rbd_dev, &vaf);
610 	va_end(args);
611 }
612 
613 #ifdef RBD_DEBUG
614 #define rbd_assert(expr)						\
615 		if (unlikely(!(expr))) {				\
616 			printk(KERN_ERR "\nAssertion failure in %s() "	\
617 						"at line %d:\n\n"	\
618 					"\trbd_assert(%s);\n\n",	\
619 					__func__, __LINE__, #expr);	\
620 			BUG();						\
621 		}
622 #else /* !RBD_DEBUG */
623 #  define rbd_assert(expr)	((void) 0)
624 #endif /* !RBD_DEBUG */
625 
626 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
627 
628 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
629 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
630 static int rbd_dev_header_info(struct rbd_device *rbd_dev);
631 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
632 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
633 					u64 snap_id);
634 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
635 				u8 *order, u64 *snap_size);
636 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
637 		u64 *snap_features);
638 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
639 
640 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
641 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
642 
643 /*
644  * Return true if nothing else is pending.
645  */
646 static bool pending_result_dec(struct pending_result *pending, int *result)
647 {
648 	rbd_assert(pending->num_pending > 0);
649 
650 	if (*result && !pending->result)
651 		pending->result = *result;
652 	if (--pending->num_pending)
653 		return false;
654 
655 	*result = pending->result;
656 	return true;
657 }
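/*
 * A sketch of how struct pending_result is meant to be used
 * (hypothetical caller; the real users are the image and object
 * request state machines below).  The issuer sets num_pending to the
 * number of sub-requests it fires off; each completion feeds its
 * status in, and only the final call reports completion, with the
 * first nonzero result latched:
 *
 *	pending->num_pending = 3;  pending->result = 0;
 *
 *	// completions arrive with results 0, -EIO and 0:
 *	pending_result_dec(pending, &result);	// false, result stays 0
 *	pending_result_dec(pending, &result);	// false, latches -EIO
 *	pending_result_dec(pending, &result);	// true, *result == -EIO
 */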
658 
659 static int rbd_open(struct block_device *bdev, fmode_t mode)
660 {
661 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
662 	bool removing = false;
663 
664 	spin_lock_irq(&rbd_dev->lock);
665 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
666 		removing = true;
667 	else
668 		rbd_dev->open_count++;
669 	spin_unlock_irq(&rbd_dev->lock);
670 	if (removing)
671 		return -ENOENT;
672 
673 	(void) get_device(&rbd_dev->dev);
674 
675 	return 0;
676 }
677 
678 static void rbd_release(struct gendisk *disk, fmode_t mode)
679 {
680 	struct rbd_device *rbd_dev = disk->private_data;
681 	unsigned long open_count_before;
682 
683 	spin_lock_irq(&rbd_dev->lock);
684 	open_count_before = rbd_dev->open_count--;
685 	spin_unlock_irq(&rbd_dev->lock);
686 	rbd_assert(open_count_before > 0);
687 
688 	put_device(&rbd_dev->dev);
689 }
690 
691 static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
692 {
693 	int ro;
694 
695 	if (get_user(ro, (int __user *)arg))
696 		return -EFAULT;
697 
698 	/* Snapshots can't be marked read-write */
699 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
700 		return -EROFS;
701 
702 	/* Let blkdev_roset() handle it */
703 	return -ENOTTY;
704 }
705 
706 static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
707 			unsigned int cmd, unsigned long arg)
708 {
709 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
710 	int ret;
711 
712 	switch (cmd) {
713 	case BLKROSET:
714 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
715 		break;
716 	default:
717 		ret = -ENOTTY;
718 	}
719 
720 	return ret;
721 }
722 
723 #ifdef CONFIG_COMPAT
724 static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
725 				unsigned int cmd, unsigned long arg)
726 {
727 	return rbd_ioctl(bdev, mode, cmd, arg);
728 }
729 #endif /* CONFIG_COMPAT */
730 
731 static const struct block_device_operations rbd_bd_ops = {
732 	.owner			= THIS_MODULE,
733 	.open			= rbd_open,
734 	.release		= rbd_release,
735 	.ioctl			= rbd_ioctl,
736 #ifdef CONFIG_COMPAT
737 	.compat_ioctl		= rbd_compat_ioctl,
738 #endif
739 };
740 
741 /*
742  * Initialize an rbd client instance.  Success or not, this function
743  * consumes ceph_opts.  Caller holds client_mutex.
744  */
745 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
746 {
747 	struct rbd_client *rbdc;
748 	int ret = -ENOMEM;
749 
750 	dout("%s:\n", __func__);
751 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
752 	if (!rbdc)
753 		goto out_opt;
754 
755 	kref_init(&rbdc->kref);
756 	INIT_LIST_HEAD(&rbdc->node);
757 
758 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
759 	if (IS_ERR(rbdc->client))
760 		goto out_rbdc;
761 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
762 
763 	ret = ceph_open_session(rbdc->client);
764 	if (ret < 0)
765 		goto out_client;
766 
767 	spin_lock(&rbd_client_list_lock);
768 	list_add_tail(&rbdc->node, &rbd_client_list);
769 	spin_unlock(&rbd_client_list_lock);
770 
771 	dout("%s: rbdc %p\n", __func__, rbdc);
772 
773 	return rbdc;
774 out_client:
775 	ceph_destroy_client(rbdc->client);
776 out_rbdc:
777 	kfree(rbdc);
778 out_opt:
779 	if (ceph_opts)
780 		ceph_destroy_options(ceph_opts);
781 	dout("%s: error %d\n", __func__, ret);
782 
783 	return ERR_PTR(ret);
784 }
785 
786 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
787 {
788 	kref_get(&rbdc->kref);
789 
790 	return rbdc;
791 }
792 
793 /*
794  * Find a ceph client with specific addr and configuration.  If
795  * found, bump its reference count.
796  */
797 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
798 {
799 	struct rbd_client *client_node;
800 	bool found = false;
801 
802 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
803 		return NULL;
804 
805 	spin_lock(&rbd_client_list_lock);
806 	list_for_each_entry(client_node, &rbd_client_list, node) {
807 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
808 			__rbd_get_client(client_node);
809 
810 			found = true;
811 			break;
812 		}
813 	}
814 	spin_unlock(&rbd_client_list_lock);
815 
816 	return found ? client_node : NULL;
817 }
818 
819 /*
820  * (Per device) rbd map options
821  */
822 enum {
823 	Opt_queue_depth,
824 	Opt_alloc_size,
825 	Opt_lock_timeout,
826 	Opt_last_int,
827 	/* int args above */
828 	Opt_pool_ns,
829 	Opt_last_string,
830 	/* string args above */
831 	Opt_read_only,
832 	Opt_read_write,
833 	Opt_lock_on_read,
834 	Opt_exclusive,
835 	Opt_notrim,
836 	Opt_err
837 };
838 
839 static match_table_t rbd_opts_tokens = {
840 	{Opt_queue_depth, "queue_depth=%d"},
841 	{Opt_alloc_size, "alloc_size=%d"},
842 	{Opt_lock_timeout, "lock_timeout=%d"},
843 	/* int args above */
844 	{Opt_pool_ns, "_pool_ns=%s"},
845 	/* string args above */
846 	{Opt_read_only, "read_only"},
847 	{Opt_read_only, "ro"},		/* Alternate spelling */
848 	{Opt_read_write, "read_write"},
849 	{Opt_read_write, "rw"},		/* Alternate spelling */
850 	{Opt_lock_on_read, "lock_on_read"},
851 	{Opt_exclusive, "exclusive"},
852 	{Opt_notrim, "notrim"},
853 	{Opt_err, NULL}
854 };
855 
856 struct rbd_options {
857 	int	queue_depth;
858 	int	alloc_size;
859 	unsigned long	lock_timeout;
860 	bool	read_only;
861 	bool	lock_on_read;
862 	bool	exclusive;
863 	bool	trim;
864 };
865 
866 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
867 #define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
868 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
869 #define RBD_READ_ONLY_DEFAULT	false
870 #define RBD_LOCK_ON_READ_DEFAULT false
871 #define RBD_EXCLUSIVE_DEFAULT	false
872 #define RBD_TRIM_DEFAULT	true
873 
874 struct parse_rbd_opts_ctx {
875 	struct rbd_spec		*spec;
876 	struct rbd_options	*opts;
877 };
878 
879 static int parse_rbd_opts_token(char *c, void *private)
880 {
881 	struct parse_rbd_opts_ctx *pctx = private;
882 	substring_t argstr[MAX_OPT_ARGS];
883 	int token, intval, ret;
884 
885 	token = match_token(c, rbd_opts_tokens, argstr);
886 	if (token < Opt_last_int) {
887 		ret = match_int(&argstr[0], &intval);
888 		if (ret < 0) {
889 			pr_err("bad option arg (not int) at '%s'\n", c);
890 			return ret;
891 		}
892 		dout("got int token %d val %d\n", token, intval);
893 	} else if (token > Opt_last_int && token < Opt_last_string) {
894 		dout("got string token %d val %s\n", token, argstr[0].from);
895 	} else {
896 		dout("got token %d\n", token);
897 	}
898 
899 	switch (token) {
900 	case Opt_queue_depth:
901 		if (intval < 1) {
902 			pr_err("queue_depth out of range\n");
903 			return -EINVAL;
904 		}
905 		pctx->opts->queue_depth = intval;
906 		break;
907 	case Opt_alloc_size:
908 		if (intval < SECTOR_SIZE) {
909 			pr_err("alloc_size out of range\n");
910 			return -EINVAL;
911 		}
912 		if (!is_power_of_2(intval)) {
913 			pr_err("alloc_size must be a power of 2\n");
914 			return -EINVAL;
915 		}
916 		pctx->opts->alloc_size = intval;
917 		break;
918 	case Opt_lock_timeout:
919 		/* 0 is "wait forever" (i.e. infinite timeout) */
920 		if (intval < 0 || intval > INT_MAX / 1000) {
921 			pr_err("lock_timeout out of range\n");
922 			return -EINVAL;
923 		}
924 		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
925 		break;
926 	case Opt_pool_ns:
927 		kfree(pctx->spec->pool_ns);
928 		pctx->spec->pool_ns = match_strdup(argstr);
929 		if (!pctx->spec->pool_ns)
930 			return -ENOMEM;
931 		break;
932 	case Opt_read_only:
933 		pctx->opts->read_only = true;
934 		break;
935 	case Opt_read_write:
936 		pctx->opts->read_only = false;
937 		break;
938 	case Opt_lock_on_read:
939 		pctx->opts->lock_on_read = true;
940 		break;
941 	case Opt_exclusive:
942 		pctx->opts->exclusive = true;
943 		break;
944 	case Opt_notrim:
945 		pctx->opts->trim = false;
946 		break;
947 	default:
948 		/* libceph prints "bad option" msg */
949 		return -EINVAL;
950 	}
951 
952 	return 0;
953 }
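/*
 * A sketch of what a map-time option string does to struct rbd_options
 * (illustrative values; each rbd-specific token from the map string is
 * fed through parse_rbd_opts_token() above):
 *
 *	"queue_depth=128,alloc_size=131072,lock_on_read,notrim"
 *
 *	opts->queue_depth  = 128;	// was RBD_QUEUE_DEPTH_DEFAULT
 *	opts->alloc_size   = 131072;	// power of two >= SECTOR_SIZE
 *	opts->lock_on_read = true;
 *	opts->trim         = false;	// "notrim" clears the default
 */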
954 
955 static char* obj_op_name(enum obj_operation_type op_type)
956 {
957 	switch (op_type) {
958 	case OBJ_OP_READ:
959 		return "read";
960 	case OBJ_OP_WRITE:
961 		return "write";
962 	case OBJ_OP_DISCARD:
963 		return "discard";
964 	case OBJ_OP_ZEROOUT:
965 		return "zeroout";
966 	default:
967 		return "???";
968 	}
969 }
970 
971 /*
972  * Destroy ceph client
973  *
974  * rbd_client_list_lock is taken here; the caller must not hold it.
975  */
976 static void rbd_client_release(struct kref *kref)
977 {
978 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
979 
980 	dout("%s: rbdc %p\n", __func__, rbdc);
981 	spin_lock(&rbd_client_list_lock);
982 	list_del(&rbdc->node);
983 	spin_unlock(&rbd_client_list_lock);
984 
985 	ceph_destroy_client(rbdc->client);
986 	kfree(rbdc);
987 }
988 
989 /*
990  * Drop reference to ceph client node. If it's not referenced anymore, release
991  * it.
992  */
993 static void rbd_put_client(struct rbd_client *rbdc)
994 {
995 	if (rbdc)
996 		kref_put(&rbdc->kref, rbd_client_release);
997 }
998 
999 /*
1000  * Get a ceph client with specific addr and configuration; if one does
1001  * not exist, create it.  Either way, ceph_opts is consumed by this
1002  * function.
1003  */
1004 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
1005 {
1006 	struct rbd_client *rbdc;
1007 	int ret;
1008 
1009 	mutex_lock(&client_mutex);
1010 	rbdc = rbd_client_find(ceph_opts);
1011 	if (rbdc) {
1012 		ceph_destroy_options(ceph_opts);
1013 
1014 		/*
1015 		 * Using an existing client.  Make sure ->pg_pools is up to
1016 		 * date before we look up the pool id in do_rbd_add().
1017 		 */
1018 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
1019 					rbdc->client->options->mount_timeout);
1020 		if (ret) {
1021 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
1022 			rbd_put_client(rbdc);
1023 			rbdc = ERR_PTR(ret);
1024 		}
1025 	} else {
1026 		rbdc = rbd_client_create(ceph_opts);
1027 	}
1028 	mutex_unlock(&client_mutex);
1029 
1030 	return rbdc;
1031 }
1032 
1033 static bool rbd_image_format_valid(u32 image_format)
1034 {
1035 	return image_format == 1 || image_format == 2;
1036 }
1037 
1038 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
1039 {
1040 	size_t size;
1041 	u32 snap_count;
1042 
1043 	/* The header has to start with the magic rbd header text */
1044 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
1045 		return false;
1046 
1047 	/* The bio layer requires at least sector-sized I/O */
1048 
1049 	if (ondisk->options.order < SECTOR_SHIFT)
1050 		return false;
1051 
1052 	/* If we use u64 in a few spots we may be able to loosen this */
1053 
1054 	if (ondisk->options.order > 8 * sizeof (int) - 1)
1055 		return false;
1056 
1057 	/*
1058 	 * The size of a snapshot header has to fit in a size_t, and
1059 	 * that limits the number of snapshots.
1060 	 */
1061 	snap_count = le32_to_cpu(ondisk->snap_count);
1062 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
1063 	if (snap_count > size / sizeof (__le64))
1064 		return false;
1065 
1066 	/*
1067 	 * Not only that, but the size of the entire snapshot
1068 	 * header must also be representable in a size_t.
1069 	 */
1070 	size -= snap_count * sizeof (__le64);
1071 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1072 		return false;
1073 
1074 	return true;
1075 }
1076 
1077 /*
1078  * returns the size of an object in the image
1079  */
1080 static u32 rbd_obj_bytes(struct rbd_image_header *header)
1081 {
1082 	return 1U << header->obj_order;
1083 }
1084 
1085 static void rbd_init_layout(struct rbd_device *rbd_dev)
1086 {
1087 	if (rbd_dev->header.stripe_unit == 0 ||
1088 	    rbd_dev->header.stripe_count == 0) {
1089 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1090 		rbd_dev->header.stripe_count = 1;
1091 	}
1092 
1093 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1094 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1095 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
1096 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
1097 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1098 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1099 }
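/*
 * A short worked example of the layout defaults (illustrative order):
 * with obj_order == 22, rbd_obj_bytes() is 1U << 22 == 4 MiB, so a
 * plain image (no STRIPINGV2) ends up with stripe_unit == 4 MiB and
 * stripe_count == 1, i.e. each 4 MiB slice of the image maps to one
 * RADOS object.  data_pool_id == CEPH_NOPOOL means data lives in the
 * same pool as the header; otherwise layout.pool_id points at the
 * separate data pool.
 */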
1100 
1101 /*
1102  * Fill an rbd image header with information from the given format 1
1103  * on-disk header.
1104  */
1105 static int rbd_header_from_disk(struct rbd_device *rbd_dev,
1106 				 struct rbd_image_header_ondisk *ondisk)
1107 {
1108 	struct rbd_image_header *header = &rbd_dev->header;
1109 	bool first_time = header->object_prefix == NULL;
1110 	struct ceph_snap_context *snapc;
1111 	char *object_prefix = NULL;
1112 	char *snap_names = NULL;
1113 	u64 *snap_sizes = NULL;
1114 	u32 snap_count;
1115 	int ret = -ENOMEM;
1116 	u32 i;
1117 
1118 	/* Allocate this now to avoid having to handle failure below */
1119 
1120 	if (first_time) {
1121 		object_prefix = kstrndup(ondisk->object_prefix,
1122 					 sizeof(ondisk->object_prefix),
1123 					 GFP_KERNEL);
1124 		if (!object_prefix)
1125 			return -ENOMEM;
1126 	}
1127 
1128 	/* Allocate the snapshot context and fill it in */
1129 
1130 	snap_count = le32_to_cpu(ondisk->snap_count);
1131 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1132 	if (!snapc)
1133 		goto out_err;
1134 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1135 	if (snap_count) {
1136 		struct rbd_image_snap_ondisk *snaps;
1137 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1138 
1139 		/* We'll keep a copy of the snapshot names... */
1140 
1141 		if (snap_names_len > (u64)SIZE_MAX)
1142 			goto out_2big;
1143 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1144 		if (!snap_names)
1145 			goto out_err;
1146 
1147 		/* ...as well as the array of their sizes. */
1148 		snap_sizes = kmalloc_array(snap_count,
1149 					   sizeof(*header->snap_sizes),
1150 					   GFP_KERNEL);
1151 		if (!snap_sizes)
1152 			goto out_err;
1153 
1154 		/*
1155 		 * Copy the names, and fill in each snapshot's id
1156 		 * and size.
1157 		 *
1158 		 * Note that rbd_dev_v1_header_info() guarantees the
1159 		 * ondisk buffer we're working with has
1160 		 * snap_names_len bytes beyond the end of the
1161 		 * snapshot id array, so this memcpy() is safe.
1162 		 */
1163 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1164 		snaps = ondisk->snaps;
1165 		for (i = 0; i < snap_count; i++) {
1166 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1167 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1168 		}
1169 	}
1170 
1171 	/* We won't fail any more, fill in the header */
1172 
1173 	if (first_time) {
1174 		header->object_prefix = object_prefix;
1175 		header->obj_order = ondisk->options.order;
1176 		rbd_init_layout(rbd_dev);
1177 	} else {
1178 		ceph_put_snap_context(header->snapc);
1179 		kfree(header->snap_names);
1180 		kfree(header->snap_sizes);
1181 	}
1182 
1183 	/* The remaining fields always get updated (when we refresh) */
1184 
1185 	header->image_size = le64_to_cpu(ondisk->image_size);
1186 	header->snapc = snapc;
1187 	header->snap_names = snap_names;
1188 	header->snap_sizes = snap_sizes;
1189 
1190 	return 0;
1191 out_2big:
1192 	ret = -EIO;
1193 out_err:
1194 	kfree(snap_sizes);
1195 	kfree(snap_names);
1196 	ceph_put_snap_context(snapc);
1197 	kfree(object_prefix);
1198 
1199 	return ret;
1200 }
1201 
1202 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1203 {
1204 	const char *snap_name;
1205 
1206 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1207 
1208 	/* Skip over names until we find the one we are looking for */
1209 
1210 	snap_name = rbd_dev->header.snap_names;
1211 	while (which--)
1212 		snap_name += strlen(snap_name) + 1;
1213 
1214 	return kstrdup(snap_name, GFP_KERNEL);
1215 }
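/*
 * snap_names is a single buffer of NUL-terminated strings laid out back
 * to back, in the same order as the snapshot id array.  A sketch with
 * three snapshots (illustrative names):
 *
 *	"mon\0tue\0wed\0"
 *
 *	which == 0 -> "mon", which == 1 -> "tue", which == 2 -> "wed"
 *
 * The loop above simply hops over `which` strings before duplicating
 * the one it lands on.
 */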
1216 
1217 /*
1218  * Snapshot id comparison function for use with qsort()/bsearch().
1219  * Note that result is for snapshots in *descending* order.
1220  */
1221 static int snapid_compare_reverse(const void *s1, const void *s2)
1222 {
1223 	u64 snap_id1 = *(u64 *)s1;
1224 	u64 snap_id2 = *(u64 *)s2;
1225 
1226 	if (snap_id1 < snap_id2)
1227 		return 1;
1228 	return snap_id1 == snap_id2 ? 0 : -1;
1229 }
1230 
1231 /*
1232  * Search a snapshot context to see if the given snapshot id is
1233  * present.
1234  *
1235  * Returns the position of the snapshot id in the array if it's found,
1236  * or BAD_SNAP_INDEX otherwise.
1237  *
1238  * Note: The snapshot array is kept sorted (by the osd) in
1239  * reverse order, highest snapshot id first.
1240  */
1241 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1242 {
1243 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1244 	u64 *found;
1245 
1246 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1247 				sizeof (snap_id), snapid_compare_reverse);
1248 
1249 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1250 }
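/*
 * A worked example of the reverse-order search (illustrative ids): with
 * snapc->snaps[] == { 12, 8, 5 } (newest first, as the OSD keeps it),
 * snapid_compare_reverse() makes bsearch() see the array as sorted, so
 *
 *	rbd_dev_snap_index(rbd_dev, 8) == 1
 *	rbd_dev_snap_index(rbd_dev, 7) == BAD_SNAP_INDEX
 */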
1251 
1252 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1253 					u64 snap_id)
1254 {
1255 	u32 which;
1256 	const char *snap_name;
1257 
1258 	which = rbd_dev_snap_index(rbd_dev, snap_id);
1259 	if (which == BAD_SNAP_INDEX)
1260 		return ERR_PTR(-ENOENT);
1261 
1262 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1263 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1264 }
1265 
1266 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1267 {
1268 	if (snap_id == CEPH_NOSNAP)
1269 		return RBD_SNAP_HEAD_NAME;
1270 
1271 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1272 	if (rbd_dev->image_format == 1)
1273 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1274 
1275 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1276 }
1277 
1278 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1279 				u64 *snap_size)
1280 {
1281 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1282 	if (snap_id == CEPH_NOSNAP) {
1283 		*snap_size = rbd_dev->header.image_size;
1284 	} else if (rbd_dev->image_format == 1) {
1285 		u32 which;
1286 
1287 		which = rbd_dev_snap_index(rbd_dev, snap_id);
1288 		if (which == BAD_SNAP_INDEX)
1289 			return -ENOENT;
1290 
1291 		*snap_size = rbd_dev->header.snap_sizes[which];
1292 	} else {
1293 		u64 size = 0;
1294 		int ret;
1295 
1296 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1297 		if (ret)
1298 			return ret;
1299 
1300 		*snap_size = size;
1301 	}
1302 	return 0;
1303 }
1304 
1305 static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1306 			u64 *snap_features)
1307 {
1308 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1309 	if (snap_id == CEPH_NOSNAP) {
1310 		*snap_features = rbd_dev->header.features;
1311 	} else if (rbd_dev->image_format == 1) {
1312 		*snap_features = 0;	/* No features for format 1 */
1313 	} else {
1314 		u64 features = 0;
1315 		int ret;
1316 
1317 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1318 		if (ret)
1319 			return ret;
1320 
1321 		*snap_features = features;
1322 	}
1323 	return 0;
1324 }
1325 
1326 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1327 {
1328 	u64 snap_id = rbd_dev->spec->snap_id;
1329 	u64 size = 0;
1330 	u64 features = 0;
1331 	int ret;
1332 
1333 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
1334 	if (ret)
1335 		return ret;
1336 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
1337 	if (ret)
1338 		return ret;
1339 
1340 	rbd_dev->mapping.size = size;
1341 	rbd_dev->mapping.features = features;
1342 
1343 	return 0;
1344 }
1345 
1346 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1347 {
1348 	rbd_dev->mapping.size = 0;
1349 	rbd_dev->mapping.features = 0;
1350 }
1351 
1352 static void zero_bvec(struct bio_vec *bv)
1353 {
1354 	void *buf;
1355 	unsigned long flags;
1356 
1357 	buf = bvec_kmap_irq(bv, &flags);
1358 	memset(buf, 0, bv->bv_len);
1359 	flush_dcache_page(bv->bv_page);
1360 	bvec_kunmap_irq(buf, &flags);
1361 }
1362 
1363 static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1364 {
1365 	struct ceph_bio_iter it = *bio_pos;
1366 
1367 	ceph_bio_iter_advance(&it, off);
1368 	ceph_bio_iter_advance_step(&it, bytes, ({
1369 		zero_bvec(&bv);
1370 	}));
1371 }
1372 
1373 static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1374 {
1375 	struct ceph_bvec_iter it = *bvec_pos;
1376 
1377 	ceph_bvec_iter_advance(&it, off);
1378 	ceph_bvec_iter_advance_step(&it, bytes, ({
1379 		zero_bvec(&bv);
1380 	}));
1381 }
1382 
1383 /*
1384  * Zero a range in @obj_req data buffer defined by a bio (list) or
1385  * (private) bio_vec array.
1386  *
1387  * @off is relative to the start of the data buffer.
1388  */
1389 static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1390 			       u32 bytes)
1391 {
1392 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1393 
1394 	switch (obj_req->img_request->data_type) {
1395 	case OBJ_REQUEST_BIO:
1396 		zero_bios(&obj_req->bio_pos, off, bytes);
1397 		break;
1398 	case OBJ_REQUEST_BVECS:
1399 	case OBJ_REQUEST_OWN_BVECS:
1400 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
1401 		break;
1402 	default:
1403 		BUG();
1404 	}
1405 }
1406 
1407 static void rbd_obj_request_destroy(struct kref *kref);
1408 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1409 {
1410 	rbd_assert(obj_request != NULL);
1411 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1412 		kref_read(&obj_request->kref));
1413 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1414 }
1415 
1416 static void rbd_img_request_destroy(struct kref *kref);
1417 static void rbd_img_request_put(struct rbd_img_request *img_request)
1418 {
1419 	rbd_assert(img_request != NULL);
1420 	dout("%s: img %p (was %d)\n", __func__, img_request,
1421 		kref_read(&img_request->kref));
1422 	kref_put(&img_request->kref, rbd_img_request_destroy);
1423 }
1424 
1425 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1426 					struct rbd_obj_request *obj_request)
1427 {
1428 	rbd_assert(obj_request->img_request == NULL);
1429 
1430 	/* Image request now owns object's original reference */
1431 	obj_request->img_request = img_request;
1432 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1433 }
1434 
1435 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1436 					struct rbd_obj_request *obj_request)
1437 {
1438 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1439 	list_del(&obj_request->ex.oe_item);
1440 	rbd_assert(obj_request->img_request == img_request);
1441 	rbd_obj_request_put(obj_request);
1442 }
1443 
1444 static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1445 {
1446 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1447 
1448 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1449 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1450 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1451 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1452 }
1453 
1454 /*
1455  * The default/initial value for all image request flags is 0.  Each
1456  * is conditionally set to 1 at image request initialization time
1457  * and currently never changes thereafter.
1458  */
1459 static void img_request_layered_set(struct rbd_img_request *img_request)
1460 {
1461 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1462 	smp_mb();
1463 }
1464 
1465 static void img_request_layered_clear(struct rbd_img_request *img_request)
1466 {
1467 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1468 	smp_mb();
1469 }
1470 
1471 static bool img_request_layered_test(struct rbd_img_request *img_request)
1472 {
1473 	smp_mb();
1474 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1475 }
1476 
1477 static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1478 {
1479 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1480 
1481 	return !obj_req->ex.oe_off &&
1482 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
1483 }
1484 
1485 static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1486 {
1487 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1488 
1489 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
1490 					rbd_dev->layout.object_size;
1491 }
1492 
1493 /*
1494  * Must be called after rbd_obj_calc_img_extents().
1495  */
1496 static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1497 {
1498 	if (!obj_req->num_img_extents ||
1499 	    (rbd_obj_is_entire(obj_req) &&
1500 	     !obj_req->img_request->snapc->num_snaps))
1501 		return false;
1502 
1503 	return true;
1504 }
1505 
1506 static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1507 {
1508 	return ceph_file_extents_bytes(obj_req->img_extents,
1509 				       obj_req->num_img_extents);
1510 }
1511 
1512 static bool rbd_img_is_write(struct rbd_img_request *img_req)
1513 {
1514 	switch (img_req->op_type) {
1515 	case OBJ_OP_READ:
1516 		return false;
1517 	case OBJ_OP_WRITE:
1518 	case OBJ_OP_DISCARD:
1519 	case OBJ_OP_ZEROOUT:
1520 		return true;
1521 	default:
1522 		BUG();
1523 	}
1524 }
1525 
1526 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1527 {
1528 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1529 	int result;
1530 
1531 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1532 	     osd_req->r_result, obj_req);
1533 
1534 	/*
1535 	 * Writes aren't allowed to return a data payload.  In some
1536 	 * guarded write cases (e.g. stat + zero on an empty object)
1537 	 * a stat response makes it through, but we don't care.
1538 	 */
1539 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1540 		result = 0;
1541 	else
1542 		result = osd_req->r_result;
1543 
1544 	rbd_obj_handle_request(obj_req, result);
1545 }
1546 
1547 static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1548 {
1549 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1550 
1551 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
1552 	osd_req->r_snapid = obj_request->img_request->snap_id;
1553 }
1554 
1555 static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
1556 {
1557 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1558 
1559 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1560 	ktime_get_real_ts64(&osd_req->r_mtime);
1561 	osd_req->r_data_offset = obj_request->ex.oe_off;
1562 }
1563 
1564 static struct ceph_osd_request *
1565 __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1566 			  struct ceph_snap_context *snapc, int num_ops)
1567 {
1568 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1569 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1570 	struct ceph_osd_request *req;
1571 	const char *name_format = rbd_dev->image_format == 1 ?
1572 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1573 	int ret;
1574 
1575 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1576 	if (!req)
1577 		return ERR_PTR(-ENOMEM);
1578 
1579 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1580 	req->r_callback = rbd_osd_req_callback;
1581 	req->r_priv = obj_req;
1582 
1583 	/*
1584 	 * Data objects may be stored in a separate pool, but always in
1585 	 * the same namespace in that pool as the header in its pool.
1586 	 */
1587 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1588 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1589 
1590 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1591 			       rbd_dev->header.object_prefix,
1592 			       obj_req->ex.oe_objno);
1593 	if (ret)
1594 		return ERR_PTR(ret);
1595 
1596 	return req;
1597 }
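/*
 * A sketch of the resulting data object name (illustrative prefix and
 * image id, assuming the usual "<prefix>.<objno in hex>" forms of
 * RBD_V1_DATA_FORMAT and RBD_V2_DATA_FORMAT from rbd_types.h):
 *
 *	format 1: "rb.0.1234.6b8b4567.000000000005"
 *	format 2: "rbd_data.10056b8b4567.0000000000000005"
 *
 * i.e. header.object_prefix plus the object number, placed in the data
 * pool but in the same namespace as the header object.
 */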
1598 
1599 static struct ceph_osd_request *
1600 rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1601 {
1602 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1603 					 num_ops);
1604 }
1605 
1606 static struct rbd_obj_request *rbd_obj_request_create(void)
1607 {
1608 	struct rbd_obj_request *obj_request;
1609 
1610 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1611 	if (!obj_request)
1612 		return NULL;
1613 
1614 	ceph_object_extent_init(&obj_request->ex);
1615 	INIT_LIST_HEAD(&obj_request->osd_reqs);
1616 	mutex_init(&obj_request->state_mutex);
1617 	kref_init(&obj_request->kref);
1618 
1619 	dout("%s %p\n", __func__, obj_request);
1620 	return obj_request;
1621 }
1622 
1623 static void rbd_obj_request_destroy(struct kref *kref)
1624 {
1625 	struct rbd_obj_request *obj_request;
1626 	struct ceph_osd_request *osd_req;
1627 	u32 i;
1628 
1629 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1630 
1631 	dout("%s: obj %p\n", __func__, obj_request);
1632 
1633 	while (!list_empty(&obj_request->osd_reqs)) {
1634 		osd_req = list_first_entry(&obj_request->osd_reqs,
1635 				    struct ceph_osd_request, r_private_item);
1636 		list_del_init(&osd_req->r_private_item);
1637 		ceph_osdc_put_request(osd_req);
1638 	}
1639 
1640 	switch (obj_request->img_request->data_type) {
1641 	case OBJ_REQUEST_NODATA:
1642 	case OBJ_REQUEST_BIO:
1643 	case OBJ_REQUEST_BVECS:
1644 		break;		/* Nothing to do */
1645 	case OBJ_REQUEST_OWN_BVECS:
1646 		kfree(obj_request->bvec_pos.bvecs);
1647 		break;
1648 	default:
1649 		BUG();
1650 	}
1651 
1652 	kfree(obj_request->img_extents);
1653 	if (obj_request->copyup_bvecs) {
1654 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1655 			if (obj_request->copyup_bvecs[i].bv_page)
1656 				__free_page(obj_request->copyup_bvecs[i].bv_page);
1657 		}
1658 		kfree(obj_request->copyup_bvecs);
1659 	}
1660 
1661 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1662 }
1663 
1664 /* It's OK to call this for a device with no parent */
1665 
1666 static void rbd_spec_put(struct rbd_spec *spec);
1667 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1668 {
1669 	rbd_dev_remove_parent(rbd_dev);
1670 	rbd_spec_put(rbd_dev->parent_spec);
1671 	rbd_dev->parent_spec = NULL;
1672 	rbd_dev->parent_overlap = 0;
1673 }
1674 
1675 /*
1676  * Parent image reference counting is used to determine when an
1677  * image's parent fields can be safely torn down--after there are no
1678  * more in-flight requests to the parent image.  When the last
1679  * reference is dropped, cleaning them up is safe.
1680  */
1681 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1682 {
1683 	int counter;
1684 
1685 	if (!rbd_dev->parent_spec)
1686 		return;
1687 
1688 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1689 	if (counter > 0)
1690 		return;
1691 
1692 	/* Last reference; clean up parent data structures */
1693 
1694 	if (!counter)
1695 		rbd_dev_unparent(rbd_dev);
1696 	else
1697 		rbd_warn(rbd_dev, "parent reference underflow");
1698 }
1699 
1700 /*
1701  * If an image has a non-zero parent overlap, get a reference to its
1702  * parent.
1703  *
1704  * Returns true if the rbd device has a parent with a non-zero
1705  * overlap and a reference for it was successfully taken, or
1706  * false otherwise.
1707  */
1708 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1709 {
1710 	int counter = 0;
1711 
1712 	if (!rbd_dev->parent_spec)
1713 		return false;
1714 
1715 	down_read(&rbd_dev->header_rwsem);
1716 	if (rbd_dev->parent_overlap)
1717 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1718 	up_read(&rbd_dev->header_rwsem);
1719 
1720 	if (counter < 0)
1721 		rbd_warn(rbd_dev, "parent reference overflow");
1722 
1723 	return counter > 0;
1724 }
1725 
1726 /*
1727  * Caller is responsible for filling in the list of object requests
1728  * that comprises the image request, and the Linux request pointer
1729  * (if there is one).
1730  */
1731 static struct rbd_img_request *rbd_img_request_create(
1732 					struct rbd_device *rbd_dev,
1733 					enum obj_operation_type op_type,
1734 					struct ceph_snap_context *snapc)
1735 {
1736 	struct rbd_img_request *img_request;
1737 
1738 	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1739 	if (!img_request)
1740 		return NULL;
1741 
1742 	img_request->rbd_dev = rbd_dev;
1743 	img_request->op_type = op_type;
1744 	if (!rbd_img_is_write(img_request))
1745 		img_request->snap_id = rbd_dev->spec->snap_id;
1746 	else
1747 		img_request->snapc = snapc;
1748 
1749 	if (rbd_dev_parent_get(rbd_dev))
1750 		img_request_layered_set(img_request);
1751 
1752 	INIT_LIST_HEAD(&img_request->lock_item);
1753 	INIT_LIST_HEAD(&img_request->object_extents);
1754 	mutex_init(&img_request->state_mutex);
1755 	kref_init(&img_request->kref);
1756 
1757 	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1758 	     obj_op_name(op_type), img_request);
1759 	return img_request;
1760 }
1761 
1762 static void rbd_img_request_destroy(struct kref *kref)
1763 {
1764 	struct rbd_img_request *img_request;
1765 	struct rbd_obj_request *obj_request;
1766 	struct rbd_obj_request *next_obj_request;
1767 
1768 	img_request = container_of(kref, struct rbd_img_request, kref);
1769 
1770 	dout("%s: img %p\n", __func__, img_request);
1771 
1772 	WARN_ON(!list_empty(&img_request->lock_item));
1773 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1774 		rbd_img_obj_request_del(img_request, obj_request);
1775 
1776 	if (img_request_layered_test(img_request)) {
1777 		img_request_layered_clear(img_request);
1778 		rbd_dev_parent_put(img_request->rbd_dev);
1779 	}
1780 
1781 	if (rbd_img_is_write(img_request))
1782 		ceph_put_snap_context(img_request->snapc);
1783 
1784 	kmem_cache_free(rbd_img_request_cache, img_request);
1785 }
1786 
1787 #define BITS_PER_OBJ	2
1788 #define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
1789 #define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
1790 
1791 static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1792 				   u64 *index, u8 *shift)
1793 {
1794 	u32 off;
1795 
1796 	rbd_assert(objno < rbd_dev->object_map_size);
1797 	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1798 	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1799 }
1800 
1801 static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1802 {
1803 	u64 index;
1804 	u8 shift;
1805 
1806 	lockdep_assert_held(&rbd_dev->object_map_lock);
1807 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1808 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1809 }
1810 
1811 static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1812 {
1813 	u64 index;
1814 	u8 shift;
1815 	u8 *p;
1816 
1817 	lockdep_assert_held(&rbd_dev->object_map_lock);
1818 	rbd_assert(!(val & ~OBJ_MASK));
1819 
1820 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1821 	p = &rbd_dev->object_map[index];
1822 	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1823 }
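/*
 * A worked example of the 2-bits-per-object packing used above: with
 * BITS_PER_OBJ == 2 and OBJS_PER_BYTE == 4, object number 5 lands in
 * byte 1 of the map:
 *
 *	__rbd_object_map_index(rbd_dev, 5, &index, &shift);
 *	// index == 5 / 4 == 1, off == 5 % 4 == 1
 *	// shift == (4 - 1 - 1) * 2 == 4
 *
 * so its state occupies bits 5:4 of object_map[1], and
 * __rbd_object_map_get() masks those two bits back out with OBJ_MASK.
 */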
1824 
1825 static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1826 {
1827 	u8 state;
1828 
1829 	spin_lock(&rbd_dev->object_map_lock);
1830 	state = __rbd_object_map_get(rbd_dev, objno);
1831 	spin_unlock(&rbd_dev->object_map_lock);
1832 	return state;
1833 }
1834 
1835 static bool use_object_map(struct rbd_device *rbd_dev)
1836 {
1837 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1838 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1839 }
1840 
1841 static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1842 {
1843 	u8 state;
1844 
1845 	/* fall back to default logic if object map is disabled or invalid */
1846 	if (!use_object_map(rbd_dev))
1847 		return true;
1848 
1849 	state = rbd_object_map_get(rbd_dev, objno);
1850 	return state != OBJECT_NONEXISTENT;
1851 }
1852 
1853 static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1854 				struct ceph_object_id *oid)
1855 {
1856 	if (snap_id == CEPH_NOSNAP)
1857 		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1858 				rbd_dev->spec->image_id);
1859 	else
1860 		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1861 				rbd_dev->spec->image_id, snap_id);
1862 }
1863 
1864 static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1865 {
1866 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1867 	CEPH_DEFINE_OID_ONSTACK(oid);
1868 	u8 lock_type;
1869 	char *lock_tag;
1870 	struct ceph_locker *lockers;
1871 	u32 num_lockers;
1872 	bool broke_lock = false;
1873 	int ret;
1874 
1875 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1876 
1877 again:
1878 	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1879 			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1880 	if (ret != -EBUSY || broke_lock) {
1881 		if (ret == -EEXIST)
1882 			ret = 0; /* already locked by myself */
1883 		if (ret)
1884 			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1885 		return ret;
1886 	}
1887 
1888 	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1889 				 RBD_LOCK_NAME, &lock_type, &lock_tag,
1890 				 &lockers, &num_lockers);
1891 	if (ret) {
1892 		if (ret == -ENOENT)
1893 			goto again;
1894 
1895 		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1896 		return ret;
1897 	}
1898 
1899 	kfree(lock_tag);
1900 	if (num_lockers == 0)
1901 		goto again;
1902 
1903 	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1904 		 ENTITY_NAME(lockers[0].id.name));
1905 
1906 	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1907 				  RBD_LOCK_NAME, lockers[0].id.cookie,
1908 				  &lockers[0].id.name);
1909 	ceph_free_lockers(lockers, num_lockers);
1910 	if (ret) {
1911 		if (ret == -ENOENT)
1912 			goto again;
1913 
1914 		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1915 		return ret;
1916 	}
1917 
1918 	broke_lock = true;
1919 	goto again;
1920 }
1921 
1922 static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1923 {
1924 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1925 	CEPH_DEFINE_OID_ONSTACK(oid);
1926 	int ret;
1927 
1928 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1929 
1930 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1931 			      "");
1932 	if (ret && ret != -ENOENT)
1933 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1934 }
1935 
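/*
 * The object map reply starts with a BitVector header: a le32 byte
 * count for the header, the usual start-of-decoding fields consumed by
 * ceph_start_decoding() and a le64 object count.  Everything else in
 * the header is skipped by jumping to header_end, leaving *p at the
 * packed per-object state data that the caller copies out.
 */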
1936 static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1937 {
1938 	u8 struct_v;
1939 	u32 struct_len;
1940 	u32 header_len;
1941 	void *header_end;
1942 	int ret;
1943 
1944 	ceph_decode_32_safe(p, end, header_len, e_inval);
1945 	header_end = *p + header_len;
1946 
1947 	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1948 				  &struct_len);
1949 	if (ret)
1950 		return ret;
1951 
1952 	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1953 
1954 	*p = header_end;
1955 	return 0;
1956 
1957 e_inval:
1958 	return -EINVAL;
1959 }
1960 
1961 static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1962 {
1963 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1964 	CEPH_DEFINE_OID_ONSTACK(oid);
1965 	struct page **pages;
1966 	void *p, *end;
1967 	size_t reply_len;
1968 	u64 num_objects;
1969 	u64 object_map_bytes;
1970 	u64 object_map_size;
1971 	int num_pages;
1972 	int ret;
1973 
1974 	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1975 
1976 	num_objects = ceph_get_num_objects(&rbd_dev->layout,
1977 					   rbd_dev->mapping.size);
1978 	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1979 					    BITS_PER_BYTE);
1980 	num_pages = calc_pages_for(0, object_map_bytes) + 1;
1981 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1982 	if (IS_ERR(pages))
1983 		return PTR_ERR(pages);
1984 
1985 	reply_len = num_pages * PAGE_SIZE;
1986 	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1987 	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1988 			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1989 			     NULL, 0, pages, &reply_len);
1990 	if (ret)
1991 		goto out;
1992 
1993 	p = page_address(pages[0]);
1994 	end = p + min(reply_len, (size_t)PAGE_SIZE);
1995 	ret = decode_object_map_header(&p, end, &object_map_size);
1996 	if (ret)
1997 		goto out;
1998 
1999 	if (object_map_size != num_objects) {
2000 		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
2001 			 object_map_size, num_objects);
2002 		ret = -EINVAL;
2003 		goto out;
2004 	}
2005 
2006 	if (offset_in_page(p) + object_map_bytes > reply_len) {
2007 		ret = -EINVAL;
2008 		goto out;
2009 	}
2010 
2011 	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
2012 	if (!rbd_dev->object_map) {
2013 		ret = -ENOMEM;
2014 		goto out;
2015 	}
2016 
2017 	rbd_dev->object_map_size = object_map_size;
2018 	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
2019 				   offset_in_page(p), object_map_bytes);
2020 
2021 out:
2022 	ceph_release_page_vector(pages, num_pages);
2023 	return ret;
2024 }
2025 
2026 static void rbd_object_map_free(struct rbd_device *rbd_dev)
2027 {
2028 	kvfree(rbd_dev->object_map);
2029 	rbd_dev->object_map = NULL;
2030 	rbd_dev->object_map_size = 0;
2031 }
2032 
2033 static int rbd_object_map_load(struct rbd_device *rbd_dev)
2034 {
2035 	int ret;
2036 
2037 	ret = __rbd_object_map_load(rbd_dev);
2038 	if (ret)
2039 		return ret;
2040 
2041 	ret = rbd_dev_v2_get_flags(rbd_dev);
2042 	if (ret) {
2043 		rbd_object_map_free(rbd_dev);
2044 		return ret;
2045 	}
2046 
2047 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
2048 		rbd_warn(rbd_dev, "object map is invalid");
2049 
2050 	return 0;
2051 }
2052 
2053 static int rbd_object_map_open(struct rbd_device *rbd_dev)
2054 {
2055 	int ret;
2056 
2057 	ret = rbd_object_map_lock(rbd_dev);
2058 	if (ret)
2059 		return ret;
2060 
2061 	ret = rbd_object_map_load(rbd_dev);
2062 	if (ret) {
2063 		rbd_object_map_unlock(rbd_dev);
2064 		return ret;
2065 	}
2066 
2067 	return 0;
2068 }
2069 
2070 static void rbd_object_map_close(struct rbd_device *rbd_dev)
2071 {
2072 	rbd_object_map_free(rbd_dev);
2073 	rbd_object_map_unlock(rbd_dev);
2074 }
2075 
2076 /*
2077  * This function needs snap_id (or more precisely just something to
2078  * distinguish between HEAD and snapshot object maps), new_state and
2079  * current_state that were passed to rbd_object_map_update().
2080  *
2081  * To avoid allocating and stashing a context we piggyback on the OSD
2082  * request.  A HEAD update has two ops (the extra one is assert_locked),
2083  * a snapshot update has one.  new_state and current_state are decoded
2084  * from our own object_map_update op, encoded in rbd_cls_object_map_update().
2085  */
2086 static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
2087 					struct ceph_osd_request *osd_req)
2088 {
2089 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2090 	struct ceph_osd_data *osd_data;
2091 	u64 objno;
2092 	u8 state, new_state, current_state;
2093 	bool has_current_state;
2094 	void *p;
2095 
2096 	if (osd_req->r_result)
2097 		return osd_req->r_result;
2098 
2099 	/*
2100 	 * Nothing to do for a snapshot object map.
2101 	 */
2102 	if (osd_req->r_num_ops == 1)
2103 		return 0;
2104 
2105 	/*
2106 	 * Update in-memory HEAD object map.
2107 	 */
2108 	rbd_assert(osd_req->r_num_ops == 2);
2109 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2110 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2111 
2112 	p = page_address(osd_data->pages[0]);
2113 	objno = ceph_decode_64(&p);
2114 	rbd_assert(objno == obj_req->ex.oe_objno);
2115 	rbd_assert(ceph_decode_64(&p) == objno + 1);
2116 	new_state = ceph_decode_8(&p);
2117 	has_current_state = ceph_decode_8(&p);
2118 	if (has_current_state)
2119 		current_state = ceph_decode_8(&p);
2120 
2121 	spin_lock(&rbd_dev->object_map_lock);
2122 	state = __rbd_object_map_get(rbd_dev, objno);
2123 	if (!has_current_state || current_state == state ||
2124 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2125 		__rbd_object_map_set(rbd_dev, objno, new_state);
2126 	spin_unlock(&rbd_dev->object_map_lock);
2127 
2128 	return 0;
2129 }
2130 
2131 static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2132 {
2133 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2134 	int result;
2135 
2136 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2137 	     osd_req->r_result, obj_req);
2138 
2139 	result = rbd_object_map_update_finish(obj_req, osd_req);
2140 	rbd_obj_handle_request(obj_req, result);
2141 }
2142 
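/*
 * Skip updates that wouldn't change anything: the object is already in
 * the requested state, a PENDING pre-deletion mark on an object that
 * doesn't exist, or a NONEXISTENT post-deletion mark on an object that
 * was never marked PENDING.
 */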
2143 static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2144 {
2145 	u8 state = rbd_object_map_get(rbd_dev, objno);
2146 
2147 	if (state == new_state ||
2148 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2149 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2150 		return false;
2151 
2152 	return true;
2153 }
2154 
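/*
 * The object_map_update payload built here (and decoded back in
 * rbd_object_map_update_finish()) covers a single-object range:
 *
 *   le64 start_objno;        (objno)
 *   le64 end_objno;          (objno + 1)
 *   u8   new_state;
 *   u8   has_current_state;
 *   u8   current_state;      (only if has_current_state is non-zero)
 */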
2155 static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2156 				     int which, u64 objno, u8 new_state,
2157 				     const u8 *current_state)
2158 {
2159 	struct page **pages;
2160 	void *p, *start;
2161 	int ret;
2162 
2163 	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2164 	if (ret)
2165 		return ret;
2166 
2167 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2168 	if (IS_ERR(pages))
2169 		return PTR_ERR(pages);
2170 
2171 	p = start = page_address(pages[0]);
2172 	ceph_encode_64(&p, objno);
2173 	ceph_encode_64(&p, objno + 1);
2174 	ceph_encode_8(&p, new_state);
2175 	if (current_state) {
2176 		ceph_encode_8(&p, 1);
2177 		ceph_encode_8(&p, *current_state);
2178 	} else {
2179 		ceph_encode_8(&p, 0);
2180 	}
2181 
2182 	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2183 					  false, true);
2184 	return 0;
2185 }
2186 
2187 /*
2188  * Return:
2189  *   0 - object map update sent
2190  *   1 - object map update isn't needed
2191  *  <0 - error
2192  */
2193 static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2194 				 u8 new_state, const u8 *current_state)
2195 {
2196 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2197 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2198 	struct ceph_osd_request *req;
2199 	int num_ops = 1;
2200 	int which = 0;
2201 	int ret;
2202 
2203 	if (snap_id == CEPH_NOSNAP) {
2204 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2205 			return 1;
2206 
2207 		num_ops++; /* assert_locked */
2208 	}
2209 
2210 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2211 	if (!req)
2212 		return -ENOMEM;
2213 
2214 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2215 	req->r_callback = rbd_object_map_callback;
2216 	req->r_priv = obj_req;
2217 
2218 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2219 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2220 	req->r_flags = CEPH_OSD_FLAG_WRITE;
2221 	ktime_get_real_ts64(&req->r_mtime);
2222 
2223 	if (snap_id == CEPH_NOSNAP) {
2224 		/*
2225 		 * Protect against possible race conditions during lock
2226 		 * ownership transitions.
2227 		 */
2228 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2229 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2230 		if (ret)
2231 			return ret;
2232 	}
2233 
2234 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2235 					new_state, current_state);
2236 	if (ret)
2237 		return ret;
2238 
2239 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2240 	if (ret)
2241 		return ret;
2242 
2243 	ceph_osdc_start_request(osdc, req, false);
2244 	return 0;
2245 }
2246 
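/*
 * Example: with a 4M parent overlap and extents 0~1M, 3M~2M and 6M~1M,
 * the 6M~1M extent is dropped, 3M~2M is trimmed to 3M~1M and 0~1M is
 * kept as is, leaving *num_img_extents == 2.
 */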
2247 static void prune_extents(struct ceph_file_extent *img_extents,
2248 			  u32 *num_img_extents, u64 overlap)
2249 {
2250 	u32 cnt = *num_img_extents;
2251 
2252 	/* drop extents completely beyond the overlap */
2253 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2254 		cnt--;
2255 
2256 	if (cnt) {
2257 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2258 
2259 		/* trim final overlapping extent */
2260 		if (ex->fe_off + ex->fe_len > overlap)
2261 			ex->fe_len = overlap - ex->fe_off;
2262 	}
2263 
2264 	*num_img_extents = cnt;
2265 }
2266 
2267 /*
2268  * Determine the byte range(s) covered by either just the object extent
2269  * or the entire object in the parent image.
2270  */
2271 static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2272 				    bool entire)
2273 {
2274 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2275 	int ret;
2276 
2277 	if (!rbd_dev->parent_overlap)
2278 		return 0;
2279 
2280 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2281 				  entire ? 0 : obj_req->ex.oe_off,
2282 				  entire ? rbd_dev->layout.object_size :
2283 							obj_req->ex.oe_len,
2284 				  &obj_req->img_extents,
2285 				  &obj_req->num_img_extents);
2286 	if (ret)
2287 		return ret;
2288 
2289 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2290 		      rbd_dev->parent_overlap);
2291 	return 0;
2292 }
2293 
2294 static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
2295 {
2296 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2297 
2298 	switch (obj_req->img_request->data_type) {
2299 	case OBJ_REQUEST_BIO:
2300 		osd_req_op_extent_osd_data_bio(osd_req, which,
2301 					       &obj_req->bio_pos,
2302 					       obj_req->ex.oe_len);
2303 		break;
2304 	case OBJ_REQUEST_BVECS:
2305 	case OBJ_REQUEST_OWN_BVECS:
2306 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2307 							obj_req->ex.oe_len);
2308 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2309 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2310 						    &obj_req->bvec_pos);
2311 		break;
2312 	default:
2313 		BUG();
2314 	}
2315 }
2316 
2317 static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2318 {
2319 	struct page **pages;
2320 
2321 	/*
2322 	 * The response data for a STAT call consists of:
2323 	 *     le64 length;
2324 	 *     struct {
2325 	 *         le32 tv_sec;
2326 	 *         le32 tv_nsec;
2327 	 *     } mtime;
2328 	 */
2329 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2330 	if (IS_ERR(pages))
2331 		return PTR_ERR(pages);
2332 
2333 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2334 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
2335 				     8 + sizeof(struct ceph_timespec),
2336 				     0, false, true);
2337 	return 0;
2338 }
2339 
2340 static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2341 				u32 bytes)
2342 {
2343 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2344 	int ret;
2345 
2346 	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2347 	if (ret)
2348 		return ret;
2349 
2350 	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2351 					  obj_req->copyup_bvec_count, bytes);
2352 	return 0;
2353 }
2354 
2355 static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2356 {
2357 	obj_req->read_state = RBD_OBJ_READ_START;
2358 	return 0;
2359 }
2360 
2361 static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2362 				      int which)
2363 {
2364 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2365 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2366 	u16 opcode;
2367 
2368 	if (!use_object_map(rbd_dev) ||
2369 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2370 		osd_req_op_alloc_hint_init(osd_req, which++,
2371 					   rbd_dev->layout.object_size,
2372 					   rbd_dev->layout.object_size);
2373 	}
2374 
2375 	if (rbd_obj_is_entire(obj_req))
2376 		opcode = CEPH_OSD_OP_WRITEFULL;
2377 	else
2378 		opcode = CEPH_OSD_OP_WRITE;
2379 
2380 	osd_req_op_extent_init(osd_req, which, opcode,
2381 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2382 	rbd_osd_setup_data(osd_req, which);
2383 }
2384 
2385 static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
2386 {
2387 	int ret;
2388 
2389 	/* reverse map the entire object onto the parent */
2390 	ret = rbd_obj_calc_img_extents(obj_req, true);
2391 	if (ret)
2392 		return ret;
2393 
2394 	if (rbd_obj_copyup_enabled(obj_req))
2395 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2396 
2397 	obj_req->write_state = RBD_OBJ_WRITE_START;
2398 	return 0;
2399 }
2400 
2401 static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2402 {
2403 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2404 					  CEPH_OSD_OP_ZERO;
2405 }
2406 
2407 static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2408 					int which)
2409 {
2410 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2411 
2412 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2413 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2414 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2415 	} else {
2416 		osd_req_op_extent_init(osd_req, which,
2417 				       truncate_or_zero_opcode(obj_req),
2418 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2419 				       0, 0);
2420 	}
2421 }
2422 
2423 static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
2424 {
2425 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2426 	u64 off, next_off;
2427 	int ret;
2428 
2429 	/*
2430 	 * Align the range to alloc_size boundary and punt on discards
2431 	 * that are too small to free up any space.
2432 	 *
2433 	 * alloc_size == object_size && is_tail() is a special case for
2434 	 * filestore with filestore_punch_hole = false, needed to allow
2435 	 * truncate (in addition to delete).
2436 	 */
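	/*
	 * For example, with alloc_size = 64K a 100K~200K discard is
	 * narrowed to 128K~128K, while a 100K~20K discard yields
	 * off >= next_off and is dropped entirely (return 1).
	 */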
2437 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2438 	    !rbd_obj_is_tail(obj_req)) {
2439 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2440 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2441 				      rbd_dev->opts->alloc_size);
2442 		if (off >= next_off)
2443 			return 1;
2444 
2445 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2446 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2447 		     off, next_off - off);
2448 		obj_req->ex.oe_off = off;
2449 		obj_req->ex.oe_len = next_off - off;
2450 	}
2451 
2452 	/* reverse map the entire object onto the parent */
2453 	ret = rbd_obj_calc_img_extents(obj_req, true);
2454 	if (ret)
2455 		return ret;
2456 
2457 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2458 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2459 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2460 
2461 	obj_req->write_state = RBD_OBJ_WRITE_START;
2462 	return 0;
2463 }
2464 
2465 static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2466 					int which)
2467 {
2468 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2469 	u16 opcode;
2470 
2471 	if (rbd_obj_is_entire(obj_req)) {
2472 		if (obj_req->num_img_extents) {
2473 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2474 				osd_req_op_init(osd_req, which++,
2475 						CEPH_OSD_OP_CREATE, 0);
2476 			opcode = CEPH_OSD_OP_TRUNCATE;
2477 		} else {
2478 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2479 			osd_req_op_init(osd_req, which++,
2480 					CEPH_OSD_OP_DELETE, 0);
2481 			opcode = 0;
2482 		}
2483 	} else {
2484 		opcode = truncate_or_zero_opcode(obj_req);
2485 	}
2486 
2487 	if (opcode)
2488 		osd_req_op_extent_init(osd_req, which, opcode,
2489 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2490 				       0, 0);
2491 }
2492 
2493 static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
2494 {
2495 	int ret;
2496 
2497 	/* reverse map the entire object onto the parent */
2498 	ret = rbd_obj_calc_img_extents(obj_req, true);
2499 	if (ret)
2500 		return ret;
2501 
2502 	if (rbd_obj_copyup_enabled(obj_req))
2503 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2504 	if (!obj_req->num_img_extents) {
2505 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2506 		if (rbd_obj_is_entire(obj_req))
2507 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2508 	}
2509 
2510 	obj_req->write_state = RBD_OBJ_WRITE_START;
2511 	return 0;
2512 }
2513 
2514 static int count_write_ops(struct rbd_obj_request *obj_req)
2515 {
2516 	struct rbd_img_request *img_req = obj_req->img_request;
2517 
2518 	switch (img_req->op_type) {
2519 	case OBJ_OP_WRITE:
2520 		if (!use_object_map(img_req->rbd_dev) ||
2521 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2522 			return 2; /* setallochint + write/writefull */
2523 
2524 		return 1; /* write/writefull */
2525 	case OBJ_OP_DISCARD:
2526 		return 1; /* delete/truncate/zero */
2527 	case OBJ_OP_ZEROOUT:
2528 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2529 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2530 			return 2; /* create + truncate */
2531 
2532 		return 1; /* delete/truncate/zero */
2533 	default:
2534 		BUG();
2535 	}
2536 }
2537 
2538 static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2539 				    int which)
2540 {
2541 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2542 
2543 	switch (obj_req->img_request->op_type) {
2544 	case OBJ_OP_WRITE:
2545 		__rbd_osd_setup_write_ops(osd_req, which);
2546 		break;
2547 	case OBJ_OP_DISCARD:
2548 		__rbd_osd_setup_discard_ops(osd_req, which);
2549 		break;
2550 	case OBJ_OP_ZEROOUT:
2551 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2552 		break;
2553 	default:
2554 		BUG();
2555 	}
2556 }
2557 
2558 /*
2559  * Prune the list of object requests (adjust offset and/or length, drop
2560  * redundant requests).  Prepare object request state machines and image
2561  * request state machine for execution.
2562  */
2563 static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2564 {
2565 	struct rbd_obj_request *obj_req, *next_obj_req;
2566 	int ret;
2567 
2568 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
2569 		switch (img_req->op_type) {
2570 		case OBJ_OP_READ:
2571 			ret = rbd_obj_init_read(obj_req);
2572 			break;
2573 		case OBJ_OP_WRITE:
2574 			ret = rbd_obj_init_write(obj_req);
2575 			break;
2576 		case OBJ_OP_DISCARD:
2577 			ret = rbd_obj_init_discard(obj_req);
2578 			break;
2579 		case OBJ_OP_ZEROOUT:
2580 			ret = rbd_obj_init_zeroout(obj_req);
2581 			break;
2582 		default:
2583 			BUG();
2584 		}
2585 		if (ret < 0)
2586 			return ret;
2587 		if (ret > 0) {
2588 			rbd_img_obj_request_del(img_req, obj_req);
2589 			continue;
2590 		}
2591 	}
2592 
2593 	img_req->state = RBD_IMG_START;
2594 	return 0;
2595 }
2596 
2597 union rbd_img_fill_iter {
2598 	struct ceph_bio_iter	bio_iter;
2599 	struct ceph_bvec_iter	bvec_iter;
2600 };
2601 
2602 struct rbd_img_fill_ctx {
2603 	enum obj_request_type	pos_type;
2604 	union rbd_img_fill_iter	*pos;
2605 	union rbd_img_fill_iter	iter;
2606 	ceph_object_extent_fn_t	set_pos_fn;
2607 	ceph_object_extent_fn_t	count_fn;
2608 	ceph_object_extent_fn_t	copy_fn;
2609 };
2610 
2611 static struct ceph_object_extent *alloc_object_extent(void *arg)
2612 {
2613 	struct rbd_img_request *img_req = arg;
2614 	struct rbd_obj_request *obj_req;
2615 
2616 	obj_req = rbd_obj_request_create();
2617 	if (!obj_req)
2618 		return NULL;
2619 
2620 	rbd_img_obj_request_add(img_req, obj_req);
2621 	return &obj_req->ex;
2622 }
2623 
2624 /*
2625  * While su != os && sc == 1 is technically not fancy (it's the same
2626  * layout as su == os && sc == 1), we can't use the nocopy path for it
2627  * because ->set_pos_fn() should be called only once per object.
2628  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2629  * treat su != os && sc == 1 as fancy.
2630  */
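/*
 * E.g. object_size = 4M, stripe_unit = 1M, stripe_count = 1: the data
 * still lands in a single 4M object, but ceph_file_to_extents() walks
 * it in four 1M steps, so the copy path must be used.
 */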
2631 static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2632 {
2633 	return l->stripe_unit != l->object_size;
2634 }
2635 
2636 static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2637 				       struct ceph_file_extent *img_extents,
2638 				       u32 num_img_extents,
2639 				       struct rbd_img_fill_ctx *fctx)
2640 {
2641 	u32 i;
2642 	int ret;
2643 
2644 	img_req->data_type = fctx->pos_type;
2645 
2646 	/*
2647 	 * Create object requests and set each object request's starting
2648 	 * position in the provided bio (list) or bio_vec array.
2649 	 */
2650 	fctx->iter = *fctx->pos;
2651 	for (i = 0; i < num_img_extents; i++) {
2652 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2653 					   img_extents[i].fe_off,
2654 					   img_extents[i].fe_len,
2655 					   &img_req->object_extents,
2656 					   alloc_object_extent, img_req,
2657 					   fctx->set_pos_fn, &fctx->iter);
2658 		if (ret)
2659 			return ret;
2660 	}
2661 
2662 	return __rbd_img_fill_request(img_req);
2663 }
2664 
2665 /*
2666  * Map a list of image extents to a list of object extents, create the
2667  * corresponding object requests (normally each to a different object,
2668  * but not always) and add them to @img_req.  For each object request,
2669  * set up its data descriptor to point to the corresponding chunk(s) of
2670  * @fctx->pos data buffer.
2671  *
2672  * Because ceph_file_to_extents() will merge adjacent object extents
2673  * together, each object request's data descriptor may point to multiple
2674  * different chunks of @fctx->pos data buffer.
2675  *
2676  * @fctx->pos data buffer is assumed to be large enough.
2677  */
2678 static int rbd_img_fill_request(struct rbd_img_request *img_req,
2679 				struct ceph_file_extent *img_extents,
2680 				u32 num_img_extents,
2681 				struct rbd_img_fill_ctx *fctx)
2682 {
2683 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2684 	struct rbd_obj_request *obj_req;
2685 	u32 i;
2686 	int ret;
2687 
2688 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2689 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2690 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2691 						   num_img_extents, fctx);
2692 
2693 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2694 
2695 	/*
2696 	 * Create object requests and determine ->bvec_count for each object
2697 	 * request.  Note that ->bvec_count sum over all object requests may
2698 	 * be greater than the number of bio_vecs in the provided bio (list)
2699 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2700 	 * stripe unit boundaries.
2701 	 */
2702 	fctx->iter = *fctx->pos;
2703 	for (i = 0; i < num_img_extents; i++) {
2704 		ret = ceph_file_to_extents(&rbd_dev->layout,
2705 					   img_extents[i].fe_off,
2706 					   img_extents[i].fe_len,
2707 					   &img_req->object_extents,
2708 					   alloc_object_extent, img_req,
2709 					   fctx->count_fn, &fctx->iter);
2710 		if (ret)
2711 			return ret;
2712 	}
2713 
2714 	for_each_obj_request(img_req, obj_req) {
2715 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2716 					      sizeof(*obj_req->bvec_pos.bvecs),
2717 					      GFP_NOIO);
2718 		if (!obj_req->bvec_pos.bvecs)
2719 			return -ENOMEM;
2720 	}
2721 
2722 	/*
2723 	 * Fill in each object request's private bio_vec array, splitting and
2724 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2725 	 */
2726 	fctx->iter = *fctx->pos;
2727 	for (i = 0; i < num_img_extents; i++) {
2728 		ret = ceph_iterate_extents(&rbd_dev->layout,
2729 					   img_extents[i].fe_off,
2730 					   img_extents[i].fe_len,
2731 					   &img_req->object_extents,
2732 					   fctx->copy_fn, &fctx->iter);
2733 		if (ret)
2734 			return ret;
2735 	}
2736 
2737 	return __rbd_img_fill_request(img_req);
2738 }
2739 
2740 static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2741 			       u64 off, u64 len)
2742 {
2743 	struct ceph_file_extent ex = { off, len };
2744 	union rbd_img_fill_iter dummy;
2745 	struct rbd_img_fill_ctx fctx = {
2746 		.pos_type = OBJ_REQUEST_NODATA,
2747 		.pos = &dummy,
2748 	};
2749 
2750 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2751 }
2752 
2753 static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2754 {
2755 	struct rbd_obj_request *obj_req =
2756 	    container_of(ex, struct rbd_obj_request, ex);
2757 	struct ceph_bio_iter *it = arg;
2758 
2759 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2760 	obj_req->bio_pos = *it;
2761 	ceph_bio_iter_advance(it, bytes);
2762 }
2763 
2764 static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2765 {
2766 	struct rbd_obj_request *obj_req =
2767 	    container_of(ex, struct rbd_obj_request, ex);
2768 	struct ceph_bio_iter *it = arg;
2769 
2770 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2771 	ceph_bio_iter_advance_step(it, bytes, ({
2772 		obj_req->bvec_count++;
2773 	}));
2774 
2775 }
2776 
2777 static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2778 {
2779 	struct rbd_obj_request *obj_req =
2780 	    container_of(ex, struct rbd_obj_request, ex);
2781 	struct ceph_bio_iter *it = arg;
2782 
2783 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2784 	ceph_bio_iter_advance_step(it, bytes, ({
2785 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2786 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2787 	}));
2788 }
2789 
2790 static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2791 				   struct ceph_file_extent *img_extents,
2792 				   u32 num_img_extents,
2793 				   struct ceph_bio_iter *bio_pos)
2794 {
2795 	struct rbd_img_fill_ctx fctx = {
2796 		.pos_type = OBJ_REQUEST_BIO,
2797 		.pos = (union rbd_img_fill_iter *)bio_pos,
2798 		.set_pos_fn = set_bio_pos,
2799 		.count_fn = count_bio_bvecs,
2800 		.copy_fn = copy_bio_bvecs,
2801 	};
2802 
2803 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2804 				    &fctx);
2805 }
2806 
2807 static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2808 				 u64 off, u64 len, struct bio *bio)
2809 {
2810 	struct ceph_file_extent ex = { off, len };
2811 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2812 
2813 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2814 }
2815 
2816 static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2817 {
2818 	struct rbd_obj_request *obj_req =
2819 	    container_of(ex, struct rbd_obj_request, ex);
2820 	struct ceph_bvec_iter *it = arg;
2821 
2822 	obj_req->bvec_pos = *it;
2823 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2824 	ceph_bvec_iter_advance(it, bytes);
2825 }
2826 
2827 static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2828 {
2829 	struct rbd_obj_request *obj_req =
2830 	    container_of(ex, struct rbd_obj_request, ex);
2831 	struct ceph_bvec_iter *it = arg;
2832 
2833 	ceph_bvec_iter_advance_step(it, bytes, ({
2834 		obj_req->bvec_count++;
2835 	}));
2836 }
2837 
2838 static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2839 {
2840 	struct rbd_obj_request *obj_req =
2841 	    container_of(ex, struct rbd_obj_request, ex);
2842 	struct ceph_bvec_iter *it = arg;
2843 
2844 	ceph_bvec_iter_advance_step(it, bytes, ({
2845 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2846 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2847 	}));
2848 }
2849 
2850 static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2851 				     struct ceph_file_extent *img_extents,
2852 				     u32 num_img_extents,
2853 				     struct ceph_bvec_iter *bvec_pos)
2854 {
2855 	struct rbd_img_fill_ctx fctx = {
2856 		.pos_type = OBJ_REQUEST_BVECS,
2857 		.pos = (union rbd_img_fill_iter *)bvec_pos,
2858 		.set_pos_fn = set_bvec_pos,
2859 		.count_fn = count_bvecs,
2860 		.copy_fn = copy_bvecs,
2861 	};
2862 
2863 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2864 				    &fctx);
2865 }
2866 
2867 static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2868 				   struct ceph_file_extent *img_extents,
2869 				   u32 num_img_extents,
2870 				   struct bio_vec *bvecs)
2871 {
2872 	struct ceph_bvec_iter it = {
2873 		.bvecs = bvecs,
2874 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2875 							     num_img_extents) },
2876 	};
2877 
2878 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2879 					 &it);
2880 }
2881 
2882 static void rbd_img_handle_request_work(struct work_struct *work)
2883 {
2884 	struct rbd_img_request *img_req =
2885 	    container_of(work, struct rbd_img_request, work);
2886 
2887 	rbd_img_handle_request(img_req, img_req->work_result);
2888 }
2889 
2890 static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2891 {
2892 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2893 	img_req->work_result = result;
2894 	queue_work(rbd_wq, &img_req->work);
2895 }
2896 
2897 static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2898 {
2899 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2900 
2901 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2902 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2903 		return true;
2904 	}
2905 
2906 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2907 	     obj_req->ex.oe_objno);
2908 	return false;
2909 }
2910 
2911 static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2912 {
2913 	struct ceph_osd_request *osd_req;
2914 	int ret;
2915 
2916 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2917 	if (IS_ERR(osd_req))
2918 		return PTR_ERR(osd_req);
2919 
2920 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2921 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2922 	rbd_osd_setup_data(osd_req, 0);
2923 	rbd_osd_format_read(osd_req);
2924 
2925 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2926 	if (ret)
2927 		return ret;
2928 
2929 	rbd_osd_submit(osd_req);
2930 	return 0;
2931 }
2932 
2933 static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2934 {
2935 	struct rbd_img_request *img_req = obj_req->img_request;
2936 	struct rbd_img_request *child_img_req;
2937 	int ret;
2938 
2939 	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2940 					       OBJ_OP_READ, NULL);
2941 	if (!child_img_req)
2942 		return -ENOMEM;
2943 
2944 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2945 	child_img_req->obj_request = obj_req;
2946 
2947 	if (!rbd_img_is_write(img_req)) {
2948 		switch (img_req->data_type) {
2949 		case OBJ_REQUEST_BIO:
2950 			ret = __rbd_img_fill_from_bio(child_img_req,
2951 						      obj_req->img_extents,
2952 						      obj_req->num_img_extents,
2953 						      &obj_req->bio_pos);
2954 			break;
2955 		case OBJ_REQUEST_BVECS:
2956 		case OBJ_REQUEST_OWN_BVECS:
2957 			ret = __rbd_img_fill_from_bvecs(child_img_req,
2958 						      obj_req->img_extents,
2959 						      obj_req->num_img_extents,
2960 						      &obj_req->bvec_pos);
2961 			break;
2962 		default:
2963 			BUG();
2964 		}
2965 	} else {
2966 		ret = rbd_img_fill_from_bvecs(child_img_req,
2967 					      obj_req->img_extents,
2968 					      obj_req->num_img_extents,
2969 					      obj_req->copyup_bvecs);
2970 	}
2971 	if (ret) {
2972 		rbd_img_request_put(child_img_req);
2973 		return ret;
2974 	}
2975 
2976 	/* avoid parent chain recursion */
2977 	rbd_img_schedule(child_img_req, 0);
2978 	return 0;
2979 }
2980 
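/*
 * Read state machine: START issues the object read, or jumps straight
 * to OBJECT with -ENOENT if the object map says the object doesn't
 * exist; OBJECT zero-fills holes and short reads, or redirects -ENOENT
 * to the parent image when there is an overlap; PARENT zero-fills
 * whatever lies beyond the parent overlap.
 */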
2981 static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2982 {
2983 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2984 	int ret;
2985 
2986 again:
2987 	switch (obj_req->read_state) {
2988 	case RBD_OBJ_READ_START:
2989 		rbd_assert(!*result);
2990 
2991 		if (!rbd_obj_may_exist(obj_req)) {
2992 			*result = -ENOENT;
2993 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
2994 			goto again;
2995 		}
2996 
2997 		ret = rbd_obj_read_object(obj_req);
2998 		if (ret) {
2999 			*result = ret;
3000 			return true;
3001 		}
3002 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
3003 		return false;
3004 	case RBD_OBJ_READ_OBJECT:
3005 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
3006 			/* reverse map this object extent onto the parent */
3007 			ret = rbd_obj_calc_img_extents(obj_req, false);
3008 			if (ret) {
3009 				*result = ret;
3010 				return true;
3011 			}
3012 			if (obj_req->num_img_extents) {
3013 				ret = rbd_obj_read_from_parent(obj_req);
3014 				if (ret) {
3015 					*result = ret;
3016 					return true;
3017 				}
3018 				obj_req->read_state = RBD_OBJ_READ_PARENT;
3019 				return false;
3020 			}
3021 		}
3022 
3023 		/*
3024 		 * -ENOENT means a hole in the image -- zero-fill the entire
3025 		 * length of the request.  A short read also implies zero-fill
3026 		 * to the end of the request.
3027 		 */
3028 		if (*result == -ENOENT) {
3029 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
3030 			*result = 0;
3031 		} else if (*result >= 0) {
3032 			if (*result < obj_req->ex.oe_len)
3033 				rbd_obj_zero_range(obj_req, *result,
3034 						obj_req->ex.oe_len - *result);
3035 			else
3036 				rbd_assert(*result == obj_req->ex.oe_len);
3037 			*result = 0;
3038 		}
3039 		return true;
3040 	case RBD_OBJ_READ_PARENT:
3041 		/*
3042 		 * The parent image is read only up to the overlap -- zero-fill
3043 		 * from the overlap to the end of the request.
3044 		 */
3045 		if (!*result) {
3046 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
3047 
3048 			if (obj_overlap < obj_req->ex.oe_len)
3049 				rbd_obj_zero_range(obj_req, obj_overlap,
3050 					    obj_req->ex.oe_len - obj_overlap);
3051 		}
3052 		return true;
3053 	default:
3054 		BUG();
3055 	}
3056 }
3057 
3058 static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
3059 {
3060 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3061 
3062 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
3063 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
3064 
3065 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
3066 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
3067 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
3068 		return true;
3069 	}
3070 
3071 	return false;
3072 }
3073 
3074 /*
3075  * Return:
3076  *   0 - object map update sent
3077  *   1 - object map update isn't needed
3078  *  <0 - error
3079  */
3080 static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
3081 {
3082 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3083 	u8 new_state;
3084 
3085 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3086 		return 1;
3087 
3088 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3089 		new_state = OBJECT_PENDING;
3090 	else
3091 		new_state = OBJECT_EXISTS;
3092 
3093 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3094 }
3095 
3096 static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3097 {
3098 	struct ceph_osd_request *osd_req;
3099 	int num_ops = count_write_ops(obj_req);
3100 	int which = 0;
3101 	int ret;
3102 
3103 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3104 		num_ops++; /* stat */
3105 
3106 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3107 	if (IS_ERR(osd_req))
3108 		return PTR_ERR(osd_req);
3109 
3110 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3111 		ret = rbd_osd_setup_stat(osd_req, which++);
3112 		if (ret)
3113 			return ret;
3114 	}
3115 
3116 	rbd_osd_setup_write_ops(osd_req, which);
3117 	rbd_osd_format_write(osd_req);
3118 
3119 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3120 	if (ret)
3121 		return ret;
3122 
3123 	rbd_osd_submit(osd_req);
3124 	return 0;
3125 }
3126 
3127 /*
3128  * copyup_bvecs pages are never highmem pages
3129  */
3130 static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3131 {
3132 	struct ceph_bvec_iter it = {
3133 		.bvecs = bvecs,
3134 		.iter = { .bi_size = bytes },
3135 	};
3136 
3137 	ceph_bvec_iter_advance_step(&it, bytes, ({
3138 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3139 			       bv.bv_len))
3140 			return false;
3141 	}));
3142 	return true;
3143 }
3144 
3145 #define MODS_ONLY	U32_MAX
3146 
3147 static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3148 				      u32 bytes)
3149 {
3150 	struct ceph_osd_request *osd_req;
3151 	int ret;
3152 
3153 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3154 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
3155 
3156 	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3157 	if (IS_ERR(osd_req))
3158 		return PTR_ERR(osd_req);
3159 
3160 	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3161 	if (ret)
3162 		return ret;
3163 
3164 	rbd_osd_format_write(osd_req);
3165 
3166 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3167 	if (ret)
3168 		return ret;
3169 
3170 	rbd_osd_submit(osd_req);
3171 	return 0;
3172 }
3173 
3174 static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3175 					u32 bytes)
3176 {
3177 	struct ceph_osd_request *osd_req;
3178 	int num_ops = count_write_ops(obj_req);
3179 	int which = 0;
3180 	int ret;
3181 
3182 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3183 
3184 	if (bytes != MODS_ONLY)
3185 		num_ops++; /* copyup */
3186 
3187 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3188 	if (IS_ERR(osd_req))
3189 		return PTR_ERR(osd_req);
3190 
3191 	if (bytes != MODS_ONLY) {
3192 		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3193 		if (ret)
3194 			return ret;
3195 	}
3196 
3197 	rbd_osd_setup_write_ops(osd_req, which);
3198 	rbd_osd_format_write(osd_req);
3199 
3200 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3201 	if (ret)
3202 		return ret;
3203 
3204 	rbd_osd_submit(osd_req);
3205 	return 0;
3206 }
3207 
3208 static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3209 {
3210 	u32 i;
3211 
3212 	rbd_assert(!obj_req->copyup_bvecs);
3213 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3214 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3215 					sizeof(*obj_req->copyup_bvecs),
3216 					GFP_NOIO);
3217 	if (!obj_req->copyup_bvecs)
3218 		return -ENOMEM;
3219 
3220 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3221 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3222 
3223 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3224 		if (!obj_req->copyup_bvecs[i].bv_page)
3225 			return -ENOMEM;
3226 
3227 		obj_req->copyup_bvecs[i].bv_offset = 0;
3228 		obj_req->copyup_bvecs[i].bv_len = len;
3229 		obj_overlap -= len;
3230 	}
3231 
3232 	rbd_assert(!obj_overlap);
3233 	return 0;
3234 }
3235 
3236 /*
3237  * The target object doesn't exist.  Read the data for the entire
3238  * target object up to the overlap point (if any) from the parent,
3239  * so we can use it for a copyup.
3240  */
3241 static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
3242 {
3243 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3244 	int ret;
3245 
3246 	rbd_assert(obj_req->num_img_extents);
3247 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3248 		      rbd_dev->parent_overlap);
3249 	if (!obj_req->num_img_extents) {
3250 		/*
3251 		 * The overlap has become 0 (most likely because the
3252 		 * image has been flattened).  Re-submit the original write
3253 		 * request -- pass MODS_ONLY since the copyup isn't needed
3254 		 * anymore.
3255 		 */
3256 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3257 	}
3258 
3259 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3260 	if (ret)
3261 		return ret;
3262 
3263 	return rbd_obj_read_from_parent(obj_req);
3264 }
3265 
3266 static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3267 {
3268 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3269 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3270 	u8 new_state;
3271 	u32 i;
3272 	int ret;
3273 
3274 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3275 
3276 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3277 		return;
3278 
3279 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3280 		return;
3281 
3282 	for (i = 0; i < snapc->num_snaps; i++) {
3283 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3284 		    i + 1 < snapc->num_snaps)
3285 			new_state = OBJECT_EXISTS_CLEAN;
3286 		else
3287 			new_state = OBJECT_EXISTS;
3288 
3289 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3290 					    new_state, NULL);
3291 		if (ret < 0) {
3292 			obj_req->pending.result = ret;
3293 			return;
3294 		}
3295 
3296 		rbd_assert(!ret);
3297 		obj_req->pending.num_pending++;
3298 	}
3299 }
3300 
3301 static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3302 {
3303 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3304 	int ret;
3305 
3306 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3307 
3308 	/*
3309 	 * Only send non-zero copyup data to save some I/O and network
3310 	 * bandwidth -- zero copyup data is equivalent to the object not
3311 	 * existing.
3312 	 */
3313 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3314 		bytes = 0;
3315 
3316 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3317 		/*
3318 		 * Send a copyup request with an empty snapshot context to
3319 		 * deep-copyup the object through all existing snapshots.
3320 		 * A second request with the current snapshot context will be
3321 		 * sent for the actual modification.
3322 		 */
3323 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3324 		if (ret) {
3325 			obj_req->pending.result = ret;
3326 			return;
3327 		}
3328 
3329 		obj_req->pending.num_pending++;
3330 		bytes = MODS_ONLY;
3331 	}
3332 
3333 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3334 	if (ret) {
3335 		obj_req->pending.result = ret;
3336 		return;
3337 	}
3338 
3339 	obj_req->pending.num_pending++;
3340 }
3341 
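/*
 * Copyup state machine: START reads the object's data from the parent,
 * READ_PARENT checks for all-zero data and updates the object map of
 * every affected snapshot, OBJECT_MAPS sends the copyup (deep-copyup
 * with an empty snapc plus the modification, or the modification only
 * for zero data or zero overlap), and WRITE_OBJECT completes.
 */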
3342 static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3343 {
3344 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3345 	int ret;
3346 
3347 again:
3348 	switch (obj_req->copyup_state) {
3349 	case RBD_OBJ_COPYUP_START:
3350 		rbd_assert(!*result);
3351 
3352 		ret = rbd_obj_copyup_read_parent(obj_req);
3353 		if (ret) {
3354 			*result = ret;
3355 			return true;
3356 		}
3357 		if (obj_req->num_img_extents)
3358 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3359 		else
3360 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3361 		return false;
3362 	case RBD_OBJ_COPYUP_READ_PARENT:
3363 		if (*result)
3364 			return true;
3365 
3366 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3367 				  rbd_obj_img_extents_bytes(obj_req))) {
3368 			dout("%s %p detected zeros\n", __func__, obj_req);
3369 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3370 		}
3371 
3372 		rbd_obj_copyup_object_maps(obj_req);
3373 		if (!obj_req->pending.num_pending) {
3374 			*result = obj_req->pending.result;
3375 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3376 			goto again;
3377 		}
3378 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3379 		return false;
3380 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3381 		if (!pending_result_dec(&obj_req->pending, result))
3382 			return false;
3383 		/* fall through */
3384 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
3385 		if (*result) {
3386 			rbd_warn(rbd_dev, "snap object map update failed: %d",
3387 				 *result);
3388 			return true;
3389 		}
3390 
3391 		rbd_obj_copyup_write_object(obj_req);
3392 		if (!obj_req->pending.num_pending) {
3393 			*result = obj_req->pending.result;
3394 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3395 			goto again;
3396 		}
3397 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3398 		return false;
3399 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3400 		if (!pending_result_dec(&obj_req->pending, result))
3401 			return false;
3402 		/* fall through */
3403 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3404 		return true;
3405 	default:
3406 		BUG();
3407 	}
3408 }
3409 
3410 /*
3411  * Return:
3412  *   0 - object map update sent
3413  *   1 - object map update isn't needed
3414  *  <0 - error
3415  */
3416 static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3417 {
3418 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3419 	u8 current_state = OBJECT_PENDING;
3420 
3421 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3422 		return 1;
3423 
3424 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3425 		return 1;
3426 
3427 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3428 				     &current_state);
3429 }
3430 
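/*
 * Write state machine: START bails out early for no-op writes and
 * otherwise sends the pre object map update (EXISTS, or PENDING for
 * deletions), PRE_OBJECT_MAP issues the actual object write, OBJECT
 * switches to the copyup sub-state machine on -ENOENT if copyup is
 * enabled, and COPYUP/POST_OBJECT_MAP finish with the post object map
 * update (marking a deleted object NONEXISTENT).
 */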
3431 static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3432 {
3433 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3434 	int ret;
3435 
3436 again:
3437 	switch (obj_req->write_state) {
3438 	case RBD_OBJ_WRITE_START:
3439 		rbd_assert(!*result);
3440 
3441 		if (rbd_obj_write_is_noop(obj_req))
3442 			return true;
3443 
3444 		ret = rbd_obj_write_pre_object_map(obj_req);
3445 		if (ret < 0) {
3446 			*result = ret;
3447 			return true;
3448 		}
3449 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3450 		if (ret > 0)
3451 			goto again;
3452 		return false;
3453 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3454 		if (*result) {
3455 			rbd_warn(rbd_dev, "pre object map update failed: %d",
3456 				 *result);
3457 			return true;
3458 		}
3459 		ret = rbd_obj_write_object(obj_req);
3460 		if (ret) {
3461 			*result = ret;
3462 			return true;
3463 		}
3464 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3465 		return false;
3466 	case RBD_OBJ_WRITE_OBJECT:
3467 		if (*result == -ENOENT) {
3468 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3469 				*result = 0;
3470 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3471 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3472 				goto again;
3473 			}
3474 			/*
3475 			 * On a non-existent object, a delete returns
3476 			 * -ENOENT while a truncate/zero returns 0.
3477 			 */
3478 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3479 				*result = 0;
3480 		}
3481 		if (*result)
3482 			return true;
3483 
3484 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3485 		goto again;
3486 	case __RBD_OBJ_WRITE_COPYUP:
3487 		if (!rbd_obj_advance_copyup(obj_req, result))
3488 			return false;
3489 		/* fall through */
3490 	case RBD_OBJ_WRITE_COPYUP:
3491 		if (*result) {
3492 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3493 			return true;
3494 		}
3495 		ret = rbd_obj_write_post_object_map(obj_req);
3496 		if (ret < 0) {
3497 			*result = ret;
3498 			return true;
3499 		}
3500 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3501 		if (ret > 0)
3502 			goto again;
3503 		return false;
3504 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3505 		if (*result)
3506 			rbd_warn(rbd_dev, "post object map update failed: %d",
3507 				 *result);
3508 		return true;
3509 	default:
3510 		BUG();
3511 	}
3512 }
3513 
3514 /*
3515  * Return true if @obj_req is completed.
3516  */
3517 static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3518 				     int *result)
3519 {
3520 	struct rbd_img_request *img_req = obj_req->img_request;
3521 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3522 	bool done;
3523 
3524 	mutex_lock(&obj_req->state_mutex);
3525 	if (!rbd_img_is_write(img_req))
3526 		done = rbd_obj_advance_read(obj_req, result);
3527 	else
3528 		done = rbd_obj_advance_write(obj_req, result);
3529 	mutex_unlock(&obj_req->state_mutex);
3530 
3531 	if (done && *result) {
3532 		rbd_assert(*result < 0);
3533 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3534 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3535 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3536 	}
3537 	return done;
3538 }
3539 
3540 /*
3541  * This is open-coded in rbd_img_handle_request() to avoid parent chain
3542  * recursion.
3543  */
3544 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3545 {
3546 	if (__rbd_obj_handle_request(obj_req, &result))
3547 		rbd_img_handle_request(obj_req->img_request, result);
3548 }
3549 
3550 static bool need_exclusive_lock(struct rbd_img_request *img_req)
3551 {
3552 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3553 
3554 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3555 		return false;
3556 
3557 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
3558 		return false;
3559 
3560 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3561 	if (rbd_dev->opts->lock_on_read ||
3562 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3563 		return true;
3564 
3565 	return rbd_img_is_write(img_req);
3566 }
3567 
3568 static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3569 {
3570 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3571 	bool locked;
3572 
3573 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3574 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3575 	spin_lock(&rbd_dev->lock_lists_lock);
3576 	rbd_assert(list_empty(&img_req->lock_item));
3577 	if (!locked)
3578 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3579 	else
3580 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3581 	spin_unlock(&rbd_dev->lock_lists_lock);
3582 	return locked;
3583 }
3584 
3585 static void rbd_lock_del_request(struct rbd_img_request *img_req)
3586 {
3587 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3588 	bool need_wakeup;
3589 
3590 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3591 	spin_lock(&rbd_dev->lock_lists_lock);
3592 	rbd_assert(!list_empty(&img_req->lock_item));
3593 	list_del_init(&img_req->lock_item);
3594 	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3595 		       list_empty(&rbd_dev->running_list));
3596 	spin_unlock(&rbd_dev->lock_lists_lock);
3597 	if (need_wakeup)
3598 		complete(&rbd_dev->releasing_wait);
3599 }
3600 
3601 static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3602 {
3603 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3604 
3605 	if (!need_exclusive_lock(img_req))
3606 		return 1;
3607 
3608 	if (rbd_lock_add_request(img_req))
3609 		return 1;
3610 
3611 	if (rbd_dev->opts->exclusive) {
3612 		WARN_ON(1); /* lock got released? */
3613 		return -EROFS;
3614 	}
3615 
3616 	/*
3617 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3618 	 * and cancel_delayed_work() in wake_lock_waiters().
3619 	 */
3620 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3621 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3622 	return 0;
3623 }
3624 
3625 static void rbd_img_object_requests(struct rbd_img_request *img_req)
3626 {
3627 	struct rbd_obj_request *obj_req;
3628 
3629 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3630 
3631 	for_each_obj_request(img_req, obj_req) {
3632 		int result = 0;
3633 
3634 		if (__rbd_obj_handle_request(obj_req, &result)) {
3635 			if (result) {
3636 				img_req->pending.result = result;
3637 				return;
3638 			}
3639 		} else {
3640 			img_req->pending.num_pending++;
3641 		}
3642 	}
3643 }
3644 
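/*
 * Image request state machine: START takes (or queues for) the
 * exclusive lock when one is needed, EXCLUSIVE_LOCK kicks off all
 * object requests, and __OBJECT_REQUESTS/OBJECT_REQUESTS wait for them
 * to complete, folding their results into *result.
 */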
3645 static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3646 {
3647 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3648 	int ret;
3649 
3650 again:
3651 	switch (img_req->state) {
3652 	case RBD_IMG_START:
3653 		rbd_assert(!*result);
3654 
3655 		ret = rbd_img_exclusive_lock(img_req);
3656 		if (ret < 0) {
3657 			*result = ret;
3658 			return true;
3659 		}
3660 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3661 		if (ret > 0)
3662 			goto again;
3663 		return false;
3664 	case RBD_IMG_EXCLUSIVE_LOCK:
3665 		if (*result)
3666 			return true;
3667 
3668 		rbd_assert(!need_exclusive_lock(img_req) ||
3669 			   __rbd_is_lock_owner(rbd_dev));
3670 
3671 		rbd_img_object_requests(img_req);
3672 		if (!img_req->pending.num_pending) {
3673 			*result = img_req->pending.result;
3674 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
3675 			goto again;
3676 		}
3677 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3678 		return false;
3679 	case __RBD_IMG_OBJECT_REQUESTS:
3680 		if (!pending_result_dec(&img_req->pending, result))
3681 			return false;
3682 		/* fall through */
3683 	case RBD_IMG_OBJECT_REQUESTS:
3684 		return true;
3685 	default:
3686 		BUG();
3687 	}
3688 }
3689 
3690 /*
3691  * Return true if @img_req is completed.
3692  */
3693 static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3694 				     int *result)
3695 {
3696 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3697 	bool done;
3698 
3699 	if (need_exclusive_lock(img_req)) {
3700 		down_read(&rbd_dev->lock_rwsem);
3701 		mutex_lock(&img_req->state_mutex);
3702 		done = rbd_img_advance(img_req, result);
3703 		if (done)
3704 			rbd_lock_del_request(img_req);
3705 		mutex_unlock(&img_req->state_mutex);
3706 		up_read(&rbd_dev->lock_rwsem);
3707 	} else {
3708 		mutex_lock(&img_req->state_mutex);
3709 		done = rbd_img_advance(img_req, result);
3710 		mutex_unlock(&img_req->state_mutex);
3711 	}
3712 
3713 	if (done && *result) {
3714 		rbd_assert(*result < 0);
3715 		rbd_warn(rbd_dev, "%s%s result %d",
3716 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3717 		      obj_op_name(img_req->op_type), *result);
3718 	}
3719 	return done;
3720 }
3721 
3722 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3723 {
3724 again:
3725 	if (!__rbd_img_handle_request(img_req, &result))
3726 		return;
3727 
3728 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
3729 		struct rbd_obj_request *obj_req = img_req->obj_request;
3730 
3731 		rbd_img_request_put(img_req);
3732 		if (__rbd_obj_handle_request(obj_req, &result)) {
3733 			img_req = obj_req->img_request;
3734 			goto again;
3735 		}
3736 	} else {
3737 		struct request *rq = img_req->rq;
3738 
3739 		rbd_img_request_put(img_req);
3740 		blk_mq_end_request(rq, errno_to_blk_status(result));
3741 	}
3742 }
3743 
3744 static const struct rbd_client_id rbd_empty_cid;
3745 
3746 static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3747 			  const struct rbd_client_id *rhs)
3748 {
3749 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3750 }
3751 
3752 static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3753 {
3754 	struct rbd_client_id cid;
3755 
3756 	mutex_lock(&rbd_dev->watch_mutex);
3757 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3758 	cid.handle = rbd_dev->watch_cookie;
3759 	mutex_unlock(&rbd_dev->watch_mutex);
3760 	return cid;
3761 }
3762 
3763 /*
3764  * lock_rwsem must be held for write
3765  */
3766 static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3767 			      const struct rbd_client_id *cid)
3768 {
3769 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3770 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3771 	     cid->gid, cid->handle);
3772 	rbd_dev->owner_cid = *cid; /* struct */
3773 }
3774 
3775 static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3776 {
3777 	mutex_lock(&rbd_dev->watch_mutex);
3778 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3779 	mutex_unlock(&rbd_dev->watch_mutex);
3780 }
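/*
 * A minimal example of the cookie produced above, assuming
 * RBD_LOCK_COOKIE_PREFIX is "auto" and the current watch cookie is 42:
 *
 *   "auto 42"
 *
 * The watch cookie is re-read under watch_mutex whenever the lock
 * cookie needs to be regenerated (see rbd_reacquire_lock()).
 */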
3781 
3782 static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3783 {
3784 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3785 
3786 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3787 	strcpy(rbd_dev->lock_cookie, cookie);
3788 	rbd_set_owner_cid(rbd_dev, &cid);
3789 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3790 }
3791 
3792 /*
3793  * lock_rwsem must be held for write
3794  */
3795 static int rbd_lock(struct rbd_device *rbd_dev)
3796 {
3797 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3798 	char cookie[32];
3799 	int ret;
3800 
3801 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3802 		rbd_dev->lock_cookie[0] != '\0');
3803 
3804 	format_lock_cookie(rbd_dev, cookie);
3805 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3806 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3807 			    RBD_LOCK_TAG, "", 0);
3808 	if (ret)
3809 		return ret;
3810 
3811 	__rbd_lock(rbd_dev, cookie);
3812 	return 0;
3813 }
3814 
3815 /*
3816  * lock_rwsem must be held for write
3817  */
3818 static void rbd_unlock(struct rbd_device *rbd_dev)
3819 {
3820 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3821 	int ret;
3822 
3823 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3824 		rbd_dev->lock_cookie[0] == '\0');
3825 
3826 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3827 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3828 	if (ret && ret != -ENOENT)
3829 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3830 
3831 	/* treat errors as the image is unlocked */
3832 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3833 	rbd_dev->lock_cookie[0] = '\0';
3834 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3835 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3836 }
3837 
3838 static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3839 				enum rbd_notify_op notify_op,
3840 				struct page ***preply_pages,
3841 				size_t *preply_len)
3842 {
3843 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3844 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3845 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3846 	int buf_size = sizeof(buf);
3847 	void *p = buf;
3848 
3849 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3850 
3851 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3852 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3853 	ceph_encode_32(&p, notify_op);
3854 	ceph_encode_64(&p, cid.gid);
3855 	ceph_encode_64(&p, cid.handle);
3856 
3857 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3858 				&rbd_dev->header_oloc, buf, buf_size,
3859 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3860 }
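/*
 * Rough layout of the notify payload encoded above (little-endian on
 * the wire, per the ceph encoding helpers):
 *
 *   ceph_start_encoding() header   (CEPH_ENCODING_START_BLK_LEN bytes)
 *   u32  notify_op                 (RBD_NOTIFY_OP_*)
 *   u64  cid.gid                   \ ClientId of the sender
 *   u64  cid.handle                /
 *
 * which is why buf is sized 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN.
 */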
3861 
3862 static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3863 			       enum rbd_notify_op notify_op)
3864 {
3865 	struct page **reply_pages;
3866 	size_t reply_len;
3867 
3868 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3869 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3870 }
3871 
3872 static void rbd_notify_acquired_lock(struct work_struct *work)
3873 {
3874 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3875 						  acquired_lock_work);
3876 
3877 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3878 }
3879 
3880 static void rbd_notify_released_lock(struct work_struct *work)
3881 {
3882 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3883 						  released_lock_work);
3884 
3885 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3886 }
3887 
3888 static int rbd_request_lock(struct rbd_device *rbd_dev)
3889 {
3890 	struct page **reply_pages;
3891 	size_t reply_len;
3892 	bool lock_owner_responded = false;
3893 	int ret;
3894 
3895 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3896 
3897 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3898 				   &reply_pages, &reply_len);
3899 	if (ret && ret != -ETIMEDOUT) {
3900 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3901 		goto out;
3902 	}
3903 
3904 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3905 		void *p = page_address(reply_pages[0]);
3906 		void *const end = p + reply_len;
3907 		u32 n;
3908 
3909 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3910 		while (n--) {
3911 			u8 struct_v;
3912 			u32 len;
3913 
3914 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3915 			p += 8 + 8; /* skip gid and cookie */
3916 
3917 			ceph_decode_32_safe(&p, end, len, e_inval);
3918 			if (!len)
3919 				continue;
3920 
3921 			if (lock_owner_responded) {
3922 				rbd_warn(rbd_dev,
3923 					 "duplicate lock owners detected");
3924 				ret = -EIO;
3925 				goto out;
3926 			}
3927 
3928 			lock_owner_responded = true;
3929 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3930 						  &struct_v, &len);
3931 			if (ret) {
3932 				rbd_warn(rbd_dev,
3933 					 "failed to decode ResponseMessage: %d",
3934 					 ret);
3935 				goto e_inval;
3936 			}
3937 
3938 			ret = ceph_decode_32(&p);
3939 		}
3940 	}
3941 
3942 	if (!lock_owner_responded) {
3943 		rbd_warn(rbd_dev, "no lock owners detected");
3944 		ret = -ETIMEDOUT;
3945 	}
3946 
3947 out:
3948 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3949 	return ret;
3950 
3951 e_inval:
3952 	ret = -EINVAL;
3953 	goto out;
3954 }
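/*
 * For reference, the notify reply parsed above looks roughly like this
 * (one entry per acker):
 *
 *   u32 num_acks
 *   repeated num_acks times:
 *     u64 gid, u64 cookie          (skipped)
 *     u32 payload_len
 *     if payload_len != 0:         ResponseMessage from the lock owner
 *       ceph_start_decoding() header
 *       s32 result                 (0, or -EROFS if it refuses to release)
 *
 * Only one non-empty payload (the current lock owner) is expected;
 * anything else is reported as "duplicate lock owners detected".
 */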
3955 
3956 /*
3957  * Wake whoever is waiting for the lock: either the image request
3958  * state machine(s) or rbd_add_acquire_lock() (i.e. "rbd map").
3959  */
3960 static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3961 {
3962 	struct rbd_img_request *img_req;
3963 
3964 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3965 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3966 
3967 	cancel_delayed_work(&rbd_dev->lock_dwork);
3968 	if (!completion_done(&rbd_dev->acquire_wait)) {
3969 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3970 			   list_empty(&rbd_dev->running_list));
3971 		rbd_dev->acquire_err = result;
3972 		complete_all(&rbd_dev->acquire_wait);
3973 		return;
3974 	}
3975 
3976 	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3977 		mutex_lock(&img_req->state_mutex);
3978 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3979 		rbd_img_schedule(img_req, result);
3980 		mutex_unlock(&img_req->state_mutex);
3981 	}
3982 
3983 	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3984 }
3985 
3986 static int get_lock_owner_info(struct rbd_device *rbd_dev,
3987 			       struct ceph_locker **lockers, u32 *num_lockers)
3988 {
3989 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3990 	u8 lock_type;
3991 	char *lock_tag;
3992 	int ret;
3993 
3994 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3995 
3996 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3997 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3998 				 &lock_type, &lock_tag, lockers, num_lockers);
3999 	if (ret)
4000 		return ret;
4001 
4002 	if (*num_lockers == 0) {
4003 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
4004 		goto out;
4005 	}
4006 
4007 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
4008 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
4009 			 lock_tag);
4010 		ret = -EBUSY;
4011 		goto out;
4012 	}
4013 
4014 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
4015 		rbd_warn(rbd_dev, "shared lock type detected");
4016 		ret = -EBUSY;
4017 		goto out;
4018 	}
4019 
4020 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
4021 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
4022 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
4023 			 (*lockers)[0].id.cookie);
4024 		ret = -EBUSY;
4025 		goto out;
4026 	}
4027 
4028 out:
4029 	kfree(lock_tag);
4030 	return ret;
4031 }
4032 
4033 static int find_watcher(struct rbd_device *rbd_dev,
4034 			const struct ceph_locker *locker)
4035 {
4036 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4037 	struct ceph_watch_item *watchers;
4038 	u32 num_watchers;
4039 	u64 cookie;
4040 	int i;
4041 	int ret;
4042 
4043 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
4044 				      &rbd_dev->header_oloc, &watchers,
4045 				      &num_watchers);
4046 	if (ret)
4047 		return ret;
4048 
4049 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
4050 	for (i = 0; i < num_watchers; i++) {
4051 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
4052 			    sizeof(locker->info.addr)) &&
4053 		    watchers[i].cookie == cookie) {
4054 			struct rbd_client_id cid = {
4055 				.gid = le64_to_cpu(watchers[i].name.num),
4056 				.handle = cookie,
4057 			};
4058 
4059 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
4060 			     rbd_dev, cid.gid, cid.handle);
4061 			rbd_set_owner_cid(rbd_dev, &cid);
4062 			ret = 1;
4063 			goto out;
4064 		}
4065 	}
4066 
4067 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
4068 	ret = 0;
4069 out:
4070 	kfree(watchers);
4071 	return ret;
4072 }
4073 
4074 /*
4075  * lock_rwsem must be held for write
4076  */
4077 static int rbd_try_lock(struct rbd_device *rbd_dev)
4078 {
4079 	struct ceph_client *client = rbd_dev->rbd_client->client;
4080 	struct ceph_locker *lockers;
4081 	u32 num_lockers;
4082 	int ret;
4083 
4084 	for (;;) {
4085 		ret = rbd_lock(rbd_dev);
4086 		if (ret != -EBUSY)
4087 			return ret;
4088 
4089 		/* determine if the current lock holder is still alive */
4090 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4091 		if (ret)
4092 			return ret;
4093 
4094 		if (num_lockers == 0)
4095 			goto again;
4096 
4097 		ret = find_watcher(rbd_dev, lockers);
4098 		if (ret)
4099 			goto out; /* request lock or error */
4100 
4101 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4102 			 ENTITY_NAME(lockers[0].id.name));
4103 
4104 		ret = ceph_monc_blacklist_add(&client->monc,
4105 					      &lockers[0].info.addr);
4106 		if (ret) {
4107 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
4108 				 ENTITY_NAME(lockers[0].id.name), ret);
4109 			goto out;
4110 		}
4111 
4112 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4113 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4114 					  lockers[0].id.cookie,
4115 					  &lockers[0].id.name);
4116 		if (ret && ret != -ENOENT)
4117 			goto out;
4118 
4119 again:
4120 		ceph_free_lockers(lockers, num_lockers);
4121 	}
4122 
4123 out:
4124 	ceph_free_lockers(lockers, num_lockers);
4125 	return ret;
4126 }
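/*
 * In outline, rbd_try_lock() above does the following:
 *
 *   1. try to take the exclusive lock (rbd_lock());
 *   2. on -EBUSY, look up the current lock owner (get_lock_owner_info());
 *   3. if the owner still has a watch on the header object
 *      (find_watcher()), give up and let the caller request the lock;
 *   4. otherwise assume the owner is dead: blacklist it, break its
 *      lock and retry from step 1.
 */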
4127 
4128 static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4129 {
4130 	int ret;
4131 
4132 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4133 		ret = rbd_object_map_open(rbd_dev);
4134 		if (ret)
4135 			return ret;
4136 	}
4137 
4138 	return 0;
4139 }
4140 
4141 /*
4142  * Return:
4143  *   0 - lock acquired
4144  *   1 - caller should call rbd_request_lock()
4145  *  <0 - error
4146  */
4147 static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4148 {
4149 	int ret;
4150 
4151 	down_read(&rbd_dev->lock_rwsem);
4152 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4153 	     rbd_dev->lock_state);
4154 	if (__rbd_is_lock_owner(rbd_dev)) {
4155 		up_read(&rbd_dev->lock_rwsem);
4156 		return 0;
4157 	}
4158 
4159 	up_read(&rbd_dev->lock_rwsem);
4160 	down_write(&rbd_dev->lock_rwsem);
4161 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4162 	     rbd_dev->lock_state);
4163 	if (__rbd_is_lock_owner(rbd_dev)) {
4164 		up_write(&rbd_dev->lock_rwsem);
4165 		return 0;
4166 	}
4167 
4168 	ret = rbd_try_lock(rbd_dev);
4169 	if (ret < 0) {
4170 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4171 		if (ret == -EBLACKLISTED)
4172 			goto out;
4173 
4174 		ret = 1; /* request lock anyway */
4175 	}
4176 	if (ret > 0) {
4177 		up_write(&rbd_dev->lock_rwsem);
4178 		return ret;
4179 	}
4180 
4181 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4182 	rbd_assert(list_empty(&rbd_dev->running_list));
4183 
4184 	ret = rbd_post_acquire_action(rbd_dev);
4185 	if (ret) {
4186 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4187 		/*
4188 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
4189 		 * rbd_lock_add_request() would let the request through,
4190 		 * assuming that e.g. object map is locked and loaded.
4191 		 */
4192 		rbd_unlock(rbd_dev);
4193 	}
4194 
4195 out:
4196 	wake_lock_waiters(rbd_dev, ret);
4197 	up_write(&rbd_dev->lock_rwsem);
4198 	return ret;
4199 }
4200 
4201 static void rbd_acquire_lock(struct work_struct *work)
4202 {
4203 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4204 					    struct rbd_device, lock_dwork);
4205 	int ret;
4206 
4207 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4208 again:
4209 	ret = rbd_try_acquire_lock(rbd_dev);
4210 	if (ret <= 0) {
4211 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4212 		return;
4213 	}
4214 
4215 	ret = rbd_request_lock(rbd_dev);
4216 	if (ret == -ETIMEDOUT) {
4217 		goto again; /* treat this as a dead client */
4218 	} else if (ret == -EROFS) {
4219 		rbd_warn(rbd_dev, "peer will not release lock");
4220 		down_write(&rbd_dev->lock_rwsem);
4221 		wake_lock_waiters(rbd_dev, ret);
4222 		up_write(&rbd_dev->lock_rwsem);
4223 	} else if (ret < 0) {
4224 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4225 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4226 				 RBD_RETRY_DELAY);
4227 	} else {
4228 		/*
4229 		 * lock owner acked, but resend if we don't see them
4230 		 * release the lock
4231 		 */
4232 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
4233 		     rbd_dev);
4234 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4235 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4236 	}
4237 }
4238 
4239 static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4240 {
4241 	bool need_wait;
4242 
4243 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4244 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4245 
4246 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4247 		return false;
4248 
4249 	/*
4250 	 * Ensure that all in-flight IO is flushed.
4251 	 */
4252 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4253 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4254 	need_wait = !list_empty(&rbd_dev->running_list);
4255 	downgrade_write(&rbd_dev->lock_rwsem);
4256 	if (need_wait)
4257 		wait_for_completion(&rbd_dev->releasing_wait);
4258 	up_read(&rbd_dev->lock_rwsem);
4259 
4260 	down_write(&rbd_dev->lock_rwsem);
4261 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4262 		return false;
4263 
4264 	rbd_assert(list_empty(&rbd_dev->running_list));
4265 	return true;
4266 }
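/*
 * Note on the locking dance above: lock_state is switched to RELEASING
 * while lock_rwsem is held for write, then the rwsem is downgraded to
 * read so in-flight requests can finish and drop off running_list
 * (releasing_wait is completed once the list drains -- see
 * rbd_lock_del_request()).  The write lock is then retaken; if the
 * state is no longer RELEASING, the release was handled elsewhere in
 * the meantime and we back off.
 */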
4267 
4268 static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4269 {
4270 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4271 		rbd_object_map_close(rbd_dev);
4272 }
4273 
4274 static void __rbd_release_lock(struct rbd_device *rbd_dev)
4275 {
4276 	rbd_assert(list_empty(&rbd_dev->running_list));
4277 
4278 	rbd_pre_release_action(rbd_dev);
4279 	rbd_unlock(rbd_dev);
4280 }
4281 
4282 /*
4283  * lock_rwsem must be held for write
4284  */
4285 static void rbd_release_lock(struct rbd_device *rbd_dev)
4286 {
4287 	if (!rbd_quiesce_lock(rbd_dev))
4288 		return;
4289 
4290 	__rbd_release_lock(rbd_dev);
4291 
4292 	/*
4293 	 * Give others a chance to grab the lock - we would re-acquire
4294 	 * almost immediately if we got new IO while draining the running
4295 	 * list otherwise.  We need to ack our own notifications, so this
4296 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4297 	 * way of maybe_kick_acquire().
4298 	 */
4299 	cancel_delayed_work(&rbd_dev->lock_dwork);
4300 }
4301 
4302 static void rbd_release_lock_work(struct work_struct *work)
4303 {
4304 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4305 						  unlock_work);
4306 
4307 	down_write(&rbd_dev->lock_rwsem);
4308 	rbd_release_lock(rbd_dev);
4309 	up_write(&rbd_dev->lock_rwsem);
4310 }
4311 
4312 static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4313 {
4314 	bool have_requests;
4315 
4316 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4317 	if (__rbd_is_lock_owner(rbd_dev))
4318 		return;
4319 
4320 	spin_lock(&rbd_dev->lock_lists_lock);
4321 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4322 	spin_unlock(&rbd_dev->lock_lists_lock);
4323 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4324 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4325 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4326 	}
4327 }
4328 
4329 static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4330 				     void **p)
4331 {
4332 	struct rbd_client_id cid = { 0 };
4333 
4334 	if (struct_v >= 2) {
4335 		cid.gid = ceph_decode_64(p);
4336 		cid.handle = ceph_decode_64(p);
4337 	}
4338 
4339 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4340 	     cid.handle);
4341 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4342 		down_write(&rbd_dev->lock_rwsem);
4343 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4344 			/*
4345 			 * we already know that the remote client is
4346 			 * the owner
4347 			 */
4348 			up_write(&rbd_dev->lock_rwsem);
4349 			return;
4350 		}
4351 
4352 		rbd_set_owner_cid(rbd_dev, &cid);
4353 		downgrade_write(&rbd_dev->lock_rwsem);
4354 	} else {
4355 		down_read(&rbd_dev->lock_rwsem);
4356 	}
4357 
4358 	maybe_kick_acquire(rbd_dev);
4359 	up_read(&rbd_dev->lock_rwsem);
4360 }
4361 
4362 static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4363 				     void **p)
4364 {
4365 	struct rbd_client_id cid = { 0 };
4366 
4367 	if (struct_v >= 2) {
4368 		cid.gid = ceph_decode_64(p);
4369 		cid.handle = ceph_decode_64(p);
4370 	}
4371 
4372 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4373 	     cid.handle);
4374 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4375 		down_write(&rbd_dev->lock_rwsem);
4376 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4377 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4378 			     __func__, rbd_dev, cid.gid, cid.handle,
4379 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4380 			up_write(&rbd_dev->lock_rwsem);
4381 			return;
4382 		}
4383 
4384 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4385 		downgrade_write(&rbd_dev->lock_rwsem);
4386 	} else {
4387 		down_read(&rbd_dev->lock_rwsem);
4388 	}
4389 
4390 	maybe_kick_acquire(rbd_dev);
4391 	up_read(&rbd_dev->lock_rwsem);
4392 }
4393 
4394 /*
4395  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4396  * ResponseMessage is needed.
4397  */
4398 static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4399 				   void **p)
4400 {
4401 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4402 	struct rbd_client_id cid = { 0 };
4403 	int result = 1;
4404 
4405 	if (struct_v >= 2) {
4406 		cid.gid = ceph_decode_64(p);
4407 		cid.handle = ceph_decode_64(p);
4408 	}
4409 
4410 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4411 	     cid.handle);
4412 	if (rbd_cid_equal(&cid, &my_cid))
4413 		return result;
4414 
4415 	down_read(&rbd_dev->lock_rwsem);
4416 	if (__rbd_is_lock_owner(rbd_dev)) {
4417 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4418 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4419 			goto out_unlock;
4420 
4421 		/*
4422 		 * encode ResponseMessage(0) so the peer can detect
4423 		 * a missing owner
4424 		 */
4425 		result = 0;
4426 
4427 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4428 			if (!rbd_dev->opts->exclusive) {
4429 				dout("%s rbd_dev %p queueing unlock_work\n",
4430 				     __func__, rbd_dev);
4431 				queue_work(rbd_dev->task_wq,
4432 					   &rbd_dev->unlock_work);
4433 			} else {
4434 				/* refuse to release the lock */
4435 				result = -EROFS;
4436 			}
4437 		}
4438 	}
4439 
4440 out_unlock:
4441 	up_read(&rbd_dev->lock_rwsem);
4442 	return result;
4443 }
4444 
4445 static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4446 				     u64 notify_id, u64 cookie, s32 *result)
4447 {
4448 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4449 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4450 	int buf_size = sizeof(buf);
4451 	int ret;
4452 
4453 	if (result) {
4454 		void *p = buf;
4455 
4456 		/* encode ResponseMessage */
4457 		ceph_start_encoding(&p, 1, 1,
4458 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4459 		ceph_encode_32(&p, *result);
4460 	} else {
4461 		buf_size = 0;
4462 	}
4463 
4464 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4465 				   &rbd_dev->header_oloc, notify_id, cookie,
4466 				   buf, buf_size);
4467 	if (ret)
4468 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4469 }
4470 
4471 static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4472 				   u64 cookie)
4473 {
4474 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4475 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4476 }
4477 
4478 static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4479 					  u64 notify_id, u64 cookie, s32 result)
4480 {
4481 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4482 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4483 }
4484 
4485 static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4486 			 u64 notifier_id, void *data, size_t data_len)
4487 {
4488 	struct rbd_device *rbd_dev = arg;
4489 	void *p = data;
4490 	void *const end = p + data_len;
4491 	u8 struct_v = 0;
4492 	u32 len;
4493 	u32 notify_op;
4494 	int ret;
4495 
4496 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4497 	     __func__, rbd_dev, cookie, notify_id, data_len);
4498 	if (data_len) {
4499 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4500 					  &struct_v, &len);
4501 		if (ret) {
4502 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4503 				 ret);
4504 			return;
4505 		}
4506 
4507 		notify_op = ceph_decode_32(&p);
4508 	} else {
4509 		/* legacy notification for header updates */
4510 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4511 		len = 0;
4512 	}
4513 
4514 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4515 	switch (notify_op) {
4516 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4517 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4518 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4519 		break;
4520 	case RBD_NOTIFY_OP_RELEASED_LOCK:
4521 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4522 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4523 		break;
4524 	case RBD_NOTIFY_OP_REQUEST_LOCK:
4525 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4526 		if (ret <= 0)
4527 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4528 						      cookie, ret);
4529 		else
4530 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4531 		break;
4532 	case RBD_NOTIFY_OP_HEADER_UPDATE:
4533 		ret = rbd_dev_refresh(rbd_dev);
4534 		if (ret)
4535 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4536 
4537 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4538 		break;
4539 	default:
4540 		if (rbd_is_lock_owner(rbd_dev))
4541 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4542 						      cookie, -EOPNOTSUPP);
4543 		else
4544 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4545 		break;
4546 	}
4547 }
4548 
4549 static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4550 
4551 static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4552 {
4553 	struct rbd_device *rbd_dev = arg;
4554 
4555 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4556 
4557 	down_write(&rbd_dev->lock_rwsem);
4558 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4559 	up_write(&rbd_dev->lock_rwsem);
4560 
4561 	mutex_lock(&rbd_dev->watch_mutex);
4562 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4563 		__rbd_unregister_watch(rbd_dev);
4564 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4565 
4566 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4567 	}
4568 	mutex_unlock(&rbd_dev->watch_mutex);
4569 }
4570 
4571 /*
4572  * watch_mutex must be locked
4573  */
4574 static int __rbd_register_watch(struct rbd_device *rbd_dev)
4575 {
4576 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4577 	struct ceph_osd_linger_request *handle;
4578 
4579 	rbd_assert(!rbd_dev->watch_handle);
4580 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4581 
4582 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4583 				 &rbd_dev->header_oloc, rbd_watch_cb,
4584 				 rbd_watch_errcb, rbd_dev);
4585 	if (IS_ERR(handle))
4586 		return PTR_ERR(handle);
4587 
4588 	rbd_dev->watch_handle = handle;
4589 	return 0;
4590 }
4591 
4592 /*
4593  * watch_mutex must be locked
4594  */
4595 static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4596 {
4597 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4598 	int ret;
4599 
4600 	rbd_assert(rbd_dev->watch_handle);
4601 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4602 
4603 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4604 	if (ret)
4605 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4606 
4607 	rbd_dev->watch_handle = NULL;
4608 }
4609 
4610 static int rbd_register_watch(struct rbd_device *rbd_dev)
4611 {
4612 	int ret;
4613 
4614 	mutex_lock(&rbd_dev->watch_mutex);
4615 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4616 	ret = __rbd_register_watch(rbd_dev);
4617 	if (ret)
4618 		goto out;
4619 
4620 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4621 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4622 
4623 out:
4624 	mutex_unlock(&rbd_dev->watch_mutex);
4625 	return ret;
4626 }
4627 
4628 static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4629 {
4630 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4631 
4632 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4633 	cancel_work_sync(&rbd_dev->released_lock_work);
4634 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4635 	cancel_work_sync(&rbd_dev->unlock_work);
4636 }
4637 
4638 static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4639 {
4640 	cancel_tasks_sync(rbd_dev);
4641 
4642 	mutex_lock(&rbd_dev->watch_mutex);
4643 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4644 		__rbd_unregister_watch(rbd_dev);
4645 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4646 	mutex_unlock(&rbd_dev->watch_mutex);
4647 
4648 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4649 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4650 }
4651 
4652 /*
4653  * lock_rwsem must be held for write
4654  */
4655 static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4656 {
4657 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4658 	char cookie[32];
4659 	int ret;
4660 
4661 	if (!rbd_quiesce_lock(rbd_dev))
4662 		return;
4663 
4664 	format_lock_cookie(rbd_dev, cookie);
4665 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4666 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4667 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4668 				  RBD_LOCK_TAG, cookie);
4669 	if (ret) {
4670 		if (ret != -EOPNOTSUPP)
4671 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4672 				 ret);
4673 
4674 		/*
4675 		 * Lock cookie cannot be updated on older OSDs, so do
4676 		 * a manual release and queue an acquire.
4677 		 */
4678 		__rbd_release_lock(rbd_dev);
4679 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4680 	} else {
4681 		__rbd_lock(rbd_dev, cookie);
4682 		wake_lock_waiters(rbd_dev, 0);
4683 	}
4684 }
4685 
4686 static void rbd_reregister_watch(struct work_struct *work)
4687 {
4688 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4689 					    struct rbd_device, watch_dwork);
4690 	int ret;
4691 
4692 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4693 
4694 	mutex_lock(&rbd_dev->watch_mutex);
4695 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4696 		mutex_unlock(&rbd_dev->watch_mutex);
4697 		return;
4698 	}
4699 
4700 	ret = __rbd_register_watch(rbd_dev);
4701 	if (ret) {
4702 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4703 		if (ret != -EBLACKLISTED && ret != -ENOENT) {
4704 			queue_delayed_work(rbd_dev->task_wq,
4705 					   &rbd_dev->watch_dwork,
4706 					   RBD_RETRY_DELAY);
4707 			mutex_unlock(&rbd_dev->watch_mutex);
4708 			return;
4709 		}
4710 
4711 		mutex_unlock(&rbd_dev->watch_mutex);
4712 		down_write(&rbd_dev->lock_rwsem);
4713 		wake_lock_waiters(rbd_dev, ret);
4714 		up_write(&rbd_dev->lock_rwsem);
4715 		return;
4716 	}
4717 
4718 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4719 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4720 	mutex_unlock(&rbd_dev->watch_mutex);
4721 
4722 	down_write(&rbd_dev->lock_rwsem);
4723 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4724 		rbd_reacquire_lock(rbd_dev);
4725 	up_write(&rbd_dev->lock_rwsem);
4726 
4727 	ret = rbd_dev_refresh(rbd_dev);
4728 	if (ret)
4729 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4730 }
4731 
4732 /*
4733  * Synchronous osd object method call.  Returns the number of bytes
4734  * returned in the inbound buffer, or a negative error code.
4735  */
4736 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4737 			     struct ceph_object_id *oid,
4738 			     struct ceph_object_locator *oloc,
4739 			     const char *method_name,
4740 			     const void *outbound,
4741 			     size_t outbound_size,
4742 			     void *inbound,
4743 			     size_t inbound_size)
4744 {
4745 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4746 	struct page *req_page = NULL;
4747 	struct page *reply_page;
4748 	int ret;
4749 
4750 	/*
4751 	 * Method calls are ultimately read operations.  The result
4752 	 * should be placed into the inbound buffer provided.  Callers
4753 	 * may also supply outbound data -- parameters for the object
4754 	 * method.  Currently, if present, this will be a
4755 	 * snapshot id.
4756 	 */
4757 	if (outbound) {
4758 		if (outbound_size > PAGE_SIZE)
4759 			return -E2BIG;
4760 
4761 		req_page = alloc_page(GFP_KERNEL);
4762 		if (!req_page)
4763 			return -ENOMEM;
4764 
4765 		memcpy(page_address(req_page), outbound, outbound_size);
4766 	}
4767 
4768 	reply_page = alloc_page(GFP_KERNEL);
4769 	if (!reply_page) {
4770 		if (req_page)
4771 			__free_page(req_page);
4772 		return -ENOMEM;
4773 	}
4774 
4775 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4776 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
4777 			     &reply_page, &inbound_size);
4778 	if (!ret) {
4779 		memcpy(inbound, page_address(reply_page), inbound_size);
4780 		ret = inbound_size;
4781 	}
4782 
4783 	if (req_page)
4784 		__free_page(req_page);
4785 	__free_page(reply_page);
4786 	return ret;
4787 }
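/*
 * Example usage (see _rbd_dev_v2_snap_size() below): the "get_size"
 * class method is called with an __le64 snapshot id as the outbound
 * data and returns a packed { u8 order; __le64 size; } reply in the
 * inbound buffer.
 */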
4788 
4789 static void rbd_queue_workfn(struct work_struct *work)
4790 {
4791 	struct request *rq = blk_mq_rq_from_pdu(work);
4792 	struct rbd_device *rbd_dev = rq->q->queuedata;
4793 	struct rbd_img_request *img_request;
4794 	struct ceph_snap_context *snapc = NULL;
4795 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4796 	u64 length = blk_rq_bytes(rq);
4797 	enum obj_operation_type op_type;
4798 	u64 mapping_size;
4799 	int result;
4800 
4801 	switch (req_op(rq)) {
4802 	case REQ_OP_DISCARD:
4803 		op_type = OBJ_OP_DISCARD;
4804 		break;
4805 	case REQ_OP_WRITE_ZEROES:
4806 		op_type = OBJ_OP_ZEROOUT;
4807 		break;
4808 	case REQ_OP_WRITE:
4809 		op_type = OBJ_OP_WRITE;
4810 		break;
4811 	case REQ_OP_READ:
4812 		op_type = OBJ_OP_READ;
4813 		break;
4814 	default:
4815 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
4816 		result = -EIO;
4817 		goto err;
4818 	}
4819 
4820 	/* Ignore/skip any zero-length requests */
4821 
4822 	if (!length) {
4823 		dout("%s: zero-length request\n", __func__);
4824 		result = 0;
4825 		goto err_rq;
4826 	}
4827 
4828 	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
4829 		rbd_warn(rbd_dev, "%s on read-only snapshot",
4830 			 obj_op_name(op_type));
4831 		result = -EIO;
4832 		goto err;
4833 	}
4834 
4835 	/*
4836 	 * Quit early if the mapped snapshot no longer exists.  It's
4837 	 * still possible the snapshot will have disappeared by the
4838 	 * time our request arrives at the osd, but there's no sense in
4839 	 * sending it if we already know.
4840 	 */
4841 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4842 		dout("request for non-existent snapshot\n");
4843 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4844 		result = -ENXIO;
4845 		goto err_rq;
4846 	}
4847 
4848 	if (offset && length > U64_MAX - offset + 1) {
4849 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4850 			 length);
4851 		result = -EINVAL;
4852 		goto err_rq;	/* Shouldn't happen */
4853 	}
4854 
4855 	blk_mq_start_request(rq);
4856 
4857 	down_read(&rbd_dev->header_rwsem);
4858 	mapping_size = rbd_dev->mapping.size;
4859 	if (op_type != OBJ_OP_READ) {
4860 		snapc = rbd_dev->header.snapc;
4861 		ceph_get_snap_context(snapc);
4862 	}
4863 	up_read(&rbd_dev->header_rwsem);
4864 
4865 	if (offset + length > mapping_size) {
4866 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4867 			 length, mapping_size);
4868 		result = -EIO;
4869 		goto err_rq;
4870 	}
4871 
4872 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
4873 	if (!img_request) {
4874 		result = -ENOMEM;
4875 		goto err_rq;
4876 	}
4877 	img_request->rq = rq;
4878 	snapc = NULL; /* img_request consumes a ref */
4879 
4880 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4881 		result = rbd_img_fill_nodata(img_request, offset, length);
4882 	else
4883 		result = rbd_img_fill_from_bio(img_request, offset, length,
4884 					       rq->bio);
4885 	if (result)
4886 		goto err_img_request;
4887 
4888 	rbd_img_handle_request(img_request, 0);
4889 	return;
4890 
4891 err_img_request:
4892 	rbd_img_request_put(img_request);
4893 err_rq:
4894 	if (result)
4895 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
4896 			 obj_op_name(op_type), length, offset, result);
4897 	ceph_put_snap_context(snapc);
4898 err:
4899 	blk_mq_end_request(rq, errno_to_blk_status(result));
4900 }
4901 
4902 static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4903 		const struct blk_mq_queue_data *bd)
4904 {
4905 	struct request *rq = bd->rq;
4906 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4907 
4908 	queue_work(rbd_wq, work);
4909 	return BLK_STS_OK;
4910 }
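/*
 * Each request's blk-mq payload is a bare work_struct (see
 * rbd_init_request() and tag_set.cmd_size below), so rbd_queue_rq()
 * only has to queue that work on the rbd workqueue; the actual
 * translation of the request into an image request happens later in
 * rbd_queue_workfn() in process context.
 */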
4911 
4912 static void rbd_free_disk(struct rbd_device *rbd_dev)
4913 {
4914 	blk_cleanup_queue(rbd_dev->disk->queue);
4915 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4916 	put_disk(rbd_dev->disk);
4917 	rbd_dev->disk = NULL;
4918 }
4919 
4920 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4921 			     struct ceph_object_id *oid,
4922 			     struct ceph_object_locator *oloc,
4923 			     void *buf, int buf_len)
4924 
4925 {
4926 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4927 	struct ceph_osd_request *req;
4928 	struct page **pages;
4929 	int num_pages = calc_pages_for(0, buf_len);
4930 	int ret;
4931 
4932 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4933 	if (!req)
4934 		return -ENOMEM;
4935 
4936 	ceph_oid_copy(&req->r_base_oid, oid);
4937 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4938 	req->r_flags = CEPH_OSD_FLAG_READ;
4939 
4940 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4941 	if (IS_ERR(pages)) {
4942 		ret = PTR_ERR(pages);
4943 		goto out_req;
4944 	}
4945 
4946 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4947 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4948 					 true);
4949 
4950 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4951 	if (ret)
4952 		goto out_req;
4953 
4954 	ceph_osdc_start_request(osdc, req, false);
4955 	ret = ceph_osdc_wait_request(osdc, req);
4956 	if (ret >= 0)
4957 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4958 
4959 out_req:
4960 	ceph_osdc_put_request(req);
4961 	return ret;
4962 }
4963 
4964 /*
4965  * Read the complete header for the given rbd device.  On successful
4966  * return, the rbd_dev->header field will contain up-to-date
4967  * information about the image.
4968  */
4969 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
4970 {
4971 	struct rbd_image_header_ondisk *ondisk = NULL;
4972 	u32 snap_count = 0;
4973 	u64 names_size = 0;
4974 	u32 want_count;
4975 	int ret;
4976 
4977 	/*
4978 	 * The complete header will include an array of its 64-bit
4979 	 * snapshot ids, followed by the names of those snapshots as
4980 	 * a contiguous block of NUL-terminated strings.  Note that
4981 	 * the number of snapshots could change by the time we read
4982 	 * it in, in which case we re-read it.
4983 	 */
4984 	do {
4985 		size_t size;
4986 
4987 		kfree(ondisk);
4988 
4989 		size = sizeof (*ondisk);
4990 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4991 		size += names_size;
4992 		ondisk = kmalloc(size, GFP_KERNEL);
4993 		if (!ondisk)
4994 			return -ENOMEM;
4995 
4996 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4997 					&rbd_dev->header_oloc, ondisk, size);
4998 		if (ret < 0)
4999 			goto out;
5000 		if ((size_t)ret < size) {
5001 			rbd_warn(rbd_dev, "short header read (want %zu got %d)",
5002 				size, ret);
5003 			ret = -ENXIO;
5004 			goto out;
5005 		}
5006 		if (!rbd_dev_ondisk_valid(ondisk)) {
5007 			ret = -ENXIO;
5008 			rbd_warn(rbd_dev, "invalid header");
5009 			goto out;
5010 		}
5011 
5012 		names_size = le64_to_cpu(ondisk->snap_names_len);
5013 		want_count = snap_count;
5014 		snap_count = le32_to_cpu(ondisk->snap_count);
5015 	} while (snap_count != want_count);
5016 
5017 	ret = rbd_header_from_disk(rbd_dev, ondisk);
5018 out:
5019 	kfree(ondisk);
5020 
5021 	return ret;
5022 }
5023 
5024 /*
5025  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
5026  * has disappeared from the (just updated) snapshot context.
5027  */
5028 static void rbd_exists_validate(struct rbd_device *rbd_dev)
5029 {
5030 	u64 snap_id;
5031 
5032 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
5033 		return;
5034 
5035 	snap_id = rbd_dev->spec->snap_id;
5036 	if (snap_id == CEPH_NOSNAP)
5037 		return;
5038 
5039 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
5040 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5041 }
5042 
5043 static void rbd_dev_update_size(struct rbd_device *rbd_dev)
5044 {
5045 	sector_t size;
5046 
5047 	/*
5048 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
5049 	 * try to update its size.  If REMOVING is set, updating size
5050 	 * is just useless work since the device can't be opened.
5051 	 */
5052 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
5053 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
5054 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
5055 		dout("setting size to %llu sectors", (unsigned long long)size);
5056 		set_capacity(rbd_dev->disk, size);
5057 		revalidate_disk(rbd_dev->disk);
5058 	}
5059 }
5060 
5061 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
5062 {
5063 	u64 mapping_size;
5064 	int ret;
5065 
5066 	down_write(&rbd_dev->header_rwsem);
5067 	mapping_size = rbd_dev->mapping.size;
5068 
5069 	ret = rbd_dev_header_info(rbd_dev);
5070 	if (ret)
5071 		goto out;
5072 
5073 	/*
5074 	 * If there is a parent, see if it has disappeared due to the
5075 	 * mapped image getting flattened.
5076 	 */
5077 	if (rbd_dev->parent) {
5078 		ret = rbd_dev_v2_parent_info(rbd_dev);
5079 		if (ret)
5080 			goto out;
5081 	}
5082 
5083 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
5084 		rbd_dev->mapping.size = rbd_dev->header.image_size;
5085 	} else {
5086 		/* validate mapped snapshot's EXISTS flag */
5087 		rbd_exists_validate(rbd_dev);
5088 	}
5089 
5090 out:
5091 	up_write(&rbd_dev->header_rwsem);
5092 	if (!ret && mapping_size != rbd_dev->mapping.size)
5093 		rbd_dev_update_size(rbd_dev);
5094 
5095 	return ret;
5096 }
5097 
5098 static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
5099 		unsigned int hctx_idx, unsigned int numa_node)
5100 {
5101 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
5102 
5103 	INIT_WORK(work, rbd_queue_workfn);
5104 	return 0;
5105 }
5106 
5107 static const struct blk_mq_ops rbd_mq_ops = {
5108 	.queue_rq	= rbd_queue_rq,
5109 	.init_request	= rbd_init_request,
5110 };
5111 
5112 static int rbd_init_disk(struct rbd_device *rbd_dev)
5113 {
5114 	struct gendisk *disk;
5115 	struct request_queue *q;
5116 	unsigned int objset_bytes =
5117 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
5118 	int err;
5119 
5120 	/* create gendisk info */
5121 	disk = alloc_disk(single_major ?
5122 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
5123 			  RBD_MINORS_PER_MAJOR);
5124 	if (!disk)
5125 		return -ENOMEM;
5126 
5127 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
5128 		 rbd_dev->dev_id);
5129 	disk->major = rbd_dev->major;
5130 	disk->first_minor = rbd_dev->minor;
5131 	if (single_major)
5132 		disk->flags |= GENHD_FL_EXT_DEVT;
5133 	disk->fops = &rbd_bd_ops;
5134 	disk->private_data = rbd_dev;
5135 
5136 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
5137 	rbd_dev->tag_set.ops = &rbd_mq_ops;
5138 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
5139 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
5140 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
5141 	rbd_dev->tag_set.nr_hw_queues = 1;
5142 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
5143 
5144 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
5145 	if (err)
5146 		goto out_disk;
5147 
5148 	q = blk_mq_init_queue(&rbd_dev->tag_set);
5149 	if (IS_ERR(q)) {
5150 		err = PTR_ERR(q);
5151 		goto out_tag_set;
5152 	}
5153 
5154 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5155 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5156 
5157 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
5158 	q->limits.max_sectors = queue_max_hw_sectors(q);
5159 	blk_queue_max_segments(q, USHRT_MAX);
5160 	blk_queue_max_segment_size(q, UINT_MAX);
5161 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5162 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5163 
5164 	if (rbd_dev->opts->trim) {
5165 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
5166 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5167 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5168 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5169 	}
5170 
5171 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5172 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
5173 
5174 	/*
5175 	 * disk_release() expects a queue ref from add_disk() and will
5176 	 * put it.  Hold an extra ref until add_disk() is called.
5177 	 */
5178 	WARN_ON(!blk_get_queue(q));
5179 	disk->queue = q;
5180 	q->queuedata = rbd_dev;
5181 
5182 	rbd_dev->disk = disk;
5183 
5184 	return 0;
5185 out_tag_set:
5186 	blk_mq_free_tag_set(&rbd_dev->tag_set);
5187 out_disk:
5188 	put_disk(disk);
5189 	return err;
5190 }
5191 
5192 /*
5193  * sysfs
5194  */
5195 
5196 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5197 {
5198 	return container_of(dev, struct rbd_device, dev);
5199 }
5200 
5201 static ssize_t rbd_size_show(struct device *dev,
5202 			     struct device_attribute *attr, char *buf)
5203 {
5204 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5205 
5206 	return sprintf(buf, "%llu\n",
5207 		(unsigned long long)rbd_dev->mapping.size);
5208 }
5209 
5210 /*
5211  * Note this shows the features for whatever's mapped, which is not
5212  * necessarily the base image.
5213  */
5214 static ssize_t rbd_features_show(struct device *dev,
5215 			     struct device_attribute *attr, char *buf)
5216 {
5217 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5218 
5219 	return sprintf(buf, "0x%016llx\n",
5220 			(unsigned long long)rbd_dev->mapping.features);
5221 }
5222 
5223 static ssize_t rbd_major_show(struct device *dev,
5224 			      struct device_attribute *attr, char *buf)
5225 {
5226 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5227 
5228 	if (rbd_dev->major)
5229 		return sprintf(buf, "%d\n", rbd_dev->major);
5230 
5231 	return sprintf(buf, "(none)\n");
5232 }
5233 
5234 static ssize_t rbd_minor_show(struct device *dev,
5235 			      struct device_attribute *attr, char *buf)
5236 {
5237 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5238 
5239 	return sprintf(buf, "%d\n", rbd_dev->minor);
5240 }
5241 
5242 static ssize_t rbd_client_addr_show(struct device *dev,
5243 				    struct device_attribute *attr, char *buf)
5244 {
5245 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5246 	struct ceph_entity_addr *client_addr =
5247 	    ceph_client_addr(rbd_dev->rbd_client->client);
5248 
5249 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5250 		       le32_to_cpu(client_addr->nonce));
5251 }
5252 
5253 static ssize_t rbd_client_id_show(struct device *dev,
5254 				  struct device_attribute *attr, char *buf)
5255 {
5256 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5257 
5258 	return sprintf(buf, "client%lld\n",
5259 		       ceph_client_gid(rbd_dev->rbd_client->client));
5260 }
5261 
5262 static ssize_t rbd_cluster_fsid_show(struct device *dev,
5263 				     struct device_attribute *attr, char *buf)
5264 {
5265 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5266 
5267 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5268 }
5269 
5270 static ssize_t rbd_config_info_show(struct device *dev,
5271 				    struct device_attribute *attr, char *buf)
5272 {
5273 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5274 
5275 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5276 }
5277 
5278 static ssize_t rbd_pool_show(struct device *dev,
5279 			     struct device_attribute *attr, char *buf)
5280 {
5281 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5282 
5283 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5284 }
5285 
5286 static ssize_t rbd_pool_id_show(struct device *dev,
5287 			     struct device_attribute *attr, char *buf)
5288 {
5289 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5290 
5291 	return sprintf(buf, "%llu\n",
5292 			(unsigned long long) rbd_dev->spec->pool_id);
5293 }
5294 
5295 static ssize_t rbd_pool_ns_show(struct device *dev,
5296 				struct device_attribute *attr, char *buf)
5297 {
5298 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5299 
5300 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5301 }
5302 
5303 static ssize_t rbd_name_show(struct device *dev,
5304 			     struct device_attribute *attr, char *buf)
5305 {
5306 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5307 
5308 	if (rbd_dev->spec->image_name)
5309 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5310 
5311 	return sprintf(buf, "(unknown)\n");
5312 }
5313 
5314 static ssize_t rbd_image_id_show(struct device *dev,
5315 			     struct device_attribute *attr, char *buf)
5316 {
5317 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5318 
5319 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5320 }
5321 
5322 /*
5323  * Shows the name of the currently-mapped snapshot (or
5324  * RBD_SNAP_HEAD_NAME for the base image).
5325  */
5326 static ssize_t rbd_snap_show(struct device *dev,
5327 			     struct device_attribute *attr,
5328 			     char *buf)
5329 {
5330 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5331 
5332 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5333 }
5334 
5335 static ssize_t rbd_snap_id_show(struct device *dev,
5336 				struct device_attribute *attr, char *buf)
5337 {
5338 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5339 
5340 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5341 }
5342 
5343 /*
5344  * For a v2 image, shows the chain of parent images, separated by empty
5345  * lines.  For v1 images or if there is no parent, shows "(no parent
5346  * image)".
5347  */
5348 static ssize_t rbd_parent_show(struct device *dev,
5349 			       struct device_attribute *attr,
5350 			       char *buf)
5351 {
5352 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5353 	ssize_t count = 0;
5354 
5355 	if (!rbd_dev->parent)
5356 		return sprintf(buf, "(no parent image)\n");
5357 
5358 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5359 		struct rbd_spec *spec = rbd_dev->parent_spec;
5360 
5361 		count += sprintf(&buf[count], "%s"
5362 			    "pool_id %llu\npool_name %s\n"
5363 			    "pool_ns %s\n"
5364 			    "image_id %s\nimage_name %s\n"
5365 			    "snap_id %llu\nsnap_name %s\n"
5366 			    "overlap %llu\n",
5367 			    !count ? "" : "\n", /* first? */
5368 			    spec->pool_id, spec->pool_name,
5369 			    spec->pool_ns ?: "",
5370 			    spec->image_id, spec->image_name ?: "(unknown)",
5371 			    spec->snap_id, spec->snap_name,
5372 			    rbd_dev->parent_overlap);
5373 	}
5374 
5375 	return count;
5376 }
5377 
5378 static ssize_t rbd_image_refresh(struct device *dev,
5379 				 struct device_attribute *attr,
5380 				 const char *buf,
5381 				 size_t size)
5382 {
5383 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5384 	int ret;
5385 
5386 	ret = rbd_dev_refresh(rbd_dev);
5387 	if (ret)
5388 		return ret;
5389 
5390 	return size;
5391 }
5392 
5393 static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5394 static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5395 static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5396 static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5397 static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5398 static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5399 static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5400 static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5401 static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5402 static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5403 static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5404 static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5405 static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5406 static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5407 static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5408 static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5409 static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5410 
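/*
 * These attributes are typically exposed as
 * /sys/bus/rbd/devices/<dev_id>/<attribute> once the device is
 * registered on rbd_bus_type; for example, reading "size" reports the
 * mapped image size in bytes and writing anything to "refresh"
 * triggers rbd_dev_refresh().
 */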
5411 static struct attribute *rbd_attrs[] = {
5412 	&dev_attr_size.attr,
5413 	&dev_attr_features.attr,
5414 	&dev_attr_major.attr,
5415 	&dev_attr_minor.attr,
5416 	&dev_attr_client_addr.attr,
5417 	&dev_attr_client_id.attr,
5418 	&dev_attr_cluster_fsid.attr,
5419 	&dev_attr_config_info.attr,
5420 	&dev_attr_pool.attr,
5421 	&dev_attr_pool_id.attr,
5422 	&dev_attr_pool_ns.attr,
5423 	&dev_attr_name.attr,
5424 	&dev_attr_image_id.attr,
5425 	&dev_attr_current_snap.attr,
5426 	&dev_attr_snap_id.attr,
5427 	&dev_attr_parent.attr,
5428 	&dev_attr_refresh.attr,
5429 	NULL
5430 };
5431 
5432 static struct attribute_group rbd_attr_group = {
5433 	.attrs = rbd_attrs,
5434 };
5435 
5436 static const struct attribute_group *rbd_attr_groups[] = {
5437 	&rbd_attr_group,
5438 	NULL
5439 };
5440 
5441 static void rbd_dev_release(struct device *dev);
5442 
5443 static const struct device_type rbd_device_type = {
5444 	.name		= "rbd",
5445 	.groups		= rbd_attr_groups,
5446 	.release	= rbd_dev_release,
5447 };
5448 
5449 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5450 {
5451 	kref_get(&spec->kref);
5452 
5453 	return spec;
5454 }
5455 
5456 static void rbd_spec_free(struct kref *kref);
5457 static void rbd_spec_put(struct rbd_spec *spec)
5458 {
5459 	if (spec)
5460 		kref_put(&spec->kref, rbd_spec_free);
5461 }
5462 
5463 static struct rbd_spec *rbd_spec_alloc(void)
5464 {
5465 	struct rbd_spec *spec;
5466 
5467 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5468 	if (!spec)
5469 		return NULL;
5470 
5471 	spec->pool_id = CEPH_NOPOOL;
5472 	spec->snap_id = CEPH_NOSNAP;
5473 	kref_init(&spec->kref);
5474 
5475 	return spec;
5476 }
5477 
5478 static void rbd_spec_free(struct kref *kref)
5479 {
5480 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5481 
5482 	kfree(spec->pool_name);
5483 	kfree(spec->pool_ns);
5484 	kfree(spec->image_id);
5485 	kfree(spec->image_name);
5486 	kfree(spec->snap_name);
5487 	kfree(spec);
5488 }
5489 
5490 static void rbd_dev_free(struct rbd_device *rbd_dev)
5491 {
5492 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5493 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5494 
5495 	ceph_oid_destroy(&rbd_dev->header_oid);
5496 	ceph_oloc_destroy(&rbd_dev->header_oloc);
5497 	kfree(rbd_dev->config_info);
5498 
5499 	rbd_put_client(rbd_dev->rbd_client);
5500 	rbd_spec_put(rbd_dev->spec);
5501 	kfree(rbd_dev->opts);
5502 	kfree(rbd_dev);
5503 }
5504 
5505 static void rbd_dev_release(struct device *dev)
5506 {
5507 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5508 	bool need_put = !!rbd_dev->opts;
5509 
5510 	if (need_put) {
5511 		destroy_workqueue(rbd_dev->task_wq);
5512 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5513 	}
5514 
5515 	rbd_dev_free(rbd_dev);
5516 
5517 	/*
5518 	 * This is racy, but way better than dropping the module reference
5519 	 * outside of the release callback.  The race window is pretty
5520 	 * small, so doing something similar to dm (dm-builtin.c) is overkill.
5521 	 */
5522 	if (need_put)
5523 		module_put(THIS_MODULE);
5524 }
5525 
5526 static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5527 					   struct rbd_spec *spec)
5528 {
5529 	struct rbd_device *rbd_dev;
5530 
5531 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5532 	if (!rbd_dev)
5533 		return NULL;
5534 
5535 	spin_lock_init(&rbd_dev->lock);
5536 	INIT_LIST_HEAD(&rbd_dev->node);
5537 	init_rwsem(&rbd_dev->header_rwsem);
5538 
5539 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5540 	ceph_oid_init(&rbd_dev->header_oid);
5541 	rbd_dev->header_oloc.pool = spec->pool_id;
5542 	if (spec->pool_ns) {
5543 		WARN_ON(!*spec->pool_ns);
5544 		rbd_dev->header_oloc.pool_ns =
5545 		    ceph_find_or_create_string(spec->pool_ns,
5546 					       strlen(spec->pool_ns));
5547 	}
5548 
5549 	mutex_init(&rbd_dev->watch_mutex);
5550 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5551 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5552 
5553 	init_rwsem(&rbd_dev->lock_rwsem);
5554 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5555 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5556 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5557 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5558 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5559 	spin_lock_init(&rbd_dev->lock_lists_lock);
5560 	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5561 	INIT_LIST_HEAD(&rbd_dev->running_list);
5562 	init_completion(&rbd_dev->acquire_wait);
5563 	init_completion(&rbd_dev->releasing_wait);
5564 
5565 	spin_lock_init(&rbd_dev->object_map_lock);
5566 
5567 	rbd_dev->dev.bus = &rbd_bus_type;
5568 	rbd_dev->dev.type = &rbd_device_type;
5569 	rbd_dev->dev.parent = &rbd_root_dev;
5570 	device_initialize(&rbd_dev->dev);
5571 
5572 	rbd_dev->rbd_client = rbdc;
5573 	rbd_dev->spec = spec;
5574 
5575 	return rbd_dev;
5576 }
5577 
5578 /*
5579  * Create a mapping rbd_dev.
5580  */
5581 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5582 					 struct rbd_spec *spec,
5583 					 struct rbd_options *opts)
5584 {
5585 	struct rbd_device *rbd_dev;
5586 
5587 	rbd_dev = __rbd_dev_create(rbdc, spec);
5588 	if (!rbd_dev)
5589 		return NULL;
5590 
5591 	rbd_dev->opts = opts;
5592 
5593 	/* get an id and fill in device name */
5594 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5595 					 minor_to_rbd_dev_id(1 << MINORBITS),
5596 					 GFP_KERNEL);
5597 	if (rbd_dev->dev_id < 0)
5598 		goto fail_rbd_dev;
5599 
5600 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5601 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5602 						   rbd_dev->name);
5603 	if (!rbd_dev->task_wq)
5604 		goto fail_dev_id;
5605 
5606 	/* we have a ref from do_rbd_add() */
5607 	__module_get(THIS_MODULE);
5608 
5609 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5610 	return rbd_dev;
5611 
5612 fail_dev_id:
5613 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5614 fail_rbd_dev:
5615 	rbd_dev_free(rbd_dev);
5616 	return NULL;
5617 }
5618 
5619 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5620 {
5621 	if (rbd_dev)
5622 		put_device(&rbd_dev->dev);
5623 }
5624 
5625 /*
5626  * Get the size and object order for an image snapshot, or, if
5627  * snap_id is CEPH_NOSNAP, get this information for the base
5628  * image.
5629  */
5630 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5631 				u8 *order, u64 *snap_size)
5632 {
5633 	__le64 snapid = cpu_to_le64(snap_id);
5634 	int ret;
5635 	struct {
5636 		u8 order;
5637 		__le64 size;
5638 	} __attribute__ ((packed)) size_buf = { 0 };
5639 
5640 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5641 				  &rbd_dev->header_oloc, "get_size",
5642 				  &snapid, sizeof(snapid),
5643 				  &size_buf, sizeof(size_buf));
5644 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5645 	if (ret < 0)
5646 		return ret;
5647 	if (ret < sizeof (size_buf))
5648 		return -ERANGE;
5649 
5650 	if (order) {
5651 		*order = size_buf.order;
5652 		dout("  order %u", (unsigned int)*order);
5653 	}
5654 	*snap_size = le64_to_cpu(size_buf.size);
5655 
5656 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5657 		(unsigned long long)snap_id,
5658 		(unsigned long long)*snap_size);
5659 
5660 	return 0;
5661 }
5662 
5663 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5664 {
5665 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5666 					&rbd_dev->header.obj_order,
5667 					&rbd_dev->header.image_size);
5668 }
5669 
5670 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5671 {
5672 	void *reply_buf;
5673 	int ret;
5674 	void *p;
5675 
5676 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
5677 	if (!reply_buf)
5678 		return -ENOMEM;
5679 
5680 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5681 				  &rbd_dev->header_oloc, "get_object_prefix",
5682 				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
5683 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5684 	if (ret < 0)
5685 		goto out;
5686 
5687 	p = reply_buf;
5688 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5689 						p + ret, NULL, GFP_NOIO);
5690 	ret = 0;
5691 
5692 	if (IS_ERR(rbd_dev->header.object_prefix)) {
5693 		ret = PTR_ERR(rbd_dev->header.object_prefix);
5694 		rbd_dev->header.object_prefix = NULL;
5695 	} else {
5696 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
5697 	}
5698 out:
5699 	kfree(reply_buf);
5700 
5701 	return ret;
5702 }
5703 
5704 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5705 		u64 *snap_features)
5706 {
5707 	__le64 snapid = cpu_to_le64(snap_id);
5708 	struct {
5709 		__le64 features;
5710 		__le64 incompat;
5711 	} __attribute__ ((packed)) features_buf = { 0 };
5712 	u64 unsup;
5713 	int ret;
5714 
5715 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5716 				  &rbd_dev->header_oloc, "get_features",
5717 				  &snapid, sizeof(snapid),
5718 				  &features_buf, sizeof(features_buf));
5719 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5720 	if (ret < 0)
5721 		return ret;
5722 	if (ret < sizeof (features_buf))
5723 		return -ERANGE;
5724 
5725 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5726 	if (unsup) {
5727 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5728 			 unsup);
5729 		return -ENXIO;
5730 	}
5731 
5732 	*snap_features = le64_to_cpu(features_buf.features);
5733 
5734 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5735 		(unsigned long long)snap_id,
5736 		(unsigned long long)*snap_features,
5737 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5738 
5739 	return 0;
5740 }
5741 
5742 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5743 {
5744 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5745 						&rbd_dev->header.features);
5746 }
5747 
5748 /*
5749  * These are generic image flags, but since they are used only for
5750  * object map, store them in rbd_dev->object_map_flags.
5751  *
5752  * For the same reason, this function is called only on object map
5753  * (re)load and not on header refresh.
5754  */
5755 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5756 {
5757 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5758 	__le64 flags;
5759 	int ret;
5760 
5761 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5762 				  &rbd_dev->header_oloc, "get_flags",
5763 				  &snapid, sizeof(snapid),
5764 				  &flags, sizeof(flags));
5765 	if (ret < 0)
5766 		return ret;
5767 	if (ret < sizeof(flags))
5768 		return -EBADMSG;
5769 
5770 	rbd_dev->object_map_flags = le64_to_cpu(flags);
5771 	return 0;
5772 }
5773 
5774 struct parent_image_info {
5775 	u64		pool_id;
5776 	const char	*pool_ns;
5777 	const char	*image_id;
5778 	u64		snap_id;
5779 
5780 	bool		has_overlap;
5781 	u64		overlap;
5782 };
5783 
5784 /*
5785  * The caller is responsible for freeing the string fields of @pii.
5786  */
5787 static int decode_parent_image_spec(void **p, void *end,
5788 				    struct parent_image_info *pii)
5789 {
5790 	u8 struct_v;
5791 	u32 struct_len;
5792 	int ret;
5793 
5794 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5795 				  &struct_v, &struct_len);
5796 	if (ret)
5797 		return ret;
5798 
5799 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5800 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5801 	if (IS_ERR(pii->pool_ns)) {
5802 		ret = PTR_ERR(pii->pool_ns);
5803 		pii->pool_ns = NULL;
5804 		return ret;
5805 	}
5806 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5807 	if (IS_ERR(pii->image_id)) {
5808 		ret = PTR_ERR(pii->image_id);
5809 		pii->image_id = NULL;
5810 		return ret;
5811 	}
5812 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5813 	return 0;
5814 
5815 e_inval:
5816 	return -EINVAL;
5817 }
5818 
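/*
 * Query the parent of @rbd_dev using the "parent_get" and
 * "parent_overlap_get" class methods.  Returns 1 (rather than a negative
 * errno) if the OSD doesn't support them, which tells get_parent_info()
 * to fall back to the legacy "get_parent" method below.
 */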
5819 static int __get_parent_info(struct rbd_device *rbd_dev,
5820 			     struct page *req_page,
5821 			     struct page *reply_page,
5822 			     struct parent_image_info *pii)
5823 {
5824 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5825 	size_t reply_len = PAGE_SIZE;
5826 	void *p, *end;
5827 	int ret;
5828 
5829 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5830 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
5831 			     req_page, sizeof(u64), &reply_page, &reply_len);
5832 	if (ret)
5833 		return ret == -EOPNOTSUPP ? 1 : ret;
5834 
5835 	p = page_address(reply_page);
5836 	end = p + reply_len;
5837 	ret = decode_parent_image_spec(&p, end, pii);
5838 	if (ret)
5839 		return ret;
5840 
5841 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5842 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
5843 			     req_page, sizeof(u64), &reply_page, &reply_len);
5844 	if (ret)
5845 		return ret;
5846 
5847 	p = page_address(reply_page);
5848 	end = p + reply_len;
5849 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5850 	if (pii->has_overlap)
5851 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5852 
5853 	return 0;
5854 
5855 e_inval:
5856 	return -EINVAL;
5857 }
5858 
5859 /*
5860  * The caller is responsible for freeing the string fields of @pii.
5861  */
5862 static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5863 				    struct page *req_page,
5864 				    struct page *reply_page,
5865 				    struct parent_image_info *pii)
5866 {
5867 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5868 	size_t reply_len = PAGE_SIZE;
5869 	void *p, *end;
5870 	int ret;
5871 
5872 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5873 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
5874 			     req_page, sizeof(u64), &reply_page, &reply_len);
5875 	if (ret)
5876 		return ret;
5877 
5878 	p = page_address(reply_page);
5879 	end = p + reply_len;
5880 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5881 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5882 	if (IS_ERR(pii->image_id)) {
5883 		ret = PTR_ERR(pii->image_id);
5884 		pii->image_id = NULL;
5885 		return ret;
5886 	}
5887 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5888 	pii->has_overlap = true;
5889 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5890 
5891 	return 0;
5892 
5893 e_inval:
5894 	return -EINVAL;
5895 }
5896 
5897 static int get_parent_info(struct rbd_device *rbd_dev,
5898 			   struct parent_image_info *pii)
5899 {
5900 	struct page *req_page, *reply_page;
5901 	void *p;
5902 	int ret;
5903 
5904 	req_page = alloc_page(GFP_KERNEL);
5905 	if (!req_page)
5906 		return -ENOMEM;
5907 
5908 	reply_page = alloc_page(GFP_KERNEL);
5909 	if (!reply_page) {
5910 		__free_page(req_page);
5911 		return -ENOMEM;
5912 	}
5913 
5914 	p = page_address(req_page);
5915 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5916 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5917 	if (ret > 0)
5918 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5919 					       pii);
5920 
5921 	__free_page(req_page);
5922 	__free_page(reply_page);
5923 	return ret;
5924 }
5925 
5926 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5927 {
5928 	struct rbd_spec *parent_spec;
5929 	struct parent_image_info pii = { 0 };
5930 	int ret;
5931 
5932 	parent_spec = rbd_spec_alloc();
5933 	if (!parent_spec)
5934 		return -ENOMEM;
5935 
5936 	ret = get_parent_info(rbd_dev, &pii);
5937 	if (ret)
5938 		goto out_err;
5939 
5940 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5941 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5942 	     pii.has_overlap, pii.overlap);
5943 
5944 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5945 		/*
5946 		 * Either the parent never existed, or we have a
5947 		 * record of it but the image was flattened, so it no
5948 		 * longer has a parent.  When the parent of a
5949 		 * layered image disappears we immediately set the
5950 		 * overlap to 0.  The effect of this is that all new
5951 		 * requests will be treated as if the image had no
5952 		 * parent.
5953 		 *
5954 		 * If !pii.has_overlap, the parent image spec is not
5955 		 * applicable.  It's there to avoid duplication in each
5956 		 * snapshot record.
5957 		 */
5958 		if (rbd_dev->parent_overlap) {
5959 			rbd_dev->parent_overlap = 0;
5960 			rbd_dev_parent_put(rbd_dev);
5961 			pr_info("%s: clone image has been flattened\n",
5962 				rbd_dev->disk->disk_name);
5963 		}
5964 
5965 		goto out;	/* No parent?  No problem. */
5966 	}
5967 
5968 	/* The ceph file layout needs the pool id to fit in 32 bits */
5969 
5970 	ret = -EIO;
5971 	if (pii.pool_id > (u64)U32_MAX) {
5972 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5973 			(unsigned long long)pii.pool_id, U32_MAX);
5974 		goto out_err;
5975 	}
5976 
5977 	/*
5978 	 * The parent won't change (except when the clone is
5979 	 * flattened, which is handled above).  So we only need to
5980 	 * record the parent spec if we have not already done so.
5981 	 */
5982 	if (!rbd_dev->parent_spec) {
5983 		parent_spec->pool_id = pii.pool_id;
5984 		if (pii.pool_ns && *pii.pool_ns) {
5985 			parent_spec->pool_ns = pii.pool_ns;
5986 			pii.pool_ns = NULL;
5987 		}
5988 		parent_spec->image_id = pii.image_id;
5989 		pii.image_id = NULL;
5990 		parent_spec->snap_id = pii.snap_id;
5991 
5992 		rbd_dev->parent_spec = parent_spec;
5993 		parent_spec = NULL;	/* rbd_dev now owns this */
5994 	}
5995 
5996 	/*
5997 	 * We always update the parent overlap.  If it's zero we issue
5998 	 * a warning, as we will proceed as if there was no parent.
5999 	 */
6000 	if (!pii.overlap) {
6001 		if (parent_spec) {
6002 			/* refresh, careful to warn just once */
6003 			if (rbd_dev->parent_overlap)
6004 				rbd_warn(rbd_dev,
6005 				    "clone now standalone (overlap became 0)");
6006 		} else {
6007 			/* initial probe */
6008 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
6009 		}
6010 	}
6011 	rbd_dev->parent_overlap = pii.overlap;
6012 
6013 out:
6014 	ret = 0;
6015 out_err:
6016 	kfree(pii.pool_ns);
6017 	kfree(pii.image_id);
6018 	rbd_spec_put(parent_spec);
6019 	return ret;
6020 }
6021 
6022 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
6023 {
6024 	struct {
6025 		__le64 stripe_unit;
6026 		__le64 stripe_count;
6027 	} __attribute__ ((packed)) striping_info_buf = { 0 };
6028 	size_t size = sizeof (striping_info_buf);
6029 	void *p;
6030 	int ret;
6031 
6032 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6033 				&rbd_dev->header_oloc, "get_stripe_unit_count",
6034 				NULL, 0, &striping_info_buf, size);
6035 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6036 	if (ret < 0)
6037 		return ret;
6038 	if (ret < size)
6039 		return -ERANGE;
6040 
6041 	p = &striping_info_buf;
6042 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
6043 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
6044 	return 0;
6045 }
6046 
6047 static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
6048 {
6049 	__le64 data_pool_id;
6050 	int ret;
6051 
6052 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6053 				  &rbd_dev->header_oloc, "get_data_pool",
6054 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
6055 	if (ret < 0)
6056 		return ret;
6057 	if (ret < sizeof(data_pool_id))
6058 		return -EBADMSG;
6059 
6060 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
6061 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
6062 	return 0;
6063 }
6064 
6065 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
6066 {
6067 	CEPH_DEFINE_OID_ONSTACK(oid);
6068 	size_t image_id_size;
6069 	char *image_id;
6070 	void *p;
6071 	void *end;
6072 	size_t size;
6073 	void *reply_buf = NULL;
6074 	size_t len = 0;
6075 	char *image_name = NULL;
6076 	int ret;
6077 
6078 	rbd_assert(!rbd_dev->spec->image_name);
6079 
6080 	len = strlen(rbd_dev->spec->image_id);
6081 	image_id_size = sizeof (__le32) + len;
6082 	image_id = kmalloc(image_id_size, GFP_KERNEL);
6083 	if (!image_id)
6084 		return NULL;
6085 
6086 	p = image_id;
6087 	end = image_id + image_id_size;
6088 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
6089 
6090 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
6091 	reply_buf = kmalloc(size, GFP_KERNEL);
6092 	if (!reply_buf)
6093 		goto out;
6094 
6095 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
6096 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6097 				  "dir_get_name", image_id, image_id_size,
6098 				  reply_buf, size);
6099 	if (ret < 0)
6100 		goto out;
6101 	p = reply_buf;
6102 	end = reply_buf + ret;
6103 
6104 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
6105 	if (IS_ERR(image_name))
6106 		image_name = NULL;
6107 	else
6108 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
6109 out:
6110 	kfree(reply_buf);
6111 	kfree(image_id);
6112 
6113 	return image_name;
6114 }
6115 
6116 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6117 {
6118 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6119 	const char *snap_name;
6120 	u32 which = 0;
6121 
6122 	/* Skip over names until we find the one we are looking for */
6123 
6124 	snap_name = rbd_dev->header.snap_names;
6125 	while (which < snapc->num_snaps) {
6126 		if (!strcmp(name, snap_name))
6127 			return snapc->snaps[which];
6128 		snap_name += strlen(snap_name) + 1;
6129 		which++;
6130 	}
6131 	return CEPH_NOSNAP;
6132 }
6133 
6134 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6135 {
6136 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6137 	u32 which;
6138 	bool found = false;
6139 	u64 snap_id;
6140 
6141 	for (which = 0; !found && which < snapc->num_snaps; which++) {
6142 		const char *snap_name;
6143 
6144 		snap_id = snapc->snaps[which];
6145 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6146 		if (IS_ERR(snap_name)) {
6147 			/* ignore no-longer existing snapshots */
6148 			if (PTR_ERR(snap_name) == -ENOENT)
6149 				continue;
6150 			else
6151 				break;
6152 		}
6153 		found = !strcmp(name, snap_name);
6154 		kfree(snap_name);
6155 	}
6156 	return found ? snap_id : CEPH_NOSNAP;
6157 }
6158 
6159 /*
6160  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6161  * no snapshot by that name is found, or if an error occurs.
6162  */
6163 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6164 {
6165 	if (rbd_dev->image_format == 1)
6166 		return rbd_v1_snap_id_by_name(rbd_dev, name);
6167 
6168 	return rbd_v2_snap_id_by_name(rbd_dev, name);
6169 }
6170 
6171 /*
6172  * An image being mapped will have everything but the snap id.
6173  */
6174 static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6175 {
6176 	struct rbd_spec *spec = rbd_dev->spec;
6177 
6178 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6179 	rbd_assert(spec->image_id && spec->image_name);
6180 	rbd_assert(spec->snap_name);
6181 
6182 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6183 		u64 snap_id;
6184 
6185 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6186 		if (snap_id == CEPH_NOSNAP)
6187 			return -ENOENT;
6188 
6189 		spec->snap_id = snap_id;
6190 	} else {
6191 		spec->snap_id = CEPH_NOSNAP;
6192 	}
6193 
6194 	return 0;
6195 }
6196 
6197 /*
6198  * A parent image will have all ids but none of the names.
6199  *
6200  * All names in an rbd spec are dynamically allocated.  It's OK if we
6201  * can't figure out the name for an image id.
6202  */
6203 static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
6204 {
6205 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6206 	struct rbd_spec *spec = rbd_dev->spec;
6207 	const char *pool_name;
6208 	const char *image_name;
6209 	const char *snap_name;
6210 	int ret;
6211 
6212 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
6213 	rbd_assert(spec->image_id);
6214 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
6215 
6216 	/* Get the pool name; we have to make our own copy of this */
6217 
6218 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6219 	if (!pool_name) {
6220 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6221 		return -EIO;
6222 	}
6223 	pool_name = kstrdup(pool_name, GFP_KERNEL);
6224 	if (!pool_name)
6225 		return -ENOMEM;
6226 
6227 	/* Fetch the image name; tolerate failure here */
6228 
6229 	image_name = rbd_dev_image_name(rbd_dev);
6230 	if (!image_name)
6231 		rbd_warn(rbd_dev, "unable to get image name");
6232 
6233 	/* Fetch the snapshot name */
6234 
6235 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6236 	if (IS_ERR(snap_name)) {
6237 		ret = PTR_ERR(snap_name);
6238 		goto out_err;
6239 	}
6240 
6241 	spec->pool_name = pool_name;
6242 	spec->image_name = image_name;
6243 	spec->snap_name = snap_name;
6244 
6245 	return 0;
6246 
6247 out_err:
6248 	kfree(image_name);
6249 	kfree(pool_name);
6250 	return ret;
6251 }
6252 
6253 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
6254 {
6255 	size_t size;
6256 	int ret;
6257 	void *reply_buf;
6258 	void *p;
6259 	void *end;
6260 	u64 seq;
6261 	u32 snap_count;
6262 	struct ceph_snap_context *snapc;
6263 	u32 i;
6264 
6265 	/*
6266 	 * We'll need room for the seq value (maximum snapshot id),
6267 	 * the snapshot count, and an array of that many snapshot ids.
6268 	 * For now we have a fixed upper limit on the number we're
6269 	 * prepared to receive.
6270 	 */
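	/*
	 * Worked out with the constants used here: 8 + 4 + 510 * 8 = 4092
	 * bytes, so the whole reply buffer fits in a single 4 KiB page.
	 */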
6271 	size = sizeof (__le64) + sizeof (__le32) +
6272 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
6273 	reply_buf = kzalloc(size, GFP_KERNEL);
6274 	if (!reply_buf)
6275 		return -ENOMEM;
6276 
6277 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6278 				  &rbd_dev->header_oloc, "get_snapcontext",
6279 				  NULL, 0, reply_buf, size);
6280 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6281 	if (ret < 0)
6282 		goto out;
6283 
6284 	p = reply_buf;
6285 	end = reply_buf + ret;
6286 	ret = -ERANGE;
6287 	ceph_decode_64_safe(&p, end, seq, out);
6288 	ceph_decode_32_safe(&p, end, snap_count, out);
6289 
6290 	/*
6291 	 * Make sure the reported number of snapshot ids wouldn't go
6292 	 * beyond the end of our buffer.  But before checking that,
6293 	 * make sure the computed size of the snapshot context we
6294 	 * allocate is representable in a size_t.
6295 	 */
6296 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6297 				 / sizeof (u64)) {
6298 		ret = -EINVAL;
6299 		goto out;
6300 	}
6301 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6302 		goto out;
6303 	ret = 0;
6304 
6305 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
6306 	if (!snapc) {
6307 		ret = -ENOMEM;
6308 		goto out;
6309 	}
6310 	snapc->seq = seq;
6311 	for (i = 0; i < snap_count; i++)
6312 		snapc->snaps[i] = ceph_decode_64(&p);
6313 
6314 	ceph_put_snap_context(rbd_dev->header.snapc);
6315 	rbd_dev->header.snapc = snapc;
6316 
6317 	dout("  snap context seq = %llu, snap_count = %u\n",
6318 		(unsigned long long)seq, (unsigned int)snap_count);
6319 out:
6320 	kfree(reply_buf);
6321 
6322 	return ret;
6323 }
6324 
6325 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6326 					u64 snap_id)
6327 {
6328 	size_t size;
6329 	void *reply_buf;
6330 	__le64 snapid;
6331 	int ret;
6332 	void *p;
6333 	void *end;
6334 	char *snap_name;
6335 
6336 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6337 	reply_buf = kmalloc(size, GFP_KERNEL);
6338 	if (!reply_buf)
6339 		return ERR_PTR(-ENOMEM);
6340 
6341 	snapid = cpu_to_le64(snap_id);
6342 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6343 				  &rbd_dev->header_oloc, "get_snapshot_name",
6344 				  &snapid, sizeof(snapid), reply_buf, size);
6345 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6346 	if (ret < 0) {
6347 		snap_name = ERR_PTR(ret);
6348 		goto out;
6349 	}
6350 
6351 	p = reply_buf;
6352 	end = reply_buf + ret;
6353 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6354 	if (IS_ERR(snap_name))
6355 		goto out;
6356 
6357 	dout("  snap_id 0x%016llx snap_name = %s\n",
6358 		(unsigned long long)snap_id, snap_name);
6359 out:
6360 	kfree(reply_buf);
6361 
6362 	return snap_name;
6363 }
6364 
6365 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6366 {
6367 	bool first_time = rbd_dev->header.object_prefix == NULL;
6368 	int ret;
6369 
6370 	ret = rbd_dev_v2_image_size(rbd_dev);
6371 	if (ret)
6372 		return ret;
6373 
6374 	if (first_time) {
6375 		ret = rbd_dev_v2_header_onetime(rbd_dev);
6376 		if (ret)
6377 			return ret;
6378 	}
6379 
6380 	ret = rbd_dev_v2_snap_context(rbd_dev);
6381 	if (ret && first_time) {
6382 		kfree(rbd_dev->header.object_prefix);
6383 		rbd_dev->header.object_prefix = NULL;
6384 	}
6385 
6386 	return ret;
6387 }
6388 
6389 static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6390 {
6391 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6392 
6393 	if (rbd_dev->image_format == 1)
6394 		return rbd_dev_v1_header_info(rbd_dev);
6395 
6396 	return rbd_dev_v2_header_info(rbd_dev);
6397 }
6398 
6399 /*
6400  * Skips over white space at *buf, and updates *buf to point to the
6401  * first found non-space character (if any). Returns the length of
6402  * the token (string of non-white space characters) found.  Note
6403  * that *buf must be terminated with '\0'.
6404  */
6405 static inline size_t next_token(const char **buf)
6406 {
6407 	/*
6408 	 * These are the characters that produce nonzero for
6409 	 * isspace() in the "C" and "POSIX" locales.
6410 	 */
6411 	const char *spaces = " \f\n\r\t\v";
6412 
6413 	*buf += strspn(*buf, spaces);	/* Find start of token */
6414 
6415 	return strcspn(*buf, spaces);	/* Return token length */
6416 }
6417 
6418 /*
6419  * Finds the next token in *buf, dynamically allocates a buffer big
6420  * enough to hold a copy of it, and copies the token into the new
6421  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6422  * that a duplicate buffer is created even for a zero-length token.
6423  *
6424  * Returns a pointer to the newly-allocated duplicate, or a null
6425  * pointer if memory for the duplicate was not available.  If
6426  * the lenp argument is a non-null pointer, the length of the token
6427  * (not including the '\0') is returned in *lenp.
6428  *
6429  * If successful, the *buf pointer will be updated to point beyond
6430  * the end of the found token.
6431  *
6432  * Note: uses GFP_KERNEL for allocation.
6433  */
6434 static inline char *dup_token(const char **buf, size_t *lenp)
6435 {
6436 	char *dup;
6437 	size_t len;
6438 
6439 	len = next_token(buf);
6440 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6441 	if (!dup)
6442 		return NULL;
6443 	*(dup + len) = '\0';
6444 	*buf += len;
6445 
6446 	if (lenp)
6447 		*lenp = len;
6448 
6449 	return dup;
6450 }
6451 
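/*
 * Illustration only (hypothetical buffer, not used anywhere in this
 * file): with *buf pointing at "  rbd foo", next_token() advances *buf
 * past the two leading spaces and returns 3 (the length of "rbd");
 * dup_token() does the same but also returns a kmalloc'd, NUL-terminated
 * copy "rbd" and leaves *buf pointing at " foo", ready for the next call.
 */
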
6452 /*
6453  * Parse the options provided for an "rbd add" (i.e., rbd image
6454  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6455  * and the data written is passed here via a NUL-terminated buffer.
6456  * Returns 0 if successful or an error code otherwise.
6457  *
6458  * The information extracted from these options is recorded in
6459  * the other parameters which return dynamically-allocated
6460  * structures:
6461  *  ceph_opts
6462  *      The address of a pointer that will refer to a ceph options
6463  *      structure.  Caller must release the returned pointer using
6464  *      ceph_destroy_options() when it is no longer needed.
6465  *  rbd_opts
6466  *	Address of an rbd options pointer.  Fully initialized by
6467  *	this function; caller must release with kfree().
6468  *  spec
6469  *	Address of an rbd image specification pointer.  Fully
6470  *	initialized by this function based on parsed options.
6471  *	Caller must release with rbd_spec_put().
6472  *
6473  * The options passed take this form:
6474  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
6475  * where:
6476  *  <mon_addrs>
6477  *      A comma-separated list of one or more monitor addresses.
6478  *      A monitor address is an ip address, optionally followed
6479  *      by a port number (separated by a colon).
6480  *        I.e.:  ip1[:port1][,ip2[:port2]...]
6481  *  <options>
6482  *      A comma-separated list of ceph and/or rbd options.
6483  *  <pool_name>
6484  *      The name of the rados pool containing the rbd image.
6485  *  <image_name>
6486  *      The name of the image in that pool to map.
6487  *  <snap_name>
6488  *      An optional snapshot name.  If provided, the mapping will
6489  *      present data from the image at the time that snapshot was
6490  *      created.  The image head is used if no snapshot name is
6491  *      provided.  Snapshot mappings are always read-only.
6492  */
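/*
 * A hypothetical request written to /sys/bus/rbd/add could therefore look
 * like this (all values below are made up for illustration):
 *
 *   1.2.3.4:6789 name=admin,secret=<base64 key> rbd foo -
 *
 * i.e. one monitor, two ceph options, pool "rbd", image "foo" and no
 * snapshot ("-" maps the image head).
 */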
6493 static int rbd_add_parse_args(const char *buf,
6494 				struct ceph_options **ceph_opts,
6495 				struct rbd_options **opts,
6496 				struct rbd_spec **rbd_spec)
6497 {
6498 	size_t len;
6499 	char *options;
6500 	const char *mon_addrs;
6501 	char *snap_name;
6502 	size_t mon_addrs_size;
6503 	struct parse_rbd_opts_ctx pctx = { 0 };
6504 	struct ceph_options *copts;
6505 	int ret;
6506 
6507 	/* The first four tokens are required */
6508 
6509 	len = next_token(&buf);
6510 	if (!len) {
6511 		rbd_warn(NULL, "no monitor address(es) provided");
6512 		return -EINVAL;
6513 	}
6514 	mon_addrs = buf;
6515 	mon_addrs_size = len + 1;
6516 	buf += len;
6517 
6518 	ret = -EINVAL;
6519 	options = dup_token(&buf, NULL);
6520 	if (!options)
6521 		return -ENOMEM;
6522 	if (!*options) {
6523 		rbd_warn(NULL, "no options provided");
6524 		goto out_err;
6525 	}
6526 
6527 	pctx.spec = rbd_spec_alloc();
6528 	if (!pctx.spec)
6529 		goto out_mem;
6530 
6531 	pctx.spec->pool_name = dup_token(&buf, NULL);
6532 	if (!pctx.spec->pool_name)
6533 		goto out_mem;
6534 	if (!*pctx.spec->pool_name) {
6535 		rbd_warn(NULL, "no pool name provided");
6536 		goto out_err;
6537 	}
6538 
6539 	pctx.spec->image_name = dup_token(&buf, NULL);
6540 	if (!pctx.spec->image_name)
6541 		goto out_mem;
6542 	if (!*pctx.spec->image_name) {
6543 		rbd_warn(NULL, "no image name provided");
6544 		goto out_err;
6545 	}
6546 
6547 	/*
6548 	 * Snapshot name is optional; default is to use "-"
6549 	 * (indicating the head/no snapshot).
6550 	 */
6551 	len = next_token(&buf);
6552 	if (!len) {
6553 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6554 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6555 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6556 		ret = -ENAMETOOLONG;
6557 		goto out_err;
6558 	}
6559 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6560 	if (!snap_name)
6561 		goto out_mem;
6562 	*(snap_name + len) = '\0';
6563 	pctx.spec->snap_name = snap_name;
6564 
6565 	/* Initialize all rbd options to the defaults */
6566 
6567 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6568 	if (!pctx.opts)
6569 		goto out_mem;
6570 
6571 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6572 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
6573 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6574 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6575 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6576 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6577 	pctx.opts->trim = RBD_TRIM_DEFAULT;
6578 
6579 	copts = ceph_parse_options(options, mon_addrs,
6580 				   mon_addrs + mon_addrs_size - 1,
6581 				   parse_rbd_opts_token, &pctx);
6582 	if (IS_ERR(copts)) {
6583 		ret = PTR_ERR(copts);
6584 		goto out_err;
6585 	}
6586 	kfree(options);
6587 
6588 	*ceph_opts = copts;
6589 	*opts = pctx.opts;
6590 	*rbd_spec = pctx.spec;
6591 
6592 	return 0;
6593 out_mem:
6594 	ret = -ENOMEM;
6595 out_err:
6596 	kfree(pctx.opts);
6597 	rbd_spec_put(pctx.spec);
6598 	kfree(options);
6599 
6600 	return ret;
6601 }
6602 
6603 static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6604 {
6605 	down_write(&rbd_dev->lock_rwsem);
6606 	if (__rbd_is_lock_owner(rbd_dev))
6607 		__rbd_release_lock(rbd_dev);
6608 	up_write(&rbd_dev->lock_rwsem);
6609 }
6610 
6611 /*
6612  * If the wait is interrupted, an error is returned even if the lock
6613  * was successfully acquired.  rbd_dev_image_unlock() will release it
6614  * if needed.
6615  */
6616 static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6617 {
6618 	long ret;
6619 
6620 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6621 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6622 			return 0;
6623 
6624 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6625 		return -EINVAL;
6626 	}
6627 
6628 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
6629 		return 0;
6630 
6631 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6632 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6633 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6634 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6635 	if (ret > 0)
6636 		ret = rbd_dev->acquire_err;
6637 	else if (!ret)
6638 		ret = -ETIMEDOUT;
6639 
6640 	if (ret) {
6641 		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6642 		return ret;
6643 	}
6644 
6645 	/*
6646 	 * The lock may have been released by now, unless automatic lock
6647 	 * transitions are disabled.
6648 	 */
6649 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6650 	return 0;
6651 }
6652 
6653 /*
6654  * An rbd format 2 image has a unique identifier, distinct from the
6655  * name given to it by the user.  Internally, that identifier is
6656  * what's used to specify the names of objects related to the image.
6657  *
6658  * A special "rbd id" object is used to map an rbd image name to its
6659  * id.  If that object doesn't exist, then there is no v2 rbd image
6660  * with the supplied name.
6661  *
6662  * This function will record the given rbd_dev's image_id field if
6663  * it can be determined, and in that case will return 0.  If any
6664  * errors occur a negative errno will be returned and the rbd_dev's
6665  * image_id field will be unchanged (and should be NULL).
6666  */
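/*
 * For illustration, assuming RBD_ID_PREFIX is "rbd_id." (see
 * rbd_types.h): an image named "foo" has its id stored in the object
 * "rbd_id.foo", whose "get_id" class method returns the image id.
 */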
6667 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6668 {
6669 	int ret;
6670 	size_t size;
6671 	CEPH_DEFINE_OID_ONSTACK(oid);
6672 	void *response;
6673 	char *image_id;
6674 
6675 	/*
6676 	 * When probing a parent image, the image id is already
6677 	 * known (and the image name likely is not).  There's no
6678 	 * need to fetch the image id again in this case.  We
6679 	 * do still need to set the image format though.
6680 	 */
6681 	if (rbd_dev->spec->image_id) {
6682 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6683 
6684 		return 0;
6685 	}
6686 
6687 	/*
6688 	 * First, see if the format 2 image id file exists, and if
6689 	 * so, get the image's persistent id from it.
6690 	 */
6691 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6692 			       rbd_dev->spec->image_name);
6693 	if (ret)
6694 		return ret;
6695 
6696 	dout("rbd id object name is %s\n", oid.name);
6697 
6698 	/* Response will be an encoded string, which includes a length */
6699 
6700 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6701 	response = kzalloc(size, GFP_NOIO);
6702 	if (!response) {
6703 		ret = -ENOMEM;
6704 		goto out;
6705 	}
6706 
6707 	/* If it doesn't exist we'll assume it's a format 1 image */
6708 
6709 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6710 				  "get_id", NULL, 0,
6711 				  response, RBD_IMAGE_ID_LEN_MAX);
6712 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6713 	if (ret == -ENOENT) {
6714 		image_id = kstrdup("", GFP_KERNEL);
6715 		ret = image_id ? 0 : -ENOMEM;
6716 		if (!ret)
6717 			rbd_dev->image_format = 1;
6718 	} else if (ret >= 0) {
6719 		void *p = response;
6720 
6721 		image_id = ceph_extract_encoded_string(&p, p + ret,
6722 						NULL, GFP_NOIO);
6723 		ret = PTR_ERR_OR_ZERO(image_id);
6724 		if (!ret)
6725 			rbd_dev->image_format = 2;
6726 	}
6727 
6728 	if (!ret) {
6729 		rbd_dev->spec->image_id = image_id;
6730 		dout("image_id is %s\n", image_id);
6731 	}
6732 out:
6733 	kfree(response);
6734 	ceph_oid_destroy(&oid);
6735 	return ret;
6736 }
6737 
6738 /*
6739  * Undo whatever state changes are made by v1 or v2 header info
6740  * call.
6741  */
6742 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6743 {
6744 	struct rbd_image_header	*header;
6745 
6746 	rbd_dev_parent_put(rbd_dev);
6747 	rbd_object_map_free(rbd_dev);
6748 	rbd_dev_mapping_clear(rbd_dev);
6749 
6750 	/* Free dynamic fields from the header, then zero it out */
6751 
6752 	header = &rbd_dev->header;
6753 	ceph_put_snap_context(header->snapc);
6754 	kfree(header->snap_sizes);
6755 	kfree(header->snap_names);
6756 	kfree(header->object_prefix);
6757 	memset(header, 0, sizeof (*header));
6758 }
6759 
6760 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6761 {
6762 	int ret;
6763 
6764 	ret = rbd_dev_v2_object_prefix(rbd_dev);
6765 	if (ret)
6766 		goto out_err;
6767 
6768 	/*
6769 	 * Get and check the features for the image.  Currently the
6770 	 * features are assumed to never change.
6771 	 */
6772 	ret = rbd_dev_v2_features(rbd_dev);
6773 	if (ret)
6774 		goto out_err;
6775 
6776 	/* If the image supports fancy striping, get its parameters */
6777 
6778 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6779 		ret = rbd_dev_v2_striping_info(rbd_dev);
6780 		if (ret < 0)
6781 			goto out_err;
6782 	}
6783 
6784 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6785 		ret = rbd_dev_v2_data_pool(rbd_dev);
6786 		if (ret)
6787 			goto out_err;
6788 	}
6789 
6790 	rbd_init_layout(rbd_dev);
6791 	return 0;
6792 
6793 out_err:
6794 	rbd_dev->header.features = 0;
6795 	kfree(rbd_dev->header.object_prefix);
6796 	rbd_dev->header.object_prefix = NULL;
6797 	return ret;
6798 }
6799 
6800 /*
6801  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6802  * rbd_dev_image_probe() recursion depth, which means it's also the
6803  * length of the already discovered part of the parent chain.
6804  */
6805 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
6806 {
6807 	struct rbd_device *parent = NULL;
6808 	int ret;
6809 
6810 	if (!rbd_dev->parent_spec)
6811 		return 0;
6812 
6813 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6814 		pr_info("parent chain is too long (%d)\n", depth);
6815 		ret = -EINVAL;
6816 		goto out_err;
6817 	}
6818 
6819 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
6820 	if (!parent) {
6821 		ret = -ENOMEM;
6822 		goto out_err;
6823 	}
6824 
6825 	/*
6826 	 * Images related by parent/child relationships always share
6827 	 * rbd_client and spec/parent_spec, so bump their refcounts.
6828 	 */
6829 	__rbd_get_client(rbd_dev->rbd_client);
6830 	rbd_spec_get(rbd_dev->parent_spec);
6831 
6832 	ret = rbd_dev_image_probe(parent, depth);
6833 	if (ret < 0)
6834 		goto out_err;
6835 
6836 	rbd_dev->parent = parent;
6837 	atomic_set(&rbd_dev->parent_ref, 1);
6838 	return 0;
6839 
6840 out_err:
6841 	rbd_dev_unparent(rbd_dev);
6842 	rbd_dev_destroy(parent);
6843 	return ret;
6844 }
6845 
6846 static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6847 {
6848 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6849 	rbd_free_disk(rbd_dev);
6850 	if (!single_major)
6851 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6852 }
6853 
6854 /*
6855  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6856  * upon return.
6857  */
6858 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6859 {
6860 	int ret;
6861 
6862 	/* Record our major and minor device numbers. */
6863 
6864 	if (!single_major) {
6865 		ret = register_blkdev(0, rbd_dev->name);
6866 		if (ret < 0)
6867 			goto err_out_unlock;
6868 
6869 		rbd_dev->major = ret;
6870 		rbd_dev->minor = 0;
6871 	} else {
6872 		rbd_dev->major = rbd_major;
6873 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6874 	}
6875 
6876 	/* Set up the blkdev mapping. */
6877 
6878 	ret = rbd_init_disk(rbd_dev);
6879 	if (ret)
6880 		goto err_out_blkdev;
6881 
6882 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
6883 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
6884 
6885 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6886 	if (ret)
6887 		goto err_out_disk;
6888 
6889 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6890 	up_write(&rbd_dev->header_rwsem);
6891 	return 0;
6892 
6893 err_out_disk:
6894 	rbd_free_disk(rbd_dev);
6895 err_out_blkdev:
6896 	if (!single_major)
6897 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6898 err_out_unlock:
6899 	up_write(&rbd_dev->header_rwsem);
6900 	return ret;
6901 }
6902 
6903 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6904 {
6905 	struct rbd_spec *spec = rbd_dev->spec;
6906 	int ret;
6907 
6908 	/* Record the header object name for this rbd image. */
6909 
6910 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6911 	if (rbd_dev->image_format == 1)
6912 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6913 				       spec->image_name, RBD_SUFFIX);
6914 	else
6915 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6916 				       RBD_HEADER_PREFIX, spec->image_id);
6917 
6918 	return ret;
6919 }
6920 
6921 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6922 {
6923 	rbd_dev_unprobe(rbd_dev);
6924 	if (rbd_dev->opts)
6925 		rbd_unregister_watch(rbd_dev);
6926 	rbd_dev->image_format = 0;
6927 	kfree(rbd_dev->spec->image_id);
6928 	rbd_dev->spec->image_id = NULL;
6929 }
6930 
6931 /*
6932  * Probe for the existence of the header object for the given rbd
6933  * device.  If this image is the one being mapped (i.e., not a
6934  * parent), initiate a watch on its header object before using that
6935  * object to get detailed information about the rbd image.
6936  */
6937 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6938 {
6939 	int ret;
6940 
6941 	/*
6942 	 * Get the id from the image id object.  Unless there's an
6943 	 * error, rbd_dev->spec->image_id will be filled in with
6944 	 * a dynamically-allocated string, and rbd_dev->image_format
6945 	 * will be set to either 1 or 2.
6946 	 */
6947 	ret = rbd_dev_image_id(rbd_dev);
6948 	if (ret)
6949 		return ret;
6950 
6951 	ret = rbd_dev_header_name(rbd_dev);
6952 	if (ret)
6953 		goto err_out_format;
6954 
6955 	if (!depth) {
6956 		ret = rbd_register_watch(rbd_dev);
6957 		if (ret) {
6958 			if (ret == -ENOENT)
6959 				pr_info("image %s/%s%s%s does not exist\n",
6960 					rbd_dev->spec->pool_name,
6961 					rbd_dev->spec->pool_ns ?: "",
6962 					rbd_dev->spec->pool_ns ? "/" : "",
6963 					rbd_dev->spec->image_name);
6964 			goto err_out_format;
6965 		}
6966 	}
6967 
6968 	ret = rbd_dev_header_info(rbd_dev);
6969 	if (ret)
6970 		goto err_out_watch;
6971 
6972 	/*
6973 	 * If this image is the one being mapped, we have pool name and
6974 	 * id, image name and id, and snap name - need to fill snap id.
6975 	 * Otherwise this is a parent image, identified by pool, image
6976 	 * and snap ids - need to fill in names for those ids.
6977 	 */
6978 	if (!depth)
6979 		ret = rbd_spec_fill_snap_id(rbd_dev);
6980 	else
6981 		ret = rbd_spec_fill_names(rbd_dev);
6982 	if (ret) {
6983 		if (ret == -ENOENT)
6984 			pr_info("snap %s/%s%s%s@%s does not exist\n",
6985 				rbd_dev->spec->pool_name,
6986 				rbd_dev->spec->pool_ns ?: "",
6987 				rbd_dev->spec->pool_ns ? "/" : "",
6988 				rbd_dev->spec->image_name,
6989 				rbd_dev->spec->snap_name);
6990 		goto err_out_probe;
6991 	}
6992 
6993 	ret = rbd_dev_mapping_set(rbd_dev);
6994 	if (ret)
6995 		goto err_out_probe;
6996 
6997 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
6998 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
6999 		ret = rbd_object_map_load(rbd_dev);
7000 		if (ret)
7001 			goto err_out_probe;
7002 	}
7003 
7004 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7005 		ret = rbd_dev_v2_parent_info(rbd_dev);
7006 		if (ret)
7007 			goto err_out_probe;
7008 	}
7009 
7010 	ret = rbd_dev_probe_parent(rbd_dev, depth);
7011 	if (ret)
7012 		goto err_out_probe;
7013 
7014 	dout("discovered format %u image, header name is %s\n",
7015 		rbd_dev->image_format, rbd_dev->header_oid.name);
7016 	return 0;
7017 
7018 err_out_probe:
7019 	rbd_dev_unprobe(rbd_dev);
7020 err_out_watch:
7021 	if (!depth)
7022 		rbd_unregister_watch(rbd_dev);
7023 err_out_format:
7024 	rbd_dev->image_format = 0;
7025 	kfree(rbd_dev->spec->image_id);
7026 	rbd_dev->spec->image_id = NULL;
7027 	return ret;
7028 }
7029 
7030 static ssize_t do_rbd_add(struct bus_type *bus,
7031 			  const char *buf,
7032 			  size_t count)
7033 {
7034 	struct rbd_device *rbd_dev = NULL;
7035 	struct ceph_options *ceph_opts = NULL;
7036 	struct rbd_options *rbd_opts = NULL;
7037 	struct rbd_spec *spec = NULL;
7038 	struct rbd_client *rbdc;
7039 	int rc;
7040 
7041 	if (!try_module_get(THIS_MODULE))
7042 		return -ENODEV;
7043 
7044 	/* parse add command */
7045 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7046 	if (rc < 0)
7047 		goto out;
7048 
7049 	rbdc = rbd_get_client(ceph_opts);
7050 	if (IS_ERR(rbdc)) {
7051 		rc = PTR_ERR(rbdc);
7052 		goto err_out_args;
7053 	}
7054 
7055 	/* pick the pool */
7056 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
7057 	if (rc < 0) {
7058 		if (rc == -ENOENT)
7059 			pr_info("pool %s does not exist\n", spec->pool_name);
7060 		goto err_out_client;
7061 	}
7062 	spec->pool_id = (u64)rc;
7063 
7064 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7065 	if (!rbd_dev) {
7066 		rc = -ENOMEM;
7067 		goto err_out_client;
7068 	}
7069 	rbdc = NULL;		/* rbd_dev now owns this */
7070 	spec = NULL;		/* rbd_dev now owns this */
7071 	rbd_opts = NULL;	/* rbd_dev now owns this */
7072 
7073 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7074 	if (!rbd_dev->config_info) {
7075 		rc = -ENOMEM;
7076 		goto err_out_rbd_dev;
7077 	}
7078 
7079 	down_write(&rbd_dev->header_rwsem);
7080 	rc = rbd_dev_image_probe(rbd_dev, 0);
7081 	if (rc < 0) {
7082 		up_write(&rbd_dev->header_rwsem);
7083 		goto err_out_rbd_dev;
7084 	}
7085 
7086 	/* If we are mapping a snapshot it must be marked read-only */
7087 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
7088 		rbd_dev->opts->read_only = true;
7089 
7090 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7091 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7092 			 rbd_dev->layout.object_size);
7093 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7094 	}
7095 
7096 	rc = rbd_dev_device_setup(rbd_dev);
7097 	if (rc)
7098 		goto err_out_image_probe;
7099 
7100 	rc = rbd_add_acquire_lock(rbd_dev);
7101 	if (rc)
7102 		goto err_out_image_lock;
7103 
7104 	/* Everything's ready.  Announce the disk to the world. */
7105 
7106 	rc = device_add(&rbd_dev->dev);
7107 	if (rc)
7108 		goto err_out_image_lock;
7109 
7110 	add_disk(rbd_dev->disk);
7111 	/* see rbd_init_disk() */
7112 	blk_put_queue(rbd_dev->disk->queue);
7113 
7114 	spin_lock(&rbd_dev_list_lock);
7115 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
7116 	spin_unlock(&rbd_dev_list_lock);
7117 
7118 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7119 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7120 		rbd_dev->header.features);
7121 	rc = count;
7122 out:
7123 	module_put(THIS_MODULE);
7124 	return rc;
7125 
7126 err_out_image_lock:
7127 	rbd_dev_image_unlock(rbd_dev);
7128 	rbd_dev_device_release(rbd_dev);
7129 err_out_image_probe:
7130 	rbd_dev_image_release(rbd_dev);
7131 err_out_rbd_dev:
7132 	rbd_dev_destroy(rbd_dev);
7133 err_out_client:
7134 	rbd_put_client(rbdc);
7135 err_out_args:
7136 	rbd_spec_put(spec);
7137 	kfree(rbd_opts);
7138 	goto out;
7139 }
7140 
7141 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
7142 {
7143 	if (single_major)
7144 		return -EINVAL;
7145 
7146 	return do_rbd_add(bus, buf, count);
7147 }
7148 
7149 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7150 				      size_t count)
7151 {
7152 	return do_rbd_add(bus, buf, count);
7153 }
7154 
7155 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7156 {
7157 	while (rbd_dev->parent) {
7158 		struct rbd_device *first = rbd_dev;
7159 		struct rbd_device *second = first->parent;
7160 		struct rbd_device *third;
7161 
7162 		/*
7163 		 * Walk down to the deepest parent (the one with no
7164 		 * grandparent) and remove it.
7165 		 */
7166 		while (second && (third = second->parent)) {
7167 			first = second;
7168 			second = third;
7169 		}
7170 		rbd_assert(second);
7171 		rbd_dev_image_release(second);
7172 		rbd_dev_destroy(second);
7173 		first->parent = NULL;
7174 		first->parent_overlap = 0;
7175 
7176 		rbd_assert(first->parent_spec);
7177 		rbd_spec_put(first->parent_spec);
7178 		first->parent_spec = NULL;
7179 	}
7180 }
7181 
7182 static ssize_t do_rbd_remove(struct bus_type *bus,
7183 			     const char *buf,
7184 			     size_t count)
7185 {
7186 	struct rbd_device *rbd_dev = NULL;
7187 	struct list_head *tmp;
7188 	int dev_id;
7189 	char opt_buf[6];
7190 	bool force = false;
7191 	int ret;
7192 
7193 	dev_id = -1;
7194 	opt_buf[0] = '\0';
7195 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
7196 	if (dev_id < 0) {
7197 		pr_err("dev_id out of range\n");
7198 		return -EINVAL;
7199 	}
7200 	if (opt_buf[0] != '\0') {
7201 		if (!strcmp(opt_buf, "force")) {
7202 			force = true;
7203 		} else {
7204 			pr_err("bad remove option at '%s'\n", opt_buf);
7205 			return -EINVAL;
7206 		}
7207 	}
7208 
7209 	ret = -ENOENT;
7210 	spin_lock(&rbd_dev_list_lock);
7211 	list_for_each(tmp, &rbd_dev_list) {
7212 		rbd_dev = list_entry(tmp, struct rbd_device, node);
7213 		if (rbd_dev->dev_id == dev_id) {
7214 			ret = 0;
7215 			break;
7216 		}
7217 	}
7218 	if (!ret) {
7219 		spin_lock_irq(&rbd_dev->lock);
7220 		if (rbd_dev->open_count && !force)
7221 			ret = -EBUSY;
7222 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7223 					  &rbd_dev->flags))
7224 			ret = -EINPROGRESS;
7225 		spin_unlock_irq(&rbd_dev->lock);
7226 	}
7227 	spin_unlock(&rbd_dev_list_lock);
7228 	if (ret)
7229 		return ret;
7230 
7231 	if (force) {
7232 		/*
7233 		 * Prevent new IO from being queued and wait for existing
7234 		 * IO to complete/fail.
7235 		 */
7236 		blk_mq_freeze_queue(rbd_dev->disk->queue);
7237 		blk_set_queue_dying(rbd_dev->disk->queue);
7238 	}
7239 
7240 	del_gendisk(rbd_dev->disk);
7241 	spin_lock(&rbd_dev_list_lock);
7242 	list_del_init(&rbd_dev->node);
7243 	spin_unlock(&rbd_dev_list_lock);
7244 	device_del(&rbd_dev->dev);
7245 
7246 	rbd_dev_image_unlock(rbd_dev);
7247 	rbd_dev_device_release(rbd_dev);
7248 	rbd_dev_image_release(rbd_dev);
7249 	rbd_dev_destroy(rbd_dev);
7250 	return count;
7251 }
7252 
7253 static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
7254 {
7255 	if (single_major)
7256 		return -EINVAL;
7257 
7258 	return do_rbd_remove(bus, buf, count);
7259 }
7260 
7261 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7262 					 size_t count)
7263 {
7264 	return do_rbd_remove(bus, buf, count);
7265 }
7266 
7267 /*
7268  * create control files in sysfs
7269  * /sys/bus/rbd/...
7270  */
7271 static int __init rbd_sysfs_init(void)
7272 {
7273 	int ret;
7274 
7275 	ret = device_register(&rbd_root_dev);
7276 	if (ret < 0)
7277 		return ret;
7278 
7279 	ret = bus_register(&rbd_bus_type);
7280 	if (ret < 0)
7281 		device_unregister(&rbd_root_dev);
7282 
7283 	return ret;
7284 }
7285 
7286 static void __exit rbd_sysfs_cleanup(void)
7287 {
7288 	bus_unregister(&rbd_bus_type);
7289 	device_unregister(&rbd_root_dev);
7290 }
7291 
7292 static int __init rbd_slab_init(void)
7293 {
7294 	rbd_assert(!rbd_img_request_cache);
7295 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7296 	if (!rbd_img_request_cache)
7297 		return -ENOMEM;
7298 
7299 	rbd_assert(!rbd_obj_request_cache);
7300 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
7301 	if (!rbd_obj_request_cache)
7302 		goto out_err;
7303 
7304 	return 0;
7305 
7306 out_err:
7307 	kmem_cache_destroy(rbd_img_request_cache);
7308 	rbd_img_request_cache = NULL;
7309 	return -ENOMEM;
7310 }
7311 
7312 static void rbd_slab_exit(void)
7313 {
7314 	rbd_assert(rbd_obj_request_cache);
7315 	kmem_cache_destroy(rbd_obj_request_cache);
7316 	rbd_obj_request_cache = NULL;
7317 
7318 	rbd_assert(rbd_img_request_cache);
7319 	kmem_cache_destroy(rbd_img_request_cache);
7320 	rbd_img_request_cache = NULL;
7321 }
7322 
7323 static int __init rbd_init(void)
7324 {
7325 	int rc;
7326 
7327 	if (!libceph_compatible(NULL)) {
7328 		rbd_warn(NULL, "libceph incompatibility (quitting)");
7329 		return -EINVAL;
7330 	}
7331 
7332 	rc = rbd_slab_init();
7333 	if (rc)
7334 		return rc;
7335 
7336 	/*
7337 	 * The number of active work items is limited by the number of
7338 	 * rbd devices * queue depth, so leave @max_active at default.
7339 	 */
7340 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7341 	if (!rbd_wq) {
7342 		rc = -ENOMEM;
7343 		goto err_out_slab;
7344 	}
7345 
7346 	if (single_major) {
7347 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
7348 		if (rbd_major < 0) {
7349 			rc = rbd_major;
7350 			goto err_out_wq;
7351 		}
7352 	}
7353 
7354 	rc = rbd_sysfs_init();
7355 	if (rc)
7356 		goto err_out_blkdev;
7357 
7358 	if (single_major)
7359 		pr_info("loaded (major %d)\n", rbd_major);
7360 	else
7361 		pr_info("loaded\n");
7362 
7363 	return 0;
7364 
7365 err_out_blkdev:
7366 	if (single_major)
7367 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7368 err_out_wq:
7369 	destroy_workqueue(rbd_wq);
7370 err_out_slab:
7371 	rbd_slab_exit();
7372 	return rc;
7373 }
7374 
7375 static void __exit rbd_exit(void)
7376 {
7377 	ida_destroy(&rbd_dev_id_ida);
7378 	rbd_sysfs_cleanup();
7379 	if (single_major)
7380 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7381 	destroy_workqueue(rbd_wq);
7382 	rbd_slab_exit();
7383 }
7384 
7385 module_init(rbd_init);
7386 module_exit(rbd_exit);
7387 
7388 MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7389 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7390 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7391 /* following authorship retained from original osdblk.c */
7392 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7393 
7394 MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7395 MODULE_LICENSE("GPL");
7396