xref: /openbmc/linux/net/ceph/osdmap.c (revision 842d6b01)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
23d14c5d2SYehuda Sadeh 
33d14c5d2SYehuda Sadeh #include <linux/ceph/ceph_debug.h>
43d14c5d2SYehuda Sadeh 
53d14c5d2SYehuda Sadeh #include <linux/module.h>
63d14c5d2SYehuda Sadeh #include <linux/slab.h>
73d14c5d2SYehuda Sadeh 
83d14c5d2SYehuda Sadeh #include <linux/ceph/libceph.h>
93d14c5d2SYehuda Sadeh #include <linux/ceph/osdmap.h>
103d14c5d2SYehuda Sadeh #include <linux/ceph/decode.h>
113d14c5d2SYehuda Sadeh #include <linux/crush/hash.h>
123d14c5d2SYehuda Sadeh #include <linux/crush/mapper.h>
133d14c5d2SYehuda Sadeh 
14*842d6b01SDaichi Mukai static __printf(2, 3)
osdmap_info(const struct ceph_osdmap * map,const char * fmt,...)15*842d6b01SDaichi Mukai void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...)
16*842d6b01SDaichi Mukai {
17*842d6b01SDaichi Mukai 	struct va_format vaf;
18*842d6b01SDaichi Mukai 	va_list args;
19*842d6b01SDaichi Mukai 
20*842d6b01SDaichi Mukai 	va_start(args, fmt);
21*842d6b01SDaichi Mukai 	vaf.fmt = fmt;
22*842d6b01SDaichi Mukai 	vaf.va = &args;
23*842d6b01SDaichi Mukai 
24*842d6b01SDaichi Mukai 	printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid,
25*842d6b01SDaichi Mukai 	       map->epoch, &vaf);
26*842d6b01SDaichi Mukai 
27*842d6b01SDaichi Mukai 	va_end(args);
28*842d6b01SDaichi Mukai }
29*842d6b01SDaichi Mukai 
/*
 * Format a human-readable description of the CEPH_OSD_EXISTS and
 * CEPH_OSD_UP bits of @state into @str (at most @len bytes, via
 * snprintf()).  A zero @len leaves @str untouched.
 *
 * Returns @str.
 */
char *ceph_osdmap_state_str(char *str, int len, u32 state)
{
	const bool exists = state & CEPH_OSD_EXISTS;
	const bool up = state & CEPH_OSD_UP;

	if (len == 0)
		return str;

	if (exists) {
		if (up)
			snprintf(str, len, "exists, up");
		else
			snprintf(str, len, "exists");
	} else {
		if (up)
			snprintf(str, len, "up");
		else
			snprintf(str, len, "doesn't exist");
	}

	return str;
}
463d14c5d2SYehuda Sadeh 
473d14c5d2SYehuda Sadeh /* maps */
483d14c5d2SYehuda Sadeh 
/*
 * Return the number of bits needed to represent @t, i.e. the position
 * of its highest set bit counted from 1, or 0 when @t is 0.
 */
static int calc_bits_of(unsigned int t)
{
	int nbits;

	for (nbits = 0; t != 0; t >>= 1)
		nbits++;

	return nbits;
}
583d14c5d2SYehuda Sadeh 
593d14c5d2SYehuda Sadeh /*
603d14c5d2SYehuda Sadeh  * the foo_mask is the smallest value 2^n-1 that is >= foo.
613d14c5d2SYehuda Sadeh  */
calc_pg_masks(struct ceph_pg_pool_info * pi)623d14c5d2SYehuda Sadeh static void calc_pg_masks(struct ceph_pg_pool_info *pi)
633d14c5d2SYehuda Sadeh {
644f6a7e5eSSage Weil 	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
654f6a7e5eSSage Weil 	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
663d14c5d2SYehuda Sadeh }
673d14c5d2SYehuda Sadeh 
683d14c5d2SYehuda Sadeh /*
693d14c5d2SYehuda Sadeh  * decode crush map
703d14c5d2SYehuda Sadeh  */
crush_decode_uniform_bucket(void ** p,void * end,struct crush_bucket_uniform * b)713d14c5d2SYehuda Sadeh static int crush_decode_uniform_bucket(void **p, void *end,
723d14c5d2SYehuda Sadeh 				       struct crush_bucket_uniform *b)
733d14c5d2SYehuda Sadeh {
743d14c5d2SYehuda Sadeh 	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
753d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
763d14c5d2SYehuda Sadeh 	b->item_weight = ceph_decode_32(p);
773d14c5d2SYehuda Sadeh 	return 0;
783d14c5d2SYehuda Sadeh bad:
793d14c5d2SYehuda Sadeh 	return -EINVAL;
803d14c5d2SYehuda Sadeh }
813d14c5d2SYehuda Sadeh 
crush_decode_list_bucket(void ** p,void * end,struct crush_bucket_list * b)823d14c5d2SYehuda Sadeh static int crush_decode_list_bucket(void **p, void *end,
833d14c5d2SYehuda Sadeh 				    struct crush_bucket_list *b)
843d14c5d2SYehuda Sadeh {
853d14c5d2SYehuda Sadeh 	int j;
863d14c5d2SYehuda Sadeh 	dout("crush_decode_list_bucket %p to %p\n", *p, end);
873d14c5d2SYehuda Sadeh 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
883d14c5d2SYehuda Sadeh 	if (b->item_weights == NULL)
893d14c5d2SYehuda Sadeh 		return -ENOMEM;
903d14c5d2SYehuda Sadeh 	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
913d14c5d2SYehuda Sadeh 	if (b->sum_weights == NULL)
923d14c5d2SYehuda Sadeh 		return -ENOMEM;
933d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
943d14c5d2SYehuda Sadeh 	for (j = 0; j < b->h.size; j++) {
953d14c5d2SYehuda Sadeh 		b->item_weights[j] = ceph_decode_32(p);
963d14c5d2SYehuda Sadeh 		b->sum_weights[j] = ceph_decode_32(p);
973d14c5d2SYehuda Sadeh 	}
983d14c5d2SYehuda Sadeh 	return 0;
993d14c5d2SYehuda Sadeh bad:
1003d14c5d2SYehuda Sadeh 	return -EINVAL;
1013d14c5d2SYehuda Sadeh }
1023d14c5d2SYehuda Sadeh 
crush_decode_tree_bucket(void ** p,void * end,struct crush_bucket_tree * b)1033d14c5d2SYehuda Sadeh static int crush_decode_tree_bucket(void **p, void *end,
1043d14c5d2SYehuda Sadeh 				    struct crush_bucket_tree *b)
1053d14c5d2SYehuda Sadeh {
1063d14c5d2SYehuda Sadeh 	int j;
1073d14c5d2SYehuda Sadeh 	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
10882cd003aSIlya Dryomov 	ceph_decode_8_safe(p, end, b->num_nodes, bad);
1093d14c5d2SYehuda Sadeh 	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
1103d14c5d2SYehuda Sadeh 	if (b->node_weights == NULL)
1113d14c5d2SYehuda Sadeh 		return -ENOMEM;
1123d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
1133d14c5d2SYehuda Sadeh 	for (j = 0; j < b->num_nodes; j++)
1143d14c5d2SYehuda Sadeh 		b->node_weights[j] = ceph_decode_32(p);
1153d14c5d2SYehuda Sadeh 	return 0;
1163d14c5d2SYehuda Sadeh bad:
1173d14c5d2SYehuda Sadeh 	return -EINVAL;
1183d14c5d2SYehuda Sadeh }
1193d14c5d2SYehuda Sadeh 
crush_decode_straw_bucket(void ** p,void * end,struct crush_bucket_straw * b)1203d14c5d2SYehuda Sadeh static int crush_decode_straw_bucket(void **p, void *end,
1213d14c5d2SYehuda Sadeh 				     struct crush_bucket_straw *b)
1223d14c5d2SYehuda Sadeh {
1233d14c5d2SYehuda Sadeh 	int j;
1243d14c5d2SYehuda Sadeh 	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
1253d14c5d2SYehuda Sadeh 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
1263d14c5d2SYehuda Sadeh 	if (b->item_weights == NULL)
1273d14c5d2SYehuda Sadeh 		return -ENOMEM;
1283d14c5d2SYehuda Sadeh 	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
1293d14c5d2SYehuda Sadeh 	if (b->straws == NULL)
1303d14c5d2SYehuda Sadeh 		return -ENOMEM;
1313d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
1323d14c5d2SYehuda Sadeh 	for (j = 0; j < b->h.size; j++) {
1333d14c5d2SYehuda Sadeh 		b->item_weights[j] = ceph_decode_32(p);
1343d14c5d2SYehuda Sadeh 		b->straws[j] = ceph_decode_32(p);
1353d14c5d2SYehuda Sadeh 	}
1363d14c5d2SYehuda Sadeh 	return 0;
1373d14c5d2SYehuda Sadeh bad:
1383d14c5d2SYehuda Sadeh 	return -EINVAL;
1393d14c5d2SYehuda Sadeh }
1403d14c5d2SYehuda Sadeh 
crush_decode_straw2_bucket(void ** p,void * end,struct crush_bucket_straw2 * b)141958a2765SIlya Dryomov static int crush_decode_straw2_bucket(void **p, void *end,
142958a2765SIlya Dryomov 				      struct crush_bucket_straw2 *b)
143958a2765SIlya Dryomov {
144958a2765SIlya Dryomov 	int j;
145958a2765SIlya Dryomov 	dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
146958a2765SIlya Dryomov 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
147958a2765SIlya Dryomov 	if (b->item_weights == NULL)
148958a2765SIlya Dryomov 		return -ENOMEM;
149958a2765SIlya Dryomov 	ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
150958a2765SIlya Dryomov 	for (j = 0; j < b->h.size; j++)
151958a2765SIlya Dryomov 		b->item_weights[j] = ceph_decode_32(p);
152958a2765SIlya Dryomov 	return 0;
153958a2765SIlya Dryomov bad:
154958a2765SIlya Dryomov 	return -EINVAL;
155958a2765SIlya Dryomov }
156958a2765SIlya Dryomov 
/*
 * One (id, name) pair stored in an rb-tree.  The name is kept inline
 * in the flexible array at the end of the node.
 */
struct crush_name_node {
	struct rb_node cn_node;	/* linkage into the owning rb_root */
	int cn_id;		/* id decoded together with the name */
	char cn_name[];		/* name bytes, NUL-terminated by decode_crush_names() */
};
16286403a92SIlya Dryomov 
alloc_crush_name(size_t name_len)16386403a92SIlya Dryomov static struct crush_name_node *alloc_crush_name(size_t name_len)
16486403a92SIlya Dryomov {
16586403a92SIlya Dryomov 	struct crush_name_node *cn;
16686403a92SIlya Dryomov 
16786403a92SIlya Dryomov 	cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
16886403a92SIlya Dryomov 	if (!cn)
16986403a92SIlya Dryomov 		return NULL;
17086403a92SIlya Dryomov 
17186403a92SIlya Dryomov 	RB_CLEAR_NODE(&cn->cn_node);
17286403a92SIlya Dryomov 	return cn;
17386403a92SIlya Dryomov }
17486403a92SIlya Dryomov 
free_crush_name(struct crush_name_node * cn)17586403a92SIlya Dryomov static void free_crush_name(struct crush_name_node *cn)
17686403a92SIlya Dryomov {
17786403a92SIlya Dryomov 	WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
17886403a92SIlya Dryomov 
17986403a92SIlya Dryomov 	kfree(cn);
18086403a92SIlya Dryomov }
18186403a92SIlya Dryomov 
/*
 * Instantiate rb-tree helpers for struct crush_name_node, passing
 * cn_id and cn_node as the key and node fields.  Presumably this is
 * what provides __insert_crush_name() and erase_crush_name() used in
 * this file -- the macro is defined elsewhere, TODO confirm.
 */
DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
18386403a92SIlya Dryomov 
18486403a92SIlya Dryomov static int decode_crush_names(void **p, void *end, struct rb_root *root)
18586403a92SIlya Dryomov {
18686403a92SIlya Dryomov 	u32 n;
18786403a92SIlya Dryomov 
18886403a92SIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
18986403a92SIlya Dryomov 	while (n--) {
19086403a92SIlya Dryomov 		struct crush_name_node *cn;
19186403a92SIlya Dryomov 		int id;
19286403a92SIlya Dryomov 		u32 name_len;
19386403a92SIlya Dryomov 
19486403a92SIlya Dryomov 		ceph_decode_32_safe(p, end, id, e_inval);
19586403a92SIlya Dryomov 		ceph_decode_32_safe(p, end, name_len, e_inval);
19686403a92SIlya Dryomov 		ceph_decode_need(p, end, name_len, e_inval);
19786403a92SIlya Dryomov 
19886403a92SIlya Dryomov 		cn = alloc_crush_name(name_len);
19986403a92SIlya Dryomov 		if (!cn)
20086403a92SIlya Dryomov 			return -ENOMEM;
20186403a92SIlya Dryomov 
20286403a92SIlya Dryomov 		cn->cn_id = id;
20386403a92SIlya Dryomov 		memcpy(cn->cn_name, *p, name_len);
20486403a92SIlya Dryomov 		cn->cn_name[name_len] = '\0';
20586403a92SIlya Dryomov 		*p += name_len;
20686403a92SIlya Dryomov 
20786403a92SIlya Dryomov 		if (!__insert_crush_name(root, cn)) {
20886403a92SIlya Dryomov 			free_crush_name(cn);
20986403a92SIlya Dryomov 			return -EEXIST;
21086403a92SIlya Dryomov 		}
21186403a92SIlya Dryomov 	}
21286403a92SIlya Dryomov 
21386403a92SIlya Dryomov 	return 0;
21486403a92SIlya Dryomov 
21586403a92SIlya Dryomov e_inval:
21686403a92SIlya Dryomov 	return -EINVAL;
21786403a92SIlya Dryomov }
21886403a92SIlya Dryomov 
clear_crush_names(struct rb_root * root)21986403a92SIlya Dryomov void clear_crush_names(struct rb_root *root)
22086403a92SIlya Dryomov {
22186403a92SIlya Dryomov 	while (!RB_EMPTY_ROOT(root)) {
22286403a92SIlya Dryomov 		struct crush_name_node *cn =
22386403a92SIlya Dryomov 		    rb_entry(rb_first(root), struct crush_name_node, cn_node);
22486403a92SIlya Dryomov 
22586403a92SIlya Dryomov 		erase_crush_name(root, cn);
22686403a92SIlya Dryomov 		free_crush_name(cn);
22786403a92SIlya Dryomov 	}
22886403a92SIlya Dryomov }
22986403a92SIlya Dryomov 
alloc_choose_arg_map(void)2305cf9c4a9SIlya Dryomov static struct crush_choose_arg_map *alloc_choose_arg_map(void)
2315cf9c4a9SIlya Dryomov {
2325cf9c4a9SIlya Dryomov 	struct crush_choose_arg_map *arg_map;
2335cf9c4a9SIlya Dryomov 
2345cf9c4a9SIlya Dryomov 	arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
2355cf9c4a9SIlya Dryomov 	if (!arg_map)
2365cf9c4a9SIlya Dryomov 		return NULL;
2375cf9c4a9SIlya Dryomov 
2385cf9c4a9SIlya Dryomov 	RB_CLEAR_NODE(&arg_map->node);
2395cf9c4a9SIlya Dryomov 	return arg_map;
2405cf9c4a9SIlya Dryomov }
2415cf9c4a9SIlya Dryomov 
free_choose_arg_map(struct crush_choose_arg_map * arg_map)2425cf9c4a9SIlya Dryomov static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
2435cf9c4a9SIlya Dryomov {
2445cf9c4a9SIlya Dryomov 	if (arg_map) {
2455cf9c4a9SIlya Dryomov 		int i, j;
2465cf9c4a9SIlya Dryomov 
2475cf9c4a9SIlya Dryomov 		WARN_ON(!RB_EMPTY_NODE(&arg_map->node));
2485cf9c4a9SIlya Dryomov 
2495cf9c4a9SIlya Dryomov 		for (i = 0; i < arg_map->size; i++) {
2505cf9c4a9SIlya Dryomov 			struct crush_choose_arg *arg = &arg_map->args[i];
2515cf9c4a9SIlya Dryomov 
2525cf9c4a9SIlya Dryomov 			for (j = 0; j < arg->weight_set_size; j++)
2535cf9c4a9SIlya Dryomov 				kfree(arg->weight_set[j].weights);
2545cf9c4a9SIlya Dryomov 			kfree(arg->weight_set);
2555cf9c4a9SIlya Dryomov 			kfree(arg->ids);
2565cf9c4a9SIlya Dryomov 		}
2575cf9c4a9SIlya Dryomov 		kfree(arg_map->args);
2585cf9c4a9SIlya Dryomov 		kfree(arg_map);
2595cf9c4a9SIlya Dryomov 	}
2605cf9c4a9SIlya Dryomov }
2615cf9c4a9SIlya Dryomov 
/*
 * Instantiate rb-tree helpers for struct crush_choose_arg_map, passing
 * choose_args_index and node as the key and node fields.  Presumably
 * this is what provides insert_choose_arg_map() and
 * erase_choose_arg_map() used in this file -- the macro is defined
 * elsewhere, TODO confirm.
 */
DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
		node);
2645cf9c4a9SIlya Dryomov 
clear_choose_args(struct crush_map * c)2655cf9c4a9SIlya Dryomov void clear_choose_args(struct crush_map *c)
2665cf9c4a9SIlya Dryomov {
2675cf9c4a9SIlya Dryomov 	while (!RB_EMPTY_ROOT(&c->choose_args)) {
2685cf9c4a9SIlya Dryomov 		struct crush_choose_arg_map *arg_map =
2695cf9c4a9SIlya Dryomov 		    rb_entry(rb_first(&c->choose_args),
2705cf9c4a9SIlya Dryomov 			     struct crush_choose_arg_map, node);
2715cf9c4a9SIlya Dryomov 
2725cf9c4a9SIlya Dryomov 		erase_choose_arg_map(&c->choose_args, arg_map);
2735cf9c4a9SIlya Dryomov 		free_choose_arg_map(arg_map);
2745cf9c4a9SIlya Dryomov 	}
2755cf9c4a9SIlya Dryomov }
2765cf9c4a9SIlya Dryomov 
/*
 * Decode a 32-bit length followed by that many 32-bit values into a
 * freshly allocated array.
 *
 * On success the length is stored in *@plen and the array is returned;
 * a zero length yields a NULL array.  On failure *@plen is not
 * written and ERR_PTR(-EINVAL) (buffer too short) or ERR_PTR(-ENOMEM)
 * is returned.
 */
static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
{
	u32 *vals = NULL;
	int err = -EINVAL;
	u32 count, i;

	ceph_decode_32_safe(p, end, count, out_err);
	if (count == 0)
		goto out;

	vals = kmalloc_array(count, sizeof(u32), GFP_NOIO);
	if (!vals) {
		err = -ENOMEM;
		goto out_err;
	}

	ceph_decode_need(p, end, count * sizeof(u32), out_err);
	for (i = 0; i < count; i++)
		vals[i] = ceph_decode_32(p);

out:
	*plen = count;
	return vals;

out_err:
	kfree(vals);
	return ERR_PTR(err);
}
3075cf9c4a9SIlya Dryomov 
3085cf9c4a9SIlya Dryomov /*
3095cf9c4a9SIlya Dryomov  * Assumes @arg is zero-initialized.
3105cf9c4a9SIlya Dryomov  */
decode_choose_arg(void ** p,void * end,struct crush_choose_arg * arg)3115cf9c4a9SIlya Dryomov static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
3125cf9c4a9SIlya Dryomov {
3135cf9c4a9SIlya Dryomov 	int ret;
3145cf9c4a9SIlya Dryomov 
3155cf9c4a9SIlya Dryomov 	ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
3165cf9c4a9SIlya Dryomov 	if (arg->weight_set_size) {
3175cf9c4a9SIlya Dryomov 		u32 i;
3185cf9c4a9SIlya Dryomov 
3195cf9c4a9SIlya Dryomov 		arg->weight_set = kmalloc_array(arg->weight_set_size,
3205cf9c4a9SIlya Dryomov 						sizeof(*arg->weight_set),
3215cf9c4a9SIlya Dryomov 						GFP_NOIO);
3225cf9c4a9SIlya Dryomov 		if (!arg->weight_set)
3235cf9c4a9SIlya Dryomov 			return -ENOMEM;
3245cf9c4a9SIlya Dryomov 
3255cf9c4a9SIlya Dryomov 		for (i = 0; i < arg->weight_set_size; i++) {
3265cf9c4a9SIlya Dryomov 			struct crush_weight_set *w = &arg->weight_set[i];
3275cf9c4a9SIlya Dryomov 
3285cf9c4a9SIlya Dryomov 			w->weights = decode_array_32_alloc(p, end, &w->size);
3295cf9c4a9SIlya Dryomov 			if (IS_ERR(w->weights)) {
3305cf9c4a9SIlya Dryomov 				ret = PTR_ERR(w->weights);
3315cf9c4a9SIlya Dryomov 				w->weights = NULL;
3325cf9c4a9SIlya Dryomov 				return ret;
3335cf9c4a9SIlya Dryomov 			}
3345cf9c4a9SIlya Dryomov 		}
3355cf9c4a9SIlya Dryomov 	}
3365cf9c4a9SIlya Dryomov 
3375cf9c4a9SIlya Dryomov 	arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
3385cf9c4a9SIlya Dryomov 	if (IS_ERR(arg->ids)) {
3395cf9c4a9SIlya Dryomov 		ret = PTR_ERR(arg->ids);
3405cf9c4a9SIlya Dryomov 		arg->ids = NULL;
3415cf9c4a9SIlya Dryomov 		return ret;
3425cf9c4a9SIlya Dryomov 	}
3435cf9c4a9SIlya Dryomov 
3445cf9c4a9SIlya Dryomov 	return 0;
3455cf9c4a9SIlya Dryomov 
3465cf9c4a9SIlya Dryomov e_inval:
3475cf9c4a9SIlya Dryomov 	return -EINVAL;
3485cf9c4a9SIlya Dryomov }
3495cf9c4a9SIlya Dryomov 
decode_choose_args(void ** p,void * end,struct crush_map * c)3505cf9c4a9SIlya Dryomov static int decode_choose_args(void **p, void *end, struct crush_map *c)
3515cf9c4a9SIlya Dryomov {
3525cf9c4a9SIlya Dryomov 	struct crush_choose_arg_map *arg_map = NULL;
3535cf9c4a9SIlya Dryomov 	u32 num_choose_arg_maps, num_buckets;
3545cf9c4a9SIlya Dryomov 	int ret;
3555cf9c4a9SIlya Dryomov 
3565cf9c4a9SIlya Dryomov 	ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
3575cf9c4a9SIlya Dryomov 	while (num_choose_arg_maps--) {
3585cf9c4a9SIlya Dryomov 		arg_map = alloc_choose_arg_map();
3595cf9c4a9SIlya Dryomov 		if (!arg_map) {
3605cf9c4a9SIlya Dryomov 			ret = -ENOMEM;
3615cf9c4a9SIlya Dryomov 			goto fail;
3625cf9c4a9SIlya Dryomov 		}
3635cf9c4a9SIlya Dryomov 
3645cf9c4a9SIlya Dryomov 		ceph_decode_64_safe(p, end, arg_map->choose_args_index,
3655cf9c4a9SIlya Dryomov 				    e_inval);
3665cf9c4a9SIlya Dryomov 		arg_map->size = c->max_buckets;
3675cf9c4a9SIlya Dryomov 		arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
3685cf9c4a9SIlya Dryomov 					GFP_NOIO);
3695cf9c4a9SIlya Dryomov 		if (!arg_map->args) {
3705cf9c4a9SIlya Dryomov 			ret = -ENOMEM;
3715cf9c4a9SIlya Dryomov 			goto fail;
3725cf9c4a9SIlya Dryomov 		}
3735cf9c4a9SIlya Dryomov 
3745cf9c4a9SIlya Dryomov 		ceph_decode_32_safe(p, end, num_buckets, e_inval);
3755cf9c4a9SIlya Dryomov 		while (num_buckets--) {
3765cf9c4a9SIlya Dryomov 			struct crush_choose_arg *arg;
3775cf9c4a9SIlya Dryomov 			u32 bucket_index;
3785cf9c4a9SIlya Dryomov 
3795cf9c4a9SIlya Dryomov 			ceph_decode_32_safe(p, end, bucket_index, e_inval);
3805cf9c4a9SIlya Dryomov 			if (bucket_index >= arg_map->size)
3815cf9c4a9SIlya Dryomov 				goto e_inval;
3825cf9c4a9SIlya Dryomov 
3835cf9c4a9SIlya Dryomov 			arg = &arg_map->args[bucket_index];
3845cf9c4a9SIlya Dryomov 			ret = decode_choose_arg(p, end, arg);
3855cf9c4a9SIlya Dryomov 			if (ret)
3865cf9c4a9SIlya Dryomov 				goto fail;
387c7ed1a4bSIlya Dryomov 
388c7ed1a4bSIlya Dryomov 			if (arg->ids_size &&
389c7ed1a4bSIlya Dryomov 			    arg->ids_size != c->buckets[bucket_index]->size)
390c7ed1a4bSIlya Dryomov 				goto e_inval;
3915cf9c4a9SIlya Dryomov 		}
3925cf9c4a9SIlya Dryomov 
3935cf9c4a9SIlya Dryomov 		insert_choose_arg_map(&c->choose_args, arg_map);
3945cf9c4a9SIlya Dryomov 	}
3955cf9c4a9SIlya Dryomov 
3965cf9c4a9SIlya Dryomov 	return 0;
3975cf9c4a9SIlya Dryomov 
3985cf9c4a9SIlya Dryomov e_inval:
3995cf9c4a9SIlya Dryomov 	ret = -EINVAL;
4005cf9c4a9SIlya Dryomov fail:
4015cf9c4a9SIlya Dryomov 	free_choose_arg_map(arg_map);
4025cf9c4a9SIlya Dryomov 	return ret;
4035cf9c4a9SIlya Dryomov }
4045cf9c4a9SIlya Dryomov 
crush_finalize(struct crush_map * c)40566a0e2d5SIlya Dryomov static void crush_finalize(struct crush_map *c)
40666a0e2d5SIlya Dryomov {
40766a0e2d5SIlya Dryomov 	__s32 b;
40866a0e2d5SIlya Dryomov 
40966a0e2d5SIlya Dryomov 	/* Space for the array of pointers to per-bucket workspace */
41066a0e2d5SIlya Dryomov 	c->working_size = sizeof(struct crush_work) +
41166a0e2d5SIlya Dryomov 	    c->max_buckets * sizeof(struct crush_work_bucket *);
41266a0e2d5SIlya Dryomov 
41366a0e2d5SIlya Dryomov 	for (b = 0; b < c->max_buckets; b++) {
41466a0e2d5SIlya Dryomov 		if (!c->buckets[b])
41566a0e2d5SIlya Dryomov 			continue;
41666a0e2d5SIlya Dryomov 
41766a0e2d5SIlya Dryomov 		switch (c->buckets[b]->alg) {
41866a0e2d5SIlya Dryomov 		default:
41966a0e2d5SIlya Dryomov 			/*
42066a0e2d5SIlya Dryomov 			 * The base case, permutation variables and
42166a0e2d5SIlya Dryomov 			 * the pointer to the permutation array.
42266a0e2d5SIlya Dryomov 			 */
42366a0e2d5SIlya Dryomov 			c->working_size += sizeof(struct crush_work_bucket);
42466a0e2d5SIlya Dryomov 			break;
42566a0e2d5SIlya Dryomov 		}
42666a0e2d5SIlya Dryomov 		/* Every bucket has a permutation array. */
42766a0e2d5SIlya Dryomov 		c->working_size += c->buckets[b]->size * sizeof(__u32);
42866a0e2d5SIlya Dryomov 	}
42966a0e2d5SIlya Dryomov }
43066a0e2d5SIlya Dryomov 
crush_decode(void * pbyval,void * end)4313d14c5d2SYehuda Sadeh static struct crush_map *crush_decode(void *pbyval, void *end)
4323d14c5d2SYehuda Sadeh {
4333d14c5d2SYehuda Sadeh 	struct crush_map *c;
434c2acfd95SIlya Dryomov 	int err;
4353d14c5d2SYehuda Sadeh 	int i, j;
4363d14c5d2SYehuda Sadeh 	void **p = &pbyval;
4373d14c5d2SYehuda Sadeh 	void *start = pbyval;
4383d14c5d2SYehuda Sadeh 	u32 magic;
4393d14c5d2SYehuda Sadeh 
4403d14c5d2SYehuda Sadeh 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
4413d14c5d2SYehuda Sadeh 
4423d14c5d2SYehuda Sadeh 	c = kzalloc(sizeof(*c), GFP_NOFS);
4433d14c5d2SYehuda Sadeh 	if (c == NULL)
4443d14c5d2SYehuda Sadeh 		return ERR_PTR(-ENOMEM);
4453d14c5d2SYehuda Sadeh 
44686403a92SIlya Dryomov 	c->type_names = RB_ROOT;
44786403a92SIlya Dryomov 	c->names = RB_ROOT;
4485cf9c4a9SIlya Dryomov 	c->choose_args = RB_ROOT;
4495cf9c4a9SIlya Dryomov 
450546f04efSSage Weil         /* set tunables to default values */
451546f04efSSage Weil         c->choose_local_tries = 2;
452546f04efSSage Weil         c->choose_local_fallback_tries = 5;
453546f04efSSage Weil         c->choose_total_tries = 19;
4541604f488SJim Schutt 	c->chooseleaf_descend_once = 0;
455546f04efSSage Weil 
4563d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
4573d14c5d2SYehuda Sadeh 	magic = ceph_decode_32(p);
4583d14c5d2SYehuda Sadeh 	if (magic != CRUSH_MAGIC) {
4593d14c5d2SYehuda Sadeh 		pr_err("crush_decode magic %x != current %x\n",
46095c96174SEric Dumazet 		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
4613d14c5d2SYehuda Sadeh 		goto bad;
4623d14c5d2SYehuda Sadeh 	}
4633d14c5d2SYehuda Sadeh 	c->max_buckets = ceph_decode_32(p);
4643d14c5d2SYehuda Sadeh 	c->max_rules = ceph_decode_32(p);
4653d14c5d2SYehuda Sadeh 	c->max_devices = ceph_decode_32(p);
4663d14c5d2SYehuda Sadeh 
4673d14c5d2SYehuda Sadeh 	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
4683d14c5d2SYehuda Sadeh 	if (c->buckets == NULL)
4693d14c5d2SYehuda Sadeh 		goto badmem;
4703d14c5d2SYehuda Sadeh 	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
4713d14c5d2SYehuda Sadeh 	if (c->rules == NULL)
4723d14c5d2SYehuda Sadeh 		goto badmem;
4733d14c5d2SYehuda Sadeh 
4743d14c5d2SYehuda Sadeh 	/* buckets */
4753d14c5d2SYehuda Sadeh 	for (i = 0; i < c->max_buckets; i++) {
4763d14c5d2SYehuda Sadeh 		int size = 0;
4773d14c5d2SYehuda Sadeh 		u32 alg;
4783d14c5d2SYehuda Sadeh 		struct crush_bucket *b;
4793d14c5d2SYehuda Sadeh 
4803d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, alg, bad);
4813d14c5d2SYehuda Sadeh 		if (alg == 0) {
4823d14c5d2SYehuda Sadeh 			c->buckets[i] = NULL;
4833d14c5d2SYehuda Sadeh 			continue;
4843d14c5d2SYehuda Sadeh 		}
4853d14c5d2SYehuda Sadeh 		dout("crush_decode bucket %d off %x %p to %p\n",
4863d14c5d2SYehuda Sadeh 		     i, (int)(*p-start), *p, end);
4873d14c5d2SYehuda Sadeh 
4883d14c5d2SYehuda Sadeh 		switch (alg) {
4893d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_UNIFORM:
4903d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_uniform);
4913d14c5d2SYehuda Sadeh 			break;
4923d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_LIST:
4933d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_list);
4943d14c5d2SYehuda Sadeh 			break;
4953d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_TREE:
4963d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_tree);
4973d14c5d2SYehuda Sadeh 			break;
4983d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_STRAW:
4993d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_straw);
5003d14c5d2SYehuda Sadeh 			break;
501958a2765SIlya Dryomov 		case CRUSH_BUCKET_STRAW2:
502958a2765SIlya Dryomov 			size = sizeof(struct crush_bucket_straw2);
503958a2765SIlya Dryomov 			break;
5043d14c5d2SYehuda Sadeh 		default:
5053d14c5d2SYehuda Sadeh 			goto bad;
5063d14c5d2SYehuda Sadeh 		}
5073d14c5d2SYehuda Sadeh 		BUG_ON(size == 0);
5083d14c5d2SYehuda Sadeh 		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
5093d14c5d2SYehuda Sadeh 		if (b == NULL)
5103d14c5d2SYehuda Sadeh 			goto badmem;
5113d14c5d2SYehuda Sadeh 
5123d14c5d2SYehuda Sadeh 		ceph_decode_need(p, end, 4*sizeof(u32), bad);
5133d14c5d2SYehuda Sadeh 		b->id = ceph_decode_32(p);
5143d14c5d2SYehuda Sadeh 		b->type = ceph_decode_16(p);
5153d14c5d2SYehuda Sadeh 		b->alg = ceph_decode_8(p);
5163d14c5d2SYehuda Sadeh 		b->hash = ceph_decode_8(p);
5173d14c5d2SYehuda Sadeh 		b->weight = ceph_decode_32(p);
5183d14c5d2SYehuda Sadeh 		b->size = ceph_decode_32(p);
5193d14c5d2SYehuda Sadeh 
5203d14c5d2SYehuda Sadeh 		dout("crush_decode bucket size %d off %x %p to %p\n",
5213d14c5d2SYehuda Sadeh 		     b->size, (int)(*p-start), *p, end);
5223d14c5d2SYehuda Sadeh 
5233d14c5d2SYehuda Sadeh 		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
5243d14c5d2SYehuda Sadeh 		if (b->items == NULL)
5253d14c5d2SYehuda Sadeh 			goto badmem;
5263d14c5d2SYehuda Sadeh 
5273d14c5d2SYehuda Sadeh 		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
5283d14c5d2SYehuda Sadeh 		for (j = 0; j < b->size; j++)
5293d14c5d2SYehuda Sadeh 			b->items[j] = ceph_decode_32(p);
5303d14c5d2SYehuda Sadeh 
5313d14c5d2SYehuda Sadeh 		switch (b->alg) {
5323d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_UNIFORM:
5333d14c5d2SYehuda Sadeh 			err = crush_decode_uniform_bucket(p, end,
5343d14c5d2SYehuda Sadeh 				  (struct crush_bucket_uniform *)b);
5353d14c5d2SYehuda Sadeh 			if (err < 0)
536c2acfd95SIlya Dryomov 				goto fail;
5373d14c5d2SYehuda Sadeh 			break;
5383d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_LIST:
5393d14c5d2SYehuda Sadeh 			err = crush_decode_list_bucket(p, end,
5403d14c5d2SYehuda Sadeh 			       (struct crush_bucket_list *)b);
5413d14c5d2SYehuda Sadeh 			if (err < 0)
542c2acfd95SIlya Dryomov 				goto fail;
5433d14c5d2SYehuda Sadeh 			break;
5443d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_TREE:
5453d14c5d2SYehuda Sadeh 			err = crush_decode_tree_bucket(p, end,
5463d14c5d2SYehuda Sadeh 				(struct crush_bucket_tree *)b);
5473d14c5d2SYehuda Sadeh 			if (err < 0)
548c2acfd95SIlya Dryomov 				goto fail;
5493d14c5d2SYehuda Sadeh 			break;
5503d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_STRAW:
5513d14c5d2SYehuda Sadeh 			err = crush_decode_straw_bucket(p, end,
5523d14c5d2SYehuda Sadeh 				(struct crush_bucket_straw *)b);
5533d14c5d2SYehuda Sadeh 			if (err < 0)
554c2acfd95SIlya Dryomov 				goto fail;
5553d14c5d2SYehuda Sadeh 			break;
556958a2765SIlya Dryomov 		case CRUSH_BUCKET_STRAW2:
557958a2765SIlya Dryomov 			err = crush_decode_straw2_bucket(p, end,
558958a2765SIlya Dryomov 				(struct crush_bucket_straw2 *)b);
559958a2765SIlya Dryomov 			if (err < 0)
560c2acfd95SIlya Dryomov 				goto fail;
561958a2765SIlya Dryomov 			break;
5623d14c5d2SYehuda Sadeh 		}
5633d14c5d2SYehuda Sadeh 	}
5643d14c5d2SYehuda Sadeh 
5653d14c5d2SYehuda Sadeh 	/* rules */
5663d14c5d2SYehuda Sadeh 	dout("rule vec is %p\n", c->rules);
5673d14c5d2SYehuda Sadeh 	for (i = 0; i < c->max_rules; i++) {
5683d14c5d2SYehuda Sadeh 		u32 yes;
5693d14c5d2SYehuda Sadeh 		struct crush_rule *r;
5703d14c5d2SYehuda Sadeh 
5713d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, yes, bad);
5723d14c5d2SYehuda Sadeh 		if (!yes) {
5733d14c5d2SYehuda Sadeh 			dout("crush_decode NO rule %d off %x %p to %p\n",
5743d14c5d2SYehuda Sadeh 			     i, (int)(*p-start), *p, end);
5753d14c5d2SYehuda Sadeh 			c->rules[i] = NULL;
5763d14c5d2SYehuda Sadeh 			continue;
5773d14c5d2SYehuda Sadeh 		}
5783d14c5d2SYehuda Sadeh 
5793d14c5d2SYehuda Sadeh 		dout("crush_decode rule %d off %x %p to %p\n",
5803d14c5d2SYehuda Sadeh 		     i, (int)(*p-start), *p, end);
5813d14c5d2SYehuda Sadeh 
5823d14c5d2SYehuda Sadeh 		/* len */
5833d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, yes, bad);
5843d14c5d2SYehuda Sadeh #if BITS_PER_LONG == 32
58564486697SXi Wang 		if (yes > (ULONG_MAX - sizeof(*r))
58664486697SXi Wang 			  / sizeof(struct crush_rule_step))
5873d14c5d2SYehuda Sadeh 			goto bad;
5883d14c5d2SYehuda Sadeh #endif
5896b41d4d9SGustavo A. R. Silva 		r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
5903d14c5d2SYehuda Sadeh 		if (r == NULL)
5913d14c5d2SYehuda Sadeh 			goto badmem;
5923d14c5d2SYehuda Sadeh 		dout(" rule %d is at %p\n", i, r);
593fc54cb8dSLi Qiong 		c->rules[i] = r;
5943d14c5d2SYehuda Sadeh 		r->len = yes;
5953d14c5d2SYehuda Sadeh 		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
5963d14c5d2SYehuda Sadeh 		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
5973d14c5d2SYehuda Sadeh 		for (j = 0; j < r->len; j++) {
5983d14c5d2SYehuda Sadeh 			r->steps[j].op = ceph_decode_32(p);
5993d14c5d2SYehuda Sadeh 			r->steps[j].arg1 = ceph_decode_32(p);
6003d14c5d2SYehuda Sadeh 			r->steps[j].arg2 = ceph_decode_32(p);
6013d14c5d2SYehuda Sadeh 		}
6023d14c5d2SYehuda Sadeh 	}
6033d14c5d2SYehuda Sadeh 
60486403a92SIlya Dryomov 	err = decode_crush_names(p, end, &c->type_names);
60586403a92SIlya Dryomov 	if (err)
60686403a92SIlya Dryomov 		goto fail;
60786403a92SIlya Dryomov 
60886403a92SIlya Dryomov 	err = decode_crush_names(p, end, &c->names);
60986403a92SIlya Dryomov 	if (err)
61086403a92SIlya Dryomov 		goto fail;
61186403a92SIlya Dryomov 
612278b1d70SIlya Dryomov 	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
6133d14c5d2SYehuda Sadeh 
614546f04efSSage Weil         /* tunables */
615546f04efSSage Weil         ceph_decode_need(p, end, 3*sizeof(u32), done);
616546f04efSSage Weil         c->choose_local_tries = ceph_decode_32(p);
617546f04efSSage Weil         c->choose_local_fallback_tries =  ceph_decode_32(p);
618546f04efSSage Weil         c->choose_total_tries = ceph_decode_32(p);
619b9b519b7SIlya Dryomov         dout("crush decode tunable choose_local_tries = %d\n",
620546f04efSSage Weil              c->choose_local_tries);
621b9b519b7SIlya Dryomov         dout("crush decode tunable choose_local_fallback_tries = %d\n",
622546f04efSSage Weil              c->choose_local_fallback_tries);
623b9b519b7SIlya Dryomov         dout("crush decode tunable choose_total_tries = %d\n",
624546f04efSSage Weil              c->choose_total_tries);
625546f04efSSage Weil 
6261604f488SJim Schutt 	ceph_decode_need(p, end, sizeof(u32), done);
6271604f488SJim Schutt 	c->chooseleaf_descend_once = ceph_decode_32(p);
628b9b519b7SIlya Dryomov 	dout("crush decode tunable chooseleaf_descend_once = %d\n",
6291604f488SJim Schutt 	     c->chooseleaf_descend_once);
6301604f488SJim Schutt 
631f140662fSIlya Dryomov 	ceph_decode_need(p, end, sizeof(u8), done);
632f140662fSIlya Dryomov 	c->chooseleaf_vary_r = ceph_decode_8(p);
633b9b519b7SIlya Dryomov 	dout("crush decode tunable chooseleaf_vary_r = %d\n",
634f140662fSIlya Dryomov 	     c->chooseleaf_vary_r);
635f140662fSIlya Dryomov 
636b9b519b7SIlya Dryomov 	/* skip straw_calc_version, allowed_bucket_algs */
637b9b519b7SIlya Dryomov 	ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
638b9b519b7SIlya Dryomov 	*p += sizeof(u8) + sizeof(u32);
639b9b519b7SIlya Dryomov 
640b9b519b7SIlya Dryomov 	ceph_decode_need(p, end, sizeof(u8), done);
641b9b519b7SIlya Dryomov 	c->chooseleaf_stable = ceph_decode_8(p);
642b9b519b7SIlya Dryomov 	dout("crush decode tunable chooseleaf_stable = %d\n",
643b9b519b7SIlya Dryomov 	     c->chooseleaf_stable);
644b9b519b7SIlya Dryomov 
6455cf9c4a9SIlya Dryomov 	if (*p != end) {
6465cf9c4a9SIlya Dryomov 		/* class_map */
6475cf9c4a9SIlya Dryomov 		ceph_decode_skip_map(p, end, 32, 32, bad);
6485cf9c4a9SIlya Dryomov 		/* class_name */
6495cf9c4a9SIlya Dryomov 		ceph_decode_skip_map(p, end, 32, string, bad);
6505cf9c4a9SIlya Dryomov 		/* class_bucket */
6515cf9c4a9SIlya Dryomov 		ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
6525cf9c4a9SIlya Dryomov 	}
6535cf9c4a9SIlya Dryomov 
6545cf9c4a9SIlya Dryomov 	if (*p != end) {
6555cf9c4a9SIlya Dryomov 		err = decode_choose_args(p, end, c);
6565cf9c4a9SIlya Dryomov 		if (err)
657c2acfd95SIlya Dryomov 			goto fail;
6585cf9c4a9SIlya Dryomov 	}
6595cf9c4a9SIlya Dryomov 
660546f04efSSage Weil done:
6619afd30dbSIlya Dryomov 	crush_finalize(c);
6623d14c5d2SYehuda Sadeh 	dout("crush_decode success\n");
6633d14c5d2SYehuda Sadeh 	return c;
6643d14c5d2SYehuda Sadeh 
6653d14c5d2SYehuda Sadeh badmem:
6663d14c5d2SYehuda Sadeh 	err = -ENOMEM;
667c2acfd95SIlya Dryomov fail:
6683d14c5d2SYehuda Sadeh 	dout("crush_decode fail %d\n", err);
6693d14c5d2SYehuda Sadeh 	crush_destroy(c);
6703d14c5d2SYehuda Sadeh 	return ERR_PTR(err);
671c2acfd95SIlya Dryomov 
672c2acfd95SIlya Dryomov bad:
673c2acfd95SIlya Dryomov 	err = -EINVAL;
674c2acfd95SIlya Dryomov 	goto fail;
6753d14c5d2SYehuda Sadeh }
6763d14c5d2SYehuda Sadeh 
ceph_pg_compare(const struct ceph_pg * lhs,const struct ceph_pg * rhs)677f984cb76SIlya Dryomov int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
678f984cb76SIlya Dryomov {
679f984cb76SIlya Dryomov 	if (lhs->pool < rhs->pool)
680f984cb76SIlya Dryomov 		return -1;
681f984cb76SIlya Dryomov 	if (lhs->pool > rhs->pool)
682f984cb76SIlya Dryomov 		return 1;
683f984cb76SIlya Dryomov 	if (lhs->seed < rhs->seed)
684f984cb76SIlya Dryomov 		return -1;
685f984cb76SIlya Dryomov 	if (lhs->seed > rhs->seed)
686f984cb76SIlya Dryomov 		return 1;
687f984cb76SIlya Dryomov 
688f984cb76SIlya Dryomov 	return 0;
689f984cb76SIlya Dryomov }
690f984cb76SIlya Dryomov 
ceph_spg_compare(const struct ceph_spg * lhs,const struct ceph_spg * rhs)691a02a946dSIlya Dryomov int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
692a02a946dSIlya Dryomov {
693a02a946dSIlya Dryomov 	int ret;
694a02a946dSIlya Dryomov 
695a02a946dSIlya Dryomov 	ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
696a02a946dSIlya Dryomov 	if (ret)
697a02a946dSIlya Dryomov 		return ret;
698a02a946dSIlya Dryomov 
699a02a946dSIlya Dryomov 	if (lhs->shard < rhs->shard)
700a02a946dSIlya Dryomov 		return -1;
701a02a946dSIlya Dryomov 	if (lhs->shard > rhs->shard)
702a02a946dSIlya Dryomov 		return 1;
703a02a946dSIlya Dryomov 
704a02a946dSIlya Dryomov 	return 0;
705a02a946dSIlya Dryomov }
706a02a946dSIlya Dryomov 
/*
 * Allocate a pg mapping with @payload_len extra bytes following the
 * struct.  The rb node is marked as not linked into any tree.
 *
 * Returns NULL if the allocation fails.
 */
static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
{
	struct ceph_pg_mapping *mapping;

	mapping = kmalloc(sizeof(*mapping) + payload_len, GFP_NOIO);
	if (mapping)
		RB_CLEAR_NODE(&mapping->node);

	return mapping;
}
718a303bb0eSIlya Dryomov 
/*
 * Free a pg mapping.  Warns if the mapping's rb node is not marked
 * empty, i.e. if it was not erased from its tree first.
 */
static void free_pg_mapping(struct ceph_pg_mapping *pg)
{
	WARN_ON(!RB_EMPTY_NODE(&pg->node));

	kfree(pg);
}
725a303bb0eSIlya Dryomov 
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds) and primary_temp (explicit primary setting)
 *
 * The tree is keyed by the pgid member, ordered with ceph_pg_compare()
 * and looked up by pointer to a struct ceph_pg.  The pg_mapping helpers
 * used in this file (e.g. erase_pg_mapping()) presumably come from this
 * macro -- see its definition for the full set.
 */
DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
		 RB_BYPTR, const struct ceph_pg *, node)
7328adc8b3dSSage Weil 
/*
 * rbtree of pg pool info
 *
 * Keyed by the id member.  The pg_pool helpers used in this file
 * (lookup_pg_pool(), erase_pg_pool()) presumably come from this macro --
 * see its definition for the full set.
 */
DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
7373d14c5d2SYehuda Sadeh 
/*
 * Look up the pool with the given @id in @map's pg_pools tree.
 * Returns whatever lookup_pg_pool() returns for that id.
 */
struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
	return lookup_pg_pool(&map->pg_pools, id);
}
742ce7f6a27SIlya Dryomov 
/*
 * Return the name of the pool with the given @id, or NULL if @id is
 * CEPH_NOPOOL, is out of range (warns once) or no such pool exists.
 *
 * The returned string is the pool info's own name pointer, not a copy.
 */
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pool;

	/* CEPH_NOPOOL is checked first so that it never trips the warning */
	if (id == CEPH_NOPOOL || WARN_ON_ONCE(id > (u64) INT_MAX))
		return NULL;

	pool = lookup_pg_pool(&map->pg_pools, id);
	if (!pool)
		return NULL;

	return pool->name;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
75772afc71fSAlex Elder 
ceph_pg_poolid_by_name(struct ceph_osdmap * map,const char * name)7583d14c5d2SYehuda Sadeh int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
7593d14c5d2SYehuda Sadeh {
7603d14c5d2SYehuda Sadeh 	struct rb_node *rbp;
7613d14c5d2SYehuda Sadeh 
7623d14c5d2SYehuda Sadeh 	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
7633d14c5d2SYehuda Sadeh 		struct ceph_pg_pool_info *pi =
7643d14c5d2SYehuda Sadeh 			rb_entry(rbp, struct ceph_pg_pool_info, node);
7653d14c5d2SYehuda Sadeh 		if (pi->name && strcmp(pi->name, name) == 0)
7663d14c5d2SYehuda Sadeh 			return pi->id;
7673d14c5d2SYehuda Sadeh 	}
7683d14c5d2SYehuda Sadeh 	return -ENOENT;
7693d14c5d2SYehuda Sadeh }
7703d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_pg_poolid_by_name);
7713d14c5d2SYehuda Sadeh 
/*
 * Return the flags of the pool with the given @id, or 0 if @map has no
 * such pool.
 */
u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pool = lookup_pg_pool(&map->pg_pools, id);

	if (!pool)
		return 0;

	return pool->flags;
}
EXPORT_SYMBOL(ceph_pg_pool_flags);
78076142097SIlya Dryomov 
/*
 * Unlink @pi from the pool tree at @root and free it together with its
 * name string.
 */
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
	erase_pg_pool(root, pi);
	kfree(pi->name);	/* name is a separate allocation */
	kfree(pi);
}
7873d14c5d2SYehuda Sadeh 
decode_pool(void ** p,void * end,struct ceph_pg_pool_info * pi)7880f70c7eeSIlya Dryomov static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
7893d14c5d2SYehuda Sadeh {
7904f6a7e5eSSage Weil 	u8 ev, cv;
7914f6a7e5eSSage Weil 	unsigned len, num;
7924f6a7e5eSSage Weil 	void *pool_end;
7933d14c5d2SYehuda Sadeh 
7944f6a7e5eSSage Weil 	ceph_decode_need(p, end, 2 + 4, bad);
7954f6a7e5eSSage Weil 	ev = ceph_decode_8(p);  /* encoding version */
7964f6a7e5eSSage Weil 	cv = ceph_decode_8(p); /* compat version */
7974f6a7e5eSSage Weil 	if (ev < 5) {
798b9a67899SJoe Perches 		pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
7994f6a7e5eSSage Weil 		return -EINVAL;
8004f6a7e5eSSage Weil 	}
80117a13e40SIlya Dryomov 	if (cv > 9) {
802b9a67899SJoe Perches 		pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
8034f6a7e5eSSage Weil 		return -EINVAL;
8044f6a7e5eSSage Weil 	}
8054f6a7e5eSSage Weil 	len = ceph_decode_32(p);
8064f6a7e5eSSage Weil 	ceph_decode_need(p, end, len, bad);
8074f6a7e5eSSage Weil 	pool_end = *p + len;
8083d14c5d2SYehuda Sadeh 
8094f6a7e5eSSage Weil 	pi->type = ceph_decode_8(p);
8104f6a7e5eSSage Weil 	pi->size = ceph_decode_8(p);
8114f6a7e5eSSage Weil 	pi->crush_ruleset = ceph_decode_8(p);
8124f6a7e5eSSage Weil 	pi->object_hash = ceph_decode_8(p);
8134f6a7e5eSSage Weil 
8144f6a7e5eSSage Weil 	pi->pg_num = ceph_decode_32(p);
8154f6a7e5eSSage Weil 	pi->pgp_num = ceph_decode_32(p);
8164f6a7e5eSSage Weil 
8174f6a7e5eSSage Weil 	*p += 4 + 4;  /* skip lpg* */
8184f6a7e5eSSage Weil 	*p += 4;      /* skip last_change */
8194f6a7e5eSSage Weil 	*p += 8 + 4;  /* skip snap_seq, snap_epoch */
8204f6a7e5eSSage Weil 
8214f6a7e5eSSage Weil 	/* skip snaps */
8224f6a7e5eSSage Weil 	num = ceph_decode_32(p);
8234f6a7e5eSSage Weil 	while (num--) {
8244f6a7e5eSSage Weil 		*p += 8;  /* snapid key */
8254f6a7e5eSSage Weil 		*p += 1 + 1; /* versions */
8264f6a7e5eSSage Weil 		len = ceph_decode_32(p);
8274f6a7e5eSSage Weil 		*p += len;
8283d14c5d2SYehuda Sadeh 	}
8293d14c5d2SYehuda Sadeh 
83017a13e40SIlya Dryomov 	/* skip removed_snaps */
8314f6a7e5eSSage Weil 	num = ceph_decode_32(p);
8324f6a7e5eSSage Weil 	*p += num * (8 + 8);
8334f6a7e5eSSage Weil 
8344f6a7e5eSSage Weil 	*p += 8;  /* skip auid */
8354f6a7e5eSSage Weil 	pi->flags = ceph_decode_64(p);
83617a13e40SIlya Dryomov 	*p += 4;  /* skip crash_replay_interval */
83717a13e40SIlya Dryomov 
83817a13e40SIlya Dryomov 	if (ev >= 7)
83904812acfSIlya Dryomov 		pi->min_size = ceph_decode_8(p);
84004812acfSIlya Dryomov 	else
84104812acfSIlya Dryomov 		pi->min_size = pi->size - pi->size / 2;
84217a13e40SIlya Dryomov 
84317a13e40SIlya Dryomov 	if (ev >= 8)
84417a13e40SIlya Dryomov 		*p += 8 + 8;  /* skip quota_max_* */
84517a13e40SIlya Dryomov 
84617a13e40SIlya Dryomov 	if (ev >= 9) {
84717a13e40SIlya Dryomov 		/* skip tiers */
84817a13e40SIlya Dryomov 		num = ceph_decode_32(p);
84917a13e40SIlya Dryomov 		*p += num * 8;
85017a13e40SIlya Dryomov 
85117a13e40SIlya Dryomov 		*p += 8;  /* skip tier_of */
85217a13e40SIlya Dryomov 		*p += 1;  /* skip cache_mode */
85317a13e40SIlya Dryomov 
85417a13e40SIlya Dryomov 		pi->read_tier = ceph_decode_64(p);
85517a13e40SIlya Dryomov 		pi->write_tier = ceph_decode_64(p);
85617a13e40SIlya Dryomov 	} else {
85717a13e40SIlya Dryomov 		pi->read_tier = -1;
85817a13e40SIlya Dryomov 		pi->write_tier = -1;
85917a13e40SIlya Dryomov 	}
8604f6a7e5eSSage Weil 
86104812acfSIlya Dryomov 	if (ev >= 10) {
86204812acfSIlya Dryomov 		/* skip properties */
86304812acfSIlya Dryomov 		num = ceph_decode_32(p);
86404812acfSIlya Dryomov 		while (num--) {
86504812acfSIlya Dryomov 			len = ceph_decode_32(p);
86604812acfSIlya Dryomov 			*p += len; /* key */
86704812acfSIlya Dryomov 			len = ceph_decode_32(p);
86804812acfSIlya Dryomov 			*p += len; /* val */
86904812acfSIlya Dryomov 		}
87004812acfSIlya Dryomov 	}
87104812acfSIlya Dryomov 
87204812acfSIlya Dryomov 	if (ev >= 11) {
87304812acfSIlya Dryomov 		/* skip hit_set_params */
87404812acfSIlya Dryomov 		*p += 1 + 1; /* versions */
87504812acfSIlya Dryomov 		len = ceph_decode_32(p);
87604812acfSIlya Dryomov 		*p += len;
87704812acfSIlya Dryomov 
87804812acfSIlya Dryomov 		*p += 4; /* skip hit_set_period */
87904812acfSIlya Dryomov 		*p += 4; /* skip hit_set_count */
88004812acfSIlya Dryomov 	}
88104812acfSIlya Dryomov 
88204812acfSIlya Dryomov 	if (ev >= 12)
88304812acfSIlya Dryomov 		*p += 4; /* skip stripe_width */
88404812acfSIlya Dryomov 
88504812acfSIlya Dryomov 	if (ev >= 13) {
88604812acfSIlya Dryomov 		*p += 8; /* skip target_max_bytes */
88704812acfSIlya Dryomov 		*p += 8; /* skip target_max_objects */
88804812acfSIlya Dryomov 		*p += 4; /* skip cache_target_dirty_ratio_micro */
88904812acfSIlya Dryomov 		*p += 4; /* skip cache_target_full_ratio_micro */
89004812acfSIlya Dryomov 		*p += 4; /* skip cache_min_flush_age */
89104812acfSIlya Dryomov 		*p += 4; /* skip cache_min_evict_age */
89204812acfSIlya Dryomov 	}
89304812acfSIlya Dryomov 
89404812acfSIlya Dryomov 	if (ev >=  14) {
89504812acfSIlya Dryomov 		/* skip erasure_code_profile */
89604812acfSIlya Dryomov 		len = ceph_decode_32(p);
89704812acfSIlya Dryomov 		*p += len;
89804812acfSIlya Dryomov 	}
89904812acfSIlya Dryomov 
9008e48cf00SIlya Dryomov 	/*
9018e48cf00SIlya Dryomov 	 * last_force_op_resend_preluminous, will be overridden if the
9028e48cf00SIlya Dryomov 	 * map was encoded with RESEND_ON_SPLIT
9038e48cf00SIlya Dryomov 	 */
90404812acfSIlya Dryomov 	if (ev >= 15)
90504812acfSIlya Dryomov 		pi->last_force_request_resend = ceph_decode_32(p);
90604812acfSIlya Dryomov 	else
90704812acfSIlya Dryomov 		pi->last_force_request_resend = 0;
90804812acfSIlya Dryomov 
9098e48cf00SIlya Dryomov 	if (ev >= 16)
9108e48cf00SIlya Dryomov 		*p += 4; /* skip min_read_recency_for_promote */
9118e48cf00SIlya Dryomov 
9128e48cf00SIlya Dryomov 	if (ev >= 17)
9138e48cf00SIlya Dryomov 		*p += 8; /* skip expected_num_objects */
9148e48cf00SIlya Dryomov 
9158e48cf00SIlya Dryomov 	if (ev >= 19)
9168e48cf00SIlya Dryomov 		*p += 4; /* skip cache_target_dirty_high_ratio_micro */
9178e48cf00SIlya Dryomov 
9188e48cf00SIlya Dryomov 	if (ev >= 20)
9198e48cf00SIlya Dryomov 		*p += 4; /* skip min_write_recency_for_promote */
9208e48cf00SIlya Dryomov 
9218e48cf00SIlya Dryomov 	if (ev >= 21)
9228e48cf00SIlya Dryomov 		*p += 1; /* skip use_gmt_hitset */
9238e48cf00SIlya Dryomov 
9248e48cf00SIlya Dryomov 	if (ev >= 22)
9258e48cf00SIlya Dryomov 		*p += 1; /* skip fast_read */
9268e48cf00SIlya Dryomov 
9278e48cf00SIlya Dryomov 	if (ev >= 23) {
9288e48cf00SIlya Dryomov 		*p += 4; /* skip hit_set_grade_decay_rate */
9298e48cf00SIlya Dryomov 		*p += 4; /* skip hit_set_search_last_n */
9308e48cf00SIlya Dryomov 	}
9318e48cf00SIlya Dryomov 
9328e48cf00SIlya Dryomov 	if (ev >= 24) {
9338e48cf00SIlya Dryomov 		/* skip opts */
9348e48cf00SIlya Dryomov 		*p += 1 + 1; /* versions */
9358e48cf00SIlya Dryomov 		len = ceph_decode_32(p);
9368e48cf00SIlya Dryomov 		*p += len;
9378e48cf00SIlya Dryomov 	}
9388e48cf00SIlya Dryomov 
9398e48cf00SIlya Dryomov 	if (ev >= 25)
9408e48cf00SIlya Dryomov 		pi->last_force_request_resend = ceph_decode_32(p);
9418e48cf00SIlya Dryomov 
9424f6a7e5eSSage Weil 	/* ignore the rest */
9434f6a7e5eSSage Weil 
9444f6a7e5eSSage Weil 	*p = pool_end;
9454f6a7e5eSSage Weil 	calc_pg_masks(pi);
9463d14c5d2SYehuda Sadeh 	return 0;
9473d14c5d2SYehuda Sadeh 
9483d14c5d2SYehuda Sadeh bad:
9493d14c5d2SYehuda Sadeh 	return -EINVAL;
9503d14c5d2SYehuda Sadeh }
9513d14c5d2SYehuda Sadeh 
decode_pool_names(void ** p,void * end,struct ceph_osdmap * map)9520f70c7eeSIlya Dryomov static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
9533d14c5d2SYehuda Sadeh {
9543d14c5d2SYehuda Sadeh 	struct ceph_pg_pool_info *pi;
9554f6a7e5eSSage Weil 	u32 num, len;
9564f6a7e5eSSage Weil 	u64 pool;
9573d14c5d2SYehuda Sadeh 
9583d14c5d2SYehuda Sadeh 	ceph_decode_32_safe(p, end, num, bad);
9593d14c5d2SYehuda Sadeh 	dout(" %d pool names\n", num);
9603d14c5d2SYehuda Sadeh 	while (num--) {
9614f6a7e5eSSage Weil 		ceph_decode_64_safe(p, end, pool, bad);
9623d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, len, bad);
9634f6a7e5eSSage Weil 		dout("  pool %llu len %d\n", pool, len);
964ad3b904cSXi Wang 		ceph_decode_need(p, end, len, bad);
9658a4b863cSIlya Dryomov 		pi = lookup_pg_pool(&map->pg_pools, pool);
9663d14c5d2SYehuda Sadeh 		if (pi) {
967ad3b904cSXi Wang 			char *name = kstrndup(*p, len, GFP_NOFS);
968ad3b904cSXi Wang 
969ad3b904cSXi Wang 			if (!name)
970ad3b904cSXi Wang 				return -ENOMEM;
9713d14c5d2SYehuda Sadeh 			kfree(pi->name);
972ad3b904cSXi Wang 			pi->name = name;
9733d14c5d2SYehuda Sadeh 			dout("  name is %s\n", pi->name);
9743d14c5d2SYehuda Sadeh 		}
9753d14c5d2SYehuda Sadeh 		*p += len;
9763d14c5d2SYehuda Sadeh 	}
9773d14c5d2SYehuda Sadeh 	return 0;
9783d14c5d2SYehuda Sadeh 
9793d14c5d2SYehuda Sadeh bad:
9803d14c5d2SYehuda Sadeh 	return -EINVAL;
9813d14c5d2SYehuda Sadeh }
9823d14c5d2SYehuda Sadeh 
9833d14c5d2SYehuda Sadeh /*
9843986f9a4SIlya Dryomov  * CRUSH workspaces
9853986f9a4SIlya Dryomov  *
9863986f9a4SIlya Dryomov  * workspace_manager framework borrowed from fs/btrfs/compression.c.
9873986f9a4SIlya Dryomov  * Two simplifications: there is only one type of workspace and there
9883986f9a4SIlya Dryomov  * is always at least one workspace.
9893986f9a4SIlya Dryomov  */
alloc_workspace(const struct crush_map * c)9903986f9a4SIlya Dryomov static struct crush_work *alloc_workspace(const struct crush_map *c)
9913986f9a4SIlya Dryomov {
9923986f9a4SIlya Dryomov 	struct crush_work *work;
9933986f9a4SIlya Dryomov 	size_t work_size;
9943986f9a4SIlya Dryomov 
9953986f9a4SIlya Dryomov 	WARN_ON(!c->working_size);
9963986f9a4SIlya Dryomov 	work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
9973986f9a4SIlya Dryomov 	dout("%s work_size %zu bytes\n", __func__, work_size);
9983986f9a4SIlya Dryomov 
999a421ef30SMichal Hocko 	work = kvmalloc(work_size, GFP_NOIO);
10003986f9a4SIlya Dryomov 	if (!work)
10013986f9a4SIlya Dryomov 		return NULL;
10023986f9a4SIlya Dryomov 
10033986f9a4SIlya Dryomov 	INIT_LIST_HEAD(&work->item);
10043986f9a4SIlya Dryomov 	crush_init_workspace(c, work);
10053986f9a4SIlya Dryomov 	return work;
10063986f9a4SIlya Dryomov }
10073986f9a4SIlya Dryomov 
/*
 * Free a workspace obtained from alloc_workspace().  Warns if it is
 * still linked into a list.
 */
static void free_workspace(struct crush_work *work)
{
	WARN_ON(!list_empty(&work->item));
	kvfree(work);
}
10133986f9a4SIlya Dryomov 
init_workspace_manager(struct workspace_manager * wsm)10143986f9a4SIlya Dryomov static void init_workspace_manager(struct workspace_manager *wsm)
10153986f9a4SIlya Dryomov {
10163986f9a4SIlya Dryomov 	INIT_LIST_HEAD(&wsm->idle_ws);
10173986f9a4SIlya Dryomov 	spin_lock_init(&wsm->ws_lock);
10183986f9a4SIlya Dryomov 	atomic_set(&wsm->total_ws, 0);
10193986f9a4SIlya Dryomov 	wsm->free_ws = 0;
10203986f9a4SIlya Dryomov 	init_waitqueue_head(&wsm->ws_wait);
10213986f9a4SIlya Dryomov }
10223986f9a4SIlya Dryomov 
/*
 * Seed @wsm with its first workspace: @work goes on the idle list and
 * both counters are set to 1.  Warns if the idle list is not empty.
 * No lock is taken here.
 */
static void add_initial_workspace(struct workspace_manager *wsm,
				  struct crush_work *work)
{
	WARN_ON(!list_empty(&wsm->idle_ws));

	list_add(&work->item, &wsm->idle_ws);
	atomic_set(&wsm->total_ws, 1);
	wsm->free_ws = 1;
}
10323986f9a4SIlya Dryomov 
cleanup_workspace_manager(struct workspace_manager * wsm)10333986f9a4SIlya Dryomov static void cleanup_workspace_manager(struct workspace_manager *wsm)
10343986f9a4SIlya Dryomov {
10353986f9a4SIlya Dryomov 	struct crush_work *work;
10363986f9a4SIlya Dryomov 
10373986f9a4SIlya Dryomov 	while (!list_empty(&wsm->idle_ws)) {
10383986f9a4SIlya Dryomov 		work = list_first_entry(&wsm->idle_ws, struct crush_work,
10393986f9a4SIlya Dryomov 					item);
10403986f9a4SIlya Dryomov 		list_del_init(&work->item);
10413986f9a4SIlya Dryomov 		free_workspace(work);
10423986f9a4SIlya Dryomov 	}
10433986f9a4SIlya Dryomov 	atomic_set(&wsm->total_ws, 0);
10443986f9a4SIlya Dryomov 	wsm->free_ws = 0;
10453986f9a4SIlya Dryomov }
10463986f9a4SIlya Dryomov 
/*
 * Finds an available workspace or allocates a new one.  If it's not
 * possible to allocate a new one, waits until there is one.
 *
 * @wsm: workspace manager to take the workspace from
 * @c:   CRUSH map a newly allocated workspace is set up for
 *
 * Never returns NULL: every path that does not produce a workspace
 * jumps back to "again".  May sleep (TASK_UNINTERRUPTIBLE).
 */
static struct crush_work *get_workspace(struct workspace_manager *wsm,
					const struct crush_map *c)
{
	struct crush_work *work;
	int cpus = num_online_cpus();	/* sampled once, not per retry */

again:
	spin_lock(&wsm->ws_lock);
	/* fast path: reuse an idle workspace */
	if (!list_empty(&wsm->idle_ws)) {
		work = list_first_entry(&wsm->idle_ws, struct crush_work,
					item);
		list_del_init(&work->item);
		wsm->free_ws--;
		spin_unlock(&wsm->ws_lock);
		return work;

	}
	/* more workspaces than online CPUs already exist: wait for one */
	if (atomic_read(&wsm->total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(&wsm->ws_lock);
		prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		/*
		 * Re-check after queueing on ws_wait (the lock has been
		 * dropped) and sleep only if there is still nothing free.
		 */
		if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
			schedule();
		finish_wait(&wsm->ws_wait, &wait);
		goto again;
	}
	/* account for the new workspace before allocating it unlocked */
	atomic_inc(&wsm->total_ws);
	spin_unlock(&wsm->ws_lock);

	work = alloc_workspace(c);
	if (!work) {
		/* undo the accounting and let waiters re-evaluate */
		atomic_dec(&wsm->total_ws);
		wake_up(&wsm->ws_wait);

		/*
		 * Do not return the error but go back to waiting.  We
		 * have the initial workspace and the CRUSH computation
		 * time is bounded so we will get it eventually.
		 */
		WARN_ON(atomic_read(&wsm->total_ws) < 1);
		goto again;
	}
	return work;
}
10963986f9a4SIlya Dryomov 
10973986f9a4SIlya Dryomov /*
10983986f9a4SIlya Dryomov  * Puts a workspace back on the list or frees it if we have enough
10993986f9a4SIlya Dryomov  * idle ones sitting around.
11003986f9a4SIlya Dryomov  */
put_workspace(struct workspace_manager * wsm,struct crush_work * work)11013986f9a4SIlya Dryomov static void put_workspace(struct workspace_manager *wsm,
11023986f9a4SIlya Dryomov 			  struct crush_work *work)
11033986f9a4SIlya Dryomov {
11043986f9a4SIlya Dryomov 	spin_lock(&wsm->ws_lock);
11053986f9a4SIlya Dryomov 	if (wsm->free_ws <= num_online_cpus()) {
11063986f9a4SIlya Dryomov 		list_add(&work->item, &wsm->idle_ws);
11073986f9a4SIlya Dryomov 		wsm->free_ws++;
11083986f9a4SIlya Dryomov 		spin_unlock(&wsm->ws_lock);
11093986f9a4SIlya Dryomov 		goto wake;
11103986f9a4SIlya Dryomov 	}
11113986f9a4SIlya Dryomov 	spin_unlock(&wsm->ws_lock);
11123986f9a4SIlya Dryomov 
11133986f9a4SIlya Dryomov 	free_workspace(work);
11143986f9a4SIlya Dryomov 	atomic_dec(&wsm->total_ws);
11153986f9a4SIlya Dryomov wake:
11163986f9a4SIlya Dryomov 	if (wq_has_sleeper(&wsm->ws_wait))
11173986f9a4SIlya Dryomov 		wake_up(&wsm->ws_wait);
11183986f9a4SIlya Dryomov }
11193986f9a4SIlya Dryomov 
11203986f9a4SIlya Dryomov /*
11213d14c5d2SYehuda Sadeh  * osd map
11223d14c5d2SYehuda Sadeh  */
ceph_osdmap_alloc(void)1123e5253a7bSIlya Dryomov struct ceph_osdmap *ceph_osdmap_alloc(void)
1124e5253a7bSIlya Dryomov {
1125e5253a7bSIlya Dryomov 	struct ceph_osdmap *map;
1126e5253a7bSIlya Dryomov 
1127e5253a7bSIlya Dryomov 	map = kzalloc(sizeof(*map), GFP_NOIO);
1128e5253a7bSIlya Dryomov 	if (!map)
1129e5253a7bSIlya Dryomov 		return NULL;
1130e5253a7bSIlya Dryomov 
1131e5253a7bSIlya Dryomov 	map->pg_pools = RB_ROOT;
1132e5253a7bSIlya Dryomov 	map->pool_max = -1;
1133e5253a7bSIlya Dryomov 	map->pg_temp = RB_ROOT;
1134e5253a7bSIlya Dryomov 	map->primary_temp = RB_ROOT;
11356f428df4SIlya Dryomov 	map->pg_upmap = RB_ROOT;
11366f428df4SIlya Dryomov 	map->pg_upmap_items = RB_ROOT;
11373986f9a4SIlya Dryomov 
11383986f9a4SIlya Dryomov 	init_workspace_manager(&map->crush_wsm);
1139e5253a7bSIlya Dryomov 
1140e5253a7bSIlya Dryomov 	return map;
1141e5253a7bSIlya Dryomov }
1142e5253a7bSIlya Dryomov 
ceph_osdmap_destroy(struct ceph_osdmap * map)11433d14c5d2SYehuda Sadeh void ceph_osdmap_destroy(struct ceph_osdmap *map)
11443d14c5d2SYehuda Sadeh {
11453d14c5d2SYehuda Sadeh 	dout("osdmap_destroy %p\n", map);
11463986f9a4SIlya Dryomov 
11473d14c5d2SYehuda Sadeh 	if (map->crush)
11483d14c5d2SYehuda Sadeh 		crush_destroy(map->crush);
11493986f9a4SIlya Dryomov 	cleanup_workspace_manager(&map->crush_wsm);
11503986f9a4SIlya Dryomov 
11513d14c5d2SYehuda Sadeh 	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
11523d14c5d2SYehuda Sadeh 		struct ceph_pg_mapping *pg =
11533d14c5d2SYehuda Sadeh 			rb_entry(rb_first(&map->pg_temp),
11543d14c5d2SYehuda Sadeh 				 struct ceph_pg_mapping, node);
1155ab75144bSIlya Dryomov 		erase_pg_mapping(&map->pg_temp, pg);
1156ab75144bSIlya Dryomov 		free_pg_mapping(pg);
11573d14c5d2SYehuda Sadeh 	}
11589686f94cSIlya Dryomov 	while (!RB_EMPTY_ROOT(&map->primary_temp)) {
11599686f94cSIlya Dryomov 		struct ceph_pg_mapping *pg =
11609686f94cSIlya Dryomov 			rb_entry(rb_first(&map->primary_temp),
11619686f94cSIlya Dryomov 				 struct ceph_pg_mapping, node);
1162ab75144bSIlya Dryomov 		erase_pg_mapping(&map->primary_temp, pg);
1163ab75144bSIlya Dryomov 		free_pg_mapping(pg);
11649686f94cSIlya Dryomov 	}
11656f428df4SIlya Dryomov 	while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
11666f428df4SIlya Dryomov 		struct ceph_pg_mapping *pg =
11676f428df4SIlya Dryomov 			rb_entry(rb_first(&map->pg_upmap),
11686f428df4SIlya Dryomov 				 struct ceph_pg_mapping, node);
11696f428df4SIlya Dryomov 		rb_erase(&pg->node, &map->pg_upmap);
11706f428df4SIlya Dryomov 		kfree(pg);
11716f428df4SIlya Dryomov 	}
11726f428df4SIlya Dryomov 	while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
11736f428df4SIlya Dryomov 		struct ceph_pg_mapping *pg =
11746f428df4SIlya Dryomov 			rb_entry(rb_first(&map->pg_upmap_items),
11756f428df4SIlya Dryomov 				 struct ceph_pg_mapping, node);
11766f428df4SIlya Dryomov 		rb_erase(&pg->node, &map->pg_upmap_items);
11776f428df4SIlya Dryomov 		kfree(pg);
11786f428df4SIlya Dryomov 	}
11793d14c5d2SYehuda Sadeh 	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
11803d14c5d2SYehuda Sadeh 		struct ceph_pg_pool_info *pi =
11813d14c5d2SYehuda Sadeh 			rb_entry(rb_first(&map->pg_pools),
11823d14c5d2SYehuda Sadeh 				 struct ceph_pg_pool_info, node);
11833d14c5d2SYehuda Sadeh 		__remove_pg_pool(&map->pg_pools, pi);
11843d14c5d2SYehuda Sadeh 	}
1185cf73d882SIlya Dryomov 	kvfree(map->osd_state);
1186cf73d882SIlya Dryomov 	kvfree(map->osd_weight);
1187cf73d882SIlya Dryomov 	kvfree(map->osd_addr);
1188cf73d882SIlya Dryomov 	kvfree(map->osd_primary_affinity);
11893d14c5d2SYehuda Sadeh 	kfree(map);
11903d14c5d2SYehuda Sadeh }
11913d14c5d2SYehuda Sadeh 
11923d14c5d2SYehuda Sadeh /*
11934d60351fSIlya Dryomov  * Adjust max_osd value, (re)allocate arrays.
11944d60351fSIlya Dryomov  *
11954d60351fSIlya Dryomov  * The new elements are properly initialized.
11963d14c5d2SYehuda Sadeh  */
osdmap_set_max_osd(struct ceph_osdmap * map,u32 max)1197cf73d882SIlya Dryomov static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
11983d14c5d2SYehuda Sadeh {
11990bb05da2SIlya Dryomov 	u32 *state;
12003d14c5d2SYehuda Sadeh 	u32 *weight;
12014d60351fSIlya Dryomov 	struct ceph_entity_addr *addr;
1202cf73d882SIlya Dryomov 	u32 to_copy;
12034d60351fSIlya Dryomov 	int i;
12043d14c5d2SYehuda Sadeh 
1205cf73d882SIlya Dryomov 	dout("%s old %u new %u\n", __func__, map->max_osd, max);
1206cf73d882SIlya Dryomov 	if (max == map->max_osd)
1207cf73d882SIlya Dryomov 		return 0;
1208cf73d882SIlya Dryomov 
1209a421ef30SMichal Hocko 	state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
1210a421ef30SMichal Hocko 	weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
1211a421ef30SMichal Hocko 	addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
1212cf73d882SIlya Dryomov 	if (!state || !weight || !addr) {
1213cf73d882SIlya Dryomov 		kvfree(state);
1214cf73d882SIlya Dryomov 		kvfree(weight);
1215cf73d882SIlya Dryomov 		kvfree(addr);
12163d14c5d2SYehuda Sadeh 		return -ENOMEM;
1217cf73d882SIlya Dryomov 	}
1218cf73d882SIlya Dryomov 
1219cf73d882SIlya Dryomov 	to_copy = min(map->max_osd, max);
1220cf73d882SIlya Dryomov 	if (map->osd_state) {
1221cf73d882SIlya Dryomov 		memcpy(state, map->osd_state, to_copy * sizeof(*state));
1222cf73d882SIlya Dryomov 		memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
1223cf73d882SIlya Dryomov 		memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
1224cf73d882SIlya Dryomov 		kvfree(map->osd_state);
1225cf73d882SIlya Dryomov 		kvfree(map->osd_weight);
1226cf73d882SIlya Dryomov 		kvfree(map->osd_addr);
1227cf73d882SIlya Dryomov 	}
1228cf73d882SIlya Dryomov 
1229589506f1SLi RongQing 	map->osd_state = state;
1230589506f1SLi RongQing 	map->osd_weight = weight;
1231589506f1SLi RongQing 	map->osd_addr = addr;
12324d60351fSIlya Dryomov 	for (i = map->max_osd; i < max; i++) {
1233589506f1SLi RongQing 		map->osd_state[i] = 0;
1234589506f1SLi RongQing 		map->osd_weight[i] = CEPH_OSD_OUT;
1235589506f1SLi RongQing 		memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
12363d14c5d2SYehuda Sadeh 	}
12373d14c5d2SYehuda Sadeh 
12382cfa34f2SIlya Dryomov 	if (map->osd_primary_affinity) {
12392cfa34f2SIlya Dryomov 		u32 *affinity;
12402cfa34f2SIlya Dryomov 
1241a421ef30SMichal Hocko 		affinity = kvmalloc(array_size(max, sizeof(*affinity)),
1242cf73d882SIlya Dryomov 					 GFP_NOFS);
12432cfa34f2SIlya Dryomov 		if (!affinity)
12442cfa34f2SIlya Dryomov 			return -ENOMEM;
12452cfa34f2SIlya Dryomov 
1246cf73d882SIlya Dryomov 		memcpy(affinity, map->osd_primary_affinity,
1247cf73d882SIlya Dryomov 		       to_copy * sizeof(*affinity));
1248cf73d882SIlya Dryomov 		kvfree(map->osd_primary_affinity);
1249cf73d882SIlya Dryomov 
1250cf73d882SIlya Dryomov 		map->osd_primary_affinity = affinity;
12512cfa34f2SIlya Dryomov 		for (i = map->max_osd; i < max; i++)
1252589506f1SLi RongQing 			map->osd_primary_affinity[i] =
1253589506f1SLi RongQing 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
12542cfa34f2SIlya Dryomov 	}
12552cfa34f2SIlya Dryomov 
12563d14c5d2SYehuda Sadeh 	map->max_osd = max;
12574d60351fSIlya Dryomov 
12583d14c5d2SYehuda Sadeh 	return 0;
12593d14c5d2SYehuda Sadeh }
12603d14c5d2SYehuda Sadeh 
osdmap_set_crush(struct ceph_osdmap * map,struct crush_map * crush)12611b6a78b5SIlya Dryomov static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
12621b6a78b5SIlya Dryomov {
12633986f9a4SIlya Dryomov 	struct crush_work *work;
126466a0e2d5SIlya Dryomov 
12651b6a78b5SIlya Dryomov 	if (IS_ERR(crush))
12661b6a78b5SIlya Dryomov 		return PTR_ERR(crush);
12671b6a78b5SIlya Dryomov 
12683986f9a4SIlya Dryomov 	work = alloc_workspace(crush);
12693986f9a4SIlya Dryomov 	if (!work) {
127066a0e2d5SIlya Dryomov 		crush_destroy(crush);
127166a0e2d5SIlya Dryomov 		return -ENOMEM;
127266a0e2d5SIlya Dryomov 	}
127366a0e2d5SIlya Dryomov 
12741b6a78b5SIlya Dryomov 	if (map->crush)
12751b6a78b5SIlya Dryomov 		crush_destroy(map->crush);
12763986f9a4SIlya Dryomov 	cleanup_workspace_manager(&map->crush_wsm);
12771b6a78b5SIlya Dryomov 	map->crush = crush;
12783986f9a4SIlya Dryomov 	add_initial_workspace(&map->crush_wsm, work);
12791b6a78b5SIlya Dryomov 	return 0;
12801b6a78b5SIlya Dryomov }
12811b6a78b5SIlya Dryomov 
/*
 * Upper bounds for the struct_compat values accepted by
 * get_osdmap_client_data_v(): a wrapper or client data section with
 * a greater struct_compat is rejected with -EINVAL.
 */
#define OSDMAP_WRAPPER_COMPAT_VER	7
#define OSDMAP_CLIENT_DATA_COMPAT_VER	1
1284ec7af972SIlya Dryomov 
1285ec7af972SIlya Dryomov /*
1286ec7af972SIlya Dryomov  * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps,
1287ec7af972SIlya Dryomov  * to struct_v of the client_data section for new (v7 and above)
1288ec7af972SIlya Dryomov  * osdmaps.
1289ec7af972SIlya Dryomov  */
get_osdmap_client_data_v(void ** p,void * end,const char * prefix,u8 * v)1290ec7af972SIlya Dryomov static int get_osdmap_client_data_v(void **p, void *end,
1291ec7af972SIlya Dryomov 				    const char *prefix, u8 *v)
1292ec7af972SIlya Dryomov {
1293ec7af972SIlya Dryomov 	u8 struct_v;
1294ec7af972SIlya Dryomov 
1295ec7af972SIlya Dryomov 	ceph_decode_8_safe(p, end, struct_v, e_inval);
1296ec7af972SIlya Dryomov 	if (struct_v >= 7) {
1297ec7af972SIlya Dryomov 		u8 struct_compat;
1298ec7af972SIlya Dryomov 
1299ec7af972SIlya Dryomov 		ceph_decode_8_safe(p, end, struct_compat, e_inval);
1300ec7af972SIlya Dryomov 		if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
1301b9a67899SJoe Perches 			pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
1302ec7af972SIlya Dryomov 				struct_v, struct_compat,
1303ec7af972SIlya Dryomov 				OSDMAP_WRAPPER_COMPAT_VER, prefix);
1304ec7af972SIlya Dryomov 			return -EINVAL;
1305ec7af972SIlya Dryomov 		}
1306ec7af972SIlya Dryomov 		*p += 4; /* ignore wrapper struct_len */
1307ec7af972SIlya Dryomov 
1308ec7af972SIlya Dryomov 		ceph_decode_8_safe(p, end, struct_v, e_inval);
1309ec7af972SIlya Dryomov 		ceph_decode_8_safe(p, end, struct_compat, e_inval);
1310ec7af972SIlya Dryomov 		if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
1311b9a67899SJoe Perches 			pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
1312ec7af972SIlya Dryomov 				struct_v, struct_compat,
1313ec7af972SIlya Dryomov 				OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
1314ec7af972SIlya Dryomov 			return -EINVAL;
1315ec7af972SIlya Dryomov 		}
1316ec7af972SIlya Dryomov 		*p += 4; /* ignore client data struct_len */
1317ec7af972SIlya Dryomov 	} else {
1318ec7af972SIlya Dryomov 		u16 version;
1319ec7af972SIlya Dryomov 
1320ec7af972SIlya Dryomov 		*p -= 1;
1321ec7af972SIlya Dryomov 		ceph_decode_16_safe(p, end, version, e_inval);
1322ec7af972SIlya Dryomov 		if (version < 6) {
1323b9a67899SJoe Perches 			pr_warn("got v %d < 6 of %s ceph_osdmap\n",
1324b9a67899SJoe Perches 				version, prefix);
1325ec7af972SIlya Dryomov 			return -EINVAL;
1326ec7af972SIlya Dryomov 		}
1327ec7af972SIlya Dryomov 
1328dd0d91b9SZheng Yongjun 		/* old osdmap encoding */
1329ec7af972SIlya Dryomov 		struct_v = 0;
1330ec7af972SIlya Dryomov 	}
1331ec7af972SIlya Dryomov 
1332ec7af972SIlya Dryomov 	*v = struct_v;
1333ec7af972SIlya Dryomov 	return 0;
1334ec7af972SIlya Dryomov 
1335ec7af972SIlya Dryomov e_inval:
1336ec7af972SIlya Dryomov 	return -EINVAL;
1337ec7af972SIlya Dryomov }
1338ec7af972SIlya Dryomov 
/*
 * Decode a counted list of (pool id, pool) pairs into map->pg_pools.
 *
 * With @incremental set, a pool whose id is already present is decoded
 * in place; otherwise a fresh ceph_pg_pool_info is allocated and
 * inserted first.  Returns 0, -EINVAL on a short buffer, -ENOMEM,
 * -EEXIST if the insertion is refused, or the error from decode_pool().
 */
static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
			  bool incremental)
{
	u32 remaining;

	ceph_decode_32_safe(p, end, remaining, e_inval);
	for (; remaining; remaining--) {
		struct ceph_pg_pool_info *pi;
		bool reuse;
		u64 pool_id;
		int err;

		ceph_decode_64_safe(p, end, pool_id, e_inval);

		pi = lookup_pg_pool(&map->pg_pools, pool_id);
		reuse = incremental && pi;
		if (!reuse) {
			pi = kzalloc(sizeof(*pi), GFP_NOFS);
			if (!pi)
				return -ENOMEM;

			RB_CLEAR_NODE(&pi->node);
			pi->id = pool_id;

			if (!__insert_pg_pool(&map->pg_pools, pi)) {
				kfree(pi);
				return -EEXIST;
			}
		}

		err = decode_pool(p, end, pi);
		if (err)
			return err;
	}

	return 0;

e_inval:
	return -EINVAL;
}
1377433fbdd3SIlya Dryomov 
decode_pools(void ** p,void * end,struct ceph_osdmap * map)1378433fbdd3SIlya Dryomov static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
1379433fbdd3SIlya Dryomov {
1380433fbdd3SIlya Dryomov 	return __decode_pools(p, end, map, false);
1381433fbdd3SIlya Dryomov }
1382433fbdd3SIlya Dryomov 
decode_new_pools(void ** p,void * end,struct ceph_osdmap * map)1383433fbdd3SIlya Dryomov static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
1384433fbdd3SIlya Dryomov {
1385433fbdd3SIlya Dryomov 	return __decode_pools(p, end, map, true);
1386433fbdd3SIlya Dryomov }
1387433fbdd3SIlya Dryomov 
1388a303bb0eSIlya Dryomov typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
1389a303bb0eSIlya Dryomov 
1390a303bb0eSIlya Dryomov static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
1391a303bb0eSIlya Dryomov 			     decode_mapping_fn_t fn, bool incremental)
139210db634eSIlya Dryomov {
139310db634eSIlya Dryomov 	u32 n;
139410db634eSIlya Dryomov 
1395a303bb0eSIlya Dryomov 	WARN_ON(!incremental && !fn);
1396a303bb0eSIlya Dryomov 
139710db634eSIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
139810db634eSIlya Dryomov 	while (n--) {
1399a303bb0eSIlya Dryomov 		struct ceph_pg_mapping *pg;
140010db634eSIlya Dryomov 		struct ceph_pg pgid;
140110db634eSIlya Dryomov 		int ret;
140210db634eSIlya Dryomov 
140310db634eSIlya Dryomov 		ret = ceph_decode_pgid(p, end, &pgid);
140410db634eSIlya Dryomov 		if (ret)
140510db634eSIlya Dryomov 			return ret;
140610db634eSIlya Dryomov 
1407ab75144bSIlya Dryomov 		pg = lookup_pg_mapping(mapping_root, &pgid);
1408ab75144bSIlya Dryomov 		if (pg) {
1409ab75144bSIlya Dryomov 			WARN_ON(!incremental);
1410ab75144bSIlya Dryomov 			erase_pg_mapping(mapping_root, pg);
1411ab75144bSIlya Dryomov 			free_pg_mapping(pg);
1412ab75144bSIlya Dryomov 		}
141310db634eSIlya Dryomov 
1414a303bb0eSIlya Dryomov 		if (fn) {
1415a303bb0eSIlya Dryomov 			pg = fn(p, end, incremental);
1416a303bb0eSIlya Dryomov 			if (IS_ERR(pg))
1417a303bb0eSIlya Dryomov 				return PTR_ERR(pg);
141810db634eSIlya Dryomov 
1419a303bb0eSIlya Dryomov 			if (pg) {
1420a303bb0eSIlya Dryomov 				pg->pgid = pgid; /* struct */
1421ab75144bSIlya Dryomov 				insert_pg_mapping(mapping_root, pg);
1422a303bb0eSIlya Dryomov 			}
1423a303bb0eSIlya Dryomov 		}
1424a303bb0eSIlya Dryomov 	}
1425a303bb0eSIlya Dryomov 
1426a303bb0eSIlya Dryomov 	return 0;
1427a303bb0eSIlya Dryomov 
1428a303bb0eSIlya Dryomov e_inval:
1429a303bb0eSIlya Dryomov 	return -EINVAL;
1430a303bb0eSIlya Dryomov }
1431a303bb0eSIlya Dryomov 
/*
 * Decode one pg_temp payload: a u32 count followed by that many u32
 * OSD ids.
 *
 * Returns the new mapping (pgid not yet set), NULL when @incremental is
 * set and the list is empty, or ERR_PTR(-EINVAL / -ENOMEM).
 */
static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
						bool incremental)
{
	struct ceph_pg_mapping *pg;
	u32 nr_osds, idx;

	ceph_decode_32_safe(p, end, nr_osds, e_inval);
	if (incremental && !nr_osds)
		return NULL;	/* new_pg_temp: [] to remove */

	/* keep sizeof(*pg) + nr_osds * sizeof(u32) representable in size_t */
	if (nr_osds > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
		goto e_inval;

	ceph_decode_need(p, end, nr_osds * sizeof(u32), e_inval);
	pg = alloc_pg_mapping(nr_osds * sizeof(u32));
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->pg_temp.len = nr_osds;
	for (idx = 0; idx < nr_osds; idx++)
		pg->pg_temp.osds[idx] = ceph_decode_32(p);

	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
145810db634eSIlya Dryomov 
decode_pg_temp(void ** p,void * end,struct ceph_osdmap * map)145910db634eSIlya Dryomov static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
146010db634eSIlya Dryomov {
1461a303bb0eSIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
1462a303bb0eSIlya Dryomov 				 false);
146310db634eSIlya Dryomov }
146410db634eSIlya Dryomov 
decode_new_pg_temp(void ** p,void * end,struct ceph_osdmap * map)146510db634eSIlya Dryomov static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
146610db634eSIlya Dryomov {
1467a303bb0eSIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
1468a303bb0eSIlya Dryomov 				 true);
146910db634eSIlya Dryomov }
147010db634eSIlya Dryomov 
/*
 * Decode one primary_temp payload: a single u32 OSD id.
 *
 * Returns the new mapping (pgid not yet set), NULL when @incremental is
 * set and the id is (u32)-1, or ERR_PTR(-EINVAL / -ENOMEM).
 */
static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
						     bool incremental)
{
	struct ceph_pg_mapping *pg;
	u32 primary;

	ceph_decode_32_safe(p, end, primary, e_inval);
	if (incremental && primary == (u32)-1)
		return NULL;	/* new_primary_temp: -1 to remove */

	pg = alloc_pg_mapping(0);
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->primary_temp.osd = primary;
	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
1491d286de79SIlya Dryomov 
decode_primary_temp(void ** p,void * end,struct ceph_osdmap * map)1492d286de79SIlya Dryomov static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
1493d286de79SIlya Dryomov {
1494a303bb0eSIlya Dryomov 	return decode_pg_mapping(p, end, &map->primary_temp,
1495a303bb0eSIlya Dryomov 				 __decode_primary_temp, false);
1496d286de79SIlya Dryomov }
1497d286de79SIlya Dryomov 
decode_new_primary_temp(void ** p,void * end,struct ceph_osdmap * map)1498d286de79SIlya Dryomov static int decode_new_primary_temp(void **p, void *end,
1499d286de79SIlya Dryomov 				   struct ceph_osdmap *map)
1500d286de79SIlya Dryomov {
1501a303bb0eSIlya Dryomov 	return decode_pg_mapping(p, end, &map->primary_temp,
1502a303bb0eSIlya Dryomov 				 __decode_primary_temp, true);
1503d286de79SIlya Dryomov }
1504d286de79SIlya Dryomov 
ceph_get_primary_affinity(struct ceph_osdmap * map,int osd)15052cfa34f2SIlya Dryomov u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
15062cfa34f2SIlya Dryomov {
15072cfa34f2SIlya Dryomov 	BUG_ON(osd >= map->max_osd);
15082cfa34f2SIlya Dryomov 
15092cfa34f2SIlya Dryomov 	if (!map->osd_primary_affinity)
15102cfa34f2SIlya Dryomov 		return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
15112cfa34f2SIlya Dryomov 
15122cfa34f2SIlya Dryomov 	return map->osd_primary_affinity[osd];
15132cfa34f2SIlya Dryomov }
15142cfa34f2SIlya Dryomov 
/*
 * Set the primary affinity of @osd to @aff.
 *
 * The affinity array is allocated on first use, sized for map->max_osd
 * entries and filled with the default value.  @osd must be below
 * map->max_osd (BUG otherwise).  Returns 0 or -ENOMEM.
 */
static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
{
	int idx;

	BUG_ON(osd >= map->max_osd);

	if (map->osd_primary_affinity)
		goto store;

	map->osd_primary_affinity = kvmalloc(
	    array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
	    GFP_NOFS);
	if (!map->osd_primary_affinity)
		return -ENOMEM;

	/* every OSD starts out with the default affinity */
	for (idx = 0; idx < map->max_osd; idx++)
		map->osd_primary_affinity[idx] =
		    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;

store:
	map->osd_primary_affinity[osd] = aff;
	return 0;
}
15372cfa34f2SIlya Dryomov 
decode_primary_affinity(void ** p,void * end,struct ceph_osdmap * map)153863a6993fSIlya Dryomov static int decode_primary_affinity(void **p, void *end,
153963a6993fSIlya Dryomov 				   struct ceph_osdmap *map)
154063a6993fSIlya Dryomov {
154163a6993fSIlya Dryomov 	u32 len, i;
154263a6993fSIlya Dryomov 
154363a6993fSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
154463a6993fSIlya Dryomov 	if (len == 0) {
1545cf73d882SIlya Dryomov 		kvfree(map->osd_primary_affinity);
154663a6993fSIlya Dryomov 		map->osd_primary_affinity = NULL;
154763a6993fSIlya Dryomov 		return 0;
154863a6993fSIlya Dryomov 	}
154963a6993fSIlya Dryomov 	if (len != map->max_osd)
155063a6993fSIlya Dryomov 		goto e_inval;
155163a6993fSIlya Dryomov 
155263a6993fSIlya Dryomov 	ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
155363a6993fSIlya Dryomov 
155463a6993fSIlya Dryomov 	for (i = 0; i < map->max_osd; i++) {
155563a6993fSIlya Dryomov 		int ret;
155663a6993fSIlya Dryomov 
155763a6993fSIlya Dryomov 		ret = set_primary_affinity(map, i, ceph_decode_32(p));
155863a6993fSIlya Dryomov 		if (ret)
155963a6993fSIlya Dryomov 			return ret;
156063a6993fSIlya Dryomov 	}
156163a6993fSIlya Dryomov 
156263a6993fSIlya Dryomov 	return 0;
156363a6993fSIlya Dryomov 
156463a6993fSIlya Dryomov e_inval:
156563a6993fSIlya Dryomov 	return -EINVAL;
156663a6993fSIlya Dryomov }
156763a6993fSIlya Dryomov 
decode_new_primary_affinity(void ** p,void * end,struct ceph_osdmap * map)156863a6993fSIlya Dryomov static int decode_new_primary_affinity(void **p, void *end,
156963a6993fSIlya Dryomov 				       struct ceph_osdmap *map)
157063a6993fSIlya Dryomov {
157163a6993fSIlya Dryomov 	u32 n;
157263a6993fSIlya Dryomov 
157363a6993fSIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
157463a6993fSIlya Dryomov 	while (n--) {
157563a6993fSIlya Dryomov 		u32 osd, aff;
157663a6993fSIlya Dryomov 		int ret;
157763a6993fSIlya Dryomov 
157863a6993fSIlya Dryomov 		ceph_decode_32_safe(p, end, osd, e_inval);
157963a6993fSIlya Dryomov 		ceph_decode_32_safe(p, end, aff, e_inval);
158063a6993fSIlya Dryomov 
158163a6993fSIlya Dryomov 		ret = set_primary_affinity(map, osd, aff);
158263a6993fSIlya Dryomov 		if (ret)
158363a6993fSIlya Dryomov 			return ret;
1584f31da0f3SIlya Dryomov 
1585*842d6b01SDaichi Mukai 		osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff);
158663a6993fSIlya Dryomov 	}
158763a6993fSIlya Dryomov 
158863a6993fSIlya Dryomov 	return 0;
158963a6993fSIlya Dryomov 
159063a6993fSIlya Dryomov e_inval:
159163a6993fSIlya Dryomov 	return -EINVAL;
159263a6993fSIlya Dryomov }
159363a6993fSIlya Dryomov 
/*
 * Decode one pg_upmap payload by reusing the pg_temp decoder.  The
 * incremental flag is ignored and false is always passed down, so an
 * empty list is never treated as a removal.
 */
static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
						 bool __unused)
{
	const bool incremental = false;

	return __decode_pg_temp(p, end, incremental);
}
15996f428df4SIlya Dryomov 
decode_pg_upmap(void ** p,void * end,struct ceph_osdmap * map)16006f428df4SIlya Dryomov static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
16016f428df4SIlya Dryomov {
16026f428df4SIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
16036f428df4SIlya Dryomov 				 false);
16046f428df4SIlya Dryomov }
16056f428df4SIlya Dryomov 
decode_new_pg_upmap(void ** p,void * end,struct ceph_osdmap * map)16066f428df4SIlya Dryomov static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
16076f428df4SIlya Dryomov {
16086f428df4SIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
16096f428df4SIlya Dryomov 				 true);
16106f428df4SIlya Dryomov }
16116f428df4SIlya Dryomov 
decode_old_pg_upmap(void ** p,void * end,struct ceph_osdmap * map)16126f428df4SIlya Dryomov static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
16136f428df4SIlya Dryomov {
16146f428df4SIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
16156f428df4SIlya Dryomov }
16166f428df4SIlya Dryomov 
/*
 * Decode one pg_upmap_items payload: a u32 count followed by that many
 * pairs of u32 values, stored as from_to[i][0] and from_to[i][1].
 *
 * The incremental flag is ignored.  Returns the new mapping (pgid not
 * yet set) or ERR_PTR(-EINVAL / -ENOMEM).
 */
static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
						       bool __unused)
{
	struct ceph_pg_mapping *pg;
	u32 nr_pairs, idx;

	ceph_decode_32_safe(p, end, nr_pairs, e_inval);

	/* keep sizeof(*pg) + 2 * nr_pairs * sizeof(u32) within size_t */
	if (nr_pairs > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
		goto e_inval;

	ceph_decode_need(p, end, 2 * nr_pairs * sizeof(u32), e_inval);
	pg = alloc_pg_mapping(2 * nr_pairs * sizeof(u32));
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->pg_upmap_items.len = nr_pairs;
	for (idx = 0; idx < nr_pairs; idx++) {
		pg->pg_upmap_items.from_to[idx][0] = ceph_decode_32(p);
		pg->pg_upmap_items.from_to[idx][1] = ceph_decode_32(p);
	}

	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
16436f428df4SIlya Dryomov 
decode_pg_upmap_items(void ** p,void * end,struct ceph_osdmap * map)16446f428df4SIlya Dryomov static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
16456f428df4SIlya Dryomov {
16466f428df4SIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_upmap_items,
16476f428df4SIlya Dryomov 				 __decode_pg_upmap_items, false);
16486f428df4SIlya Dryomov }
16496f428df4SIlya Dryomov 
decode_new_pg_upmap_items(void ** p,void * end,struct ceph_osdmap * map)16506f428df4SIlya Dryomov static int decode_new_pg_upmap_items(void **p, void *end,
16516f428df4SIlya Dryomov 				     struct ceph_osdmap *map)
16526f428df4SIlya Dryomov {
16536f428df4SIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_upmap_items,
16546f428df4SIlya Dryomov 				 __decode_pg_upmap_items, true);
16556f428df4SIlya Dryomov }
16566f428df4SIlya Dryomov 
decode_old_pg_upmap_items(void ** p,void * end,struct ceph_osdmap * map)16576f428df4SIlya Dryomov static int decode_old_pg_upmap_items(void **p, void *end,
16586f428df4SIlya Dryomov 				     struct ceph_osdmap *map)
16596f428df4SIlya Dryomov {
16606f428df4SIlya Dryomov 	return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
16616f428df4SIlya Dryomov }
16626f428df4SIlya Dryomov 
16633d14c5d2SYehuda Sadeh /*
16643d14c5d2SYehuda Sadeh  * decode a full map.
16653d14c5d2SYehuda Sadeh  */
osdmap_decode(void ** p,void * end,bool msgr2,struct ceph_osdmap * map)1666a5cbd5fcSIlya Dryomov static int osdmap_decode(void **p, void *end, bool msgr2,
1667a5cbd5fcSIlya Dryomov 			 struct ceph_osdmap *map)
16683d14c5d2SYehuda Sadeh {
1669ec7af972SIlya Dryomov 	u8 struct_v;
167038a8d560SIlya Dryomov 	u32 epoch = 0;
16713d14c5d2SYehuda Sadeh 	void *start = *p;
16723977058cSIlya Dryomov 	u32 max;
16733977058cSIlya Dryomov 	u32 len, i;
1674597b52f6SIlya Dryomov 	int err;
16753d14c5d2SYehuda Sadeh 
1676a2505d63SIlya Dryomov 	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
16773d14c5d2SYehuda Sadeh 
1678ec7af972SIlya Dryomov 	err = get_osdmap_client_data_v(p, end, "full", &struct_v);
1679ec7af972SIlya Dryomov 	if (err)
1680ec7af972SIlya Dryomov 		goto bad;
16813d14c5d2SYehuda Sadeh 
168253bbaba9SIlya Dryomov 	/* fsid, epoch, created, modified */
168353bbaba9SIlya Dryomov 	ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
168453bbaba9SIlya Dryomov 			 sizeof(map->created) + sizeof(map->modified), e_inval);
16853d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
168638a8d560SIlya Dryomov 	epoch = map->epoch = ceph_decode_32(p);
16873d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &map->created, sizeof(map->created));
16883d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
16893d14c5d2SYehuda Sadeh 
1690433fbdd3SIlya Dryomov 	/* pools */
1691433fbdd3SIlya Dryomov 	err = decode_pools(p, end, map);
1692433fbdd3SIlya Dryomov 	if (err)
16933d14c5d2SYehuda Sadeh 		goto bad;
16943d14c5d2SYehuda Sadeh 
16950f70c7eeSIlya Dryomov 	/* pool_name */
16960f70c7eeSIlya Dryomov 	err = decode_pool_names(p, end, map);
1697597b52f6SIlya Dryomov 	if (err)
16983d14c5d2SYehuda Sadeh 		goto bad;
16993d14c5d2SYehuda Sadeh 
1700597b52f6SIlya Dryomov 	ceph_decode_32_safe(p, end, map->pool_max, e_inval);
17013d14c5d2SYehuda Sadeh 
1702597b52f6SIlya Dryomov 	ceph_decode_32_safe(p, end, map->flags, e_inval);
17033d14c5d2SYehuda Sadeh 
17043977058cSIlya Dryomov 	/* max_osd */
17053977058cSIlya Dryomov 	ceph_decode_32_safe(p, end, max, e_inval);
17063d14c5d2SYehuda Sadeh 
17073d14c5d2SYehuda Sadeh 	/* (re)alloc osd arrays */
17083d14c5d2SYehuda Sadeh 	err = osdmap_set_max_osd(map, max);
1709597b52f6SIlya Dryomov 	if (err)
17103d14c5d2SYehuda Sadeh 		goto bad;
17113d14c5d2SYehuda Sadeh 
17122d88b2e0SIlya Dryomov 	/* osd_state, osd_weight, osd_addrs->client_addr */
17133d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 3*sizeof(u32) +
1714dcbc919aSJeff Layton 			 map->max_osd*(struct_v >= 5 ? sizeof(u32) :
17150bb05da2SIlya Dryomov 						       sizeof(u8)) +
1716dcbc919aSJeff Layton 				       sizeof(*map->osd_weight), e_inval);
17172d88b2e0SIlya Dryomov 	if (ceph_decode_32(p) != map->max_osd)
17182d88b2e0SIlya Dryomov 		goto e_inval;
17192d88b2e0SIlya Dryomov 
17200bb05da2SIlya Dryomov 	if (struct_v >= 5) {
17210bb05da2SIlya Dryomov 		for (i = 0; i < map->max_osd; i++)
17220bb05da2SIlya Dryomov 			map->osd_state[i] = ceph_decode_32(p);
17230bb05da2SIlya Dryomov 	} else {
17240bb05da2SIlya Dryomov 		for (i = 0; i < map->max_osd; i++)
17250bb05da2SIlya Dryomov 			map->osd_state[i] = ceph_decode_8(p);
17260bb05da2SIlya Dryomov 	}
17273d14c5d2SYehuda Sadeh 
17282d88b2e0SIlya Dryomov 	if (ceph_decode_32(p) != map->max_osd)
17292d88b2e0SIlya Dryomov 		goto e_inval;
17302d88b2e0SIlya Dryomov 
17313d14c5d2SYehuda Sadeh 	for (i = 0; i < map->max_osd; i++)
17323d14c5d2SYehuda Sadeh 		map->osd_weight[i] = ceph_decode_32(p);
17333d14c5d2SYehuda Sadeh 
17342d88b2e0SIlya Dryomov 	if (ceph_decode_32(p) != map->max_osd)
17352d88b2e0SIlya Dryomov 		goto e_inval;
17362d88b2e0SIlya Dryomov 
1737dcbc919aSJeff Layton 	for (i = 0; i < map->max_osd; i++) {
1738a5cbd5fcSIlya Dryomov 		struct ceph_entity_addr *addr = &map->osd_addr[i];
1739a5cbd5fcSIlya Dryomov 
1740a5cbd5fcSIlya Dryomov 		if (struct_v >= 8)
1741a5cbd5fcSIlya Dryomov 			err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
1742a5cbd5fcSIlya Dryomov 		else
1743a5cbd5fcSIlya Dryomov 			err = ceph_decode_entity_addr(p, end, addr);
1744dcbc919aSJeff Layton 		if (err)
1745dcbc919aSJeff Layton 			goto bad;
1746a5cbd5fcSIlya Dryomov 
1747a5cbd5fcSIlya Dryomov 		dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
1748dcbc919aSJeff Layton 	}
17493d14c5d2SYehuda Sadeh 
17503d14c5d2SYehuda Sadeh 	/* pg_temp */
175110db634eSIlya Dryomov 	err = decode_pg_temp(p, end, map);
1752d6c0dd6bSSage Weil 	if (err)
1753d6c0dd6bSSage Weil 		goto bad;
17543d14c5d2SYehuda Sadeh 
1755d286de79SIlya Dryomov 	/* primary_temp */
1756d286de79SIlya Dryomov 	if (struct_v >= 1) {
1757d286de79SIlya Dryomov 		err = decode_primary_temp(p, end, map);
1758d286de79SIlya Dryomov 		if (err)
1759d286de79SIlya Dryomov 			goto bad;
1760d286de79SIlya Dryomov 	}
1761d286de79SIlya Dryomov 
176263a6993fSIlya Dryomov 	/* primary_affinity */
176363a6993fSIlya Dryomov 	if (struct_v >= 2) {
176463a6993fSIlya Dryomov 		err = decode_primary_affinity(p, end, map);
176563a6993fSIlya Dryomov 		if (err)
176663a6993fSIlya Dryomov 			goto bad;
176763a6993fSIlya Dryomov 	} else {
17686f428df4SIlya Dryomov 		WARN_ON(map->osd_primary_affinity);
176963a6993fSIlya Dryomov 	}
177063a6993fSIlya Dryomov 
17713d14c5d2SYehuda Sadeh 	/* crush */
1772597b52f6SIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
17731b6a78b5SIlya Dryomov 	err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
17741b6a78b5SIlya Dryomov 	if (err)
17753d14c5d2SYehuda Sadeh 		goto bad;
17763d14c5d2SYehuda Sadeh 
17776f428df4SIlya Dryomov 	*p += len;
17786f428df4SIlya Dryomov 	if (struct_v >= 3) {
17796f428df4SIlya Dryomov 		/* erasure_code_profiles */
17806f428df4SIlya Dryomov 		ceph_decode_skip_map_of_map(p, end, string, string, string,
178100c8ebb3SDan Carpenter 					    e_inval);
17826f428df4SIlya Dryomov 	}
17836f428df4SIlya Dryomov 
17846f428df4SIlya Dryomov 	if (struct_v >= 4) {
17856f428df4SIlya Dryomov 		err = decode_pg_upmap(p, end, map);
17866f428df4SIlya Dryomov 		if (err)
17876f428df4SIlya Dryomov 			goto bad;
17886f428df4SIlya Dryomov 
17896f428df4SIlya Dryomov 		err = decode_pg_upmap_items(p, end, map);
17906f428df4SIlya Dryomov 		if (err)
17916f428df4SIlya Dryomov 			goto bad;
17926f428df4SIlya Dryomov 	} else {
17936f428df4SIlya Dryomov 		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
17946f428df4SIlya Dryomov 		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
17956f428df4SIlya Dryomov 	}
17966f428df4SIlya Dryomov 
179738a8d560SIlya Dryomov 	/* ignore the rest */
17983d14c5d2SYehuda Sadeh 	*p = end;
17993d14c5d2SYehuda Sadeh 
180038a8d560SIlya Dryomov 	dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
1801a2505d63SIlya Dryomov 	return 0;
18023d14c5d2SYehuda Sadeh 
1803597b52f6SIlya Dryomov e_inval:
1804597b52f6SIlya Dryomov 	err = -EINVAL;
18053d14c5d2SYehuda Sadeh bad:
180638a8d560SIlya Dryomov 	pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
180738a8d560SIlya Dryomov 	       err, epoch, (int)(*p - start), *p, start, end);
180838a8d560SIlya Dryomov 	print_hex_dump(KERN_DEBUG, "osdmap: ",
180938a8d560SIlya Dryomov 		       DUMP_PREFIX_OFFSET, 16, 1,
181038a8d560SIlya Dryomov 		       start, end - start, true);
1811a2505d63SIlya Dryomov 	return err;
1812a2505d63SIlya Dryomov }
1813a2505d63SIlya Dryomov 
1814a2505d63SIlya Dryomov /*
1815a2505d63SIlya Dryomov  * Allocate and decode a full map.
1816a2505d63SIlya Dryomov  */
ceph_osdmap_decode(void ** p,void * end,bool msgr2)1817a5cbd5fcSIlya Dryomov struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
1818a2505d63SIlya Dryomov {
1819a2505d63SIlya Dryomov 	struct ceph_osdmap *map;
1820a2505d63SIlya Dryomov 	int ret;
1821a2505d63SIlya Dryomov 
1822e5253a7bSIlya Dryomov 	map = ceph_osdmap_alloc();
1823a2505d63SIlya Dryomov 	if (!map)
1824a2505d63SIlya Dryomov 		return ERR_PTR(-ENOMEM);
1825a2505d63SIlya Dryomov 
1826a5cbd5fcSIlya Dryomov 	ret = osdmap_decode(p, end, msgr2, map);
1827a2505d63SIlya Dryomov 	if (ret) {
18283d14c5d2SYehuda Sadeh 		ceph_osdmap_destroy(map);
1829a2505d63SIlya Dryomov 		return ERR_PTR(ret);
1830a2505d63SIlya Dryomov 	}
1831a2505d63SIlya Dryomov 
1832a2505d63SIlya Dryomov 	return map;
18333d14c5d2SYehuda Sadeh }
18343d14c5d2SYehuda Sadeh 
18353d14c5d2SYehuda Sadeh /*
1836930c5328SIlya Dryomov  * Encoding order is (new_up_client, new_state, new_weight).  Need to
1837930c5328SIlya Dryomov  * apply in the (new_weight, new_state, new_up_client) order, because
1838930c5328SIlya Dryomov  * an incremental map may look like e.g.
1839930c5328SIlya Dryomov  *
1840930c5328SIlya Dryomov  *     new_up_client: { osd=6, addr=... } # set osd_state and addr
1841930c5328SIlya Dryomov  *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
1842930c5328SIlya Dryomov  */
decode_new_up_state_weight(void ** p,void * end,u8 struct_v,bool msgr2,struct ceph_osdmap * map)18430bb05da2SIlya Dryomov static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
1844a5cbd5fcSIlya Dryomov 				      bool msgr2, struct ceph_osdmap *map)
1845930c5328SIlya Dryomov {
1846930c5328SIlya Dryomov 	void *new_up_client;
1847930c5328SIlya Dryomov 	void *new_state;
1848930c5328SIlya Dryomov 	void *new_weight_end;
1849930c5328SIlya Dryomov 	u32 len;
1850a5cbd5fcSIlya Dryomov 	int ret;
18518cb5f2b4SJeff Layton 	int i;
1852930c5328SIlya Dryomov 
1853930c5328SIlya Dryomov 	new_up_client = *p;
1854930c5328SIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
18558cb5f2b4SJeff Layton 	for (i = 0; i < len; ++i) {
18568cb5f2b4SJeff Layton 		struct ceph_entity_addr addr;
18578cb5f2b4SJeff Layton 
18588cb5f2b4SJeff Layton 		ceph_decode_skip_32(p, end, e_inval);
1859a5cbd5fcSIlya Dryomov 		if (struct_v >= 7)
1860a5cbd5fcSIlya Dryomov 			ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
1861a5cbd5fcSIlya Dryomov 		else
1862a5cbd5fcSIlya Dryomov 			ret = ceph_decode_entity_addr(p, end, &addr);
1863a5cbd5fcSIlya Dryomov 		if (ret)
1864a5cbd5fcSIlya Dryomov 			return ret;
18658cb5f2b4SJeff Layton 	}
1866930c5328SIlya Dryomov 
1867930c5328SIlya Dryomov 	new_state = *p;
1868930c5328SIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
18690bb05da2SIlya Dryomov 	len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
1870930c5328SIlya Dryomov 	ceph_decode_need(p, end, len, e_inval);
1871930c5328SIlya Dryomov 	*p += len;
1872930c5328SIlya Dryomov 
1873930c5328SIlya Dryomov 	/* new_weight */
1874930c5328SIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
1875930c5328SIlya Dryomov 	while (len--) {
1876930c5328SIlya Dryomov 		s32 osd;
1877930c5328SIlya Dryomov 		u32 w;
1878930c5328SIlya Dryomov 
1879930c5328SIlya Dryomov 		ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
1880930c5328SIlya Dryomov 		osd = ceph_decode_32(p);
1881930c5328SIlya Dryomov 		w = ceph_decode_32(p);
1882930c5328SIlya Dryomov 		BUG_ON(osd >= map->max_osd);
1883*842d6b01SDaichi Mukai 		osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w,
1884930c5328SIlya Dryomov 			    w == CEPH_OSD_IN ? "(in)" :
1885930c5328SIlya Dryomov 			    (w == CEPH_OSD_OUT ? "(out)" : ""));
1886930c5328SIlya Dryomov 		map->osd_weight[osd] = w;
1887930c5328SIlya Dryomov 
1888930c5328SIlya Dryomov 		/*
1889930c5328SIlya Dryomov 		 * If we are marking in, set the EXISTS, and clear the
1890930c5328SIlya Dryomov 		 * AUTOOUT and NEW bits.
1891930c5328SIlya Dryomov 		 */
1892930c5328SIlya Dryomov 		if (w) {
1893930c5328SIlya Dryomov 			map->osd_state[osd] |= CEPH_OSD_EXISTS;
1894930c5328SIlya Dryomov 			map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
1895930c5328SIlya Dryomov 						 CEPH_OSD_NEW);
1896930c5328SIlya Dryomov 		}
1897930c5328SIlya Dryomov 	}
1898930c5328SIlya Dryomov 	new_weight_end = *p;
1899930c5328SIlya Dryomov 
1900930c5328SIlya Dryomov 	/* new_state (up/down) */
1901930c5328SIlya Dryomov 	*p = new_state;
1902930c5328SIlya Dryomov 	len = ceph_decode_32(p);
1903930c5328SIlya Dryomov 	while (len--) {
1904930c5328SIlya Dryomov 		s32 osd;
19050bb05da2SIlya Dryomov 		u32 xorstate;
1906930c5328SIlya Dryomov 
1907930c5328SIlya Dryomov 		osd = ceph_decode_32(p);
19080bb05da2SIlya Dryomov 		if (struct_v >= 5)
19090bb05da2SIlya Dryomov 			xorstate = ceph_decode_32(p);
19100bb05da2SIlya Dryomov 		else
1911930c5328SIlya Dryomov 			xorstate = ceph_decode_8(p);
1912930c5328SIlya Dryomov 		if (xorstate == 0)
1913930c5328SIlya Dryomov 			xorstate = CEPH_OSD_UP;
1914930c5328SIlya Dryomov 		BUG_ON(osd >= map->max_osd);
1915930c5328SIlya Dryomov 		if ((map->osd_state[osd] & CEPH_OSD_UP) &&
1916930c5328SIlya Dryomov 		    (xorstate & CEPH_OSD_UP))
1917*842d6b01SDaichi Mukai 			osdmap_info(map, "osd%d down\n", osd);
1918930c5328SIlya Dryomov 		if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1919930c5328SIlya Dryomov 		    (xorstate & CEPH_OSD_EXISTS)) {
1920*842d6b01SDaichi Mukai 			osdmap_info(map, "osd%d does not exist\n", osd);
1921930c5328SIlya Dryomov 			ret = set_primary_affinity(map, osd,
1922930c5328SIlya Dryomov 						   CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1923930c5328SIlya Dryomov 			if (ret)
1924930c5328SIlya Dryomov 				return ret;
1925930c5328SIlya Dryomov 			memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
1926930c5328SIlya Dryomov 			map->osd_state[osd] = 0;
1927930c5328SIlya Dryomov 		} else {
1928930c5328SIlya Dryomov 			map->osd_state[osd] ^= xorstate;
1929930c5328SIlya Dryomov 		}
1930930c5328SIlya Dryomov 	}
1931930c5328SIlya Dryomov 
1932930c5328SIlya Dryomov 	/* new_up_client */
1933930c5328SIlya Dryomov 	*p = new_up_client;
1934930c5328SIlya Dryomov 	len = ceph_decode_32(p);
1935930c5328SIlya Dryomov 	while (len--) {
1936930c5328SIlya Dryomov 		s32 osd;
1937930c5328SIlya Dryomov 		struct ceph_entity_addr addr;
1938930c5328SIlya Dryomov 
1939930c5328SIlya Dryomov 		osd = ceph_decode_32(p);
1940930c5328SIlya Dryomov 		BUG_ON(osd >= map->max_osd);
1941a5cbd5fcSIlya Dryomov 		if (struct_v >= 7)
1942a5cbd5fcSIlya Dryomov 			ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
1943a5cbd5fcSIlya Dryomov 		else
1944a5cbd5fcSIlya Dryomov 			ret = ceph_decode_entity_addr(p, end, &addr);
1945a5cbd5fcSIlya Dryomov 		if (ret)
1946a5cbd5fcSIlya Dryomov 			return ret;
1947a5cbd5fcSIlya Dryomov 
1948a5cbd5fcSIlya Dryomov 		dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
1949a5cbd5fcSIlya Dryomov 
1950*842d6b01SDaichi Mukai 		osdmap_info(map, "osd%d up\n", osd);
1951930c5328SIlya Dryomov 		map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1952930c5328SIlya Dryomov 		map->osd_addr[osd] = addr;
1953930c5328SIlya Dryomov 	}
1954930c5328SIlya Dryomov 
1955930c5328SIlya Dryomov 	*p = new_weight_end;
1956930c5328SIlya Dryomov 	return 0;
1957930c5328SIlya Dryomov 
1958930c5328SIlya Dryomov e_inval:
1959930c5328SIlya Dryomov 	return -EINVAL;
1960930c5328SIlya Dryomov }
1961930c5328SIlya Dryomov 
1962930c5328SIlya Dryomov /*
19633d14c5d2SYehuda Sadeh  * decode and apply an incremental map update.
19643d14c5d2SYehuda Sadeh  */
osdmap_apply_incremental(void ** p,void * end,bool msgr2,struct ceph_osdmap * map)1965a5cbd5fcSIlya Dryomov struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
19660c0a8de1SIlya Dryomov 					     struct ceph_osdmap *map)
19673d14c5d2SYehuda Sadeh {
19683d14c5d2SYehuda Sadeh 	struct ceph_fsid fsid;
19693d14c5d2SYehuda Sadeh 	u32 epoch = 0;
19703d14c5d2SYehuda Sadeh 	struct ceph_timespec modified;
19714f6a7e5eSSage Weil 	s32 len;
19724f6a7e5eSSage Weil 	u64 pool;
19734f6a7e5eSSage Weil 	__s64 new_pool_max;
19744f6a7e5eSSage Weil 	__s32 new_flags, max;
19753d14c5d2SYehuda Sadeh 	void *start = *p;
197686f1742bSIlya Dryomov 	int err;
1977ec7af972SIlya Dryomov 	u8 struct_v;
19783d14c5d2SYehuda Sadeh 
197938a8d560SIlya Dryomov 	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
198038a8d560SIlya Dryomov 
1981ec7af972SIlya Dryomov 	err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
1982ec7af972SIlya Dryomov 	if (err)
1983ec7af972SIlya Dryomov 		goto bad;
19843d14c5d2SYehuda Sadeh 
198553bbaba9SIlya Dryomov 	/* fsid, epoch, modified, new_pool_max, new_flags */
198653bbaba9SIlya Dryomov 	ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
198753bbaba9SIlya Dryomov 			 sizeof(u64) + sizeof(u32), e_inval);
19883d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &fsid, sizeof(fsid));
19893d14c5d2SYehuda Sadeh 	epoch = ceph_decode_32(p);
19903d14c5d2SYehuda Sadeh 	BUG_ON(epoch != map->epoch+1);
19913d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &modified, sizeof(modified));
19924f6a7e5eSSage Weil 	new_pool_max = ceph_decode_64(p);
19933d14c5d2SYehuda Sadeh 	new_flags = ceph_decode_32(p);
19943d14c5d2SYehuda Sadeh 
19953d14c5d2SYehuda Sadeh 	/* full map? */
199686f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
19973d14c5d2SYehuda Sadeh 	if (len > 0) {
19983d14c5d2SYehuda Sadeh 		dout("apply_incremental full map len %d, %p to %p\n",
19993d14c5d2SYehuda Sadeh 		     len, *p, end);
2000a5cbd5fcSIlya Dryomov 		return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
20013d14c5d2SYehuda Sadeh 	}
20023d14c5d2SYehuda Sadeh 
20033d14c5d2SYehuda Sadeh 	/* new crush? */
200486f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
20053d14c5d2SYehuda Sadeh 	if (len > 0) {
20061b6a78b5SIlya Dryomov 		err = osdmap_set_crush(map,
20071b6a78b5SIlya Dryomov 				       crush_decode(*p, min(*p + len, end)));
20081b6a78b5SIlya Dryomov 		if (err)
200986f1742bSIlya Dryomov 			goto bad;
20103d14c5d2SYehuda Sadeh 		*p += len;
20113d14c5d2SYehuda Sadeh 	}
20123d14c5d2SYehuda Sadeh 
20133d14c5d2SYehuda Sadeh 	/* new flags? */
20143d14c5d2SYehuda Sadeh 	if (new_flags >= 0)
20153d14c5d2SYehuda Sadeh 		map->flags = new_flags;
20163d14c5d2SYehuda Sadeh 	if (new_pool_max >= 0)
20173d14c5d2SYehuda Sadeh 		map->pool_max = new_pool_max;
20183d14c5d2SYehuda Sadeh 
20193d14c5d2SYehuda Sadeh 	/* new max? */
202053bbaba9SIlya Dryomov 	ceph_decode_32_safe(p, end, max, e_inval);
20213d14c5d2SYehuda Sadeh 	if (max >= 0) {
20223d14c5d2SYehuda Sadeh 		err = osdmap_set_max_osd(map, max);
202386f1742bSIlya Dryomov 		if (err)
20243d14c5d2SYehuda Sadeh 			goto bad;
20253d14c5d2SYehuda Sadeh 	}
20263d14c5d2SYehuda Sadeh 
20273d14c5d2SYehuda Sadeh 	map->epoch++;
202831456665SSage Weil 	map->modified = modified;
20293d14c5d2SYehuda Sadeh 
2030433fbdd3SIlya Dryomov 	/* new_pools */
2031433fbdd3SIlya Dryomov 	err = decode_new_pools(p, end, map);
2032433fbdd3SIlya Dryomov 	if (err)
20333d14c5d2SYehuda Sadeh 		goto bad;
20349464d008SIlya Dryomov 
20350f70c7eeSIlya Dryomov 	/* new_pool_names */
20360f70c7eeSIlya Dryomov 	err = decode_pool_names(p, end, map);
203786f1742bSIlya Dryomov 	if (err)
20383d14c5d2SYehuda Sadeh 		goto bad;
20393d14c5d2SYehuda Sadeh 
20403d14c5d2SYehuda Sadeh 	/* old_pool */
204186f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
20423d14c5d2SYehuda Sadeh 	while (len--) {
20433d14c5d2SYehuda Sadeh 		struct ceph_pg_pool_info *pi;
20443d14c5d2SYehuda Sadeh 
204586f1742bSIlya Dryomov 		ceph_decode_64_safe(p, end, pool, e_inval);
20468a4b863cSIlya Dryomov 		pi = lookup_pg_pool(&map->pg_pools, pool);
20473d14c5d2SYehuda Sadeh 		if (pi)
20483d14c5d2SYehuda Sadeh 			__remove_pg_pool(&map->pg_pools, pi);
20493d14c5d2SYehuda Sadeh 	}
20503d14c5d2SYehuda Sadeh 
2051930c5328SIlya Dryomov 	/* new_up_client, new_state, new_weight */
2052a5cbd5fcSIlya Dryomov 	err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
2053930c5328SIlya Dryomov 	if (err)
2054930c5328SIlya Dryomov 		goto bad;
20553d14c5d2SYehuda Sadeh 
20563d14c5d2SYehuda Sadeh 	/* new_pg_temp */
205710db634eSIlya Dryomov 	err = decode_new_pg_temp(p, end, map);
2058d6c0dd6bSSage Weil 	if (err)
2059d6c0dd6bSSage Weil 		goto bad;
20603d14c5d2SYehuda Sadeh 
2061d286de79SIlya Dryomov 	/* new_primary_temp */
2062d286de79SIlya Dryomov 	if (struct_v >= 1) {
2063d286de79SIlya Dryomov 		err = decode_new_primary_temp(p, end, map);
2064d286de79SIlya Dryomov 		if (err)
2065d286de79SIlya Dryomov 			goto bad;
2066d286de79SIlya Dryomov 	}
2067d286de79SIlya Dryomov 
206863a6993fSIlya Dryomov 	/* new_primary_affinity */
206963a6993fSIlya Dryomov 	if (struct_v >= 2) {
207063a6993fSIlya Dryomov 		err = decode_new_primary_affinity(p, end, map);
207163a6993fSIlya Dryomov 		if (err)
207263a6993fSIlya Dryomov 			goto bad;
207363a6993fSIlya Dryomov 	}
207463a6993fSIlya Dryomov 
20756f428df4SIlya Dryomov 	if (struct_v >= 3) {
20766f428df4SIlya Dryomov 		/* new_erasure_code_profiles */
20776f428df4SIlya Dryomov 		ceph_decode_skip_map_of_map(p, end, string, string, string,
207800c8ebb3SDan Carpenter 					    e_inval);
20796f428df4SIlya Dryomov 		/* old_erasure_code_profiles */
208000c8ebb3SDan Carpenter 		ceph_decode_skip_set(p, end, string, e_inval);
20816f428df4SIlya Dryomov 	}
20826f428df4SIlya Dryomov 
20836f428df4SIlya Dryomov 	if (struct_v >= 4) {
20846f428df4SIlya Dryomov 		err = decode_new_pg_upmap(p, end, map);
20856f428df4SIlya Dryomov 		if (err)
20866f428df4SIlya Dryomov 			goto bad;
20876f428df4SIlya Dryomov 
20886f428df4SIlya Dryomov 		err = decode_old_pg_upmap(p, end, map);
20896f428df4SIlya Dryomov 		if (err)
20906f428df4SIlya Dryomov 			goto bad;
20916f428df4SIlya Dryomov 
20926f428df4SIlya Dryomov 		err = decode_new_pg_upmap_items(p, end, map);
20936f428df4SIlya Dryomov 		if (err)
20946f428df4SIlya Dryomov 			goto bad;
20956f428df4SIlya Dryomov 
20966f428df4SIlya Dryomov 		err = decode_old_pg_upmap_items(p, end, map);
20976f428df4SIlya Dryomov 		if (err)
20986f428df4SIlya Dryomov 			goto bad;
20996f428df4SIlya Dryomov 	}
21006f428df4SIlya Dryomov 
21013d14c5d2SYehuda Sadeh 	/* ignore the rest */
21023d14c5d2SYehuda Sadeh 	*p = end;
210338a8d560SIlya Dryomov 
210438a8d560SIlya Dryomov 	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
21053d14c5d2SYehuda Sadeh 	return map;
21063d14c5d2SYehuda Sadeh 
210786f1742bSIlya Dryomov e_inval:
210886f1742bSIlya Dryomov 	err = -EINVAL;
21093d14c5d2SYehuda Sadeh bad:
211038a8d560SIlya Dryomov 	pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
211138a8d560SIlya Dryomov 	       err, epoch, (int)(*p - start), *p, start, end);
21123d14c5d2SYehuda Sadeh 	print_hex_dump(KERN_DEBUG, "osdmap: ",
21133d14c5d2SYehuda Sadeh 		       DUMP_PREFIX_OFFSET, 16, 1,
21143d14c5d2SYehuda Sadeh 		       start, end - start, true);
21153d14c5d2SYehuda Sadeh 	return ERR_PTR(err);
21163d14c5d2SYehuda Sadeh }
21173d14c5d2SYehuda Sadeh 
ceph_oloc_copy(struct ceph_object_locator * dest,const struct ceph_object_locator * src)211830c156d9SYan, Zheng void ceph_oloc_copy(struct ceph_object_locator *dest,
211930c156d9SYan, Zheng 		    const struct ceph_object_locator *src)
212030c156d9SYan, Zheng {
2121ca35ffeaSIlya Dryomov 	ceph_oloc_destroy(dest);
212230c156d9SYan, Zheng 
212330c156d9SYan, Zheng 	dest->pool = src->pool;
212430c156d9SYan, Zheng 	if (src->pool_ns)
212530c156d9SYan, Zheng 		dest->pool_ns = ceph_get_string(src->pool_ns);
2126ca35ffeaSIlya Dryomov 	else
2127ca35ffeaSIlya Dryomov 		dest->pool_ns = NULL;
212830c156d9SYan, Zheng }
212930c156d9SYan, Zheng EXPORT_SYMBOL(ceph_oloc_copy);
213030c156d9SYan, Zheng 
ceph_oloc_destroy(struct ceph_object_locator * oloc)213130c156d9SYan, Zheng void ceph_oloc_destroy(struct ceph_object_locator *oloc)
213230c156d9SYan, Zheng {
213330c156d9SYan, Zheng 	ceph_put_string(oloc->pool_ns);
213430c156d9SYan, Zheng }
213530c156d9SYan, Zheng EXPORT_SYMBOL(ceph_oloc_destroy);
213630c156d9SYan, Zheng 
ceph_oid_copy(struct ceph_object_id * dest,const struct ceph_object_id * src)2137d30291b9SIlya Dryomov void ceph_oid_copy(struct ceph_object_id *dest,
2138d30291b9SIlya Dryomov 		   const struct ceph_object_id *src)
2139d30291b9SIlya Dryomov {
2140ca35ffeaSIlya Dryomov 	ceph_oid_destroy(dest);
21413d14c5d2SYehuda Sadeh 
2142d30291b9SIlya Dryomov 	if (src->name != src->inline_name) {
2143d30291b9SIlya Dryomov 		/* very rare, see ceph_object_id definition */
2144d30291b9SIlya Dryomov 		dest->name = kmalloc(src->name_len + 1,
2145d30291b9SIlya Dryomov 				     GFP_NOIO | __GFP_NOFAIL);
2146ca35ffeaSIlya Dryomov 	} else {
2147ca35ffeaSIlya Dryomov 		dest->name = dest->inline_name;
2148d30291b9SIlya Dryomov 	}
2149d30291b9SIlya Dryomov 	memcpy(dest->name, src->name, src->name_len + 1);
2150d30291b9SIlya Dryomov 	dest->name_len = src->name_len;
2151d30291b9SIlya Dryomov }
2152d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_copy);
2153d30291b9SIlya Dryomov 
2154d30291b9SIlya Dryomov static __printf(2, 0)
oid_printf_vargs(struct ceph_object_id * oid,const char * fmt,va_list ap)2155d30291b9SIlya Dryomov int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
2156d30291b9SIlya Dryomov {
2157d30291b9SIlya Dryomov 	int len;
2158d30291b9SIlya Dryomov 
2159d30291b9SIlya Dryomov 	WARN_ON(!ceph_oid_empty(oid));
2160d30291b9SIlya Dryomov 
2161d30291b9SIlya Dryomov 	len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
2162d30291b9SIlya Dryomov 	if (len >= sizeof(oid->inline_name))
2163d30291b9SIlya Dryomov 		return len;
2164d30291b9SIlya Dryomov 
2165d30291b9SIlya Dryomov 	oid->name_len = len;
2166d30291b9SIlya Dryomov 	return 0;
2167d30291b9SIlya Dryomov }
2168d30291b9SIlya Dryomov 
2169d30291b9SIlya Dryomov /*
2170d30291b9SIlya Dryomov  * If oid doesn't fit into inline buffer, BUG.
2171d30291b9SIlya Dryomov  */
ceph_oid_printf(struct ceph_object_id * oid,const char * fmt,...)2172d30291b9SIlya Dryomov void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
2173d30291b9SIlya Dryomov {
2174d30291b9SIlya Dryomov 	va_list ap;
2175d30291b9SIlya Dryomov 
2176d30291b9SIlya Dryomov 	va_start(ap, fmt);
2177d30291b9SIlya Dryomov 	BUG_ON(oid_printf_vargs(oid, fmt, ap));
2178d30291b9SIlya Dryomov 	va_end(ap);
2179d30291b9SIlya Dryomov }
2180d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_printf);
2181d30291b9SIlya Dryomov 
2182d30291b9SIlya Dryomov static __printf(3, 0)
oid_aprintf_vargs(struct ceph_object_id * oid,gfp_t gfp,const char * fmt,va_list ap)2183d30291b9SIlya Dryomov int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
2184d30291b9SIlya Dryomov 		      const char *fmt, va_list ap)
2185d30291b9SIlya Dryomov {
2186d30291b9SIlya Dryomov 	va_list aq;
2187d30291b9SIlya Dryomov 	int len;
2188d30291b9SIlya Dryomov 
2189d30291b9SIlya Dryomov 	va_copy(aq, ap);
2190d30291b9SIlya Dryomov 	len = oid_printf_vargs(oid, fmt, aq);
2191d30291b9SIlya Dryomov 	va_end(aq);
2192d30291b9SIlya Dryomov 
2193d30291b9SIlya Dryomov 	if (len) {
2194d30291b9SIlya Dryomov 		char *external_name;
2195d30291b9SIlya Dryomov 
2196d30291b9SIlya Dryomov 		external_name = kmalloc(len + 1, gfp);
2197d30291b9SIlya Dryomov 		if (!external_name)
2198d30291b9SIlya Dryomov 			return -ENOMEM;
2199d30291b9SIlya Dryomov 
2200d30291b9SIlya Dryomov 		oid->name = external_name;
2201d30291b9SIlya Dryomov 		WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
2202d30291b9SIlya Dryomov 		oid->name_len = len;
2203d30291b9SIlya Dryomov 	}
2204d30291b9SIlya Dryomov 
2205d30291b9SIlya Dryomov 	return 0;
2206d30291b9SIlya Dryomov }
2207d30291b9SIlya Dryomov 
2208d30291b9SIlya Dryomov /*
2209d30291b9SIlya Dryomov  * If oid doesn't fit into inline buffer, allocate.
2210d30291b9SIlya Dryomov  */
ceph_oid_aprintf(struct ceph_object_id * oid,gfp_t gfp,const char * fmt,...)2211d30291b9SIlya Dryomov int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
2212d30291b9SIlya Dryomov 		     const char *fmt, ...)
2213d30291b9SIlya Dryomov {
2214d30291b9SIlya Dryomov 	va_list ap;
2215d30291b9SIlya Dryomov 	int ret;
2216d30291b9SIlya Dryomov 
2217d30291b9SIlya Dryomov 	va_start(ap, fmt);
2218d30291b9SIlya Dryomov 	ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
2219d30291b9SIlya Dryomov 	va_end(ap);
2220d30291b9SIlya Dryomov 
2221d30291b9SIlya Dryomov 	return ret;
2222d30291b9SIlya Dryomov }
2223d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_aprintf);
2224d30291b9SIlya Dryomov 
ceph_oid_destroy(struct ceph_object_id * oid)2225d30291b9SIlya Dryomov void ceph_oid_destroy(struct ceph_object_id *oid)
2226d30291b9SIlya Dryomov {
2227d30291b9SIlya Dryomov 	if (oid->name != oid->inline_name)
2228d30291b9SIlya Dryomov 		kfree(oid->name);
2229d30291b9SIlya Dryomov }
2230d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_destroy);
22313d14c5d2SYehuda Sadeh 
223263244fa1SIlya Dryomov /*
223363244fa1SIlya Dryomov  * osds only
223463244fa1SIlya Dryomov  */
__osds_equal(const struct ceph_osds * lhs,const struct ceph_osds * rhs)223563244fa1SIlya Dryomov static bool __osds_equal(const struct ceph_osds *lhs,
223663244fa1SIlya Dryomov 			 const struct ceph_osds *rhs)
223763244fa1SIlya Dryomov {
223863244fa1SIlya Dryomov 	if (lhs->size == rhs->size &&
223963244fa1SIlya Dryomov 	    !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
224063244fa1SIlya Dryomov 		return true;
224163244fa1SIlya Dryomov 
224263244fa1SIlya Dryomov 	return false;
224363244fa1SIlya Dryomov }
224463244fa1SIlya Dryomov 
224563244fa1SIlya Dryomov /*
224663244fa1SIlya Dryomov  * osds + primary
224763244fa1SIlya Dryomov  */
osds_equal(const struct ceph_osds * lhs,const struct ceph_osds * rhs)224863244fa1SIlya Dryomov static bool osds_equal(const struct ceph_osds *lhs,
224963244fa1SIlya Dryomov 		       const struct ceph_osds *rhs)
225063244fa1SIlya Dryomov {
225163244fa1SIlya Dryomov 	if (__osds_equal(lhs, rhs) &&
225263244fa1SIlya Dryomov 	    lhs->primary == rhs->primary)
225363244fa1SIlya Dryomov 		return true;
225463244fa1SIlya Dryomov 
225563244fa1SIlya Dryomov 	return false;
225663244fa1SIlya Dryomov }
225763244fa1SIlya Dryomov 
osds_valid(const struct ceph_osds * set)22586f3bfd45SIlya Dryomov static bool osds_valid(const struct ceph_osds *set)
22596f3bfd45SIlya Dryomov {
22606f3bfd45SIlya Dryomov 	/* non-empty set */
22616f3bfd45SIlya Dryomov 	if (set->size > 0 && set->primary >= 0)
22626f3bfd45SIlya Dryomov 		return true;
22636f3bfd45SIlya Dryomov 
22646f3bfd45SIlya Dryomov 	/* empty can_shift_osds set */
22656f3bfd45SIlya Dryomov 	if (!set->size && set->primary == -1)
22666f3bfd45SIlya Dryomov 		return true;
22676f3bfd45SIlya Dryomov 
22686f3bfd45SIlya Dryomov 	/* empty !can_shift_osds set - all NONE */
22696f3bfd45SIlya Dryomov 	if (set->size > 0 && set->primary == -1) {
22706f3bfd45SIlya Dryomov 		int i;
22716f3bfd45SIlya Dryomov 
22726f3bfd45SIlya Dryomov 		for (i = 0; i < set->size; i++) {
22736f3bfd45SIlya Dryomov 			if (set->osds[i] != CRUSH_ITEM_NONE)
22746f3bfd45SIlya Dryomov 				break;
22756f3bfd45SIlya Dryomov 		}
22766f3bfd45SIlya Dryomov 		if (i == set->size)
22776f3bfd45SIlya Dryomov 			return true;
22786f3bfd45SIlya Dryomov 	}
22796f3bfd45SIlya Dryomov 
22806f3bfd45SIlya Dryomov 	return false;
22816f3bfd45SIlya Dryomov }
22826f3bfd45SIlya Dryomov 
ceph_osds_copy(struct ceph_osds * dest,const struct ceph_osds * src)22836f3bfd45SIlya Dryomov void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
22846f3bfd45SIlya Dryomov {
22856f3bfd45SIlya Dryomov 	memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
22866f3bfd45SIlya Dryomov 	dest->size = src->size;
22876f3bfd45SIlya Dryomov 	dest->primary = src->primary;
22886f3bfd45SIlya Dryomov }
22896f3bfd45SIlya Dryomov 
ceph_pg_is_split(const struct ceph_pg * pgid,u32 old_pg_num,u32 new_pg_num)22907de030d6SIlya Dryomov bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
229163244fa1SIlya Dryomov 		      u32 new_pg_num)
229263244fa1SIlya Dryomov {
229363244fa1SIlya Dryomov 	int old_bits = calc_bits_of(old_pg_num);
229463244fa1SIlya Dryomov 	int old_mask = (1 << old_bits) - 1;
229563244fa1SIlya Dryomov 	int n;
229663244fa1SIlya Dryomov 
229763244fa1SIlya Dryomov 	WARN_ON(pgid->seed >= old_pg_num);
229863244fa1SIlya Dryomov 	if (new_pg_num <= old_pg_num)
229963244fa1SIlya Dryomov 		return false;
230063244fa1SIlya Dryomov 
230163244fa1SIlya Dryomov 	for (n = 1; ; n++) {
230263244fa1SIlya Dryomov 		int next_bit = n << (old_bits - 1);
230363244fa1SIlya Dryomov 		u32 s = next_bit | pgid->seed;
230463244fa1SIlya Dryomov 
230563244fa1SIlya Dryomov 		if (s < old_pg_num || s == pgid->seed)
230663244fa1SIlya Dryomov 			continue;
230763244fa1SIlya Dryomov 		if (s >= new_pg_num)
230863244fa1SIlya Dryomov 			break;
230963244fa1SIlya Dryomov 
231063244fa1SIlya Dryomov 		s = ceph_stable_mod(s, old_pg_num, old_mask);
231163244fa1SIlya Dryomov 		if (s == pgid->seed)
231263244fa1SIlya Dryomov 			return true;
231363244fa1SIlya Dryomov 	}
231463244fa1SIlya Dryomov 
231563244fa1SIlya Dryomov 	return false;
231663244fa1SIlya Dryomov }
231763244fa1SIlya Dryomov 
/*
 * Decide whether going from the "old" to the "new" parameters starts a
 * new interval for @pgid.  It does if the acting or up set (including
 * primary) changed, if size or min_size changed, if the PG splits
 * between @old_pg_num and @new_pg_num, or if the sort_bitwise or
 * recovery_deletes setting flipped.
 */
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
			  const struct ceph_osds *new_acting,
			  const struct ceph_osds *old_up,
			  const struct ceph_osds *new_up,
			  int old_size,
			  int new_size,
			  int old_min_size,
			  int new_min_size,
			  u32 old_pg_num,
			  u32 new_pg_num,
			  bool old_sort_bitwise,
			  bool new_sort_bitwise,
			  bool old_recovery_deletes,
			  bool new_recovery_deletes,
			  const struct ceph_pg *pgid)
{
	if (!osds_equal(old_acting, new_acting))
		return true;
	if (!osds_equal(old_up, new_up))
		return true;
	if (old_size != new_size)
		return true;
	if (old_min_size != new_min_size)
		return true;
	if (ceph_pg_is_split(pgid, old_pg_num, new_pg_num))
		return true;
	if (old_sort_bitwise != new_sort_bitwise)
		return true;

	return old_recovery_deletes != new_recovery_deletes;
}
234263244fa1SIlya Dryomov 
calc_pg_rank(int osd,const struct ceph_osds * acting)234363244fa1SIlya Dryomov static int calc_pg_rank(int osd, const struct ceph_osds *acting)
234463244fa1SIlya Dryomov {
234563244fa1SIlya Dryomov 	int i;
234663244fa1SIlya Dryomov 
234763244fa1SIlya Dryomov 	for (i = 0; i < acting->size; i++) {
234863244fa1SIlya Dryomov 		if (acting->osds[i] == osd)
234963244fa1SIlya Dryomov 			return i;
235063244fa1SIlya Dryomov 	}
235163244fa1SIlya Dryomov 
235263244fa1SIlya Dryomov 	return -1;
235363244fa1SIlya Dryomov }
235463244fa1SIlya Dryomov 
primary_changed(const struct ceph_osds * old_acting,const struct ceph_osds * new_acting)235563244fa1SIlya Dryomov static bool primary_changed(const struct ceph_osds *old_acting,
235663244fa1SIlya Dryomov 			    const struct ceph_osds *new_acting)
235763244fa1SIlya Dryomov {
235863244fa1SIlya Dryomov 	if (!old_acting->size && !new_acting->size)
235963244fa1SIlya Dryomov 		return false; /* both still empty */
236063244fa1SIlya Dryomov 
236163244fa1SIlya Dryomov 	if (!old_acting->size ^ !new_acting->size)
236263244fa1SIlya Dryomov 		return true; /* was empty, now not, or vice versa */
236363244fa1SIlya Dryomov 
236463244fa1SIlya Dryomov 	if (old_acting->primary != new_acting->primary)
236563244fa1SIlya Dryomov 		return true; /* primary changed */
236663244fa1SIlya Dryomov 
236763244fa1SIlya Dryomov 	if (calc_pg_rank(old_acting->primary, old_acting) !=
236863244fa1SIlya Dryomov 	    calc_pg_rank(new_acting->primary, new_acting))
236963244fa1SIlya Dryomov 		return true;
237063244fa1SIlya Dryomov 
237163244fa1SIlya Dryomov 	return false; /* same primary (tho replicas may have changed) */
237263244fa1SIlya Dryomov }
237363244fa1SIlya Dryomov 
/*
 * Check whether the acting set changed in a way the caller cares about.
 * A primary change always counts; with @any_change set, any difference
 * in the osd arrays counts as well.
 */
bool ceph_osds_changed(const struct ceph_osds *old_acting,
		       const struct ceph_osds *new_acting,
		       bool any_change)
{
	return primary_changed(old_acting, new_acting) ||
	       (any_change && !__osds_equal(old_acting, new_acting));
}
238663244fa1SIlya Dryomov 
23873d14c5d2SYehuda Sadeh /*
2388d9591f5eSIlya Dryomov  * Map an object into a PG.
2389d9591f5eSIlya Dryomov  *
2390d9591f5eSIlya Dryomov  * Should only be called with target_oid and target_oloc (as opposed to
2391d9591f5eSIlya Dryomov  * base_oid and base_oloc), since tiering isn't taken into account.
23923d14c5d2SYehuda Sadeh  */
__ceph_object_locator_to_pg(struct ceph_pg_pool_info * pi,const struct ceph_object_id * oid,const struct ceph_object_locator * oloc,struct ceph_pg * raw_pgid)2393a86f009fSIlya Dryomov void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
2394df28152dSIlya Dryomov 				 const struct ceph_object_id *oid,
2395df28152dSIlya Dryomov 				 const struct ceph_object_locator *oloc,
2396d9591f5eSIlya Dryomov 				 struct ceph_pg *raw_pgid)
23973d14c5d2SYehuda Sadeh {
2398df28152dSIlya Dryomov 	WARN_ON(pi->id != oloc->pool);
23993d14c5d2SYehuda Sadeh 
240030c156d9SYan, Zheng 	if (!oloc->pool_ns) {
2401d9591f5eSIlya Dryomov 		raw_pgid->pool = oloc->pool;
2402d9591f5eSIlya Dryomov 		raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
24037c13cb64SIlya Dryomov 					     oid->name_len);
24044a3262b1SIlya Dryomov 		dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
24054a3262b1SIlya Dryomov 		     raw_pgid->pool, raw_pgid->seed);
240630c156d9SYan, Zheng 	} else {
240730c156d9SYan, Zheng 		char stack_buf[256];
240830c156d9SYan, Zheng 		char *buf = stack_buf;
240930c156d9SYan, Zheng 		int nsl = oloc->pool_ns->len;
241030c156d9SYan, Zheng 		size_t total = nsl + 1 + oid->name_len;
241130c156d9SYan, Zheng 
2412a86f009fSIlya Dryomov 		if (total > sizeof(stack_buf))
2413a86f009fSIlya Dryomov 			buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL);
241430c156d9SYan, Zheng 		memcpy(buf, oloc->pool_ns->str, nsl);
241530c156d9SYan, Zheng 		buf[nsl] = '\037';
241630c156d9SYan, Zheng 		memcpy(buf + nsl + 1, oid->name, oid->name_len);
241730c156d9SYan, Zheng 		raw_pgid->pool = oloc->pool;
241830c156d9SYan, Zheng 		raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
241930c156d9SYan, Zheng 		if (buf != stack_buf)
242030c156d9SYan, Zheng 			kfree(buf);
242130c156d9SYan, Zheng 		dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
242230c156d9SYan, Zheng 		     oid->name, nsl, oloc->pool_ns->str,
242330c156d9SYan, Zheng 		     raw_pgid->pool, raw_pgid->seed);
242430c156d9SYan, Zheng 	}
24253d14c5d2SYehuda Sadeh }
2426df28152dSIlya Dryomov 
ceph_object_locator_to_pg(struct ceph_osdmap * osdmap,const struct ceph_object_id * oid,const struct ceph_object_locator * oloc,struct ceph_pg * raw_pgid)2427df28152dSIlya Dryomov int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
2428df28152dSIlya Dryomov 			      const struct ceph_object_id *oid,
2429df28152dSIlya Dryomov 			      const struct ceph_object_locator *oloc,
2430df28152dSIlya Dryomov 			      struct ceph_pg *raw_pgid)
2431df28152dSIlya Dryomov {
2432df28152dSIlya Dryomov 	struct ceph_pg_pool_info *pi;
2433df28152dSIlya Dryomov 
2434df28152dSIlya Dryomov 	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
2435df28152dSIlya Dryomov 	if (!pi)
2436df28152dSIlya Dryomov 		return -ENOENT;
2437df28152dSIlya Dryomov 
2438a86f009fSIlya Dryomov 	__ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
2439a86f009fSIlya Dryomov 	return 0;
2440df28152dSIlya Dryomov }
2441d9591f5eSIlya Dryomov EXPORT_SYMBOL(ceph_object_locator_to_pg);
24423d14c5d2SYehuda Sadeh 
24436f3bfd45SIlya Dryomov /*
24446f3bfd45SIlya Dryomov  * Map a raw PG (full precision ps) into an actual PG.
24456f3bfd45SIlya Dryomov  */
raw_pg_to_pg(struct ceph_pg_pool_info * pi,const struct ceph_pg * raw_pgid,struct ceph_pg * pgid)24466f3bfd45SIlya Dryomov static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
24476f3bfd45SIlya Dryomov 			 const struct ceph_pg *raw_pgid,
24486f3bfd45SIlya Dryomov 			 struct ceph_pg *pgid)
24496f3bfd45SIlya Dryomov {
24506f3bfd45SIlya Dryomov 	pgid->pool = raw_pgid->pool;
24516f3bfd45SIlya Dryomov 	pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
24526f3bfd45SIlya Dryomov 				     pi->pg_num_mask);
24536f3bfd45SIlya Dryomov }
24546f3bfd45SIlya Dryomov 
24556f3bfd45SIlya Dryomov /*
24566f3bfd45SIlya Dryomov  * Map a raw PG (full precision ps) into a placement ps (placement
24576f3bfd45SIlya Dryomov  * seed).  Include pool id in that value so that different pools don't
24586f3bfd45SIlya Dryomov  * use the same seeds.
24596f3bfd45SIlya Dryomov  */
raw_pg_to_pps(struct ceph_pg_pool_info * pi,const struct ceph_pg * raw_pgid)24606f3bfd45SIlya Dryomov static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
24616f3bfd45SIlya Dryomov 			 const struct ceph_pg *raw_pgid)
24626f3bfd45SIlya Dryomov {
24636f3bfd45SIlya Dryomov 	if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
24646f3bfd45SIlya Dryomov 		/* hash pool id and seed so that pool PGs do not overlap */
24656f3bfd45SIlya Dryomov 		return crush_hash32_2(CRUSH_HASH_RJENKINS1,
24666f3bfd45SIlya Dryomov 				      ceph_stable_mod(raw_pgid->seed,
24676f3bfd45SIlya Dryomov 						      pi->pgp_num,
24686f3bfd45SIlya Dryomov 						      pi->pgp_num_mask),
24696f3bfd45SIlya Dryomov 				      raw_pgid->pool);
24706f3bfd45SIlya Dryomov 	} else {
24716f3bfd45SIlya Dryomov 		/*
24726f3bfd45SIlya Dryomov 		 * legacy behavior: add ps and pool together.  this is
24736f3bfd45SIlya Dryomov 		 * not a great approach because the PGs from each pool
24746f3bfd45SIlya Dryomov 		 * will overlap on top of each other: 0.5 == 1.4 ==
24756f3bfd45SIlya Dryomov 		 * 2.3 == ...
24766f3bfd45SIlya Dryomov 		 */
24776f3bfd45SIlya Dryomov 		return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
24786f3bfd45SIlya Dryomov 				       pi->pgp_num_mask) +
24796f3bfd45SIlya Dryomov 		       (unsigned)raw_pgid->pool;
24806f3bfd45SIlya Dryomov 	}
24816f3bfd45SIlya Dryomov }
24826f3bfd45SIlya Dryomov 
/*
 * Magic value used for a "default" fallback choose_args, used if the
 * crush_choose_arg_map passed to do_crush() does not exist.  If this
 * also doesn't exist, fall back to canonical weights.
 *
 * Parenthesized so the negative constant stays a single operand
 * wherever the macro is expanded.
 */
#define CEPH_DEFAULT_CHOOSE_ARGS	(-1)
2489e17e8969SIlya Dryomov 
/*
 * Run CRUSH rule @ruleno for input @x and store up to @result_max items
 * in @result.
 *
 * The choose_args used are those registered under @choose_args_index,
 * falling back to CEPH_DEFAULT_CHOOSE_ARGS and then to none at all.  A
 * workspace is taken from the map's workspace manager for the duration
 * of the call.  @result_max must not exceed CEPH_PG_MAX_SIZE.
 *
 * Returns whatever crush_do_rule() returns.
 */
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
		    int *result, int result_max,
		    const __u32 *weight, int weight_max,
		    s64 choose_args_index)
{
	struct crush_choose_arg_map *arg_map;
	struct crush_work *work;
	int ret;

	BUG_ON(result_max > CEPH_PG_MAX_SIZE);

	arg_map = lookup_choose_arg_map(&map->crush->choose_args,
					choose_args_index);
	if (arg_map == NULL) {
		/* no map for this index - try the default one */
		arg_map = lookup_choose_arg_map(&map->crush->choose_args,
						CEPH_DEFAULT_CHOOSE_ARGS);
	}

	work = get_workspace(&map->crush_wsm, map->crush);
	ret = crush_do_rule(map->crush, ruleno, x, result, result_max,
			    weight, weight_max, work,
			    arg_map ? arg_map->args : NULL);
	put_workspace(&map->crush_wsm, work);

	return ret;
}
2514e8ef19c4SIlya Dryomov 
remove_nonexistent_osds(struct ceph_osdmap * osdmap,struct ceph_pg_pool_info * pi,struct ceph_osds * set)25151c2e7b45SIlya Dryomov static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
25161c2e7b45SIlya Dryomov 				    struct ceph_pg_pool_info *pi,
25171c2e7b45SIlya Dryomov 				    struct ceph_osds *set)
25181c2e7b45SIlya Dryomov {
25191c2e7b45SIlya Dryomov 	int i;
25201c2e7b45SIlya Dryomov 
25211c2e7b45SIlya Dryomov 	if (ceph_can_shift_osds(pi)) {
25221c2e7b45SIlya Dryomov 		int removed = 0;
25231c2e7b45SIlya Dryomov 
25241c2e7b45SIlya Dryomov 		/* shift left */
25251c2e7b45SIlya Dryomov 		for (i = 0; i < set->size; i++) {
25261c2e7b45SIlya Dryomov 			if (!ceph_osd_exists(osdmap, set->osds[i])) {
25271c2e7b45SIlya Dryomov 				removed++;
25281c2e7b45SIlya Dryomov 				continue;
25291c2e7b45SIlya Dryomov 			}
25301c2e7b45SIlya Dryomov 			if (removed)
25311c2e7b45SIlya Dryomov 				set->osds[i - removed] = set->osds[i];
25321c2e7b45SIlya Dryomov 		}
25331c2e7b45SIlya Dryomov 		set->size -= removed;
25341c2e7b45SIlya Dryomov 	} else {
25351c2e7b45SIlya Dryomov 		/* set dne devices to NONE */
25361c2e7b45SIlya Dryomov 		for (i = 0; i < set->size; i++) {
25371c2e7b45SIlya Dryomov 			if (!ceph_osd_exists(osdmap, set->osds[i]))
25381c2e7b45SIlya Dryomov 				set->osds[i] = CRUSH_ITEM_NONE;
25391c2e7b45SIlya Dryomov 		}
25401c2e7b45SIlya Dryomov 	}
25411c2e7b45SIlya Dryomov }
25421c2e7b45SIlya Dryomov 
25433d14c5d2SYehuda Sadeh /*
25441c2e7b45SIlya Dryomov  * Calculate raw set (CRUSH output) for given PG and filter out
25451c2e7b45SIlya Dryomov  * nonexistent OSDs.  ->primary is undefined for a raw set.
25462bd93d4dSIlya Dryomov  *
25476f3bfd45SIlya Dryomov  * Placement seed (CRUSH input) is returned through @ppps.
25482bd93d4dSIlya Dryomov  */
pg_to_raw_osds(struct ceph_osdmap * osdmap,struct ceph_pg_pool_info * pi,const struct ceph_pg * raw_pgid,struct ceph_osds * raw,u32 * ppps)25496f3bfd45SIlya Dryomov static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
25506f3bfd45SIlya Dryomov 			   struct ceph_pg_pool_info *pi,
25516f3bfd45SIlya Dryomov 			   const struct ceph_pg *raw_pgid,
25526f3bfd45SIlya Dryomov 			   struct ceph_osds *raw,
25536f3bfd45SIlya Dryomov 			   u32 *ppps)
25542bd93d4dSIlya Dryomov {
25556f3bfd45SIlya Dryomov 	u32 pps = raw_pg_to_pps(pi, raw_pgid);
25562bd93d4dSIlya Dryomov 	int ruleno;
25572bd93d4dSIlya Dryomov 	int len;
25582bd93d4dSIlya Dryomov 
25596f3bfd45SIlya Dryomov 	ceph_osds_init(raw);
25606f3bfd45SIlya Dryomov 	if (ppps)
25616f3bfd45SIlya Dryomov 		*ppps = pps;
25626f3bfd45SIlya Dryomov 
25636f3bfd45SIlya Dryomov 	ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
25646f3bfd45SIlya Dryomov 				 pi->size);
25652bd93d4dSIlya Dryomov 	if (ruleno < 0) {
25662bd93d4dSIlya Dryomov 		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
25676f3bfd45SIlya Dryomov 		       pi->id, pi->crush_ruleset, pi->type, pi->size);
25686f3bfd45SIlya Dryomov 		return;
25692bd93d4dSIlya Dryomov 	}
25702bd93d4dSIlya Dryomov 
2571ef9324bbSIlya Dryomov 	if (pi->size > ARRAY_SIZE(raw->osds)) {
2572ef9324bbSIlya Dryomov 		pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
2573ef9324bbSIlya Dryomov 		       pi->id, pi->crush_ruleset, pi->type, pi->size,
2574ef9324bbSIlya Dryomov 		       ARRAY_SIZE(raw->osds));
2575ef9324bbSIlya Dryomov 		return;
2576ef9324bbSIlya Dryomov 	}
2577ef9324bbSIlya Dryomov 
2578ef9324bbSIlya Dryomov 	len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
25795cf9c4a9SIlya Dryomov 		       osdmap->osd_weight, osdmap->max_osd, pi->id);
25802bd93d4dSIlya Dryomov 	if (len < 0) {
25812bd93d4dSIlya Dryomov 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
25826f3bfd45SIlya Dryomov 		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
25836f3bfd45SIlya Dryomov 		       pi->size);
25846f3bfd45SIlya Dryomov 		return;
25852bd93d4dSIlya Dryomov 	}
25862bd93d4dSIlya Dryomov 
25876f3bfd45SIlya Dryomov 	raw->size = len;
25881c2e7b45SIlya Dryomov 	remove_nonexistent_osds(osdmap, pi, raw);
25891c2e7b45SIlya Dryomov }
25901c2e7b45SIlya Dryomov 
25911c2e7b45SIlya Dryomov /* apply pg_upmap[_items] mappings */
apply_upmap(struct ceph_osdmap * osdmap,const struct ceph_pg * pgid,struct ceph_osds * raw)25921c2e7b45SIlya Dryomov static void apply_upmap(struct ceph_osdmap *osdmap,
25931c2e7b45SIlya Dryomov 			const struct ceph_pg *pgid,
25941c2e7b45SIlya Dryomov 			struct ceph_osds *raw)
25951c2e7b45SIlya Dryomov {
25961c2e7b45SIlya Dryomov 	struct ceph_pg_mapping *pg;
25971c2e7b45SIlya Dryomov 	int i, j;
25981c2e7b45SIlya Dryomov 
25991c2e7b45SIlya Dryomov 	pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
26001c2e7b45SIlya Dryomov 	if (pg) {
26011c2e7b45SIlya Dryomov 		/* make sure targets aren't marked out */
26021c2e7b45SIlya Dryomov 		for (i = 0; i < pg->pg_upmap.len; i++) {
26031c2e7b45SIlya Dryomov 			int osd = pg->pg_upmap.osds[i];
26041c2e7b45SIlya Dryomov 
26051c2e7b45SIlya Dryomov 			if (osd != CRUSH_ITEM_NONE &&
26061c2e7b45SIlya Dryomov 			    osd < osdmap->max_osd &&
26071c2e7b45SIlya Dryomov 			    osdmap->osd_weight[osd] == 0) {
26081c2e7b45SIlya Dryomov 				/* reject/ignore explicit mapping */
26091c2e7b45SIlya Dryomov 				return;
26101c2e7b45SIlya Dryomov 			}
26111c2e7b45SIlya Dryomov 		}
26121c2e7b45SIlya Dryomov 		for (i = 0; i < pg->pg_upmap.len; i++)
26131c2e7b45SIlya Dryomov 			raw->osds[i] = pg->pg_upmap.osds[i];
26141c2e7b45SIlya Dryomov 		raw->size = pg->pg_upmap.len;
2615f53b7665SIlya Dryomov 		/* check and apply pg_upmap_items, if any */
26161c2e7b45SIlya Dryomov 	}
26171c2e7b45SIlya Dryomov 
26181c2e7b45SIlya Dryomov 	pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
26191c2e7b45SIlya Dryomov 	if (pg) {
262029a0cfbfSIlya Dryomov 		/*
262129a0cfbfSIlya Dryomov 		 * Note: this approach does not allow a bidirectional swap,
262229a0cfbfSIlya Dryomov 		 * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
262329a0cfbfSIlya Dryomov 		 */
262429a0cfbfSIlya Dryomov 		for (i = 0; i < pg->pg_upmap_items.len; i++) {
262529a0cfbfSIlya Dryomov 			int from = pg->pg_upmap_items.from_to[i][0];
262629a0cfbfSIlya Dryomov 			int to = pg->pg_upmap_items.from_to[i][1];
262729a0cfbfSIlya Dryomov 			int pos = -1;
262829a0cfbfSIlya Dryomov 			bool exists = false;
26291c2e7b45SIlya Dryomov 
263029a0cfbfSIlya Dryomov 			/* make sure replacement doesn't already appear */
263129a0cfbfSIlya Dryomov 			for (j = 0; j < raw->size; j++) {
263229a0cfbfSIlya Dryomov 				int osd = raw->osds[j];
263329a0cfbfSIlya Dryomov 
263429a0cfbfSIlya Dryomov 				if (osd == to) {
263529a0cfbfSIlya Dryomov 					exists = true;
26361c2e7b45SIlya Dryomov 					break;
26371c2e7b45SIlya Dryomov 				}
263829a0cfbfSIlya Dryomov 				/* ignore mapping if target is marked out */
263929a0cfbfSIlya Dryomov 				if (osd == from && pos < 0 &&
264029a0cfbfSIlya Dryomov 				    !(to != CRUSH_ITEM_NONE &&
264129a0cfbfSIlya Dryomov 				      to < osdmap->max_osd &&
264229a0cfbfSIlya Dryomov 				      osdmap->osd_weight[to] == 0)) {
264329a0cfbfSIlya Dryomov 					pos = j;
26441c2e7b45SIlya Dryomov 				}
26451c2e7b45SIlya Dryomov 			}
264629a0cfbfSIlya Dryomov 			if (!exists && pos >= 0)
264729a0cfbfSIlya Dryomov 				raw->osds[pos] = to;
264829a0cfbfSIlya Dryomov 		}
26491c2e7b45SIlya Dryomov 	}
26502bd93d4dSIlya Dryomov }
26512bd93d4dSIlya Dryomov 
26522bd93d4dSIlya Dryomov /*
26536f3bfd45SIlya Dryomov  * Given raw set, calculate up set and up primary.  By definition of an
26546f3bfd45SIlya Dryomov  * up set, the result won't contain nonexistent or down OSDs.
26552bd93d4dSIlya Dryomov  *
26566f3bfd45SIlya Dryomov  * This is done in-place - on return @set is the up set.  If it's
26576f3bfd45SIlya Dryomov  * empty, ->primary will remain undefined.
26582bd93d4dSIlya Dryomov  */
raw_to_up_osds(struct ceph_osdmap * osdmap,struct ceph_pg_pool_info * pi,struct ceph_osds * set)26596f3bfd45SIlya Dryomov static void raw_to_up_osds(struct ceph_osdmap *osdmap,
26606f3bfd45SIlya Dryomov 			   struct ceph_pg_pool_info *pi,
26616f3bfd45SIlya Dryomov 			   struct ceph_osds *set)
26622bd93d4dSIlya Dryomov {
26632bd93d4dSIlya Dryomov 	int i;
26642bd93d4dSIlya Dryomov 
26656f3bfd45SIlya Dryomov 	/* ->primary is undefined for a raw set */
26666f3bfd45SIlya Dryomov 	BUG_ON(set->primary != -1);
26676f3bfd45SIlya Dryomov 
26686f3bfd45SIlya Dryomov 	if (ceph_can_shift_osds(pi)) {
26692bd93d4dSIlya Dryomov 		int removed = 0;
26702bd93d4dSIlya Dryomov 
26716f3bfd45SIlya Dryomov 		/* shift left */
26726f3bfd45SIlya Dryomov 		for (i = 0; i < set->size; i++) {
26736f3bfd45SIlya Dryomov 			if (ceph_osd_is_down(osdmap, set->osds[i])) {
26742bd93d4dSIlya Dryomov 				removed++;
26752bd93d4dSIlya Dryomov 				continue;
26762bd93d4dSIlya Dryomov 			}
26772bd93d4dSIlya Dryomov 			if (removed)
26786f3bfd45SIlya Dryomov 				set->osds[i - removed] = set->osds[i];
26792bd93d4dSIlya Dryomov 		}
26806f3bfd45SIlya Dryomov 		set->size -= removed;
26816f3bfd45SIlya Dryomov 		if (set->size > 0)
26826f3bfd45SIlya Dryomov 			set->primary = set->osds[0];
26832bd93d4dSIlya Dryomov 	} else {
26846f3bfd45SIlya Dryomov 		/* set down/dne devices to NONE */
26856f3bfd45SIlya Dryomov 		for (i = set->size - 1; i >= 0; i--) {
26866f3bfd45SIlya Dryomov 			if (ceph_osd_is_down(osdmap, set->osds[i]))
26876f3bfd45SIlya Dryomov 				set->osds[i] = CRUSH_ITEM_NONE;
26882bd93d4dSIlya Dryomov 			else
26896f3bfd45SIlya Dryomov 				set->primary = set->osds[i];
26906f3bfd45SIlya Dryomov 		}
26912bd93d4dSIlya Dryomov 	}
26922bd93d4dSIlya Dryomov }
26932bd93d4dSIlya Dryomov 
/*
 * Re-pick the primary of up set @up according to per-OSD primary
 * affinity.  @pps is the PG's placement seed.
 *
 * Nothing is changed if the map carries no primary affinity array, if
 * every OSD in @up has the default affinity, or if @up contains only
 * CRUSH_ITEM_NONE.  Otherwise ->primary is updated and, for pools
 * whose sets can be shifted, the new primary is moved to the front.
 */
static void apply_primary_affinity(struct ceph_osdmap *osdmap,
				   struct ceph_pg_pool_info *pi,
				   u32 pps,
				   struct ceph_osds *up)
{
	bool nondefault = false;
	int chosen = -1;
	int i;

	if (!osdmap->osd_primary_affinity)
		return;

	/* any non-default primary_affinity values among these osds? */
	for (i = 0; i < up->size; i++) {
		int osd = up->osds[i];

		if (osd == CRUSH_ITEM_NONE)
			continue;

		if (osdmap->osd_primary_affinity[osd] !=
					CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
			nondefault = true;
			break;
		}
	}
	if (!nondefault)
		return;

	/*
	 * Pick the primary.  Both the seed (for the pg) and the osd go
	 * into the hash so that a proportional fraction of an osd's
	 * pgs get rejected as primary.
	 */
	for (i = 0; i < up->size; i++) {
		int osd = up->osds[i];
		u32 aff;

		if (osd == CRUSH_ITEM_NONE)
			continue;

		aff = osdmap->osd_primary_affinity[osd];
		if (aff >= CEPH_OSD_MAX_PRIMARY_AFFINITY ||
		    (crush_hash32_2(CRUSH_HASH_RJENKINS1,
				    pps, osd) >> 16) < aff) {
			/* accepted */
			chosen = i;
			break;
		}

		/*
		 * Rejected.  Remember the first reject as a fallback
		 * in case nobody else is accepted, and keep looking.
		 */
		if (chosen < 0)
			chosen = i;
	}
	if (chosen < 0)
		return;

	up->primary = up->osds[chosen];

	if (ceph_can_shift_osds(pi) && chosen > 0) {
		/* slide [0, chosen) right by one, new primary goes first */
		i = chosen;
		while (i > 0) {
			up->osds[i] = up->osds[i - 1];
			i--;
		}
		up->osds[0] = up->primary;
	}
}
276147ec1f3cSIlya Dryomov 
27622bd93d4dSIlya Dryomov /*
27636f3bfd45SIlya Dryomov  * Get pg_temp and primary_temp mappings for given PG.
276445966c34SIlya Dryomov  *
27656f3bfd45SIlya Dryomov  * Note that a PG may have none, only pg_temp, only primary_temp or
27666f3bfd45SIlya Dryomov  * both pg_temp and primary_temp mappings.  This means @temp isn't
27676f3bfd45SIlya Dryomov  * always a valid OSD set on return: in the "only primary_temp" case,
27686f3bfd45SIlya Dryomov  * @temp will have its ->primary >= 0 but ->size == 0.
276945966c34SIlya Dryomov  */
get_temp_osds(struct ceph_osdmap * osdmap,struct ceph_pg_pool_info * pi,const struct ceph_pg * pgid,struct ceph_osds * temp)27706f3bfd45SIlya Dryomov static void get_temp_osds(struct ceph_osdmap *osdmap,
27716f3bfd45SIlya Dryomov 			  struct ceph_pg_pool_info *pi,
2772463bb8daSIlya Dryomov 			  const struct ceph_pg *pgid,
27736f3bfd45SIlya Dryomov 			  struct ceph_osds *temp)
277445966c34SIlya Dryomov {
277545966c34SIlya Dryomov 	struct ceph_pg_mapping *pg;
277645966c34SIlya Dryomov 	int i;
277745966c34SIlya Dryomov 
27786f3bfd45SIlya Dryomov 	ceph_osds_init(temp);
277945966c34SIlya Dryomov 
278045966c34SIlya Dryomov 	/* pg_temp? */
2781463bb8daSIlya Dryomov 	pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
278245966c34SIlya Dryomov 	if (pg) {
278345966c34SIlya Dryomov 		for (i = 0; i < pg->pg_temp.len; i++) {
278445966c34SIlya Dryomov 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
27856f3bfd45SIlya Dryomov 				if (ceph_can_shift_osds(pi))
278645966c34SIlya Dryomov 					continue;
27876f3bfd45SIlya Dryomov 
27886f3bfd45SIlya Dryomov 				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
278945966c34SIlya Dryomov 			} else {
27906f3bfd45SIlya Dryomov 				temp->osds[temp->size++] = pg->pg_temp.osds[i];
279145966c34SIlya Dryomov 			}
279245966c34SIlya Dryomov 		}
279345966c34SIlya Dryomov 
279445966c34SIlya Dryomov 		/* apply pg_temp's primary */
27956f3bfd45SIlya Dryomov 		for (i = 0; i < temp->size; i++) {
27966f3bfd45SIlya Dryomov 			if (temp->osds[i] != CRUSH_ITEM_NONE) {
27976f3bfd45SIlya Dryomov 				temp->primary = temp->osds[i];
279845966c34SIlya Dryomov 				break;
279945966c34SIlya Dryomov 			}
280045966c34SIlya Dryomov 		}
280145966c34SIlya Dryomov 	}
280245966c34SIlya Dryomov 
28035e8d4d36SIlya Dryomov 	/* primary_temp? */
2804463bb8daSIlya Dryomov 	pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
28055e8d4d36SIlya Dryomov 	if (pg)
28066f3bfd45SIlya Dryomov 		temp->primary = pg->primary_temp.osd;
280745966c34SIlya Dryomov }
280845966c34SIlya Dryomov 
280945966c34SIlya Dryomov /*
28106f3bfd45SIlya Dryomov  * Map a PG to its acting set as well as its up set.
2811ac972230SIlya Dryomov  *
28126f3bfd45SIlya Dryomov  * Acting set is used for data mapping purposes, while up set can be
28136f3bfd45SIlya Dryomov  * recorded for detecting interval changes and deciding whether to
28146f3bfd45SIlya Dryomov  * resend a request.
28153d14c5d2SYehuda Sadeh  */
ceph_pg_to_up_acting_osds(struct ceph_osdmap * osdmap,struct ceph_pg_pool_info * pi,const struct ceph_pg * raw_pgid,struct ceph_osds * up,struct ceph_osds * acting)28166f3bfd45SIlya Dryomov void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
2817df28152dSIlya Dryomov 			       struct ceph_pg_pool_info *pi,
28186f3bfd45SIlya Dryomov 			       const struct ceph_pg *raw_pgid,
28196f3bfd45SIlya Dryomov 			       struct ceph_osds *up,
28206f3bfd45SIlya Dryomov 			       struct ceph_osds *acting)
28213d14c5d2SYehuda Sadeh {
2822463bb8daSIlya Dryomov 	struct ceph_pg pgid;
2823ac972230SIlya Dryomov 	u32 pps;
28243d14c5d2SYehuda Sadeh 
2825df28152dSIlya Dryomov 	WARN_ON(pi->id != raw_pgid->pool);
2826463bb8daSIlya Dryomov 	raw_pg_to_pg(pi, raw_pgid, &pgid);
28273d14c5d2SYehuda Sadeh 
28286f3bfd45SIlya Dryomov 	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
28291c2e7b45SIlya Dryomov 	apply_upmap(osdmap, &pgid, up);
28306f3bfd45SIlya Dryomov 	raw_to_up_osds(osdmap, pi, up);
28316f3bfd45SIlya Dryomov 	apply_primary_affinity(osdmap, pi, pps, up);
2832463bb8daSIlya Dryomov 	get_temp_osds(osdmap, pi, &pgid, acting);
28336f3bfd45SIlya Dryomov 	if (!acting->size) {
28346f3bfd45SIlya Dryomov 		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
28356f3bfd45SIlya Dryomov 		acting->size = up->size;
28366f3bfd45SIlya Dryomov 		if (acting->primary == -1)
28376f3bfd45SIlya Dryomov 			acting->primary = up->primary;
2838ac972230SIlya Dryomov 	}
28396f3bfd45SIlya Dryomov 	WARN_ON(!osds_valid(up) || !osds_valid(acting));
28403d14c5d2SYehuda Sadeh }
28413d14c5d2SYehuda Sadeh 
ceph_pg_to_primary_shard(struct ceph_osdmap * osdmap,struct ceph_pg_pool_info * pi,const struct ceph_pg * raw_pgid,struct ceph_spg * spgid)2842dc98ff72SIlya Dryomov bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
2843df28152dSIlya Dryomov 			      struct ceph_pg_pool_info *pi,
2844dc98ff72SIlya Dryomov 			      const struct ceph_pg *raw_pgid,
2845dc98ff72SIlya Dryomov 			      struct ceph_spg *spgid)
2846dc98ff72SIlya Dryomov {
2847dc98ff72SIlya Dryomov 	struct ceph_pg pgid;
2848dc98ff72SIlya Dryomov 	struct ceph_osds up, acting;
2849dc98ff72SIlya Dryomov 	int i;
2850dc98ff72SIlya Dryomov 
2851df28152dSIlya Dryomov 	WARN_ON(pi->id != raw_pgid->pool);
2852dc98ff72SIlya Dryomov 	raw_pg_to_pg(pi, raw_pgid, &pgid);
2853dc98ff72SIlya Dryomov 
2854dc98ff72SIlya Dryomov 	if (ceph_can_shift_osds(pi)) {
2855dc98ff72SIlya Dryomov 		spgid->pgid = pgid; /* struct */
2856dc98ff72SIlya Dryomov 		spgid->shard = CEPH_SPG_NOSHARD;
2857dc98ff72SIlya Dryomov 		return true;
2858dc98ff72SIlya Dryomov 	}
2859dc98ff72SIlya Dryomov 
2860df28152dSIlya Dryomov 	ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
2861dc98ff72SIlya Dryomov 	for (i = 0; i < acting.size; i++) {
2862dc98ff72SIlya Dryomov 		if (acting.osds[i] == acting.primary) {
2863dc98ff72SIlya Dryomov 			spgid->pgid = pgid; /* struct */
2864dc98ff72SIlya Dryomov 			spgid->shard = i;
2865dc98ff72SIlya Dryomov 			return true;
2866dc98ff72SIlya Dryomov 		}
2867dc98ff72SIlya Dryomov 	}
2868dc98ff72SIlya Dryomov 
2869dc98ff72SIlya Dryomov 	return false;
2870dc98ff72SIlya Dryomov }
2871dc98ff72SIlya Dryomov 
28723d14c5d2SYehuda Sadeh /*
2873f81f1633SIlya Dryomov  * Return acting primary for given PG, or -1 if none.
28743d14c5d2SYehuda Sadeh  */
ceph_pg_to_acting_primary(struct ceph_osdmap * osdmap,const struct ceph_pg * raw_pgid)2875f81f1633SIlya Dryomov int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
2876f81f1633SIlya Dryomov 			      const struct ceph_pg *raw_pgid)
28773d14c5d2SYehuda Sadeh {
2878df28152dSIlya Dryomov 	struct ceph_pg_pool_info *pi;
28796f3bfd45SIlya Dryomov 	struct ceph_osds up, acting;
28803d14c5d2SYehuda Sadeh 
2881df28152dSIlya Dryomov 	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
2882df28152dSIlya Dryomov 	if (!pi)
2883df28152dSIlya Dryomov 		return -1;
2884df28152dSIlya Dryomov 
2885df28152dSIlya Dryomov 	ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
28866f3bfd45SIlya Dryomov 	return acting.primary;
28873d14c5d2SYehuda Sadeh }
2888f81f1633SIlya Dryomov EXPORT_SYMBOL(ceph_pg_to_acting_primary);
288945e6aa9fSIlya Dryomov 
/*
 * Allocate a crush_loc_node with room after the struct for a type
 * name of @type_name_len bytes and a name of @name_len bytes, plus
 * two extra bytes.  The strings themselves are not filled in.
 *
 * Returns NULL on allocation failure.
 */
static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
					      size_t name_len)
{
	struct crush_loc_node *loc;
	size_t size;

	size = sizeof(*loc) + type_name_len + name_len + 2;
	loc = kmalloc(size, GFP_NOIO);
	if (loc != NULL)
		RB_CLEAR_NODE(&loc->cl_node);

	return loc;
}
290245e6aa9fSIlya Dryomov 
free_crush_loc(struct crush_loc_node * loc)290345e6aa9fSIlya Dryomov static void free_crush_loc(struct crush_loc_node *loc)
290445e6aa9fSIlya Dryomov {
290545e6aa9fSIlya Dryomov 	WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
290645e6aa9fSIlya Dryomov 
290745e6aa9fSIlya Dryomov 	kfree(loc);
290845e6aa9fSIlya Dryomov }
290945e6aa9fSIlya Dryomov 
crush_loc_compare(const struct crush_loc * loc1,const struct crush_loc * loc2)291045e6aa9fSIlya Dryomov static int crush_loc_compare(const struct crush_loc *loc1,
291145e6aa9fSIlya Dryomov 			     const struct crush_loc *loc2)
291245e6aa9fSIlya Dryomov {
291345e6aa9fSIlya Dryomov 	return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
291445e6aa9fSIlya Dryomov 	       strcmp(loc1->cl_name, loc2->cl_name);
291545e6aa9fSIlya Dryomov }
291645e6aa9fSIlya Dryomov 
DEFINE_RB_FUNCS2(crush_loc,struct crush_loc_node,cl_loc,crush_loc_compare,RB_BYPTR,const struct crush_loc *,cl_node)291745e6aa9fSIlya Dryomov DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
291845e6aa9fSIlya Dryomov 		 RB_BYPTR, const struct crush_loc *, cl_node)
291945e6aa9fSIlya Dryomov 
292045e6aa9fSIlya Dryomov /*
292145e6aa9fSIlya Dryomov  * Parses a set of <bucket type name>':'<bucket name> pairs separated
292245e6aa9fSIlya Dryomov  * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar".
292345e6aa9fSIlya Dryomov  *
292445e6aa9fSIlya Dryomov  * Note that @crush_location is modified by strsep().
292545e6aa9fSIlya Dryomov  */
292645e6aa9fSIlya Dryomov int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
292745e6aa9fSIlya Dryomov {
292845e6aa9fSIlya Dryomov 	struct crush_loc_node *loc;
292945e6aa9fSIlya Dryomov 	const char *type_name, *name, *colon;
293045e6aa9fSIlya Dryomov 	size_t type_name_len, name_len;
293145e6aa9fSIlya Dryomov 
293245e6aa9fSIlya Dryomov 	dout("%s '%s'\n", __func__, crush_location);
293345e6aa9fSIlya Dryomov 	while ((type_name = strsep(&crush_location, "|"))) {
293445e6aa9fSIlya Dryomov 		colon = strchr(type_name, ':');
293545e6aa9fSIlya Dryomov 		if (!colon)
293645e6aa9fSIlya Dryomov 			return -EINVAL;
293745e6aa9fSIlya Dryomov 
293845e6aa9fSIlya Dryomov 		type_name_len = colon - type_name;
293945e6aa9fSIlya Dryomov 		if (type_name_len == 0)
294045e6aa9fSIlya Dryomov 			return -EINVAL;
294145e6aa9fSIlya Dryomov 
294245e6aa9fSIlya Dryomov 		name = colon + 1;
294345e6aa9fSIlya Dryomov 		name_len = strlen(name);
294445e6aa9fSIlya Dryomov 		if (name_len == 0)
294545e6aa9fSIlya Dryomov 			return -EINVAL;
294645e6aa9fSIlya Dryomov 
294745e6aa9fSIlya Dryomov 		loc = alloc_crush_loc(type_name_len, name_len);
294845e6aa9fSIlya Dryomov 		if (!loc)
294945e6aa9fSIlya Dryomov 			return -ENOMEM;
295045e6aa9fSIlya Dryomov 
295145e6aa9fSIlya Dryomov 		loc->cl_loc.cl_type_name = loc->cl_data;
295245e6aa9fSIlya Dryomov 		memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
295345e6aa9fSIlya Dryomov 		loc->cl_loc.cl_type_name[type_name_len] = '\0';
295445e6aa9fSIlya Dryomov 
295545e6aa9fSIlya Dryomov 		loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
295645e6aa9fSIlya Dryomov 		memcpy(loc->cl_loc.cl_name, name, name_len);
295745e6aa9fSIlya Dryomov 		loc->cl_loc.cl_name[name_len] = '\0';
295845e6aa9fSIlya Dryomov 
295945e6aa9fSIlya Dryomov 		if (!__insert_crush_loc(locs, loc)) {
296045e6aa9fSIlya Dryomov 			free_crush_loc(loc);
296145e6aa9fSIlya Dryomov 			return -EEXIST;
296245e6aa9fSIlya Dryomov 		}
296345e6aa9fSIlya Dryomov 
296445e6aa9fSIlya Dryomov 		dout("%s type_name '%s' name '%s'\n", __func__,
296545e6aa9fSIlya Dryomov 		     loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
296645e6aa9fSIlya Dryomov 	}
296745e6aa9fSIlya Dryomov 
296845e6aa9fSIlya Dryomov 	return 0;
296945e6aa9fSIlya Dryomov }
297045e6aa9fSIlya Dryomov 
ceph_compare_crush_locs(struct rb_root * locs1,struct rb_root * locs2)297145e6aa9fSIlya Dryomov int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
297245e6aa9fSIlya Dryomov {
297345e6aa9fSIlya Dryomov 	struct rb_node *n1 = rb_first(locs1);
297445e6aa9fSIlya Dryomov 	struct rb_node *n2 = rb_first(locs2);
297545e6aa9fSIlya Dryomov 	int ret;
297645e6aa9fSIlya Dryomov 
297745e6aa9fSIlya Dryomov 	for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
297845e6aa9fSIlya Dryomov 		struct crush_loc_node *loc1 =
297945e6aa9fSIlya Dryomov 		    rb_entry(n1, struct crush_loc_node, cl_node);
298045e6aa9fSIlya Dryomov 		struct crush_loc_node *loc2 =
298145e6aa9fSIlya Dryomov 		    rb_entry(n2, struct crush_loc_node, cl_node);
298245e6aa9fSIlya Dryomov 
298345e6aa9fSIlya Dryomov 		ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
298445e6aa9fSIlya Dryomov 		if (ret)
298545e6aa9fSIlya Dryomov 			return ret;
298645e6aa9fSIlya Dryomov 	}
298745e6aa9fSIlya Dryomov 
298845e6aa9fSIlya Dryomov 	if (!n1 && n2)
298945e6aa9fSIlya Dryomov 		return -1;
299045e6aa9fSIlya Dryomov 	if (n1 && !n2)
299145e6aa9fSIlya Dryomov 		return 1;
299245e6aa9fSIlya Dryomov 	return 0;
299345e6aa9fSIlya Dryomov }
299445e6aa9fSIlya Dryomov 
ceph_clear_crush_locs(struct rb_root * locs)299545e6aa9fSIlya Dryomov void ceph_clear_crush_locs(struct rb_root *locs)
299645e6aa9fSIlya Dryomov {
299745e6aa9fSIlya Dryomov 	while (!RB_EMPTY_ROOT(locs)) {
299845e6aa9fSIlya Dryomov 		struct crush_loc_node *loc =
299945e6aa9fSIlya Dryomov 		    rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
300045e6aa9fSIlya Dryomov 
300145e6aa9fSIlya Dryomov 		erase_crush_loc(locs, loc);
300245e6aa9fSIlya Dryomov 		free_crush_loc(loc);
300345e6aa9fSIlya Dryomov 	}
300445e6aa9fSIlya Dryomov }
3005117d96a0SIlya Dryomov 
/*
 * Returns true if @name matches [a-zA-Z0-9-_.]+, i.e. it is non-empty
 * and consists only of ASCII letters, digits, '-', '_' and '.'.
 */
static bool is_valid_crush_name(const char *name)
{
	const char *p;

	/* '+' in the pattern: at least one character is required */
	if (*name == '\0')
		return false;

	for (p = name; *p != '\0'; p++) {
		const char ch = *p;
		bool allowed;

		allowed = (ch >= 'a' && ch <= 'z') ||
			  (ch >= 'A' && ch <= 'Z') ||
			  (ch >= '0' && ch <= '9') ||
			  ch == '-' || ch == '_' || ch == '.';
		if (!allowed)
			return false;
	}

	return true;
}
3021117d96a0SIlya Dryomov 
3022117d96a0SIlya Dryomov /*
3023117d96a0SIlya Dryomov  * Gets the parent of an item.  Returns its id (<0 because the
3024117d96a0SIlya Dryomov  * parent is always a bucket), type id (>0 for the same reason,
3025117d96a0SIlya Dryomov  * via @parent_type_id) and location (via @parent_loc).  If no
3026117d96a0SIlya Dryomov  * parent, returns 0.
3027117d96a0SIlya Dryomov  *
3028117d96a0SIlya Dryomov  * Does a linear search, as there are no parent pointers of any
3029dd0d91b9SZheng Yongjun  * kind.  Note that the result is ambiguous for items that occur
3030117d96a0SIlya Dryomov  * multiple times in the map.
3031117d96a0SIlya Dryomov  */
get_immediate_parent(struct crush_map * c,int id,u16 * parent_type_id,struct crush_loc * parent_loc)3032117d96a0SIlya Dryomov static int get_immediate_parent(struct crush_map *c, int id,
3033117d96a0SIlya Dryomov 				u16 *parent_type_id,
3034117d96a0SIlya Dryomov 				struct crush_loc *parent_loc)
3035117d96a0SIlya Dryomov {
3036117d96a0SIlya Dryomov 	struct crush_bucket *b;
3037117d96a0SIlya Dryomov 	struct crush_name_node *type_cn, *cn;
3038117d96a0SIlya Dryomov 	int i, j;
3039117d96a0SIlya Dryomov 
3040117d96a0SIlya Dryomov 	for (i = 0; i < c->max_buckets; i++) {
3041117d96a0SIlya Dryomov 		b = c->buckets[i];
3042117d96a0SIlya Dryomov 		if (!b)
3043117d96a0SIlya Dryomov 			continue;
3044117d96a0SIlya Dryomov 
3045117d96a0SIlya Dryomov 		/* ignore per-class shadow hierarchy */
3046117d96a0SIlya Dryomov 		cn = lookup_crush_name(&c->names, b->id);
3047117d96a0SIlya Dryomov 		if (!cn || !is_valid_crush_name(cn->cn_name))
3048117d96a0SIlya Dryomov 			continue;
3049117d96a0SIlya Dryomov 
3050117d96a0SIlya Dryomov 		for (j = 0; j < b->size; j++) {
3051117d96a0SIlya Dryomov 			if (b->items[j] != id)
3052117d96a0SIlya Dryomov 				continue;
3053117d96a0SIlya Dryomov 
3054117d96a0SIlya Dryomov 			*parent_type_id = b->type;
3055117d96a0SIlya Dryomov 			type_cn = lookup_crush_name(&c->type_names, b->type);
3056117d96a0SIlya Dryomov 			parent_loc->cl_type_name = type_cn->cn_name;
3057117d96a0SIlya Dryomov 			parent_loc->cl_name = cn->cn_name;
3058117d96a0SIlya Dryomov 			return b->id;
3059117d96a0SIlya Dryomov 		}
3060117d96a0SIlya Dryomov 	}
3061117d96a0SIlya Dryomov 
3062117d96a0SIlya Dryomov 	return 0;  /* no parent */
3063117d96a0SIlya Dryomov }
3064117d96a0SIlya Dryomov 
3065117d96a0SIlya Dryomov /*
3066117d96a0SIlya Dryomov  * Calculates the locality/distance from an item to a client
3067117d96a0SIlya Dryomov  * location expressed in terms of CRUSH hierarchy as a set of
3068117d96a0SIlya Dryomov  * (bucket type name, bucket name) pairs.  Specifically, looks
3069117d96a0SIlya Dryomov  * for the lowest-valued bucket type for which the location of
3070117d96a0SIlya Dryomov  * @id matches one of the locations in @locs, so for standard
3071117d96a0SIlya Dryomov  * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
3072117d96a0SIlya Dryomov  * a matching host is closer than a matching rack and a matching
3073117d96a0SIlya Dryomov  * data center is closer than a matching zone.
3074117d96a0SIlya Dryomov  *
3075117d96a0SIlya Dryomov  * Specifying multiple locations (a "multipath" location) such
3076117d96a0SIlya Dryomov  * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs
3077117d96a0SIlya Dryomov  * is a multimap.  The locality will be:
3078117d96a0SIlya Dryomov  *
3079117d96a0SIlya Dryomov  * - 3 for OSDs in racks foo1 and foo2
3080117d96a0SIlya Dryomov  * - 8 for OSDs in data center bar
3081117d96a0SIlya Dryomov  * - -1 for all other OSDs
3082117d96a0SIlya Dryomov  *
3083117d96a0SIlya Dryomov  * The lowest possible bucket type is 1, so the best locality
3084117d96a0SIlya Dryomov  * for an OSD is 1 (i.e. a matching host).  Locality 0 would be
3085117d96a0SIlya Dryomov  * the OSD itself.
3086117d96a0SIlya Dryomov  */
ceph_get_crush_locality(struct ceph_osdmap * osdmap,int id,struct rb_root * locs)3087117d96a0SIlya Dryomov int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
3088117d96a0SIlya Dryomov 			    struct rb_root *locs)
3089117d96a0SIlya Dryomov {
3090117d96a0SIlya Dryomov 	struct crush_loc loc;
3091117d96a0SIlya Dryomov 	u16 type_id;
3092117d96a0SIlya Dryomov 
3093117d96a0SIlya Dryomov 	/*
3094117d96a0SIlya Dryomov 	 * Instead of repeated get_immediate_parent() calls,
3095117d96a0SIlya Dryomov 	 * the location of @id could be obtained with a single
3096117d96a0SIlya Dryomov 	 * depth-first traversal.
3097117d96a0SIlya Dryomov 	 */
3098117d96a0SIlya Dryomov 	for (;;) {
3099117d96a0SIlya Dryomov 		id = get_immediate_parent(osdmap->crush, id, &type_id, &loc);
3100117d96a0SIlya Dryomov 		if (id >= 0)
3101117d96a0SIlya Dryomov 			return -1;  /* not local */
3102117d96a0SIlya Dryomov 
3103117d96a0SIlya Dryomov 		if (lookup_crush_loc(locs, &loc))
3104117d96a0SIlya Dryomov 			return type_id;
3105117d96a0SIlya Dryomov 	}
3106117d96a0SIlya Dryomov }
3107