xref: /openbmc/linux/net/ceph/osdmap.c (revision 0c0a8de1)
13d14c5d2SYehuda Sadeh 
23d14c5d2SYehuda Sadeh #include <linux/ceph/ceph_debug.h>
33d14c5d2SYehuda Sadeh 
43d14c5d2SYehuda Sadeh #include <linux/module.h>
53d14c5d2SYehuda Sadeh #include <linux/slab.h>
63d14c5d2SYehuda Sadeh #include <asm/div64.h>
73d14c5d2SYehuda Sadeh 
83d14c5d2SYehuda Sadeh #include <linux/ceph/libceph.h>
93d14c5d2SYehuda Sadeh #include <linux/ceph/osdmap.h>
103d14c5d2SYehuda Sadeh #include <linux/ceph/decode.h>
113d14c5d2SYehuda Sadeh #include <linux/crush/hash.h>
123d14c5d2SYehuda Sadeh #include <linux/crush/mapper.h>
133d14c5d2SYehuda Sadeh 
143d14c5d2SYehuda Sadeh char *ceph_osdmap_state_str(char *str, int len, int state)
153d14c5d2SYehuda Sadeh {
163d14c5d2SYehuda Sadeh 	if (!len)
171ec3911dSCong Ding 		return str;
183d14c5d2SYehuda Sadeh 
191ec3911dSCong Ding 	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
201ec3911dSCong Ding 		snprintf(str, len, "exists, up");
211ec3911dSCong Ding 	else if (state & CEPH_OSD_EXISTS)
223d14c5d2SYehuda Sadeh 		snprintf(str, len, "exists");
231ec3911dSCong Ding 	else if (state & CEPH_OSD_UP)
241ec3911dSCong Ding 		snprintf(str, len, "up");
251ec3911dSCong Ding 	else
263d14c5d2SYehuda Sadeh 		snprintf(str, len, "doesn't exist");
271ec3911dSCong Ding 
283d14c5d2SYehuda Sadeh 	return str;
293d14c5d2SYehuda Sadeh }
303d14c5d2SYehuda Sadeh 
/* maps */
323d14c5d2SYehuda Sadeh 
/*
 * Return the number of significant bits in @t, i.e. the position of the
 * highest set bit counted from 1 (0 when @t is 0).
 */
static int calc_bits_of(unsigned int t)
{
	int bits;

	for (bits = 0; t != 0; t >>= 1)
		bits++;

	return bits;
}
423d14c5d2SYehuda Sadeh 
433d14c5d2SYehuda Sadeh /*
443d14c5d2SYehuda Sadeh  * the foo_mask is the smallest value 2^n-1 that is >= foo.
453d14c5d2SYehuda Sadeh  */
463d14c5d2SYehuda Sadeh static void calc_pg_masks(struct ceph_pg_pool_info *pi)
473d14c5d2SYehuda Sadeh {
484f6a7e5eSSage Weil 	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
494f6a7e5eSSage Weil 	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
503d14c5d2SYehuda Sadeh }
513d14c5d2SYehuda Sadeh 
/*
 * decode crush map
 */
553d14c5d2SYehuda Sadeh static int crush_decode_uniform_bucket(void **p, void *end,
563d14c5d2SYehuda Sadeh 				       struct crush_bucket_uniform *b)
573d14c5d2SYehuda Sadeh {
583d14c5d2SYehuda Sadeh 	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
593d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
603d14c5d2SYehuda Sadeh 	b->item_weight = ceph_decode_32(p);
613d14c5d2SYehuda Sadeh 	return 0;
623d14c5d2SYehuda Sadeh bad:
633d14c5d2SYehuda Sadeh 	return -EINVAL;
643d14c5d2SYehuda Sadeh }
653d14c5d2SYehuda Sadeh 
663d14c5d2SYehuda Sadeh static int crush_decode_list_bucket(void **p, void *end,
673d14c5d2SYehuda Sadeh 				    struct crush_bucket_list *b)
683d14c5d2SYehuda Sadeh {
693d14c5d2SYehuda Sadeh 	int j;
703d14c5d2SYehuda Sadeh 	dout("crush_decode_list_bucket %p to %p\n", *p, end);
713d14c5d2SYehuda Sadeh 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
723d14c5d2SYehuda Sadeh 	if (b->item_weights == NULL)
733d14c5d2SYehuda Sadeh 		return -ENOMEM;
743d14c5d2SYehuda Sadeh 	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
753d14c5d2SYehuda Sadeh 	if (b->sum_weights == NULL)
763d14c5d2SYehuda Sadeh 		return -ENOMEM;
773d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
783d14c5d2SYehuda Sadeh 	for (j = 0; j < b->h.size; j++) {
793d14c5d2SYehuda Sadeh 		b->item_weights[j] = ceph_decode_32(p);
803d14c5d2SYehuda Sadeh 		b->sum_weights[j] = ceph_decode_32(p);
813d14c5d2SYehuda Sadeh 	}
823d14c5d2SYehuda Sadeh 	return 0;
833d14c5d2SYehuda Sadeh bad:
843d14c5d2SYehuda Sadeh 	return -EINVAL;
853d14c5d2SYehuda Sadeh }
863d14c5d2SYehuda Sadeh 
873d14c5d2SYehuda Sadeh static int crush_decode_tree_bucket(void **p, void *end,
883d14c5d2SYehuda Sadeh 				    struct crush_bucket_tree *b)
893d14c5d2SYehuda Sadeh {
903d14c5d2SYehuda Sadeh 	int j;
913d14c5d2SYehuda Sadeh 	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
9282cd003aSIlya Dryomov 	ceph_decode_8_safe(p, end, b->num_nodes, bad);
933d14c5d2SYehuda Sadeh 	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
943d14c5d2SYehuda Sadeh 	if (b->node_weights == NULL)
953d14c5d2SYehuda Sadeh 		return -ENOMEM;
963d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
973d14c5d2SYehuda Sadeh 	for (j = 0; j < b->num_nodes; j++)
983d14c5d2SYehuda Sadeh 		b->node_weights[j] = ceph_decode_32(p);
993d14c5d2SYehuda Sadeh 	return 0;
1003d14c5d2SYehuda Sadeh bad:
1013d14c5d2SYehuda Sadeh 	return -EINVAL;
1023d14c5d2SYehuda Sadeh }
1033d14c5d2SYehuda Sadeh 
1043d14c5d2SYehuda Sadeh static int crush_decode_straw_bucket(void **p, void *end,
1053d14c5d2SYehuda Sadeh 				     struct crush_bucket_straw *b)
1063d14c5d2SYehuda Sadeh {
1073d14c5d2SYehuda Sadeh 	int j;
1083d14c5d2SYehuda Sadeh 	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
1093d14c5d2SYehuda Sadeh 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
1103d14c5d2SYehuda Sadeh 	if (b->item_weights == NULL)
1113d14c5d2SYehuda Sadeh 		return -ENOMEM;
1123d14c5d2SYehuda Sadeh 	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
1133d14c5d2SYehuda Sadeh 	if (b->straws == NULL)
1143d14c5d2SYehuda Sadeh 		return -ENOMEM;
1153d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
1163d14c5d2SYehuda Sadeh 	for (j = 0; j < b->h.size; j++) {
1173d14c5d2SYehuda Sadeh 		b->item_weights[j] = ceph_decode_32(p);
1183d14c5d2SYehuda Sadeh 		b->straws[j] = ceph_decode_32(p);
1193d14c5d2SYehuda Sadeh 	}
1203d14c5d2SYehuda Sadeh 	return 0;
1213d14c5d2SYehuda Sadeh bad:
1223d14c5d2SYehuda Sadeh 	return -EINVAL;
1233d14c5d2SYehuda Sadeh }
1243d14c5d2SYehuda Sadeh 
125958a2765SIlya Dryomov static int crush_decode_straw2_bucket(void **p, void *end,
126958a2765SIlya Dryomov 				      struct crush_bucket_straw2 *b)
127958a2765SIlya Dryomov {
128958a2765SIlya Dryomov 	int j;
129958a2765SIlya Dryomov 	dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
130958a2765SIlya Dryomov 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
131958a2765SIlya Dryomov 	if (b->item_weights == NULL)
132958a2765SIlya Dryomov 		return -ENOMEM;
133958a2765SIlya Dryomov 	ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
134958a2765SIlya Dryomov 	for (j = 0; j < b->h.size; j++)
135958a2765SIlya Dryomov 		b->item_weights[j] = ceph_decode_32(p);
136958a2765SIlya Dryomov 	return 0;
137958a2765SIlya Dryomov bad:
138958a2765SIlya Dryomov 	return -EINVAL;
139958a2765SIlya Dryomov }
140958a2765SIlya Dryomov 
141546f04efSSage Weil static int skip_name_map(void **p, void *end)
142546f04efSSage Weil {
143546f04efSSage Weil         int len;
144546f04efSSage Weil         ceph_decode_32_safe(p, end, len ,bad);
145546f04efSSage Weil         while (len--) {
146546f04efSSage Weil                 int strlen;
147546f04efSSage Weil                 *p += sizeof(u32);
148546f04efSSage Weil                 ceph_decode_32_safe(p, end, strlen, bad);
149546f04efSSage Weil                 *p += strlen;
150546f04efSSage Weil }
151546f04efSSage Weil         return 0;
152546f04efSSage Weil bad:
153546f04efSSage Weil         return -EINVAL;
154546f04efSSage Weil }
155546f04efSSage Weil 
1563d14c5d2SYehuda Sadeh static struct crush_map *crush_decode(void *pbyval, void *end)
1573d14c5d2SYehuda Sadeh {
1583d14c5d2SYehuda Sadeh 	struct crush_map *c;
1593d14c5d2SYehuda Sadeh 	int err = -EINVAL;
1603d14c5d2SYehuda Sadeh 	int i, j;
1613d14c5d2SYehuda Sadeh 	void **p = &pbyval;
1623d14c5d2SYehuda Sadeh 	void *start = pbyval;
1633d14c5d2SYehuda Sadeh 	u32 magic;
164546f04efSSage Weil 	u32 num_name_maps;
1653d14c5d2SYehuda Sadeh 
1663d14c5d2SYehuda Sadeh 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
1673d14c5d2SYehuda Sadeh 
1683d14c5d2SYehuda Sadeh 	c = kzalloc(sizeof(*c), GFP_NOFS);
1693d14c5d2SYehuda Sadeh 	if (c == NULL)
1703d14c5d2SYehuda Sadeh 		return ERR_PTR(-ENOMEM);
1713d14c5d2SYehuda Sadeh 
172546f04efSSage Weil         /* set tunables to default values */
173546f04efSSage Weil         c->choose_local_tries = 2;
174546f04efSSage Weil         c->choose_local_fallback_tries = 5;
175546f04efSSage Weil         c->choose_total_tries = 19;
1761604f488SJim Schutt 	c->chooseleaf_descend_once = 0;
177546f04efSSage Weil 
1783d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
1793d14c5d2SYehuda Sadeh 	magic = ceph_decode_32(p);
1803d14c5d2SYehuda Sadeh 	if (magic != CRUSH_MAGIC) {
1813d14c5d2SYehuda Sadeh 		pr_err("crush_decode magic %x != current %x\n",
18295c96174SEric Dumazet 		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
1833d14c5d2SYehuda Sadeh 		goto bad;
1843d14c5d2SYehuda Sadeh 	}
1853d14c5d2SYehuda Sadeh 	c->max_buckets = ceph_decode_32(p);
1863d14c5d2SYehuda Sadeh 	c->max_rules = ceph_decode_32(p);
1873d14c5d2SYehuda Sadeh 	c->max_devices = ceph_decode_32(p);
1883d14c5d2SYehuda Sadeh 
1893d14c5d2SYehuda Sadeh 	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
1903d14c5d2SYehuda Sadeh 	if (c->buckets == NULL)
1913d14c5d2SYehuda Sadeh 		goto badmem;
1923d14c5d2SYehuda Sadeh 	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
1933d14c5d2SYehuda Sadeh 	if (c->rules == NULL)
1943d14c5d2SYehuda Sadeh 		goto badmem;
1953d14c5d2SYehuda Sadeh 
1963d14c5d2SYehuda Sadeh 	/* buckets */
1973d14c5d2SYehuda Sadeh 	for (i = 0; i < c->max_buckets; i++) {
1983d14c5d2SYehuda Sadeh 		int size = 0;
1993d14c5d2SYehuda Sadeh 		u32 alg;
2003d14c5d2SYehuda Sadeh 		struct crush_bucket *b;
2013d14c5d2SYehuda Sadeh 
2023d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, alg, bad);
2033d14c5d2SYehuda Sadeh 		if (alg == 0) {
2043d14c5d2SYehuda Sadeh 			c->buckets[i] = NULL;
2053d14c5d2SYehuda Sadeh 			continue;
2063d14c5d2SYehuda Sadeh 		}
2073d14c5d2SYehuda Sadeh 		dout("crush_decode bucket %d off %x %p to %p\n",
2083d14c5d2SYehuda Sadeh 		     i, (int)(*p-start), *p, end);
2093d14c5d2SYehuda Sadeh 
2103d14c5d2SYehuda Sadeh 		switch (alg) {
2113d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_UNIFORM:
2123d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_uniform);
2133d14c5d2SYehuda Sadeh 			break;
2143d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_LIST:
2153d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_list);
2163d14c5d2SYehuda Sadeh 			break;
2173d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_TREE:
2183d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_tree);
2193d14c5d2SYehuda Sadeh 			break;
2203d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_STRAW:
2213d14c5d2SYehuda Sadeh 			size = sizeof(struct crush_bucket_straw);
2223d14c5d2SYehuda Sadeh 			break;
223958a2765SIlya Dryomov 		case CRUSH_BUCKET_STRAW2:
224958a2765SIlya Dryomov 			size = sizeof(struct crush_bucket_straw2);
225958a2765SIlya Dryomov 			break;
2263d14c5d2SYehuda Sadeh 		default:
2273d14c5d2SYehuda Sadeh 			err = -EINVAL;
2283d14c5d2SYehuda Sadeh 			goto bad;
2293d14c5d2SYehuda Sadeh 		}
2303d14c5d2SYehuda Sadeh 		BUG_ON(size == 0);
2313d14c5d2SYehuda Sadeh 		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
2323d14c5d2SYehuda Sadeh 		if (b == NULL)
2333d14c5d2SYehuda Sadeh 			goto badmem;
2343d14c5d2SYehuda Sadeh 
2353d14c5d2SYehuda Sadeh 		ceph_decode_need(p, end, 4*sizeof(u32), bad);
2363d14c5d2SYehuda Sadeh 		b->id = ceph_decode_32(p);
2373d14c5d2SYehuda Sadeh 		b->type = ceph_decode_16(p);
2383d14c5d2SYehuda Sadeh 		b->alg = ceph_decode_8(p);
2393d14c5d2SYehuda Sadeh 		b->hash = ceph_decode_8(p);
2403d14c5d2SYehuda Sadeh 		b->weight = ceph_decode_32(p);
2413d14c5d2SYehuda Sadeh 		b->size = ceph_decode_32(p);
2423d14c5d2SYehuda Sadeh 
2433d14c5d2SYehuda Sadeh 		dout("crush_decode bucket size %d off %x %p to %p\n",
2443d14c5d2SYehuda Sadeh 		     b->size, (int)(*p-start), *p, end);
2453d14c5d2SYehuda Sadeh 
2463d14c5d2SYehuda Sadeh 		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
2473d14c5d2SYehuda Sadeh 		if (b->items == NULL)
2483d14c5d2SYehuda Sadeh 			goto badmem;
2493d14c5d2SYehuda Sadeh 		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
2503d14c5d2SYehuda Sadeh 		if (b->perm == NULL)
2513d14c5d2SYehuda Sadeh 			goto badmem;
2523d14c5d2SYehuda Sadeh 		b->perm_n = 0;
2533d14c5d2SYehuda Sadeh 
2543d14c5d2SYehuda Sadeh 		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
2553d14c5d2SYehuda Sadeh 		for (j = 0; j < b->size; j++)
2563d14c5d2SYehuda Sadeh 			b->items[j] = ceph_decode_32(p);
2573d14c5d2SYehuda Sadeh 
2583d14c5d2SYehuda Sadeh 		switch (b->alg) {
2593d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_UNIFORM:
2603d14c5d2SYehuda Sadeh 			err = crush_decode_uniform_bucket(p, end,
2613d14c5d2SYehuda Sadeh 				  (struct crush_bucket_uniform *)b);
2623d14c5d2SYehuda Sadeh 			if (err < 0)
2633d14c5d2SYehuda Sadeh 				goto bad;
2643d14c5d2SYehuda Sadeh 			break;
2653d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_LIST:
2663d14c5d2SYehuda Sadeh 			err = crush_decode_list_bucket(p, end,
2673d14c5d2SYehuda Sadeh 			       (struct crush_bucket_list *)b);
2683d14c5d2SYehuda Sadeh 			if (err < 0)
2693d14c5d2SYehuda Sadeh 				goto bad;
2703d14c5d2SYehuda Sadeh 			break;
2713d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_TREE:
2723d14c5d2SYehuda Sadeh 			err = crush_decode_tree_bucket(p, end,
2733d14c5d2SYehuda Sadeh 				(struct crush_bucket_tree *)b);
2743d14c5d2SYehuda Sadeh 			if (err < 0)
2753d14c5d2SYehuda Sadeh 				goto bad;
2763d14c5d2SYehuda Sadeh 			break;
2773d14c5d2SYehuda Sadeh 		case CRUSH_BUCKET_STRAW:
2783d14c5d2SYehuda Sadeh 			err = crush_decode_straw_bucket(p, end,
2793d14c5d2SYehuda Sadeh 				(struct crush_bucket_straw *)b);
2803d14c5d2SYehuda Sadeh 			if (err < 0)
2813d14c5d2SYehuda Sadeh 				goto bad;
2823d14c5d2SYehuda Sadeh 			break;
283958a2765SIlya Dryomov 		case CRUSH_BUCKET_STRAW2:
284958a2765SIlya Dryomov 			err = crush_decode_straw2_bucket(p, end,
285958a2765SIlya Dryomov 				(struct crush_bucket_straw2 *)b);
286958a2765SIlya Dryomov 			if (err < 0)
287958a2765SIlya Dryomov 				goto bad;
288958a2765SIlya Dryomov 			break;
2893d14c5d2SYehuda Sadeh 		}
2903d14c5d2SYehuda Sadeh 	}
2913d14c5d2SYehuda Sadeh 
2923d14c5d2SYehuda Sadeh 	/* rules */
2933d14c5d2SYehuda Sadeh 	dout("rule vec is %p\n", c->rules);
2943d14c5d2SYehuda Sadeh 	for (i = 0; i < c->max_rules; i++) {
2953d14c5d2SYehuda Sadeh 		u32 yes;
2963d14c5d2SYehuda Sadeh 		struct crush_rule *r;
2973d14c5d2SYehuda Sadeh 
2983d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, yes, bad);
2993d14c5d2SYehuda Sadeh 		if (!yes) {
3003d14c5d2SYehuda Sadeh 			dout("crush_decode NO rule %d off %x %p to %p\n",
3013d14c5d2SYehuda Sadeh 			     i, (int)(*p-start), *p, end);
3023d14c5d2SYehuda Sadeh 			c->rules[i] = NULL;
3033d14c5d2SYehuda Sadeh 			continue;
3043d14c5d2SYehuda Sadeh 		}
3053d14c5d2SYehuda Sadeh 
3063d14c5d2SYehuda Sadeh 		dout("crush_decode rule %d off %x %p to %p\n",
3073d14c5d2SYehuda Sadeh 		     i, (int)(*p-start), *p, end);
3083d14c5d2SYehuda Sadeh 
3093d14c5d2SYehuda Sadeh 		/* len */
3103d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, yes, bad);
3113d14c5d2SYehuda Sadeh #if BITS_PER_LONG == 32
3123d14c5d2SYehuda Sadeh 		err = -EINVAL;
31364486697SXi Wang 		if (yes > (ULONG_MAX - sizeof(*r))
31464486697SXi Wang 			  / sizeof(struct crush_rule_step))
3153d14c5d2SYehuda Sadeh 			goto bad;
3163d14c5d2SYehuda Sadeh #endif
3173d14c5d2SYehuda Sadeh 		r = c->rules[i] = kmalloc(sizeof(*r) +
3183d14c5d2SYehuda Sadeh 					  yes*sizeof(struct crush_rule_step),
3193d14c5d2SYehuda Sadeh 					  GFP_NOFS);
3203d14c5d2SYehuda Sadeh 		if (r == NULL)
3213d14c5d2SYehuda Sadeh 			goto badmem;
3223d14c5d2SYehuda Sadeh 		dout(" rule %d is at %p\n", i, r);
3233d14c5d2SYehuda Sadeh 		r->len = yes;
3243d14c5d2SYehuda Sadeh 		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
3253d14c5d2SYehuda Sadeh 		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
3263d14c5d2SYehuda Sadeh 		for (j = 0; j < r->len; j++) {
3273d14c5d2SYehuda Sadeh 			r->steps[j].op = ceph_decode_32(p);
3283d14c5d2SYehuda Sadeh 			r->steps[j].arg1 = ceph_decode_32(p);
3293d14c5d2SYehuda Sadeh 			r->steps[j].arg2 = ceph_decode_32(p);
3303d14c5d2SYehuda Sadeh 		}
3313d14c5d2SYehuda Sadeh 	}
3323d14c5d2SYehuda Sadeh 
3333d14c5d2SYehuda Sadeh 	/* ignore trailing name maps. */
334546f04efSSage Weil         for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
335546f04efSSage Weil                 err = skip_name_map(p, end);
336546f04efSSage Weil                 if (err < 0)
337546f04efSSage Weil                         goto done;
338546f04efSSage Weil         }
3393d14c5d2SYehuda Sadeh 
340546f04efSSage Weil         /* tunables */
341546f04efSSage Weil         ceph_decode_need(p, end, 3*sizeof(u32), done);
342546f04efSSage Weil         c->choose_local_tries = ceph_decode_32(p);
343546f04efSSage Weil         c->choose_local_fallback_tries =  ceph_decode_32(p);
344546f04efSSage Weil         c->choose_total_tries = ceph_decode_32(p);
345b9b519b7SIlya Dryomov         dout("crush decode tunable choose_local_tries = %d\n",
346546f04efSSage Weil              c->choose_local_tries);
347b9b519b7SIlya Dryomov         dout("crush decode tunable choose_local_fallback_tries = %d\n",
348546f04efSSage Weil              c->choose_local_fallback_tries);
349b9b519b7SIlya Dryomov         dout("crush decode tunable choose_total_tries = %d\n",
350546f04efSSage Weil              c->choose_total_tries);
351546f04efSSage Weil 
3521604f488SJim Schutt 	ceph_decode_need(p, end, sizeof(u32), done);
3531604f488SJim Schutt 	c->chooseleaf_descend_once = ceph_decode_32(p);
354b9b519b7SIlya Dryomov 	dout("crush decode tunable chooseleaf_descend_once = %d\n",
3551604f488SJim Schutt 	     c->chooseleaf_descend_once);
3561604f488SJim Schutt 
357f140662fSIlya Dryomov 	ceph_decode_need(p, end, sizeof(u8), done);
358f140662fSIlya Dryomov 	c->chooseleaf_vary_r = ceph_decode_8(p);
359b9b519b7SIlya Dryomov 	dout("crush decode tunable chooseleaf_vary_r = %d\n",
360f140662fSIlya Dryomov 	     c->chooseleaf_vary_r);
361f140662fSIlya Dryomov 
362b9b519b7SIlya Dryomov 	/* skip straw_calc_version, allowed_bucket_algs */
363b9b519b7SIlya Dryomov 	ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
364b9b519b7SIlya Dryomov 	*p += sizeof(u8) + sizeof(u32);
365b9b519b7SIlya Dryomov 
366b9b519b7SIlya Dryomov 	ceph_decode_need(p, end, sizeof(u8), done);
367b9b519b7SIlya Dryomov 	c->chooseleaf_stable = ceph_decode_8(p);
368b9b519b7SIlya Dryomov 	dout("crush decode tunable chooseleaf_stable = %d\n",
369b9b519b7SIlya Dryomov 	     c->chooseleaf_stable);
370b9b519b7SIlya Dryomov 
371546f04efSSage Weil done:
3723d14c5d2SYehuda Sadeh 	dout("crush_decode success\n");
3733d14c5d2SYehuda Sadeh 	return c;
3743d14c5d2SYehuda Sadeh 
3753d14c5d2SYehuda Sadeh badmem:
3763d14c5d2SYehuda Sadeh 	err = -ENOMEM;
3773d14c5d2SYehuda Sadeh bad:
3783d14c5d2SYehuda Sadeh 	dout("crush_decode fail %d\n", err);
3793d14c5d2SYehuda Sadeh 	crush_destroy(c);
3803d14c5d2SYehuda Sadeh 	return ERR_PTR(err);
3813d14c5d2SYehuda Sadeh }
3823d14c5d2SYehuda Sadeh 
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds) and primary_temp (explicit primary setting)
 */
3875b191d99SSage Weil static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
3883d14c5d2SYehuda Sadeh {
3895b191d99SSage Weil 	if (l.pool < r.pool)
3903d14c5d2SYehuda Sadeh 		return -1;
3915b191d99SSage Weil 	if (l.pool > r.pool)
3925b191d99SSage Weil 		return 1;
3935b191d99SSage Weil 	if (l.seed < r.seed)
3945b191d99SSage Weil 		return -1;
3955b191d99SSage Weil 	if (l.seed > r.seed)
3963d14c5d2SYehuda Sadeh 		return 1;
3973d14c5d2SYehuda Sadeh 	return 0;
3983d14c5d2SYehuda Sadeh }
3993d14c5d2SYehuda Sadeh 
4003d14c5d2SYehuda Sadeh static int __insert_pg_mapping(struct ceph_pg_mapping *new,
4013d14c5d2SYehuda Sadeh 			       struct rb_root *root)
4023d14c5d2SYehuda Sadeh {
4033d14c5d2SYehuda Sadeh 	struct rb_node **p = &root->rb_node;
4043d14c5d2SYehuda Sadeh 	struct rb_node *parent = NULL;
4053d14c5d2SYehuda Sadeh 	struct ceph_pg_mapping *pg = NULL;
4063d14c5d2SYehuda Sadeh 	int c;
4073d14c5d2SYehuda Sadeh 
4088adc8b3dSSage Weil 	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
4093d14c5d2SYehuda Sadeh 	while (*p) {
4103d14c5d2SYehuda Sadeh 		parent = *p;
4113d14c5d2SYehuda Sadeh 		pg = rb_entry(parent, struct ceph_pg_mapping, node);
4123d14c5d2SYehuda Sadeh 		c = pgid_cmp(new->pgid, pg->pgid);
4133d14c5d2SYehuda Sadeh 		if (c < 0)
4143d14c5d2SYehuda Sadeh 			p = &(*p)->rb_left;
4153d14c5d2SYehuda Sadeh 		else if (c > 0)
4163d14c5d2SYehuda Sadeh 			p = &(*p)->rb_right;
4173d14c5d2SYehuda Sadeh 		else
4183d14c5d2SYehuda Sadeh 			return -EEXIST;
4193d14c5d2SYehuda Sadeh 	}
4203d14c5d2SYehuda Sadeh 
4213d14c5d2SYehuda Sadeh 	rb_link_node(&new->node, parent, p);
4223d14c5d2SYehuda Sadeh 	rb_insert_color(&new->node, root);
4233d14c5d2SYehuda Sadeh 	return 0;
4243d14c5d2SYehuda Sadeh }
4253d14c5d2SYehuda Sadeh 
4263d14c5d2SYehuda Sadeh static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
4275b191d99SSage Weil 						   struct ceph_pg pgid)
4283d14c5d2SYehuda Sadeh {
4293d14c5d2SYehuda Sadeh 	struct rb_node *n = root->rb_node;
4303d14c5d2SYehuda Sadeh 	struct ceph_pg_mapping *pg;
4313d14c5d2SYehuda Sadeh 	int c;
4323d14c5d2SYehuda Sadeh 
4333d14c5d2SYehuda Sadeh 	while (n) {
4343d14c5d2SYehuda Sadeh 		pg = rb_entry(n, struct ceph_pg_mapping, node);
4353d14c5d2SYehuda Sadeh 		c = pgid_cmp(pgid, pg->pgid);
4368adc8b3dSSage Weil 		if (c < 0) {
4373d14c5d2SYehuda Sadeh 			n = n->rb_left;
4388adc8b3dSSage Weil 		} else if (c > 0) {
4393d14c5d2SYehuda Sadeh 			n = n->rb_right;
4408adc8b3dSSage Weil 		} else {
4415b191d99SSage Weil 			dout("__lookup_pg_mapping %lld.%x got %p\n",
4425b191d99SSage Weil 			     pgid.pool, pgid.seed, pg);
4433d14c5d2SYehuda Sadeh 			return pg;
4443d14c5d2SYehuda Sadeh 		}
4458adc8b3dSSage Weil 	}
4463d14c5d2SYehuda Sadeh 	return NULL;
4473d14c5d2SYehuda Sadeh }
4483d14c5d2SYehuda Sadeh 
4495b191d99SSage Weil static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
4508adc8b3dSSage Weil {
4518adc8b3dSSage Weil 	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
4528adc8b3dSSage Weil 
4538adc8b3dSSage Weil 	if (pg) {
4545b191d99SSage Weil 		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
4555b191d99SSage Weil 		     pg);
4568adc8b3dSSage Weil 		rb_erase(&pg->node, root);
4578adc8b3dSSage Weil 		kfree(pg);
4588adc8b3dSSage Weil 		return 0;
4598adc8b3dSSage Weil 	}
4605b191d99SSage Weil 	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
4618adc8b3dSSage Weil 	return -ENOENT;
4628adc8b3dSSage Weil }
4638adc8b3dSSage Weil 
/*
 * rbtree of pg pool info
 */
4673d14c5d2SYehuda Sadeh static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
4683d14c5d2SYehuda Sadeh {
4693d14c5d2SYehuda Sadeh 	struct rb_node **p = &root->rb_node;
4703d14c5d2SYehuda Sadeh 	struct rb_node *parent = NULL;
4713d14c5d2SYehuda Sadeh 	struct ceph_pg_pool_info *pi = NULL;
4723d14c5d2SYehuda Sadeh 
4733d14c5d2SYehuda Sadeh 	while (*p) {
4743d14c5d2SYehuda Sadeh 		parent = *p;
4753d14c5d2SYehuda Sadeh 		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
4763d14c5d2SYehuda Sadeh 		if (new->id < pi->id)
4773d14c5d2SYehuda Sadeh 			p = &(*p)->rb_left;
4783d14c5d2SYehuda Sadeh 		else if (new->id > pi->id)
4793d14c5d2SYehuda Sadeh 			p = &(*p)->rb_right;
4803d14c5d2SYehuda Sadeh 		else
4813d14c5d2SYehuda Sadeh 			return -EEXIST;
4823d14c5d2SYehuda Sadeh 	}
4833d14c5d2SYehuda Sadeh 
4843d14c5d2SYehuda Sadeh 	rb_link_node(&new->node, parent, p);
4853d14c5d2SYehuda Sadeh 	rb_insert_color(&new->node, root);
4863d14c5d2SYehuda Sadeh 	return 0;
4873d14c5d2SYehuda Sadeh }
4883d14c5d2SYehuda Sadeh 
4894f6a7e5eSSage Weil static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
4903d14c5d2SYehuda Sadeh {
4913d14c5d2SYehuda Sadeh 	struct ceph_pg_pool_info *pi;
4923d14c5d2SYehuda Sadeh 	struct rb_node *n = root->rb_node;
4933d14c5d2SYehuda Sadeh 
4943d14c5d2SYehuda Sadeh 	while (n) {
4953d14c5d2SYehuda Sadeh 		pi = rb_entry(n, struct ceph_pg_pool_info, node);
4963d14c5d2SYehuda Sadeh 		if (id < pi->id)
4973d14c5d2SYehuda Sadeh 			n = n->rb_left;
4983d14c5d2SYehuda Sadeh 		else if (id > pi->id)
4993d14c5d2SYehuda Sadeh 			n = n->rb_right;
5003d14c5d2SYehuda Sadeh 		else
5013d14c5d2SYehuda Sadeh 			return pi;
5023d14c5d2SYehuda Sadeh 	}
5033d14c5d2SYehuda Sadeh 	return NULL;
5043d14c5d2SYehuda Sadeh }
5053d14c5d2SYehuda Sadeh 
506ce7f6a27SIlya Dryomov struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
507ce7f6a27SIlya Dryomov {
508ce7f6a27SIlya Dryomov 	return __lookup_pg_pool(&map->pg_pools, id);
509ce7f6a27SIlya Dryomov }
510ce7f6a27SIlya Dryomov 
51172afc71fSAlex Elder const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
51272afc71fSAlex Elder {
51372afc71fSAlex Elder 	struct ceph_pg_pool_info *pi;
51472afc71fSAlex Elder 
51572afc71fSAlex Elder 	if (id == CEPH_NOPOOL)
51672afc71fSAlex Elder 		return NULL;
51772afc71fSAlex Elder 
51872afc71fSAlex Elder 	if (WARN_ON_ONCE(id > (u64) INT_MAX))
51972afc71fSAlex Elder 		return NULL;
52072afc71fSAlex Elder 
52172afc71fSAlex Elder 	pi = __lookup_pg_pool(&map->pg_pools, (int) id);
52272afc71fSAlex Elder 
52372afc71fSAlex Elder 	return pi ? pi->name : NULL;
52472afc71fSAlex Elder }
52572afc71fSAlex Elder EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
52672afc71fSAlex Elder 
5273d14c5d2SYehuda Sadeh int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
5283d14c5d2SYehuda Sadeh {
5293d14c5d2SYehuda Sadeh 	struct rb_node *rbp;
5303d14c5d2SYehuda Sadeh 
5313d14c5d2SYehuda Sadeh 	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
5323d14c5d2SYehuda Sadeh 		struct ceph_pg_pool_info *pi =
5333d14c5d2SYehuda Sadeh 			rb_entry(rbp, struct ceph_pg_pool_info, node);
5343d14c5d2SYehuda Sadeh 		if (pi->name && strcmp(pi->name, name) == 0)
5353d14c5d2SYehuda Sadeh 			return pi->id;
5363d14c5d2SYehuda Sadeh 	}
5373d14c5d2SYehuda Sadeh 	return -ENOENT;
5383d14c5d2SYehuda Sadeh }
5393d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_pg_poolid_by_name);
5403d14c5d2SYehuda Sadeh 
5413d14c5d2SYehuda Sadeh static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
5423d14c5d2SYehuda Sadeh {
5433d14c5d2SYehuda Sadeh 	rb_erase(&pi->node, root);
5443d14c5d2SYehuda Sadeh 	kfree(pi->name);
5453d14c5d2SYehuda Sadeh 	kfree(pi);
5463d14c5d2SYehuda Sadeh }
5473d14c5d2SYehuda Sadeh 
5480f70c7eeSIlya Dryomov static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
5493d14c5d2SYehuda Sadeh {
5504f6a7e5eSSage Weil 	u8 ev, cv;
5514f6a7e5eSSage Weil 	unsigned len, num;
5524f6a7e5eSSage Weil 	void *pool_end;
5533d14c5d2SYehuda Sadeh 
5544f6a7e5eSSage Weil 	ceph_decode_need(p, end, 2 + 4, bad);
5554f6a7e5eSSage Weil 	ev = ceph_decode_8(p);  /* encoding version */
5564f6a7e5eSSage Weil 	cv = ceph_decode_8(p); /* compat version */
5574f6a7e5eSSage Weil 	if (ev < 5) {
558b9a67899SJoe Perches 		pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
5594f6a7e5eSSage Weil 		return -EINVAL;
5604f6a7e5eSSage Weil 	}
56117a13e40SIlya Dryomov 	if (cv > 9) {
562b9a67899SJoe Perches 		pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
5634f6a7e5eSSage Weil 		return -EINVAL;
5644f6a7e5eSSage Weil 	}
5654f6a7e5eSSage Weil 	len = ceph_decode_32(p);
5664f6a7e5eSSage Weil 	ceph_decode_need(p, end, len, bad);
5674f6a7e5eSSage Weil 	pool_end = *p + len;
5683d14c5d2SYehuda Sadeh 
5694f6a7e5eSSage Weil 	pi->type = ceph_decode_8(p);
5704f6a7e5eSSage Weil 	pi->size = ceph_decode_8(p);
5714f6a7e5eSSage Weil 	pi->crush_ruleset = ceph_decode_8(p);
5724f6a7e5eSSage Weil 	pi->object_hash = ceph_decode_8(p);
5734f6a7e5eSSage Weil 
5744f6a7e5eSSage Weil 	pi->pg_num = ceph_decode_32(p);
5754f6a7e5eSSage Weil 	pi->pgp_num = ceph_decode_32(p);
5764f6a7e5eSSage Weil 
5774f6a7e5eSSage Weil 	*p += 4 + 4;  /* skip lpg* */
5784f6a7e5eSSage Weil 	*p += 4;      /* skip last_change */
5794f6a7e5eSSage Weil 	*p += 8 + 4;  /* skip snap_seq, snap_epoch */
5804f6a7e5eSSage Weil 
5814f6a7e5eSSage Weil 	/* skip snaps */
5824f6a7e5eSSage Weil 	num = ceph_decode_32(p);
5834f6a7e5eSSage Weil 	while (num--) {
5844f6a7e5eSSage Weil 		*p += 8;  /* snapid key */
5854f6a7e5eSSage Weil 		*p += 1 + 1; /* versions */
5864f6a7e5eSSage Weil 		len = ceph_decode_32(p);
5874f6a7e5eSSage Weil 		*p += len;
5883d14c5d2SYehuda Sadeh 	}
5893d14c5d2SYehuda Sadeh 
59017a13e40SIlya Dryomov 	/* skip removed_snaps */
5914f6a7e5eSSage Weil 	num = ceph_decode_32(p);
5924f6a7e5eSSage Weil 	*p += num * (8 + 8);
5934f6a7e5eSSage Weil 
5944f6a7e5eSSage Weil 	*p += 8;  /* skip auid */
5954f6a7e5eSSage Weil 	pi->flags = ceph_decode_64(p);
59617a13e40SIlya Dryomov 	*p += 4;  /* skip crash_replay_interval */
59717a13e40SIlya Dryomov 
59817a13e40SIlya Dryomov 	if (ev >= 7)
59917a13e40SIlya Dryomov 		*p += 1;  /* skip min_size */
60017a13e40SIlya Dryomov 
60117a13e40SIlya Dryomov 	if (ev >= 8)
60217a13e40SIlya Dryomov 		*p += 8 + 8;  /* skip quota_max_* */
60317a13e40SIlya Dryomov 
60417a13e40SIlya Dryomov 	if (ev >= 9) {
60517a13e40SIlya Dryomov 		/* skip tiers */
60617a13e40SIlya Dryomov 		num = ceph_decode_32(p);
60717a13e40SIlya Dryomov 		*p += num * 8;
60817a13e40SIlya Dryomov 
60917a13e40SIlya Dryomov 		*p += 8;  /* skip tier_of */
61017a13e40SIlya Dryomov 		*p += 1;  /* skip cache_mode */
61117a13e40SIlya Dryomov 
61217a13e40SIlya Dryomov 		pi->read_tier = ceph_decode_64(p);
61317a13e40SIlya Dryomov 		pi->write_tier = ceph_decode_64(p);
61417a13e40SIlya Dryomov 	} else {
61517a13e40SIlya Dryomov 		pi->read_tier = -1;
61617a13e40SIlya Dryomov 		pi->write_tier = -1;
61717a13e40SIlya Dryomov 	}
6184f6a7e5eSSage Weil 
6194f6a7e5eSSage Weil 	/* ignore the rest */
6204f6a7e5eSSage Weil 
6214f6a7e5eSSage Weil 	*p = pool_end;
6224f6a7e5eSSage Weil 	calc_pg_masks(pi);
6233d14c5d2SYehuda Sadeh 	return 0;
6243d14c5d2SYehuda Sadeh 
6253d14c5d2SYehuda Sadeh bad:
6263d14c5d2SYehuda Sadeh 	return -EINVAL;
6273d14c5d2SYehuda Sadeh }
6283d14c5d2SYehuda Sadeh 
6290f70c7eeSIlya Dryomov static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
6303d14c5d2SYehuda Sadeh {
6313d14c5d2SYehuda Sadeh 	struct ceph_pg_pool_info *pi;
6324f6a7e5eSSage Weil 	u32 num, len;
6334f6a7e5eSSage Weil 	u64 pool;
6343d14c5d2SYehuda Sadeh 
6353d14c5d2SYehuda Sadeh 	ceph_decode_32_safe(p, end, num, bad);
6363d14c5d2SYehuda Sadeh 	dout(" %d pool names\n", num);
6373d14c5d2SYehuda Sadeh 	while (num--) {
6384f6a7e5eSSage Weil 		ceph_decode_64_safe(p, end, pool, bad);
6393d14c5d2SYehuda Sadeh 		ceph_decode_32_safe(p, end, len, bad);
6404f6a7e5eSSage Weil 		dout("  pool %llu len %d\n", pool, len);
641ad3b904cSXi Wang 		ceph_decode_need(p, end, len, bad);
6423d14c5d2SYehuda Sadeh 		pi = __lookup_pg_pool(&map->pg_pools, pool);
6433d14c5d2SYehuda Sadeh 		if (pi) {
644ad3b904cSXi Wang 			char *name = kstrndup(*p, len, GFP_NOFS);
645ad3b904cSXi Wang 
646ad3b904cSXi Wang 			if (!name)
647ad3b904cSXi Wang 				return -ENOMEM;
6483d14c5d2SYehuda Sadeh 			kfree(pi->name);
649ad3b904cSXi Wang 			pi->name = name;
6503d14c5d2SYehuda Sadeh 			dout("  name is %s\n", pi->name);
6513d14c5d2SYehuda Sadeh 		}
6523d14c5d2SYehuda Sadeh 		*p += len;
6533d14c5d2SYehuda Sadeh 	}
6543d14c5d2SYehuda Sadeh 	return 0;
6553d14c5d2SYehuda Sadeh 
6563d14c5d2SYehuda Sadeh bad:
6573d14c5d2SYehuda Sadeh 	return -EINVAL;
6583d14c5d2SYehuda Sadeh }
6593d14c5d2SYehuda Sadeh 
/*
 * osd map
 */
6633d14c5d2SYehuda Sadeh void ceph_osdmap_destroy(struct ceph_osdmap *map)
6643d14c5d2SYehuda Sadeh {
6653d14c5d2SYehuda Sadeh 	dout("osdmap_destroy %p\n", map);
6663d14c5d2SYehuda Sadeh 	if (map->crush)
6673d14c5d2SYehuda Sadeh 		crush_destroy(map->crush);
6683d14c5d2SYehuda Sadeh 	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
6693d14c5d2SYehuda Sadeh 		struct ceph_pg_mapping *pg =
6703d14c5d2SYehuda Sadeh 			rb_entry(rb_first(&map->pg_temp),
6713d14c5d2SYehuda Sadeh 				 struct ceph_pg_mapping, node);
6723d14c5d2SYehuda Sadeh 		rb_erase(&pg->node, &map->pg_temp);
6733d14c5d2SYehuda Sadeh 		kfree(pg);
6743d14c5d2SYehuda Sadeh 	}
6759686f94cSIlya Dryomov 	while (!RB_EMPTY_ROOT(&map->primary_temp)) {
6769686f94cSIlya Dryomov 		struct ceph_pg_mapping *pg =
6779686f94cSIlya Dryomov 			rb_entry(rb_first(&map->primary_temp),
6789686f94cSIlya Dryomov 				 struct ceph_pg_mapping, node);
6799686f94cSIlya Dryomov 		rb_erase(&pg->node, &map->primary_temp);
6809686f94cSIlya Dryomov 		kfree(pg);
6819686f94cSIlya Dryomov 	}
6823d14c5d2SYehuda Sadeh 	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
6833d14c5d2SYehuda Sadeh 		struct ceph_pg_pool_info *pi =
6843d14c5d2SYehuda Sadeh 			rb_entry(rb_first(&map->pg_pools),
6853d14c5d2SYehuda Sadeh 				 struct ceph_pg_pool_info, node);
6863d14c5d2SYehuda Sadeh 		__remove_pg_pool(&map->pg_pools, pi);
6873d14c5d2SYehuda Sadeh 	}
6883d14c5d2SYehuda Sadeh 	kfree(map->osd_state);
6893d14c5d2SYehuda Sadeh 	kfree(map->osd_weight);
6903d14c5d2SYehuda Sadeh 	kfree(map->osd_addr);
6912cfa34f2SIlya Dryomov 	kfree(map->osd_primary_affinity);
6923d14c5d2SYehuda Sadeh 	kfree(map);
6933d14c5d2SYehuda Sadeh }
6943d14c5d2SYehuda Sadeh 
6953d14c5d2SYehuda Sadeh /*
6964d60351fSIlya Dryomov  * Adjust max_osd value, (re)allocate arrays.
6974d60351fSIlya Dryomov  *
6984d60351fSIlya Dryomov  * The new elements are properly initialized.
6993d14c5d2SYehuda Sadeh  */
7003d14c5d2SYehuda Sadeh static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
7013d14c5d2SYehuda Sadeh {
7023d14c5d2SYehuda Sadeh 	u8 *state;
7033d14c5d2SYehuda Sadeh 	u32 *weight;
7044d60351fSIlya Dryomov 	struct ceph_entity_addr *addr;
7054d60351fSIlya Dryomov 	int i;
7063d14c5d2SYehuda Sadeh 
7074d60351fSIlya Dryomov 	state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
708589506f1SLi RongQing 	if (!state)
7093d14c5d2SYehuda Sadeh 		return -ENOMEM;
710589506f1SLi RongQing 	map->osd_state = state;
711589506f1SLi RongQing 
712589506f1SLi RongQing 	weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
713589506f1SLi RongQing 	if (!weight)
714589506f1SLi RongQing 		return -ENOMEM;
715589506f1SLi RongQing 	map->osd_weight = weight;
716589506f1SLi RongQing 
717589506f1SLi RongQing 	addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
718589506f1SLi RongQing 	if (!addr)
719589506f1SLi RongQing 		return -ENOMEM;
720589506f1SLi RongQing 	map->osd_addr = addr;
7213d14c5d2SYehuda Sadeh 
7224d60351fSIlya Dryomov 	for (i = map->max_osd; i < max; i++) {
723589506f1SLi RongQing 		map->osd_state[i] = 0;
724589506f1SLi RongQing 		map->osd_weight[i] = CEPH_OSD_OUT;
725589506f1SLi RongQing 		memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
7263d14c5d2SYehuda Sadeh 	}
7273d14c5d2SYehuda Sadeh 
7282cfa34f2SIlya Dryomov 	if (map->osd_primary_affinity) {
7292cfa34f2SIlya Dryomov 		u32 *affinity;
7302cfa34f2SIlya Dryomov 
7312cfa34f2SIlya Dryomov 		affinity = krealloc(map->osd_primary_affinity,
7322cfa34f2SIlya Dryomov 				    max*sizeof(*affinity), GFP_NOFS);
7332cfa34f2SIlya Dryomov 		if (!affinity)
7342cfa34f2SIlya Dryomov 			return -ENOMEM;
735589506f1SLi RongQing 		map->osd_primary_affinity = affinity;
7362cfa34f2SIlya Dryomov 
7372cfa34f2SIlya Dryomov 		for (i = map->max_osd; i < max; i++)
738589506f1SLi RongQing 			map->osd_primary_affinity[i] =
739589506f1SLi RongQing 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
7402cfa34f2SIlya Dryomov 	}
7412cfa34f2SIlya Dryomov 
7423d14c5d2SYehuda Sadeh 	map->max_osd = max;
7434d60351fSIlya Dryomov 
7443d14c5d2SYehuda Sadeh 	return 0;
7453d14c5d2SYehuda Sadeh }
7463d14c5d2SYehuda Sadeh 
747ec7af972SIlya Dryomov #define OSDMAP_WRAPPER_COMPAT_VER	7
748ec7af972SIlya Dryomov #define OSDMAP_CLIENT_DATA_COMPAT_VER	1
749ec7af972SIlya Dryomov 
750ec7af972SIlya Dryomov /*
751ec7af972SIlya Dryomov  * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps,
752ec7af972SIlya Dryomov  * to struct_v of the client_data section for new (v7 and above)
753ec7af972SIlya Dryomov  * osdmaps.
754ec7af972SIlya Dryomov  */
755ec7af972SIlya Dryomov static int get_osdmap_client_data_v(void **p, void *end,
756ec7af972SIlya Dryomov 				    const char *prefix, u8 *v)
757ec7af972SIlya Dryomov {
758ec7af972SIlya Dryomov 	u8 struct_v;
759ec7af972SIlya Dryomov 
760ec7af972SIlya Dryomov 	ceph_decode_8_safe(p, end, struct_v, e_inval);
761ec7af972SIlya Dryomov 	if (struct_v >= 7) {
762ec7af972SIlya Dryomov 		u8 struct_compat;
763ec7af972SIlya Dryomov 
764ec7af972SIlya Dryomov 		ceph_decode_8_safe(p, end, struct_compat, e_inval);
765ec7af972SIlya Dryomov 		if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
766b9a67899SJoe Perches 			pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
767ec7af972SIlya Dryomov 				struct_v, struct_compat,
768ec7af972SIlya Dryomov 				OSDMAP_WRAPPER_COMPAT_VER, prefix);
769ec7af972SIlya Dryomov 			return -EINVAL;
770ec7af972SIlya Dryomov 		}
771ec7af972SIlya Dryomov 		*p += 4; /* ignore wrapper struct_len */
772ec7af972SIlya Dryomov 
773ec7af972SIlya Dryomov 		ceph_decode_8_safe(p, end, struct_v, e_inval);
774ec7af972SIlya Dryomov 		ceph_decode_8_safe(p, end, struct_compat, e_inval);
775ec7af972SIlya Dryomov 		if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
776b9a67899SJoe Perches 			pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
777ec7af972SIlya Dryomov 				struct_v, struct_compat,
778ec7af972SIlya Dryomov 				OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
779ec7af972SIlya Dryomov 			return -EINVAL;
780ec7af972SIlya Dryomov 		}
781ec7af972SIlya Dryomov 		*p += 4; /* ignore client data struct_len */
782ec7af972SIlya Dryomov 	} else {
783ec7af972SIlya Dryomov 		u16 version;
784ec7af972SIlya Dryomov 
785ec7af972SIlya Dryomov 		*p -= 1;
786ec7af972SIlya Dryomov 		ceph_decode_16_safe(p, end, version, e_inval);
787ec7af972SIlya Dryomov 		if (version < 6) {
788b9a67899SJoe Perches 			pr_warn("got v %d < 6 of %s ceph_osdmap\n",
789b9a67899SJoe Perches 				version, prefix);
790ec7af972SIlya Dryomov 			return -EINVAL;
791ec7af972SIlya Dryomov 		}
792ec7af972SIlya Dryomov 
793ec7af972SIlya Dryomov 		/* old osdmap enconding */
794ec7af972SIlya Dryomov 		struct_v = 0;
795ec7af972SIlya Dryomov 	}
796ec7af972SIlya Dryomov 
797ec7af972SIlya Dryomov 	*v = struct_v;
798ec7af972SIlya Dryomov 	return 0;
799ec7af972SIlya Dryomov 
800ec7af972SIlya Dryomov e_inval:
801ec7af972SIlya Dryomov 	return -EINVAL;
802ec7af972SIlya Dryomov }
803ec7af972SIlya Dryomov 
804433fbdd3SIlya Dryomov static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
805433fbdd3SIlya Dryomov 			  bool incremental)
806433fbdd3SIlya Dryomov {
807433fbdd3SIlya Dryomov 	u32 n;
808433fbdd3SIlya Dryomov 
809433fbdd3SIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
810433fbdd3SIlya Dryomov 	while (n--) {
811433fbdd3SIlya Dryomov 		struct ceph_pg_pool_info *pi;
812433fbdd3SIlya Dryomov 		u64 pool;
813433fbdd3SIlya Dryomov 		int ret;
814433fbdd3SIlya Dryomov 
815433fbdd3SIlya Dryomov 		ceph_decode_64_safe(p, end, pool, e_inval);
816433fbdd3SIlya Dryomov 
817433fbdd3SIlya Dryomov 		pi = __lookup_pg_pool(&map->pg_pools, pool);
818433fbdd3SIlya Dryomov 		if (!incremental || !pi) {
819433fbdd3SIlya Dryomov 			pi = kzalloc(sizeof(*pi), GFP_NOFS);
820433fbdd3SIlya Dryomov 			if (!pi)
821433fbdd3SIlya Dryomov 				return -ENOMEM;
822433fbdd3SIlya Dryomov 
823433fbdd3SIlya Dryomov 			pi->id = pool;
824433fbdd3SIlya Dryomov 
825433fbdd3SIlya Dryomov 			ret = __insert_pg_pool(&map->pg_pools, pi);
826433fbdd3SIlya Dryomov 			if (ret) {
827433fbdd3SIlya Dryomov 				kfree(pi);
828433fbdd3SIlya Dryomov 				return ret;
829433fbdd3SIlya Dryomov 			}
830433fbdd3SIlya Dryomov 		}
831433fbdd3SIlya Dryomov 
832433fbdd3SIlya Dryomov 		ret = decode_pool(p, end, pi);
833433fbdd3SIlya Dryomov 		if (ret)
834433fbdd3SIlya Dryomov 			return ret;
835433fbdd3SIlya Dryomov 	}
836433fbdd3SIlya Dryomov 
837433fbdd3SIlya Dryomov 	return 0;
838433fbdd3SIlya Dryomov 
839433fbdd3SIlya Dryomov e_inval:
840433fbdd3SIlya Dryomov 	return -EINVAL;
841433fbdd3SIlya Dryomov }
842433fbdd3SIlya Dryomov 
843433fbdd3SIlya Dryomov static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
844433fbdd3SIlya Dryomov {
845433fbdd3SIlya Dryomov 	return __decode_pools(p, end, map, false);
846433fbdd3SIlya Dryomov }
847433fbdd3SIlya Dryomov 
848433fbdd3SIlya Dryomov static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
849433fbdd3SIlya Dryomov {
850433fbdd3SIlya Dryomov 	return __decode_pools(p, end, map, true);
851433fbdd3SIlya Dryomov }
852433fbdd3SIlya Dryomov 
85310db634eSIlya Dryomov static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
85410db634eSIlya Dryomov 			    bool incremental)
85510db634eSIlya Dryomov {
85610db634eSIlya Dryomov 	u32 n;
85710db634eSIlya Dryomov 
85810db634eSIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
85910db634eSIlya Dryomov 	while (n--) {
86010db634eSIlya Dryomov 		struct ceph_pg pgid;
86110db634eSIlya Dryomov 		u32 len, i;
86210db634eSIlya Dryomov 		int ret;
86310db634eSIlya Dryomov 
86410db634eSIlya Dryomov 		ret = ceph_decode_pgid(p, end, &pgid);
86510db634eSIlya Dryomov 		if (ret)
86610db634eSIlya Dryomov 			return ret;
86710db634eSIlya Dryomov 
86810db634eSIlya Dryomov 		ceph_decode_32_safe(p, end, len, e_inval);
86910db634eSIlya Dryomov 
87010db634eSIlya Dryomov 		ret = __remove_pg_mapping(&map->pg_temp, pgid);
87110db634eSIlya Dryomov 		BUG_ON(!incremental && ret != -ENOENT);
87210db634eSIlya Dryomov 
87310db634eSIlya Dryomov 		if (!incremental || len > 0) {
87410db634eSIlya Dryomov 			struct ceph_pg_mapping *pg;
87510db634eSIlya Dryomov 
87610db634eSIlya Dryomov 			ceph_decode_need(p, end, len*sizeof(u32), e_inval);
87710db634eSIlya Dryomov 
87810db634eSIlya Dryomov 			if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
87910db634eSIlya Dryomov 				return -EINVAL;
88010db634eSIlya Dryomov 
88110db634eSIlya Dryomov 			pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
88210db634eSIlya Dryomov 			if (!pg)
88310db634eSIlya Dryomov 				return -ENOMEM;
88410db634eSIlya Dryomov 
88510db634eSIlya Dryomov 			pg->pgid = pgid;
88635a935d7SIlya Dryomov 			pg->pg_temp.len = len;
88710db634eSIlya Dryomov 			for (i = 0; i < len; i++)
88835a935d7SIlya Dryomov 				pg->pg_temp.osds[i] = ceph_decode_32(p);
88910db634eSIlya Dryomov 
89010db634eSIlya Dryomov 			ret = __insert_pg_mapping(pg, &map->pg_temp);
89110db634eSIlya Dryomov 			if (ret) {
89210db634eSIlya Dryomov 				kfree(pg);
89310db634eSIlya Dryomov 				return ret;
89410db634eSIlya Dryomov 			}
89510db634eSIlya Dryomov 		}
89610db634eSIlya Dryomov 	}
89710db634eSIlya Dryomov 
89810db634eSIlya Dryomov 	return 0;
89910db634eSIlya Dryomov 
90010db634eSIlya Dryomov e_inval:
90110db634eSIlya Dryomov 	return -EINVAL;
90210db634eSIlya Dryomov }
90310db634eSIlya Dryomov 
90410db634eSIlya Dryomov static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
90510db634eSIlya Dryomov {
90610db634eSIlya Dryomov 	return __decode_pg_temp(p, end, map, false);
90710db634eSIlya Dryomov }
90810db634eSIlya Dryomov 
90910db634eSIlya Dryomov static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
91010db634eSIlya Dryomov {
91110db634eSIlya Dryomov 	return __decode_pg_temp(p, end, map, true);
91210db634eSIlya Dryomov }
91310db634eSIlya Dryomov 
914d286de79SIlya Dryomov static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
915d286de79SIlya Dryomov 				 bool incremental)
916d286de79SIlya Dryomov {
917d286de79SIlya Dryomov 	u32 n;
918d286de79SIlya Dryomov 
919d286de79SIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
920d286de79SIlya Dryomov 	while (n--) {
921d286de79SIlya Dryomov 		struct ceph_pg pgid;
922d286de79SIlya Dryomov 		u32 osd;
923d286de79SIlya Dryomov 		int ret;
924d286de79SIlya Dryomov 
925d286de79SIlya Dryomov 		ret = ceph_decode_pgid(p, end, &pgid);
926d286de79SIlya Dryomov 		if (ret)
927d286de79SIlya Dryomov 			return ret;
928d286de79SIlya Dryomov 
929d286de79SIlya Dryomov 		ceph_decode_32_safe(p, end, osd, e_inval);
930d286de79SIlya Dryomov 
931d286de79SIlya Dryomov 		ret = __remove_pg_mapping(&map->primary_temp, pgid);
932d286de79SIlya Dryomov 		BUG_ON(!incremental && ret != -ENOENT);
933d286de79SIlya Dryomov 
934d286de79SIlya Dryomov 		if (!incremental || osd != (u32)-1) {
935d286de79SIlya Dryomov 			struct ceph_pg_mapping *pg;
936d286de79SIlya Dryomov 
937d286de79SIlya Dryomov 			pg = kzalloc(sizeof(*pg), GFP_NOFS);
938d286de79SIlya Dryomov 			if (!pg)
939d286de79SIlya Dryomov 				return -ENOMEM;
940d286de79SIlya Dryomov 
941d286de79SIlya Dryomov 			pg->pgid = pgid;
942d286de79SIlya Dryomov 			pg->primary_temp.osd = osd;
943d286de79SIlya Dryomov 
944d286de79SIlya Dryomov 			ret = __insert_pg_mapping(pg, &map->primary_temp);
945d286de79SIlya Dryomov 			if (ret) {
946d286de79SIlya Dryomov 				kfree(pg);
947d286de79SIlya Dryomov 				return ret;
948d286de79SIlya Dryomov 			}
949d286de79SIlya Dryomov 		}
950d286de79SIlya Dryomov 	}
951d286de79SIlya Dryomov 
952d286de79SIlya Dryomov 	return 0;
953d286de79SIlya Dryomov 
954d286de79SIlya Dryomov e_inval:
955d286de79SIlya Dryomov 	return -EINVAL;
956d286de79SIlya Dryomov }
957d286de79SIlya Dryomov 
958d286de79SIlya Dryomov static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
959d286de79SIlya Dryomov {
960d286de79SIlya Dryomov 	return __decode_primary_temp(p, end, map, false);
961d286de79SIlya Dryomov }
962d286de79SIlya Dryomov 
963d286de79SIlya Dryomov static int decode_new_primary_temp(void **p, void *end,
964d286de79SIlya Dryomov 				   struct ceph_osdmap *map)
965d286de79SIlya Dryomov {
966d286de79SIlya Dryomov 	return __decode_primary_temp(p, end, map, true);
967d286de79SIlya Dryomov }
968d286de79SIlya Dryomov 
9692cfa34f2SIlya Dryomov u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
9702cfa34f2SIlya Dryomov {
9712cfa34f2SIlya Dryomov 	BUG_ON(osd >= map->max_osd);
9722cfa34f2SIlya Dryomov 
9732cfa34f2SIlya Dryomov 	if (!map->osd_primary_affinity)
9742cfa34f2SIlya Dryomov 		return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
9752cfa34f2SIlya Dryomov 
9762cfa34f2SIlya Dryomov 	return map->osd_primary_affinity[osd];
9772cfa34f2SIlya Dryomov }
9782cfa34f2SIlya Dryomov 
9792cfa34f2SIlya Dryomov static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
9802cfa34f2SIlya Dryomov {
9812cfa34f2SIlya Dryomov 	BUG_ON(osd >= map->max_osd);
9822cfa34f2SIlya Dryomov 
9832cfa34f2SIlya Dryomov 	if (!map->osd_primary_affinity) {
9842cfa34f2SIlya Dryomov 		int i;
9852cfa34f2SIlya Dryomov 
9862cfa34f2SIlya Dryomov 		map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
9872cfa34f2SIlya Dryomov 						    GFP_NOFS);
9882cfa34f2SIlya Dryomov 		if (!map->osd_primary_affinity)
9892cfa34f2SIlya Dryomov 			return -ENOMEM;
9902cfa34f2SIlya Dryomov 
9912cfa34f2SIlya Dryomov 		for (i = 0; i < map->max_osd; i++)
9922cfa34f2SIlya Dryomov 			map->osd_primary_affinity[i] =
9932cfa34f2SIlya Dryomov 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
9942cfa34f2SIlya Dryomov 	}
9952cfa34f2SIlya Dryomov 
9962cfa34f2SIlya Dryomov 	map->osd_primary_affinity[osd] = aff;
9972cfa34f2SIlya Dryomov 
9982cfa34f2SIlya Dryomov 	return 0;
9992cfa34f2SIlya Dryomov }
10002cfa34f2SIlya Dryomov 
100163a6993fSIlya Dryomov static int decode_primary_affinity(void **p, void *end,
100263a6993fSIlya Dryomov 				   struct ceph_osdmap *map)
100363a6993fSIlya Dryomov {
100463a6993fSIlya Dryomov 	u32 len, i;
100563a6993fSIlya Dryomov 
100663a6993fSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
100763a6993fSIlya Dryomov 	if (len == 0) {
100863a6993fSIlya Dryomov 		kfree(map->osd_primary_affinity);
100963a6993fSIlya Dryomov 		map->osd_primary_affinity = NULL;
101063a6993fSIlya Dryomov 		return 0;
101163a6993fSIlya Dryomov 	}
101263a6993fSIlya Dryomov 	if (len != map->max_osd)
101363a6993fSIlya Dryomov 		goto e_inval;
101463a6993fSIlya Dryomov 
101563a6993fSIlya Dryomov 	ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
101663a6993fSIlya Dryomov 
101763a6993fSIlya Dryomov 	for (i = 0; i < map->max_osd; i++) {
101863a6993fSIlya Dryomov 		int ret;
101963a6993fSIlya Dryomov 
102063a6993fSIlya Dryomov 		ret = set_primary_affinity(map, i, ceph_decode_32(p));
102163a6993fSIlya Dryomov 		if (ret)
102263a6993fSIlya Dryomov 			return ret;
102363a6993fSIlya Dryomov 	}
102463a6993fSIlya Dryomov 
102563a6993fSIlya Dryomov 	return 0;
102663a6993fSIlya Dryomov 
102763a6993fSIlya Dryomov e_inval:
102863a6993fSIlya Dryomov 	return -EINVAL;
102963a6993fSIlya Dryomov }
103063a6993fSIlya Dryomov 
103163a6993fSIlya Dryomov static int decode_new_primary_affinity(void **p, void *end,
103263a6993fSIlya Dryomov 				       struct ceph_osdmap *map)
103363a6993fSIlya Dryomov {
103463a6993fSIlya Dryomov 	u32 n;
103563a6993fSIlya Dryomov 
103663a6993fSIlya Dryomov 	ceph_decode_32_safe(p, end, n, e_inval);
103763a6993fSIlya Dryomov 	while (n--) {
103863a6993fSIlya Dryomov 		u32 osd, aff;
103963a6993fSIlya Dryomov 		int ret;
104063a6993fSIlya Dryomov 
104163a6993fSIlya Dryomov 		ceph_decode_32_safe(p, end, osd, e_inval);
104263a6993fSIlya Dryomov 		ceph_decode_32_safe(p, end, aff, e_inval);
104363a6993fSIlya Dryomov 
104463a6993fSIlya Dryomov 		ret = set_primary_affinity(map, osd, aff);
104563a6993fSIlya Dryomov 		if (ret)
104663a6993fSIlya Dryomov 			return ret;
1047f31da0f3SIlya Dryomov 
1048f31da0f3SIlya Dryomov 		pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
104963a6993fSIlya Dryomov 	}
105063a6993fSIlya Dryomov 
105163a6993fSIlya Dryomov 	return 0;
105263a6993fSIlya Dryomov 
105363a6993fSIlya Dryomov e_inval:
105463a6993fSIlya Dryomov 	return -EINVAL;
105563a6993fSIlya Dryomov }
105663a6993fSIlya Dryomov 
10573d14c5d2SYehuda Sadeh /*
10583d14c5d2SYehuda Sadeh  * decode a full map.
10593d14c5d2SYehuda Sadeh  */
1060a2505d63SIlya Dryomov static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
10613d14c5d2SYehuda Sadeh {
1062ec7af972SIlya Dryomov 	u8 struct_v;
106338a8d560SIlya Dryomov 	u32 epoch = 0;
10643d14c5d2SYehuda Sadeh 	void *start = *p;
10653977058cSIlya Dryomov 	u32 max;
10663977058cSIlya Dryomov 	u32 len, i;
1067597b52f6SIlya Dryomov 	int err;
10683d14c5d2SYehuda Sadeh 
1069a2505d63SIlya Dryomov 	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
10703d14c5d2SYehuda Sadeh 
1071ec7af972SIlya Dryomov 	err = get_osdmap_client_data_v(p, end, "full", &struct_v);
1072ec7af972SIlya Dryomov 	if (err)
1073ec7af972SIlya Dryomov 		goto bad;
10743d14c5d2SYehuda Sadeh 
107553bbaba9SIlya Dryomov 	/* fsid, epoch, created, modified */
107653bbaba9SIlya Dryomov 	ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
107753bbaba9SIlya Dryomov 			 sizeof(map->created) + sizeof(map->modified), e_inval);
10783d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
107938a8d560SIlya Dryomov 	epoch = map->epoch = ceph_decode_32(p);
10803d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &map->created, sizeof(map->created));
10813d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
10823d14c5d2SYehuda Sadeh 
1083433fbdd3SIlya Dryomov 	/* pools */
1084433fbdd3SIlya Dryomov 	err = decode_pools(p, end, map);
1085433fbdd3SIlya Dryomov 	if (err)
10863d14c5d2SYehuda Sadeh 		goto bad;
10873d14c5d2SYehuda Sadeh 
10880f70c7eeSIlya Dryomov 	/* pool_name */
10890f70c7eeSIlya Dryomov 	err = decode_pool_names(p, end, map);
1090597b52f6SIlya Dryomov 	if (err)
10913d14c5d2SYehuda Sadeh 		goto bad;
10923d14c5d2SYehuda Sadeh 
1093597b52f6SIlya Dryomov 	ceph_decode_32_safe(p, end, map->pool_max, e_inval);
10943d14c5d2SYehuda Sadeh 
1095597b52f6SIlya Dryomov 	ceph_decode_32_safe(p, end, map->flags, e_inval);
10963d14c5d2SYehuda Sadeh 
10973977058cSIlya Dryomov 	/* max_osd */
10983977058cSIlya Dryomov 	ceph_decode_32_safe(p, end, max, e_inval);
10993d14c5d2SYehuda Sadeh 
11003d14c5d2SYehuda Sadeh 	/* (re)alloc osd arrays */
11013d14c5d2SYehuda Sadeh 	err = osdmap_set_max_osd(map, max);
1102597b52f6SIlya Dryomov 	if (err)
11033d14c5d2SYehuda Sadeh 		goto bad;
11043d14c5d2SYehuda Sadeh 
11052d88b2e0SIlya Dryomov 	/* osd_state, osd_weight, osd_addrs->client_addr */
11063d14c5d2SYehuda Sadeh 	ceph_decode_need(p, end, 3*sizeof(u32) +
11073d14c5d2SYehuda Sadeh 			 map->max_osd*(1 + sizeof(*map->osd_weight) +
1108597b52f6SIlya Dryomov 				       sizeof(*map->osd_addr)), e_inval);
1109597b52f6SIlya Dryomov 
11102d88b2e0SIlya Dryomov 	if (ceph_decode_32(p) != map->max_osd)
11112d88b2e0SIlya Dryomov 		goto e_inval;
11122d88b2e0SIlya Dryomov 
11133d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, map->osd_state, map->max_osd);
11143d14c5d2SYehuda Sadeh 
11152d88b2e0SIlya Dryomov 	if (ceph_decode_32(p) != map->max_osd)
11162d88b2e0SIlya Dryomov 		goto e_inval;
11172d88b2e0SIlya Dryomov 
11183d14c5d2SYehuda Sadeh 	for (i = 0; i < map->max_osd; i++)
11193d14c5d2SYehuda Sadeh 		map->osd_weight[i] = ceph_decode_32(p);
11203d14c5d2SYehuda Sadeh 
11212d88b2e0SIlya Dryomov 	if (ceph_decode_32(p) != map->max_osd)
11222d88b2e0SIlya Dryomov 		goto e_inval;
11232d88b2e0SIlya Dryomov 
11243d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
11253d14c5d2SYehuda Sadeh 	for (i = 0; i < map->max_osd; i++)
11263d14c5d2SYehuda Sadeh 		ceph_decode_addr(&map->osd_addr[i]);
11273d14c5d2SYehuda Sadeh 
11283d14c5d2SYehuda Sadeh 	/* pg_temp */
112910db634eSIlya Dryomov 	err = decode_pg_temp(p, end, map);
1130d6c0dd6bSSage Weil 	if (err)
1131d6c0dd6bSSage Weil 		goto bad;
11323d14c5d2SYehuda Sadeh 
1133d286de79SIlya Dryomov 	/* primary_temp */
1134d286de79SIlya Dryomov 	if (struct_v >= 1) {
1135d286de79SIlya Dryomov 		err = decode_primary_temp(p, end, map);
1136d286de79SIlya Dryomov 		if (err)
1137d286de79SIlya Dryomov 			goto bad;
1138d286de79SIlya Dryomov 	}
1139d286de79SIlya Dryomov 
114063a6993fSIlya Dryomov 	/* primary_affinity */
114163a6993fSIlya Dryomov 	if (struct_v >= 2) {
114263a6993fSIlya Dryomov 		err = decode_primary_affinity(p, end, map);
114363a6993fSIlya Dryomov 		if (err)
114463a6993fSIlya Dryomov 			goto bad;
114563a6993fSIlya Dryomov 	} else {
114663a6993fSIlya Dryomov 		/* XXX can this happen? */
114763a6993fSIlya Dryomov 		kfree(map->osd_primary_affinity);
114863a6993fSIlya Dryomov 		map->osd_primary_affinity = NULL;
114963a6993fSIlya Dryomov 	}
115063a6993fSIlya Dryomov 
11513d14c5d2SYehuda Sadeh 	/* crush */
1152597b52f6SIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
11539902e682SIlya Dryomov 	map->crush = crush_decode(*p, min(*p + len, end));
11543d14c5d2SYehuda Sadeh 	if (IS_ERR(map->crush)) {
11553d14c5d2SYehuda Sadeh 		err = PTR_ERR(map->crush);
11563d14c5d2SYehuda Sadeh 		map->crush = NULL;
11573d14c5d2SYehuda Sadeh 		goto bad;
11583d14c5d2SYehuda Sadeh 	}
11599902e682SIlya Dryomov 	*p += len;
11603d14c5d2SYehuda Sadeh 
116138a8d560SIlya Dryomov 	/* ignore the rest */
11623d14c5d2SYehuda Sadeh 	*p = end;
11633d14c5d2SYehuda Sadeh 
116438a8d560SIlya Dryomov 	dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
1165a2505d63SIlya Dryomov 	return 0;
11663d14c5d2SYehuda Sadeh 
1167597b52f6SIlya Dryomov e_inval:
1168597b52f6SIlya Dryomov 	err = -EINVAL;
11693d14c5d2SYehuda Sadeh bad:
117038a8d560SIlya Dryomov 	pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
117138a8d560SIlya Dryomov 	       err, epoch, (int)(*p - start), *p, start, end);
117238a8d560SIlya Dryomov 	print_hex_dump(KERN_DEBUG, "osdmap: ",
117338a8d560SIlya Dryomov 		       DUMP_PREFIX_OFFSET, 16, 1,
117438a8d560SIlya Dryomov 		       start, end - start, true);
1175a2505d63SIlya Dryomov 	return err;
1176a2505d63SIlya Dryomov }
1177a2505d63SIlya Dryomov 
1178a2505d63SIlya Dryomov /*
1179a2505d63SIlya Dryomov  * Allocate and decode a full map.
1180a2505d63SIlya Dryomov  */
1181a2505d63SIlya Dryomov struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1182a2505d63SIlya Dryomov {
1183a2505d63SIlya Dryomov 	struct ceph_osdmap *map;
1184a2505d63SIlya Dryomov 	int ret;
1185a2505d63SIlya Dryomov 
1186a2505d63SIlya Dryomov 	map = kzalloc(sizeof(*map), GFP_NOFS);
1187a2505d63SIlya Dryomov 	if (!map)
1188a2505d63SIlya Dryomov 		return ERR_PTR(-ENOMEM);
1189a2505d63SIlya Dryomov 
1190a2505d63SIlya Dryomov 	map->pg_temp = RB_ROOT;
11919686f94cSIlya Dryomov 	map->primary_temp = RB_ROOT;
1192a2505d63SIlya Dryomov 	mutex_init(&map->crush_scratch_mutex);
1193a2505d63SIlya Dryomov 
1194a2505d63SIlya Dryomov 	ret = osdmap_decode(p, end, map);
1195a2505d63SIlya Dryomov 	if (ret) {
11963d14c5d2SYehuda Sadeh 		ceph_osdmap_destroy(map);
1197a2505d63SIlya Dryomov 		return ERR_PTR(ret);
1198a2505d63SIlya Dryomov 	}
1199a2505d63SIlya Dryomov 
1200a2505d63SIlya Dryomov 	return map;
12013d14c5d2SYehuda Sadeh }
12023d14c5d2SYehuda Sadeh 
12033d14c5d2SYehuda Sadeh /*
12043d14c5d2SYehuda Sadeh  * decode and apply an incremental map update.
12053d14c5d2SYehuda Sadeh  */
12063d14c5d2SYehuda Sadeh struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
12070c0a8de1SIlya Dryomov 					     struct ceph_osdmap *map)
12083d14c5d2SYehuda Sadeh {
12093d14c5d2SYehuda Sadeh 	struct crush_map *newcrush = NULL;
12103d14c5d2SYehuda Sadeh 	struct ceph_fsid fsid;
12113d14c5d2SYehuda Sadeh 	u32 epoch = 0;
12123d14c5d2SYehuda Sadeh 	struct ceph_timespec modified;
12134f6a7e5eSSage Weil 	s32 len;
12144f6a7e5eSSage Weil 	u64 pool;
12154f6a7e5eSSage Weil 	__s64 new_pool_max;
12164f6a7e5eSSage Weil 	__s32 new_flags, max;
12173d14c5d2SYehuda Sadeh 	void *start = *p;
121886f1742bSIlya Dryomov 	int err;
1219ec7af972SIlya Dryomov 	u8 struct_v;
12203d14c5d2SYehuda Sadeh 
122138a8d560SIlya Dryomov 	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
122238a8d560SIlya Dryomov 
1223ec7af972SIlya Dryomov 	err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
1224ec7af972SIlya Dryomov 	if (err)
1225ec7af972SIlya Dryomov 		goto bad;
12263d14c5d2SYehuda Sadeh 
122753bbaba9SIlya Dryomov 	/* fsid, epoch, modified, new_pool_max, new_flags */
122853bbaba9SIlya Dryomov 	ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
122953bbaba9SIlya Dryomov 			 sizeof(u64) + sizeof(u32), e_inval);
12303d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &fsid, sizeof(fsid));
12313d14c5d2SYehuda Sadeh 	epoch = ceph_decode_32(p);
12323d14c5d2SYehuda Sadeh 	BUG_ON(epoch != map->epoch+1);
12333d14c5d2SYehuda Sadeh 	ceph_decode_copy(p, &modified, sizeof(modified));
12344f6a7e5eSSage Weil 	new_pool_max = ceph_decode_64(p);
12353d14c5d2SYehuda Sadeh 	new_flags = ceph_decode_32(p);
12363d14c5d2SYehuda Sadeh 
12373d14c5d2SYehuda Sadeh 	/* full map? */
123886f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
12393d14c5d2SYehuda Sadeh 	if (len > 0) {
12403d14c5d2SYehuda Sadeh 		dout("apply_incremental full map len %d, %p to %p\n",
12413d14c5d2SYehuda Sadeh 		     len, *p, end);
1242a2505d63SIlya Dryomov 		return ceph_osdmap_decode(p, min(*p+len, end));
12433d14c5d2SYehuda Sadeh 	}
12443d14c5d2SYehuda Sadeh 
12453d14c5d2SYehuda Sadeh 	/* new crush? */
124686f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
12473d14c5d2SYehuda Sadeh 	if (len > 0) {
12483d14c5d2SYehuda Sadeh 		newcrush = crush_decode(*p, min(*p+len, end));
124986f1742bSIlya Dryomov 		if (IS_ERR(newcrush)) {
125086f1742bSIlya Dryomov 			err = PTR_ERR(newcrush);
125186f1742bSIlya Dryomov 			newcrush = NULL;
125286f1742bSIlya Dryomov 			goto bad;
125386f1742bSIlya Dryomov 		}
12543d14c5d2SYehuda Sadeh 		*p += len;
12553d14c5d2SYehuda Sadeh 	}
12563d14c5d2SYehuda Sadeh 
12573d14c5d2SYehuda Sadeh 	/* new flags? */
12583d14c5d2SYehuda Sadeh 	if (new_flags >= 0)
12593d14c5d2SYehuda Sadeh 		map->flags = new_flags;
12603d14c5d2SYehuda Sadeh 	if (new_pool_max >= 0)
12613d14c5d2SYehuda Sadeh 		map->pool_max = new_pool_max;
12623d14c5d2SYehuda Sadeh 
12633d14c5d2SYehuda Sadeh 	/* new max? */
126453bbaba9SIlya Dryomov 	ceph_decode_32_safe(p, end, max, e_inval);
12653d14c5d2SYehuda Sadeh 	if (max >= 0) {
12663d14c5d2SYehuda Sadeh 		err = osdmap_set_max_osd(map, max);
126786f1742bSIlya Dryomov 		if (err)
12683d14c5d2SYehuda Sadeh 			goto bad;
12693d14c5d2SYehuda Sadeh 	}
12703d14c5d2SYehuda Sadeh 
12713d14c5d2SYehuda Sadeh 	map->epoch++;
127231456665SSage Weil 	map->modified = modified;
12733d14c5d2SYehuda Sadeh 	if (newcrush) {
12743d14c5d2SYehuda Sadeh 		if (map->crush)
12753d14c5d2SYehuda Sadeh 			crush_destroy(map->crush);
12763d14c5d2SYehuda Sadeh 		map->crush = newcrush;
12773d14c5d2SYehuda Sadeh 		newcrush = NULL;
12783d14c5d2SYehuda Sadeh 	}
12793d14c5d2SYehuda Sadeh 
1280433fbdd3SIlya Dryomov 	/* new_pools */
1281433fbdd3SIlya Dryomov 	err = decode_new_pools(p, end, map);
1282433fbdd3SIlya Dryomov 	if (err)
12833d14c5d2SYehuda Sadeh 		goto bad;
12849464d008SIlya Dryomov 
12850f70c7eeSIlya Dryomov 	/* new_pool_names */
12860f70c7eeSIlya Dryomov 	err = decode_pool_names(p, end, map);
128786f1742bSIlya Dryomov 	if (err)
12883d14c5d2SYehuda Sadeh 		goto bad;
12893d14c5d2SYehuda Sadeh 
12903d14c5d2SYehuda Sadeh 	/* old_pool */
129186f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
12923d14c5d2SYehuda Sadeh 	while (len--) {
12933d14c5d2SYehuda Sadeh 		struct ceph_pg_pool_info *pi;
12943d14c5d2SYehuda Sadeh 
129586f1742bSIlya Dryomov 		ceph_decode_64_safe(p, end, pool, e_inval);
12963d14c5d2SYehuda Sadeh 		pi = __lookup_pg_pool(&map->pg_pools, pool);
12973d14c5d2SYehuda Sadeh 		if (pi)
12983d14c5d2SYehuda Sadeh 			__remove_pg_pool(&map->pg_pools, pi);
12993d14c5d2SYehuda Sadeh 	}
13003d14c5d2SYehuda Sadeh 
13013d14c5d2SYehuda Sadeh 	/* new_up */
130286f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
13033d14c5d2SYehuda Sadeh 	while (len--) {
13043d14c5d2SYehuda Sadeh 		u32 osd;
13053d14c5d2SYehuda Sadeh 		struct ceph_entity_addr addr;
130686f1742bSIlya Dryomov 		ceph_decode_32_safe(p, end, osd, e_inval);
130786f1742bSIlya Dryomov 		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
13083d14c5d2SYehuda Sadeh 		ceph_decode_addr(&addr);
13093d14c5d2SYehuda Sadeh 		pr_info("osd%d up\n", osd);
13103d14c5d2SYehuda Sadeh 		BUG_ON(osd >= map->max_osd);
13116dd74e44SYan, Zheng 		map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
13123d14c5d2SYehuda Sadeh 		map->osd_addr[osd] = addr;
13133d14c5d2SYehuda Sadeh 	}
13143d14c5d2SYehuda Sadeh 
13157662d8ffSSage Weil 	/* new_state */
131686f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
13173d14c5d2SYehuda Sadeh 	while (len--) {
13183d14c5d2SYehuda Sadeh 		u32 osd;
13197662d8ffSSage Weil 		u8 xorstate;
132086f1742bSIlya Dryomov 		ceph_decode_32_safe(p, end, osd, e_inval);
13217662d8ffSSage Weil 		xorstate = **(u8 **)p;
13223d14c5d2SYehuda Sadeh 		(*p)++;  /* clean flag */
13237662d8ffSSage Weil 		if (xorstate == 0)
13247662d8ffSSage Weil 			xorstate = CEPH_OSD_UP;
13257662d8ffSSage Weil 		if (xorstate & CEPH_OSD_UP)
13263d14c5d2SYehuda Sadeh 			pr_info("osd%d down\n", osd);
13273d14c5d2SYehuda Sadeh 		if (osd < map->max_osd)
13287662d8ffSSage Weil 			map->osd_state[osd] ^= xorstate;
13293d14c5d2SYehuda Sadeh 	}
13303d14c5d2SYehuda Sadeh 
13313d14c5d2SYehuda Sadeh 	/* new_weight */
133286f1742bSIlya Dryomov 	ceph_decode_32_safe(p, end, len, e_inval);
13333d14c5d2SYehuda Sadeh 	while (len--) {
13343d14c5d2SYehuda Sadeh 		u32 osd, off;
133586f1742bSIlya Dryomov 		ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
13363d14c5d2SYehuda Sadeh 		osd = ceph_decode_32(p);
13373d14c5d2SYehuda Sadeh 		off = ceph_decode_32(p);
13383d14c5d2SYehuda Sadeh 		pr_info("osd%d weight 0x%x %s\n", osd, off,
13393d14c5d2SYehuda Sadeh 		     off == CEPH_OSD_IN ? "(in)" :
13403d14c5d2SYehuda Sadeh 		     (off == CEPH_OSD_OUT ? "(out)" : ""));
13413d14c5d2SYehuda Sadeh 		if (osd < map->max_osd)
13423d14c5d2SYehuda Sadeh 			map->osd_weight[osd] = off;
13433d14c5d2SYehuda Sadeh 	}
13443d14c5d2SYehuda Sadeh 
13453d14c5d2SYehuda Sadeh 	/* new_pg_temp */
134610db634eSIlya Dryomov 	err = decode_new_pg_temp(p, end, map);
1347d6c0dd6bSSage Weil 	if (err)
1348d6c0dd6bSSage Weil 		goto bad;
13493d14c5d2SYehuda Sadeh 
1350d286de79SIlya Dryomov 	/* new_primary_temp */
1351d286de79SIlya Dryomov 	if (struct_v >= 1) {
1352d286de79SIlya Dryomov 		err = decode_new_primary_temp(p, end, map);
1353d286de79SIlya Dryomov 		if (err)
1354d286de79SIlya Dryomov 			goto bad;
1355d286de79SIlya Dryomov 	}
1356d286de79SIlya Dryomov 
135763a6993fSIlya Dryomov 	/* new_primary_affinity */
135863a6993fSIlya Dryomov 	if (struct_v >= 2) {
135963a6993fSIlya Dryomov 		err = decode_new_primary_affinity(p, end, map);
136063a6993fSIlya Dryomov 		if (err)
136163a6993fSIlya Dryomov 			goto bad;
136263a6993fSIlya Dryomov 	}
136363a6993fSIlya Dryomov 
13643d14c5d2SYehuda Sadeh 	/* ignore the rest */
13653d14c5d2SYehuda Sadeh 	*p = end;
136638a8d560SIlya Dryomov 
136738a8d560SIlya Dryomov 	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
13683d14c5d2SYehuda Sadeh 	return map;
13693d14c5d2SYehuda Sadeh 
137086f1742bSIlya Dryomov e_inval:
137186f1742bSIlya Dryomov 	err = -EINVAL;
13723d14c5d2SYehuda Sadeh bad:
137338a8d560SIlya Dryomov 	pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
137438a8d560SIlya Dryomov 	       err, epoch, (int)(*p - start), *p, start, end);
13753d14c5d2SYehuda Sadeh 	print_hex_dump(KERN_DEBUG, "osdmap: ",
13763d14c5d2SYehuda Sadeh 		       DUMP_PREFIX_OFFSET, 16, 1,
13773d14c5d2SYehuda Sadeh 		       start, end - start, true);
13783d14c5d2SYehuda Sadeh 	if (newcrush)
13793d14c5d2SYehuda Sadeh 		crush_destroy(newcrush);
13803d14c5d2SYehuda Sadeh 	return ERR_PTR(err);
13813d14c5d2SYehuda Sadeh }
13823d14c5d2SYehuda Sadeh 
1383d30291b9SIlya Dryomov void ceph_oid_copy(struct ceph_object_id *dest,
1384d30291b9SIlya Dryomov 		   const struct ceph_object_id *src)
1385d30291b9SIlya Dryomov {
1386d30291b9SIlya Dryomov 	WARN_ON(!ceph_oid_empty(dest));
13873d14c5d2SYehuda Sadeh 
1388d30291b9SIlya Dryomov 	if (src->name != src->inline_name) {
1389d30291b9SIlya Dryomov 		/* very rare, see ceph_object_id definition */
1390d30291b9SIlya Dryomov 		dest->name = kmalloc(src->name_len + 1,
1391d30291b9SIlya Dryomov 				     GFP_NOIO | __GFP_NOFAIL);
1392d30291b9SIlya Dryomov 	}
13933d14c5d2SYehuda Sadeh 
1394d30291b9SIlya Dryomov 	memcpy(dest->name, src->name, src->name_len + 1);
1395d30291b9SIlya Dryomov 	dest->name_len = src->name_len;
1396d30291b9SIlya Dryomov }
1397d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_copy);
1398d30291b9SIlya Dryomov 
1399d30291b9SIlya Dryomov static __printf(2, 0)
1400d30291b9SIlya Dryomov int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
1401d30291b9SIlya Dryomov {
1402d30291b9SIlya Dryomov 	int len;
1403d30291b9SIlya Dryomov 
1404d30291b9SIlya Dryomov 	WARN_ON(!ceph_oid_empty(oid));
1405d30291b9SIlya Dryomov 
1406d30291b9SIlya Dryomov 	len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
1407d30291b9SIlya Dryomov 	if (len >= sizeof(oid->inline_name))
1408d30291b9SIlya Dryomov 		return len;
1409d30291b9SIlya Dryomov 
1410d30291b9SIlya Dryomov 	oid->name_len = len;
1411d30291b9SIlya Dryomov 	return 0;
1412d30291b9SIlya Dryomov }
1413d30291b9SIlya Dryomov 
/*
 * Format a name into @oid's inline buffer.  A name that does not fit
 * into the inline buffer is a BUG.
 */
void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
{
	va_list args;
	int too_long;

	va_start(args, fmt);
	too_long = oid_printf_vargs(oid, fmt, args);
	BUG_ON(too_long);
	va_end(args);
}
EXPORT_SYMBOL(ceph_oid_printf);
1426d30291b9SIlya Dryomov 
1427d30291b9SIlya Dryomov static __printf(3, 0)
1428d30291b9SIlya Dryomov int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
1429d30291b9SIlya Dryomov 		      const char *fmt, va_list ap)
1430d30291b9SIlya Dryomov {
1431d30291b9SIlya Dryomov 	va_list aq;
1432d30291b9SIlya Dryomov 	int len;
1433d30291b9SIlya Dryomov 
1434d30291b9SIlya Dryomov 	va_copy(aq, ap);
1435d30291b9SIlya Dryomov 	len = oid_printf_vargs(oid, fmt, aq);
1436d30291b9SIlya Dryomov 	va_end(aq);
1437d30291b9SIlya Dryomov 
1438d30291b9SIlya Dryomov 	if (len) {
1439d30291b9SIlya Dryomov 		char *external_name;
1440d30291b9SIlya Dryomov 
1441d30291b9SIlya Dryomov 		external_name = kmalloc(len + 1, gfp);
1442d30291b9SIlya Dryomov 		if (!external_name)
1443d30291b9SIlya Dryomov 			return -ENOMEM;
1444d30291b9SIlya Dryomov 
1445d30291b9SIlya Dryomov 		oid->name = external_name;
1446d30291b9SIlya Dryomov 		WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
1447d30291b9SIlya Dryomov 		oid->name_len = len;
1448d30291b9SIlya Dryomov 	}
1449d30291b9SIlya Dryomov 
1450d30291b9SIlya Dryomov 	return 0;
1451d30291b9SIlya Dryomov }
1452d30291b9SIlya Dryomov 
1453d30291b9SIlya Dryomov /*
1454d30291b9SIlya Dryomov  * If oid doesn't fit into inline buffer, allocate.
1455d30291b9SIlya Dryomov  */
1456d30291b9SIlya Dryomov int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
1457d30291b9SIlya Dryomov 		     const char *fmt, ...)
1458d30291b9SIlya Dryomov {
1459d30291b9SIlya Dryomov 	va_list ap;
1460d30291b9SIlya Dryomov 	int ret;
1461d30291b9SIlya Dryomov 
1462d30291b9SIlya Dryomov 	va_start(ap, fmt);
1463d30291b9SIlya Dryomov 	ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
1464d30291b9SIlya Dryomov 	va_end(ap);
1465d30291b9SIlya Dryomov 
1466d30291b9SIlya Dryomov 	return ret;
1467d30291b9SIlya Dryomov }
1468d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_aprintf);
1469d30291b9SIlya Dryomov 
1470d30291b9SIlya Dryomov void ceph_oid_destroy(struct ceph_object_id *oid)
1471d30291b9SIlya Dryomov {
1472d30291b9SIlya Dryomov 	if (oid->name != oid->inline_name)
1473d30291b9SIlya Dryomov 		kfree(oid->name);
1474d30291b9SIlya Dryomov }
1475d30291b9SIlya Dryomov EXPORT_SYMBOL(ceph_oid_destroy);
14763d14c5d2SYehuda Sadeh 
14773d14c5d2SYehuda Sadeh /*
14783d14c5d2SYehuda Sadeh  * calculate file layout from given offset, length.
14793d14c5d2SYehuda Sadeh  * fill in correct oid, logical length, and object extent
14803d14c5d2SYehuda Sadeh  * offset, length.
14813d14c5d2SYehuda Sadeh  *
14823d14c5d2SYehuda Sadeh  * for now, we write only a single su, until we can
14833d14c5d2SYehuda Sadeh  * pass a stride back to the caller.
14843d14c5d2SYehuda Sadeh  */
1485d63b77f4SSage Weil int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1486e8afad65SAlex Elder 				   u64 off, u64 len,
14873d14c5d2SYehuda Sadeh 				   u64 *ono,
14883d14c5d2SYehuda Sadeh 				   u64 *oxoff, u64 *oxlen)
14893d14c5d2SYehuda Sadeh {
14903d14c5d2SYehuda Sadeh 	u32 osize = le32_to_cpu(layout->fl_object_size);
14913d14c5d2SYehuda Sadeh 	u32 su = le32_to_cpu(layout->fl_stripe_unit);
14923d14c5d2SYehuda Sadeh 	u32 sc = le32_to_cpu(layout->fl_stripe_count);
14933d14c5d2SYehuda Sadeh 	u32 bl, stripeno, stripepos, objsetno;
14943d14c5d2SYehuda Sadeh 	u32 su_per_object;
14953d14c5d2SYehuda Sadeh 	u64 t, su_offset;
14963d14c5d2SYehuda Sadeh 
1497e8afad65SAlex Elder 	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
14983d14c5d2SYehuda Sadeh 	     osize, su);
1499d63b77f4SSage Weil 	if (su == 0 || sc == 0)
1500d63b77f4SSage Weil 		goto invalid;
15013d14c5d2SYehuda Sadeh 	su_per_object = osize / su;
1502d63b77f4SSage Weil 	if (su_per_object == 0)
1503d63b77f4SSage Weil 		goto invalid;
15043d14c5d2SYehuda Sadeh 	dout("osize %u / su %u = su_per_object %u\n", osize, su,
15053d14c5d2SYehuda Sadeh 	     su_per_object);
15063d14c5d2SYehuda Sadeh 
1507d63b77f4SSage Weil 	if ((su & ~PAGE_MASK) != 0)
1508d63b77f4SSage Weil 		goto invalid;
1509d63b77f4SSage Weil 
15103d14c5d2SYehuda Sadeh 	/* bl = *off / su; */
15113d14c5d2SYehuda Sadeh 	t = off;
15123d14c5d2SYehuda Sadeh 	do_div(t, su);
15133d14c5d2SYehuda Sadeh 	bl = t;
15143d14c5d2SYehuda Sadeh 	dout("off %llu / su %u = bl %u\n", off, su, bl);
15153d14c5d2SYehuda Sadeh 
15163d14c5d2SYehuda Sadeh 	stripeno = bl / sc;
15173d14c5d2SYehuda Sadeh 	stripepos = bl % sc;
15183d14c5d2SYehuda Sadeh 	objsetno = stripeno / su_per_object;
15193d14c5d2SYehuda Sadeh 
15203d14c5d2SYehuda Sadeh 	*ono = objsetno * sc + stripepos;
152195c96174SEric Dumazet 	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);
15223d14c5d2SYehuda Sadeh 
15233d14c5d2SYehuda Sadeh 	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
15243d14c5d2SYehuda Sadeh 	t = off;
15253d14c5d2SYehuda Sadeh 	su_offset = do_div(t, su);
15263d14c5d2SYehuda Sadeh 	*oxoff = su_offset + (stripeno % su_per_object) * su;
15273d14c5d2SYehuda Sadeh 
15283d14c5d2SYehuda Sadeh 	/*
15293d14c5d2SYehuda Sadeh 	 * Calculate the length of the extent being written to the selected
1530e8afad65SAlex Elder 	 * object. This is the minimum of the full length requested (len) or
15313d14c5d2SYehuda Sadeh 	 * the remainder of the current stripe being written to.
15323d14c5d2SYehuda Sadeh 	 */
1533e8afad65SAlex Elder 	*oxlen = min_t(u64, len, su - su_offset);
15343d14c5d2SYehuda Sadeh 
15353d14c5d2SYehuda Sadeh 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
1536d63b77f4SSage Weil 	return 0;
1537d63b77f4SSage Weil 
1538d63b77f4SSage Weil invalid:
1539d63b77f4SSage Weil 	dout(" invalid layout\n");
1540d63b77f4SSage Weil 	*ono = 0;
1541d63b77f4SSage Weil 	*oxoff = 0;
1542d63b77f4SSage Weil 	*oxlen = 0;
1543d63b77f4SSage Weil 	return -EINVAL;
15443d14c5d2SYehuda Sadeh }
15453d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_calc_file_object_mapping);
15463d14c5d2SYehuda Sadeh 
15473d14c5d2SYehuda Sadeh /*
15487c13cb64SIlya Dryomov  * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
15497c13cb64SIlya Dryomov  * called with target's (oloc, oid), since tiering isn't taken into
15507c13cb64SIlya Dryomov  * account.
15513d14c5d2SYehuda Sadeh  */
15527c13cb64SIlya Dryomov int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
15537c13cb64SIlya Dryomov 			struct ceph_object_locator *oloc,
15547c13cb64SIlya Dryomov 			struct ceph_object_id *oid,
15557c13cb64SIlya Dryomov 			struct ceph_pg *pg_out)
15563d14c5d2SYehuda Sadeh {
15577c13cb64SIlya Dryomov 	struct ceph_pg_pool_info *pi;
15583d14c5d2SYehuda Sadeh 
15597c13cb64SIlya Dryomov 	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
15607c13cb64SIlya Dryomov 	if (!pi)
15613d14c5d2SYehuda Sadeh 		return -EIO;
15623d14c5d2SYehuda Sadeh 
15637c13cb64SIlya Dryomov 	pg_out->pool = oloc->pool;
15647c13cb64SIlya Dryomov 	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
15657c13cb64SIlya Dryomov 				     oid->name_len);
15667c13cb64SIlya Dryomov 
1567d30291b9SIlya Dryomov 	dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name,
15687c13cb64SIlya Dryomov 	     pg_out->pool, pg_out->seed);
15693d14c5d2SYehuda Sadeh 	return 0;
15703d14c5d2SYehuda Sadeh }
15717c13cb64SIlya Dryomov EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
15723d14c5d2SYehuda Sadeh 
15739d521470SIlya Dryomov static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1574e8ef19c4SIlya Dryomov 		    int *result, int result_max,
1575e8ef19c4SIlya Dryomov 		    const __u32 *weight, int weight_max)
1576e8ef19c4SIlya Dryomov {
15779d521470SIlya Dryomov 	int r;
1578e8ef19c4SIlya Dryomov 
15799d521470SIlya Dryomov 	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
15809d521470SIlya Dryomov 
15819d521470SIlya Dryomov 	mutex_lock(&map->crush_scratch_mutex);
15829d521470SIlya Dryomov 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
15839d521470SIlya Dryomov 			  weight, weight_max, map->crush_scratch_ary);
15849d521470SIlya Dryomov 	mutex_unlock(&map->crush_scratch_mutex);
15859d521470SIlya Dryomov 
15869d521470SIlya Dryomov 	return r;
1587e8ef19c4SIlya Dryomov }
1588e8ef19c4SIlya Dryomov 
15893d14c5d2SYehuda Sadeh /*
15902bd93d4dSIlya Dryomov  * Calculate raw (crush) set for given pgid.
15912bd93d4dSIlya Dryomov  *
15922bd93d4dSIlya Dryomov  * Return raw set length, or error.
15932bd93d4dSIlya Dryomov  */
15942bd93d4dSIlya Dryomov static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
15952bd93d4dSIlya Dryomov 			  struct ceph_pg_pool_info *pool,
15962bd93d4dSIlya Dryomov 			  struct ceph_pg pgid, u32 pps, int *osds)
15972bd93d4dSIlya Dryomov {
15982bd93d4dSIlya Dryomov 	int ruleno;
15992bd93d4dSIlya Dryomov 	int len;
16002bd93d4dSIlya Dryomov 
16012bd93d4dSIlya Dryomov 	/* crush */
16022bd93d4dSIlya Dryomov 	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
16032bd93d4dSIlya Dryomov 				 pool->type, pool->size);
16042bd93d4dSIlya Dryomov 	if (ruleno < 0) {
16052bd93d4dSIlya Dryomov 		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
16062bd93d4dSIlya Dryomov 		       pgid.pool, pool->crush_ruleset, pool->type,
16072bd93d4dSIlya Dryomov 		       pool->size);
16082bd93d4dSIlya Dryomov 		return -ENOENT;
16092bd93d4dSIlya Dryomov 	}
16102bd93d4dSIlya Dryomov 
16112bd93d4dSIlya Dryomov 	len = do_crush(osdmap, ruleno, pps, osds,
16122bd93d4dSIlya Dryomov 		       min_t(int, pool->size, CEPH_PG_MAX_SIZE),
16132bd93d4dSIlya Dryomov 		       osdmap->osd_weight, osdmap->max_osd);
16142bd93d4dSIlya Dryomov 	if (len < 0) {
16152bd93d4dSIlya Dryomov 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
16162bd93d4dSIlya Dryomov 		       len, ruleno, pgid.pool, pool->crush_ruleset,
16172bd93d4dSIlya Dryomov 		       pool->type, pool->size);
16182bd93d4dSIlya Dryomov 		return len;
16192bd93d4dSIlya Dryomov 	}
16202bd93d4dSIlya Dryomov 
16212bd93d4dSIlya Dryomov 	return len;
16222bd93d4dSIlya Dryomov }
16232bd93d4dSIlya Dryomov 
16242bd93d4dSIlya Dryomov /*
16252bd93d4dSIlya Dryomov  * Given raw set, calculate up set and up primary.
16262bd93d4dSIlya Dryomov  *
16272bd93d4dSIlya Dryomov  * Return up set length.  *primary is set to up primary osd id, or -1
16282bd93d4dSIlya Dryomov  * if up set is empty.
16292bd93d4dSIlya Dryomov  */
16302bd93d4dSIlya Dryomov static int raw_to_up_osds(struct ceph_osdmap *osdmap,
16312bd93d4dSIlya Dryomov 			  struct ceph_pg_pool_info *pool,
16322bd93d4dSIlya Dryomov 			  int *osds, int len, int *primary)
16332bd93d4dSIlya Dryomov {
16342bd93d4dSIlya Dryomov 	int up_primary = -1;
16352bd93d4dSIlya Dryomov 	int i;
16362bd93d4dSIlya Dryomov 
16372bd93d4dSIlya Dryomov 	if (ceph_can_shift_osds(pool)) {
16382bd93d4dSIlya Dryomov 		int removed = 0;
16392bd93d4dSIlya Dryomov 
16402bd93d4dSIlya Dryomov 		for (i = 0; i < len; i++) {
16412bd93d4dSIlya Dryomov 			if (ceph_osd_is_down(osdmap, osds[i])) {
16422bd93d4dSIlya Dryomov 				removed++;
16432bd93d4dSIlya Dryomov 				continue;
16442bd93d4dSIlya Dryomov 			}
16452bd93d4dSIlya Dryomov 			if (removed)
16462bd93d4dSIlya Dryomov 				osds[i - removed] = osds[i];
16472bd93d4dSIlya Dryomov 		}
16482bd93d4dSIlya Dryomov 
16492bd93d4dSIlya Dryomov 		len -= removed;
16502bd93d4dSIlya Dryomov 		if (len > 0)
16512bd93d4dSIlya Dryomov 			up_primary = osds[0];
16522bd93d4dSIlya Dryomov 	} else {
16532bd93d4dSIlya Dryomov 		for (i = len - 1; i >= 0; i--) {
16542bd93d4dSIlya Dryomov 			if (ceph_osd_is_down(osdmap, osds[i]))
16552bd93d4dSIlya Dryomov 				osds[i] = CRUSH_ITEM_NONE;
16562bd93d4dSIlya Dryomov 			else
16572bd93d4dSIlya Dryomov 				up_primary = osds[i];
16582bd93d4dSIlya Dryomov 		}
16592bd93d4dSIlya Dryomov 	}
16602bd93d4dSIlya Dryomov 
16612bd93d4dSIlya Dryomov 	*primary = up_primary;
16622bd93d4dSIlya Dryomov 	return len;
16632bd93d4dSIlya Dryomov }
16642bd93d4dSIlya Dryomov 
166547ec1f3cSIlya Dryomov static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
166647ec1f3cSIlya Dryomov 				   struct ceph_pg_pool_info *pool,
166747ec1f3cSIlya Dryomov 				   int *osds, int len, int *primary)
166847ec1f3cSIlya Dryomov {
166947ec1f3cSIlya Dryomov 	int i;
167047ec1f3cSIlya Dryomov 	int pos = -1;
167147ec1f3cSIlya Dryomov 
167247ec1f3cSIlya Dryomov 	/*
167347ec1f3cSIlya Dryomov 	 * Do we have any non-default primary_affinity values for these
167447ec1f3cSIlya Dryomov 	 * osds?
167547ec1f3cSIlya Dryomov 	 */
167647ec1f3cSIlya Dryomov 	if (!osdmap->osd_primary_affinity)
167747ec1f3cSIlya Dryomov 		return;
167847ec1f3cSIlya Dryomov 
167947ec1f3cSIlya Dryomov 	for (i = 0; i < len; i++) {
168092b2e751SIlya Dryomov 		int osd = osds[i];
168192b2e751SIlya Dryomov 
168292b2e751SIlya Dryomov 		if (osd != CRUSH_ITEM_NONE &&
168392b2e751SIlya Dryomov 		    osdmap->osd_primary_affinity[osd] !=
168447ec1f3cSIlya Dryomov 					CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
168547ec1f3cSIlya Dryomov 			break;
168647ec1f3cSIlya Dryomov 		}
168747ec1f3cSIlya Dryomov 	}
168847ec1f3cSIlya Dryomov 	if (i == len)
168947ec1f3cSIlya Dryomov 		return;
169047ec1f3cSIlya Dryomov 
169147ec1f3cSIlya Dryomov 	/*
169247ec1f3cSIlya Dryomov 	 * Pick the primary.  Feed both the seed (for the pg) and the
169347ec1f3cSIlya Dryomov 	 * osd into the hash/rng so that a proportional fraction of an
169447ec1f3cSIlya Dryomov 	 * osd's pgs get rejected as primary.
169547ec1f3cSIlya Dryomov 	 */
169647ec1f3cSIlya Dryomov 	for (i = 0; i < len; i++) {
169792b2e751SIlya Dryomov 		int osd = osds[i];
169847ec1f3cSIlya Dryomov 		u32 aff;
169947ec1f3cSIlya Dryomov 
170047ec1f3cSIlya Dryomov 		if (osd == CRUSH_ITEM_NONE)
170147ec1f3cSIlya Dryomov 			continue;
170247ec1f3cSIlya Dryomov 
170347ec1f3cSIlya Dryomov 		aff = osdmap->osd_primary_affinity[osd];
170447ec1f3cSIlya Dryomov 		if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
170547ec1f3cSIlya Dryomov 		    (crush_hash32_2(CRUSH_HASH_RJENKINS1,
170647ec1f3cSIlya Dryomov 				    pps, osd) >> 16) >= aff) {
170747ec1f3cSIlya Dryomov 			/*
170847ec1f3cSIlya Dryomov 			 * We chose not to use this primary.  Note it
170947ec1f3cSIlya Dryomov 			 * anyway as a fallback in case we don't pick
171047ec1f3cSIlya Dryomov 			 * anyone else, but keep looking.
171147ec1f3cSIlya Dryomov 			 */
171247ec1f3cSIlya Dryomov 			if (pos < 0)
171347ec1f3cSIlya Dryomov 				pos = i;
171447ec1f3cSIlya Dryomov 		} else {
171547ec1f3cSIlya Dryomov 			pos = i;
171647ec1f3cSIlya Dryomov 			break;
171747ec1f3cSIlya Dryomov 		}
171847ec1f3cSIlya Dryomov 	}
171947ec1f3cSIlya Dryomov 	if (pos < 0)
172047ec1f3cSIlya Dryomov 		return;
172147ec1f3cSIlya Dryomov 
172247ec1f3cSIlya Dryomov 	*primary = osds[pos];
172347ec1f3cSIlya Dryomov 
172447ec1f3cSIlya Dryomov 	if (ceph_can_shift_osds(pool) && pos > 0) {
172547ec1f3cSIlya Dryomov 		/* move the new primary to the front */
172647ec1f3cSIlya Dryomov 		for (i = pos; i > 0; i--)
172747ec1f3cSIlya Dryomov 			osds[i] = osds[i - 1];
172847ec1f3cSIlya Dryomov 		osds[0] = *primary;
172947ec1f3cSIlya Dryomov 	}
173047ec1f3cSIlya Dryomov }
173147ec1f3cSIlya Dryomov 
17322bd93d4dSIlya Dryomov /*
17335e8d4d36SIlya Dryomov  * Given up set, apply pg_temp and primary_temp mappings.
173445966c34SIlya Dryomov  *
173545966c34SIlya Dryomov  * Return acting set length.  *primary is set to acting primary osd id,
173645966c34SIlya Dryomov  * or -1 if acting set is empty.
173745966c34SIlya Dryomov  */
173845966c34SIlya Dryomov static int apply_temps(struct ceph_osdmap *osdmap,
173945966c34SIlya Dryomov 		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
174045966c34SIlya Dryomov 		       int *osds, int len, int *primary)
174145966c34SIlya Dryomov {
174245966c34SIlya Dryomov 	struct ceph_pg_mapping *pg;
174345966c34SIlya Dryomov 	int temp_len;
174445966c34SIlya Dryomov 	int temp_primary;
174545966c34SIlya Dryomov 	int i;
174645966c34SIlya Dryomov 
174745966c34SIlya Dryomov 	/* raw_pg -> pg */
174845966c34SIlya Dryomov 	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
174945966c34SIlya Dryomov 				    pool->pg_num_mask);
175045966c34SIlya Dryomov 
175145966c34SIlya Dryomov 	/* pg_temp? */
175245966c34SIlya Dryomov 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
175345966c34SIlya Dryomov 	if (pg) {
175445966c34SIlya Dryomov 		temp_len = 0;
175545966c34SIlya Dryomov 		temp_primary = -1;
175645966c34SIlya Dryomov 
175745966c34SIlya Dryomov 		for (i = 0; i < pg->pg_temp.len; i++) {
175845966c34SIlya Dryomov 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
175945966c34SIlya Dryomov 				if (ceph_can_shift_osds(pool))
176045966c34SIlya Dryomov 					continue;
176145966c34SIlya Dryomov 				else
176245966c34SIlya Dryomov 					osds[temp_len++] = CRUSH_ITEM_NONE;
176345966c34SIlya Dryomov 			} else {
176445966c34SIlya Dryomov 				osds[temp_len++] = pg->pg_temp.osds[i];
176545966c34SIlya Dryomov 			}
176645966c34SIlya Dryomov 		}
176745966c34SIlya Dryomov 
176845966c34SIlya Dryomov 		/* apply pg_temp's primary */
176945966c34SIlya Dryomov 		for (i = 0; i < temp_len; i++) {
177045966c34SIlya Dryomov 			if (osds[i] != CRUSH_ITEM_NONE) {
177145966c34SIlya Dryomov 				temp_primary = osds[i];
177245966c34SIlya Dryomov 				break;
177345966c34SIlya Dryomov 			}
177445966c34SIlya Dryomov 		}
177545966c34SIlya Dryomov 	} else {
177645966c34SIlya Dryomov 		temp_len = len;
177745966c34SIlya Dryomov 		temp_primary = *primary;
177845966c34SIlya Dryomov 	}
177945966c34SIlya Dryomov 
17805e8d4d36SIlya Dryomov 	/* primary_temp? */
17815e8d4d36SIlya Dryomov 	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
17825e8d4d36SIlya Dryomov 	if (pg)
17835e8d4d36SIlya Dryomov 		temp_primary = pg->primary_temp.osd;
17845e8d4d36SIlya Dryomov 
178545966c34SIlya Dryomov 	*primary = temp_primary;
178645966c34SIlya Dryomov 	return temp_len;
178745966c34SIlya Dryomov }
178845966c34SIlya Dryomov 
178945966c34SIlya Dryomov /*
1790ac972230SIlya Dryomov  * Calculate acting set for given pgid.
1791ac972230SIlya Dryomov  *
17928008ab10SIlya Dryomov  * Return acting set length, or error.  *primary is set to acting
17938008ab10SIlya Dryomov  * primary osd id, or -1 if acting set is empty or on error.
17943d14c5d2SYehuda Sadeh  */
17955b191d99SSage Weil int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
17968008ab10SIlya Dryomov 			int *osds, int *primary)
17973d14c5d2SYehuda Sadeh {
1798ac972230SIlya Dryomov 	struct ceph_pg_pool_info *pool;
1799ac972230SIlya Dryomov 	u32 pps;
1800ac972230SIlya Dryomov 	int len;
18013d14c5d2SYehuda Sadeh 
1802ac972230SIlya Dryomov 	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
18038008ab10SIlya Dryomov 	if (!pool) {
18048008ab10SIlya Dryomov 		*primary = -1;
18058008ab10SIlya Dryomov 		return -ENOENT;
18068008ab10SIlya Dryomov 	}
18073d14c5d2SYehuda Sadeh 
1808ac972230SIlya Dryomov 	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1809ac972230SIlya Dryomov 		/* hash pool id and seed so that pool PGs do not overlap */
1810ac972230SIlya Dryomov 		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1811ac972230SIlya Dryomov 				     ceph_stable_mod(pgid.seed, pool->pgp_num,
1812ac972230SIlya Dryomov 						     pool->pgp_num_mask),
1813ac972230SIlya Dryomov 				     pgid.pool);
1814ac972230SIlya Dryomov 	} else {
1815ac972230SIlya Dryomov 		/*
1816ac972230SIlya Dryomov 		 * legacy behavior: add ps and pool together.  this is
1817ac972230SIlya Dryomov 		 * not a great approach because the PGs from each pool
1818ac972230SIlya Dryomov 		 * will overlap on top of each other: 0.5 == 1.4 ==
1819ac972230SIlya Dryomov 		 * 2.3 == ...
1820ac972230SIlya Dryomov 		 */
1821ac972230SIlya Dryomov 		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1822ac972230SIlya Dryomov 				      pool->pgp_num_mask) +
1823ac972230SIlya Dryomov 			(unsigned)pgid.pool;
1824ac972230SIlya Dryomov 	}
1825ac972230SIlya Dryomov 
1826ac972230SIlya Dryomov 	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
18278008ab10SIlya Dryomov 	if (len < 0) {
18288008ab10SIlya Dryomov 		*primary = -1;
1829ac972230SIlya Dryomov 		return len;
18308008ab10SIlya Dryomov 	}
1831ac972230SIlya Dryomov 
18328008ab10SIlya Dryomov 	len = raw_to_up_osds(osdmap, pool, osds, len, primary);
1833ac972230SIlya Dryomov 
183447ec1f3cSIlya Dryomov 	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
183547ec1f3cSIlya Dryomov 
18368008ab10SIlya Dryomov 	len = apply_temps(osdmap, pool, pgid, osds, len, primary);
1837ac972230SIlya Dryomov 
1838ac972230SIlya Dryomov 	return len;
18393d14c5d2SYehuda Sadeh }
18403d14c5d2SYehuda Sadeh 
18413d14c5d2SYehuda Sadeh /*
18423d14c5d2SYehuda Sadeh  * Return primary osd for given pgid, or -1 if none.
18433d14c5d2SYehuda Sadeh  */
18445b191d99SSage Weil int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
18453d14c5d2SYehuda Sadeh {
1846c4c12285SIlya Dryomov 	int osds[CEPH_PG_MAX_SIZE];
1847c4c12285SIlya Dryomov 	int primary;
18483d14c5d2SYehuda Sadeh 
1849c4c12285SIlya Dryomov 	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
18503d14c5d2SYehuda Sadeh 
1851c4c12285SIlya Dryomov 	return primary;
18523d14c5d2SYehuda Sadeh }
18533d14c5d2SYehuda Sadeh EXPORT_SYMBOL(ceph_calc_pg_primary);
1854