xref: /openbmc/linux/net/ceph/osdmap.c (revision 8365a898)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/ceph/ceph_debug.h>
4 
5 #include <linux/module.h>
6 #include <linux/slab.h>
7 
8 #include <linux/ceph/libceph.h>
9 #include <linux/ceph/osdmap.h>
10 #include <linux/ceph/decode.h>
11 #include <linux/crush/hash.h>
12 #include <linux/crush/mapper.h>
13 
14 char *ceph_osdmap_state_str(char *str, int len, u32 state)
15 {
16 	if (!len)
17 		return str;
18 
19 	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
20 		snprintf(str, len, "exists, up");
21 	else if (state & CEPH_OSD_EXISTS)
22 		snprintf(str, len, "exists");
23 	else if (state & CEPH_OSD_UP)
24 		snprintf(str, len, "up");
25 	else
26 		snprintf(str, len, "doesn't exist");
27 
28 	return str;
29 }
30 
31 /* maps */
32 
/*
 * Number of bits needed to represent @t, i.e. the 1-based position of
 * its most significant set bit.  Returns 0 for @t == 0.
 */
static int calc_bits_of(unsigned int t)
{
	int nbits;

	for (nbits = 0; t != 0; t >>= 1)
		nbits++;

	return nbits;
}
42 
43 /*
44  * the foo_mask is the smallest value 2^n-1 that is >= foo.
45  */
46 static void calc_pg_masks(struct ceph_pg_pool_info *pi)
47 {
48 	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
49 	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
50 }
51 
52 /*
53  * decode crush map
54  */
55 static int crush_decode_uniform_bucket(void **p, void *end,
56 				       struct crush_bucket_uniform *b)
57 {
58 	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
59 	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
60 	b->item_weight = ceph_decode_32(p);
61 	return 0;
62 bad:
63 	return -EINVAL;
64 }
65 
66 static int crush_decode_list_bucket(void **p, void *end,
67 				    struct crush_bucket_list *b)
68 {
69 	int j;
70 	dout("crush_decode_list_bucket %p to %p\n", *p, end);
71 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
72 	if (b->item_weights == NULL)
73 		return -ENOMEM;
74 	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
75 	if (b->sum_weights == NULL)
76 		return -ENOMEM;
77 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
78 	for (j = 0; j < b->h.size; j++) {
79 		b->item_weights[j] = ceph_decode_32(p);
80 		b->sum_weights[j] = ceph_decode_32(p);
81 	}
82 	return 0;
83 bad:
84 	return -EINVAL;
85 }
86 
87 static int crush_decode_tree_bucket(void **p, void *end,
88 				    struct crush_bucket_tree *b)
89 {
90 	int j;
91 	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
92 	ceph_decode_8_safe(p, end, b->num_nodes, bad);
93 	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
94 	if (b->node_weights == NULL)
95 		return -ENOMEM;
96 	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
97 	for (j = 0; j < b->num_nodes; j++)
98 		b->node_weights[j] = ceph_decode_32(p);
99 	return 0;
100 bad:
101 	return -EINVAL;
102 }
103 
104 static int crush_decode_straw_bucket(void **p, void *end,
105 				     struct crush_bucket_straw *b)
106 {
107 	int j;
108 	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
109 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
110 	if (b->item_weights == NULL)
111 		return -ENOMEM;
112 	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
113 	if (b->straws == NULL)
114 		return -ENOMEM;
115 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
116 	for (j = 0; j < b->h.size; j++) {
117 		b->item_weights[j] = ceph_decode_32(p);
118 		b->straws[j] = ceph_decode_32(p);
119 	}
120 	return 0;
121 bad:
122 	return -EINVAL;
123 }
124 
125 static int crush_decode_straw2_bucket(void **p, void *end,
126 				      struct crush_bucket_straw2 *b)
127 {
128 	int j;
129 	dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
130 	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
131 	if (b->item_weights == NULL)
132 		return -ENOMEM;
133 	ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
134 	for (j = 0; j < b->h.size; j++)
135 		b->item_weights[j] = ceph_decode_32(p);
136 	return 0;
137 bad:
138 	return -EINVAL;
139 }
140 
/*
 * One id -> name entry of a crush name table, stored in an rbtree.
 * The node and its name are a single allocation (see
 * alloc_crush_name()).
 */
struct crush_name_node {
	struct rb_node cn_node;	/* linkage into the name rbtree */
	int cn_id;		/* lookup key */
	char cn_name[];		/* NUL-terminated name, allocated inline */
};
146 
147 static struct crush_name_node *alloc_crush_name(size_t name_len)
148 {
149 	struct crush_name_node *cn;
150 
151 	cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
152 	if (!cn)
153 		return NULL;
154 
155 	RB_CLEAR_NODE(&cn->cn_node);
156 	return cn;
157 }
158 
159 static void free_crush_name(struct crush_name_node *cn)
160 {
161 	WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
162 
163 	kfree(cn);
164 }
165 
/*
 * Generate the rbtree helpers for crush_name_node, keyed by cn_id and
 * linked through cn_node.  This file uses __insert_crush_name() and
 * erase_crush_name() from the generated set.
 */
DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
167 
168 static int decode_crush_names(void **p, void *end, struct rb_root *root)
169 {
170 	u32 n;
171 
172 	ceph_decode_32_safe(p, end, n, e_inval);
173 	while (n--) {
174 		struct crush_name_node *cn;
175 		int id;
176 		u32 name_len;
177 
178 		ceph_decode_32_safe(p, end, id, e_inval);
179 		ceph_decode_32_safe(p, end, name_len, e_inval);
180 		ceph_decode_need(p, end, name_len, e_inval);
181 
182 		cn = alloc_crush_name(name_len);
183 		if (!cn)
184 			return -ENOMEM;
185 
186 		cn->cn_id = id;
187 		memcpy(cn->cn_name, *p, name_len);
188 		cn->cn_name[name_len] = '\0';
189 		*p += name_len;
190 
191 		if (!__insert_crush_name(root, cn)) {
192 			free_crush_name(cn);
193 			return -EEXIST;
194 		}
195 	}
196 
197 	return 0;
198 
199 e_inval:
200 	return -EINVAL;
201 }
202 
203 void clear_crush_names(struct rb_root *root)
204 {
205 	while (!RB_EMPTY_ROOT(root)) {
206 		struct crush_name_node *cn =
207 		    rb_entry(rb_first(root), struct crush_name_node, cn_node);
208 
209 		erase_crush_name(root, cn);
210 		free_crush_name(cn);
211 	}
212 }
213 
214 static struct crush_choose_arg_map *alloc_choose_arg_map(void)
215 {
216 	struct crush_choose_arg_map *arg_map;
217 
218 	arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
219 	if (!arg_map)
220 		return NULL;
221 
222 	RB_CLEAR_NODE(&arg_map->node);
223 	return arg_map;
224 }
225 
226 static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
227 {
228 	if (arg_map) {
229 		int i, j;
230 
231 		WARN_ON(!RB_EMPTY_NODE(&arg_map->node));
232 
233 		for (i = 0; i < arg_map->size; i++) {
234 			struct crush_choose_arg *arg = &arg_map->args[i];
235 
236 			for (j = 0; j < arg->weight_set_size; j++)
237 				kfree(arg->weight_set[j].weights);
238 			kfree(arg->weight_set);
239 			kfree(arg->ids);
240 		}
241 		kfree(arg_map->args);
242 		kfree(arg_map);
243 	}
244 }
245 
/*
 * Generate the rbtree helpers for crush_choose_arg_map, keyed by
 * choose_args_index and linked through node.  This file uses
 * insert_choose_arg_map() and erase_choose_arg_map() from the
 * generated set.
 */
DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
		node);
248 
249 void clear_choose_args(struct crush_map *c)
250 {
251 	while (!RB_EMPTY_ROOT(&c->choose_args)) {
252 		struct crush_choose_arg_map *arg_map =
253 		    rb_entry(rb_first(&c->choose_args),
254 			     struct crush_choose_arg_map, node);
255 
256 		erase_choose_arg_map(&c->choose_args, arg_map);
257 		free_choose_arg_map(arg_map);
258 	}
259 }
260 
261 static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
262 {
263 	u32 *a = NULL;
264 	u32 len;
265 	int ret;
266 
267 	ceph_decode_32_safe(p, end, len, e_inval);
268 	if (len) {
269 		u32 i;
270 
271 		a = kmalloc_array(len, sizeof(u32), GFP_NOIO);
272 		if (!a) {
273 			ret = -ENOMEM;
274 			goto fail;
275 		}
276 
277 		ceph_decode_need(p, end, len * sizeof(u32), e_inval);
278 		for (i = 0; i < len; i++)
279 			a[i] = ceph_decode_32(p);
280 	}
281 
282 	*plen = len;
283 	return a;
284 
285 e_inval:
286 	ret = -EINVAL;
287 fail:
288 	kfree(a);
289 	return ERR_PTR(ret);
290 }
291 
292 /*
293  * Assumes @arg is zero-initialized.
294  */
295 static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
296 {
297 	int ret;
298 
299 	ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
300 	if (arg->weight_set_size) {
301 		u32 i;
302 
303 		arg->weight_set = kmalloc_array(arg->weight_set_size,
304 						sizeof(*arg->weight_set),
305 						GFP_NOIO);
306 		if (!arg->weight_set)
307 			return -ENOMEM;
308 
309 		for (i = 0; i < arg->weight_set_size; i++) {
310 			struct crush_weight_set *w = &arg->weight_set[i];
311 
312 			w->weights = decode_array_32_alloc(p, end, &w->size);
313 			if (IS_ERR(w->weights)) {
314 				ret = PTR_ERR(w->weights);
315 				w->weights = NULL;
316 				return ret;
317 			}
318 		}
319 	}
320 
321 	arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
322 	if (IS_ERR(arg->ids)) {
323 		ret = PTR_ERR(arg->ids);
324 		arg->ids = NULL;
325 		return ret;
326 	}
327 
328 	return 0;
329 
330 e_inval:
331 	return -EINVAL;
332 }
333 
334 static int decode_choose_args(void **p, void *end, struct crush_map *c)
335 {
336 	struct crush_choose_arg_map *arg_map = NULL;
337 	u32 num_choose_arg_maps, num_buckets;
338 	int ret;
339 
340 	ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
341 	while (num_choose_arg_maps--) {
342 		arg_map = alloc_choose_arg_map();
343 		if (!arg_map) {
344 			ret = -ENOMEM;
345 			goto fail;
346 		}
347 
348 		ceph_decode_64_safe(p, end, arg_map->choose_args_index,
349 				    e_inval);
350 		arg_map->size = c->max_buckets;
351 		arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
352 					GFP_NOIO);
353 		if (!arg_map->args) {
354 			ret = -ENOMEM;
355 			goto fail;
356 		}
357 
358 		ceph_decode_32_safe(p, end, num_buckets, e_inval);
359 		while (num_buckets--) {
360 			struct crush_choose_arg *arg;
361 			u32 bucket_index;
362 
363 			ceph_decode_32_safe(p, end, bucket_index, e_inval);
364 			if (bucket_index >= arg_map->size)
365 				goto e_inval;
366 
367 			arg = &arg_map->args[bucket_index];
368 			ret = decode_choose_arg(p, end, arg);
369 			if (ret)
370 				goto fail;
371 
372 			if (arg->ids_size &&
373 			    arg->ids_size != c->buckets[bucket_index]->size)
374 				goto e_inval;
375 		}
376 
377 		insert_choose_arg_map(&c->choose_args, arg_map);
378 	}
379 
380 	return 0;
381 
382 e_inval:
383 	ret = -EINVAL;
384 fail:
385 	free_choose_arg_map(arg_map);
386 	return ret;
387 }
388 
389 static void crush_finalize(struct crush_map *c)
390 {
391 	__s32 b;
392 
393 	/* Space for the array of pointers to per-bucket workspace */
394 	c->working_size = sizeof(struct crush_work) +
395 	    c->max_buckets * sizeof(struct crush_work_bucket *);
396 
397 	for (b = 0; b < c->max_buckets; b++) {
398 		if (!c->buckets[b])
399 			continue;
400 
401 		switch (c->buckets[b]->alg) {
402 		default:
403 			/*
404 			 * The base case, permutation variables and
405 			 * the pointer to the permutation array.
406 			 */
407 			c->working_size += sizeof(struct crush_work_bucket);
408 			break;
409 		}
410 		/* Every bucket has a permutation array. */
411 		c->working_size += c->buckets[b]->size * sizeof(__u32);
412 	}
413 }
414 
415 static struct crush_map *crush_decode(void *pbyval, void *end)
416 {
417 	struct crush_map *c;
418 	int err;
419 	int i, j;
420 	void **p = &pbyval;
421 	void *start = pbyval;
422 	u32 magic;
423 
424 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
425 
426 	c = kzalloc(sizeof(*c), GFP_NOFS);
427 	if (c == NULL)
428 		return ERR_PTR(-ENOMEM);
429 
430 	c->type_names = RB_ROOT;
431 	c->names = RB_ROOT;
432 	c->choose_args = RB_ROOT;
433 
434         /* set tunables to default values */
435         c->choose_local_tries = 2;
436         c->choose_local_fallback_tries = 5;
437         c->choose_total_tries = 19;
438 	c->chooseleaf_descend_once = 0;
439 
440 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
441 	magic = ceph_decode_32(p);
442 	if (magic != CRUSH_MAGIC) {
443 		pr_err("crush_decode magic %x != current %x\n",
444 		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
445 		goto bad;
446 	}
447 	c->max_buckets = ceph_decode_32(p);
448 	c->max_rules = ceph_decode_32(p);
449 	c->max_devices = ceph_decode_32(p);
450 
451 	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
452 	if (c->buckets == NULL)
453 		goto badmem;
454 	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
455 	if (c->rules == NULL)
456 		goto badmem;
457 
458 	/* buckets */
459 	for (i = 0; i < c->max_buckets; i++) {
460 		int size = 0;
461 		u32 alg;
462 		struct crush_bucket *b;
463 
464 		ceph_decode_32_safe(p, end, alg, bad);
465 		if (alg == 0) {
466 			c->buckets[i] = NULL;
467 			continue;
468 		}
469 		dout("crush_decode bucket %d off %x %p to %p\n",
470 		     i, (int)(*p-start), *p, end);
471 
472 		switch (alg) {
473 		case CRUSH_BUCKET_UNIFORM:
474 			size = sizeof(struct crush_bucket_uniform);
475 			break;
476 		case CRUSH_BUCKET_LIST:
477 			size = sizeof(struct crush_bucket_list);
478 			break;
479 		case CRUSH_BUCKET_TREE:
480 			size = sizeof(struct crush_bucket_tree);
481 			break;
482 		case CRUSH_BUCKET_STRAW:
483 			size = sizeof(struct crush_bucket_straw);
484 			break;
485 		case CRUSH_BUCKET_STRAW2:
486 			size = sizeof(struct crush_bucket_straw2);
487 			break;
488 		default:
489 			goto bad;
490 		}
491 		BUG_ON(size == 0);
492 		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
493 		if (b == NULL)
494 			goto badmem;
495 
496 		ceph_decode_need(p, end, 4*sizeof(u32), bad);
497 		b->id = ceph_decode_32(p);
498 		b->type = ceph_decode_16(p);
499 		b->alg = ceph_decode_8(p);
500 		b->hash = ceph_decode_8(p);
501 		b->weight = ceph_decode_32(p);
502 		b->size = ceph_decode_32(p);
503 
504 		dout("crush_decode bucket size %d off %x %p to %p\n",
505 		     b->size, (int)(*p-start), *p, end);
506 
507 		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
508 		if (b->items == NULL)
509 			goto badmem;
510 
511 		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
512 		for (j = 0; j < b->size; j++)
513 			b->items[j] = ceph_decode_32(p);
514 
515 		switch (b->alg) {
516 		case CRUSH_BUCKET_UNIFORM:
517 			err = crush_decode_uniform_bucket(p, end,
518 				  (struct crush_bucket_uniform *)b);
519 			if (err < 0)
520 				goto fail;
521 			break;
522 		case CRUSH_BUCKET_LIST:
523 			err = crush_decode_list_bucket(p, end,
524 			       (struct crush_bucket_list *)b);
525 			if (err < 0)
526 				goto fail;
527 			break;
528 		case CRUSH_BUCKET_TREE:
529 			err = crush_decode_tree_bucket(p, end,
530 				(struct crush_bucket_tree *)b);
531 			if (err < 0)
532 				goto fail;
533 			break;
534 		case CRUSH_BUCKET_STRAW:
535 			err = crush_decode_straw_bucket(p, end,
536 				(struct crush_bucket_straw *)b);
537 			if (err < 0)
538 				goto fail;
539 			break;
540 		case CRUSH_BUCKET_STRAW2:
541 			err = crush_decode_straw2_bucket(p, end,
542 				(struct crush_bucket_straw2 *)b);
543 			if (err < 0)
544 				goto fail;
545 			break;
546 		}
547 	}
548 
549 	/* rules */
550 	dout("rule vec is %p\n", c->rules);
551 	for (i = 0; i < c->max_rules; i++) {
552 		u32 yes;
553 		struct crush_rule *r;
554 
555 		ceph_decode_32_safe(p, end, yes, bad);
556 		if (!yes) {
557 			dout("crush_decode NO rule %d off %x %p to %p\n",
558 			     i, (int)(*p-start), *p, end);
559 			c->rules[i] = NULL;
560 			continue;
561 		}
562 
563 		dout("crush_decode rule %d off %x %p to %p\n",
564 		     i, (int)(*p-start), *p, end);
565 
566 		/* len */
567 		ceph_decode_32_safe(p, end, yes, bad);
568 #if BITS_PER_LONG == 32
569 		if (yes > (ULONG_MAX - sizeof(*r))
570 			  / sizeof(struct crush_rule_step))
571 			goto bad;
572 #endif
573 		r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
574 		c->rules[i] = r;
575 		if (r == NULL)
576 			goto badmem;
577 		dout(" rule %d is at %p\n", i, r);
578 		r->len = yes;
579 		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
580 		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
581 		for (j = 0; j < r->len; j++) {
582 			r->steps[j].op = ceph_decode_32(p);
583 			r->steps[j].arg1 = ceph_decode_32(p);
584 			r->steps[j].arg2 = ceph_decode_32(p);
585 		}
586 	}
587 
588 	err = decode_crush_names(p, end, &c->type_names);
589 	if (err)
590 		goto fail;
591 
592 	err = decode_crush_names(p, end, &c->names);
593 	if (err)
594 		goto fail;
595 
596 	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
597 
598         /* tunables */
599         ceph_decode_need(p, end, 3*sizeof(u32), done);
600         c->choose_local_tries = ceph_decode_32(p);
601         c->choose_local_fallback_tries =  ceph_decode_32(p);
602         c->choose_total_tries = ceph_decode_32(p);
603         dout("crush decode tunable choose_local_tries = %d\n",
604              c->choose_local_tries);
605         dout("crush decode tunable choose_local_fallback_tries = %d\n",
606              c->choose_local_fallback_tries);
607         dout("crush decode tunable choose_total_tries = %d\n",
608              c->choose_total_tries);
609 
610 	ceph_decode_need(p, end, sizeof(u32), done);
611 	c->chooseleaf_descend_once = ceph_decode_32(p);
612 	dout("crush decode tunable chooseleaf_descend_once = %d\n",
613 	     c->chooseleaf_descend_once);
614 
615 	ceph_decode_need(p, end, sizeof(u8), done);
616 	c->chooseleaf_vary_r = ceph_decode_8(p);
617 	dout("crush decode tunable chooseleaf_vary_r = %d\n",
618 	     c->chooseleaf_vary_r);
619 
620 	/* skip straw_calc_version, allowed_bucket_algs */
621 	ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
622 	*p += sizeof(u8) + sizeof(u32);
623 
624 	ceph_decode_need(p, end, sizeof(u8), done);
625 	c->chooseleaf_stable = ceph_decode_8(p);
626 	dout("crush decode tunable chooseleaf_stable = %d\n",
627 	     c->chooseleaf_stable);
628 
629 	if (*p != end) {
630 		/* class_map */
631 		ceph_decode_skip_map(p, end, 32, 32, bad);
632 		/* class_name */
633 		ceph_decode_skip_map(p, end, 32, string, bad);
634 		/* class_bucket */
635 		ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
636 	}
637 
638 	if (*p != end) {
639 		err = decode_choose_args(p, end, c);
640 		if (err)
641 			goto fail;
642 	}
643 
644 done:
645 	crush_finalize(c);
646 	dout("crush_decode success\n");
647 	return c;
648 
649 badmem:
650 	err = -ENOMEM;
651 fail:
652 	dout("crush_decode fail %d\n", err);
653 	crush_destroy(c);
654 	return ERR_PTR(err);
655 
656 bad:
657 	err = -EINVAL;
658 	goto fail;
659 }
660 
661 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
662 {
663 	if (lhs->pool < rhs->pool)
664 		return -1;
665 	if (lhs->pool > rhs->pool)
666 		return 1;
667 	if (lhs->seed < rhs->seed)
668 		return -1;
669 	if (lhs->seed > rhs->seed)
670 		return 1;
671 
672 	return 0;
673 }
674 
675 int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
676 {
677 	int ret;
678 
679 	ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
680 	if (ret)
681 		return ret;
682 
683 	if (lhs->shard < rhs->shard)
684 		return -1;
685 	if (lhs->shard > rhs->shard)
686 		return 1;
687 
688 	return 0;
689 }
690 
691 static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
692 {
693 	struct ceph_pg_mapping *pg;
694 
695 	pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO);
696 	if (!pg)
697 		return NULL;
698 
699 	RB_CLEAR_NODE(&pg->node);
700 	return pg;
701 }
702 
703 static void free_pg_mapping(struct ceph_pg_mapping *pg)
704 {
705 	WARN_ON(!RB_EMPTY_NODE(&pg->node));
706 
707 	kfree(pg);
708 }
709 
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds) and primary_temp (explicit primary setting)
 *
 * Entries are keyed by pgid, passed by pointer and ordered with
 * ceph_pg_compare().  This file uses lookup_pg_mapping() and
 * erase_pg_mapping() from the generated set.
 */
DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
		 RB_BYPTR, const struct ceph_pg *, node)
716 
/*
 * rbtree of pg pool info
 *
 * Entries are keyed by pool id.  This file uses lookup_pg_pool(),
 * __insert_pg_pool() and erase_pg_pool() from the generated set.
 */
DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
721 
722 struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
723 {
724 	return lookup_pg_pool(&map->pg_pools, id);
725 }
726 
727 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
728 {
729 	struct ceph_pg_pool_info *pi;
730 
731 	if (id == CEPH_NOPOOL)
732 		return NULL;
733 
734 	if (WARN_ON_ONCE(id > (u64) INT_MAX))
735 		return NULL;
736 
737 	pi = lookup_pg_pool(&map->pg_pools, id);
738 	return pi ? pi->name : NULL;
739 }
740 EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
741 
742 int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
743 {
744 	struct rb_node *rbp;
745 
746 	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
747 		struct ceph_pg_pool_info *pi =
748 			rb_entry(rbp, struct ceph_pg_pool_info, node);
749 		if (pi->name && strcmp(pi->name, name) == 0)
750 			return pi->id;
751 	}
752 	return -ENOENT;
753 }
754 EXPORT_SYMBOL(ceph_pg_poolid_by_name);
755 
756 u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
757 {
758 	struct ceph_pg_pool_info *pi;
759 
760 	pi = lookup_pg_pool(&map->pg_pools, id);
761 	return pi ? pi->flags : 0;
762 }
763 EXPORT_SYMBOL(ceph_pg_pool_flags);
764 
765 static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
766 {
767 	erase_pg_pool(root, pi);
768 	kfree(pi->name);
769 	kfree(pi);
770 }
771 
772 static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
773 {
774 	u8 ev, cv;
775 	unsigned len, num;
776 	void *pool_end;
777 
778 	ceph_decode_need(p, end, 2 + 4, bad);
779 	ev = ceph_decode_8(p);  /* encoding version */
780 	cv = ceph_decode_8(p); /* compat version */
781 	if (ev < 5) {
782 		pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
783 		return -EINVAL;
784 	}
785 	if (cv > 9) {
786 		pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
787 		return -EINVAL;
788 	}
789 	len = ceph_decode_32(p);
790 	ceph_decode_need(p, end, len, bad);
791 	pool_end = *p + len;
792 
793 	pi->type = ceph_decode_8(p);
794 	pi->size = ceph_decode_8(p);
795 	pi->crush_ruleset = ceph_decode_8(p);
796 	pi->object_hash = ceph_decode_8(p);
797 
798 	pi->pg_num = ceph_decode_32(p);
799 	pi->pgp_num = ceph_decode_32(p);
800 
801 	*p += 4 + 4;  /* skip lpg* */
802 	*p += 4;      /* skip last_change */
803 	*p += 8 + 4;  /* skip snap_seq, snap_epoch */
804 
805 	/* skip snaps */
806 	num = ceph_decode_32(p);
807 	while (num--) {
808 		*p += 8;  /* snapid key */
809 		*p += 1 + 1; /* versions */
810 		len = ceph_decode_32(p);
811 		*p += len;
812 	}
813 
814 	/* skip removed_snaps */
815 	num = ceph_decode_32(p);
816 	*p += num * (8 + 8);
817 
818 	*p += 8;  /* skip auid */
819 	pi->flags = ceph_decode_64(p);
820 	*p += 4;  /* skip crash_replay_interval */
821 
822 	if (ev >= 7)
823 		pi->min_size = ceph_decode_8(p);
824 	else
825 		pi->min_size = pi->size - pi->size / 2;
826 
827 	if (ev >= 8)
828 		*p += 8 + 8;  /* skip quota_max_* */
829 
830 	if (ev >= 9) {
831 		/* skip tiers */
832 		num = ceph_decode_32(p);
833 		*p += num * 8;
834 
835 		*p += 8;  /* skip tier_of */
836 		*p += 1;  /* skip cache_mode */
837 
838 		pi->read_tier = ceph_decode_64(p);
839 		pi->write_tier = ceph_decode_64(p);
840 	} else {
841 		pi->read_tier = -1;
842 		pi->write_tier = -1;
843 	}
844 
845 	if (ev >= 10) {
846 		/* skip properties */
847 		num = ceph_decode_32(p);
848 		while (num--) {
849 			len = ceph_decode_32(p);
850 			*p += len; /* key */
851 			len = ceph_decode_32(p);
852 			*p += len; /* val */
853 		}
854 	}
855 
856 	if (ev >= 11) {
857 		/* skip hit_set_params */
858 		*p += 1 + 1; /* versions */
859 		len = ceph_decode_32(p);
860 		*p += len;
861 
862 		*p += 4; /* skip hit_set_period */
863 		*p += 4; /* skip hit_set_count */
864 	}
865 
866 	if (ev >= 12)
867 		*p += 4; /* skip stripe_width */
868 
869 	if (ev >= 13) {
870 		*p += 8; /* skip target_max_bytes */
871 		*p += 8; /* skip target_max_objects */
872 		*p += 4; /* skip cache_target_dirty_ratio_micro */
873 		*p += 4; /* skip cache_target_full_ratio_micro */
874 		*p += 4; /* skip cache_min_flush_age */
875 		*p += 4; /* skip cache_min_evict_age */
876 	}
877 
878 	if (ev >=  14) {
879 		/* skip erasure_code_profile */
880 		len = ceph_decode_32(p);
881 		*p += len;
882 	}
883 
884 	/*
885 	 * last_force_op_resend_preluminous, will be overridden if the
886 	 * map was encoded with RESEND_ON_SPLIT
887 	 */
888 	if (ev >= 15)
889 		pi->last_force_request_resend = ceph_decode_32(p);
890 	else
891 		pi->last_force_request_resend = 0;
892 
893 	if (ev >= 16)
894 		*p += 4; /* skip min_read_recency_for_promote */
895 
896 	if (ev >= 17)
897 		*p += 8; /* skip expected_num_objects */
898 
899 	if (ev >= 19)
900 		*p += 4; /* skip cache_target_dirty_high_ratio_micro */
901 
902 	if (ev >= 20)
903 		*p += 4; /* skip min_write_recency_for_promote */
904 
905 	if (ev >= 21)
906 		*p += 1; /* skip use_gmt_hitset */
907 
908 	if (ev >= 22)
909 		*p += 1; /* skip fast_read */
910 
911 	if (ev >= 23) {
912 		*p += 4; /* skip hit_set_grade_decay_rate */
913 		*p += 4; /* skip hit_set_search_last_n */
914 	}
915 
916 	if (ev >= 24) {
917 		/* skip opts */
918 		*p += 1 + 1; /* versions */
919 		len = ceph_decode_32(p);
920 		*p += len;
921 	}
922 
923 	if (ev >= 25)
924 		pi->last_force_request_resend = ceph_decode_32(p);
925 
926 	/* ignore the rest */
927 
928 	*p = pool_end;
929 	calc_pg_masks(pi);
930 	return 0;
931 
932 bad:
933 	return -EINVAL;
934 }
935 
936 static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
937 {
938 	struct ceph_pg_pool_info *pi;
939 	u32 num, len;
940 	u64 pool;
941 
942 	ceph_decode_32_safe(p, end, num, bad);
943 	dout(" %d pool names\n", num);
944 	while (num--) {
945 		ceph_decode_64_safe(p, end, pool, bad);
946 		ceph_decode_32_safe(p, end, len, bad);
947 		dout("  pool %llu len %d\n", pool, len);
948 		ceph_decode_need(p, end, len, bad);
949 		pi = lookup_pg_pool(&map->pg_pools, pool);
950 		if (pi) {
951 			char *name = kstrndup(*p, len, GFP_NOFS);
952 
953 			if (!name)
954 				return -ENOMEM;
955 			kfree(pi->name);
956 			pi->name = name;
957 			dout("  name is %s\n", pi->name);
958 		}
959 		*p += len;
960 	}
961 	return 0;
962 
963 bad:
964 	return -EINVAL;
965 }
966 
967 /*
968  * osd map
969  */
970 struct ceph_osdmap *ceph_osdmap_alloc(void)
971 {
972 	struct ceph_osdmap *map;
973 
974 	map = kzalloc(sizeof(*map), GFP_NOIO);
975 	if (!map)
976 		return NULL;
977 
978 	map->pg_pools = RB_ROOT;
979 	map->pool_max = -1;
980 	map->pg_temp = RB_ROOT;
981 	map->primary_temp = RB_ROOT;
982 	map->pg_upmap = RB_ROOT;
983 	map->pg_upmap_items = RB_ROOT;
984 	mutex_init(&map->crush_workspace_mutex);
985 
986 	return map;
987 }
988 
989 void ceph_osdmap_destroy(struct ceph_osdmap *map)
990 {
991 	dout("osdmap_destroy %p\n", map);
992 	if (map->crush)
993 		crush_destroy(map->crush);
994 	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
995 		struct ceph_pg_mapping *pg =
996 			rb_entry(rb_first(&map->pg_temp),
997 				 struct ceph_pg_mapping, node);
998 		erase_pg_mapping(&map->pg_temp, pg);
999 		free_pg_mapping(pg);
1000 	}
1001 	while (!RB_EMPTY_ROOT(&map->primary_temp)) {
1002 		struct ceph_pg_mapping *pg =
1003 			rb_entry(rb_first(&map->primary_temp),
1004 				 struct ceph_pg_mapping, node);
1005 		erase_pg_mapping(&map->primary_temp, pg);
1006 		free_pg_mapping(pg);
1007 	}
1008 	while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
1009 		struct ceph_pg_mapping *pg =
1010 			rb_entry(rb_first(&map->pg_upmap),
1011 				 struct ceph_pg_mapping, node);
1012 		rb_erase(&pg->node, &map->pg_upmap);
1013 		kfree(pg);
1014 	}
1015 	while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
1016 		struct ceph_pg_mapping *pg =
1017 			rb_entry(rb_first(&map->pg_upmap_items),
1018 				 struct ceph_pg_mapping, node);
1019 		rb_erase(&pg->node, &map->pg_upmap_items);
1020 		kfree(pg);
1021 	}
1022 	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
1023 		struct ceph_pg_pool_info *pi =
1024 			rb_entry(rb_first(&map->pg_pools),
1025 				 struct ceph_pg_pool_info, node);
1026 		__remove_pg_pool(&map->pg_pools, pi);
1027 	}
1028 	kvfree(map->osd_state);
1029 	kvfree(map->osd_weight);
1030 	kvfree(map->osd_addr);
1031 	kvfree(map->osd_primary_affinity);
1032 	kvfree(map->crush_workspace);
1033 	kfree(map);
1034 }
1035 
1036 /*
1037  * Adjust max_osd value, (re)allocate arrays.
1038  *
1039  * The new elements are properly initialized.
1040  */
1041 static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
1042 {
1043 	u32 *state;
1044 	u32 *weight;
1045 	struct ceph_entity_addr *addr;
1046 	u32 to_copy;
1047 	int i;
1048 
1049 	dout("%s old %u new %u\n", __func__, map->max_osd, max);
1050 	if (max == map->max_osd)
1051 		return 0;
1052 
1053 	state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
1054 	weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
1055 	addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
1056 	if (!state || !weight || !addr) {
1057 		kvfree(state);
1058 		kvfree(weight);
1059 		kvfree(addr);
1060 		return -ENOMEM;
1061 	}
1062 
1063 	to_copy = min(map->max_osd, max);
1064 	if (map->osd_state) {
1065 		memcpy(state, map->osd_state, to_copy * sizeof(*state));
1066 		memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
1067 		memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
1068 		kvfree(map->osd_state);
1069 		kvfree(map->osd_weight);
1070 		kvfree(map->osd_addr);
1071 	}
1072 
1073 	map->osd_state = state;
1074 	map->osd_weight = weight;
1075 	map->osd_addr = addr;
1076 	for (i = map->max_osd; i < max; i++) {
1077 		map->osd_state[i] = 0;
1078 		map->osd_weight[i] = CEPH_OSD_OUT;
1079 		memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
1080 	}
1081 
1082 	if (map->osd_primary_affinity) {
1083 		u32 *affinity;
1084 
1085 		affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
1086 					 GFP_NOFS);
1087 		if (!affinity)
1088 			return -ENOMEM;
1089 
1090 		memcpy(affinity, map->osd_primary_affinity,
1091 		       to_copy * sizeof(*affinity));
1092 		kvfree(map->osd_primary_affinity);
1093 
1094 		map->osd_primary_affinity = affinity;
1095 		for (i = map->max_osd; i < max; i++)
1096 			map->osd_primary_affinity[i] =
1097 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
1098 	}
1099 
1100 	map->max_osd = max;
1101 
1102 	return 0;
1103 }
1104 
1105 static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
1106 {
1107 	void *workspace;
1108 	size_t work_size;
1109 
1110 	if (IS_ERR(crush))
1111 		return PTR_ERR(crush);
1112 
1113 	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
1114 	dout("%s work_size %zu bytes\n", __func__, work_size);
1115 	workspace = ceph_kvmalloc(work_size, GFP_NOIO);
1116 	if (!workspace) {
1117 		crush_destroy(crush);
1118 		return -ENOMEM;
1119 	}
1120 	crush_init_workspace(crush, workspace);
1121 
1122 	if (map->crush)
1123 		crush_destroy(map->crush);
1124 	kvfree(map->crush_workspace);
1125 	map->crush = crush;
1126 	map->crush_workspace = workspace;
1127 	return 0;
1128 }
1129 
/*
 * Highest struct_compat values accepted by get_osdmap_client_data_v()
 * for the osdmap wrapper and for its client data section; anything
 * newer is rejected with -EINVAL.
 */
#define OSDMAP_WRAPPER_COMPAT_VER	7
#define OSDMAP_CLIENT_DATA_COMPAT_VER	1
1132 
1133 /*
1134  * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps,
1135  * to struct_v of the client_data section for new (v7 and above)
1136  * osdmaps.
1137  */
1138 static int get_osdmap_client_data_v(void **p, void *end,
1139 				    const char *prefix, u8 *v)
1140 {
1141 	u8 struct_v;
1142 
1143 	ceph_decode_8_safe(p, end, struct_v, e_inval);
1144 	if (struct_v >= 7) {
1145 		u8 struct_compat;
1146 
1147 		ceph_decode_8_safe(p, end, struct_compat, e_inval);
1148 		if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
1149 			pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
1150 				struct_v, struct_compat,
1151 				OSDMAP_WRAPPER_COMPAT_VER, prefix);
1152 			return -EINVAL;
1153 		}
1154 		*p += 4; /* ignore wrapper struct_len */
1155 
1156 		ceph_decode_8_safe(p, end, struct_v, e_inval);
1157 		ceph_decode_8_safe(p, end, struct_compat, e_inval);
1158 		if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
1159 			pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
1160 				struct_v, struct_compat,
1161 				OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
1162 			return -EINVAL;
1163 		}
1164 		*p += 4; /* ignore client data struct_len */
1165 	} else {
1166 		u16 version;
1167 
1168 		*p -= 1;
1169 		ceph_decode_16_safe(p, end, version, e_inval);
1170 		if (version < 6) {
1171 			pr_warn("got v %d < 6 of %s ceph_osdmap\n",
1172 				version, prefix);
1173 			return -EINVAL;
1174 		}
1175 
1176 		/* old osdmap enconding */
1177 		struct_v = 0;
1178 	}
1179 
1180 	*v = struct_v;
1181 	return 0;
1182 
1183 e_inval:
1184 	return -EINVAL;
1185 }
1186 
1187 static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
1188 			  bool incremental)
1189 {
1190 	u32 n;
1191 
1192 	ceph_decode_32_safe(p, end, n, e_inval);
1193 	while (n--) {
1194 		struct ceph_pg_pool_info *pi;
1195 		u64 pool;
1196 		int ret;
1197 
1198 		ceph_decode_64_safe(p, end, pool, e_inval);
1199 
1200 		pi = lookup_pg_pool(&map->pg_pools, pool);
1201 		if (!incremental || !pi) {
1202 			pi = kzalloc(sizeof(*pi), GFP_NOFS);
1203 			if (!pi)
1204 				return -ENOMEM;
1205 
1206 			RB_CLEAR_NODE(&pi->node);
1207 			pi->id = pool;
1208 
1209 			if (!__insert_pg_pool(&map->pg_pools, pi)) {
1210 				kfree(pi);
1211 				return -EEXIST;
1212 			}
1213 		}
1214 
1215 		ret = decode_pool(p, end, pi);
1216 		if (ret)
1217 			return ret;
1218 	}
1219 
1220 	return 0;
1221 
1222 e_inval:
1223 	return -EINVAL;
1224 }
1225 
1226 static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
1227 {
1228 	return __decode_pools(p, end, map, false);
1229 }
1230 
1231 static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
1232 {
1233 	return __decode_pools(p, end, map, true);
1234 }
1235 
1236 typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
1237 
1238 static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
1239 			     decode_mapping_fn_t fn, bool incremental)
1240 {
1241 	u32 n;
1242 
1243 	WARN_ON(!incremental && !fn);
1244 
1245 	ceph_decode_32_safe(p, end, n, e_inval);
1246 	while (n--) {
1247 		struct ceph_pg_mapping *pg;
1248 		struct ceph_pg pgid;
1249 		int ret;
1250 
1251 		ret = ceph_decode_pgid(p, end, &pgid);
1252 		if (ret)
1253 			return ret;
1254 
1255 		pg = lookup_pg_mapping(mapping_root, &pgid);
1256 		if (pg) {
1257 			WARN_ON(!incremental);
1258 			erase_pg_mapping(mapping_root, pg);
1259 			free_pg_mapping(pg);
1260 		}
1261 
1262 		if (fn) {
1263 			pg = fn(p, end, incremental);
1264 			if (IS_ERR(pg))
1265 				return PTR_ERR(pg);
1266 
1267 			if (pg) {
1268 				pg->pgid = pgid; /* struct */
1269 				insert_pg_mapping(mapping_root, pg);
1270 			}
1271 		}
1272 	}
1273 
1274 	return 0;
1275 
1276 e_inval:
1277 	return -EINVAL;
1278 }
1279 
1280 static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
1281 						bool incremental)
1282 {
1283 	struct ceph_pg_mapping *pg;
1284 	u32 len, i;
1285 
1286 	ceph_decode_32_safe(p, end, len, e_inval);
1287 	if (len == 0 && incremental)
1288 		return NULL;	/* new_pg_temp: [] to remove */
1289 	if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
1290 		return ERR_PTR(-EINVAL);
1291 
1292 	ceph_decode_need(p, end, len * sizeof(u32), e_inval);
1293 	pg = alloc_pg_mapping(len * sizeof(u32));
1294 	if (!pg)
1295 		return ERR_PTR(-ENOMEM);
1296 
1297 	pg->pg_temp.len = len;
1298 	for (i = 0; i < len; i++)
1299 		pg->pg_temp.osds[i] = ceph_decode_32(p);
1300 
1301 	return pg;
1302 
1303 e_inval:
1304 	return ERR_PTR(-EINVAL);
1305 }
1306 
1307 static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
1308 {
1309 	return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
1310 				 false);
1311 }
1312 
1313 static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
1314 {
1315 	return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
1316 				 true);
1317 }
1318 
1319 static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
1320 						     bool incremental)
1321 {
1322 	struct ceph_pg_mapping *pg;
1323 	u32 osd;
1324 
1325 	ceph_decode_32_safe(p, end, osd, e_inval);
1326 	if (osd == (u32)-1 && incremental)
1327 		return NULL;	/* new_primary_temp: -1 to remove */
1328 
1329 	pg = alloc_pg_mapping(0);
1330 	if (!pg)
1331 		return ERR_PTR(-ENOMEM);
1332 
1333 	pg->primary_temp.osd = osd;
1334 	return pg;
1335 
1336 e_inval:
1337 	return ERR_PTR(-EINVAL);
1338 }
1339 
1340 static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
1341 {
1342 	return decode_pg_mapping(p, end, &map->primary_temp,
1343 				 __decode_primary_temp, false);
1344 }
1345 
1346 static int decode_new_primary_temp(void **p, void *end,
1347 				   struct ceph_osdmap *map)
1348 {
1349 	return decode_pg_mapping(p, end, &map->primary_temp,
1350 				 __decode_primary_temp, true);
1351 }
1352 
1353 u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
1354 {
1355 	BUG_ON(osd >= map->max_osd);
1356 
1357 	if (!map->osd_primary_affinity)
1358 		return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
1359 
1360 	return map->osd_primary_affinity[osd];
1361 }
1362 
1363 static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
1364 {
1365 	BUG_ON(osd >= map->max_osd);
1366 
1367 	if (!map->osd_primary_affinity) {
1368 		int i;
1369 
1370 		map->osd_primary_affinity = ceph_kvmalloc(
1371 		    array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
1372 		    GFP_NOFS);
1373 		if (!map->osd_primary_affinity)
1374 			return -ENOMEM;
1375 
1376 		for (i = 0; i < map->max_osd; i++)
1377 			map->osd_primary_affinity[i] =
1378 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
1379 	}
1380 
1381 	map->osd_primary_affinity[osd] = aff;
1382 
1383 	return 0;
1384 }
1385 
1386 static int decode_primary_affinity(void **p, void *end,
1387 				   struct ceph_osdmap *map)
1388 {
1389 	u32 len, i;
1390 
1391 	ceph_decode_32_safe(p, end, len, e_inval);
1392 	if (len == 0) {
1393 		kvfree(map->osd_primary_affinity);
1394 		map->osd_primary_affinity = NULL;
1395 		return 0;
1396 	}
1397 	if (len != map->max_osd)
1398 		goto e_inval;
1399 
1400 	ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
1401 
1402 	for (i = 0; i < map->max_osd; i++) {
1403 		int ret;
1404 
1405 		ret = set_primary_affinity(map, i, ceph_decode_32(p));
1406 		if (ret)
1407 			return ret;
1408 	}
1409 
1410 	return 0;
1411 
1412 e_inval:
1413 	return -EINVAL;
1414 }
1415 
1416 static int decode_new_primary_affinity(void **p, void *end,
1417 				       struct ceph_osdmap *map)
1418 {
1419 	u32 n;
1420 
1421 	ceph_decode_32_safe(p, end, n, e_inval);
1422 	while (n--) {
1423 		u32 osd, aff;
1424 		int ret;
1425 
1426 		ceph_decode_32_safe(p, end, osd, e_inval);
1427 		ceph_decode_32_safe(p, end, aff, e_inval);
1428 
1429 		ret = set_primary_affinity(map, osd, aff);
1430 		if (ret)
1431 			return ret;
1432 
1433 		pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
1434 	}
1435 
1436 	return 0;
1437 
1438 e_inval:
1439 	return -EINVAL;
1440 }
1441 
1442 static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
1443 						 bool __unused)
1444 {
1445 	return __decode_pg_temp(p, end, false);
1446 }
1447 
1448 static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
1449 {
1450 	return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
1451 				 false);
1452 }
1453 
1454 static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
1455 {
1456 	return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
1457 				 true);
1458 }
1459 
1460 static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
1461 {
1462 	return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
1463 }
1464 
1465 static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
1466 						       bool __unused)
1467 {
1468 	struct ceph_pg_mapping *pg;
1469 	u32 len, i;
1470 
1471 	ceph_decode_32_safe(p, end, len, e_inval);
1472 	if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
1473 		return ERR_PTR(-EINVAL);
1474 
1475 	ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);
1476 	pg = alloc_pg_mapping(2 * len * sizeof(u32));
1477 	if (!pg)
1478 		return ERR_PTR(-ENOMEM);
1479 
1480 	pg->pg_upmap_items.len = len;
1481 	for (i = 0; i < len; i++) {
1482 		pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
1483 		pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
1484 	}
1485 
1486 	return pg;
1487 
1488 e_inval:
1489 	return ERR_PTR(-EINVAL);
1490 }
1491 
1492 static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
1493 {
1494 	return decode_pg_mapping(p, end, &map->pg_upmap_items,
1495 				 __decode_pg_upmap_items, false);
1496 }
1497 
1498 static int decode_new_pg_upmap_items(void **p, void *end,
1499 				     struct ceph_osdmap *map)
1500 {
1501 	return decode_pg_mapping(p, end, &map->pg_upmap_items,
1502 				 __decode_pg_upmap_items, true);
1503 }
1504 
1505 static int decode_old_pg_upmap_items(void **p, void *end,
1506 				     struct ceph_osdmap *map)
1507 {
1508 	return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
1509 }
1510 
1511 /*
1512  * decode a full map.
1513  */
1514 static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
1515 {
1516 	u8 struct_v;
1517 	u32 epoch = 0;
1518 	void *start = *p;
1519 	u32 max;
1520 	u32 len, i;
1521 	int err;
1522 
1523 	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
1524 
1525 	err = get_osdmap_client_data_v(p, end, "full", &struct_v);
1526 	if (err)
1527 		goto bad;
1528 
1529 	/* fsid, epoch, created, modified */
1530 	ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
1531 			 sizeof(map->created) + sizeof(map->modified), e_inval);
1532 	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
1533 	epoch = map->epoch = ceph_decode_32(p);
1534 	ceph_decode_copy(p, &map->created, sizeof(map->created));
1535 	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
1536 
1537 	/* pools */
1538 	err = decode_pools(p, end, map);
1539 	if (err)
1540 		goto bad;
1541 
1542 	/* pool_name */
1543 	err = decode_pool_names(p, end, map);
1544 	if (err)
1545 		goto bad;
1546 
1547 	ceph_decode_32_safe(p, end, map->pool_max, e_inval);
1548 
1549 	ceph_decode_32_safe(p, end, map->flags, e_inval);
1550 
1551 	/* max_osd */
1552 	ceph_decode_32_safe(p, end, max, e_inval);
1553 
1554 	/* (re)alloc osd arrays */
1555 	err = osdmap_set_max_osd(map, max);
1556 	if (err)
1557 		goto bad;
1558 
1559 	/* osd_state, osd_weight, osd_addrs->client_addr */
1560 	ceph_decode_need(p, end, 3*sizeof(u32) +
1561 			 map->max_osd*(struct_v >= 5 ? sizeof(u32) :
1562 						       sizeof(u8)) +
1563 				       sizeof(*map->osd_weight), e_inval);
1564 	if (ceph_decode_32(p) != map->max_osd)
1565 		goto e_inval;
1566 
1567 	if (struct_v >= 5) {
1568 		for (i = 0; i < map->max_osd; i++)
1569 			map->osd_state[i] = ceph_decode_32(p);
1570 	} else {
1571 		for (i = 0; i < map->max_osd; i++)
1572 			map->osd_state[i] = ceph_decode_8(p);
1573 	}
1574 
1575 	if (ceph_decode_32(p) != map->max_osd)
1576 		goto e_inval;
1577 
1578 	for (i = 0; i < map->max_osd; i++)
1579 		map->osd_weight[i] = ceph_decode_32(p);
1580 
1581 	if (ceph_decode_32(p) != map->max_osd)
1582 		goto e_inval;
1583 
1584 	for (i = 0; i < map->max_osd; i++) {
1585 		err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
1586 		if (err)
1587 			goto bad;
1588 	}
1589 
1590 	/* pg_temp */
1591 	err = decode_pg_temp(p, end, map);
1592 	if (err)
1593 		goto bad;
1594 
1595 	/* primary_temp */
1596 	if (struct_v >= 1) {
1597 		err = decode_primary_temp(p, end, map);
1598 		if (err)
1599 			goto bad;
1600 	}
1601 
1602 	/* primary_affinity */
1603 	if (struct_v >= 2) {
1604 		err = decode_primary_affinity(p, end, map);
1605 		if (err)
1606 			goto bad;
1607 	} else {
1608 		WARN_ON(map->osd_primary_affinity);
1609 	}
1610 
1611 	/* crush */
1612 	ceph_decode_32_safe(p, end, len, e_inval);
1613 	err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
1614 	if (err)
1615 		goto bad;
1616 
1617 	*p += len;
1618 	if (struct_v >= 3) {
1619 		/* erasure_code_profiles */
1620 		ceph_decode_skip_map_of_map(p, end, string, string, string,
1621 					    e_inval);
1622 	}
1623 
1624 	if (struct_v >= 4) {
1625 		err = decode_pg_upmap(p, end, map);
1626 		if (err)
1627 			goto bad;
1628 
1629 		err = decode_pg_upmap_items(p, end, map);
1630 		if (err)
1631 			goto bad;
1632 	} else {
1633 		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
1634 		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
1635 	}
1636 
1637 	/* ignore the rest */
1638 	*p = end;
1639 
1640 	dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
1641 	return 0;
1642 
1643 e_inval:
1644 	err = -EINVAL;
1645 bad:
1646 	pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
1647 	       err, epoch, (int)(*p - start), *p, start, end);
1648 	print_hex_dump(KERN_DEBUG, "osdmap: ",
1649 		       DUMP_PREFIX_OFFSET, 16, 1,
1650 		       start, end - start, true);
1651 	return err;
1652 }
1653 
1654 /*
1655  * Allocate and decode a full map.
1656  */
1657 struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1658 {
1659 	struct ceph_osdmap *map;
1660 	int ret;
1661 
1662 	map = ceph_osdmap_alloc();
1663 	if (!map)
1664 		return ERR_PTR(-ENOMEM);
1665 
1666 	ret = osdmap_decode(p, end, map);
1667 	if (ret) {
1668 		ceph_osdmap_destroy(map);
1669 		return ERR_PTR(ret);
1670 	}
1671 
1672 	return map;
1673 }
1674 
1675 /*
1676  * Encoding order is (new_up_client, new_state, new_weight).  Need to
1677  * apply in the (new_weight, new_state, new_up_client) order, because
1678  * an incremental map may look like e.g.
1679  *
1680  *     new_up_client: { osd=6, addr=... } # set osd_state and addr
1681  *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
1682  */
1683 static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
1684 				      struct ceph_osdmap *map)
1685 {
1686 	void *new_up_client;
1687 	void *new_state;
1688 	void *new_weight_end;
1689 	u32 len;
1690 	int i;
1691 
1692 	new_up_client = *p;
1693 	ceph_decode_32_safe(p, end, len, e_inval);
1694 	for (i = 0; i < len; ++i) {
1695 		struct ceph_entity_addr addr;
1696 
1697 		ceph_decode_skip_32(p, end, e_inval);
1698 		if (ceph_decode_entity_addr(p, end, &addr))
1699 			goto e_inval;
1700 	}
1701 
1702 	new_state = *p;
1703 	ceph_decode_32_safe(p, end, len, e_inval);
1704 	len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
1705 	ceph_decode_need(p, end, len, e_inval);
1706 	*p += len;
1707 
1708 	/* new_weight */
1709 	ceph_decode_32_safe(p, end, len, e_inval);
1710 	while (len--) {
1711 		s32 osd;
1712 		u32 w;
1713 
1714 		ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
1715 		osd = ceph_decode_32(p);
1716 		w = ceph_decode_32(p);
1717 		BUG_ON(osd >= map->max_osd);
1718 		pr_info("osd%d weight 0x%x %s\n", osd, w,
1719 		     w == CEPH_OSD_IN ? "(in)" :
1720 		     (w == CEPH_OSD_OUT ? "(out)" : ""));
1721 		map->osd_weight[osd] = w;
1722 
1723 		/*
1724 		 * If we are marking in, set the EXISTS, and clear the
1725 		 * AUTOOUT and NEW bits.
1726 		 */
1727 		if (w) {
1728 			map->osd_state[osd] |= CEPH_OSD_EXISTS;
1729 			map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
1730 						 CEPH_OSD_NEW);
1731 		}
1732 	}
1733 	new_weight_end = *p;
1734 
1735 	/* new_state (up/down) */
1736 	*p = new_state;
1737 	len = ceph_decode_32(p);
1738 	while (len--) {
1739 		s32 osd;
1740 		u32 xorstate;
1741 		int ret;
1742 
1743 		osd = ceph_decode_32(p);
1744 		if (struct_v >= 5)
1745 			xorstate = ceph_decode_32(p);
1746 		else
1747 			xorstate = ceph_decode_8(p);
1748 		if (xorstate == 0)
1749 			xorstate = CEPH_OSD_UP;
1750 		BUG_ON(osd >= map->max_osd);
1751 		if ((map->osd_state[osd] & CEPH_OSD_UP) &&
1752 		    (xorstate & CEPH_OSD_UP))
1753 			pr_info("osd%d down\n", osd);
1754 		if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1755 		    (xorstate & CEPH_OSD_EXISTS)) {
1756 			pr_info("osd%d does not exist\n", osd);
1757 			ret = set_primary_affinity(map, osd,
1758 						   CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1759 			if (ret)
1760 				return ret;
1761 			memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
1762 			map->osd_state[osd] = 0;
1763 		} else {
1764 			map->osd_state[osd] ^= xorstate;
1765 		}
1766 	}
1767 
1768 	/* new_up_client */
1769 	*p = new_up_client;
1770 	len = ceph_decode_32(p);
1771 	while (len--) {
1772 		s32 osd;
1773 		struct ceph_entity_addr addr;
1774 
1775 		osd = ceph_decode_32(p);
1776 		BUG_ON(osd >= map->max_osd);
1777 		if (ceph_decode_entity_addr(p, end, &addr))
1778 			goto e_inval;
1779 		pr_info("osd%d up\n", osd);
1780 		map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1781 		map->osd_addr[osd] = addr;
1782 	}
1783 
1784 	*p = new_weight_end;
1785 	return 0;
1786 
1787 e_inval:
1788 	return -EINVAL;
1789 }
1790 
1791 /*
1792  * decode and apply an incremental map update.
1793  */
1794 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1795 					     struct ceph_osdmap *map)
1796 {
1797 	struct ceph_fsid fsid;
1798 	u32 epoch = 0;
1799 	struct ceph_timespec modified;
1800 	s32 len;
1801 	u64 pool;
1802 	__s64 new_pool_max;
1803 	__s32 new_flags, max;
1804 	void *start = *p;
1805 	int err;
1806 	u8 struct_v;
1807 
1808 	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
1809 
1810 	err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
1811 	if (err)
1812 		goto bad;
1813 
1814 	/* fsid, epoch, modified, new_pool_max, new_flags */
1815 	ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
1816 			 sizeof(u64) + sizeof(u32), e_inval);
1817 	ceph_decode_copy(p, &fsid, sizeof(fsid));
1818 	epoch = ceph_decode_32(p);
1819 	BUG_ON(epoch != map->epoch+1);
1820 	ceph_decode_copy(p, &modified, sizeof(modified));
1821 	new_pool_max = ceph_decode_64(p);
1822 	new_flags = ceph_decode_32(p);
1823 
1824 	/* full map? */
1825 	ceph_decode_32_safe(p, end, len, e_inval);
1826 	if (len > 0) {
1827 		dout("apply_incremental full map len %d, %p to %p\n",
1828 		     len, *p, end);
1829 		return ceph_osdmap_decode(p, min(*p+len, end));
1830 	}
1831 
1832 	/* new crush? */
1833 	ceph_decode_32_safe(p, end, len, e_inval);
1834 	if (len > 0) {
1835 		err = osdmap_set_crush(map,
1836 				       crush_decode(*p, min(*p + len, end)));
1837 		if (err)
1838 			goto bad;
1839 		*p += len;
1840 	}
1841 
1842 	/* new flags? */
1843 	if (new_flags >= 0)
1844 		map->flags = new_flags;
1845 	if (new_pool_max >= 0)
1846 		map->pool_max = new_pool_max;
1847 
1848 	/* new max? */
1849 	ceph_decode_32_safe(p, end, max, e_inval);
1850 	if (max >= 0) {
1851 		err = osdmap_set_max_osd(map, max);
1852 		if (err)
1853 			goto bad;
1854 	}
1855 
1856 	map->epoch++;
1857 	map->modified = modified;
1858 
1859 	/* new_pools */
1860 	err = decode_new_pools(p, end, map);
1861 	if (err)
1862 		goto bad;
1863 
1864 	/* new_pool_names */
1865 	err = decode_pool_names(p, end, map);
1866 	if (err)
1867 		goto bad;
1868 
1869 	/* old_pool */
1870 	ceph_decode_32_safe(p, end, len, e_inval);
1871 	while (len--) {
1872 		struct ceph_pg_pool_info *pi;
1873 
1874 		ceph_decode_64_safe(p, end, pool, e_inval);
1875 		pi = lookup_pg_pool(&map->pg_pools, pool);
1876 		if (pi)
1877 			__remove_pg_pool(&map->pg_pools, pi);
1878 	}
1879 
1880 	/* new_up_client, new_state, new_weight */
1881 	err = decode_new_up_state_weight(p, end, struct_v, map);
1882 	if (err)
1883 		goto bad;
1884 
1885 	/* new_pg_temp */
1886 	err = decode_new_pg_temp(p, end, map);
1887 	if (err)
1888 		goto bad;
1889 
1890 	/* new_primary_temp */
1891 	if (struct_v >= 1) {
1892 		err = decode_new_primary_temp(p, end, map);
1893 		if (err)
1894 			goto bad;
1895 	}
1896 
1897 	/* new_primary_affinity */
1898 	if (struct_v >= 2) {
1899 		err = decode_new_primary_affinity(p, end, map);
1900 		if (err)
1901 			goto bad;
1902 	}
1903 
1904 	if (struct_v >= 3) {
1905 		/* new_erasure_code_profiles */
1906 		ceph_decode_skip_map_of_map(p, end, string, string, string,
1907 					    e_inval);
1908 		/* old_erasure_code_profiles */
1909 		ceph_decode_skip_set(p, end, string, e_inval);
1910 	}
1911 
1912 	if (struct_v >= 4) {
1913 		err = decode_new_pg_upmap(p, end, map);
1914 		if (err)
1915 			goto bad;
1916 
1917 		err = decode_old_pg_upmap(p, end, map);
1918 		if (err)
1919 			goto bad;
1920 
1921 		err = decode_new_pg_upmap_items(p, end, map);
1922 		if (err)
1923 			goto bad;
1924 
1925 		err = decode_old_pg_upmap_items(p, end, map);
1926 		if (err)
1927 			goto bad;
1928 	}
1929 
1930 	/* ignore the rest */
1931 	*p = end;
1932 
1933 	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
1934 	return map;
1935 
1936 e_inval:
1937 	err = -EINVAL;
1938 bad:
1939 	pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
1940 	       err, epoch, (int)(*p - start), *p, start, end);
1941 	print_hex_dump(KERN_DEBUG, "osdmap: ",
1942 		       DUMP_PREFIX_OFFSET, 16, 1,
1943 		       start, end - start, true);
1944 	return ERR_PTR(err);
1945 }
1946 
1947 void ceph_oloc_copy(struct ceph_object_locator *dest,
1948 		    const struct ceph_object_locator *src)
1949 {
1950 	ceph_oloc_destroy(dest);
1951 
1952 	dest->pool = src->pool;
1953 	if (src->pool_ns)
1954 		dest->pool_ns = ceph_get_string(src->pool_ns);
1955 	else
1956 		dest->pool_ns = NULL;
1957 }
1958 EXPORT_SYMBOL(ceph_oloc_copy);
1959 
1960 void ceph_oloc_destroy(struct ceph_object_locator *oloc)
1961 {
1962 	ceph_put_string(oloc->pool_ns);
1963 }
1964 EXPORT_SYMBOL(ceph_oloc_destroy);
1965 
1966 void ceph_oid_copy(struct ceph_object_id *dest,
1967 		   const struct ceph_object_id *src)
1968 {
1969 	ceph_oid_destroy(dest);
1970 
1971 	if (src->name != src->inline_name) {
1972 		/* very rare, see ceph_object_id definition */
1973 		dest->name = kmalloc(src->name_len + 1,
1974 				     GFP_NOIO | __GFP_NOFAIL);
1975 	} else {
1976 		dest->name = dest->inline_name;
1977 	}
1978 	memcpy(dest->name, src->name, src->name_len + 1);
1979 	dest->name_len = src->name_len;
1980 }
1981 EXPORT_SYMBOL(ceph_oid_copy);
1982 
1983 static __printf(2, 0)
1984 int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
1985 {
1986 	int len;
1987 
1988 	WARN_ON(!ceph_oid_empty(oid));
1989 
1990 	len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
1991 	if (len >= sizeof(oid->inline_name))
1992 		return len;
1993 
1994 	oid->name_len = len;
1995 	return 0;
1996 }
1997 
1998 /*
1999  * If oid doesn't fit into inline buffer, BUG.
2000  */
2001 void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
2002 {
2003 	va_list ap;
2004 
2005 	va_start(ap, fmt);
2006 	BUG_ON(oid_printf_vargs(oid, fmt, ap));
2007 	va_end(ap);
2008 }
2009 EXPORT_SYMBOL(ceph_oid_printf);
2010 
2011 static __printf(3, 0)
2012 int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
2013 		      const char *fmt, va_list ap)
2014 {
2015 	va_list aq;
2016 	int len;
2017 
2018 	va_copy(aq, ap);
2019 	len = oid_printf_vargs(oid, fmt, aq);
2020 	va_end(aq);
2021 
2022 	if (len) {
2023 		char *external_name;
2024 
2025 		external_name = kmalloc(len + 1, gfp);
2026 		if (!external_name)
2027 			return -ENOMEM;
2028 
2029 		oid->name = external_name;
2030 		WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
2031 		oid->name_len = len;
2032 	}
2033 
2034 	return 0;
2035 }
2036 
2037 /*
2038  * If oid doesn't fit into inline buffer, allocate.
2039  */
2040 int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
2041 		     const char *fmt, ...)
2042 {
2043 	va_list ap;
2044 	int ret;
2045 
2046 	va_start(ap, fmt);
2047 	ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
2048 	va_end(ap);
2049 
2050 	return ret;
2051 }
2052 EXPORT_SYMBOL(ceph_oid_aprintf);
2053 
2054 void ceph_oid_destroy(struct ceph_object_id *oid)
2055 {
2056 	if (oid->name != oid->inline_name)
2057 		kfree(oid->name);
2058 }
2059 EXPORT_SYMBOL(ceph_oid_destroy);
2060 
2061 /*
2062  * osds only
2063  */
2064 static bool __osds_equal(const struct ceph_osds *lhs,
2065 			 const struct ceph_osds *rhs)
2066 {
2067 	if (lhs->size == rhs->size &&
2068 	    !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
2069 		return true;
2070 
2071 	return false;
2072 }
2073 
2074 /*
2075  * osds + primary
2076  */
2077 static bool osds_equal(const struct ceph_osds *lhs,
2078 		       const struct ceph_osds *rhs)
2079 {
2080 	if (__osds_equal(lhs, rhs) &&
2081 	    lhs->primary == rhs->primary)
2082 		return true;
2083 
2084 	return false;
2085 }
2086 
2087 static bool osds_valid(const struct ceph_osds *set)
2088 {
2089 	/* non-empty set */
2090 	if (set->size > 0 && set->primary >= 0)
2091 		return true;
2092 
2093 	/* empty can_shift_osds set */
2094 	if (!set->size && set->primary == -1)
2095 		return true;
2096 
2097 	/* empty !can_shift_osds set - all NONE */
2098 	if (set->size > 0 && set->primary == -1) {
2099 		int i;
2100 
2101 		for (i = 0; i < set->size; i++) {
2102 			if (set->osds[i] != CRUSH_ITEM_NONE)
2103 				break;
2104 		}
2105 		if (i == set->size)
2106 			return true;
2107 	}
2108 
2109 	return false;
2110 }
2111 
2112 void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
2113 {
2114 	memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
2115 	dest->size = src->size;
2116 	dest->primary = src->primary;
2117 }
2118 
2119 bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
2120 		      u32 new_pg_num)
2121 {
2122 	int old_bits = calc_bits_of(old_pg_num);
2123 	int old_mask = (1 << old_bits) - 1;
2124 	int n;
2125 
2126 	WARN_ON(pgid->seed >= old_pg_num);
2127 	if (new_pg_num <= old_pg_num)
2128 		return false;
2129 
2130 	for (n = 1; ; n++) {
2131 		int next_bit = n << (old_bits - 1);
2132 		u32 s = next_bit | pgid->seed;
2133 
2134 		if (s < old_pg_num || s == pgid->seed)
2135 			continue;
2136 		if (s >= new_pg_num)
2137 			break;
2138 
2139 		s = ceph_stable_mod(s, old_pg_num, old_mask);
2140 		if (s == pgid->seed)
2141 			return true;
2142 	}
2143 
2144 	return false;
2145 }
2146 
2147 bool ceph_is_new_interval(const struct ceph_osds *old_acting,
2148 			  const struct ceph_osds *new_acting,
2149 			  const struct ceph_osds *old_up,
2150 			  const struct ceph_osds *new_up,
2151 			  int old_size,
2152 			  int new_size,
2153 			  int old_min_size,
2154 			  int new_min_size,
2155 			  u32 old_pg_num,
2156 			  u32 new_pg_num,
2157 			  bool old_sort_bitwise,
2158 			  bool new_sort_bitwise,
2159 			  bool old_recovery_deletes,
2160 			  bool new_recovery_deletes,
2161 			  const struct ceph_pg *pgid)
2162 {
2163 	return !osds_equal(old_acting, new_acting) ||
2164 	       !osds_equal(old_up, new_up) ||
2165 	       old_size != new_size ||
2166 	       old_min_size != new_min_size ||
2167 	       ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
2168 	       old_sort_bitwise != new_sort_bitwise ||
2169 	       old_recovery_deletes != new_recovery_deletes;
2170 }
2171 
2172 static int calc_pg_rank(int osd, const struct ceph_osds *acting)
2173 {
2174 	int i;
2175 
2176 	for (i = 0; i < acting->size; i++) {
2177 		if (acting->osds[i] == osd)
2178 			return i;
2179 	}
2180 
2181 	return -1;
2182 }
2183 
2184 static bool primary_changed(const struct ceph_osds *old_acting,
2185 			    const struct ceph_osds *new_acting)
2186 {
2187 	if (!old_acting->size && !new_acting->size)
2188 		return false; /* both still empty */
2189 
2190 	if (!old_acting->size ^ !new_acting->size)
2191 		return true; /* was empty, now not, or vice versa */
2192 
2193 	if (old_acting->primary != new_acting->primary)
2194 		return true; /* primary changed */
2195 
2196 	if (calc_pg_rank(old_acting->primary, old_acting) !=
2197 	    calc_pg_rank(new_acting->primary, new_acting))
2198 		return true;
2199 
2200 	return false; /* same primary (tho replicas may have changed) */
2201 }
2202 
2203 bool ceph_osds_changed(const struct ceph_osds *old_acting,
2204 		       const struct ceph_osds *new_acting,
2205 		       bool any_change)
2206 {
2207 	if (primary_changed(old_acting, new_acting))
2208 		return true;
2209 
2210 	if (any_change && !__osds_equal(old_acting, new_acting))
2211 		return true;
2212 
2213 	return false;
2214 }
2215 
2216 /*
2217  * Map an object into a PG.
2218  *
2219  * Should only be called with target_oid and target_oloc (as opposed to
2220  * base_oid and base_oloc), since tiering isn't taken into account.
2221  */
2222 void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
2223 				 const struct ceph_object_id *oid,
2224 				 const struct ceph_object_locator *oloc,
2225 				 struct ceph_pg *raw_pgid)
2226 {
2227 	WARN_ON(pi->id != oloc->pool);
2228 
2229 	if (!oloc->pool_ns) {
2230 		raw_pgid->pool = oloc->pool;
2231 		raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
2232 					     oid->name_len);
2233 		dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
2234 		     raw_pgid->pool, raw_pgid->seed);
2235 	} else {
2236 		char stack_buf[256];
2237 		char *buf = stack_buf;
2238 		int nsl = oloc->pool_ns->len;
2239 		size_t total = nsl + 1 + oid->name_len;
2240 
2241 		if (total > sizeof(stack_buf))
2242 			buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL);
2243 		memcpy(buf, oloc->pool_ns->str, nsl);
2244 		buf[nsl] = '\037';
2245 		memcpy(buf + nsl + 1, oid->name, oid->name_len);
2246 		raw_pgid->pool = oloc->pool;
2247 		raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
2248 		if (buf != stack_buf)
2249 			kfree(buf);
2250 		dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
2251 		     oid->name, nsl, oloc->pool_ns->str,
2252 		     raw_pgid->pool, raw_pgid->seed);
2253 	}
2254 }
2255 
2256 int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
2257 			      const struct ceph_object_id *oid,
2258 			      const struct ceph_object_locator *oloc,
2259 			      struct ceph_pg *raw_pgid)
2260 {
2261 	struct ceph_pg_pool_info *pi;
2262 
2263 	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
2264 	if (!pi)
2265 		return -ENOENT;
2266 
2267 	__ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
2268 	return 0;
2269 }
2270 EXPORT_SYMBOL(ceph_object_locator_to_pg);
2271 
2272 /*
2273  * Map a raw PG (full precision ps) into an actual PG.
2274  */
2275 static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
2276 			 const struct ceph_pg *raw_pgid,
2277 			 struct ceph_pg *pgid)
2278 {
2279 	pgid->pool = raw_pgid->pool;
2280 	pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
2281 				     pi->pg_num_mask);
2282 }
2283 
2284 /*
2285  * Map a raw PG (full precision ps) into a placement ps (placement
2286  * seed).  Include pool id in that value so that different pools don't
2287  * use the same seeds.
2288  */
2289 static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
2290 			 const struct ceph_pg *raw_pgid)
2291 {
2292 	if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
2293 		/* hash pool id and seed so that pool PGs do not overlap */
2294 		return crush_hash32_2(CRUSH_HASH_RJENKINS1,
2295 				      ceph_stable_mod(raw_pgid->seed,
2296 						      pi->pgp_num,
2297 						      pi->pgp_num_mask),
2298 				      raw_pgid->pool);
2299 	} else {
2300 		/*
2301 		 * legacy behavior: add ps and pool together.  this is
2302 		 * not a great approach because the PGs from each pool
2303 		 * will overlap on top of each other: 0.5 == 1.4 ==
2304 		 * 2.3 == ...
2305 		 */
2306 		return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
2307 				       pi->pgp_num_mask) +
2308 		       (unsigned)raw_pgid->pool;
2309 	}
2310 }
2311 
2312 /*
2313  * Magic value used for a "default" fallback choose_args, used if the
2314  * crush_choose_arg_map passed to do_crush() does not exist.  If this
2315  * also doesn't exist, fall back to canonical weights.
2316  */
2317 #define CEPH_DEFAULT_CHOOSE_ARGS	-1
2318 
2319 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
2320 		    int *result, int result_max,
2321 		    const __u32 *weight, int weight_max,
2322 		    s64 choose_args_index)
2323 {
2324 	struct crush_choose_arg_map *arg_map;
2325 	int r;
2326 
2327 	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
2328 
2329 	arg_map = lookup_choose_arg_map(&map->crush->choose_args,
2330 					choose_args_index);
2331 	if (!arg_map)
2332 		arg_map = lookup_choose_arg_map(&map->crush->choose_args,
2333 						CEPH_DEFAULT_CHOOSE_ARGS);
2334 
2335 	mutex_lock(&map->crush_workspace_mutex);
2336 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
2337 			  weight, weight_max, map->crush_workspace,
2338 			  arg_map ? arg_map->args : NULL);
2339 	mutex_unlock(&map->crush_workspace_mutex);
2340 
2341 	return r;
2342 }
2343 
2344 static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
2345 				    struct ceph_pg_pool_info *pi,
2346 				    struct ceph_osds *set)
2347 {
2348 	int i;
2349 
2350 	if (ceph_can_shift_osds(pi)) {
2351 		int removed = 0;
2352 
2353 		/* shift left */
2354 		for (i = 0; i < set->size; i++) {
2355 			if (!ceph_osd_exists(osdmap, set->osds[i])) {
2356 				removed++;
2357 				continue;
2358 			}
2359 			if (removed)
2360 				set->osds[i - removed] = set->osds[i];
2361 		}
2362 		set->size -= removed;
2363 	} else {
2364 		/* set dne devices to NONE */
2365 		for (i = 0; i < set->size; i++) {
2366 			if (!ceph_osd_exists(osdmap, set->osds[i]))
2367 				set->osds[i] = CRUSH_ITEM_NONE;
2368 		}
2369 	}
2370 }
2371 
2372 /*
2373  * Calculate raw set (CRUSH output) for given PG and filter out
2374  * nonexistent OSDs.  ->primary is undefined for a raw set.
2375  *
2376  * Placement seed (CRUSH input) is returned through @ppps.
2377  */
2378 static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
2379 			   struct ceph_pg_pool_info *pi,
2380 			   const struct ceph_pg *raw_pgid,
2381 			   struct ceph_osds *raw,
2382 			   u32 *ppps)
2383 {
2384 	u32 pps = raw_pg_to_pps(pi, raw_pgid);
2385 	int ruleno;
2386 	int len;
2387 
2388 	ceph_osds_init(raw);
2389 	if (ppps)
2390 		*ppps = pps;
2391 
2392 	ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
2393 				 pi->size);
2394 	if (ruleno < 0) {
2395 		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
2396 		       pi->id, pi->crush_ruleset, pi->type, pi->size);
2397 		return;
2398 	}
2399 
2400 	if (pi->size > ARRAY_SIZE(raw->osds)) {
2401 		pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
2402 		       pi->id, pi->crush_ruleset, pi->type, pi->size,
2403 		       ARRAY_SIZE(raw->osds));
2404 		return;
2405 	}
2406 
2407 	len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
2408 		       osdmap->osd_weight, osdmap->max_osd, pi->id);
2409 	if (len < 0) {
2410 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
2411 		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
2412 		       pi->size);
2413 		return;
2414 	}
2415 
2416 	raw->size = len;
2417 	remove_nonexistent_osds(osdmap, pi, raw);
2418 }
2419 
2420 /* apply pg_upmap[_items] mappings */
2421 static void apply_upmap(struct ceph_osdmap *osdmap,
2422 			const struct ceph_pg *pgid,
2423 			struct ceph_osds *raw)
2424 {
2425 	struct ceph_pg_mapping *pg;
2426 	int i, j;
2427 
2428 	pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
2429 	if (pg) {
2430 		/* make sure targets aren't marked out */
2431 		for (i = 0; i < pg->pg_upmap.len; i++) {
2432 			int osd = pg->pg_upmap.osds[i];
2433 
2434 			if (osd != CRUSH_ITEM_NONE &&
2435 			    osd < osdmap->max_osd &&
2436 			    osdmap->osd_weight[osd] == 0) {
2437 				/* reject/ignore explicit mapping */
2438 				return;
2439 			}
2440 		}
2441 		for (i = 0; i < pg->pg_upmap.len; i++)
2442 			raw->osds[i] = pg->pg_upmap.osds[i];
2443 		raw->size = pg->pg_upmap.len;
2444 		/* check and apply pg_upmap_items, if any */
2445 	}
2446 
2447 	pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
2448 	if (pg) {
2449 		/*
2450 		 * Note: this approach does not allow a bidirectional swap,
2451 		 * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2452 		 */
2453 		for (i = 0; i < pg->pg_upmap_items.len; i++) {
2454 			int from = pg->pg_upmap_items.from_to[i][0];
2455 			int to = pg->pg_upmap_items.from_to[i][1];
2456 			int pos = -1;
2457 			bool exists = false;
2458 
2459 			/* make sure replacement doesn't already appear */
2460 			for (j = 0; j < raw->size; j++) {
2461 				int osd = raw->osds[j];
2462 
2463 				if (osd == to) {
2464 					exists = true;
2465 					break;
2466 				}
2467 				/* ignore mapping if target is marked out */
2468 				if (osd == from && pos < 0 &&
2469 				    !(to != CRUSH_ITEM_NONE &&
2470 				      to < osdmap->max_osd &&
2471 				      osdmap->osd_weight[to] == 0)) {
2472 					pos = j;
2473 				}
2474 			}
2475 			if (!exists && pos >= 0)
2476 				raw->osds[pos] = to;
2477 		}
2478 	}
2479 }
2480 
2481 /*
2482  * Given raw set, calculate up set and up primary.  By definition of an
2483  * up set, the result won't contain nonexistent or down OSDs.
2484  *
2485  * This is done in-place - on return @set is the up set.  If it's
2486  * empty, ->primary will remain undefined.
2487  */
2488 static void raw_to_up_osds(struct ceph_osdmap *osdmap,
2489 			   struct ceph_pg_pool_info *pi,
2490 			   struct ceph_osds *set)
2491 {
2492 	int i;
2493 
2494 	/* ->primary is undefined for a raw set */
2495 	BUG_ON(set->primary != -1);
2496 
2497 	if (ceph_can_shift_osds(pi)) {
2498 		int removed = 0;
2499 
2500 		/* shift left */
2501 		for (i = 0; i < set->size; i++) {
2502 			if (ceph_osd_is_down(osdmap, set->osds[i])) {
2503 				removed++;
2504 				continue;
2505 			}
2506 			if (removed)
2507 				set->osds[i - removed] = set->osds[i];
2508 		}
2509 		set->size -= removed;
2510 		if (set->size > 0)
2511 			set->primary = set->osds[0];
2512 	} else {
2513 		/* set down/dne devices to NONE */
2514 		for (i = set->size - 1; i >= 0; i--) {
2515 			if (ceph_osd_is_down(osdmap, set->osds[i]))
2516 				set->osds[i] = CRUSH_ITEM_NONE;
2517 			else
2518 				set->primary = set->osds[i];
2519 		}
2520 	}
2521 }
2522 
2523 static void apply_primary_affinity(struct ceph_osdmap *osdmap,
2524 				   struct ceph_pg_pool_info *pi,
2525 				   u32 pps,
2526 				   struct ceph_osds *up)
2527 {
2528 	int i;
2529 	int pos = -1;
2530 
2531 	/*
2532 	 * Do we have any non-default primary_affinity values for these
2533 	 * osds?
2534 	 */
2535 	if (!osdmap->osd_primary_affinity)
2536 		return;
2537 
2538 	for (i = 0; i < up->size; i++) {
2539 		int osd = up->osds[i];
2540 
2541 		if (osd != CRUSH_ITEM_NONE &&
2542 		    osdmap->osd_primary_affinity[osd] !=
2543 					CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
2544 			break;
2545 		}
2546 	}
2547 	if (i == up->size)
2548 		return;
2549 
2550 	/*
2551 	 * Pick the primary.  Feed both the seed (for the pg) and the
2552 	 * osd into the hash/rng so that a proportional fraction of an
2553 	 * osd's pgs get rejected as primary.
2554 	 */
2555 	for (i = 0; i < up->size; i++) {
2556 		int osd = up->osds[i];
2557 		u32 aff;
2558 
2559 		if (osd == CRUSH_ITEM_NONE)
2560 			continue;
2561 
2562 		aff = osdmap->osd_primary_affinity[osd];
2563 		if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
2564 		    (crush_hash32_2(CRUSH_HASH_RJENKINS1,
2565 				    pps, osd) >> 16) >= aff) {
2566 			/*
2567 			 * We chose not to use this primary.  Note it
2568 			 * anyway as a fallback in case we don't pick
2569 			 * anyone else, but keep looking.
2570 			 */
2571 			if (pos < 0)
2572 				pos = i;
2573 		} else {
2574 			pos = i;
2575 			break;
2576 		}
2577 	}
2578 	if (pos < 0)
2579 		return;
2580 
2581 	up->primary = up->osds[pos];
2582 
2583 	if (ceph_can_shift_osds(pi) && pos > 0) {
2584 		/* move the new primary to the front */
2585 		for (i = pos; i > 0; i--)
2586 			up->osds[i] = up->osds[i - 1];
2587 		up->osds[0] = up->primary;
2588 	}
2589 }
2590 
2591 /*
2592  * Get pg_temp and primary_temp mappings for given PG.
2593  *
2594  * Note that a PG may have none, only pg_temp, only primary_temp or
2595  * both pg_temp and primary_temp mappings.  This means @temp isn't
2596  * always a valid OSD set on return: in the "only primary_temp" case,
2597  * @temp will have its ->primary >= 0 but ->size == 0.
2598  */
2599 static void get_temp_osds(struct ceph_osdmap *osdmap,
2600 			  struct ceph_pg_pool_info *pi,
2601 			  const struct ceph_pg *pgid,
2602 			  struct ceph_osds *temp)
2603 {
2604 	struct ceph_pg_mapping *pg;
2605 	int i;
2606 
2607 	ceph_osds_init(temp);
2608 
2609 	/* pg_temp? */
2610 	pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
2611 	if (pg) {
2612 		for (i = 0; i < pg->pg_temp.len; i++) {
2613 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
2614 				if (ceph_can_shift_osds(pi))
2615 					continue;
2616 
2617 				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
2618 			} else {
2619 				temp->osds[temp->size++] = pg->pg_temp.osds[i];
2620 			}
2621 		}
2622 
2623 		/* apply pg_temp's primary */
2624 		for (i = 0; i < temp->size; i++) {
2625 			if (temp->osds[i] != CRUSH_ITEM_NONE) {
2626 				temp->primary = temp->osds[i];
2627 				break;
2628 			}
2629 		}
2630 	}
2631 
2632 	/* primary_temp? */
2633 	pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
2634 	if (pg)
2635 		temp->primary = pg->primary_temp.osd;
2636 }
2637 
2638 /*
2639  * Map a PG to its acting set as well as its up set.
2640  *
2641  * Acting set is used for data mapping purposes, while up set can be
2642  * recorded for detecting interval changes and deciding whether to
2643  * resend a request.
2644  */
2645 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
2646 			       struct ceph_pg_pool_info *pi,
2647 			       const struct ceph_pg *raw_pgid,
2648 			       struct ceph_osds *up,
2649 			       struct ceph_osds *acting)
2650 {
2651 	struct ceph_pg pgid;
2652 	u32 pps;
2653 
2654 	WARN_ON(pi->id != raw_pgid->pool);
2655 	raw_pg_to_pg(pi, raw_pgid, &pgid);
2656 
2657 	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
2658 	apply_upmap(osdmap, &pgid, up);
2659 	raw_to_up_osds(osdmap, pi, up);
2660 	apply_primary_affinity(osdmap, pi, pps, up);
2661 	get_temp_osds(osdmap, pi, &pgid, acting);
2662 	if (!acting->size) {
2663 		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
2664 		acting->size = up->size;
2665 		if (acting->primary == -1)
2666 			acting->primary = up->primary;
2667 	}
2668 	WARN_ON(!osds_valid(up) || !osds_valid(acting));
2669 }
2670 
2671 bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
2672 			      struct ceph_pg_pool_info *pi,
2673 			      const struct ceph_pg *raw_pgid,
2674 			      struct ceph_spg *spgid)
2675 {
2676 	struct ceph_pg pgid;
2677 	struct ceph_osds up, acting;
2678 	int i;
2679 
2680 	WARN_ON(pi->id != raw_pgid->pool);
2681 	raw_pg_to_pg(pi, raw_pgid, &pgid);
2682 
2683 	if (ceph_can_shift_osds(pi)) {
2684 		spgid->pgid = pgid; /* struct */
2685 		spgid->shard = CEPH_SPG_NOSHARD;
2686 		return true;
2687 	}
2688 
2689 	ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
2690 	for (i = 0; i < acting.size; i++) {
2691 		if (acting.osds[i] == acting.primary) {
2692 			spgid->pgid = pgid; /* struct */
2693 			spgid->shard = i;
2694 			return true;
2695 		}
2696 	}
2697 
2698 	return false;
2699 }
2700 
2701 /*
2702  * Return acting primary for given PG, or -1 if none.
2703  */
2704 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
2705 			      const struct ceph_pg *raw_pgid)
2706 {
2707 	struct ceph_pg_pool_info *pi;
2708 	struct ceph_osds up, acting;
2709 
2710 	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
2711 	if (!pi)
2712 		return -1;
2713 
2714 	ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
2715 	return acting.primary;
2716 }
2717 EXPORT_SYMBOL(ceph_pg_to_acting_primary);
2718 
2719 static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
2720 					      size_t name_len)
2721 {
2722 	struct crush_loc_node *loc;
2723 
2724 	loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
2725 	if (!loc)
2726 		return NULL;
2727 
2728 	RB_CLEAR_NODE(&loc->cl_node);
2729 	return loc;
2730 }
2731 
2732 static void free_crush_loc(struct crush_loc_node *loc)
2733 {
2734 	WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
2735 
2736 	kfree(loc);
2737 }
2738 
2739 static int crush_loc_compare(const struct crush_loc *loc1,
2740 			     const struct crush_loc *loc2)
2741 {
2742 	return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
2743 	       strcmp(loc1->cl_name, loc2->cl_name);
2744 }
2745 
2746 DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
2747 		 RB_BYPTR, const struct crush_loc *, cl_node)
2748 
2749 /*
2750  * Parses a set of <bucket type name>':'<bucket name> pairs separated
2751  * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar".
2752  *
2753  * Note that @crush_location is modified by strsep().
2754  */
2755 int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
2756 {
2757 	struct crush_loc_node *loc;
2758 	const char *type_name, *name, *colon;
2759 	size_t type_name_len, name_len;
2760 
2761 	dout("%s '%s'\n", __func__, crush_location);
2762 	while ((type_name = strsep(&crush_location, "|"))) {
2763 		colon = strchr(type_name, ':');
2764 		if (!colon)
2765 			return -EINVAL;
2766 
2767 		type_name_len = colon - type_name;
2768 		if (type_name_len == 0)
2769 			return -EINVAL;
2770 
2771 		name = colon + 1;
2772 		name_len = strlen(name);
2773 		if (name_len == 0)
2774 			return -EINVAL;
2775 
2776 		loc = alloc_crush_loc(type_name_len, name_len);
2777 		if (!loc)
2778 			return -ENOMEM;
2779 
2780 		loc->cl_loc.cl_type_name = loc->cl_data;
2781 		memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
2782 		loc->cl_loc.cl_type_name[type_name_len] = '\0';
2783 
2784 		loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
2785 		memcpy(loc->cl_loc.cl_name, name, name_len);
2786 		loc->cl_loc.cl_name[name_len] = '\0';
2787 
2788 		if (!__insert_crush_loc(locs, loc)) {
2789 			free_crush_loc(loc);
2790 			return -EEXIST;
2791 		}
2792 
2793 		dout("%s type_name '%s' name '%s'\n", __func__,
2794 		     loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
2795 	}
2796 
2797 	return 0;
2798 }
2799 
2800 int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
2801 {
2802 	struct rb_node *n1 = rb_first(locs1);
2803 	struct rb_node *n2 = rb_first(locs2);
2804 	int ret;
2805 
2806 	for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
2807 		struct crush_loc_node *loc1 =
2808 		    rb_entry(n1, struct crush_loc_node, cl_node);
2809 		struct crush_loc_node *loc2 =
2810 		    rb_entry(n2, struct crush_loc_node, cl_node);
2811 
2812 		ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
2813 		if (ret)
2814 			return ret;
2815 	}
2816 
2817 	if (!n1 && n2)
2818 		return -1;
2819 	if (n1 && !n2)
2820 		return 1;
2821 	return 0;
2822 }
2823 
2824 void ceph_clear_crush_locs(struct rb_root *locs)
2825 {
2826 	while (!RB_EMPTY_ROOT(locs)) {
2827 		struct crush_loc_node *loc =
2828 		    rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
2829 
2830 		erase_crush_loc(locs, loc);
2831 		free_crush_loc(loc);
2832 	}
2833 }
2834 
/*
 * Does @name match [a-zA-Z0-9-_.]+ ?
 *
 * The first character is checked unconditionally, so an empty string
 * is not a valid name.
 */
static bool is_valid_crush_name(const char *name)
{
	const char *p = name;

	do {
		char c = *p++;
		bool ok = (c >= 'a' && c <= 'z') ||
			  (c >= 'A' && c <= 'Z') ||
			  (c >= '0' && c <= '9') ||
			  c == '-' || c == '_' || c == '.';

		if (!ok)
			return false;
	} while (*p != '\0');

	return true;
}
2850 
2851 /*
2852  * Gets the parent of an item.  Returns its id (<0 because the
2853  * parent is always a bucket), type id (>0 for the same reason,
2854  * via @parent_type_id) and location (via @parent_loc).  If no
2855  * parent, returns 0.
2856  *
2857  * Does a linear search, as there are no parent pointers of any
2858  * kind.  Note that the result is ambigous for items that occur
2859  * multiple times in the map.
2860  */
2861 static int get_immediate_parent(struct crush_map *c, int id,
2862 				u16 *parent_type_id,
2863 				struct crush_loc *parent_loc)
2864 {
2865 	struct crush_bucket *b;
2866 	struct crush_name_node *type_cn, *cn;
2867 	int i, j;
2868 
2869 	for (i = 0; i < c->max_buckets; i++) {
2870 		b = c->buckets[i];
2871 		if (!b)
2872 			continue;
2873 
2874 		/* ignore per-class shadow hierarchy */
2875 		cn = lookup_crush_name(&c->names, b->id);
2876 		if (!cn || !is_valid_crush_name(cn->cn_name))
2877 			continue;
2878 
2879 		for (j = 0; j < b->size; j++) {
2880 			if (b->items[j] != id)
2881 				continue;
2882 
2883 			*parent_type_id = b->type;
2884 			type_cn = lookup_crush_name(&c->type_names, b->type);
2885 			parent_loc->cl_type_name = type_cn->cn_name;
2886 			parent_loc->cl_name = cn->cn_name;
2887 			return b->id;
2888 		}
2889 	}
2890 
2891 	return 0;  /* no parent */
2892 }
2893 
2894 /*
2895  * Calculates the locality/distance from an item to a client
2896  * location expressed in terms of CRUSH hierarchy as a set of
2897  * (bucket type name, bucket name) pairs.  Specifically, looks
2898  * for the lowest-valued bucket type for which the location of
2899  * @id matches one of the locations in @locs, so for standard
2900  * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
2901  * a matching host is closer than a matching rack and a matching
2902  * data center is closer than a matching zone.
2903  *
2904  * Specifying multiple locations (a "multipath" location) such
2905  * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs
2906  * is a multimap.  The locality will be:
2907  *
2908  * - 3 for OSDs in racks foo1 and foo2
2909  * - 8 for OSDs in data center bar
2910  * - -1 for all other OSDs
2911  *
2912  * The lowest possible bucket type is 1, so the best locality
2913  * for an OSD is 1 (i.e. a matching host).  Locality 0 would be
2914  * the OSD itself.
2915  */
2916 int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
2917 			    struct rb_root *locs)
2918 {
2919 	struct crush_loc loc;
2920 	u16 type_id;
2921 
2922 	/*
2923 	 * Instead of repeated get_immediate_parent() calls,
2924 	 * the location of @id could be obtained with a single
2925 	 * depth-first traversal.
2926 	 */
2927 	for (;;) {
2928 		id = get_immediate_parent(osdmap->crush, id, &type_id, &loc);
2929 		if (id >= 0)
2930 			return -1;  /* not local */
2931 
2932 		if (lookup_crush_loc(locs, &loc))
2933 			return type_id;
2934 	}
2935 }
2936