1 2 #include <linux/ceph/ceph_debug.h> 3 4 #include <linux/module.h> 5 #include <linux/slab.h> 6 #include <asm/div64.h> 7 8 #include <linux/ceph/libceph.h> 9 #include <linux/ceph/osdmap.h> 10 #include <linux/ceph/decode.h> 11 #include <linux/crush/hash.h> 12 #include <linux/crush/mapper.h> 13 14 char *ceph_osdmap_state_str(char *str, int len, int state) 15 { 16 if (!len) 17 return str; 18 19 if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) 20 snprintf(str, len, "exists, up"); 21 else if (state & CEPH_OSD_EXISTS) 22 snprintf(str, len, "exists"); 23 else if (state & CEPH_OSD_UP) 24 snprintf(str, len, "up"); 25 else 26 snprintf(str, len, "doesn't exist"); 27 28 return str; 29 } 30 31 /* maps */ 32 33 static int calc_bits_of(unsigned int t) 34 { 35 int b = 0; 36 while (t) { 37 t = t >> 1; 38 b++; 39 } 40 return b; 41 } 42 43 /* 44 * the foo_mask is the smallest value 2^n-1 that is >= foo. 45 */ 46 static void calc_pg_masks(struct ceph_pg_pool_info *pi) 47 { 48 pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; 49 pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; 50 } 51 52 /* 53 * decode crush map 54 */ 55 static int crush_decode_uniform_bucket(void **p, void *end, 56 struct crush_bucket_uniform *b) 57 { 58 dout("crush_decode_uniform_bucket %p to %p\n", *p, end); 59 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); 60 b->item_weight = ceph_decode_32(p); 61 return 0; 62 bad: 63 return -EINVAL; 64 } 65 66 static int crush_decode_list_bucket(void **p, void *end, 67 struct crush_bucket_list *b) 68 { 69 int j; 70 dout("crush_decode_list_bucket %p to %p\n", *p, end); 71 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 72 if (b->item_weights == NULL) 73 return -ENOMEM; 74 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 75 if (b->sum_weights == NULL) 76 return -ENOMEM; 77 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); 78 for (j = 0; j < b->h.size; j++) { 79 b->item_weights[j] = ceph_decode_32(p); 80 b->sum_weights[j] = ceph_decode_32(p); 81 } 82 return 0; 83 bad: 84 return -EINVAL; 85 } 86 87 static int crush_decode_tree_bucket(void **p, void *end, 88 struct crush_bucket_tree *b) 89 { 90 int j; 91 dout("crush_decode_tree_bucket %p to %p\n", *p, end); 92 ceph_decode_32_safe(p, end, b->num_nodes, bad); 93 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); 94 if (b->node_weights == NULL) 95 return -ENOMEM; 96 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); 97 for (j = 0; j < b->num_nodes; j++) 98 b->node_weights[j] = ceph_decode_32(p); 99 return 0; 100 bad: 101 return -EINVAL; 102 } 103 104 static int crush_decode_straw_bucket(void **p, void *end, 105 struct crush_bucket_straw *b) 106 { 107 int j; 108 dout("crush_decode_straw_bucket %p to %p\n", *p, end); 109 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 110 if (b->item_weights == NULL) 111 return -ENOMEM; 112 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 113 if (b->straws == NULL) 114 return -ENOMEM; 115 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); 116 for (j = 0; j < b->h.size; j++) { 117 b->item_weights[j] = ceph_decode_32(p); 118 b->straws[j] = ceph_decode_32(p); 119 } 120 return 0; 121 bad: 122 return -EINVAL; 123 } 124 125 static int skip_name_map(void **p, void *end) 126 { 127 int len; 128 ceph_decode_32_safe(p, end, len ,bad); 129 while (len--) { 130 int strlen; 131 *p += sizeof(u32); 132 ceph_decode_32_safe(p, end, strlen, bad); 133 *p += strlen; 134 } 135 return 0; 136 bad: 137 return -EINVAL; 138 } 139 140 static struct crush_map *crush_decode(void *pbyval, void *end) 141 { 142 struct crush_map *c; 143 int err = -EINVAL; 144 int i, j; 145 void **p = &pbyval; 146 void *start = pbyval; 147 u32 magic; 148 u32 num_name_maps; 149 150 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 151 152 c = kzalloc(sizeof(*c), GFP_NOFS); 153 if (c == NULL) 154 return ERR_PTR(-ENOMEM); 155 156 /* set tunables to default values */ 157 c->choose_local_tries = 2; 158 c->choose_local_fallback_tries = 5; 159 c->choose_total_tries = 19; 160 c->chooseleaf_descend_once = 0; 161 162 ceph_decode_need(p, end, 4*sizeof(u32), bad); 163 magic = ceph_decode_32(p); 164 if (magic != CRUSH_MAGIC) { 165 pr_err("crush_decode magic %x != current %x\n", 166 (unsigned int)magic, (unsigned int)CRUSH_MAGIC); 167 goto bad; 168 } 169 c->max_buckets = ceph_decode_32(p); 170 c->max_rules = ceph_decode_32(p); 171 c->max_devices = ceph_decode_32(p); 172 173 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); 174 if (c->buckets == NULL) 175 goto badmem; 176 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); 177 if (c->rules == NULL) 178 goto badmem; 179 180 /* buckets */ 181 for (i = 0; i < c->max_buckets; i++) { 182 int size = 0; 183 u32 alg; 184 struct crush_bucket *b; 185 186 ceph_decode_32_safe(p, end, alg, bad); 187 if (alg == 0) { 188 c->buckets[i] = NULL; 189 continue; 190 } 191 dout("crush_decode bucket %d off %x %p to %p\n", 192 i, (int)(*p-start), *p, end); 193 194 switch (alg) { 195 case CRUSH_BUCKET_UNIFORM: 196 size = sizeof(struct crush_bucket_uniform); 197 break; 198 case CRUSH_BUCKET_LIST: 199 size = sizeof(struct crush_bucket_list); 200 break; 201 case CRUSH_BUCKET_TREE: 202 size = sizeof(struct crush_bucket_tree); 203 break; 204 case CRUSH_BUCKET_STRAW: 205 size = sizeof(struct crush_bucket_straw); 206 break; 207 default: 208 err = -EINVAL; 209 goto bad; 210 } 211 BUG_ON(size == 0); 212 b = c->buckets[i] = kzalloc(size, GFP_NOFS); 213 if (b == NULL) 214 goto badmem; 215 216 ceph_decode_need(p, end, 4*sizeof(u32), bad); 217 b->id = ceph_decode_32(p); 218 b->type = ceph_decode_16(p); 219 b->alg = ceph_decode_8(p); 220 b->hash = ceph_decode_8(p); 221 b->weight = ceph_decode_32(p); 222 b->size = ceph_decode_32(p); 223 224 dout("crush_decode bucket size %d off %x %p to %p\n", 225 b->size, (int)(*p-start), *p, end); 226 227 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); 228 if (b->items == NULL) 229 goto badmem; 230 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); 231 if (b->perm == NULL) 232 goto badmem; 233 b->perm_n = 0; 234 235 ceph_decode_need(p, end, b->size*sizeof(u32), bad); 236 for (j = 0; j < b->size; j++) 237 b->items[j] = ceph_decode_32(p); 238 239 switch (b->alg) { 240 case CRUSH_BUCKET_UNIFORM: 241 err = crush_decode_uniform_bucket(p, end, 242 (struct crush_bucket_uniform *)b); 243 if (err < 0) 244 goto bad; 245 break; 246 case CRUSH_BUCKET_LIST: 247 err = crush_decode_list_bucket(p, end, 248 (struct crush_bucket_list *)b); 249 if (err < 0) 250 goto bad; 251 break; 252 case CRUSH_BUCKET_TREE: 253 err = crush_decode_tree_bucket(p, end, 254 (struct crush_bucket_tree *)b); 255 if (err < 0) 256 goto bad; 257 break; 258 case CRUSH_BUCKET_STRAW: 259 err = crush_decode_straw_bucket(p, end, 260 (struct crush_bucket_straw *)b); 261 if (err < 0) 262 goto bad; 263 break; 264 } 265 } 266 267 /* rules */ 268 dout("rule vec is %p\n", c->rules); 269 for (i = 0; i < c->max_rules; i++) { 270 u32 yes; 271 struct crush_rule *r; 272 273 ceph_decode_32_safe(p, end, yes, bad); 274 if (!yes) { 275 dout("crush_decode NO rule %d off %x %p to %p\n", 276 i, (int)(*p-start), *p, end); 277 c->rules[i] = NULL; 278 continue; 279 } 280 281 dout("crush_decode rule %d off %x %p to %p\n", 282 i, (int)(*p-start), *p, end); 283 284 /* len */ 285 ceph_decode_32_safe(p, end, yes, bad); 286 #if BITS_PER_LONG == 32 287 err = -EINVAL; 288 if (yes > (ULONG_MAX - sizeof(*r)) 289 / sizeof(struct crush_rule_step)) 290 goto bad; 291 #endif 292 r = c->rules[i] = kmalloc(sizeof(*r) + 293 yes*sizeof(struct crush_rule_step), 294 GFP_NOFS); 295 if (r == NULL) 296 goto badmem; 297 dout(" rule %d is at %p\n", i, r); 298 r->len = yes; 299 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ 300 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); 301 for (j = 0; j < r->len; j++) { 302 r->steps[j].op = ceph_decode_32(p); 303 r->steps[j].arg1 = ceph_decode_32(p); 304 r->steps[j].arg2 = ceph_decode_32(p); 305 } 306 } 307 308 /* ignore trailing name maps. */ 309 for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { 310 err = skip_name_map(p, end); 311 if (err < 0) 312 goto done; 313 } 314 315 /* tunables */ 316 ceph_decode_need(p, end, 3*sizeof(u32), done); 317 c->choose_local_tries = ceph_decode_32(p); 318 c->choose_local_fallback_tries = ceph_decode_32(p); 319 c->choose_total_tries = ceph_decode_32(p); 320 dout("crush decode tunable choose_local_tries = %d", 321 c->choose_local_tries); 322 dout("crush decode tunable choose_local_fallback_tries = %d", 323 c->choose_local_fallback_tries); 324 dout("crush decode tunable choose_total_tries = %d", 325 c->choose_total_tries); 326 327 ceph_decode_need(p, end, sizeof(u32), done); 328 c->chooseleaf_descend_once = ceph_decode_32(p); 329 dout("crush decode tunable chooseleaf_descend_once = %d", 330 c->chooseleaf_descend_once); 331 332 ceph_decode_need(p, end, sizeof(u8), done); 333 c->chooseleaf_vary_r = ceph_decode_8(p); 334 dout("crush decode tunable chooseleaf_vary_r = %d", 335 c->chooseleaf_vary_r); 336 337 done: 338 dout("crush_decode success\n"); 339 return c; 340 341 badmem: 342 err = -ENOMEM; 343 bad: 344 dout("crush_decode fail %d\n", err); 345 crush_destroy(c); 346 return ERR_PTR(err); 347 } 348 349 /* 350 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 351 * to a set of osds) and primary_temp (explicit primary setting) 352 */ 353 static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 354 { 355 if (l.pool < r.pool) 356 return -1; 357 if (l.pool > r.pool) 358 return 1; 359 if (l.seed < r.seed) 360 return -1; 361 if (l.seed > r.seed) 362 return 1; 363 return 0; 364 } 365 366 static int __insert_pg_mapping(struct ceph_pg_mapping *new, 367 struct rb_root *root) 368 { 369 struct rb_node **p = &root->rb_node; 370 struct rb_node *parent = NULL; 371 struct ceph_pg_mapping *pg = NULL; 372 int c; 373 374 dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new); 375 while (*p) { 376 parent = *p; 377 pg = rb_entry(parent, struct ceph_pg_mapping, node); 378 c = pgid_cmp(new->pgid, pg->pgid); 379 if (c < 0) 380 p = &(*p)->rb_left; 381 else if (c > 0) 382 p = &(*p)->rb_right; 383 else 384 return -EEXIST; 385 } 386 387 rb_link_node(&new->node, parent, p); 388 rb_insert_color(&new->node, root); 389 return 0; 390 } 391 392 static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, 393 struct ceph_pg pgid) 394 { 395 struct rb_node *n = root->rb_node; 396 struct ceph_pg_mapping *pg; 397 int c; 398 399 while (n) { 400 pg = rb_entry(n, struct ceph_pg_mapping, node); 401 c = pgid_cmp(pgid, pg->pgid); 402 if (c < 0) { 403 n = n->rb_left; 404 } else if (c > 0) { 405 n = n->rb_right; 406 } else { 407 dout("__lookup_pg_mapping %lld.%x got %p\n", 408 pgid.pool, pgid.seed, pg); 409 return pg; 410 } 411 } 412 return NULL; 413 } 414 415 static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) 416 { 417 struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); 418 419 if (pg) { 420 dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, 421 pg); 422 rb_erase(&pg->node, root); 423 kfree(pg); 424 return 0; 425 } 426 dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); 427 return -ENOENT; 428 } 429 430 /* 431 * rbtree of pg pool info 432 */ 433 static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) 434 { 435 struct rb_node **p = &root->rb_node; 436 struct rb_node *parent = NULL; 437 struct ceph_pg_pool_info *pi = NULL; 438 439 while (*p) { 440 parent = *p; 441 pi = rb_entry(parent, struct ceph_pg_pool_info, node); 442 if (new->id < pi->id) 443 p = &(*p)->rb_left; 444 else if (new->id > pi->id) 445 p = &(*p)->rb_right; 446 else 447 return -EEXIST; 448 } 449 450 rb_link_node(&new->node, parent, p); 451 rb_insert_color(&new->node, root); 452 return 0; 453 } 454 455 static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) 456 { 457 struct ceph_pg_pool_info *pi; 458 struct rb_node *n = root->rb_node; 459 460 while (n) { 461 pi = rb_entry(n, struct ceph_pg_pool_info, node); 462 if (id < pi->id) 463 n = n->rb_left; 464 else if (id > pi->id) 465 n = n->rb_right; 466 else 467 return pi; 468 } 469 return NULL; 470 } 471 472 struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) 473 { 474 return __lookup_pg_pool(&map->pg_pools, id); 475 } 476 477 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) 478 { 479 struct ceph_pg_pool_info *pi; 480 481 if (id == CEPH_NOPOOL) 482 return NULL; 483 484 if (WARN_ON_ONCE(id > (u64) INT_MAX)) 485 return NULL; 486 487 pi = __lookup_pg_pool(&map->pg_pools, (int) id); 488 489 return pi ? pi->name : NULL; 490 } 491 EXPORT_SYMBOL(ceph_pg_pool_name_by_id); 492 493 int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) 494 { 495 struct rb_node *rbp; 496 497 for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { 498 struct ceph_pg_pool_info *pi = 499 rb_entry(rbp, struct ceph_pg_pool_info, node); 500 if (pi->name && strcmp(pi->name, name) == 0) 501 return pi->id; 502 } 503 return -ENOENT; 504 } 505 EXPORT_SYMBOL(ceph_pg_poolid_by_name); 506 507 static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) 508 { 509 rb_erase(&pi->node, root); 510 kfree(pi->name); 511 kfree(pi); 512 } 513 514 static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 515 { 516 u8 ev, cv; 517 unsigned len, num; 518 void *pool_end; 519 520 ceph_decode_need(p, end, 2 + 4, bad); 521 ev = ceph_decode_8(p); /* encoding version */ 522 cv = ceph_decode_8(p); /* compat version */ 523 if (ev < 5) { 524 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); 525 return -EINVAL; 526 } 527 if (cv > 9) { 528 pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); 529 return -EINVAL; 530 } 531 len = ceph_decode_32(p); 532 ceph_decode_need(p, end, len, bad); 533 pool_end = *p + len; 534 535 pi->type = ceph_decode_8(p); 536 pi->size = ceph_decode_8(p); 537 pi->crush_ruleset = ceph_decode_8(p); 538 pi->object_hash = ceph_decode_8(p); 539 540 pi->pg_num = ceph_decode_32(p); 541 pi->pgp_num = ceph_decode_32(p); 542 543 *p += 4 + 4; /* skip lpg* */ 544 *p += 4; /* skip last_change */ 545 *p += 8 + 4; /* skip snap_seq, snap_epoch */ 546 547 /* skip snaps */ 548 num = ceph_decode_32(p); 549 while (num--) { 550 *p += 8; /* snapid key */ 551 *p += 1 + 1; /* versions */ 552 len = ceph_decode_32(p); 553 *p += len; 554 } 555 556 /* skip removed_snaps */ 557 num = ceph_decode_32(p); 558 *p += num * (8 + 8); 559 560 *p += 8; /* skip auid */ 561 pi->flags = ceph_decode_64(p); 562 *p += 4; /* skip crash_replay_interval */ 563 564 if (ev >= 7) 565 *p += 1; /* skip min_size */ 566 567 if (ev >= 8) 568 *p += 8 + 8; /* skip quota_max_* */ 569 570 if (ev >= 9) { 571 /* skip tiers */ 572 num = ceph_decode_32(p); 573 *p += num * 8; 574 575 *p += 8; /* skip tier_of */ 576 *p += 1; /* skip cache_mode */ 577 578 pi->read_tier = ceph_decode_64(p); 579 pi->write_tier = ceph_decode_64(p); 580 } else { 581 pi->read_tier = -1; 582 pi->write_tier = -1; 583 } 584 585 /* ignore the rest */ 586 587 *p = pool_end; 588 calc_pg_masks(pi); 589 return 0; 590 591 bad: 592 return -EINVAL; 593 } 594 595 static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 596 { 597 struct ceph_pg_pool_info *pi; 598 u32 num, len; 599 u64 pool; 600 601 ceph_decode_32_safe(p, end, num, bad); 602 dout(" %d pool names\n", num); 603 while (num--) { 604 ceph_decode_64_safe(p, end, pool, bad); 605 ceph_decode_32_safe(p, end, len, bad); 606 dout(" pool %llu len %d\n", pool, len); 607 ceph_decode_need(p, end, len, bad); 608 pi = __lookup_pg_pool(&map->pg_pools, pool); 609 if (pi) { 610 char *name = kstrndup(*p, len, GFP_NOFS); 611 612 if (!name) 613 return -ENOMEM; 614 kfree(pi->name); 615 pi->name = name; 616 dout(" name is %s\n", pi->name); 617 } 618 *p += len; 619 } 620 return 0; 621 622 bad: 623 return -EINVAL; 624 } 625 626 /* 627 * osd map 628 */ 629 void ceph_osdmap_destroy(struct ceph_osdmap *map) 630 { 631 dout("osdmap_destroy %p\n", map); 632 if (map->crush) 633 crush_destroy(map->crush); 634 while (!RB_EMPTY_ROOT(&map->pg_temp)) { 635 struct ceph_pg_mapping *pg = 636 rb_entry(rb_first(&map->pg_temp), 637 struct ceph_pg_mapping, node); 638 rb_erase(&pg->node, &map->pg_temp); 639 kfree(pg); 640 } 641 while (!RB_EMPTY_ROOT(&map->primary_temp)) { 642 struct ceph_pg_mapping *pg = 643 rb_entry(rb_first(&map->primary_temp), 644 struct ceph_pg_mapping, node); 645 rb_erase(&pg->node, &map->primary_temp); 646 kfree(pg); 647 } 648 while (!RB_EMPTY_ROOT(&map->pg_pools)) { 649 struct ceph_pg_pool_info *pi = 650 rb_entry(rb_first(&map->pg_pools), 651 struct ceph_pg_pool_info, node); 652 __remove_pg_pool(&map->pg_pools, pi); 653 } 654 kfree(map->osd_state); 655 kfree(map->osd_weight); 656 kfree(map->osd_addr); 657 kfree(map->osd_primary_affinity); 658 kfree(map); 659 } 660 661 /* 662 * Adjust max_osd value, (re)allocate arrays. 663 * 664 * The new elements are properly initialized. 665 */ 666 static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 667 { 668 u8 *state; 669 u32 *weight; 670 struct ceph_entity_addr *addr; 671 int i; 672 673 state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); 674 weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); 675 addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); 676 if (!state || !weight || !addr) { 677 kfree(state); 678 kfree(weight); 679 kfree(addr); 680 681 return -ENOMEM; 682 } 683 684 for (i = map->max_osd; i < max; i++) { 685 state[i] = 0; 686 weight[i] = CEPH_OSD_OUT; 687 memset(addr + i, 0, sizeof(*addr)); 688 } 689 690 map->osd_state = state; 691 map->osd_weight = weight; 692 map->osd_addr = addr; 693 694 if (map->osd_primary_affinity) { 695 u32 *affinity; 696 697 affinity = krealloc(map->osd_primary_affinity, 698 max*sizeof(*affinity), GFP_NOFS); 699 if (!affinity) 700 return -ENOMEM; 701 702 for (i = map->max_osd; i < max; i++) 703 affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 704 705 map->osd_primary_affinity = affinity; 706 } 707 708 map->max_osd = max; 709 710 return 0; 711 } 712 713 #define OSDMAP_WRAPPER_COMPAT_VER 7 714 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 715 716 /* 717 * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, 718 * to struct_v of the client_data section for new (v7 and above) 719 * osdmaps. 720 */ 721 static int get_osdmap_client_data_v(void **p, void *end, 722 const char *prefix, u8 *v) 723 { 724 u8 struct_v; 725 726 ceph_decode_8_safe(p, end, struct_v, e_inval); 727 if (struct_v >= 7) { 728 u8 struct_compat; 729 730 ceph_decode_8_safe(p, end, struct_compat, e_inval); 731 if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { 732 pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", 733 struct_v, struct_compat, 734 OSDMAP_WRAPPER_COMPAT_VER, prefix); 735 return -EINVAL; 736 } 737 *p += 4; /* ignore wrapper struct_len */ 738 739 ceph_decode_8_safe(p, end, struct_v, e_inval); 740 ceph_decode_8_safe(p, end, struct_compat, e_inval); 741 if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { 742 pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", 743 struct_v, struct_compat, 744 OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); 745 return -EINVAL; 746 } 747 *p += 4; /* ignore client data struct_len */ 748 } else { 749 u16 version; 750 751 *p -= 1; 752 ceph_decode_16_safe(p, end, version, e_inval); 753 if (version < 6) { 754 pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, 755 prefix); 756 return -EINVAL; 757 } 758 759 /* old osdmap enconding */ 760 struct_v = 0; 761 } 762 763 *v = struct_v; 764 return 0; 765 766 e_inval: 767 return -EINVAL; 768 } 769 770 static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, 771 bool incremental) 772 { 773 u32 n; 774 775 ceph_decode_32_safe(p, end, n, e_inval); 776 while (n--) { 777 struct ceph_pg_pool_info *pi; 778 u64 pool; 779 int ret; 780 781 ceph_decode_64_safe(p, end, pool, e_inval); 782 783 pi = __lookup_pg_pool(&map->pg_pools, pool); 784 if (!incremental || !pi) { 785 pi = kzalloc(sizeof(*pi), GFP_NOFS); 786 if (!pi) 787 return -ENOMEM; 788 789 pi->id = pool; 790 791 ret = __insert_pg_pool(&map->pg_pools, pi); 792 if (ret) { 793 kfree(pi); 794 return ret; 795 } 796 } 797 798 ret = decode_pool(p, end, pi); 799 if (ret) 800 return ret; 801 } 802 803 return 0; 804 805 e_inval: 806 return -EINVAL; 807 } 808 809 static int decode_pools(void **p, void *end, struct ceph_osdmap *map) 810 { 811 return __decode_pools(p, end, map, false); 812 } 813 814 static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) 815 { 816 return __decode_pools(p, end, map, true); 817 } 818 819 static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, 820 bool incremental) 821 { 822 u32 n; 823 824 ceph_decode_32_safe(p, end, n, e_inval); 825 while (n--) { 826 struct ceph_pg pgid; 827 u32 len, i; 828 int ret; 829 830 ret = ceph_decode_pgid(p, end, &pgid); 831 if (ret) 832 return ret; 833 834 ceph_decode_32_safe(p, end, len, e_inval); 835 836 ret = __remove_pg_mapping(&map->pg_temp, pgid); 837 BUG_ON(!incremental && ret != -ENOENT); 838 839 if (!incremental || len > 0) { 840 struct ceph_pg_mapping *pg; 841 842 ceph_decode_need(p, end, len*sizeof(u32), e_inval); 843 844 if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 845 return -EINVAL; 846 847 pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); 848 if (!pg) 849 return -ENOMEM; 850 851 pg->pgid = pgid; 852 pg->pg_temp.len = len; 853 for (i = 0; i < len; i++) 854 pg->pg_temp.osds[i] = ceph_decode_32(p); 855 856 ret = __insert_pg_mapping(pg, &map->pg_temp); 857 if (ret) { 858 kfree(pg); 859 return ret; 860 } 861 } 862 } 863 864 return 0; 865 866 e_inval: 867 return -EINVAL; 868 } 869 870 static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) 871 { 872 return __decode_pg_temp(p, end, map, false); 873 } 874 875 static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) 876 { 877 return __decode_pg_temp(p, end, map, true); 878 } 879 880 static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, 881 bool incremental) 882 { 883 u32 n; 884 885 ceph_decode_32_safe(p, end, n, e_inval); 886 while (n--) { 887 struct ceph_pg pgid; 888 u32 osd; 889 int ret; 890 891 ret = ceph_decode_pgid(p, end, &pgid); 892 if (ret) 893 return ret; 894 895 ceph_decode_32_safe(p, end, osd, e_inval); 896 897 ret = __remove_pg_mapping(&map->primary_temp, pgid); 898 BUG_ON(!incremental && ret != -ENOENT); 899 900 if (!incremental || osd != (u32)-1) { 901 struct ceph_pg_mapping *pg; 902 903 pg = kzalloc(sizeof(*pg), GFP_NOFS); 904 if (!pg) 905 return -ENOMEM; 906 907 pg->pgid = pgid; 908 pg->primary_temp.osd = osd; 909 910 ret = __insert_pg_mapping(pg, &map->primary_temp); 911 if (ret) { 912 kfree(pg); 913 return ret; 914 } 915 } 916 } 917 918 return 0; 919 920 e_inval: 921 return -EINVAL; 922 } 923 924 static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) 925 { 926 return __decode_primary_temp(p, end, map, false); 927 } 928 929 static int decode_new_primary_temp(void **p, void *end, 930 struct ceph_osdmap *map) 931 { 932 return __decode_primary_temp(p, end, map, true); 933 } 934 935 u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) 936 { 937 BUG_ON(osd >= map->max_osd); 938 939 if (!map->osd_primary_affinity) 940 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 941 942 return map->osd_primary_affinity[osd]; 943 } 944 945 static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) 946 { 947 BUG_ON(osd >= map->max_osd); 948 949 if (!map->osd_primary_affinity) { 950 int i; 951 952 map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), 953 GFP_NOFS); 954 if (!map->osd_primary_affinity) 955 return -ENOMEM; 956 957 for (i = 0; i < map->max_osd; i++) 958 map->osd_primary_affinity[i] = 959 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 960 } 961 962 map->osd_primary_affinity[osd] = aff; 963 964 return 0; 965 } 966 967 static int decode_primary_affinity(void **p, void *end, 968 struct ceph_osdmap *map) 969 { 970 u32 len, i; 971 972 ceph_decode_32_safe(p, end, len, e_inval); 973 if (len == 0) { 974 kfree(map->osd_primary_affinity); 975 map->osd_primary_affinity = NULL; 976 return 0; 977 } 978 if (len != map->max_osd) 979 goto e_inval; 980 981 ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); 982 983 for (i = 0; i < map->max_osd; i++) { 984 int ret; 985 986 ret = set_primary_affinity(map, i, ceph_decode_32(p)); 987 if (ret) 988 return ret; 989 } 990 991 return 0; 992 993 e_inval: 994 return -EINVAL; 995 } 996 997 static int decode_new_primary_affinity(void **p, void *end, 998 struct ceph_osdmap *map) 999 { 1000 u32 n; 1001 1002 ceph_decode_32_safe(p, end, n, e_inval); 1003 while (n--) { 1004 u32 osd, aff; 1005 int ret; 1006 1007 ceph_decode_32_safe(p, end, osd, e_inval); 1008 ceph_decode_32_safe(p, end, aff, e_inval); 1009 1010 ret = set_primary_affinity(map, osd, aff); 1011 if (ret) 1012 return ret; 1013 1014 pr_info("osd%d primary-affinity 0x%x\n", osd, aff); 1015 } 1016 1017 return 0; 1018 1019 e_inval: 1020 return -EINVAL; 1021 } 1022 1023 /* 1024 * decode a full map. 1025 */ 1026 static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) 1027 { 1028 u8 struct_v; 1029 u32 epoch = 0; 1030 void *start = *p; 1031 u32 max; 1032 u32 len, i; 1033 int err; 1034 1035 dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); 1036 1037 err = get_osdmap_client_data_v(p, end, "full", &struct_v); 1038 if (err) 1039 goto bad; 1040 1041 /* fsid, epoch, created, modified */ 1042 ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + 1043 sizeof(map->created) + sizeof(map->modified), e_inval); 1044 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); 1045 epoch = map->epoch = ceph_decode_32(p); 1046 ceph_decode_copy(p, &map->created, sizeof(map->created)); 1047 ceph_decode_copy(p, &map->modified, sizeof(map->modified)); 1048 1049 /* pools */ 1050 err = decode_pools(p, end, map); 1051 if (err) 1052 goto bad; 1053 1054 /* pool_name */ 1055 err = decode_pool_names(p, end, map); 1056 if (err) 1057 goto bad; 1058 1059 ceph_decode_32_safe(p, end, map->pool_max, e_inval); 1060 1061 ceph_decode_32_safe(p, end, map->flags, e_inval); 1062 1063 /* max_osd */ 1064 ceph_decode_32_safe(p, end, max, e_inval); 1065 1066 /* (re)alloc osd arrays */ 1067 err = osdmap_set_max_osd(map, max); 1068 if (err) 1069 goto bad; 1070 1071 /* osd_state, osd_weight, osd_addrs->client_addr */ 1072 ceph_decode_need(p, end, 3*sizeof(u32) + 1073 map->max_osd*(1 + sizeof(*map->osd_weight) + 1074 sizeof(*map->osd_addr)), e_inval); 1075 1076 if (ceph_decode_32(p) != map->max_osd) 1077 goto e_inval; 1078 1079 ceph_decode_copy(p, map->osd_state, map->max_osd); 1080 1081 if (ceph_decode_32(p) != map->max_osd) 1082 goto e_inval; 1083 1084 for (i = 0; i < map->max_osd; i++) 1085 map->osd_weight[i] = ceph_decode_32(p); 1086 1087 if (ceph_decode_32(p) != map->max_osd) 1088 goto e_inval; 1089 1090 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); 1091 for (i = 0; i < map->max_osd; i++) 1092 ceph_decode_addr(&map->osd_addr[i]); 1093 1094 /* pg_temp */ 1095 err = decode_pg_temp(p, end, map); 1096 if (err) 1097 goto bad; 1098 1099 /* primary_temp */ 1100 if (struct_v >= 1) { 1101 err = decode_primary_temp(p, end, map); 1102 if (err) 1103 goto bad; 1104 } 1105 1106 /* primary_affinity */ 1107 if (struct_v >= 2) { 1108 err = decode_primary_affinity(p, end, map); 1109 if (err) 1110 goto bad; 1111 } else { 1112 /* XXX can this happen? */ 1113 kfree(map->osd_primary_affinity); 1114 map->osd_primary_affinity = NULL; 1115 } 1116 1117 /* crush */ 1118 ceph_decode_32_safe(p, end, len, e_inval); 1119 map->crush = crush_decode(*p, min(*p + len, end)); 1120 if (IS_ERR(map->crush)) { 1121 err = PTR_ERR(map->crush); 1122 map->crush = NULL; 1123 goto bad; 1124 } 1125 *p += len; 1126 1127 /* ignore the rest */ 1128 *p = end; 1129 1130 dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); 1131 return 0; 1132 1133 e_inval: 1134 err = -EINVAL; 1135 bad: 1136 pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", 1137 err, epoch, (int)(*p - start), *p, start, end); 1138 print_hex_dump(KERN_DEBUG, "osdmap: ", 1139 DUMP_PREFIX_OFFSET, 16, 1, 1140 start, end - start, true); 1141 return err; 1142 } 1143 1144 /* 1145 * Allocate and decode a full map. 1146 */ 1147 struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) 1148 { 1149 struct ceph_osdmap *map; 1150 int ret; 1151 1152 map = kzalloc(sizeof(*map), GFP_NOFS); 1153 if (!map) 1154 return ERR_PTR(-ENOMEM); 1155 1156 map->pg_temp = RB_ROOT; 1157 map->primary_temp = RB_ROOT; 1158 mutex_init(&map->crush_scratch_mutex); 1159 1160 ret = osdmap_decode(p, end, map); 1161 if (ret) { 1162 ceph_osdmap_destroy(map); 1163 return ERR_PTR(ret); 1164 } 1165 1166 return map; 1167 } 1168 1169 /* 1170 * decode and apply an incremental map update. 1171 */ 1172 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1173 struct ceph_osdmap *map, 1174 struct ceph_messenger *msgr) 1175 { 1176 struct crush_map *newcrush = NULL; 1177 struct ceph_fsid fsid; 1178 u32 epoch = 0; 1179 struct ceph_timespec modified; 1180 s32 len; 1181 u64 pool; 1182 __s64 new_pool_max; 1183 __s32 new_flags, max; 1184 void *start = *p; 1185 int err; 1186 u8 struct_v; 1187 1188 dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); 1189 1190 err = get_osdmap_client_data_v(p, end, "inc", &struct_v); 1191 if (err) 1192 goto bad; 1193 1194 /* fsid, epoch, modified, new_pool_max, new_flags */ 1195 ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + 1196 sizeof(u64) + sizeof(u32), e_inval); 1197 ceph_decode_copy(p, &fsid, sizeof(fsid)); 1198 epoch = ceph_decode_32(p); 1199 BUG_ON(epoch != map->epoch+1); 1200 ceph_decode_copy(p, &modified, sizeof(modified)); 1201 new_pool_max = ceph_decode_64(p); 1202 new_flags = ceph_decode_32(p); 1203 1204 /* full map? */ 1205 ceph_decode_32_safe(p, end, len, e_inval); 1206 if (len > 0) { 1207 dout("apply_incremental full map len %d, %p to %p\n", 1208 len, *p, end); 1209 return ceph_osdmap_decode(p, min(*p+len, end)); 1210 } 1211 1212 /* new crush? */ 1213 ceph_decode_32_safe(p, end, len, e_inval); 1214 if (len > 0) { 1215 newcrush = crush_decode(*p, min(*p+len, end)); 1216 if (IS_ERR(newcrush)) { 1217 err = PTR_ERR(newcrush); 1218 newcrush = NULL; 1219 goto bad; 1220 } 1221 *p += len; 1222 } 1223 1224 /* new flags? */ 1225 if (new_flags >= 0) 1226 map->flags = new_flags; 1227 if (new_pool_max >= 0) 1228 map->pool_max = new_pool_max; 1229 1230 /* new max? */ 1231 ceph_decode_32_safe(p, end, max, e_inval); 1232 if (max >= 0) { 1233 err = osdmap_set_max_osd(map, max); 1234 if (err) 1235 goto bad; 1236 } 1237 1238 map->epoch++; 1239 map->modified = modified; 1240 if (newcrush) { 1241 if (map->crush) 1242 crush_destroy(map->crush); 1243 map->crush = newcrush; 1244 newcrush = NULL; 1245 } 1246 1247 /* new_pools */ 1248 err = decode_new_pools(p, end, map); 1249 if (err) 1250 goto bad; 1251 1252 /* new_pool_names */ 1253 err = decode_pool_names(p, end, map); 1254 if (err) 1255 goto bad; 1256 1257 /* old_pool */ 1258 ceph_decode_32_safe(p, end, len, e_inval); 1259 while (len--) { 1260 struct ceph_pg_pool_info *pi; 1261 1262 ceph_decode_64_safe(p, end, pool, e_inval); 1263 pi = __lookup_pg_pool(&map->pg_pools, pool); 1264 if (pi) 1265 __remove_pg_pool(&map->pg_pools, pi); 1266 } 1267 1268 /* new_up */ 1269 ceph_decode_32_safe(p, end, len, e_inval); 1270 while (len--) { 1271 u32 osd; 1272 struct ceph_entity_addr addr; 1273 ceph_decode_32_safe(p, end, osd, e_inval); 1274 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); 1275 ceph_decode_addr(&addr); 1276 pr_info("osd%d up\n", osd); 1277 BUG_ON(osd >= map->max_osd); 1278 map->osd_state[osd] |= CEPH_OSD_UP; 1279 map->osd_addr[osd] = addr; 1280 } 1281 1282 /* new_state */ 1283 ceph_decode_32_safe(p, end, len, e_inval); 1284 while (len--) { 1285 u32 osd; 1286 u8 xorstate; 1287 ceph_decode_32_safe(p, end, osd, e_inval); 1288 xorstate = **(u8 **)p; 1289 (*p)++; /* clean flag */ 1290 if (xorstate == 0) 1291 xorstate = CEPH_OSD_UP; 1292 if (xorstate & CEPH_OSD_UP) 1293 pr_info("osd%d down\n", osd); 1294 if (osd < map->max_osd) 1295 map->osd_state[osd] ^= xorstate; 1296 } 1297 1298 /* new_weight */ 1299 ceph_decode_32_safe(p, end, len, e_inval); 1300 while (len--) { 1301 u32 osd, off; 1302 ceph_decode_need(p, end, sizeof(u32)*2, e_inval); 1303 osd = ceph_decode_32(p); 1304 off = ceph_decode_32(p); 1305 pr_info("osd%d weight 0x%x %s\n", osd, off, 1306 off == CEPH_OSD_IN ? "(in)" : 1307 (off == CEPH_OSD_OUT ? "(out)" : "")); 1308 if (osd < map->max_osd) 1309 map->osd_weight[osd] = off; 1310 } 1311 1312 /* new_pg_temp */ 1313 err = decode_new_pg_temp(p, end, map); 1314 if (err) 1315 goto bad; 1316 1317 /* new_primary_temp */ 1318 if (struct_v >= 1) { 1319 err = decode_new_primary_temp(p, end, map); 1320 if (err) 1321 goto bad; 1322 } 1323 1324 /* new_primary_affinity */ 1325 if (struct_v >= 2) { 1326 err = decode_new_primary_affinity(p, end, map); 1327 if (err) 1328 goto bad; 1329 } 1330 1331 /* ignore the rest */ 1332 *p = end; 1333 1334 dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); 1335 return map; 1336 1337 e_inval: 1338 err = -EINVAL; 1339 bad: 1340 pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", 1341 err, epoch, (int)(*p - start), *p, start, end); 1342 print_hex_dump(KERN_DEBUG, "osdmap: ", 1343 DUMP_PREFIX_OFFSET, 16, 1, 1344 start, end - start, true); 1345 if (newcrush) 1346 crush_destroy(newcrush); 1347 return ERR_PTR(err); 1348 } 1349 1350 1351 1352 1353 /* 1354 * calculate file layout from given offset, length. 1355 * fill in correct oid, logical length, and object extent 1356 * offset, length. 1357 * 1358 * for now, we write only a single su, until we can 1359 * pass a stride back to the caller. 1360 */ 1361 int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 1362 u64 off, u64 len, 1363 u64 *ono, 1364 u64 *oxoff, u64 *oxlen) 1365 { 1366 u32 osize = le32_to_cpu(layout->fl_object_size); 1367 u32 su = le32_to_cpu(layout->fl_stripe_unit); 1368 u32 sc = le32_to_cpu(layout->fl_stripe_count); 1369 u32 bl, stripeno, stripepos, objsetno; 1370 u32 su_per_object; 1371 u64 t, su_offset; 1372 1373 dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, 1374 osize, su); 1375 if (su == 0 || sc == 0) 1376 goto invalid; 1377 su_per_object = osize / su; 1378 if (su_per_object == 0) 1379 goto invalid; 1380 dout("osize %u / su %u = su_per_object %u\n", osize, su, 1381 su_per_object); 1382 1383 if ((su & ~PAGE_MASK) != 0) 1384 goto invalid; 1385 1386 /* bl = *off / su; */ 1387 t = off; 1388 do_div(t, su); 1389 bl = t; 1390 dout("off %llu / su %u = bl %u\n", off, su, bl); 1391 1392 stripeno = bl / sc; 1393 stripepos = bl % sc; 1394 objsetno = stripeno / su_per_object; 1395 1396 *ono = objsetno * sc + stripepos; 1397 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono); 1398 1399 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ 1400 t = off; 1401 su_offset = do_div(t, su); 1402 *oxoff = su_offset + (stripeno % su_per_object) * su; 1403 1404 /* 1405 * Calculate the length of the extent being written to the selected 1406 * object. This is the minimum of the full length requested (len) or 1407 * the remainder of the current stripe being written to. 1408 */ 1409 *oxlen = min_t(u64, len, su - su_offset); 1410 1411 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 1412 return 0; 1413 1414 invalid: 1415 dout(" invalid layout\n"); 1416 *ono = 0; 1417 *oxoff = 0; 1418 *oxlen = 0; 1419 return -EINVAL; 1420 } 1421 EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1422 1423 /* 1424 * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1425 * called with target's (oloc, oid), since tiering isn't taken into 1426 * account. 1427 */ 1428 int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1429 struct ceph_object_locator *oloc, 1430 struct ceph_object_id *oid, 1431 struct ceph_pg *pg_out) 1432 { 1433 struct ceph_pg_pool_info *pi; 1434 1435 pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1436 if (!pi) 1437 return -EIO; 1438 1439 pg_out->pool = oloc->pool; 1440 pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1441 oid->name_len); 1442 1443 dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1444 pg_out->pool, pg_out->seed); 1445 return 0; 1446 } 1447 EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1448 1449 static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 1450 int *result, int result_max, 1451 const __u32 *weight, int weight_max) 1452 { 1453 int r; 1454 1455 BUG_ON(result_max > CEPH_PG_MAX_SIZE); 1456 1457 mutex_lock(&map->crush_scratch_mutex); 1458 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 1459 weight, weight_max, map->crush_scratch_ary); 1460 mutex_unlock(&map->crush_scratch_mutex); 1461 1462 return r; 1463 } 1464 1465 /* 1466 * Calculate raw (crush) set for given pgid. 1467 * 1468 * Return raw set length, or error. 1469 */ 1470 static int pg_to_raw_osds(struct ceph_osdmap *osdmap, 1471 struct ceph_pg_pool_info *pool, 1472 struct ceph_pg pgid, u32 pps, int *osds) 1473 { 1474 int ruleno; 1475 int len; 1476 1477 /* crush */ 1478 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1479 pool->type, pool->size); 1480 if (ruleno < 0) { 1481 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", 1482 pgid.pool, pool->crush_ruleset, pool->type, 1483 pool->size); 1484 return -ENOENT; 1485 } 1486 1487 len = do_crush(osdmap, ruleno, pps, osds, 1488 min_t(int, pool->size, CEPH_PG_MAX_SIZE), 1489 osdmap->osd_weight, osdmap->max_osd); 1490 if (len < 0) { 1491 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 1492 len, ruleno, pgid.pool, pool->crush_ruleset, 1493 pool->type, pool->size); 1494 return len; 1495 } 1496 1497 return len; 1498 } 1499 1500 /* 1501 * Given raw set, calculate up set and up primary. 1502 * 1503 * Return up set length. *primary is set to up primary osd id, or -1 1504 * if up set is empty. 1505 */ 1506 static int raw_to_up_osds(struct ceph_osdmap *osdmap, 1507 struct ceph_pg_pool_info *pool, 1508 int *osds, int len, int *primary) 1509 { 1510 int up_primary = -1; 1511 int i; 1512 1513 if (ceph_can_shift_osds(pool)) { 1514 int removed = 0; 1515 1516 for (i = 0; i < len; i++) { 1517 if (ceph_osd_is_down(osdmap, osds[i])) { 1518 removed++; 1519 continue; 1520 } 1521 if (removed) 1522 osds[i - removed] = osds[i]; 1523 } 1524 1525 len -= removed; 1526 if (len > 0) 1527 up_primary = osds[0]; 1528 } else { 1529 for (i = len - 1; i >= 0; i--) { 1530 if (ceph_osd_is_down(osdmap, osds[i])) 1531 osds[i] = CRUSH_ITEM_NONE; 1532 else 1533 up_primary = osds[i]; 1534 } 1535 } 1536 1537 *primary = up_primary; 1538 return len; 1539 } 1540 1541 static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, 1542 struct ceph_pg_pool_info *pool, 1543 int *osds, int len, int *primary) 1544 { 1545 int i; 1546 int pos = -1; 1547 1548 /* 1549 * Do we have any non-default primary_affinity values for these 1550 * osds? 1551 */ 1552 if (!osdmap->osd_primary_affinity) 1553 return; 1554 1555 for (i = 0; i < len; i++) { 1556 int osd = osds[i]; 1557 1558 if (osd != CRUSH_ITEM_NONE && 1559 osdmap->osd_primary_affinity[osd] != 1560 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { 1561 break; 1562 } 1563 } 1564 if (i == len) 1565 return; 1566 1567 /* 1568 * Pick the primary. Feed both the seed (for the pg) and the 1569 * osd into the hash/rng so that a proportional fraction of an 1570 * osd's pgs get rejected as primary. 1571 */ 1572 for (i = 0; i < len; i++) { 1573 int osd = osds[i]; 1574 u32 aff; 1575 1576 if (osd == CRUSH_ITEM_NONE) 1577 continue; 1578 1579 aff = osdmap->osd_primary_affinity[osd]; 1580 if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && 1581 (crush_hash32_2(CRUSH_HASH_RJENKINS1, 1582 pps, osd) >> 16) >= aff) { 1583 /* 1584 * We chose not to use this primary. Note it 1585 * anyway as a fallback in case we don't pick 1586 * anyone else, but keep looking. 1587 */ 1588 if (pos < 0) 1589 pos = i; 1590 } else { 1591 pos = i; 1592 break; 1593 } 1594 } 1595 if (pos < 0) 1596 return; 1597 1598 *primary = osds[pos]; 1599 1600 if (ceph_can_shift_osds(pool) && pos > 0) { 1601 /* move the new primary to the front */ 1602 for (i = pos; i > 0; i--) 1603 osds[i] = osds[i - 1]; 1604 osds[0] = *primary; 1605 } 1606 } 1607 1608 /* 1609 * Given up set, apply pg_temp and primary_temp mappings. 1610 * 1611 * Return acting set length. *primary is set to acting primary osd id, 1612 * or -1 if acting set is empty. 1613 */ 1614 static int apply_temps(struct ceph_osdmap *osdmap, 1615 struct ceph_pg_pool_info *pool, struct ceph_pg pgid, 1616 int *osds, int len, int *primary) 1617 { 1618 struct ceph_pg_mapping *pg; 1619 int temp_len; 1620 int temp_primary; 1621 int i; 1622 1623 /* raw_pg -> pg */ 1624 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 1625 pool->pg_num_mask); 1626 1627 /* pg_temp? */ 1628 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1629 if (pg) { 1630 temp_len = 0; 1631 temp_primary = -1; 1632 1633 for (i = 0; i < pg->pg_temp.len; i++) { 1634 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 1635 if (ceph_can_shift_osds(pool)) 1636 continue; 1637 else 1638 osds[temp_len++] = CRUSH_ITEM_NONE; 1639 } else { 1640 osds[temp_len++] = pg->pg_temp.osds[i]; 1641 } 1642 } 1643 1644 /* apply pg_temp's primary */ 1645 for (i = 0; i < temp_len; i++) { 1646 if (osds[i] != CRUSH_ITEM_NONE) { 1647 temp_primary = osds[i]; 1648 break; 1649 } 1650 } 1651 } else { 1652 temp_len = len; 1653 temp_primary = *primary; 1654 } 1655 1656 /* primary_temp? */ 1657 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); 1658 if (pg) 1659 temp_primary = pg->primary_temp.osd; 1660 1661 *primary = temp_primary; 1662 return temp_len; 1663 } 1664 1665 /* 1666 * Calculate acting set for given pgid. 1667 * 1668 * Return acting set length, or error. *primary is set to acting 1669 * primary osd id, or -1 if acting set is empty or on error. 1670 */ 1671 int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1672 int *osds, int *primary) 1673 { 1674 struct ceph_pg_pool_info *pool; 1675 u32 pps; 1676 int len; 1677 1678 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1679 if (!pool) { 1680 *primary = -1; 1681 return -ENOENT; 1682 } 1683 1684 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 1685 /* hash pool id and seed so that pool PGs do not overlap */ 1686 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 1687 ceph_stable_mod(pgid.seed, pool->pgp_num, 1688 pool->pgp_num_mask), 1689 pgid.pool); 1690 } else { 1691 /* 1692 * legacy behavior: add ps and pool together. this is 1693 * not a great approach because the PGs from each pool 1694 * will overlap on top of each other: 0.5 == 1.4 == 1695 * 2.3 == ... 1696 */ 1697 pps = ceph_stable_mod(pgid.seed, pool->pgp_num, 1698 pool->pgp_num_mask) + 1699 (unsigned)pgid.pool; 1700 } 1701 1702 len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); 1703 if (len < 0) { 1704 *primary = -1; 1705 return len; 1706 } 1707 1708 len = raw_to_up_osds(osdmap, pool, osds, len, primary); 1709 1710 apply_primary_affinity(osdmap, pps, pool, osds, len, primary); 1711 1712 len = apply_temps(osdmap, pool, pgid, osds, len, primary); 1713 1714 return len; 1715 } 1716 1717 /* 1718 * Return primary osd for given pgid, or -1 if none. 1719 */ 1720 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1721 { 1722 int osds[CEPH_PG_MAX_SIZE]; 1723 int primary; 1724 1725 ceph_calc_pg_acting(osdmap, pgid, osds, &primary); 1726 1727 return primary; 1728 } 1729 EXPORT_SYMBOL(ceph_calc_pg_primary); 1730