1 2 #ifdef __KERNEL__ 3 # include <linux/string.h> 4 # include <linux/slab.h> 5 # include <linux/bug.h> 6 # include <linux/kernel.h> 7 # ifndef dprintk 8 # define dprintk(args...) 9 # endif 10 #else 11 # include <string.h> 12 # include <stdio.h> 13 # include <stdlib.h> 14 # include <assert.h> 15 # define BUG_ON(x) assert(!(x)) 16 # define dprintk(args...) /* printf(args) */ 17 # define kmalloc(x, f) malloc(x) 18 # define kfree(x) free(x) 19 #endif 20 21 #include <linux/crush/crush.h> 22 #include <linux/crush/hash.h> 23 24 /* 25 * Implement the core CRUSH mapping algorithm. 26 */ 27 28 /** 29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. 30 * @map: the crush_map 31 * @ruleset: the storage ruleset id (user defined) 32 * @type: storage ruleset type (user defined) 33 * @size: output set size 34 */ 35 int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) 36 { 37 int i; 38 39 for (i = 0; i < map->max_rules; i++) { 40 if (map->rules[i] && 41 map->rules[i]->mask.ruleset == ruleset && 42 map->rules[i]->mask.type == type && 43 map->rules[i]->mask.min_size <= size && 44 map->rules[i]->mask.max_size >= size) 45 return i; 46 } 47 return -1; 48 } 49 50 51 /* 52 * bucket choose methods 53 * 54 * For each bucket algorithm, we have a "choose" method that, given a 55 * crush input @x and replica position (usually, position in output set) @r, 56 * will produce an item in the bucket. 57 */ 58 59 /* 60 * Choose based on a random permutation of the bucket. 61 * 62 * We used to use some prime number arithmetic to do this, but it 63 * wasn't very random, and had some other bad behaviors. Instead, we 64 * calculate an actual random permutation of the bucket members. 65 * Since this is expensive, we optimize for the r=0 case, which 66 * captures the vast majority of calls. 67 */ 68 static int bucket_perm_choose(struct crush_bucket *bucket, 69 int x, int r) 70 { 71 unsigned pr = r % bucket->size; 72 unsigned i, s; 73 74 /* start a new permutation if @x has changed */ 75 if (bucket->perm_x != x || bucket->perm_n == 0) { 76 dprintk("bucket %d new x=%d\n", bucket->id, x); 77 bucket->perm_x = x; 78 79 /* optimize common r=0 case */ 80 if (pr == 0) { 81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 82 bucket->size; 83 bucket->perm[0] = s; 84 bucket->perm_n = 0xffff; /* magic value, see below */ 85 goto out; 86 } 87 88 for (i = 0; i < bucket->size; i++) 89 bucket->perm[i] = i; 90 bucket->perm_n = 0; 91 } else if (bucket->perm_n == 0xffff) { 92 /* clean up after the r=0 case above */ 93 for (i = 1; i < bucket->size; i++) 94 bucket->perm[i] = i; 95 bucket->perm[bucket->perm[0]] = 0; 96 bucket->perm_n = 1; 97 } 98 99 /* calculate permutation up to pr */ 100 for (i = 0; i < bucket->perm_n; i++) 101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 102 while (bucket->perm_n <= pr) { 103 unsigned p = bucket->perm_n; 104 /* no point in swapping the final entry */ 105 if (p < bucket->size - 1) { 106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 107 (bucket->size - p); 108 if (i) { 109 unsigned t = bucket->perm[p + i]; 110 bucket->perm[p + i] = bucket->perm[p]; 111 bucket->perm[p] = t; 112 } 113 dprintk(" perm_choose swap %d with %d\n", p, p+i); 114 } 115 bucket->perm_n++; 116 } 117 for (i = 0; i < bucket->size; i++) 118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); 119 120 s = bucket->perm[pr]; 121 out: 122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 123 bucket->size, x, r, pr, s); 124 return bucket->items[s]; 125 } 126 127 /* uniform */ 128 static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, 129 int x, int r) 130 { 131 return bucket_perm_choose(&bucket->h, x, r); 132 } 133 134 /* list */ 135 static int bucket_list_choose(struct crush_bucket_list *bucket, 136 int x, int r) 137 { 138 int i; 139 140 for (i = bucket->h.size-1; i >= 0; i--) { 141 __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], 142 r, bucket->h.id); 143 w &= 0xffff; 144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x " 145 "sw %x rand %llx", 146 i, x, r, bucket->h.items[i], bucket->item_weights[i], 147 bucket->sum_weights[i], w); 148 w *= bucket->sum_weights[i]; 149 w = w >> 16; 150 /*dprintk(" scaled %llx\n", w);*/ 151 if (w < bucket->item_weights[i]) 152 return bucket->h.items[i]; 153 } 154 155 BUG_ON(1); 156 return 0; 157 } 158 159 160 /* (binary) tree */ 161 static int height(int n) 162 { 163 int h = 0; 164 while ((n & 1) == 0) { 165 h++; 166 n = n >> 1; 167 } 168 return h; 169 } 170 171 static int left(int x) 172 { 173 int h = height(x); 174 return x - (1 << (h-1)); 175 } 176 177 static int right(int x) 178 { 179 int h = height(x); 180 return x + (1 << (h-1)); 181 } 182 183 static int terminal(int x) 184 { 185 return x & 1; 186 } 187 188 static int bucket_tree_choose(struct crush_bucket_tree *bucket, 189 int x, int r) 190 { 191 int n, l; 192 __u32 w; 193 __u64 t; 194 195 /* start at root */ 196 n = bucket->num_nodes >> 1; 197 198 while (!terminal(n)) { 199 /* pick point in [0, w) */ 200 w = bucket->node_weights[n]; 201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, 202 bucket->h.id) * (__u64)w; 203 t = t >> 32; 204 205 /* descend to the left or right? */ 206 l = left(n); 207 if (t < bucket->node_weights[l]) 208 n = l; 209 else 210 n = right(n); 211 } 212 213 return bucket->h.items[n >> 1]; 214 } 215 216 217 /* straw */ 218 219 static int bucket_straw_choose(struct crush_bucket_straw *bucket, 220 int x, int r) 221 { 222 int i; 223 int high = 0; 224 __u64 high_draw = 0; 225 __u64 draw; 226 227 for (i = 0; i < bucket->h.size; i++) { 228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); 229 draw &= 0xffff; 230 draw *= bucket->straws[i]; 231 if (i == 0 || draw > high_draw) { 232 high = i; 233 high_draw = draw; 234 } 235 } 236 return bucket->h.items[high]; 237 } 238 239 static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 240 { 241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 242 switch (in->alg) { 243 case CRUSH_BUCKET_UNIFORM: 244 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 245 x, r); 246 case CRUSH_BUCKET_LIST: 247 return bucket_list_choose((struct crush_bucket_list *)in, 248 x, r); 249 case CRUSH_BUCKET_TREE: 250 return bucket_tree_choose((struct crush_bucket_tree *)in, 251 x, r); 252 case CRUSH_BUCKET_STRAW: 253 return bucket_straw_choose((struct crush_bucket_straw *)in, 254 x, r); 255 default: 256 BUG_ON(1); 257 return in->items[0]; 258 } 259 } 260 261 /* 262 * true if device is marked "out" (failed, fully offloaded) 263 * of the cluster 264 */ 265 static int is_out(struct crush_map *map, __u32 *weight, int item, int x) 266 { 267 if (weight[item] >= 0x10000) 268 return 0; 269 if (weight[item] == 0) 270 return 1; 271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) 272 < weight[item]) 273 return 0; 274 return 1; 275 } 276 277 /** 278 * crush_choose - choose numrep distinct items of given type 279 * @map: the crush_map 280 * @bucket: the bucket we are choose an item from 281 * @x: crush input value 282 * @numrep: the number of items to choose 283 * @type: the type of item to choose 284 * @out: pointer to output vector 285 * @outpos: our position in that vector 286 * @firstn: true if choosing "first n" items, false if choosing "indep" 287 * @recurse_to_leaf: true if we want one device under each item of given type 288 * @out2: second output vector for leaf items (if @recurse_to_leaf) 289 */ 290 static int crush_choose(struct crush_map *map, 291 struct crush_bucket *bucket, 292 __u32 *weight, 293 int x, int numrep, int type, 294 int *out, int outpos, 295 int firstn, int recurse_to_leaf, 296 int *out2) 297 { 298 int rep; 299 int ftotal, flocal; 300 int retry_descent, retry_bucket, skip_rep; 301 struct crush_bucket *in = bucket; 302 int r; 303 int i; 304 int item = 0; 305 int itemtype; 306 int collide, reject; 307 const int orig_tries = 5; /* attempts before we fall back to search */ 308 309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 310 bucket->id, x, outpos, numrep); 311 312 for (rep = outpos; rep < numrep; rep++) { 313 /* keep trying until we get a non-out, non-colliding item */ 314 ftotal = 0; 315 skip_rep = 0; 316 do { 317 retry_descent = 0; 318 in = bucket; /* initial bucket */ 319 320 /* choose through intervening buckets */ 321 flocal = 0; 322 do { 323 collide = 0; 324 retry_bucket = 0; 325 r = rep; 326 if (in->alg == CRUSH_BUCKET_UNIFORM) { 327 /* be careful */ 328 if (firstn || numrep >= in->size) 329 /* r' = r + f_total */ 330 r += ftotal; 331 else if (in->size % numrep == 0) 332 /* r'=r+(n+1)*f_local */ 333 r += (numrep+1) * 334 (flocal+ftotal); 335 else 336 /* r' = r + n*f_local */ 337 r += numrep * (flocal+ftotal); 338 } else { 339 if (firstn) 340 /* r' = r + f_total */ 341 r += ftotal; 342 else 343 /* r' = r + n*f_local */ 344 r += numrep * (flocal+ftotal); 345 } 346 347 /* bucket choose */ 348 if (in->size == 0) { 349 reject = 1; 350 goto reject; 351 } 352 if (flocal >= (in->size>>1) && 353 flocal > orig_tries) 354 item = bucket_perm_choose(in, x, r); 355 else 356 item = crush_bucket_choose(in, x, r); 357 BUG_ON(item >= map->max_devices); 358 359 /* desired type? */ 360 if (item < 0) 361 itemtype = map->buckets[-1-item]->type; 362 else 363 itemtype = 0; 364 dprintk(" item %d type %d\n", item, itemtype); 365 366 /* keep going? */ 367 if (itemtype != type) { 368 BUG_ON(item >= 0 || 369 (-1-item) >= map->max_buckets); 370 in = map->buckets[-1-item]; 371 retry_bucket = 1; 372 continue; 373 } 374 375 /* collision? */ 376 for (i = 0; i < outpos; i++) { 377 if (out[i] == item) { 378 collide = 1; 379 break; 380 } 381 } 382 383 reject = 0; 384 if (recurse_to_leaf) { 385 if (item < 0) { 386 if (crush_choose(map, 387 map->buckets[-1-item], 388 weight, 389 x, outpos+1, 0, 390 out2, outpos, 391 firstn, 0, 392 NULL) <= outpos) 393 /* didn't get leaf */ 394 reject = 1; 395 } else { 396 /* we already have a leaf! */ 397 out2[outpos] = item; 398 } 399 } 400 401 if (!reject) { 402 /* out? */ 403 if (itemtype == 0) 404 reject = is_out(map, weight, 405 item, x); 406 else 407 reject = 0; 408 } 409 410 reject: 411 if (reject || collide) { 412 ftotal++; 413 flocal++; 414 415 if (collide && flocal < 3) 416 /* retry locally a few times */ 417 retry_bucket = 1; 418 else if (flocal < in->size + orig_tries) 419 /* exhaustive bucket search */ 420 retry_bucket = 1; 421 else if (ftotal < 20) 422 /* then retry descent */ 423 retry_descent = 1; 424 else 425 /* else give up */ 426 skip_rep = 1; 427 dprintk(" reject %d collide %d " 428 "ftotal %d flocal %d\n", 429 reject, collide, ftotal, 430 flocal); 431 } 432 } while (retry_bucket); 433 } while (retry_descent); 434 435 if (skip_rep) { 436 dprintk("skip rep\n"); 437 continue; 438 } 439 440 dprintk("CHOOSE got %d\n", item); 441 out[outpos] = item; 442 outpos++; 443 } 444 445 dprintk("CHOOSE returns %d\n", outpos); 446 return outpos; 447 } 448 449 450 /** 451 * crush_do_rule - calculate a mapping with the given input and rule 452 * @map: the crush_map 453 * @ruleno: the rule id 454 * @x: hash input 455 * @result: pointer to result vector 456 * @result_max: maximum result size 457 * @force: force initial replica choice; -1 for none 458 */ 459 int crush_do_rule(struct crush_map *map, 460 int ruleno, int x, int *result, int result_max, 461 int force, __u32 *weight) 462 { 463 int result_len; 464 int force_context[CRUSH_MAX_DEPTH]; 465 int force_pos = -1; 466 int a[CRUSH_MAX_SET]; 467 int b[CRUSH_MAX_SET]; 468 int c[CRUSH_MAX_SET]; 469 int recurse_to_leaf; 470 int *w; 471 int wsize = 0; 472 int *o; 473 int osize; 474 int *tmp; 475 struct crush_rule *rule; 476 int step; 477 int i, j; 478 int numrep; 479 int firstn; 480 int rc = -1; 481 482 BUG_ON(ruleno >= map->max_rules); 483 484 rule = map->rules[ruleno]; 485 result_len = 0; 486 w = a; 487 o = b; 488 489 /* 490 * determine hierarchical context of force, if any. note 491 * that this may or may not correspond to the specific types 492 * referenced by the crush rule. 493 */ 494 if (force >= 0) { 495 if (force >= map->max_devices || 496 map->device_parents[force] == 0) { 497 /*dprintk("CRUSH: forcefed device dne\n");*/ 498 rc = -1; /* force fed device dne */ 499 goto out; 500 } 501 if (!is_out(map, weight, force, x)) { 502 while (1) { 503 force_context[++force_pos] = force; 504 if (force >= 0) 505 force = map->device_parents[force]; 506 else 507 force = map->bucket_parents[-1-force]; 508 if (force == 0) 509 break; 510 } 511 } 512 } 513 514 for (step = 0; step < rule->len; step++) { 515 firstn = 0; 516 switch (rule->steps[step].op) { 517 case CRUSH_RULE_TAKE: 518 w[0] = rule->steps[step].arg1; 519 if (force_pos >= 0) { 520 BUG_ON(force_context[force_pos] != w[0]); 521 force_pos--; 522 } 523 wsize = 1; 524 break; 525 526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 527 case CRUSH_RULE_CHOOSE_FIRSTN: 528 firstn = 1; 529 case CRUSH_RULE_CHOOSE_LEAF_INDEP: 530 case CRUSH_RULE_CHOOSE_INDEP: 531 BUG_ON(wsize == 0); 532 533 recurse_to_leaf = 534 rule->steps[step].op == 535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 536 rule->steps[step].op == 537 CRUSH_RULE_CHOOSE_LEAF_INDEP; 538 539 /* reset output */ 540 osize = 0; 541 542 for (i = 0; i < wsize; i++) { 543 /* 544 * see CRUSH_N, CRUSH_N_MINUS macros. 545 * basically, numrep <= 0 means relative to 546 * the provided result_max 547 */ 548 numrep = rule->steps[step].arg1; 549 if (numrep <= 0) { 550 numrep += result_max; 551 if (numrep <= 0) 552 continue; 553 } 554 j = 0; 555 if (osize == 0 && force_pos >= 0) { 556 /* skip any intermediate types */ 557 while (force_pos && 558 force_context[force_pos] < 0 && 559 rule->steps[step].arg2 != 560 map->buckets[-1 - 561 force_context[force_pos]]->type) 562 force_pos--; 563 o[osize] = force_context[force_pos]; 564 if (recurse_to_leaf) 565 c[osize] = force_context[0]; 566 j++; 567 force_pos--; 568 } 569 osize += crush_choose(map, 570 map->buckets[-1-w[i]], 571 weight, 572 x, numrep, 573 rule->steps[step].arg2, 574 o+osize, j, 575 firstn, 576 recurse_to_leaf, c+osize); 577 } 578 579 if (recurse_to_leaf) 580 /* copy final _leaf_ values to output set */ 581 memcpy(o, c, osize*sizeof(*o)); 582 583 /* swap t and w arrays */ 584 tmp = o; 585 o = w; 586 w = tmp; 587 wsize = osize; 588 break; 589 590 591 case CRUSH_RULE_EMIT: 592 for (i = 0; i < wsize && result_len < result_max; i++) { 593 result[result_len] = w[i]; 594 result_len++; 595 } 596 wsize = 0; 597 break; 598 599 default: 600 BUG_ON(1); 601 } 602 } 603 rc = result_len; 604 605 out: 606 return rc; 607 } 608 609 610