1 2 /*-------------------------------------------------------------*/ 3 /*--- Block sorting machinery ---*/ 4 /*--- blocksort.c ---*/ 5 /*-------------------------------------------------------------*/ 6 7 /*-- 8 This file is a part of bzip2 and/or libbzip2, a program and 9 library for lossless, block-sorting data compression. 10 11 Copyright (C) 1996-2002 Julian R Seward. All rights reserved. 12 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions 15 are met: 16 17 1. Redistributions of source code must retain the above copyright 18 notice, this list of conditions and the following disclaimer. 19 20 2. The origin of this software must not be misrepresented; you must 21 not claim that you wrote the original software. If you use this 22 software in a product, an acknowledgment in the product 23 documentation would be appreciated but is not required. 24 25 3. Altered source versions must be plainly marked as such, and must 26 not be misrepresented as being the original software. 27 28 4. The name of the author may not be used to endorse or promote 29 products derived from this software without specific prior written 30 permission. 31 32 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 33 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 34 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 36 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 38 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 39 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 40 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 41 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 44 Julian Seward, Cambridge, UK. 45 jseward@acm.org 46 bzip2/libbzip2 version 1.0.6 of 6 September 2010 47 Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org> 48 49 This program is based on (at least) the work of: 50 Mike Burrows 51 David Wheeler 52 Peter Fenwick 53 Alistair Moffat 54 Radford Neal 55 Ian H. Witten 56 Robert Sedgewick 57 Jon L. Bentley 58 59 For more information on these sources, see the manual. 60 --*/ 61 62 #include "bzlib_private.h" 63 64 /*---------------------------------------------*/ 65 /*--- Fallback O(N log(N)^2) sorting ---*/ 66 /*--- algorithm, for repetitive blocks ---*/ 67 /*---------------------------------------------*/ 68 69 /*---------------------------------------------*/ 70 static 71 __inline__ 72 void fallbackSimpleSort ( UInt32* fmap, 73 UInt32* eclass, 74 Int32 lo, 75 Int32 hi ) 76 { 77 Int32 i, j, tmp; 78 UInt32 ec_tmp; 79 80 if (lo == hi) return; 81 82 if (hi - lo > 3) { 83 for ( i = hi-4; i >= lo; i-- ) { 84 tmp = fmap[i]; 85 ec_tmp = eclass[tmp]; 86 for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 ) 87 fmap[j-4] = fmap[j]; 88 fmap[j-4] = tmp; 89 } 90 } 91 92 for ( i = hi-1; i >= lo; i-- ) { 93 tmp = fmap[i]; 94 ec_tmp = eclass[tmp]; 95 for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ ) 96 fmap[j-1] = fmap[j]; 97 fmap[j-1] = tmp; 98 } 99 } 100 101 102 /*---------------------------------------------*/ 103 #define fswap(zz1, zz2) \ 104 { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } 105 106 #define fvswap(zzp1, zzp2, zzn) \ 107 { \ 108 Int32 yyp1 = (zzp1); \ 109 Int32 yyp2 = (zzp2); \ 110 Int32 yyn = (zzn); \ 111 while (yyn > 0) { \ 112 fswap(fmap[yyp1], fmap[yyp2]); \ 113 yyp1++; yyp2++; yyn--; \ 114 } \ 115 } 116 117 118 #define fmin(a,b) ((a) < (b)) ? (a) : (b) 119 120 #define fpush(lz,hz) { stackLo[sp] = lz; \ 121 stackHi[sp] = hz; \ 122 sp++; } 123 124 #define fpop(lz,hz) { sp--; \ 125 lz = stackLo[sp]; \ 126 hz = stackHi[sp]; } 127 128 #define FALLBACK_QSORT_SMALL_THRESH 10 129 #define FALLBACK_QSORT_STACK_SIZE 100 130 131 132 static 133 void fallbackQSort3 ( UInt32* fmap, 134 UInt32* eclass, 135 Int32 loSt, 136 Int32 hiSt ) 137 { 138 Int32 unLo, unHi, ltLo, gtHi, n, m; 139 Int32 sp, lo, hi; 140 UInt32 med, r, r3; 141 Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; 142 Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; 143 144 r = 0; 145 146 sp = 0; 147 fpush ( loSt, hiSt ); 148 149 while (sp > 0) { 150 151 AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 ); 152 153 fpop ( lo, hi ); 154 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { 155 fallbackSimpleSort ( fmap, eclass, lo, hi ); 156 continue; 157 } 158 159 /* Random partitioning. Median of 3 sometimes fails to 160 avoid bad cases. Median of 9 seems to help but 161 looks rather expensive. This too seems to work but 162 is cheaper. Guidance for the magic constants 163 7621 and 32768 is taken from Sedgewick's algorithms 164 book, chapter 35. 165 */ 166 r = ((r * 7621) + 1) % 32768; 167 r3 = r % 3; 168 if (r3 == 0) med = eclass[fmap[lo]]; else 169 if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else 170 med = eclass[fmap[hi]]; 171 172 unLo = ltLo = lo; 173 unHi = gtHi = hi; 174 175 while (1) { 176 while (1) { 177 if (unLo > unHi) break; 178 n = (Int32)eclass[fmap[unLo]] - (Int32)med; 179 if (n == 0) { 180 fswap(fmap[unLo], fmap[ltLo]); 181 ltLo++; unLo++; 182 continue; 183 }; 184 if (n > 0) break; 185 unLo++; 186 } 187 while (1) { 188 if (unLo > unHi) break; 189 n = (Int32)eclass[fmap[unHi]] - (Int32)med; 190 if (n == 0) { 191 fswap(fmap[unHi], fmap[gtHi]); 192 gtHi--; unHi--; 193 continue; 194 }; 195 if (n < 0) break; 196 unHi--; 197 } 198 if (unLo > unHi) break; 199 fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; 200 } 201 202 AssertD ( unHi == unLo-1, "fallbackQSort3(2)" ); 203 204 if (gtHi < ltLo) continue; 205 206 n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); 207 m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); 208 209 n = lo + unLo - ltLo - 1; 210 m = hi - (gtHi - unHi) + 1; 211 212 if (n - lo > hi - m) { 213 fpush ( lo, n ); 214 fpush ( m, hi ); 215 } else { 216 fpush ( m, hi ); 217 fpush ( lo, n ); 218 } 219 } 220 } 221 222 #undef fmin 223 #undef fpush 224 #undef fpop 225 #undef fswap 226 #undef fvswap 227 #undef FALLBACK_QSORT_SMALL_THRESH 228 #undef FALLBACK_QSORT_STACK_SIZE 229 230 231 /*---------------------------------------------*/ 232 /* Pre: 233 nblock > 0 234 eclass exists for [0 .. nblock-1] 235 ((UChar*)eclass) [0 .. nblock-1] holds block 236 ptr exists for [0 .. nblock-1] 237 238 Post: 239 ((UChar*)eclass) [0 .. nblock-1] holds block 240 All other areas of eclass destroyed 241 fmap [0 .. nblock-1] holds sorted order 242 bhtab [ 0 .. 2+(nblock/32) ] destroyed 243 */ 244 245 #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) 246 #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) 247 #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) 248 #define WORD_BH(zz) bhtab[(zz) >> 5] 249 #define UNALIGNED_BH(zz) ((zz) & 0x01f) 250 251 static 252 void fallbackSort ( UInt32* fmap, 253 UInt32* eclass, 254 UInt32* bhtab, 255 Int32 nblock, 256 Int32 verb ) 257 { 258 Int32 ftab[257]; 259 Int32 ftabCopy[256]; 260 Int32 H, i, j, k, l, r, cc, cc1; 261 Int32 nNotDone; 262 Int32 nBhtab; 263 UChar* eclass8 = (UChar*)eclass; 264 265 /*-- 266 Initial 1-char radix sort to generate 267 initial fmap and initial BH bits. 268 --*/ 269 if (verb >= 4) 270 VPrintf0 ( " bucket sorting ...\n" ); 271 for (i = 0; i < 257; i++) ftab[i] = 0; 272 for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; 273 for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; 274 for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; 275 276 for (i = 0; i < nblock; i++) { 277 j = eclass8[i]; 278 k = ftab[j] - 1; 279 ftab[j] = k; 280 fmap[k] = i; 281 } 282 283 nBhtab = 2 + (nblock / 32); 284 for (i = 0; i < nBhtab; i++) bhtab[i] = 0; 285 for (i = 0; i < 256; i++) SET_BH(ftab[i]); 286 287 /*-- 288 Inductively refine the buckets. Kind-of an 289 "exponential radix sort" (!), inspired by the 290 Manber-Myers suffix array construction algorithm. 291 --*/ 292 293 /*-- set sentinel bits for block-end detection --*/ 294 for (i = 0; i < 32; i++) { 295 SET_BH(nblock + 2*i); 296 CLEAR_BH(nblock + 2*i + 1); 297 } 298 299 /*-- the log(N) loop --*/ 300 H = 1; 301 while (1) { 302 303 if (verb >= 4) 304 VPrintf1 ( " depth %6d has ", H ); 305 306 j = 0; 307 for (i = 0; i < nblock; i++) { 308 if (ISSET_BH(i)) j = i; 309 k = fmap[i] - H; if (k < 0) k += nblock; 310 eclass[k] = j; 311 } 312 313 nNotDone = 0; 314 r = -1; 315 while (1) { 316 317 /*-- find the next non-singleton bucket --*/ 318 k = r + 1; 319 while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; 320 if (ISSET_BH(k)) { 321 while (WORD_BH(k) == 0xffffffff) k += 32; 322 while (ISSET_BH(k)) k++; 323 } 324 l = k - 1; 325 if (l >= nblock) break; 326 while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; 327 if (!ISSET_BH(k)) { 328 while (WORD_BH(k) == 0x00000000) k += 32; 329 while (!ISSET_BH(k)) k++; 330 } 331 r = k - 1; 332 if (r >= nblock) break; 333 334 /*-- now [l, r] bracket current bucket --*/ 335 if (r > l) { 336 nNotDone += (r - l + 1); 337 fallbackQSort3 ( fmap, eclass, l, r ); 338 339 /*-- scan bucket and generate header bits-- */ 340 cc = -1; 341 for (i = l; i <= r; i++) { 342 cc1 = eclass[fmap[i]]; 343 if (cc != cc1) { SET_BH(i); cc = cc1; }; 344 } 345 } 346 } 347 348 if (verb >= 4) 349 VPrintf1 ( "%6d unresolved strings\n", nNotDone ); 350 351 H *= 2; 352 if (H > nblock || nNotDone == 0) break; 353 } 354 355 /*-- 356 Reconstruct the original block in 357 eclass8 [0 .. nblock-1], since the 358 previous phase destroyed it. 359 --*/ 360 if (verb >= 4) 361 VPrintf0 ( " reconstructing block ...\n" ); 362 j = 0; 363 for (i = 0; i < nblock; i++) { 364 while (ftabCopy[j] == 0) j++; 365 ftabCopy[j]--; 366 eclass8[fmap[i]] = (UChar)j; 367 } 368 AssertH ( j < 256, 1005 ); 369 } 370 371 #undef SET_BH 372 #undef CLEAR_BH 373 #undef ISSET_BH 374 #undef WORD_BH 375 #undef UNALIGNED_BH 376 377 378 /*---------------------------------------------*/ 379 /*--- The main, O(N^2 log(N)) sorting ---*/ 380 /*--- algorithm. Faster for "normal" ---*/ 381 /*--- non-repetitive blocks. ---*/ 382 /*---------------------------------------------*/ 383 384 /*---------------------------------------------*/ 385 static 386 __inline__ 387 Bool mainGtU ( UInt32 i1, 388 UInt32 i2, 389 UChar* block, 390 UInt16* quadrant, 391 UInt32 nblock, 392 Int32* budget ) 393 { 394 Int32 k; 395 UChar c1, c2; 396 UInt16 s1, s2; 397 398 AssertD ( i1 != i2, "mainGtU" ); 399 /* 1 */ 400 c1 = block[i1]; c2 = block[i2]; 401 if (c1 != c2) return (c1 > c2); 402 i1++; i2++; 403 /* 2 */ 404 c1 = block[i1]; c2 = block[i2]; 405 if (c1 != c2) return (c1 > c2); 406 i1++; i2++; 407 /* 3 */ 408 c1 = block[i1]; c2 = block[i2]; 409 if (c1 != c2) return (c1 > c2); 410 i1++; i2++; 411 /* 4 */ 412 c1 = block[i1]; c2 = block[i2]; 413 if (c1 != c2) return (c1 > c2); 414 i1++; i2++; 415 /* 5 */ 416 c1 = block[i1]; c2 = block[i2]; 417 if (c1 != c2) return (c1 > c2); 418 i1++; i2++; 419 /* 6 */ 420 c1 = block[i1]; c2 = block[i2]; 421 if (c1 != c2) return (c1 > c2); 422 i1++; i2++; 423 /* 7 */ 424 c1 = block[i1]; c2 = block[i2]; 425 if (c1 != c2) return (c1 > c2); 426 i1++; i2++; 427 /* 8 */ 428 c1 = block[i1]; c2 = block[i2]; 429 if (c1 != c2) return (c1 > c2); 430 i1++; i2++; 431 /* 9 */ 432 c1 = block[i1]; c2 = block[i2]; 433 if (c1 != c2) return (c1 > c2); 434 i1++; i2++; 435 /* 10 */ 436 c1 = block[i1]; c2 = block[i2]; 437 if (c1 != c2) return (c1 > c2); 438 i1++; i2++; 439 /* 11 */ 440 c1 = block[i1]; c2 = block[i2]; 441 if (c1 != c2) return (c1 > c2); 442 i1++; i2++; 443 /* 12 */ 444 c1 = block[i1]; c2 = block[i2]; 445 if (c1 != c2) return (c1 > c2); 446 i1++; i2++; 447 448 k = nblock + 8; 449 450 do { 451 /* 1 */ 452 c1 = block[i1]; c2 = block[i2]; 453 if (c1 != c2) return (c1 > c2); 454 s1 = quadrant[i1]; s2 = quadrant[i2]; 455 if (s1 != s2) return (s1 > s2); 456 i1++; i2++; 457 /* 2 */ 458 c1 = block[i1]; c2 = block[i2]; 459 if (c1 != c2) return (c1 > c2); 460 s1 = quadrant[i1]; s2 = quadrant[i2]; 461 if (s1 != s2) return (s1 > s2); 462 i1++; i2++; 463 /* 3 */ 464 c1 = block[i1]; c2 = block[i2]; 465 if (c1 != c2) return (c1 > c2); 466 s1 = quadrant[i1]; s2 = quadrant[i2]; 467 if (s1 != s2) return (s1 > s2); 468 i1++; i2++; 469 /* 4 */ 470 c1 = block[i1]; c2 = block[i2]; 471 if (c1 != c2) return (c1 > c2); 472 s1 = quadrant[i1]; s2 = quadrant[i2]; 473 if (s1 != s2) return (s1 > s2); 474 i1++; i2++; 475 /* 5 */ 476 c1 = block[i1]; c2 = block[i2]; 477 if (c1 != c2) return (c1 > c2); 478 s1 = quadrant[i1]; s2 = quadrant[i2]; 479 if (s1 != s2) return (s1 > s2); 480 i1++; i2++; 481 /* 6 */ 482 c1 = block[i1]; c2 = block[i2]; 483 if (c1 != c2) return (c1 > c2); 484 s1 = quadrant[i1]; s2 = quadrant[i2]; 485 if (s1 != s2) return (s1 > s2); 486 i1++; i2++; 487 /* 7 */ 488 c1 = block[i1]; c2 = block[i2]; 489 if (c1 != c2) return (c1 > c2); 490 s1 = quadrant[i1]; s2 = quadrant[i2]; 491 if (s1 != s2) return (s1 > s2); 492 i1++; i2++; 493 /* 8 */ 494 c1 = block[i1]; c2 = block[i2]; 495 if (c1 != c2) return (c1 > c2); 496 s1 = quadrant[i1]; s2 = quadrant[i2]; 497 if (s1 != s2) return (s1 > s2); 498 i1++; i2++; 499 500 if (i1 >= nblock) i1 -= nblock; 501 if (i2 >= nblock) i2 -= nblock; 502 503 k -= 8; 504 (*budget)--; 505 } 506 while (k >= 0); 507 508 return False; 509 } 510 511 512 /*---------------------------------------------*/ 513 /*-- 514 Knuth's increments seem to work better 515 than Incerpi-Sedgewick here. Possibly 516 because the number of elems to sort is 517 usually small, typically <= 20. 518 --*/ 519 static 520 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, 521 9841, 29524, 88573, 265720, 522 797161, 2391484 }; 523 524 static 525 void mainSimpleSort ( UInt32* ptr, 526 UChar* block, 527 UInt16* quadrant, 528 Int32 nblock, 529 Int32 lo, 530 Int32 hi, 531 Int32 d, 532 Int32* budget ) 533 { 534 Int32 i, j, h, bigN, hp; 535 UInt32 v; 536 537 bigN = hi - lo + 1; 538 if (bigN < 2) return; 539 540 hp = 0; 541 while (incs[hp] < bigN) hp++; 542 hp--; 543 544 for (; hp >= 0; hp--) { 545 h = incs[hp]; 546 547 i = lo + h; 548 while (True) { 549 550 /*-- copy 1 --*/ 551 if (i > hi) break; 552 v = ptr[i]; 553 j = i; 554 while ( mainGtU ( 555 ptr[j-h]+d, v+d, block, quadrant, nblock, budget 556 ) ) { 557 ptr[j] = ptr[j-h]; 558 j = j - h; 559 if (j <= (lo + h - 1)) break; 560 } 561 ptr[j] = v; 562 i++; 563 564 /*-- copy 2 --*/ 565 if (i > hi) break; 566 v = ptr[i]; 567 j = i; 568 while ( mainGtU ( 569 ptr[j-h]+d, v+d, block, quadrant, nblock, budget 570 ) ) { 571 ptr[j] = ptr[j-h]; 572 j = j - h; 573 if (j <= (lo + h - 1)) break; 574 } 575 ptr[j] = v; 576 i++; 577 578 /*-- copy 3 --*/ 579 if (i > hi) break; 580 v = ptr[i]; 581 j = i; 582 while ( mainGtU ( 583 ptr[j-h]+d, v+d, block, quadrant, nblock, budget 584 ) ) { 585 ptr[j] = ptr[j-h]; 586 j = j - h; 587 if (j <= (lo + h - 1)) break; 588 } 589 ptr[j] = v; 590 i++; 591 592 if (*budget < 0) return; 593 } 594 } 595 } 596 597 598 /*---------------------------------------------*/ 599 /*-- 600 The following is an implementation of 601 an elegant 3-way quicksort for strings, 602 described in a paper "Fast Algorithms for 603 Sorting and Searching Strings", by Robert 604 Sedgewick and Jon L. Bentley. 605 --*/ 606 607 #define mswap(zz1, zz2) \ 608 { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } 609 610 #define mvswap(zzp1, zzp2, zzn) \ 611 { \ 612 Int32 yyp1 = (zzp1); \ 613 Int32 yyp2 = (zzp2); \ 614 Int32 yyn = (zzn); \ 615 while (yyn > 0) { \ 616 mswap(ptr[yyp1], ptr[yyp2]); \ 617 yyp1++; yyp2++; yyn--; \ 618 } \ 619 } 620 621 static 622 __inline__ 623 UChar mmed3 ( UChar a, UChar b, UChar c ) 624 { 625 UChar t; 626 if (a > b) { t = a; a = b; b = t; }; 627 if (b > c) { 628 b = c; 629 if (a > b) b = a; 630 } 631 return b; 632 } 633 634 #define mmin(a,b) ((a) < (b)) ? (a) : (b) 635 636 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \ 637 stackHi[sp] = hz; \ 638 stackD [sp] = dz; \ 639 sp++; } 640 641 #define mpop(lz,hz,dz) { sp--; \ 642 lz = stackLo[sp]; \ 643 hz = stackHi[sp]; \ 644 dz = stackD [sp]; } 645 646 647 #define mnextsize(az) (nextHi[az]-nextLo[az]) 648 649 #define mnextswap(az,bz) \ 650 { Int32 tz; \ 651 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ 652 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ 653 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; } 654 655 656 #define MAIN_QSORT_SMALL_THRESH 20 657 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) 658 #define MAIN_QSORT_STACK_SIZE 100 659 660 static 661 void mainQSort3 ( UInt32* ptr, 662 UChar* block, 663 UInt16* quadrant, 664 Int32 nblock, 665 Int32 loSt, 666 Int32 hiSt, 667 Int32 dSt, 668 Int32* budget ) 669 { 670 Int32 unLo, unHi, ltLo, gtHi, n, m, med; 671 Int32 sp, lo, hi, d; 672 673 Int32 stackLo[MAIN_QSORT_STACK_SIZE]; 674 Int32 stackHi[MAIN_QSORT_STACK_SIZE]; 675 Int32 stackD [MAIN_QSORT_STACK_SIZE]; 676 677 Int32 nextLo[3]; 678 Int32 nextHi[3]; 679 Int32 nextD [3]; 680 681 sp = 0; 682 mpush ( loSt, hiSt, dSt ); 683 684 while (sp > 0) { 685 686 AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 ); 687 688 mpop ( lo, hi, d ); 689 if (hi - lo < MAIN_QSORT_SMALL_THRESH || 690 d > MAIN_QSORT_DEPTH_THRESH) { 691 mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget ); 692 if (*budget < 0) return; 693 continue; 694 } 695 696 med = (Int32) 697 mmed3 ( block[ptr[ lo ]+d], 698 block[ptr[ hi ]+d], 699 block[ptr[ (lo+hi)>>1 ]+d] ); 700 701 unLo = ltLo = lo; 702 unHi = gtHi = hi; 703 704 while (True) { 705 while (True) { 706 if (unLo > unHi) break; 707 n = ((Int32)block[ptr[unLo]+d]) - med; 708 if (n == 0) { 709 mswap(ptr[unLo], ptr[ltLo]); 710 ltLo++; unLo++; continue; 711 }; 712 if (n > 0) break; 713 unLo++; 714 } 715 while (True) { 716 if (unLo > unHi) break; 717 n = ((Int32)block[ptr[unHi]+d]) - med; 718 if (n == 0) { 719 mswap(ptr[unHi], ptr[gtHi]); 720 gtHi--; unHi--; continue; 721 }; 722 if (n < 0) break; 723 unHi--; 724 } 725 if (unLo > unHi) break; 726 mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; 727 } 728 729 AssertD ( unHi == unLo-1, "mainQSort3(2)" ); 730 731 if (gtHi < ltLo) { 732 mpush(lo, hi, d+1 ); 733 continue; 734 } 735 736 n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); 737 m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); 738 739 n = lo + unLo - ltLo - 1; 740 m = hi - (gtHi - unHi) + 1; 741 742 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; 743 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; 744 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; 745 746 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); 747 if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); 748 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); 749 750 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" ); 751 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" ); 752 753 mpush (nextLo[0], nextHi[0], nextD[0]); 754 mpush (nextLo[1], nextHi[1], nextD[1]); 755 mpush (nextLo[2], nextHi[2], nextD[2]); 756 } 757 } 758 759 #undef mswap 760 #undef mvswap 761 #undef mpush 762 #undef mpop 763 #undef mmin 764 #undef mnextsize 765 #undef mnextswap 766 #undef MAIN_QSORT_SMALL_THRESH 767 #undef MAIN_QSORT_DEPTH_THRESH 768 #undef MAIN_QSORT_STACK_SIZE 769 770 771 /*---------------------------------------------*/ 772 /* Pre: 773 nblock > N_OVERSHOOT 774 block32 exists for [0 .. nblock-1 +N_OVERSHOOT] 775 ((UChar*)block32) [0 .. nblock-1] holds block 776 ptr exists for [0 .. nblock-1] 777 778 Post: 779 ((UChar*)block32) [0 .. nblock-1] holds block 780 All other areas of block32 destroyed 781 ftab [0 .. 65536 ] destroyed 782 ptr [0 .. nblock-1] holds sorted order 783 if (*budget < 0), sorting was abandoned 784 */ 785 786 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) 787 #define SETMASK (1 << 21) 788 #define CLEARMASK (~(SETMASK)) 789 790 static 791 void mainSort ( UInt32* ptr, 792 UChar* block, 793 UInt16* quadrant, 794 UInt32* ftab, 795 Int32 nblock, 796 Int32 verb, 797 Int32* budget ) 798 { 799 Int32 i, j, k, ss, sb; 800 Int32 runningOrder[256]; 801 Bool bigDone[256]; 802 Int32 copyStart[256]; 803 Int32 copyEnd [256]; 804 UChar c1; 805 Int32 numQSorted; 806 UInt16 s; 807 if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" ); 808 809 /*-- set up the 2-byte frequency table --*/ 810 for (i = 65536; i >= 0; i--) ftab[i] = 0; 811 812 j = block[0] << 8; 813 i = nblock-1; 814 for (; i >= 3; i -= 4) { 815 quadrant[i] = 0; 816 j = (j >> 8) | ( ((UInt16)block[i]) << 8); 817 ftab[j]++; 818 quadrant[i-1] = 0; 819 j = (j >> 8) | ( ((UInt16)block[i-1]) << 8); 820 ftab[j]++; 821 quadrant[i-2] = 0; 822 j = (j >> 8) | ( ((UInt16)block[i-2]) << 8); 823 ftab[j]++; 824 quadrant[i-3] = 0; 825 j = (j >> 8) | ( ((UInt16)block[i-3]) << 8); 826 ftab[j]++; 827 } 828 for (; i >= 0; i--) { 829 quadrant[i] = 0; 830 j = (j >> 8) | ( ((UInt16)block[i]) << 8); 831 ftab[j]++; 832 } 833 834 /*-- (emphasises close relationship of block & quadrant) --*/ 835 for (i = 0; i < BZ_N_OVERSHOOT; i++) { 836 block [nblock+i] = block[i]; 837 quadrant[nblock+i] = 0; 838 } 839 840 if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" ); 841 842 /*-- Complete the initial radix sort --*/ 843 for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; 844 845 s = block[0] << 8; 846 i = nblock-1; 847 for (; i >= 3; i -= 4) { 848 s = (s >> 8) | (block[i] << 8); 849 j = ftab[s] -1; 850 ftab[s] = j; 851 ptr[j] = i; 852 s = (s >> 8) | (block[i-1] << 8); 853 j = ftab[s] -1; 854 ftab[s] = j; 855 ptr[j] = i-1; 856 s = (s >> 8) | (block[i-2] << 8); 857 j = ftab[s] -1; 858 ftab[s] = j; 859 ptr[j] = i-2; 860 s = (s >> 8) | (block[i-3] << 8); 861 j = ftab[s] -1; 862 ftab[s] = j; 863 ptr[j] = i-3; 864 } 865 for (; i >= 0; i--) { 866 s = (s >> 8) | (block[i] << 8); 867 j = ftab[s] -1; 868 ftab[s] = j; 869 ptr[j] = i; 870 } 871 872 /*-- 873 Now ftab contains the first loc of every small bucket. 874 Calculate the running order, from smallest to largest 875 big bucket. 876 --*/ 877 for (i = 0; i <= 255; i++) { 878 bigDone [i] = False; 879 runningOrder[i] = i; 880 } 881 882 { 883 Int32 vv; 884 Int32 h = 1; 885 do h = 3 * h + 1; while (h <= 256); 886 do { 887 h = h / 3; 888 for (i = h; i <= 255; i++) { 889 vv = runningOrder[i]; 890 j = i; 891 while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { 892 runningOrder[j] = runningOrder[j-h]; 893 j = j - h; 894 if (j <= (h - 1)) goto zero; 895 } 896 zero: 897 runningOrder[j] = vv; 898 } 899 } while (h != 1); 900 } 901 902 /*-- 903 The main sorting loop. 904 --*/ 905 906 numQSorted = 0; 907 908 for (i = 0; i <= 255; i++) { 909 910 /*-- 911 Process big buckets, starting with the least full. 912 Basically this is a 3-step process in which we call 913 mainQSort3 to sort the small buckets [ss, j], but 914 also make a big effort to avoid the calls if we can. 915 --*/ 916 ss = runningOrder[i]; 917 918 /*-- 919 Step 1: 920 Complete the big bucket [ss] by quicksorting 921 any unsorted small buckets [ss, j], for j != ss. 922 Hopefully previous pointer-scanning phases have already 923 completed many of the small buckets [ss, j], so 924 we don't have to sort them at all. 925 --*/ 926 for (j = 0; j <= 255; j++) { 927 if (j != ss) { 928 sb = (ss << 8) + j; 929 if ( ! (ftab[sb] & SETMASK) ) { 930 Int32 lo = ftab[sb] & CLEARMASK; 931 Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; 932 if (hi > lo) { 933 if (verb >= 4) 934 VPrintf4 ( " qsort [0x%x, 0x%x] " 935 "done %d this %d\n", 936 ss, j, numQSorted, hi - lo + 1 ); 937 mainQSort3 ( 938 ptr, block, quadrant, nblock, 939 lo, hi, BZ_N_RADIX, budget 940 ); 941 numQSorted += (hi - lo + 1); 942 if (*budget < 0) return; 943 } 944 } 945 ftab[sb] |= SETMASK; 946 } 947 } 948 949 AssertH ( !bigDone[ss], 1006 ); 950 951 /*-- 952 Step 2: 953 Now scan this big bucket [ss] so as to synthesise the 954 sorted order for small buckets [t, ss] for all t, 955 including, magically, the bucket [ss,ss] too. 956 This will avoid doing Real Work in subsequent Step 1's. 957 --*/ 958 { 959 for (j = 0; j <= 255; j++) { 960 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; 961 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; 962 } 963 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { 964 k = ptr[j]-1; if (k < 0) k += nblock; 965 c1 = block[k]; 966 if (!bigDone[c1]) 967 ptr[ copyStart[c1]++ ] = k; 968 } 969 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { 970 k = ptr[j]-1; if (k < 0) k += nblock; 971 c1 = block[k]; 972 if (!bigDone[c1]) 973 ptr[ copyEnd[c1]-- ] = k; 974 } 975 } 976 977 AssertH ( (copyStart[ss]-1 == copyEnd[ss]) 978 || 979 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. 980 Necessity for this case is demonstrated by compressing 981 a sequence of approximately 48.5 million of character 982 251; 1.0.0/1.0.1 will then die here. */ 983 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 984 1007 ) 985 986 for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; 987 988 /*-- 989 Step 3: 990 The [ss] big bucket is now done. Record this fact, 991 and update the quadrant descriptors. Remember to 992 update quadrants in the overshoot area too, if 993 necessary. The "if (i < 255)" test merely skips 994 this updating for the last bucket processed, since 995 updating for the last bucket is pointless. 996 997 The quadrant array provides a way to incrementally 998 cache sort orderings, as they appear, so as to 999 make subsequent comparisons in fullGtU() complete 1000 faster. For repetitive blocks this makes a big 1001 difference (but not big enough to be able to avoid 1002 the fallback sorting mechanism, exponential radix sort). 1003 1004 The precise meaning is: at all times: 1005 1006 for 0 <= i < nblock and 0 <= j <= nblock 1007 1008 if block[i] != block[j], 1009 1010 then the relative values of quadrant[i] and 1011 quadrant[j] are meaningless. 1012 1013 else { 1014 if quadrant[i] < quadrant[j] 1015 then the string starting at i lexicographically 1016 precedes the string starting at j 1017 1018 else if quadrant[i] > quadrant[j] 1019 then the string starting at j lexicographically 1020 precedes the string starting at i 1021 1022 else 1023 the relative ordering of the strings starting 1024 at i and j has not yet been determined. 1025 } 1026 --*/ 1027 bigDone[ss] = True; 1028 1029 if (i < 255) { 1030 Int32 bbStart = ftab[ss << 8] & CLEARMASK; 1031 Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; 1032 Int32 shifts = 0; 1033 1034 while ((bbSize >> shifts) > 65534) shifts++; 1035 1036 for (j = bbSize-1; j >= 0; j--) { 1037 Int32 a2update = ptr[bbStart + j]; 1038 UInt16 qVal = (UInt16)(j >> shifts); 1039 quadrant[a2update] = qVal; 1040 if (a2update < BZ_N_OVERSHOOT) 1041 quadrant[a2update + nblock] = qVal; 1042 } 1043 AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 ); 1044 } 1045 1046 } 1047 1048 if (verb >= 4) 1049 VPrintf3 ( " %d pointers, %d sorted, %d scanned\n", 1050 nblock, numQSorted, nblock - numQSorted ); 1051 } 1052 1053 #undef BIGFREQ 1054 #undef SETMASK 1055 #undef CLEARMASK 1056 1057 1058 /*---------------------------------------------*/ 1059 /* Pre: 1060 nblock > 0 1061 arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] 1062 ((UChar*)arr2) [0 .. nblock-1] holds block 1063 arr1 exists for [0 .. nblock-1] 1064 1065 Post: 1066 ((UChar*)arr2) [0 .. nblock-1] holds block 1067 All other areas of block destroyed 1068 ftab [ 0 .. 65536 ] destroyed 1069 arr1 [0 .. nblock-1] holds sorted order 1070 */ 1071 void BZ2_blockSort ( EState* s ) 1072 { 1073 UInt32* ptr = s->ptr; 1074 UChar* block = s->block; 1075 UInt32* ftab = s->ftab; 1076 Int32 nblock = s->nblock; 1077 Int32 verb = s->verbosity; 1078 Int32 wfact = s->workFactor; 1079 UInt16* quadrant; 1080 Int32 budget; 1081 Int32 budgetInit; 1082 Int32 i; 1083 1084 if (nblock < 10000) { 1085 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); 1086 } else { 1087 /* Calculate the location for quadrant, remembering to get 1088 the alignment right. Assumes that &(block[0]) is at least 1089 2-byte aligned -- this should be ok since block is really 1090 the first section of arr2. 1091 */ 1092 i = nblock+BZ_N_OVERSHOOT; 1093 if (i & 1) i++; 1094 quadrant = (UInt16*)(&(block[i])); 1095 1096 /* (wfact-1) / 3 puts the default-factor-30 1097 transition point at very roughly the same place as 1098 with v0.1 and v0.9.0. 1099 Not that it particularly matters any more, since the 1100 resulting compressed stream is now the same regardless 1101 of whether or not we use the main sort or fallback sort. 1102 */ 1103 if (wfact < 1 ) wfact = 1; 1104 if (wfact > 100) wfact = 100; 1105 budgetInit = nblock * ((wfact-1) / 3); 1106 budget = budgetInit; 1107 1108 mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget ); 1109 if (verb >= 3) 1110 VPrintf3 ( " %d work, %d block, ratio %5.2f\n", 1111 budgetInit - budget, 1112 nblock, 1113 (float)(budgetInit - budget) / 1114 (float)(nblock==0 ? 1 : nblock) ); 1115 if (budget < 0) { 1116 if (verb >= 2) 1117 VPrintf0 ( " too repetitive; using fallback" 1118 " sorting algorithm\n" ); 1119 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); 1120 } 1121 } 1122 1123 s->origPtr = -1; 1124 for (i = 0; i < s->nblock; i++) 1125 if (ptr[i] == 0) 1126 { s->origPtr = i; break; }; 1127 1128 AssertH( s->origPtr != -1, 1003 ); 1129 } 1130 1131 1132 /*-------------------------------------------------------------*/ 1133 /*--- end blocksort.c ---*/ 1134 /*-------------------------------------------------------------*/ 1135