xref: /openbmc/u-boot/lib/bzip2/bzlib_blocksort.c (revision 3335786a982578abf9a25e4d6ce67d3416ebe15e)
1 
2 /*-------------------------------------------------------------*/
3 /*--- Block sorting machinery                               ---*/
4 /*---                                           blocksort.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /*--
8   This file is a part of bzip2 and/or libbzip2, a program and
9   library for lossless, block-sorting data compression.
10 
11   Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
12 
13   Redistribution and use in source and binary forms, with or without
14   modification, are permitted provided that the following conditions
15   are met:
16 
17   1. Redistributions of source code must retain the above copyright
18      notice, this list of conditions and the following disclaimer.
19 
20   2. The origin of this software must not be misrepresented; you must
21      not claim that you wrote the original software.  If you use this
22      software in a product, an acknowledgment in the product
23      documentation would be appreciated but is not required.
24 
25   3. Altered source versions must be plainly marked as such, and must
26      not be misrepresented as being the original software.
27 
28   4. The name of the author may not be used to endorse or promote
29      products derived from this software without specific prior written
30      permission.
31 
32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 
44   Julian Seward, Cambridge, UK.
45   jseward@acm.org
46   bzip2/libbzip2 version 1.0.6 of 6 September 2010
47   Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
48 
49   This program is based on (at least) the work of:
50      Mike Burrows
51      David Wheeler
52      Peter Fenwick
53      Alistair Moffat
54      Radford Neal
55      Ian H. Witten
56      Robert Sedgewick
57      Jon L. Bentley
58 
59   For more information on these sources, see the manual.
60 --*/
61 
62 #include "bzlib_private.h"
63 
64 /*---------------------------------------------*/
65 /*--- Fallback O(N log(N)^2) sorting        ---*/
66 /*--- algorithm, for repetitive blocks      ---*/
67 /*---------------------------------------------*/
68 
69 /*---------------------------------------------*/
70 static
71 __inline__
72 void fallbackSimpleSort ( UInt32* fmap,
73                           UInt32* eclass,
74                           Int32   lo,
75                           Int32   hi )
76 {
77    Int32 i, j, tmp;
78    UInt32 ec_tmp;
79 
80    if (lo == hi) return;
81 
82    if (hi - lo > 3) {
83       for ( i = hi-4; i >= lo; i-- ) {
84          tmp = fmap[i];
85          ec_tmp = eclass[tmp];
86          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
87             fmap[j-4] = fmap[j];
88          fmap[j-4] = tmp;
89       }
90    }
91 
92    for ( i = hi-1; i >= lo; i-- ) {
93       tmp = fmap[i];
94       ec_tmp = eclass[tmp];
95       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
96          fmap[j-1] = fmap[j];
97       fmap[j-1] = tmp;
98    }
99 }
100 
101 
102 /*---------------------------------------------*/
103 #define fswap(zz1, zz2) \
104    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
105 
106 #define fvswap(zzp1, zzp2, zzn)       \
107 {                                     \
108    Int32 yyp1 = (zzp1);               \
109    Int32 yyp2 = (zzp2);               \
110    Int32 yyn  = (zzn);                \
111    while (yyn > 0) {                  \
112       fswap(fmap[yyp1], fmap[yyp2]);  \
113       yyp1++; yyp2++; yyn--;          \
114    }                                  \
115 }
116 
117 
118 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
119 
120 #define fpush(lz,hz) { stackLo[sp] = lz; \
121                        stackHi[sp] = hz; \
122                        sp++; }
123 
124 #define fpop(lz,hz) { sp--;              \
125                       lz = stackLo[sp];  \
126                       hz = stackHi[sp]; }
127 
128 #define FALLBACK_QSORT_SMALL_THRESH 10
129 #define FALLBACK_QSORT_STACK_SIZE   100
130 
131 
132 static
133 void fallbackQSort3 ( UInt32* fmap,
134                       UInt32* eclass,
135                       Int32   loSt,
136                       Int32   hiSt )
137 {
138    Int32 unLo, unHi, ltLo, gtHi, n, m;
139    Int32 sp, lo, hi;
140    UInt32 med, r, r3;
141    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
142    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
143 
144    r = 0;
145 
146    sp = 0;
147    fpush ( loSt, hiSt );
148 
149    while (sp > 0) {
150 
151       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
152 
153       fpop ( lo, hi );
154       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
155          fallbackSimpleSort ( fmap, eclass, lo, hi );
156          continue;
157       }
158 
159       /* Random partitioning.  Median of 3 sometimes fails to
160          avoid bad cases.  Median of 9 seems to help but
161          looks rather expensive.  This too seems to work but
162          is cheaper.  Guidance for the magic constants
163          7621 and 32768 is taken from Sedgewick's algorithms
164          book, chapter 35.
165       */
166       r = ((r * 7621) + 1) % 32768;
167       r3 = r % 3;
168       if (r3 == 0) med = eclass[fmap[lo]]; else
169       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
170                    med = eclass[fmap[hi]];
171 
172       unLo = ltLo = lo;
173       unHi = gtHi = hi;
174 
175       while (1) {
176          while (1) {
177             if (unLo > unHi) break;
178             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
179             if (n == 0) {
180                fswap(fmap[unLo], fmap[ltLo]);
181                ltLo++; unLo++;
182                continue;
183             };
184             if (n > 0) break;
185             unLo++;
186          }
187          while (1) {
188             if (unLo > unHi) break;
189             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
190             if (n == 0) {
191                fswap(fmap[unHi], fmap[gtHi]);
192                gtHi--; unHi--;
193                continue;
194             };
195             if (n < 0) break;
196             unHi--;
197          }
198          if (unLo > unHi) break;
199          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
200       }
201 
202       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
203 
204       if (gtHi < ltLo) continue;
205 
206       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
207       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
208 
209       n = lo + unLo - ltLo - 1;
210       m = hi - (gtHi - unHi) + 1;
211 
212       if (n - lo > hi - m) {
213          fpush ( lo, n );
214          fpush ( m, hi );
215       } else {
216          fpush ( m, hi );
217          fpush ( lo, n );
218       }
219    }
220 }
221 
222 #undef fmin
223 #undef fpush
224 #undef fpop
225 #undef fswap
226 #undef fvswap
227 #undef FALLBACK_QSORT_SMALL_THRESH
228 #undef FALLBACK_QSORT_STACK_SIZE
229 
230 
231 /*---------------------------------------------*/
232 /* Pre:
233       nblock > 0
234       eclass exists for [0 .. nblock-1]
235       ((UChar*)eclass) [0 .. nblock-1] holds block
236       ptr exists for [0 .. nblock-1]
237 
238    Post:
239       ((UChar*)eclass) [0 .. nblock-1] holds block
240       All other areas of eclass destroyed
241       fmap [0 .. nblock-1] holds sorted order
242       bhtab [ 0 .. 2+(nblock/32) ] destroyed
243 */
244 
245 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
246 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
247 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
248 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
249 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
250 
251 static
252 void fallbackSort ( UInt32* fmap,
253                     UInt32* eclass,
254                     UInt32* bhtab,
255                     Int32   nblock,
256                     Int32   verb )
257 {
258    Int32 ftab[257];
259    Int32 ftabCopy[256];
260    Int32 H, i, j, k, l, r, cc, cc1;
261    Int32 nNotDone;
262    Int32 nBhtab;
263    UChar* eclass8 = (UChar*)eclass;
264 
265    /*--
266       Initial 1-char radix sort to generate
267       initial fmap and initial BH bits.
268    --*/
269    if (verb >= 4)
270       VPrintf0 ( "        bucket sorting ...\n" );
271    for (i = 0; i < 257;    i++) ftab[i] = 0;
272    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
273    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
274    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
275 
276    for (i = 0; i < nblock; i++) {
277       j = eclass8[i];
278       k = ftab[j] - 1;
279       ftab[j] = k;
280       fmap[k] = i;
281    }
282 
283    nBhtab = 2 + (nblock / 32);
284    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
285    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
286 
287    /*--
288       Inductively refine the buckets.  Kind-of an
289       "exponential radix sort" (!), inspired by the
290       Manber-Myers suffix array construction algorithm.
291    --*/
292 
293    /*-- set sentinel bits for block-end detection --*/
294    for (i = 0; i < 32; i++) {
295       SET_BH(nblock + 2*i);
296       CLEAR_BH(nblock + 2*i + 1);
297    }
298 
299    /*-- the log(N) loop --*/
300    H = 1;
301    while (1) {
302 
303       if (verb >= 4)
304          VPrintf1 ( "        depth %6d has ", H );
305 
306       j = 0;
307       for (i = 0; i < nblock; i++) {
308          if (ISSET_BH(i)) j = i;
309          k = fmap[i] - H; if (k < 0) k += nblock;
310          eclass[k] = j;
311       }
312 
313       nNotDone = 0;
314       r = -1;
315       while (1) {
316 
317 	 /*-- find the next non-singleton bucket --*/
318          k = r + 1;
319          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
320          if (ISSET_BH(k)) {
321             while (WORD_BH(k) == 0xffffffff) k += 32;
322             while (ISSET_BH(k)) k++;
323          }
324          l = k - 1;
325          if (l >= nblock) break;
326          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
327          if (!ISSET_BH(k)) {
328             while (WORD_BH(k) == 0x00000000) k += 32;
329             while (!ISSET_BH(k)) k++;
330          }
331          r = k - 1;
332          if (r >= nblock) break;
333 
334          /*-- now [l, r] bracket current bucket --*/
335          if (r > l) {
336             nNotDone += (r - l + 1);
337             fallbackQSort3 ( fmap, eclass, l, r );
338 
339             /*-- scan bucket and generate header bits-- */
340             cc = -1;
341             for (i = l; i <= r; i++) {
342                cc1 = eclass[fmap[i]];
343                if (cc != cc1) { SET_BH(i); cc = cc1; };
344             }
345          }
346       }
347 
348       if (verb >= 4)
349          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
350 
351       H *= 2;
352       if (H > nblock || nNotDone == 0) break;
353    }
354 
355    /*--
356       Reconstruct the original block in
357       eclass8 [0 .. nblock-1], since the
358       previous phase destroyed it.
359    --*/
360    if (verb >= 4)
361       VPrintf0 ( "        reconstructing block ...\n" );
362    j = 0;
363    for (i = 0; i < nblock; i++) {
364       while (ftabCopy[j] == 0) j++;
365       ftabCopy[j]--;
366       eclass8[fmap[i]] = (UChar)j;
367    }
368    AssertH ( j < 256, 1005 );
369 }
370 
371 #undef       SET_BH
372 #undef     CLEAR_BH
373 #undef     ISSET_BH
374 #undef      WORD_BH
375 #undef UNALIGNED_BH
376 
377 
378 /*---------------------------------------------*/
379 /*--- The main, O(N^2 log(N)) sorting       ---*/
380 /*--- algorithm.  Faster for "normal"       ---*/
381 /*--- non-repetitive blocks.                ---*/
382 /*---------------------------------------------*/
383 
384 /*---------------------------------------------*/
385 static
386 __inline__
387 Bool mainGtU ( UInt32  i1,
388                UInt32  i2,
389                UChar*  block,
390                UInt16* quadrant,
391                UInt32  nblock,
392                Int32*  budget )
393 {
394    Int32  k;
395    UChar  c1, c2;
396    UInt16 s1, s2;
397 
398    AssertD ( i1 != i2, "mainGtU" );
399    /* 1 */
400    c1 = block[i1]; c2 = block[i2];
401    if (c1 != c2) return (c1 > c2);
402    i1++; i2++;
403    /* 2 */
404    c1 = block[i1]; c2 = block[i2];
405    if (c1 != c2) return (c1 > c2);
406    i1++; i2++;
407    /* 3 */
408    c1 = block[i1]; c2 = block[i2];
409    if (c1 != c2) return (c1 > c2);
410    i1++; i2++;
411    /* 4 */
412    c1 = block[i1]; c2 = block[i2];
413    if (c1 != c2) return (c1 > c2);
414    i1++; i2++;
415    /* 5 */
416    c1 = block[i1]; c2 = block[i2];
417    if (c1 != c2) return (c1 > c2);
418    i1++; i2++;
419    /* 6 */
420    c1 = block[i1]; c2 = block[i2];
421    if (c1 != c2) return (c1 > c2);
422    i1++; i2++;
423    /* 7 */
424    c1 = block[i1]; c2 = block[i2];
425    if (c1 != c2) return (c1 > c2);
426    i1++; i2++;
427    /* 8 */
428    c1 = block[i1]; c2 = block[i2];
429    if (c1 != c2) return (c1 > c2);
430    i1++; i2++;
431    /* 9 */
432    c1 = block[i1]; c2 = block[i2];
433    if (c1 != c2) return (c1 > c2);
434    i1++; i2++;
435    /* 10 */
436    c1 = block[i1]; c2 = block[i2];
437    if (c1 != c2) return (c1 > c2);
438    i1++; i2++;
439    /* 11 */
440    c1 = block[i1]; c2 = block[i2];
441    if (c1 != c2) return (c1 > c2);
442    i1++; i2++;
443    /* 12 */
444    c1 = block[i1]; c2 = block[i2];
445    if (c1 != c2) return (c1 > c2);
446    i1++; i2++;
447 
448    k = nblock + 8;
449 
450    do {
451       /* 1 */
452       c1 = block[i1]; c2 = block[i2];
453       if (c1 != c2) return (c1 > c2);
454       s1 = quadrant[i1]; s2 = quadrant[i2];
455       if (s1 != s2) return (s1 > s2);
456       i1++; i2++;
457       /* 2 */
458       c1 = block[i1]; c2 = block[i2];
459       if (c1 != c2) return (c1 > c2);
460       s1 = quadrant[i1]; s2 = quadrant[i2];
461       if (s1 != s2) return (s1 > s2);
462       i1++; i2++;
463       /* 3 */
464       c1 = block[i1]; c2 = block[i2];
465       if (c1 != c2) return (c1 > c2);
466       s1 = quadrant[i1]; s2 = quadrant[i2];
467       if (s1 != s2) return (s1 > s2);
468       i1++; i2++;
469       /* 4 */
470       c1 = block[i1]; c2 = block[i2];
471       if (c1 != c2) return (c1 > c2);
472       s1 = quadrant[i1]; s2 = quadrant[i2];
473       if (s1 != s2) return (s1 > s2);
474       i1++; i2++;
475       /* 5 */
476       c1 = block[i1]; c2 = block[i2];
477       if (c1 != c2) return (c1 > c2);
478       s1 = quadrant[i1]; s2 = quadrant[i2];
479       if (s1 != s2) return (s1 > s2);
480       i1++; i2++;
481       /* 6 */
482       c1 = block[i1]; c2 = block[i2];
483       if (c1 != c2) return (c1 > c2);
484       s1 = quadrant[i1]; s2 = quadrant[i2];
485       if (s1 != s2) return (s1 > s2);
486       i1++; i2++;
487       /* 7 */
488       c1 = block[i1]; c2 = block[i2];
489       if (c1 != c2) return (c1 > c2);
490       s1 = quadrant[i1]; s2 = quadrant[i2];
491       if (s1 != s2) return (s1 > s2);
492       i1++; i2++;
493       /* 8 */
494       c1 = block[i1]; c2 = block[i2];
495       if (c1 != c2) return (c1 > c2);
496       s1 = quadrant[i1]; s2 = quadrant[i2];
497       if (s1 != s2) return (s1 > s2);
498       i1++; i2++;
499 
500       if (i1 >= nblock) i1 -= nblock;
501       if (i2 >= nblock) i2 -= nblock;
502 
503       k -= 8;
504       (*budget)--;
505    }
506       while (k >= 0);
507 
508    return False;
509 }
510 
511 
512 /*---------------------------------------------*/
513 /*--
514    Knuth's increments seem to work better
515    than Incerpi-Sedgewick here.  Possibly
516    because the number of elems to sort is
517    usually small, typically <= 20.
518 --*/
519 static
520 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
521                    9841, 29524, 88573, 265720,
522                    797161, 2391484 };
523 
524 static
525 void mainSimpleSort ( UInt32* ptr,
526                       UChar*  block,
527                       UInt16* quadrant,
528                       Int32   nblock,
529                       Int32   lo,
530                       Int32   hi,
531                       Int32   d,
532                       Int32*  budget )
533 {
534    Int32 i, j, h, bigN, hp;
535    UInt32 v;
536 
537    bigN = hi - lo + 1;
538    if (bigN < 2) return;
539 
540    hp = 0;
541    while (incs[hp] < bigN) hp++;
542    hp--;
543 
544    for (; hp >= 0; hp--) {
545       h = incs[hp];
546 
547       i = lo + h;
548       while (True) {
549 
550          /*-- copy 1 --*/
551          if (i > hi) break;
552          v = ptr[i];
553          j = i;
554          while ( mainGtU (
555                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
556                  ) ) {
557             ptr[j] = ptr[j-h];
558             j = j - h;
559             if (j <= (lo + h - 1)) break;
560          }
561          ptr[j] = v;
562          i++;
563 
564          /*-- copy 2 --*/
565          if (i > hi) break;
566          v = ptr[i];
567          j = i;
568          while ( mainGtU (
569                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
570                  ) ) {
571             ptr[j] = ptr[j-h];
572             j = j - h;
573             if (j <= (lo + h - 1)) break;
574          }
575          ptr[j] = v;
576          i++;
577 
578          /*-- copy 3 --*/
579          if (i > hi) break;
580          v = ptr[i];
581          j = i;
582          while ( mainGtU (
583                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
584                  ) ) {
585             ptr[j] = ptr[j-h];
586             j = j - h;
587             if (j <= (lo + h - 1)) break;
588          }
589          ptr[j] = v;
590          i++;
591 
592          if (*budget < 0) return;
593       }
594    }
595 }
596 
597 
598 /*---------------------------------------------*/
599 /*--
600    The following is an implementation of
601    an elegant 3-way quicksort for strings,
602    described in a paper "Fast Algorithms for
603    Sorting and Searching Strings", by Robert
604    Sedgewick and Jon L. Bentley.
605 --*/
606 
607 #define mswap(zz1, zz2) \
608    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
609 
610 #define mvswap(zzp1, zzp2, zzn)       \
611 {                                     \
612    Int32 yyp1 = (zzp1);               \
613    Int32 yyp2 = (zzp2);               \
614    Int32 yyn  = (zzn);                \
615    while (yyn > 0) {                  \
616       mswap(ptr[yyp1], ptr[yyp2]);    \
617       yyp1++; yyp2++; yyn--;          \
618    }                                  \
619 }
620 
621 static
622 __inline__
623 UChar mmed3 ( UChar a, UChar b, UChar c )
624 {
625    UChar t;
626    if (a > b) { t = a; a = b; b = t; };
627    if (b > c) {
628       b = c;
629       if (a > b) b = a;
630    }
631    return b;
632 }
633 
634 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
635 
636 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
637                           stackHi[sp] = hz; \
638                           stackD [sp] = dz; \
639                           sp++; }
640 
641 #define mpop(lz,hz,dz) { sp--;             \
642                          lz = stackLo[sp]; \
643                          hz = stackHi[sp]; \
644                          dz = stackD [sp]; }
645 
646 
647 #define mnextsize(az) (nextHi[az]-nextLo[az])
648 
649 #define mnextswap(az,bz)                                        \
650    { Int32 tz;                                                  \
651      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
652      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
653      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
654 
655 
656 #define MAIN_QSORT_SMALL_THRESH 20
657 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
658 #define MAIN_QSORT_STACK_SIZE 100
659 
660 static
661 void mainQSort3 ( UInt32* ptr,
662                   UChar*  block,
663                   UInt16* quadrant,
664                   Int32   nblock,
665                   Int32   loSt,
666                   Int32   hiSt,
667                   Int32   dSt,
668                   Int32*  budget )
669 {
670    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
671    Int32 sp, lo, hi, d;
672 
673    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
674    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
675    Int32 stackD [MAIN_QSORT_STACK_SIZE];
676 
677    Int32 nextLo[3];
678    Int32 nextHi[3];
679    Int32 nextD [3];
680 
681    sp = 0;
682    mpush ( loSt, hiSt, dSt );
683 
684    while (sp > 0) {
685 
686       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
687 
688       mpop ( lo, hi, d );
689       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
690           d > MAIN_QSORT_DEPTH_THRESH) {
691          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
692          if (*budget < 0) return;
693          continue;
694       }
695 
696       med = (Int32)
697             mmed3 ( block[ptr[ lo         ]+d],
698                     block[ptr[ hi         ]+d],
699                     block[ptr[ (lo+hi)>>1 ]+d] );
700 
701       unLo = ltLo = lo;
702       unHi = gtHi = hi;
703 
704       while (True) {
705          while (True) {
706             if (unLo > unHi) break;
707             n = ((Int32)block[ptr[unLo]+d]) - med;
708             if (n == 0) {
709                mswap(ptr[unLo], ptr[ltLo]);
710                ltLo++; unLo++; continue;
711             };
712             if (n >  0) break;
713             unLo++;
714          }
715          while (True) {
716             if (unLo > unHi) break;
717             n = ((Int32)block[ptr[unHi]+d]) - med;
718             if (n == 0) {
719                mswap(ptr[unHi], ptr[gtHi]);
720                gtHi--; unHi--; continue;
721             };
722             if (n <  0) break;
723             unHi--;
724          }
725          if (unLo > unHi) break;
726          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
727       }
728 
729       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
730 
731       if (gtHi < ltLo) {
732          mpush(lo, hi, d+1 );
733          continue;
734       }
735 
736       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
737       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
738 
739       n = lo + unLo - ltLo - 1;
740       m = hi - (gtHi - unHi) + 1;
741 
742       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
743       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
744       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
745 
746       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
747       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
748       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
749 
750       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
751       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
752 
753       mpush (nextLo[0], nextHi[0], nextD[0]);
754       mpush (nextLo[1], nextHi[1], nextD[1]);
755       mpush (nextLo[2], nextHi[2], nextD[2]);
756    }
757 }
758 
759 #undef mswap
760 #undef mvswap
761 #undef mpush
762 #undef mpop
763 #undef mmin
764 #undef mnextsize
765 #undef mnextswap
766 #undef MAIN_QSORT_SMALL_THRESH
767 #undef MAIN_QSORT_DEPTH_THRESH
768 #undef MAIN_QSORT_STACK_SIZE
769 
770 
771 /*---------------------------------------------*/
772 /* Pre:
773       nblock > N_OVERSHOOT
774       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
775       ((UChar*)block32) [0 .. nblock-1] holds block
776       ptr exists for [0 .. nblock-1]
777 
778    Post:
779       ((UChar*)block32) [0 .. nblock-1] holds block
780       All other areas of block32 destroyed
781       ftab [0 .. 65536 ] destroyed
782       ptr [0 .. nblock-1] holds sorted order
783       if (*budget < 0), sorting was abandoned
784 */
785 
786 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
787 #define SETMASK (1 << 21)
788 #define CLEARMASK (~(SETMASK))
789 
790 static
791 void mainSort ( UInt32* ptr,
792                 UChar*  block,
793                 UInt16* quadrant,
794                 UInt32* ftab,
795                 Int32   nblock,
796                 Int32   verb,
797                 Int32*  budget )
798 {
799    Int32  i, j, k, ss, sb;
800    Int32  runningOrder[256];
801    Bool   bigDone[256];
802    Int32  copyStart[256];
803    Int32  copyEnd  [256];
804    UChar  c1;
805    Int32  numQSorted;
806    UInt16 s;
807    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
808 
809    /*-- set up the 2-byte frequency table --*/
810    for (i = 65536; i >= 0; i--) ftab[i] = 0;
811 
812    j = block[0] << 8;
813    i = nblock-1;
814    for (; i >= 3; i -= 4) {
815       quadrant[i] = 0;
816       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
817       ftab[j]++;
818       quadrant[i-1] = 0;
819       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
820       ftab[j]++;
821       quadrant[i-2] = 0;
822       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
823       ftab[j]++;
824       quadrant[i-3] = 0;
825       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
826       ftab[j]++;
827    }
828    for (; i >= 0; i--) {
829       quadrant[i] = 0;
830       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
831       ftab[j]++;
832    }
833 
834    /*-- (emphasises close relationship of block & quadrant) --*/
835    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
836       block   [nblock+i] = block[i];
837       quadrant[nblock+i] = 0;
838    }
839 
840    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
841 
842    /*-- Complete the initial radix sort --*/
843    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
844 
845    s = block[0] << 8;
846    i = nblock-1;
847    for (; i >= 3; i -= 4) {
848       s = (s >> 8) | (block[i] << 8);
849       j = ftab[s] -1;
850       ftab[s] = j;
851       ptr[j] = i;
852       s = (s >> 8) | (block[i-1] << 8);
853       j = ftab[s] -1;
854       ftab[s] = j;
855       ptr[j] = i-1;
856       s = (s >> 8) | (block[i-2] << 8);
857       j = ftab[s] -1;
858       ftab[s] = j;
859       ptr[j] = i-2;
860       s = (s >> 8) | (block[i-3] << 8);
861       j = ftab[s] -1;
862       ftab[s] = j;
863       ptr[j] = i-3;
864    }
865    for (; i >= 0; i--) {
866       s = (s >> 8) | (block[i] << 8);
867       j = ftab[s] -1;
868       ftab[s] = j;
869       ptr[j] = i;
870    }
871 
872    /*--
873       Now ftab contains the first loc of every small bucket.
874       Calculate the running order, from smallest to largest
875       big bucket.
876    --*/
877    for (i = 0; i <= 255; i++) {
878       bigDone     [i] = False;
879       runningOrder[i] = i;
880    }
881 
882    {
883       Int32 vv;
884       Int32 h = 1;
885       do h = 3 * h + 1; while (h <= 256);
886       do {
887          h = h / 3;
888          for (i = h; i <= 255; i++) {
889             vv = runningOrder[i];
890             j = i;
891             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
892                runningOrder[j] = runningOrder[j-h];
893                j = j - h;
894                if (j <= (h - 1)) goto zero;
895             }
896             zero:
897             runningOrder[j] = vv;
898          }
899       } while (h != 1);
900    }
901 
902    /*--
903       The main sorting loop.
904    --*/
905 
906    numQSorted = 0;
907 
908    for (i = 0; i <= 255; i++) {
909 
910       /*--
911          Process big buckets, starting with the least full.
912          Basically this is a 3-step process in which we call
913          mainQSort3 to sort the small buckets [ss, j], but
914          also make a big effort to avoid the calls if we can.
915       --*/
916       ss = runningOrder[i];
917 
918       /*--
919          Step 1:
920          Complete the big bucket [ss] by quicksorting
921          any unsorted small buckets [ss, j], for j != ss.
922          Hopefully previous pointer-scanning phases have already
923          completed many of the small buckets [ss, j], so
924          we don't have to sort them at all.
925       --*/
926       for (j = 0; j <= 255; j++) {
927          if (j != ss) {
928             sb = (ss << 8) + j;
929             if ( ! (ftab[sb] & SETMASK) ) {
930                Int32 lo = ftab[sb]   & CLEARMASK;
931                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
932                if (hi > lo) {
933                   if (verb >= 4)
934                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
935                                 "done %d   this %d\n",
936                                 ss, j, numQSorted, hi - lo + 1 );
937                   mainQSort3 (
938                      ptr, block, quadrant, nblock,
939                      lo, hi, BZ_N_RADIX, budget
940                   );
941                   numQSorted += (hi - lo + 1);
942                   if (*budget < 0) return;
943                }
944             }
945             ftab[sb] |= SETMASK;
946          }
947       }
948 
949       AssertH ( !bigDone[ss], 1006 );
950 
951       /*--
952          Step 2:
953          Now scan this big bucket [ss] so as to synthesise the
954          sorted order for small buckets [t, ss] for all t,
955          including, magically, the bucket [ss,ss] too.
956          This will avoid doing Real Work in subsequent Step 1's.
957       --*/
958       {
959          for (j = 0; j <= 255; j++) {
960             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
961             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
962          }
963          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
964             k = ptr[j]-1; if (k < 0) k += nblock;
965             c1 = block[k];
966             if (!bigDone[c1])
967                ptr[ copyStart[c1]++ ] = k;
968          }
969          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
970             k = ptr[j]-1; if (k < 0) k += nblock;
971             c1 = block[k];
972             if (!bigDone[c1])
973                ptr[ copyEnd[c1]-- ] = k;
974          }
975       }
976 
977       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
978                 ||
979                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
980                    Necessity for this case is demonstrated by compressing
981                    a sequence of approximately 48.5 million of character
982                    251; 1.0.0/1.0.1 will then die here. */
983                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
984                 1007 )
985 
986       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
987 
988       /*--
989          Step 3:
990          The [ss] big bucket is now done.  Record this fact,
991          and update the quadrant descriptors.  Remember to
992          update quadrants in the overshoot area too, if
993          necessary.  The "if (i < 255)" test merely skips
994          this updating for the last bucket processed, since
995          updating for the last bucket is pointless.
996 
997          The quadrant array provides a way to incrementally
998          cache sort orderings, as they appear, so as to
999          make subsequent comparisons in fullGtU() complete
1000          faster.  For repetitive blocks this makes a big
1001          difference (but not big enough to be able to avoid
1002          the fallback sorting mechanism, exponential radix sort).
1003 
1004          The precise meaning is: at all times:
1005 
1006             for 0 <= i < nblock and 0 <= j <= nblock
1007 
1008             if block[i] != block[j],
1009 
1010                then the relative values of quadrant[i] and
1011                     quadrant[j] are meaningless.
1012 
1013                else {
1014                   if quadrant[i] < quadrant[j]
1015                      then the string starting at i lexicographically
1016                      precedes the string starting at j
1017 
1018                   else if quadrant[i] > quadrant[j]
1019                      then the string starting at j lexicographically
1020                      precedes the string starting at i
1021 
1022                   else
1023                      the relative ordering of the strings starting
1024                      at i and j has not yet been determined.
1025                }
1026       --*/
1027       bigDone[ss] = True;
1028 
1029       if (i < 255) {
1030          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1031          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1032          Int32 shifts   = 0;
1033 
1034          while ((bbSize >> shifts) > 65534) shifts++;
1035 
1036          for (j = bbSize-1; j >= 0; j--) {
1037             Int32 a2update     = ptr[bbStart + j];
1038             UInt16 qVal        = (UInt16)(j >> shifts);
1039             quadrant[a2update] = qVal;
1040             if (a2update < BZ_N_OVERSHOOT)
1041                quadrant[a2update + nblock] = qVal;
1042          }
1043          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1044       }
1045 
1046    }
1047 
1048    if (verb >= 4)
1049       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1050                  nblock, numQSorted, nblock - numQSorted );
1051 }
1052 
1053 #undef BIGFREQ
1054 #undef SETMASK
1055 #undef CLEARMASK
1056 
1057 
1058 /*---------------------------------------------*/
1059 /* Pre:
1060       nblock > 0
1061       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1062       ((UChar*)arr2)  [0 .. nblock-1] holds block
1063       arr1 exists for [0 .. nblock-1]
1064 
1065    Post:
1066       ((UChar*)arr2) [0 .. nblock-1] holds block
1067       All other areas of block destroyed
1068       ftab [ 0 .. 65536 ] destroyed
1069       arr1 [0 .. nblock-1] holds sorted order
1070 */
1071 void BZ2_blockSort ( EState* s )
1072 {
1073    UInt32* ptr    = s->ptr;
1074    UChar*  block  = s->block;
1075    UInt32* ftab   = s->ftab;
1076    Int32   nblock = s->nblock;
1077    Int32   verb   = s->verbosity;
1078    Int32   wfact  = s->workFactor;
1079    UInt16* quadrant;
1080    Int32   budget;
1081    Int32   budgetInit;
1082    Int32   i;
1083 
1084    if (nblock < 10000) {
1085       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1086    } else {
1087       /* Calculate the location for quadrant, remembering to get
1088          the alignment right.  Assumes that &(block[0]) is at least
1089          2-byte aligned -- this should be ok since block is really
1090          the first section of arr2.
1091       */
1092       i = nblock+BZ_N_OVERSHOOT;
1093       if (i & 1) i++;
1094       quadrant = (UInt16*)(&(block[i]));
1095 
1096       /* (wfact-1) / 3 puts the default-factor-30
1097          transition point at very roughly the same place as
1098          with v0.1 and v0.9.0.
1099          Not that it particularly matters any more, since the
1100          resulting compressed stream is now the same regardless
1101          of whether or not we use the main sort or fallback sort.
1102       */
1103       if (wfact < 1  ) wfact = 1;
1104       if (wfact > 100) wfact = 100;
1105       budgetInit = nblock * ((wfact-1) / 3);
1106       budget = budgetInit;
1107 
1108       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1109       if (verb >= 3)
1110          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1111                     budgetInit - budget,
1112                     nblock,
1113                     (float)(budgetInit - budget) /
1114                     (float)(nblock==0 ? 1 : nblock) );
1115       if (budget < 0) {
1116          if (verb >= 2)
1117             VPrintf0 ( "    too repetitive; using fallback"
1118                        " sorting algorithm\n" );
1119          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1120       }
1121    }
1122 
1123    s->origPtr = -1;
1124    for (i = 0; i < s->nblock; i++)
1125       if (ptr[i] == 0)
1126          { s->origPtr = i; break; };
1127 
1128    AssertH( s->origPtr != -1, 1003 );
1129 }
1130 
1131 
1132 /*-------------------------------------------------------------*/
1133 /*--- end                                       blocksort.c ---*/
1134 /*-------------------------------------------------------------*/
1135