1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 /* P9 gunzip sample code for demonstrating the P9 NX hardware 4 * interface. Not intended for productive uses or for performance or 5 * compression ratio measurements. Note also that /dev/crypto/gzip, 6 * VAS and skiboot support are required 7 * 8 * Copyright 2020 IBM Corp. 9 * 10 * Author: Bulent Abali <abali@us.ibm.com> 11 * 12 * https://github.com/libnxz/power-gzip for zlib api and other utils 13 * Definitions of acronyms used here. See 14 * P9 NX Gzip Accelerator User's Manual for details: 15 * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf 16 * 17 * adler/crc: 32 bit checksums appended to stream tail 18 * ce: completion extension 19 * cpb: coprocessor parameter block (metadata) 20 * crb: coprocessor request block (command) 21 * csb: coprocessor status block (status) 22 * dht: dynamic huffman table 23 * dde: data descriptor element (address, length) 24 * ddl: list of ddes 25 * dh/fh: dynamic and fixed huffman types 26 * fc: coprocessor function code 27 * histlen: history/dictionary length 28 * history: sliding window of up to 32KB of data 29 * lzcount: Deflate LZ symbol counts 30 * rembytecnt: remaining byte count 31 * sfbt: source final block type; last block's type during decomp 32 * spbc: source processed byte count 33 * subc: source unprocessed bit count 34 * tebc: target ending bit count; valid bits in the last byte 35 * tpbc: target processed byte count 36 * vas: virtual accelerator switch; the user mode interface 37 */ 38 39 #define _ISOC11_SOURCE // For aligned_alloc() 40 #define _DEFAULT_SOURCE // For endian.h 41 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include <stdint.h> 47 #include <sys/types.h> 48 #include <sys/stat.h> 49 #include <sys/time.h> 50 #include <sys/fcntl.h> 51 #include <sys/mman.h> 52 #include <endian.h> 53 #include <bits/endian.h> 54 #include <sys/ioctl.h> 55 #include <assert.h> 56 #include <errno.h> 57 #include <signal.h> 58 #include "nxu.h" 59 #include "nx.h" 60 #include "crb.h" 61 62 int nx_dbg; 63 FILE *nx_gzip_log; 64 65 #define NX_MIN(X, Y) (((X) < (Y))?(X):(Y)) 66 #define NX_MAX(X, Y) (((X) > (Y))?(X):(Y)) 67 68 #define GETINPC(X) fgetc(X) 69 #define FNAME_MAX 1024 70 71 /* fifo queue management */ 72 #define fifo_used_bytes(used) (used) 73 #define fifo_free_bytes(used, len) ((len)-(used)) 74 /* amount of free bytes in the first and last parts */ 75 #define fifo_free_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ 76 ? (len)-((cur)+(used)) : 0) 77 #define fifo_free_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ 78 ? (cur) : (len)-(used)) 79 /* amount of used bytes in the first and last parts */ 80 #define fifo_used_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ 81 ? (used) : (len)-(cur)) 82 #define fifo_used_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ 83 ? 0 : ((used)+(cur))-(len)) 84 /* first and last free parts start here */ 85 #define fifo_free_first_offset(cur, used) ((cur)+(used)) 86 #define fifo_free_last_offset(cur, used, len) \ 87 fifo_used_last_bytes(cur, used, len) 88 /* first and last used parts start here */ 89 #define fifo_used_first_offset(cur) (cur) 90 #define fifo_used_last_offset(cur) (0) 91 92 const int fifo_in_len = 1<<24; 93 const int fifo_out_len = 1<<24; 94 const int page_sz = 1<<16; 95 const int line_sz = 1<<7; 96 const int window_max = 1<<15; 97 98 /* 99 * Adds an (address, len) pair to the list of ddes (ddl) and updates 100 * the base dde. ddl[0] is the only dde in a direct dde which 101 * contains a single (addr,len) pair. For more pairs, ddl[0] becomes 102 * the indirect (base) dde that points to a list of direct ddes. 103 * See Section 6.4 of the NX-gzip user manual for DDE description. 104 * Addr=NULL, len=0 clears the ddl[0]. Returns the total number of 105 * bytes in ddl. Caller is responsible for allocting the array of 106 * nx_dde_t *ddl. If N addresses are required in the scatter-gather 107 * list, the ddl array must have N+1 entries minimum. 108 */ 109 static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr, 110 uint32_t len) 111 { 112 uint32_t ddecnt; 113 uint32_t bytes; 114 115 if (addr == NULL && len == 0) { 116 clearp_dde(ddl); 117 return 0; 118 } 119 120 NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr, 121 __func__, len)); 122 123 /* Number of ddes in the dde list ; == 0 when it is a direct dde */ 124 ddecnt = getpnn(ddl, dde_count); 125 bytes = getp32(ddl, ddebc); 126 127 if (ddecnt == 0 && bytes == 0) { 128 /* First dde is unused; make it a direct dde */ 129 bytes = len; 130 putp32(ddl, ddebc, bytes); 131 putp64(ddl, ddead, (uint64_t) addr); 132 } else if (ddecnt == 0) { 133 /* Converting direct to indirect dde 134 * ddl[0] becomes head dde of ddl 135 * copy direct to indirect first. 136 */ 137 ddl[1] = ddl[0]; 138 139 /* Add the new dde next */ 140 clear_dde(ddl[2]); 141 put32(ddl[2], ddebc, len); 142 put64(ddl[2], ddead, (uint64_t) addr); 143 144 /* Ddl head points to 2 direct ddes */ 145 ddecnt = 2; 146 putpnn(ddl, dde_count, ddecnt); 147 bytes = bytes + len; 148 putp32(ddl, ddebc, bytes); 149 /* Pointer to the first direct dde */ 150 putp64(ddl, ddead, (uint64_t) &ddl[1]); 151 } else { 152 /* Append a dde to an existing indirect ddl */ 153 ++ddecnt; 154 clear_dde(ddl[ddecnt]); 155 put64(ddl[ddecnt], ddead, (uint64_t) addr); 156 put32(ddl[ddecnt], ddebc, len); 157 158 putpnn(ddl, dde_count, ddecnt); 159 bytes = bytes + len; 160 putp32(ddl, ddebc, bytes); /* byte sum of all dde */ 161 } 162 return bytes; 163 } 164 165 /* 166 * Touch specified number of pages represented in number bytes 167 * beginning from the first buffer in a dde list. 168 * Do not touch the pages past buf_sz-th byte's page. 169 * 170 * Set buf_sz = 0 to touch all pages described by the ddep. 171 */ 172 static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz, 173 int wr) 174 { 175 uint32_t indirect_count; 176 uint32_t buf_len; 177 long total; 178 uint64_t buf_addr; 179 struct nx_dde_t *dde_list; 180 int i; 181 182 assert(!!ddep); 183 184 indirect_count = getpnn(ddep, dde_count); 185 186 NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__, 187 indirect_count)); 188 NXPRT(fprintf(stderr, "0x%lx\n", buf_sz)); 189 190 if (indirect_count == 0) { 191 /* Direct dde */ 192 buf_len = getp32(ddep, ddebc); 193 buf_addr = getp64(ddep, ddead); 194 195 NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n", 196 buf_len, (void *)buf_addr)); 197 198 if (buf_sz == 0) 199 nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); 200 else 201 nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len, 202 buf_sz), page_sz, wr); 203 204 return ERR_NX_OK; 205 } 206 207 /* Indirect dde */ 208 if (indirect_count > MAX_DDE_COUNT) 209 return ERR_NX_EXCESSIVE_DDE; 210 211 /* First address of the list */ 212 dde_list = (struct nx_dde_t *) getp64(ddep, ddead); 213 214 if (buf_sz == 0) 215 buf_sz = getp32(ddep, ddebc); 216 217 total = 0; 218 for (i = 0; i < indirect_count; i++) { 219 buf_len = get32(dde_list[i], ddebc); 220 buf_addr = get64(dde_list[i], ddead); 221 total += buf_len; 222 223 NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ", 224 buf_len, (void *)buf_addr)); 225 NXPRT(fprintf(stderr, "0x%lx\n", total)); 226 227 /* Touching fewer pages than encoded in the ddebc */ 228 if (total > buf_sz) { 229 buf_len = NX_MIN(buf_len, total - buf_sz); 230 nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); 231 NXPRT(fprintf(stderr, "touch loop break len 0x%x ", 232 buf_len)); 233 NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr)); 234 break; 235 } 236 nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); 237 } 238 return ERR_NX_OK; 239 } 240 241 /* 242 * Src and dst buffers are supplied in scatter gather lists. 243 * NX function code and other parameters supplied in cmdp. 244 */ 245 static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst, 246 struct nx_gzip_crb_cpb_t *cmdp, void *handle) 247 { 248 uint64_t csbaddr; 249 250 memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); 251 252 cmdp->crb.source_dde = *src; 253 cmdp->crb.target_dde = *dst; 254 255 /* Status, output byte count in tpbc */ 256 csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask; 257 put64(cmdp->crb, csb_address, csbaddr); 258 259 /* NX reports input bytes in spbc; cleared */ 260 cmdp->cpb.out_spbc_comp_wrap = 0; 261 cmdp->cpb.out_spbc_comp_with_count = 0; 262 cmdp->cpb.out_spbc_decomp = 0; 263 264 /* Clear output */ 265 put32(cmdp->cpb, out_crc, INIT_CRC); 266 put32(cmdp->cpb, out_adler, INIT_ADLER); 267 268 /* Submit the crb, the job descriptor, to the accelerator. */ 269 return nxu_submit_job(cmdp, handle); 270 } 271 272 int decompress_file(int argc, char **argv, void *devhandle) 273 { 274 FILE *inpf = NULL; 275 FILE *outf = NULL; 276 277 int c, expect, i, cc, rc = 0; 278 char gzfname[FNAME_MAX]; 279 280 /* Queuing, file ops, byte counting */ 281 char *fifo_in, *fifo_out; 282 int used_in, cur_in, used_out, cur_out, read_sz, n; 283 int first_free, last_free, first_used, last_used; 284 int first_offset, last_offset; 285 int write_sz, free_space, source_sz; 286 int source_sz_estimate, target_sz_estimate; 287 uint64_t last_comp_ratio = 0; /* 1000 max */ 288 uint64_t total_out = 0; 289 int is_final, is_eof; 290 291 /* nx hardware */ 292 int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0; 293 int history_len = 0; 294 struct nx_gzip_crb_cpb_t cmd, *cmdp; 295 struct nx_dde_t *ddl_in; 296 struct nx_dde_t dde_in[6] __aligned(128); 297 struct nx_dde_t *ddl_out; 298 struct nx_dde_t dde_out[6] __aligned(128); 299 int pgfault_retries; 300 301 /* when using mmap'ed files */ 302 off_t input_file_offset; 303 304 if (argc > 2) { 305 fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]); 306 fprintf(stderr, " writes to stdout or <fname>.nx.gunzip\n"); 307 return -1; 308 } 309 310 if (argc == 1) { 311 inpf = stdin; 312 outf = stdout; 313 } else if (argc == 2) { 314 char w[1024]; 315 char *wp; 316 317 inpf = fopen(argv[1], "r"); 318 if (inpf == NULL) { 319 perror(argv[1]); 320 return -1; 321 } 322 323 /* Make a new file name to write to. Ignoring '.gz' */ 324 wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1]; 325 strcpy(w, wp); 326 strcat(w, ".nx.gunzip"); 327 328 outf = fopen(w, "w"); 329 if (outf == NULL) { 330 perror(w); 331 return -1; 332 } 333 } 334 335 /* Decode the gzip header */ 336 c = GETINPC(inpf); expect = 0x1f; /* ID1 */ 337 if (c != expect) 338 goto err1; 339 340 c = GETINPC(inpf); expect = 0x8b; /* ID2 */ 341 if (c != expect) 342 goto err1; 343 344 c = GETINPC(inpf); expect = 0x08; /* CM */ 345 if (c != expect) 346 goto err1; 347 348 int flg = GETINPC(inpf); /* FLG */ 349 350 if (flg & 0xE0 || flg & 0x4 || flg == EOF) 351 goto err2; 352 353 fprintf(stderr, "gzHeader FLG %x\n", flg); 354 355 /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this 356 * sample code. 357 */ 358 for (i = 0; i < 6; i++) { 359 char tmp[10]; 360 361 tmp[i] = GETINPC(inpf); 362 if (tmp[i] == EOF) 363 goto err3; 364 fprintf(stderr, "%02x ", tmp[i]); 365 if (i == 5) 366 fprintf(stderr, "\n"); 367 } 368 fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n"); 369 370 /* FNAME */ 371 if (flg & 0x8) { 372 int k = 0; 373 374 do { 375 c = GETINPC(inpf); 376 if (c == EOF || k >= FNAME_MAX) 377 goto err3; 378 gzfname[k++] = c; 379 } while (c); 380 fprintf(stderr, "gzHeader FNAME: %s\n", gzfname); 381 } 382 383 /* FHCRC */ 384 if (flg & 0x2) { 385 c = GETINPC(inpf); 386 if (c == EOF) 387 goto err3; 388 c = GETINPC(inpf); 389 if (c == EOF) 390 goto err3; 391 fprintf(stderr, "gzHeader FHCRC: ignored\n"); 392 } 393 394 used_in = cur_in = used_out = cur_out = 0; 395 is_final = is_eof = 0; 396 397 /* Allocate one page larger to prevent page faults due to NX 398 * overfetching. 399 * Either do this (char*)(uintptr_t)aligned_alloc or use 400 * -std=c11 flag to make the int-to-pointer warning go away. 401 */ 402 assert((fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz, 403 fifo_in_len + page_sz)) != NULL); 404 assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz, 405 fifo_out_len + page_sz + line_sz)) != NULL); 406 /* Leave unused space due to history rounding rules */ 407 fifo_out = fifo_out + line_sz; 408 nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1); 409 410 ddl_in = &dde_in[0]; 411 ddl_out = &dde_out[0]; 412 cmdp = &cmd; 413 memset(&cmdp->crb, 0, sizeof(cmdp->crb)); 414 415 read_state: 416 417 /* Read from .gz file */ 418 419 NXPRT(fprintf(stderr, "read_state:\n")); 420 421 if (is_eof != 0) 422 goto write_state; 423 424 /* We read in to fifo_in in two steps: first: read in to from 425 * cur_in to the end of the buffer. last: if free space wrapped 426 * around, read from fifo_in offset 0 to offset cur_in. 427 */ 428 429 /* Reset fifo head to reduce unnecessary wrap arounds */ 430 cur_in = (used_in == 0) ? 0 : cur_in; 431 432 /* Free space total is reduced by a gap */ 433 free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len) 434 - line_sz); 435 436 /* Free space may wrap around as first and last */ 437 first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len); 438 last_free = fifo_free_last_bytes(cur_in, used_in, fifo_in_len); 439 440 /* Start offsets of the free memory */ 441 first_offset = fifo_free_first_offset(cur_in, used_in); 442 last_offset = fifo_free_last_offset(cur_in, used_in, fifo_in_len); 443 444 /* Reduce read_sz because of the line_sz gap */ 445 read_sz = NX_MIN(free_space, first_free); 446 n = 0; 447 if (read_sz > 0) { 448 /* Read in to offset cur_in + used_in */ 449 n = fread(fifo_in + first_offset, 1, read_sz, inpf); 450 used_in = used_in + n; 451 free_space = free_space - n; 452 assert(n <= read_sz); 453 if (n != read_sz) { 454 /* Either EOF or error; exit the read loop */ 455 is_eof = 1; 456 goto write_state; 457 } 458 } 459 460 /* If free space wrapped around */ 461 if (last_free > 0) { 462 /* Reduce read_sz because of the line_sz gap */ 463 read_sz = NX_MIN(free_space, last_free); 464 n = 0; 465 if (read_sz > 0) { 466 n = fread(fifo_in + last_offset, 1, read_sz, inpf); 467 used_in = used_in + n; /* Increase used space */ 468 free_space = free_space - n; /* Decrease free space */ 469 assert(n <= read_sz); 470 if (n != read_sz) { 471 /* Either EOF or error; exit the read loop */ 472 is_eof = 1; 473 goto write_state; 474 } 475 } 476 } 477 478 /* At this point we have used_in bytes in fifo_in with the 479 * data head starting at cur_in and possibly wrapping around. 480 */ 481 482 write_state: 483 484 /* Write decompressed data to output file */ 485 486 NXPRT(fprintf(stderr, "write_state:\n")); 487 488 if (used_out == 0) 489 goto decomp_state; 490 491 /* If fifo_out has data waiting, write it out to the file to 492 * make free target space for the accelerator used bytes in 493 * the first and last parts of fifo_out. 494 */ 495 496 first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len); 497 last_used = fifo_used_last_bytes(cur_out, used_out, fifo_out_len); 498 499 write_sz = first_used; 500 501 n = 0; 502 if (write_sz > 0) { 503 n = fwrite(fifo_out + cur_out, 1, write_sz, outf); 504 used_out = used_out - n; 505 /* Move head of the fifo */ 506 cur_out = (cur_out + n) % fifo_out_len; 507 assert(n <= write_sz); 508 if (n != write_sz) { 509 fprintf(stderr, "error: write\n"); 510 rc = -1; 511 goto err5; 512 } 513 } 514 515 if (last_used > 0) { /* If more data available in the last part */ 516 write_sz = last_used; /* Keep it here for later */ 517 n = 0; 518 if (write_sz > 0) { 519 n = fwrite(fifo_out, 1, write_sz, outf); 520 used_out = used_out - n; 521 cur_out = (cur_out + n) % fifo_out_len; 522 assert(n <= write_sz); 523 if (n != write_sz) { 524 fprintf(stderr, "error: write\n"); 525 rc = -1; 526 goto err5; 527 } 528 } 529 } 530 531 decomp_state: 532 533 /* NX decompresses input data */ 534 535 NXPRT(fprintf(stderr, "decomp_state:\n")); 536 537 if (is_final) 538 goto finish_state; 539 540 /* Address/len lists */ 541 clearp_dde(ddl_in); 542 clearp_dde(ddl_out); 543 544 /* FC, CRC, HistLen, Table 6-6 */ 545 if (resuming) { 546 /* Resuming a partially decompressed input. 547 * The key to resume is supplying the 32KB 548 * dictionary (history) to NX, which is basically 549 * the last 32KB of output produced. 550 */ 551 fc = GZIP_FC_DECOMPRESS_RESUME; 552 553 cmdp->cpb.in_crc = cmdp->cpb.out_crc; 554 cmdp->cpb.in_adler = cmdp->cpb.out_adler; 555 556 /* Round up the history size to quadword. Section 2.10 */ 557 history_len = (history_len + 15) / 16; 558 putnn(cmdp->cpb, in_histlen, history_len); 559 history_len = history_len * 16; /* bytes */ 560 561 if (history_len > 0) { 562 /* Chain in the history buffer to the DDE list */ 563 if (cur_out >= history_len) { 564 nx_append_dde(ddl_in, fifo_out 565 + (cur_out - history_len), 566 history_len); 567 } else { 568 nx_append_dde(ddl_in, fifo_out 569 + ((fifo_out_len + cur_out) 570 - history_len), 571 history_len - cur_out); 572 /* Up to 32KB history wraps around fifo_out */ 573 nx_append_dde(ddl_in, fifo_out, cur_out); 574 } 575 576 } 577 } else { 578 /* First decompress job */ 579 fc = GZIP_FC_DECOMPRESS; 580 581 history_len = 0; 582 /* Writing 0 clears out subc as well */ 583 cmdp->cpb.in_histlen = 0; 584 total_out = 0; 585 586 put32(cmdp->cpb, in_crc, INIT_CRC); 587 put32(cmdp->cpb, in_adler, INIT_ADLER); 588 put32(cmdp->cpb, out_crc, INIT_CRC); 589 put32(cmdp->cpb, out_adler, INIT_ADLER); 590 591 /* Assuming 10% compression ratio initially; use the 592 * most recently measured compression ratio as a 593 * heuristic to estimate the input and output 594 * sizes. If we give too much input, the target buffer 595 * overflows and NX cycles are wasted, and then we 596 * must retry with smaller input size. 1000 is 100%. 597 */ 598 last_comp_ratio = 100UL; 599 } 600 cmdp->crb.gzip_fc = 0; 601 putnn(cmdp->crb, gzip_fc, fc); 602 603 /* 604 * NX source buffers 605 */ 606 first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len); 607 last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len); 608 609 if (first_used > 0) 610 nx_append_dde(ddl_in, fifo_in + cur_in, first_used); 611 612 if (last_used > 0) 613 nx_append_dde(ddl_in, fifo_in, last_used); 614 615 /* 616 * NX target buffers 617 */ 618 first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len); 619 last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len); 620 621 /* Reduce output free space amount not to overwrite the history */ 622 int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len) 623 - (1<<16)); 624 625 NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max, 626 target_max)); 627 628 first_free = NX_MIN(target_max, first_free); 629 if (first_free > 0) { 630 first_offset = fifo_free_first_offset(cur_out, used_out); 631 nx_append_dde(ddl_out, fifo_out + first_offset, first_free); 632 } 633 634 if (last_free > 0) { 635 last_free = NX_MIN(target_max - first_free, last_free); 636 if (last_free > 0) { 637 last_offset = fifo_free_last_offset(cur_out, used_out, 638 fifo_out_len); 639 nx_append_dde(ddl_out, fifo_out + last_offset, 640 last_free); 641 } 642 } 643 644 /* Target buffer size is used to limit the source data size 645 * based on previous measurements of compression ratio. 646 */ 647 648 /* source_sz includes history */ 649 source_sz = getp32(ddl_in, ddebc); 650 assert(source_sz > history_len); 651 source_sz = source_sz - history_len; 652 653 /* Estimating how much source is needed to 3/4 fill a 654 * target_max size target buffer. If we overshoot, then NX 655 * must repeat the job with smaller input and we waste 656 * bandwidth. If we undershoot then we use more NX calls than 657 * necessary. 658 */ 659 660 source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL) 661 / 4000; 662 663 if (source_sz_estimate < source_sz) { 664 /* Target might be small, therefore limiting the 665 * source data. 666 */ 667 source_sz = source_sz_estimate; 668 target_sz_estimate = target_max; 669 } else { 670 /* Source file might be small, therefore limiting target 671 * touch pages to a smaller value to save processor cycles. 672 */ 673 target_sz_estimate = ((uint64_t)source_sz * 1000UL) 674 / (last_comp_ratio + 1); 675 target_sz_estimate = NX_MIN(2 * target_sz_estimate, 676 target_max); 677 } 678 679 source_sz = source_sz + history_len; 680 681 /* Some NX condition codes require submitting the NX job again. 682 * Kernel doesn't handle NX page faults. Expects user code to 683 * touch pages. 684 */ 685 pgfault_retries = NX_MAX_FAULTS; 686 687 restart_nx: 688 689 putp32(ddl_in, ddebc, source_sz); 690 691 /* Fault in pages */ 692 nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1); 693 nx_touch_pages_dde(ddl_in, 0, page_sz, 0); 694 nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1); 695 696 /* Send job to NX */ 697 cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle); 698 699 switch (cc) { 700 701 case ERR_NX_AT_FAULT: 702 703 /* We touched the pages ahead of time. In the most common case 704 * we shouldn't be here. But may be some pages were paged out. 705 * Kernel should have placed the faulting address to fsaddr. 706 */ 707 NXPRT(fprintf(stderr, "ERR_NX_AT_FAULT %p\n", 708 (void *)cmdp->crb.csb.fsaddr)); 709 710 if (pgfault_retries == NX_MAX_FAULTS) { 711 /* Try once with exact number of pages */ 712 --pgfault_retries; 713 goto restart_nx; 714 } else if (pgfault_retries > 0) { 715 /* If still faulting try fewer input pages 716 * assuming memory outage 717 */ 718 if (source_sz > page_sz) 719 source_sz = NX_MAX(source_sz / 2, page_sz); 720 --pgfault_retries; 721 goto restart_nx; 722 } else { 723 fprintf(stderr, "cannot make progress; too many "); 724 fprintf(stderr, "page fault retries cc= %d\n", cc); 725 rc = -1; 726 goto err5; 727 } 728 729 case ERR_NX_DATA_LENGTH: 730 731 NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; ")); 732 NXPRT(fprintf(stderr, "stream may have trailing data\n")); 733 734 /* Not an error in the most common case; it just says 735 * there is trailing data that we must examine. 736 * 737 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion 738 * Fig.6-7 and Table 6-8. 739 */ 740 nx_ce = get_csb_ce_ms3b(cmdp->crb.csb); 741 742 if (!csb_ce_termination(nx_ce) && 743 csb_ce_partial_completion(nx_ce)) { 744 /* Check CPB for more information 745 * spbc and tpbc are valid 746 */ 747 sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */ 748 subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */ 749 spbc = get32(cmdp->cpb, out_spbc_decomp); 750 tpbc = get32(cmdp->crb.csb, tpbc); 751 assert(target_max >= tpbc); 752 753 goto ok_cc3; /* not an error */ 754 } else { 755 /* History length error when CE(1)=1 CE(0)=0. */ 756 rc = -1; 757 fprintf(stderr, "history length error cc= %d\n", cc); 758 goto err5; 759 } 760 761 case ERR_NX_TARGET_SPACE: 762 763 /* Target buffer not large enough; retry smaller input 764 * data; give at least 1 byte. SPBC/TPBC are not valid. 765 */ 766 assert(source_sz > history_len); 767 source_sz = ((source_sz - history_len + 2) / 2) + history_len; 768 NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with ")); 769 NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n", 770 source_sz, history_len)); 771 goto restart_nx; 772 773 case ERR_NX_OK: 774 775 /* This should not happen for gzip formatted data; 776 * we need trailing crc and isize 777 */ 778 fprintf(stderr, "ERR_NX_OK\n"); 779 spbc = get32(cmdp->cpb, out_spbc_decomp); 780 tpbc = get32(cmdp->crb.csb, tpbc); 781 assert(target_max >= tpbc); 782 assert(spbc >= history_len); 783 source_sz = spbc - history_len; 784 goto offsets_state; 785 786 default: 787 fprintf(stderr, "error: cc= %d\n", cc); 788 rc = -1; 789 goto err5; 790 } 791 792 ok_cc3: 793 794 NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt)); 795 796 assert(spbc > history_len); 797 source_sz = spbc - history_len; 798 799 /* Table 6-4: Source Final Block Type (SFBT) describes the 800 * last processed deflate block and clues the software how to 801 * resume the next job. SUBC indicates how many input bits NX 802 * consumed but did not process. SPBC indicates how many 803 * bytes of source were given to the accelerator including 804 * history bytes. 805 */ 806 807 switch (sfbt) { 808 int dhtlen; 809 810 case 0x0: /* Deflate final EOB received */ 811 812 /* Calculating the checksum start position. */ 813 814 source_sz = source_sz - subc / 8; 815 is_final = 1; 816 break; 817 818 /* Resume decompression cases are below. Basically 819 * indicates where NX has suspended and how to resume 820 * the input stream. 821 */ 822 823 case 0x8: /* Within a literal block; use rembytecount */ 824 case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */ 825 826 /* Supply the partially processed source byte again */ 827 source_sz = source_sz - ((subc + 7) / 8); 828 829 /* SUBC LS 3bits: number of bits in the first source byte need 830 * to be processed. 831 * 000 means all 8 bits; Table 6-3 832 * Clear subc, histlen, sfbt, rembytecnt, dhtlen 833 */ 834 cmdp->cpb.in_subc = 0; 835 cmdp->cpb.in_sfbt = 0; 836 putnn(cmdp->cpb, in_subc, subc % 8); 837 putnn(cmdp->cpb, in_sfbt, sfbt); 838 putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb, 839 out_rembytecnt)); 840 break; 841 842 case 0xA: /* Within a FH block; */ 843 case 0xB: /* Within a FH block; bfinal=1 */ 844 845 source_sz = source_sz - ((subc + 7) / 8); 846 847 /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ 848 cmdp->cpb.in_subc = 0; 849 cmdp->cpb.in_sfbt = 0; 850 putnn(cmdp->cpb, in_subc, subc % 8); 851 putnn(cmdp->cpb, in_sfbt, sfbt); 852 break; 853 854 case 0xC: /* Within a DH block; */ 855 case 0xD: /* Within a DH block; bfinal=1 */ 856 857 source_sz = source_sz - ((subc + 7) / 8); 858 859 /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ 860 cmdp->cpb.in_subc = 0; 861 cmdp->cpb.in_sfbt = 0; 862 putnn(cmdp->cpb, in_subc, subc % 8); 863 putnn(cmdp->cpb, in_sfbt, sfbt); 864 865 dhtlen = getnn(cmdp->cpb, out_dhtlen); 866 putnn(cmdp->cpb, in_dhtlen, dhtlen); 867 assert(dhtlen >= 42); 868 869 /* Round up to a qword */ 870 dhtlen = (dhtlen + 127) / 128; 871 872 while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */ 873 --dhtlen; 874 cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen]; 875 } 876 break; 877 878 case 0xE: /* Within a block header; bfinal=0; */ 879 /* Also given if source data exactly ends (SUBC=0) with 880 * EOB code with BFINAL=0. Means the next byte will 881 * contain a block header. 882 */ 883 case 0xF: /* within a block header with BFINAL=1. */ 884 885 source_sz = source_sz - ((subc + 7) / 8); 886 887 /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ 888 cmdp->cpb.in_subc = 0; 889 cmdp->cpb.in_sfbt = 0; 890 putnn(cmdp->cpb, in_subc, subc % 8); 891 putnn(cmdp->cpb, in_sfbt, sfbt); 892 893 /* Engine did not process any data */ 894 if (is_eof && (source_sz == 0)) 895 is_final = 1; 896 } 897 898 offsets_state: 899 900 /* Adjust the source and target buffer offsets and lengths */ 901 902 NXPRT(fprintf(stderr, "offsets_state:\n")); 903 904 /* Delete input data from fifo_in */ 905 used_in = used_in - source_sz; 906 cur_in = (cur_in + source_sz) % fifo_in_len; 907 input_file_offset = input_file_offset + source_sz; 908 909 /* Add output data to fifo_out */ 910 used_out = used_out + tpbc; 911 912 assert(used_out <= fifo_out_len); 913 914 total_out = total_out + tpbc; 915 916 /* Deflate history is 32KB max. No need to supply more 917 * than 32KB on a resume. 918 */ 919 history_len = (total_out > window_max) ? window_max : total_out; 920 921 /* To estimate expected expansion in the next NX job; 500 means 50%. 922 * Deflate best case is around 1 to 1000. 923 */ 924 last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1)) 925 / ((uint64_t)tpbc + 1); 926 last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1); 927 NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n", 928 last_comp_ratio, source_sz, spbc, tpbc)); 929 930 resuming = 1; 931 932 finish_state: 933 934 NXPRT(fprintf(stderr, "finish_state:\n")); 935 936 if (is_final) { 937 if (used_out) 938 goto write_state; /* More data to write out */ 939 else if (used_in < 8) { 940 /* Need at least 8 more bytes containing gzip crc 941 * and isize. 942 */ 943 rc = -1; 944 goto err4; 945 } else { 946 /* Compare checksums and exit */ 947 int i; 948 unsigned char tail[8]; 949 uint32_t cksum, isize; 950 951 for (i = 0; i < 8; i++) 952 tail[i] = fifo_in[(cur_in + i) % fifo_in_len]; 953 fprintf(stderr, "computed checksum %08x isize %08x\n", 954 cmdp->cpb.out_crc, (uint32_t) (total_out 955 % (1ULL<<32))); 956 cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8 957 | (uint32_t) tail[2]<<16 958 | (uint32_t) tail[3]<<24); 959 isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8 960 | (uint32_t) tail[6]<<16 961 | (uint32_t) tail[7]<<24); 962 fprintf(stderr, "stored checksum %08x isize %08x\n", 963 cksum, isize); 964 965 if (cksum == cmdp->cpb.out_crc && isize == (uint32_t) 966 (total_out % (1ULL<<32))) { 967 rc = 0; goto ok1; 968 } else { 969 rc = -1; goto err4; 970 } 971 } 972 } else 973 goto read_state; 974 975 return -1; 976 977 err1: 978 fprintf(stderr, "error: not a gzip file, expect %x, read %x\n", 979 expect, c); 980 return -1; 981 982 err2: 983 fprintf(stderr, "error: the FLG byte is wrong or not being handled\n"); 984 return -1; 985 986 err3: 987 fprintf(stderr, "error: gzip header\n"); 988 return -1; 989 990 err4: 991 fprintf(stderr, "error: checksum missing or mismatch\n"); 992 993 err5: 994 ok1: 995 fprintf(stderr, "decomp is complete: fclose\n"); 996 fclose(outf); 997 998 return rc; 999 } 1000 1001 1002 int main(int argc, char **argv) 1003 { 1004 int rc; 1005 struct sigaction act; 1006 void *handle; 1007 1008 nx_dbg = 0; 1009 nx_gzip_log = NULL; 1010 act.sa_handler = 0; 1011 act.sa_sigaction = nxu_sigsegv_handler; 1012 act.sa_flags = SA_SIGINFO; 1013 act.sa_restorer = 0; 1014 sigemptyset(&act.sa_mask); 1015 sigaction(SIGSEGV, &act, NULL); 1016 1017 handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); 1018 if (!handle) { 1019 fprintf(stderr, "Unable to init NX, errno %d\n", errno); 1020 exit(-1); 1021 } 1022 1023 rc = decompress_file(argc, argv, handle); 1024 1025 nx_function_end(handle); 1026 1027 return rc; 1028 } 1029