xref: /openbmc/linux/lib/xz/xz_lzma2.h (revision 4b4193256c8d3bc3a5397b5cd9494c2ad386317d)
124fa0402SLasse Collin /*
224fa0402SLasse Collin  * LZMA2 definitions
324fa0402SLasse Collin  *
424fa0402SLasse Collin  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5*d89775fcSAlexander A. Klimov  *          Igor Pavlov <https://7-zip.org/>
624fa0402SLasse Collin  *
724fa0402SLasse Collin  * This file has been put into the public domain.
824fa0402SLasse Collin  * You can do whatever you want with this file.
924fa0402SLasse Collin  */
1024fa0402SLasse Collin 
1124fa0402SLasse Collin #ifndef XZ_LZMA2_H
1224fa0402SLasse Collin #define XZ_LZMA2_H
1324fa0402SLasse Collin 
1424fa0402SLasse Collin /* Range coder constants */
1524fa0402SLasse Collin #define RC_SHIFT_BITS 8
1624fa0402SLasse Collin #define RC_TOP_BITS 24
1724fa0402SLasse Collin #define RC_TOP_VALUE (1 << RC_TOP_BITS)
1824fa0402SLasse Collin #define RC_BIT_MODEL_TOTAL_BITS 11
1924fa0402SLasse Collin #define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS)
2024fa0402SLasse Collin #define RC_MOVE_BITS 5
2124fa0402SLasse Collin 
2224fa0402SLasse Collin /*
2324fa0402SLasse Collin  * Maximum number of position states. A position state is the lowest pb
2424fa0402SLasse Collin  * number of bits of the current uncompressed offset. In some places there
2524fa0402SLasse Collin  * are different sets of probabilities for different position states.
2624fa0402SLasse Collin  */
2724fa0402SLasse Collin #define POS_STATES_MAX (1 << 4)
2824fa0402SLasse Collin 
2924fa0402SLasse Collin /*
3024fa0402SLasse Collin  * This enum is used to track which LZMA symbols have occurred most recently
3124fa0402SLasse Collin  * and in which order. This information is used to predict the next symbol.
3224fa0402SLasse Collin  *
3324fa0402SLasse Collin  * Symbols:
3424fa0402SLasse Collin  *  - Literal: One 8-bit byte
3524fa0402SLasse Collin  *  - Match: Repeat a chunk of data at some distance
3624fa0402SLasse Collin  *  - Long repeat: Multi-byte match at a recently seen distance
3724fa0402SLasse Collin  *  - Short repeat: One-byte repeat at a recently seen distance
3824fa0402SLasse Collin  *
3924fa0402SLasse Collin  * The symbol names are in from STATE_oldest_older_previous. REP means
4024fa0402SLasse Collin  * either short or long repeated match, and NONLIT means any non-literal.
4124fa0402SLasse Collin  */
4224fa0402SLasse Collin enum lzma_state {
4324fa0402SLasse Collin 	STATE_LIT_LIT,
4424fa0402SLasse Collin 	STATE_MATCH_LIT_LIT,
4524fa0402SLasse Collin 	STATE_REP_LIT_LIT,
4624fa0402SLasse Collin 	STATE_SHORTREP_LIT_LIT,
4724fa0402SLasse Collin 	STATE_MATCH_LIT,
4824fa0402SLasse Collin 	STATE_REP_LIT,
4924fa0402SLasse Collin 	STATE_SHORTREP_LIT,
5024fa0402SLasse Collin 	STATE_LIT_MATCH,
5124fa0402SLasse Collin 	STATE_LIT_LONGREP,
5224fa0402SLasse Collin 	STATE_LIT_SHORTREP,
5324fa0402SLasse Collin 	STATE_NONLIT_MATCH,
5424fa0402SLasse Collin 	STATE_NONLIT_REP
5524fa0402SLasse Collin };
5624fa0402SLasse Collin 
5724fa0402SLasse Collin /* Total number of states */
5824fa0402SLasse Collin #define STATES 12
5924fa0402SLasse Collin 
6024fa0402SLasse Collin /* The lowest 7 states indicate that the previous state was a literal. */
6124fa0402SLasse Collin #define LIT_STATES 7
6224fa0402SLasse Collin 
6324fa0402SLasse Collin /* Indicate that the latest symbol was a literal. */
lzma_state_literal(enum lzma_state * state)6424fa0402SLasse Collin static inline void lzma_state_literal(enum lzma_state *state)
6524fa0402SLasse Collin {
6624fa0402SLasse Collin 	if (*state <= STATE_SHORTREP_LIT_LIT)
6724fa0402SLasse Collin 		*state = STATE_LIT_LIT;
6824fa0402SLasse Collin 	else if (*state <= STATE_LIT_SHORTREP)
6924fa0402SLasse Collin 		*state -= 3;
7024fa0402SLasse Collin 	else
7124fa0402SLasse Collin 		*state -= 6;
7224fa0402SLasse Collin }
7324fa0402SLasse Collin 
7424fa0402SLasse Collin /* Indicate that the latest symbol was a match. */
lzma_state_match(enum lzma_state * state)7524fa0402SLasse Collin static inline void lzma_state_match(enum lzma_state *state)
7624fa0402SLasse Collin {
7724fa0402SLasse Collin 	*state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH;
7824fa0402SLasse Collin }
7924fa0402SLasse Collin 
8024fa0402SLasse Collin /* Indicate that the latest state was a long repeated match. */
lzma_state_long_rep(enum lzma_state * state)8124fa0402SLasse Collin static inline void lzma_state_long_rep(enum lzma_state *state)
8224fa0402SLasse Collin {
8324fa0402SLasse Collin 	*state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP;
8424fa0402SLasse Collin }
8524fa0402SLasse Collin 
8624fa0402SLasse Collin /* Indicate that the latest symbol was a short match. */
lzma_state_short_rep(enum lzma_state * state)8724fa0402SLasse Collin static inline void lzma_state_short_rep(enum lzma_state *state)
8824fa0402SLasse Collin {
8924fa0402SLasse Collin 	*state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP;
9024fa0402SLasse Collin }
9124fa0402SLasse Collin 
9224fa0402SLasse Collin /* Test if the previous symbol was a literal. */
lzma_state_is_literal(enum lzma_state state)9324fa0402SLasse Collin static inline bool lzma_state_is_literal(enum lzma_state state)
9424fa0402SLasse Collin {
9524fa0402SLasse Collin 	return state < LIT_STATES;
9624fa0402SLasse Collin }
9724fa0402SLasse Collin 
9824fa0402SLasse Collin /* Each literal coder is divided in three sections:
9924fa0402SLasse Collin  *   - 0x001-0x0FF: Without match byte
10024fa0402SLasse Collin  *   - 0x101-0x1FF: With match byte; match bit is 0
10124fa0402SLasse Collin  *   - 0x201-0x2FF: With match byte; match bit is 1
10224fa0402SLasse Collin  *
10324fa0402SLasse Collin  * Match byte is used when the previous LZMA symbol was something else than
10424fa0402SLasse Collin  * a literal (that is, it was some kind of match).
10524fa0402SLasse Collin  */
10624fa0402SLasse Collin #define LITERAL_CODER_SIZE 0x300
10724fa0402SLasse Collin 
10824fa0402SLasse Collin /* Maximum number of literal coders */
10924fa0402SLasse Collin #define LITERAL_CODERS_MAX (1 << 4)
11024fa0402SLasse Collin 
11124fa0402SLasse Collin /* Minimum length of a match is two bytes. */
11224fa0402SLasse Collin #define MATCH_LEN_MIN 2
11324fa0402SLasse Collin 
11424fa0402SLasse Collin /* Match length is encoded with 4, 5, or 10 bits.
11524fa0402SLasse Collin  *
11624fa0402SLasse Collin  * Length   Bits
11724fa0402SLasse Collin  *  2-9      4 = Choice=0 + 3 bits
11824fa0402SLasse Collin  * 10-17     5 = Choice=1 + Choice2=0 + 3 bits
11924fa0402SLasse Collin  * 18-273   10 = Choice=1 + Choice2=1 + 8 bits
12024fa0402SLasse Collin  */
12124fa0402SLasse Collin #define LEN_LOW_BITS 3
12224fa0402SLasse Collin #define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS)
12324fa0402SLasse Collin #define LEN_MID_BITS 3
12424fa0402SLasse Collin #define LEN_MID_SYMBOLS (1 << LEN_MID_BITS)
12524fa0402SLasse Collin #define LEN_HIGH_BITS 8
12624fa0402SLasse Collin #define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS)
12724fa0402SLasse Collin #define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS)
12824fa0402SLasse Collin 
12924fa0402SLasse Collin /*
13024fa0402SLasse Collin  * Maximum length of a match is 273 which is a result of the encoding
13124fa0402SLasse Collin  * described above.
13224fa0402SLasse Collin  */
13324fa0402SLasse Collin #define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1)
13424fa0402SLasse Collin 
13524fa0402SLasse Collin /*
13624fa0402SLasse Collin  * Different sets of probabilities are used for match distances that have
13724fa0402SLasse Collin  * very short match length: Lengths of 2, 3, and 4 bytes have a separate
13824fa0402SLasse Collin  * set of probabilities for each length. The matches with longer length
13924fa0402SLasse Collin  * use a shared set of probabilities.
14024fa0402SLasse Collin  */
14124fa0402SLasse Collin #define DIST_STATES 4
14224fa0402SLasse Collin 
14324fa0402SLasse Collin /*
14424fa0402SLasse Collin  * Get the index of the appropriate probability array for decoding
14524fa0402SLasse Collin  * the distance slot.
14624fa0402SLasse Collin  */
lzma_get_dist_state(uint32_t len)14724fa0402SLasse Collin static inline uint32_t lzma_get_dist_state(uint32_t len)
14824fa0402SLasse Collin {
14924fa0402SLasse Collin 	return len < DIST_STATES + MATCH_LEN_MIN
15024fa0402SLasse Collin 			? len - MATCH_LEN_MIN : DIST_STATES - 1;
15124fa0402SLasse Collin }
15224fa0402SLasse Collin 
15324fa0402SLasse Collin /*
15424fa0402SLasse Collin  * The highest two bits of a 32-bit match distance are encoded using six bits.
15524fa0402SLasse Collin  * This six-bit value is called a distance slot. This way encoding a 32-bit
15624fa0402SLasse Collin  * value takes 6-36 bits, larger values taking more bits.
15724fa0402SLasse Collin  */
15824fa0402SLasse Collin #define DIST_SLOT_BITS 6
15924fa0402SLasse Collin #define DIST_SLOTS (1 << DIST_SLOT_BITS)
16024fa0402SLasse Collin 
16124fa0402SLasse Collin /* Match distances up to 127 are fully encoded using probabilities. Since
16224fa0402SLasse Collin  * the highest two bits (distance slot) are always encoded using six bits,
16324fa0402SLasse Collin  * the distances 0-3 don't need any additional bits to encode, since the
16424fa0402SLasse Collin  * distance slot itself is the same as the actual distance. DIST_MODEL_START
16524fa0402SLasse Collin  * indicates the first distance slot where at least one additional bit is
16624fa0402SLasse Collin  * needed.
16724fa0402SLasse Collin  */
16824fa0402SLasse Collin #define DIST_MODEL_START 4
16924fa0402SLasse Collin 
17024fa0402SLasse Collin /*
17124fa0402SLasse Collin  * Match distances greater than 127 are encoded in three pieces:
17224fa0402SLasse Collin  *   - distance slot: the highest two bits
17324fa0402SLasse Collin  *   - direct bits: 2-26 bits below the highest two bits
17424fa0402SLasse Collin  *   - alignment bits: four lowest bits
17524fa0402SLasse Collin  *
17624fa0402SLasse Collin  * Direct bits don't use any probabilities.
17724fa0402SLasse Collin  *
17824fa0402SLasse Collin  * The distance slot value of 14 is for distances 128-191.
17924fa0402SLasse Collin  */
18024fa0402SLasse Collin #define DIST_MODEL_END 14
18124fa0402SLasse Collin 
18224fa0402SLasse Collin /* Distance slots that indicate a distance <= 127. */
18324fa0402SLasse Collin #define FULL_DISTANCES_BITS (DIST_MODEL_END / 2)
18424fa0402SLasse Collin #define FULL_DISTANCES (1 << FULL_DISTANCES_BITS)
18524fa0402SLasse Collin 
18624fa0402SLasse Collin /*
18724fa0402SLasse Collin  * For match distances greater than 127, only the highest two bits and the
18824fa0402SLasse Collin  * lowest four bits (alignment) is encoded using probabilities.
18924fa0402SLasse Collin  */
19024fa0402SLasse Collin #define ALIGN_BITS 4
19124fa0402SLasse Collin #define ALIGN_SIZE (1 << ALIGN_BITS)
19224fa0402SLasse Collin #define ALIGN_MASK (ALIGN_SIZE - 1)
19324fa0402SLasse Collin 
19424fa0402SLasse Collin /* Total number of all probability variables */
19524fa0402SLasse Collin #define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE)
19624fa0402SLasse Collin 
19724fa0402SLasse Collin /*
19824fa0402SLasse Collin  * LZMA remembers the four most recent match distances. Reusing these
19924fa0402SLasse Collin  * distances tends to take less space than re-encoding the actual
20024fa0402SLasse Collin  * distance value.
20124fa0402SLasse Collin  */
20224fa0402SLasse Collin #define REPS 4
20324fa0402SLasse Collin 
20424fa0402SLasse Collin #endif
205