xref: /openbmc/linux/lib/xz/xz_dec_bcj.c (revision 762f99f4f3cb41a775b5157dd761217beba65873)
1  /*
2   * Branch/Call/Jump (BCJ) filter decoders
3   *
4   * Authors: Lasse Collin <lasse.collin@tukaani.org>
5   *          Igor Pavlov <https://7-zip.org/>
6   *
7   * This file has been put into the public domain.
8   * You can do whatever you want with this file.
9   */
10  
11  #include "xz_private.h"
12  
13  /*
14   * The rest of the file is inside this ifdef. It makes things a little more
15   * convenient when building without support for any BCJ filters.
16   */
17  #ifdef XZ_DEC_BCJ
18  
struct xz_dec_bcj {
	/*
	 * Type of the BCJ filter being used. The values are the Filter IDs
	 * that xz_dec_bcj_reset() accepts.
	 */
	enum {
		BCJ_X86 = 4,        /* x86 or x86-64 */
		BCJ_POWERPC = 5,    /* Big endian only */
		BCJ_IA64 = 6,       /* Big or little endian */
		BCJ_ARM = 7,        /* Little endian only */
		BCJ_ARMTHUMB = 8,   /* Little endian only */
		BCJ_SPARC = 9       /* Big or little endian */
	} type;

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state, persisted across calls (see bcj_x86()) */
	uint32_t x86_prev_mask;

	/*
	 * Temporary space to hold the variables from struct xz_buf while
	 * b->out is redirected to temp.buf in xz_dec_bcj_run().
	 */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type         Alignment   Look-ahead
		 * x86              1           4
		 * PowerPC          4           0
		 * IA-64           16           0
		 * ARM              4           0
		 * ARM-Thumb        2           2
		 * SPARC            4           0
		 */
		uint8_t buf[16];
	} temp;
};
77  
78  #ifdef XZ_DEC_X86
/*
 * This is used to test the most significant byte of a memory address
 * in an x86 instruction: only 0x00 and 0xFF qualify for conversion.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	if (b == 0x00)
		return 1;

	return b == 0xFF;
}
87  
/*
 * Decode the x86 BCJ filter: rewrite the 32-bit operands of CALL/JMP rel32
 * instructions (opcodes 0xE8/0xE9) by subtracting the absolute position,
 * undoing the encoder-side transformation. Returns the number of bytes
 * fully processed; the last four bytes are always left unprocessed since
 * a five-byte instruction could extend past the end of buf.
 */
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * Indexed by the mask of recently converted positions, these tables
	 * tell whether a new match overlapping an earlier conversion may be
	 * converted, and which of its operand bytes to test.
	 */
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;

	/* Position of the previous 0xE8/0xE9 candidate byte */
	size_t prev_pos = (size_t)-1;

	/* Conversion-history mask carried over from the previous call */
	uint32_t prev_mask = s->x86_prev_mask;

	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	/* An opcode byte needs four operand bytes after it. */
	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		/* Look for 0xE8 (CALL rel32) or 0xE9 (JMP rel32). */
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		/* Distance to the previous candidate opcode byte */
		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				/*
				 * This match overlaps the operand of a
				 * nearby earlier match. Skip it unless the
				 * overlap pattern allows conversion here.
				 */
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				/*
				 * Subtract the absolute position of the end
				 * of the instruction (opcode + 4 operand
				 * bytes) to undo the encoder's conversion.
				 */
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				/*
				 * If the result would have been converted
				 * again by the encoder at the position
				 * indicated by prev_mask, iterate until a
				 * fixed point is reached.
				 */
				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			/* Sign extend the 25-bit result to 32 bits. */
			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	/* Save the filter state for the next call. */
	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
157  #endif
158  
159  #ifdef XZ_DEC_POWERPC
bcj_powerpc(struct xz_dec_bcj * s,uint8_t * buf,size_t size)160  static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
161  {
162  	size_t i;
163  	uint32_t instr;
164  
165  	for (i = 0; i + 4 <= size; i += 4) {
166  		instr = get_unaligned_be32(buf + i);
167  		if ((instr & 0xFC000003) == 0x48000001) {
168  			instr &= 0x03FFFFFC;
169  			instr -= s->pos + (uint32_t)i;
170  			instr &= 0x03FFFFFC;
171  			instr |= 0x48000001;
172  			put_unaligned_be32(instr, buf + i);
173  		}
174  	}
175  
176  	return i;
177  }
178  #endif
179  
180  #ifdef XZ_DEC_IA64
/*
 * Decode the IA-64 BCJ filter: within each 16-byte instruction bundle,
 * convert the 21-bit addresses of matching branch instructions back from
 * the encoder's absolute form by subtracting the absolute position.
 * Returns the number of bytes processed (a multiple of 16).
 */
static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * For each of the 32 possible values of the low five bits of the
	 * bundle's first byte (the template), a bit mask of the instruction
	 * slots that may hold a convertible branch.
	 */
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	};

	/*
	 * The local variables take a little bit stack space, but it's less
	 * than what LZMA2 decoder takes, so it doesn't make sense to reduce
	 * stack usage here without doing that for the LZMA2 decoder too.
	 */

	/* Loop counters */
	size_t i;
	size_t j;

	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	uint32_t slot;

	/* Bitwise offset of the instruction indicated by slot */
	uint32_t bit_pos;

	/* bit_pos split into byte and bit parts */
	uint32_t byte_pos;
	uint32_t bit_res;

	/* Address part of an instruction */
	uint32_t addr;

	/* Mask used to detect which instructions to convert */
	uint32_t mask;

	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	uint64_t instr;

	/* Instruction normalized with bit_res for easier manipulation */
	uint64_t norm;

	/* Process one 16-byte bundle per iteration. */
	for (i = 0; i + 16 <= size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;

			/* Load six little endian bytes covering the slot. */
			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			instr = 0;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);

			norm = instr >> bit_res;

			/*
			 * Convert only when bits 37-40 equal 5 and bits
			 * 9-11 are zero. The address is bits 13-32 plus
			 * bit 36, counted in 16-byte units.
			 */
			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr <<= 4;
				addr -= s->pos + (uint32_t)i;
				addr >>= 4;

				/*
				 * Clear the old address bits (13-32 and 36)
				 * and insert the converted address.
				 */
				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);

				/* Merge back with the preceding bits. */
				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;

				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
			}
		}
	}

	return i;
}
262  #endif
263  
264  #ifdef XZ_DEC_ARM
bcj_arm(struct xz_dec_bcj * s,uint8_t * buf,size_t size)265  static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
266  {
267  	size_t i;
268  	uint32_t addr;
269  
270  	for (i = 0; i + 4 <= size; i += 4) {
271  		if (buf[i + 3] == 0xEB) {
272  			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
273  					| ((uint32_t)buf[i + 2] << 16);
274  			addr <<= 2;
275  			addr -= s->pos + (uint32_t)i + 8;
276  			addr >>= 2;
277  			buf[i] = (uint8_t)addr;
278  			buf[i + 1] = (uint8_t)(addr >> 8);
279  			buf[i + 2] = (uint8_t)(addr >> 16);
280  		}
281  	}
282  
283  	return i;
284  }
285  #endif
286  
287  #ifdef XZ_DEC_ARMTHUMB
bcj_armthumb(struct xz_dec_bcj * s,uint8_t * buf,size_t size)288  static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
289  {
290  	size_t i;
291  	uint32_t addr;
292  
293  	for (i = 0; i + 4 <= size; i += 2) {
294  		if ((buf[i + 1] & 0xF8) == 0xF0
295  				&& (buf[i + 3] & 0xF8) == 0xF8) {
296  			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
297  					| ((uint32_t)buf[i] << 11)
298  					| (((uint32_t)buf[i + 3] & 0x07) << 8)
299  					| (uint32_t)buf[i + 2];
300  			addr <<= 1;
301  			addr -= s->pos + (uint32_t)i + 4;
302  			addr >>= 1;
303  			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
304  			buf[i] = (uint8_t)(addr >> 11);
305  			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
306  			buf[i + 2] = (uint8_t)addr;
307  			i += 2;
308  		}
309  	}
310  
311  	return i;
312  }
313  #endif
314  
315  #ifdef XZ_DEC_SPARC
bcj_sparc(struct xz_dec_bcj * s,uint8_t * buf,size_t size)316  static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
317  {
318  	size_t i;
319  	uint32_t instr;
320  
321  	for (i = 0; i + 4 <= size; i += 4) {
322  		instr = get_unaligned_be32(buf + i);
323  		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
324  			instr <<= 2;
325  			instr -= s->pos + (uint32_t)i;
326  			instr >>= 2;
327  			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
328  					| 0x40000000 | (instr & 0x3FFFFF);
329  			put_unaligned_be32(instr, buf + i);
330  		}
331  	}
332  
333  	return i;
334  }
335  #endif
336  
/*
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 *
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, which must
 * avoid pointers to static data (at least on x86).
 */
static void bcj_apply(struct xz_dec_bcj *s,
		      uint8_t *buf, size_t *pos, size_t size)
{
	size_t filtered;

	/* Filter only the not-yet-processed tail of the buffer. */
	buf += *pos;
	size -= *pos;

	switch (s->type) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
		filtered = bcj_x86(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
		filtered = bcj_powerpc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
		filtered = bcj_ia64(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
		filtered = bcj_arm(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
		filtered = bcj_armthumb(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
		filtered = bcj_sparc(s, buf, size);
		break;
#endif
	default:
		/* Never reached but silence compiler warnings. */
		filtered = 0;
		break;
	}

	/*
	 * The filter functions return how many bytes they fully processed,
	 * which may be less than size; the rest stays unfiltered for now.
	 */
	*pos += filtered;
	s->pos += filtered;
}
393  
394  /*
395   * Flush pending filtered data from temp to the output buffer.
396   * Move the remaining mixture of possibly filtered and unfiltered
397   * data to the beginning of temp.
398   */
bcj_flush(struct xz_dec_bcj * s,struct xz_buf * b)399  static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
400  {
401  	size_t copy_size;
402  
403  	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
404  	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
405  	b->out_pos += copy_size;
406  
407  	s->temp.filtered -= copy_size;
408  	s->temp.size -= copy_size;
409  	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
410  }
411  
/*
 * The BCJ filter functions are primitive in sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 *
 * Decode data from the next filter in the chain (the LZMA2 decoder) into
 * b->out and run the BCJ filter on it. Returns XZ_STREAM_END when the
 * Block has been fully decoded and flushed, XZ_OK when another call is
 * needed, or an error code propagated from the next filter.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
				     struct xz_dec_lzma2 *lzma2,
				     struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		/* Filter as much of the newly decoded data as possible. */
		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can do that too, since the last few bytes that remain
		 * unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		/* Move the unfiltered tail to temp for the next round. */
		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * to try decoding more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		/* Restore the caller's output buffer description. */
		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}
526  
xz_dec_bcj_create(bool single_call)527  XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
528  {
529  	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
530  	if (s != NULL)
531  		s->single_call = single_call;
532  
533  	return s;
534  }
535  
XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
	/*
	 * Validate the Filter ID: accept only the BCJ decoders that were
	 * compiled in. All supported cases fall through to a single break.
	 */
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	/* Start a fresh Block with all per-Block filter state cleared. */
	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}
573  
574  #endif
575