193c68cc4SChristoph Böhmwalder /* SPDX-License-Identifier: GPL-2.0-only */
2b411b363SPhilipp Reisner /*
3b411b363SPhilipp Reisner -*- linux-c -*-
4b411b363SPhilipp Reisner drbd_receiver.c
5b411b363SPhilipp Reisner This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6b411b363SPhilipp Reisner
7b411b363SPhilipp Reisner Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
8b411b363SPhilipp Reisner Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
9b411b363SPhilipp Reisner Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10b411b363SPhilipp Reisner
11b411b363SPhilipp Reisner */
12b411b363SPhilipp Reisner
13b411b363SPhilipp Reisner #ifndef _DRBD_VLI_H
14b411b363SPhilipp Reisner #define _DRBD_VLI_H
15b411b363SPhilipp Reisner
16b411b363SPhilipp Reisner /*
17b411b363SPhilipp Reisner * At a granularity of 4KiB storage represented per bit,
18b411b363SPhilipp Reisner * and stroage sizes of several TiB,
19b411b363SPhilipp Reisner * and possibly small-bandwidth replication,
20b411b363SPhilipp Reisner * the bitmap transfer time can take much too long,
21b411b363SPhilipp Reisner * if transmitted in plain text.
22b411b363SPhilipp Reisner *
2325985edcSLucas De Marchi * We try to reduce the transferred bitmap information
24b411b363SPhilipp Reisner * by encoding runlengths of bit polarity.
25b411b363SPhilipp Reisner *
26b411b363SPhilipp Reisner * We never actually need to encode a "zero" (runlengths are positive).
27b411b363SPhilipp Reisner * But then we have to store the value of the first bit.
28b411b363SPhilipp Reisner * The first bit of information thus shall encode if the first runlength
29b411b363SPhilipp Reisner * gives the number of set or unset bits.
30b411b363SPhilipp Reisner *
31b411b363SPhilipp Reisner * We assume that large areas are either completely set or unset,
32b411b363SPhilipp Reisner * which gives good compression with any runlength method,
33b411b363SPhilipp Reisner * even when encoding the runlength as fixed size 32bit/64bit integers.
34b411b363SPhilipp Reisner *
35b411b363SPhilipp Reisner * Still, there may be areas where the polarity flips every few bits,
36b411b363SPhilipp Reisner * and encoding the runlength sequence of those areas with fix size
37b411b363SPhilipp Reisner * integers would be much worse than plaintext.
38b411b363SPhilipp Reisner *
39b411b363SPhilipp Reisner * We want to encode small runlength values with minimum code length,
40b411b363SPhilipp Reisner * while still being able to encode a Huge run of all zeros.
41b411b363SPhilipp Reisner *
42b411b363SPhilipp Reisner * Thus we need a Variable Length Integer encoding, VLI.
43b411b363SPhilipp Reisner *
44b411b363SPhilipp Reisner * For some cases, we produce more code bits than plaintext input.
45b411b363SPhilipp Reisner * We need to send incompressible chunks as plaintext, skip over them
46b411b363SPhilipp Reisner * and then see if the next chunk compresses better.
47b411b363SPhilipp Reisner *
48b411b363SPhilipp Reisner * We don't care too much about "excellent" compression ratio for large
49b411b363SPhilipp Reisner * runlengths (all set/all clear): whether we achieve a factor of 100
50b411b363SPhilipp Reisner * or 1000 is not that much of an issue.
51b411b363SPhilipp Reisner * We do not want to waste too much on short runlengths in the "noisy"
52b411b363SPhilipp Reisner * parts of the bitmap, though.
53b411b363SPhilipp Reisner *
54b411b363SPhilipp Reisner * There are endless variants of VLI, we experimented with:
55b411b363SPhilipp Reisner * * simple byte-based
56b411b363SPhilipp Reisner * * various bit based with different code word length.
57b411b363SPhilipp Reisner *
58b411b363SPhilipp Reisner * To avoid yet an other configuration parameter (choice of bitmap compression
59b411b363SPhilipp Reisner * algorithm) which was difficult to explain and tune, we just chose the one
60b411b363SPhilipp Reisner * variant that turned out best in all test cases.
61b411b363SPhilipp Reisner * Based on real world usage patterns, with device sizes ranging from a few GiB
62b411b363SPhilipp Reisner * to several TiB, file server/mailserver/webserver/mysql/postgress,
63b411b363SPhilipp Reisner * mostly idle to really busy, the all time winner (though sometimes only
64b411b363SPhilipp Reisner * marginally better) is:
65b411b363SPhilipp Reisner */
66b411b363SPhilipp Reisner
67b411b363SPhilipp Reisner /*
68b411b363SPhilipp Reisner * encoding is "visualised" as
69b411b363SPhilipp Reisner * __little endian__ bitstream, least significant bit first (left most)
70b411b363SPhilipp Reisner *
71b411b363SPhilipp Reisner * this particular encoding is chosen so that the prefix code
72b411b363SPhilipp Reisner * starts as unary encoding the level, then modified so that
73b411b363SPhilipp Reisner * 10 levels can be described in 8bit, with minimal overhead
74b411b363SPhilipp Reisner * for the smaller levels.
75b411b363SPhilipp Reisner *
76b411b363SPhilipp Reisner * Number of data bits follow fibonacci sequence, with the exception of the
77b411b363SPhilipp Reisner * last level (+1 data bit, so it makes 64bit total). The only worse code when
78b411b363SPhilipp Reisner * encoding bit polarity runlength is 1 plain bits => 2 code bits.
79b411b363SPhilipp Reisner prefix data bits max val Nº data bits
80b411b363SPhilipp Reisner 0 x 0x2 1
81b411b363SPhilipp Reisner 10 x 0x4 1
82b411b363SPhilipp Reisner 110 xx 0x8 2
83b411b363SPhilipp Reisner 1110 xxx 0x10 3
84b411b363SPhilipp Reisner 11110 xxx xx 0x30 5
85b411b363SPhilipp Reisner 111110 xx xxxxxx 0x130 8
86b411b363SPhilipp Reisner 11111100 xxxxxxxx xxxxx 0x2130 13
87b411b363SPhilipp Reisner 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
88b411b363SPhilipp Reisner 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
89b411b363SPhilipp Reisner 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
90b411b363SPhilipp Reisner * maximum encodable value: 0x100000400202130 == 2**56 + some */
91b411b363SPhilipp Reisner
92b411b363SPhilipp Reisner /* compression "table":
93b411b363SPhilipp Reisner transmitted x 0.29
94b411b363SPhilipp Reisner as plaintext x ........................
95b411b363SPhilipp Reisner x ........................
96b411b363SPhilipp Reisner x ........................
97b411b363SPhilipp Reisner x 0.59 0.21........................
98b411b363SPhilipp Reisner x ........................................................
99b411b363SPhilipp Reisner x .. c ...................................................
100b411b363SPhilipp Reisner x 0.44.. o ...................................................
101b411b363SPhilipp Reisner x .......... d ...................................................
102b411b363SPhilipp Reisner x .......... e ...................................................
103b411b363SPhilipp Reisner X............. ...................................................
104b411b363SPhilipp Reisner x.............. b ...................................................
105b411b363SPhilipp Reisner 2.0x............... i ...................................................
106b411b363SPhilipp Reisner #X................ t ...................................................
107b411b363SPhilipp Reisner #................. s ........................... plain bits ..........
108b411b363SPhilipp Reisner -+-----------------------------------------------------------------------
109b411b363SPhilipp Reisner 1 16 32 64
110b411b363SPhilipp Reisner */
111b411b363SPhilipp Reisner
112b411b363SPhilipp Reisner /* LEVEL: (total bits, prefix bits, prefix value),
113b411b363SPhilipp Reisner * sorted ascending by number of total bits.
114b411b363SPhilipp Reisner * The rest of the code table is calculated at compiletime from this. */
115b411b363SPhilipp Reisner
116b411b363SPhilipp Reisner /* fibonacci data 1, 1, ... */
117b411b363SPhilipp Reisner #define VLI_L_1_1() do { \
118b411b363SPhilipp Reisner LEVEL( 2, 1, 0x00); \
119b411b363SPhilipp Reisner LEVEL( 3, 2, 0x01); \
120b411b363SPhilipp Reisner LEVEL( 5, 3, 0x03); \
121b411b363SPhilipp Reisner LEVEL( 7, 4, 0x07); \
122b411b363SPhilipp Reisner LEVEL(10, 5, 0x0f); \
123b411b363SPhilipp Reisner LEVEL(14, 6, 0x1f); \
124b411b363SPhilipp Reisner LEVEL(21, 8, 0x3f); \
125b411b363SPhilipp Reisner LEVEL(29, 8, 0x7f); \
126b411b363SPhilipp Reisner LEVEL(42, 8, 0xbf); \
127b411b363SPhilipp Reisner LEVEL(64, 8, 0xff); \
128b411b363SPhilipp Reisner } while (0)
129b411b363SPhilipp Reisner
130b411b363SPhilipp Reisner /* finds a suitable level to decode the least significant part of in.
131b411b363SPhilipp Reisner * returns number of bits consumed.
132b411b363SPhilipp Reisner *
133b411b363SPhilipp Reisner * BUG() for bad input, as that would mean a buggy code table. */
vli_decode_bits(u64 * out,const u64 in)134b411b363SPhilipp Reisner static inline int vli_decode_bits(u64 *out, const u64 in)
135b411b363SPhilipp Reisner {
136b411b363SPhilipp Reisner u64 adj = 1;
137b411b363SPhilipp Reisner
138b411b363SPhilipp Reisner #define LEVEL(t,b,v) \
139b411b363SPhilipp Reisner do { \
140b411b363SPhilipp Reisner if ((in & ((1 << b) -1)) == v) { \
141b411b363SPhilipp Reisner *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
142b411b363SPhilipp Reisner return t; \
143b411b363SPhilipp Reisner } \
144b411b363SPhilipp Reisner adj += 1ULL << (t - b); \
145b411b363SPhilipp Reisner } while (0)
146b411b363SPhilipp Reisner
147b411b363SPhilipp Reisner VLI_L_1_1();
148b411b363SPhilipp Reisner
149b411b363SPhilipp Reisner /* NOT REACHED, if VLI_LEVELS code table is defined properly */
150b411b363SPhilipp Reisner BUG();
151b411b363SPhilipp Reisner #undef LEVEL
152b411b363SPhilipp Reisner }
153b411b363SPhilipp Reisner
154b411b363SPhilipp Reisner /* return number of code bits needed,
155b411b363SPhilipp Reisner * or negative error number */
__vli_encode_bits(u64 * out,const u64 in)156b411b363SPhilipp Reisner static inline int __vli_encode_bits(u64 *out, const u64 in)
157b411b363SPhilipp Reisner {
158b411b363SPhilipp Reisner u64 max = 0;
159b411b363SPhilipp Reisner u64 adj = 1;
160b411b363SPhilipp Reisner
161b411b363SPhilipp Reisner if (in == 0)
162b411b363SPhilipp Reisner return -EINVAL;
163b411b363SPhilipp Reisner
164b411b363SPhilipp Reisner #define LEVEL(t,b,v) do { \
165b411b363SPhilipp Reisner max += 1ULL << (t - b); \
166b411b363SPhilipp Reisner if (in <= max) { \
167b411b363SPhilipp Reisner if (out) \
168b411b363SPhilipp Reisner *out = ((in - adj) << b) | v; \
169b411b363SPhilipp Reisner return t; \
170b411b363SPhilipp Reisner } \
171b411b363SPhilipp Reisner adj = max + 1; \
172b411b363SPhilipp Reisner } while (0)
173b411b363SPhilipp Reisner
174b411b363SPhilipp Reisner VLI_L_1_1();
175b411b363SPhilipp Reisner
176b411b363SPhilipp Reisner return -EOVERFLOW;
177b411b363SPhilipp Reisner #undef LEVEL
178b411b363SPhilipp Reisner }
179b411b363SPhilipp Reisner
180b411b363SPhilipp Reisner #undef VLI_L_1_1
181b411b363SPhilipp Reisner
182b411b363SPhilipp Reisner /* code from here down is independend of actually used bit code */
183b411b363SPhilipp Reisner
184b411b363SPhilipp Reisner /*
185b411b363SPhilipp Reisner * Code length is determined by some unique (e.g. unary) prefix.
186b411b363SPhilipp Reisner * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
187b411b363SPhilipp Reisner * not a byte stream.
188b411b363SPhilipp Reisner */
189b411b363SPhilipp Reisner
190b411b363SPhilipp Reisner /* for the bitstream, we need a cursor */
191b411b363SPhilipp Reisner struct bitstream_cursor {
192b411b363SPhilipp Reisner /* the current byte */
193b411b363SPhilipp Reisner u8 *b;
194b411b363SPhilipp Reisner /* the current bit within *b, nomalized: 0..7 */
195b411b363SPhilipp Reisner unsigned int bit;
196b411b363SPhilipp Reisner };
197b411b363SPhilipp Reisner
198b411b363SPhilipp Reisner /* initialize cursor to point to first bit of stream */
bitstream_cursor_reset(struct bitstream_cursor * cur,void * s)199b411b363SPhilipp Reisner static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
200b411b363SPhilipp Reisner {
201b411b363SPhilipp Reisner cur->b = s;
202b411b363SPhilipp Reisner cur->bit = 0;
203b411b363SPhilipp Reisner }
204b411b363SPhilipp Reisner
205b411b363SPhilipp Reisner /* advance cursor by that many bits; maximum expected input value: 64,
206b411b363SPhilipp Reisner * but depending on VLI implementation, it may be more. */
bitstream_cursor_advance(struct bitstream_cursor * cur,unsigned int bits)207b411b363SPhilipp Reisner static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
208b411b363SPhilipp Reisner {
209b411b363SPhilipp Reisner bits += cur->bit;
210b411b363SPhilipp Reisner cur->b = cur->b + (bits >> 3);
211b411b363SPhilipp Reisner cur->bit = bits & 7;
212b411b363SPhilipp Reisner }
213b411b363SPhilipp Reisner
214b411b363SPhilipp Reisner /* the bitstream itself knows its length */
215b411b363SPhilipp Reisner struct bitstream {
216b411b363SPhilipp Reisner struct bitstream_cursor cur;
217b411b363SPhilipp Reisner unsigned char *buf;
218b411b363SPhilipp Reisner size_t buf_len; /* in bytes */
219b411b363SPhilipp Reisner
220b411b363SPhilipp Reisner /* for input stream:
221b411b363SPhilipp Reisner * number of trailing 0 bits for padding
222b411b363SPhilipp Reisner * total number of valid bits in stream: buf_len * 8 - pad_bits */
223b411b363SPhilipp Reisner unsigned int pad_bits;
224b411b363SPhilipp Reisner };
225b411b363SPhilipp Reisner
bitstream_init(struct bitstream * bs,void * s,size_t len,unsigned int pad_bits)226b411b363SPhilipp Reisner static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
227b411b363SPhilipp Reisner {
228b411b363SPhilipp Reisner bs->buf = s;
229b411b363SPhilipp Reisner bs->buf_len = len;
230b411b363SPhilipp Reisner bs->pad_bits = pad_bits;
231b411b363SPhilipp Reisner bitstream_cursor_reset(&bs->cur, bs->buf);
232b411b363SPhilipp Reisner }
233b411b363SPhilipp Reisner
bitstream_rewind(struct bitstream * bs)234b411b363SPhilipp Reisner static inline void bitstream_rewind(struct bitstream *bs)
235b411b363SPhilipp Reisner {
236b411b363SPhilipp Reisner bitstream_cursor_reset(&bs->cur, bs->buf);
237b411b363SPhilipp Reisner memset(bs->buf, 0, bs->buf_len);
238b411b363SPhilipp Reisner }
239b411b363SPhilipp Reisner
240b411b363SPhilipp Reisner /* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
241b411b363SPhilipp Reisner * Ignores "pad_bits".
242b411b363SPhilipp Reisner * Returns zero if bits == 0 (nothing to do).
243b411b363SPhilipp Reisner * Returns number of bits used if successful.
244b411b363SPhilipp Reisner *
245b411b363SPhilipp Reisner * If there is not enough room left in bitstream,
246b411b363SPhilipp Reisner * leaves bitstream unchanged and returns -ENOBUFS.
247b411b363SPhilipp Reisner */
bitstream_put_bits(struct bitstream * bs,u64 val,const unsigned int bits)248b411b363SPhilipp Reisner static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
249b411b363SPhilipp Reisner {
250b411b363SPhilipp Reisner unsigned char *b = bs->cur.b;
251b411b363SPhilipp Reisner unsigned int tmp;
252b411b363SPhilipp Reisner
253b411b363SPhilipp Reisner if (bits == 0)
254b411b363SPhilipp Reisner return 0;
255b411b363SPhilipp Reisner
256b411b363SPhilipp Reisner if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
257b411b363SPhilipp Reisner return -ENOBUFS;
258b411b363SPhilipp Reisner
259b411b363SPhilipp Reisner /* paranoia: strip off hi bits; they should not be set anyways. */
260b411b363SPhilipp Reisner if (bits < 64)
261b411b363SPhilipp Reisner val &= ~0ULL >> (64 - bits);
262b411b363SPhilipp Reisner
263b411b363SPhilipp Reisner *b++ |= (val & 0xff) << bs->cur.bit;
264b411b363SPhilipp Reisner
265b411b363SPhilipp Reisner for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
266b411b363SPhilipp Reisner *b++ |= (val >> tmp) & 0xff;
267b411b363SPhilipp Reisner
268b411b363SPhilipp Reisner bitstream_cursor_advance(&bs->cur, bits);
269b411b363SPhilipp Reisner return bits;
270b411b363SPhilipp Reisner }
271b411b363SPhilipp Reisner
272b411b363SPhilipp Reisner /* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
273b411b363SPhilipp Reisner *
274b411b363SPhilipp Reisner * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
275b411b363SPhilipp Reisner *
276b411b363SPhilipp Reisner * If there are less than the requested number of valid bits left in the
277b411b363SPhilipp Reisner * bitstream, still fetches all available bits.
278b411b363SPhilipp Reisner *
279b411b363SPhilipp Reisner * Returns number of actually fetched bits.
280b411b363SPhilipp Reisner */
bitstream_get_bits(struct bitstream * bs,u64 * out,int bits)281b411b363SPhilipp Reisner static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
282b411b363SPhilipp Reisner {
283b411b363SPhilipp Reisner u64 val;
284b411b363SPhilipp Reisner unsigned int n;
285b411b363SPhilipp Reisner
286b411b363SPhilipp Reisner if (bits > 64)
287b411b363SPhilipp Reisner return -EINVAL;
288b411b363SPhilipp Reisner
289b411b363SPhilipp Reisner if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
290b411b363SPhilipp Reisner bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
291b411b363SPhilipp Reisner - bs->cur.bit - bs->pad_bits;
292b411b363SPhilipp Reisner
293b411b363SPhilipp Reisner if (bits == 0) {
294b411b363SPhilipp Reisner *out = 0;
295b411b363SPhilipp Reisner return 0;
296b411b363SPhilipp Reisner }
297b411b363SPhilipp Reisner
298b411b363SPhilipp Reisner /* get the high bits */
299b411b363SPhilipp Reisner val = 0;
300b411b363SPhilipp Reisner n = (bs->cur.bit + bits + 7) >> 3;
301b411b363SPhilipp Reisner /* n may be at most 9, if cur.bit + bits > 64 */
302b411b363SPhilipp Reisner /* which means this copies at most 8 byte */
303b411b363SPhilipp Reisner if (n) {
304b411b363SPhilipp Reisner memcpy(&val, bs->cur.b+1, n - 1);
305b411b363SPhilipp Reisner val = le64_to_cpu(val) << (8 - bs->cur.bit);
306b411b363SPhilipp Reisner }
307b411b363SPhilipp Reisner
308b411b363SPhilipp Reisner /* we still need the low bits */
309b411b363SPhilipp Reisner val |= bs->cur.b[0] >> bs->cur.bit;
310b411b363SPhilipp Reisner
311b411b363SPhilipp Reisner /* and mask out bits we don't want */
312b411b363SPhilipp Reisner val &= ~0ULL >> (64 - bits);
313b411b363SPhilipp Reisner
314b411b363SPhilipp Reisner bitstream_cursor_advance(&bs->cur, bits);
315b411b363SPhilipp Reisner *out = val;
316b411b363SPhilipp Reisner
317b411b363SPhilipp Reisner return bits;
318b411b363SPhilipp Reisner }
319b411b363SPhilipp Reisner
320b411b363SPhilipp Reisner /* encodes @in as vli into @bs;
321b411b363SPhilipp Reisner
322b411b363SPhilipp Reisner * return values
323b411b363SPhilipp Reisner * > 0: number of bits successfully stored in bitstream
324b411b363SPhilipp Reisner * -ENOBUFS @bs is full
325b411b363SPhilipp Reisner * -EINVAL input zero (invalid)
326b411b363SPhilipp Reisner * -EOVERFLOW input too large for this vli code (invalid)
327b411b363SPhilipp Reisner */
vli_encode_bits(struct bitstream * bs,u64 in)328b411b363SPhilipp Reisner static inline int vli_encode_bits(struct bitstream *bs, u64 in)
329b411b363SPhilipp Reisner {
330*06918200SChristoph Böhmwalder u64 code;
331b411b363SPhilipp Reisner int bits = __vli_encode_bits(&code, in);
332b411b363SPhilipp Reisner
333b411b363SPhilipp Reisner if (bits <= 0)
334b411b363SPhilipp Reisner return bits;
335b411b363SPhilipp Reisner
336b411b363SPhilipp Reisner return bitstream_put_bits(bs, code, bits);
337b411b363SPhilipp Reisner }
338b411b363SPhilipp Reisner
339b411b363SPhilipp Reisner #endif
340