1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 /* 3 -*- linux-c -*- 4 drbd_receiver.c 5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 6 7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 8 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 9 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 10 11 */ 12 13 #ifndef _DRBD_VLI_H 14 #define _DRBD_VLI_H 15 16 /* 17 * At a granularity of 4KiB storage represented per bit, 18 * and stroage sizes of several TiB, 19 * and possibly small-bandwidth replication, 20 * the bitmap transfer time can take much too long, 21 * if transmitted in plain text. 22 * 23 * We try to reduce the transferred bitmap information 24 * by encoding runlengths of bit polarity. 25 * 26 * We never actually need to encode a "zero" (runlengths are positive). 27 * But then we have to store the value of the first bit. 28 * The first bit of information thus shall encode if the first runlength 29 * gives the number of set or unset bits. 30 * 31 * We assume that large areas are either completely set or unset, 32 * which gives good compression with any runlength method, 33 * even when encoding the runlength as fixed size 32bit/64bit integers. 34 * 35 * Still, there may be areas where the polarity flips every few bits, 36 * and encoding the runlength sequence of those areas with fix size 37 * integers would be much worse than plaintext. 38 * 39 * We want to encode small runlength values with minimum code length, 40 * while still being able to encode a Huge run of all zeros. 41 * 42 * Thus we need a Variable Length Integer encoding, VLI. 43 * 44 * For some cases, we produce more code bits than plaintext input. 45 * We need to send incompressible chunks as plaintext, skip over them 46 * and then see if the next chunk compresses better. 47 * 48 * We don't care too much about "excellent" compression ratio for large 49 * runlengths (all set/all clear): whether we achieve a factor of 100 50 * or 1000 is not that much of an issue. 51 * We do not want to waste too much on short runlengths in the "noisy" 52 * parts of the bitmap, though. 53 * 54 * There are endless variants of VLI, we experimented with: 55 * * simple byte-based 56 * * various bit based with different code word length. 57 * 58 * To avoid yet an other configuration parameter (choice of bitmap compression 59 * algorithm) which was difficult to explain and tune, we just chose the one 60 * variant that turned out best in all test cases. 61 * Based on real world usage patterns, with device sizes ranging from a few GiB 62 * to several TiB, file server/mailserver/webserver/mysql/postgress, 63 * mostly idle to really busy, the all time winner (though sometimes only 64 * marginally better) is: 65 */ 66 67 /* 68 * encoding is "visualised" as 69 * __little endian__ bitstream, least significant bit first (left most) 70 * 71 * this particular encoding is chosen so that the prefix code 72 * starts as unary encoding the level, then modified so that 73 * 10 levels can be described in 8bit, with minimal overhead 74 * for the smaller levels. 75 * 76 * Number of data bits follow fibonacci sequence, with the exception of the 77 * last level (+1 data bit, so it makes 64bit total). The only worse code when 78 * encoding bit polarity runlength is 1 plain bits => 2 code bits. 79 prefix data bits max val Nº data bits 80 0 x 0x2 1 81 10 x 0x4 1 82 110 xx 0x8 2 83 1110 xxx 0x10 3 84 11110 xxx xx 0x30 5 85 111110 xx xxxxxx 0x130 8 86 11111100 xxxxxxxx xxxxx 0x2130 13 87 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 88 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 89 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 90 * maximum encodable value: 0x100000400202130 == 2**56 + some */ 91 92 /* compression "table": 93 transmitted x 0.29 94 as plaintext x ........................ 95 x ........................ 96 x ........................ 97 x 0.59 0.21........................ 98 x ........................................................ 99 x .. c ................................................... 100 x 0.44.. o ................................................... 101 x .......... d ................................................... 102 x .......... e ................................................... 103 X............. ................................................... 104 x.............. b ................................................... 105 2.0x............... i ................................................... 106 #X................ t ................................................... 107 #................. s ........................... plain bits .......... 108 -+----------------------------------------------------------------------- 109 1 16 32 64 110 */ 111 112 /* LEVEL: (total bits, prefix bits, prefix value), 113 * sorted ascending by number of total bits. 114 * The rest of the code table is calculated at compiletime from this. */ 115 116 /* fibonacci data 1, 1, ... */ 117 #define VLI_L_1_1() do { \ 118 LEVEL( 2, 1, 0x00); \ 119 LEVEL( 3, 2, 0x01); \ 120 LEVEL( 5, 3, 0x03); \ 121 LEVEL( 7, 4, 0x07); \ 122 LEVEL(10, 5, 0x0f); \ 123 LEVEL(14, 6, 0x1f); \ 124 LEVEL(21, 8, 0x3f); \ 125 LEVEL(29, 8, 0x7f); \ 126 LEVEL(42, 8, 0xbf); \ 127 LEVEL(64, 8, 0xff); \ 128 } while (0) 129 130 /* finds a suitable level to decode the least significant part of in. 131 * returns number of bits consumed. 132 * 133 * BUG() for bad input, as that would mean a buggy code table. */ 134 static inline int vli_decode_bits(u64 *out, const u64 in) 135 { 136 u64 adj = 1; 137 138 #define LEVEL(t,b,v) \ 139 do { \ 140 if ((in & ((1 << b) -1)) == v) { \ 141 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ 142 return t; \ 143 } \ 144 adj += 1ULL << (t - b); \ 145 } while (0) 146 147 VLI_L_1_1(); 148 149 /* NOT REACHED, if VLI_LEVELS code table is defined properly */ 150 BUG(); 151 #undef LEVEL 152 } 153 154 /* return number of code bits needed, 155 * or negative error number */ 156 static inline int __vli_encode_bits(u64 *out, const u64 in) 157 { 158 u64 max = 0; 159 u64 adj = 1; 160 161 if (in == 0) 162 return -EINVAL; 163 164 #define LEVEL(t,b,v) do { \ 165 max += 1ULL << (t - b); \ 166 if (in <= max) { \ 167 if (out) \ 168 *out = ((in - adj) << b) | v; \ 169 return t; \ 170 } \ 171 adj = max + 1; \ 172 } while (0) 173 174 VLI_L_1_1(); 175 176 return -EOVERFLOW; 177 #undef LEVEL 178 } 179 180 #undef VLI_L_1_1 181 182 /* code from here down is independend of actually used bit code */ 183 184 /* 185 * Code length is determined by some unique (e.g. unary) prefix. 186 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, 187 * not a byte stream. 188 */ 189 190 /* for the bitstream, we need a cursor */ 191 struct bitstream_cursor { 192 /* the current byte */ 193 u8 *b; 194 /* the current bit within *b, nomalized: 0..7 */ 195 unsigned int bit; 196 }; 197 198 /* initialize cursor to point to first bit of stream */ 199 static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) 200 { 201 cur->b = s; 202 cur->bit = 0; 203 } 204 205 /* advance cursor by that many bits; maximum expected input value: 64, 206 * but depending on VLI implementation, it may be more. */ 207 static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) 208 { 209 bits += cur->bit; 210 cur->b = cur->b + (bits >> 3); 211 cur->bit = bits & 7; 212 } 213 214 /* the bitstream itself knows its length */ 215 struct bitstream { 216 struct bitstream_cursor cur; 217 unsigned char *buf; 218 size_t buf_len; /* in bytes */ 219 220 /* for input stream: 221 * number of trailing 0 bits for padding 222 * total number of valid bits in stream: buf_len * 8 - pad_bits */ 223 unsigned int pad_bits; 224 }; 225 226 static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) 227 { 228 bs->buf = s; 229 bs->buf_len = len; 230 bs->pad_bits = pad_bits; 231 bitstream_cursor_reset(&bs->cur, bs->buf); 232 } 233 234 static inline void bitstream_rewind(struct bitstream *bs) 235 { 236 bitstream_cursor_reset(&bs->cur, bs->buf); 237 memset(bs->buf, 0, bs->buf_len); 238 } 239 240 /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. 241 * Ignores "pad_bits". 242 * Returns zero if bits == 0 (nothing to do). 243 * Returns number of bits used if successful. 244 * 245 * If there is not enough room left in bitstream, 246 * leaves bitstream unchanged and returns -ENOBUFS. 247 */ 248 static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) 249 { 250 unsigned char *b = bs->cur.b; 251 unsigned int tmp; 252 253 if (bits == 0) 254 return 0; 255 256 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) 257 return -ENOBUFS; 258 259 /* paranoia: strip off hi bits; they should not be set anyways. */ 260 if (bits < 64) 261 val &= ~0ULL >> (64 - bits); 262 263 *b++ |= (val & 0xff) << bs->cur.bit; 264 265 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) 266 *b++ |= (val >> tmp) & 0xff; 267 268 bitstream_cursor_advance(&bs->cur, bits); 269 return bits; 270 } 271 272 /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. 273 * 274 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 275 * 276 * If there are less than the requested number of valid bits left in the 277 * bitstream, still fetches all available bits. 278 * 279 * Returns number of actually fetched bits. 280 */ 281 static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) 282 { 283 u64 val; 284 unsigned int n; 285 286 if (bits > 64) 287 return -EINVAL; 288 289 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) 290 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) 291 - bs->cur.bit - bs->pad_bits; 292 293 if (bits == 0) { 294 *out = 0; 295 return 0; 296 } 297 298 /* get the high bits */ 299 val = 0; 300 n = (bs->cur.bit + bits + 7) >> 3; 301 /* n may be at most 9, if cur.bit + bits > 64 */ 302 /* which means this copies at most 8 byte */ 303 if (n) { 304 memcpy(&val, bs->cur.b+1, n - 1); 305 val = le64_to_cpu(val) << (8 - bs->cur.bit); 306 } 307 308 /* we still need the low bits */ 309 val |= bs->cur.b[0] >> bs->cur.bit; 310 311 /* and mask out bits we don't want */ 312 val &= ~0ULL >> (64 - bits); 313 314 bitstream_cursor_advance(&bs->cur, bits); 315 *out = val; 316 317 return bits; 318 } 319 320 /* encodes @in as vli into @bs; 321 322 * return values 323 * > 0: number of bits successfully stored in bitstream 324 * -ENOBUFS @bs is full 325 * -EINVAL input zero (invalid) 326 * -EOVERFLOW input too large for this vli code (invalid) 327 */ 328 static inline int vli_encode_bits(struct bitstream *bs, u64 in) 329 { 330 u64 code = code; 331 int bits = __vli_encode_bits(&code, in); 332 333 if (bits <= 0) 334 return bits; 335 336 return bitstream_put_bits(bs, code, bits); 337 } 338 339 #endif 340