1 /* 2 -*- linux-c -*- 3 drbd_receiver.c 4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 10 drbd is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 drbd is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with drbd; see the file COPYING. If not, write to 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 */ 24 25 #ifndef _DRBD_VLI_H 26 #define _DRBD_VLI_H 27 28 /* 29 * At a granularity of 4KiB storage represented per bit, 30 * and stroage sizes of several TiB, 31 * and possibly small-bandwidth replication, 32 * the bitmap transfer time can take much too long, 33 * if transmitted in plain text. 34 * 35 * We try to reduce the transfered bitmap information 36 * by encoding runlengths of bit polarity. 37 * 38 * We never actually need to encode a "zero" (runlengths are positive). 39 * But then we have to store the value of the first bit. 40 * The first bit of information thus shall encode if the first runlength 41 * gives the number of set or unset bits. 42 * 43 * We assume that large areas are either completely set or unset, 44 * which gives good compression with any runlength method, 45 * even when encoding the runlength as fixed size 32bit/64bit integers. 46 * 47 * Still, there may be areas where the polarity flips every few bits, 48 * and encoding the runlength sequence of those areas with fix size 49 * integers would be much worse than plaintext. 50 * 51 * We want to encode small runlength values with minimum code length, 52 * while still being able to encode a Huge run of all zeros. 53 * 54 * Thus we need a Variable Length Integer encoding, VLI. 55 * 56 * For some cases, we produce more code bits than plaintext input. 57 * We need to send incompressible chunks as plaintext, skip over them 58 * and then see if the next chunk compresses better. 59 * 60 * We don't care too much about "excellent" compression ratio for large 61 * runlengths (all set/all clear): whether we achieve a factor of 100 62 * or 1000 is not that much of an issue. 63 * We do not want to waste too much on short runlengths in the "noisy" 64 * parts of the bitmap, though. 65 * 66 * There are endless variants of VLI, we experimented with: 67 * * simple byte-based 68 * * various bit based with different code word length. 69 * 70 * To avoid yet an other configuration parameter (choice of bitmap compression 71 * algorithm) which was difficult to explain and tune, we just chose the one 72 * variant that turned out best in all test cases. 73 * Based on real world usage patterns, with device sizes ranging from a few GiB 74 * to several TiB, file server/mailserver/webserver/mysql/postgress, 75 * mostly idle to really busy, the all time winner (though sometimes only 76 * marginally better) is: 77 */ 78 79 /* 80 * encoding is "visualised" as 81 * __little endian__ bitstream, least significant bit first (left most) 82 * 83 * this particular encoding is chosen so that the prefix code 84 * starts as unary encoding the level, then modified so that 85 * 10 levels can be described in 8bit, with minimal overhead 86 * for the smaller levels. 87 * 88 * Number of data bits follow fibonacci sequence, with the exception of the 89 * last level (+1 data bit, so it makes 64bit total). The only worse code when 90 * encoding bit polarity runlength is 1 plain bits => 2 code bits. 91 prefix data bits max val Nº data bits 92 0 x 0x2 1 93 10 x 0x4 1 94 110 xx 0x8 2 95 1110 xxx 0x10 3 96 11110 xxx xx 0x30 5 97 111110 xx xxxxxx 0x130 8 98 11111100 xxxxxxxx xxxxx 0x2130 13 99 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 100 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 101 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 102 * maximum encodable value: 0x100000400202130 == 2**56 + some */ 103 104 /* compression "table": 105 transmitted x 0.29 106 as plaintext x ........................ 107 x ........................ 108 x ........................ 109 x 0.59 0.21........................ 110 x ........................................................ 111 x .. c ................................................... 112 x 0.44.. o ................................................... 113 x .......... d ................................................... 114 x .......... e ................................................... 115 X............. ................................................... 116 x.............. b ................................................... 117 2.0x............... i ................................................... 118 #X................ t ................................................... 119 #................. s ........................... plain bits .......... 120 -+----------------------------------------------------------------------- 121 1 16 32 64 122 */ 123 124 /* LEVEL: (total bits, prefix bits, prefix value), 125 * sorted ascending by number of total bits. 126 * The rest of the code table is calculated at compiletime from this. */ 127 128 /* fibonacci data 1, 1, ... */ 129 #define VLI_L_1_1() do { \ 130 LEVEL( 2, 1, 0x00); \ 131 LEVEL( 3, 2, 0x01); \ 132 LEVEL( 5, 3, 0x03); \ 133 LEVEL( 7, 4, 0x07); \ 134 LEVEL(10, 5, 0x0f); \ 135 LEVEL(14, 6, 0x1f); \ 136 LEVEL(21, 8, 0x3f); \ 137 LEVEL(29, 8, 0x7f); \ 138 LEVEL(42, 8, 0xbf); \ 139 LEVEL(64, 8, 0xff); \ 140 } while (0) 141 142 /* finds a suitable level to decode the least significant part of in. 143 * returns number of bits consumed. 144 * 145 * BUG() for bad input, as that would mean a buggy code table. */ 146 static inline int vli_decode_bits(u64 *out, const u64 in) 147 { 148 u64 adj = 1; 149 150 #define LEVEL(t,b,v) \ 151 do { \ 152 if ((in & ((1 << b) -1)) == v) { \ 153 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ 154 return t; \ 155 } \ 156 adj += 1ULL << (t - b); \ 157 } while (0) 158 159 VLI_L_1_1(); 160 161 /* NOT REACHED, if VLI_LEVELS code table is defined properly */ 162 BUG(); 163 #undef LEVEL 164 } 165 166 /* return number of code bits needed, 167 * or negative error number */ 168 static inline int __vli_encode_bits(u64 *out, const u64 in) 169 { 170 u64 max = 0; 171 u64 adj = 1; 172 173 if (in == 0) 174 return -EINVAL; 175 176 #define LEVEL(t,b,v) do { \ 177 max += 1ULL << (t - b); \ 178 if (in <= max) { \ 179 if (out) \ 180 *out = ((in - adj) << b) | v; \ 181 return t; \ 182 } \ 183 adj = max + 1; \ 184 } while (0) 185 186 VLI_L_1_1(); 187 188 return -EOVERFLOW; 189 #undef LEVEL 190 } 191 192 #undef VLI_L_1_1 193 194 /* code from here down is independend of actually used bit code */ 195 196 /* 197 * Code length is determined by some unique (e.g. unary) prefix. 198 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, 199 * not a byte stream. 200 */ 201 202 /* for the bitstream, we need a cursor */ 203 struct bitstream_cursor { 204 /* the current byte */ 205 u8 *b; 206 /* the current bit within *b, nomalized: 0..7 */ 207 unsigned int bit; 208 }; 209 210 /* initialize cursor to point to first bit of stream */ 211 static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) 212 { 213 cur->b = s; 214 cur->bit = 0; 215 } 216 217 /* advance cursor by that many bits; maximum expected input value: 64, 218 * but depending on VLI implementation, it may be more. */ 219 static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) 220 { 221 bits += cur->bit; 222 cur->b = cur->b + (bits >> 3); 223 cur->bit = bits & 7; 224 } 225 226 /* the bitstream itself knows its length */ 227 struct bitstream { 228 struct bitstream_cursor cur; 229 unsigned char *buf; 230 size_t buf_len; /* in bytes */ 231 232 /* for input stream: 233 * number of trailing 0 bits for padding 234 * total number of valid bits in stream: buf_len * 8 - pad_bits */ 235 unsigned int pad_bits; 236 }; 237 238 static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) 239 { 240 bs->buf = s; 241 bs->buf_len = len; 242 bs->pad_bits = pad_bits; 243 bitstream_cursor_reset(&bs->cur, bs->buf); 244 } 245 246 static inline void bitstream_rewind(struct bitstream *bs) 247 { 248 bitstream_cursor_reset(&bs->cur, bs->buf); 249 memset(bs->buf, 0, bs->buf_len); 250 } 251 252 /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. 253 * Ignores "pad_bits". 254 * Returns zero if bits == 0 (nothing to do). 255 * Returns number of bits used if successful. 256 * 257 * If there is not enough room left in bitstream, 258 * leaves bitstream unchanged and returns -ENOBUFS. 259 */ 260 static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) 261 { 262 unsigned char *b = bs->cur.b; 263 unsigned int tmp; 264 265 if (bits == 0) 266 return 0; 267 268 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) 269 return -ENOBUFS; 270 271 /* paranoia: strip off hi bits; they should not be set anyways. */ 272 if (bits < 64) 273 val &= ~0ULL >> (64 - bits); 274 275 *b++ |= (val & 0xff) << bs->cur.bit; 276 277 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) 278 *b++ |= (val >> tmp) & 0xff; 279 280 bitstream_cursor_advance(&bs->cur, bits); 281 return bits; 282 } 283 284 /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. 285 * 286 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 287 * 288 * If there are less than the requested number of valid bits left in the 289 * bitstream, still fetches all available bits. 290 * 291 * Returns number of actually fetched bits. 292 */ 293 static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) 294 { 295 u64 val; 296 unsigned int n; 297 298 if (bits > 64) 299 return -EINVAL; 300 301 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) 302 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) 303 - bs->cur.bit - bs->pad_bits; 304 305 if (bits == 0) { 306 *out = 0; 307 return 0; 308 } 309 310 /* get the high bits */ 311 val = 0; 312 n = (bs->cur.bit + bits + 7) >> 3; 313 /* n may be at most 9, if cur.bit + bits > 64 */ 314 /* which means this copies at most 8 byte */ 315 if (n) { 316 memcpy(&val, bs->cur.b+1, n - 1); 317 val = le64_to_cpu(val) << (8 - bs->cur.bit); 318 } 319 320 /* we still need the low bits */ 321 val |= bs->cur.b[0] >> bs->cur.bit; 322 323 /* and mask out bits we don't want */ 324 val &= ~0ULL >> (64 - bits); 325 326 bitstream_cursor_advance(&bs->cur, bits); 327 *out = val; 328 329 return bits; 330 } 331 332 /* encodes @in as vli into @bs; 333 334 * return values 335 * > 0: number of bits successfully stored in bitstream 336 * -ENOBUFS @bs is full 337 * -EINVAL input zero (invalid) 338 * -EOVERFLOW input too large for this vli code (invalid) 339 */ 340 static inline int vli_encode_bits(struct bitstream *bs, u64 in) 341 { 342 u64 code = code; 343 int bits = __vli_encode_bits(&code, in); 344 345 if (bits <= 0) 346 return bits; 347 348 return bitstream_put_bits(bs, code, bits); 349 } 350 351 #endif 352