xref: /openbmc/qemu/block/vhdx-log.c (revision 709395f8)
1 /*
2  * Block driver for Hyper-V VHDX Images
3  *
4  * Copyright (c) 2013 Red Hat, Inc.,
5  *
6  * Authors:
7  *  Jeff Cody <jcody@redhat.com>
8  *
9  *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012
10  *  by Microsoft:
11  *      https://www.microsoft.com/en-us/download/details.aspx?id=34750
12  *
13  * This file covers the functionality of the metadata log writing, parsing, and
14  * replay.
15  *
16  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
17  * See the COPYING.LIB file in the top-level directory.
18  *
19  */
20 #include "qemu/osdep.h"
21 #include "qapi/error.h"
22 #include "qemu-common.h"
23 #include "block/block_int.h"
24 #include "qemu/error-report.h"
25 #include "qemu/module.h"
26 #include "qemu/bswap.h"
27 #include "vhdx.h"
28 
29 
30 typedef struct VHDXLogSequence {
31     bool valid;
32     uint32_t count;
33     VHDXLogEntries log;
34     VHDXLogEntryHeader hdr;
35 } VHDXLogSequence;
36 
37 typedef struct VHDXLogDescEntries {
38     VHDXLogEntryHeader hdr;
39     VHDXLogDescriptor desc[];
40 } VHDXLogDescEntries;
41 
42 static const MSGUID zero_guid = { 0 };
43 
/* The log located on the disk is a circular buffer containing
 * sectors of 4096 bytes each.
 *
 * It is assumed for the read/write functions below that the
 * circular buffer scheme uses a 'one sector open' to indicate
 * the buffer is full.  Given the validation methods used for each
 * sector, this method should be compatible with other methods that
 * do not waste a sector.
 */
53 
54 
55 /* Allow peeking at the hdr entry at the beginning of the current
56  * read index, without advancing the read index */
57 static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
58                              VHDXLogEntryHeader *hdr)
59 {
60     int ret = 0;
61     uint64_t offset;
62     uint32_t read;
63 
64     assert(hdr != NULL);
65 
66     /* peek is only supported on sector boundaries */
67     if (log->read % VHDX_LOG_SECTOR_SIZE) {
68         ret = -EFAULT;
69         goto exit;
70     }
71 
72     read = log->read;
73     /* we are guaranteed that a) log sectors are 4096 bytes,
74      * and b) the log length is a multiple of 1MB. So, there
75      * is always a round number of sectors in the buffer */
76     if ((read + sizeof(VHDXLogEntryHeader)) > log->length) {
77         read = 0;
78     }
79 
80     if (read == log->write) {
81         ret = -EINVAL;
82         goto exit;
83     }
84 
85     offset = log->offset + read;
86 
87     ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader));
88     if (ret < 0) {
89         goto exit;
90     }
91     vhdx_log_entry_hdr_le_import(hdr);
92 
93 exit:
94     return ret;
95 }
96 
97 /* Index increment for log, based on sector boundaries */
98 static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)
99 {
100     idx += VHDX_LOG_SECTOR_SIZE;
101     /* we are guaranteed that a) log sectors are 4096 bytes,
102      * and b) the log length is a multiple of 1MB. So, there
103      * is always a round number of sectors in the buffer */
104     return idx >= length ? 0 : idx;
105 }
106 
107 
108 /* Reset the log to empty */
109 static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
110 {
111     MSGUID guid = { 0 };
112     s->log.read = s->log.write = 0;
113     /* a log guid of 0 indicates an empty log to any parser of v0
114      * VHDX logs */
115     vhdx_update_headers(bs, s, false, &guid);
116 }
117 
118 /* Reads num_sectors from the log (all log sectors are 4096 bytes),
119  * into buffer 'buffer'.  Upon return, *sectors_read will contain
120  * the number of sectors successfully read.
121  *
122  * It is assumed that 'buffer' is already allocated, and of sufficient
123  * size (i.e. >= 4096*num_sectors).
124  *
125  * If 'peek' is true, then the tail (read) pointer for the circular buffer is
126  * not modified.
127  *
128  * 0 is returned on success, -errno otherwise.  */
129 static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
130                                  uint32_t *sectors_read, void *buffer,
131                                  uint32_t num_sectors, bool peek)
132 {
133     int ret = 0;
134     uint64_t offset;
135     uint32_t read;
136 
137     read = log->read;
138 
139     *sectors_read = 0;
140     while (num_sectors) {
141         if (read == log->write) {
142             /* empty */
143             break;
144         }
145         offset = log->offset + read;
146 
147         ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE);
148         if (ret < 0) {
149             goto exit;
150         }
151         read = vhdx_log_inc_idx(read, log->length);
152 
153         *sectors_read = *sectors_read + 1;
154         num_sectors--;
155     }
156 
157 exit:
158     if (!peek) {
159         log->read = read;
160     }
161     return ret;
162 }
163 
164 /* Writes num_sectors to the log (all log sectors are 4096 bytes),
165  * from buffer 'buffer'.  Upon return, *sectors_written will contain
166  * the number of sectors successfully written.
167  *
168  * It is assumed that 'buffer' is at least 4096*num_sectors large.
169  *
170  * 0 is returned on success, -errno otherwise */
171 static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
172                                   uint32_t *sectors_written, void *buffer,
173                                   uint32_t num_sectors)
174 {
175     int ret = 0;
176     uint64_t offset;
177     uint32_t write;
178     void *buffer_tmp;
179     BDRVVHDXState *s = bs->opaque;
180 
181     ret = vhdx_user_visible_write(bs, s);
182     if (ret < 0) {
183         goto exit;
184     }
185 
186     write = log->write;
187 
188     buffer_tmp = buffer;
189     while (num_sectors) {
190 
191         offset = log->offset + write;
192         write = vhdx_log_inc_idx(write, log->length);
193         if (write == log->read) {
194             /* full */
195             break;
196         }
197         ret = bdrv_pwrite(bs->file, offset, buffer_tmp,
198                           VHDX_LOG_SECTOR_SIZE);
199         if (ret < 0) {
200             goto exit;
201         }
202         buffer_tmp += VHDX_LOG_SECTOR_SIZE;
203 
204         log->write = write;
205         *sectors_written = *sectors_written + 1;
206         num_sectors--;
207     }
208 
209 exit:
210     return ret;
211 }
212 
213 
214 /* Validates a log entry header */
215 static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
216                                   BDRVVHDXState *s)
217 {
218     int valid = false;
219 
220     if (hdr->signature != VHDX_LOG_SIGNATURE) {
221         goto exit;
222     }
223 
224     /* if the individual entry length is larger than the whole log
225      * buffer, that is obviously invalid */
226     if (log->length < hdr->entry_length) {
227         goto exit;
228     }
229 
230     /* length of entire entry must be in units of 4KB (log sector size) */
231     if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) {
232         goto exit;
233     }
234 
235     /* per spec, sequence # must be > 0 */
236     if (hdr->sequence_number == 0) {
237         goto exit;
238     }
239 
240     /* log entries are only valid if they match the file-wide log guid
241      * found in the active header */
242     if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) {
243         goto exit;
244     }
245 
246     if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) {
247         goto exit;
248     }
249 
250     valid = true;
251 
252 exit:
253     return valid;
254 }
255 
256 /*
257  * Given a log header, this will validate that the descriptors and the
258  * corresponding data sectors (if applicable)
259  *
260  * Validation consists of:
261  *      1. Making sure the sequence numbers matches the entry header
262  *      2. Verifying a valid signature ('zero' or 'desc' for descriptors)
263  *      3. File offset field is a multiple of 4KB
264  *      4. If a data descriptor, the corresponding data sector
265  *         has its signature ('data') and matching sequence number
266  *
267  * @desc: the data buffer containing the descriptor
268  * @hdr:  the log entry header
269  *
270  * Returns true if valid
271  */
272 static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
273                                    VHDXLogEntryHeader *hdr)
274 {
275     bool ret = false;
276 
277     if (desc->sequence_number != hdr->sequence_number) {
278         goto exit;
279     }
280     if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) {
281         goto exit;
282     }
283 
284     if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
285         if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
286             /* valid */
287             ret = true;
288         }
289     } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
290             /* valid */
291             ret = true;
292     }
293 
294 exit:
295     return ret;
296 }
297 
298 
/* Prior to sector data for a log entry, there is the header
 * and the descriptors referenced in the header:
 *
 * [] = 4KB sector
 *
 * [ hdr, desc ][   desc   ][ ... ][ data ][ ... ]
 *
 * The first sector in a log entry has a 64 byte header, and
 * up to 126 32-byte descriptors.  If more descriptors than
 * 126 are required, then subsequent sectors can have up to 128
 * descriptors.  Each sector is 4KB.  Data follows the descriptor
 * sectors.
 *
 * This will return the number of sectors needed to encompass
 * the passed number of descriptors in desc_cnt.
 *
 * This will never return 0, even if desc_cnt is 0, and the result is
 * also correct for counts all the way up to UINT32_MAX.
 */
static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
{
    /* The 64 byte header takes up the room of two 32-byte descriptors.
     * Widen before adding, so that a count close to UINT32_MAX cannot
     * wrap around and yield a bogus result of 0 sectors. */
    uint64_t slots = (uint64_t)desc_cnt + 2;

    /* 128 slots of 32 bytes per 4KB sector, rounded up */
    return (int)((slots + 127) / 128);
}
329 
330 
331 /* Reads the log header, and subsequent descriptors (if any).  This
332  * will allocate all the space for buffer, which must be NULL when
333  * passed into this function. Each descriptor will also be validated,
334  * and error returned if any are invalid. */
335 static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
336                               VHDXLogEntries *log, VHDXLogDescEntries **buffer,
337                               bool convert_endian)
338 {
339     int ret = 0;
340     uint32_t desc_sectors;
341     uint32_t sectors_read;
342     VHDXLogEntryHeader hdr;
343     VHDXLogDescEntries *desc_entries = NULL;
344     VHDXLogDescriptor desc;
345     int i;
346 
347     assert(*buffer == NULL);
348 
349     ret = vhdx_log_peek_hdr(bs, log, &hdr);
350     if (ret < 0) {
351         goto exit;
352     }
353 
354     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
355         ret = -EINVAL;
356         goto exit;
357     }
358 
359     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
360     desc_entries = qemu_try_blockalign(bs->file->bs,
361                                        desc_sectors * VHDX_LOG_SECTOR_SIZE);
362     if (desc_entries == NULL) {
363         ret = -ENOMEM;
364         goto exit;
365     }
366 
367     ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
368                                 desc_sectors, false);
369     if (ret < 0) {
370         goto free_and_exit;
371     }
372     if (sectors_read != desc_sectors) {
373         ret = -EINVAL;
374         goto free_and_exit;
375     }
376 
377     /* put in proper endianness, and validate each desc */
378     for (i = 0; i < hdr.descriptor_count; i++) {
379         desc = desc_entries->desc[i];
380         vhdx_log_desc_le_import(&desc);
381         if (convert_endian) {
382             desc_entries->desc[i] = desc;
383         }
384         if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
385             ret = -EINVAL;
386             goto free_and_exit;
387         }
388     }
389     if (convert_endian) {
390         desc_entries->hdr = hdr;
391     }
392 
393     *buffer = desc_entries;
394     goto exit;
395 
396 free_and_exit:
397     qemu_vfree(desc_entries);
398 exit:
399     return ret;
400 }
401 
402 
403 /* Flushes the descriptor described by desc to the VHDX image file.
404  * If the descriptor is a data descriptor, than 'data' must be non-NULL,
405  * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
406  * written.
407  *
408  * Verification is performed to make sure the sequence numbers of a data
409  * descriptor match the sequence number in the desc.
410  *
411  * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
412  * In this case, it should be noted that zeroes are written to disk, and the
413  * image file is not extended as a sparse file.  */
414 static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
415                                VHDXLogDataSector *data)
416 {
417     int ret = 0;
418     uint64_t seq, file_offset;
419     uint32_t offset = 0;
420     void *buffer = NULL;
421     uint64_t count = 1;
422     int i;
423 
424     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
425 
426     if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
427         /* data sector */
428         if (data == NULL) {
429             ret = -EFAULT;
430             goto exit;
431         }
432 
433         /* The sequence number of the data sector must match that
434          * in the descriptor */
435         seq = data->sequence_high;
436         seq <<= 32;
437         seq |= data->sequence_low & 0xffffffff;
438 
439         if (seq != desc->sequence_number) {
440             ret = -EINVAL;
441             goto exit;
442         }
443 
444         /* Each data sector is in total 4096 bytes, however the first
445          * 8 bytes, and last 4 bytes, are located in the descriptor */
446         memcpy(buffer, &desc->leading_bytes, 8);
447         offset += 8;
448 
449         memcpy(buffer+offset, data->data, 4084);
450         offset += 4084;
451 
452         memcpy(buffer+offset, &desc->trailing_bytes, 4);
453 
454     } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
455         /* write 'count' sectors of sector */
456         memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
457         count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
458     } else {
459         error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
460                       desc->signature);
461         ret = -EINVAL;
462         goto exit;
463     }
464 
465     file_offset = desc->file_offset;
466 
467     /* count is only > 1 if we are writing zeroes */
468     for (i = 0; i < count; i++) {
469         ret = bdrv_pwrite_sync(bs->file, file_offset, buffer,
470                                VHDX_LOG_SECTOR_SIZE);
471         if (ret < 0) {
472             goto exit;
473         }
474         file_offset += VHDX_LOG_SECTOR_SIZE;
475     }
476 
477 exit:
478     qemu_vfree(buffer);
479     return ret;
480 }
481 
482 /* Flush the entire log (as described by 'logs') to the VHDX image
483  * file, and then set the log to 'empty' status once complete.
484  *
485  * The log entries should be validate prior to flushing */
486 static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
487                           VHDXLogSequence *logs)
488 {
489     int ret = 0;
490     int i;
491     uint32_t cnt, sectors_read;
492     uint64_t new_file_size;
493     void *data = NULL;
494     int64_t file_length;
495     VHDXLogDescEntries *desc_entries = NULL;
496     VHDXLogEntryHeader hdr_tmp = { 0 };
497 
498     cnt = logs->count;
499 
500     data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
501 
502     ret = vhdx_user_visible_write(bs, s);
503     if (ret < 0) {
504         goto exit;
505     }
506 
507     /* each iteration represents one log sequence, which may span multiple
508      * sectors */
509     while (cnt--) {
510         ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp);
511         if (ret < 0) {
512             goto exit;
513         }
514         file_length = bdrv_getlength(bs->file->bs);
515         if (file_length < 0) {
516             ret = file_length;
517             goto exit;
518         }
519         /* if the log shows a FlushedFileOffset larger than our current file
520          * size, then that means the file has been truncated / corrupted, and
521          * we must refused to open it / use it */
522         if (hdr_tmp.flushed_file_offset > file_length) {
523             ret = -EINVAL;
524             goto exit;
525         }
526 
527         ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
528         if (ret < 0) {
529             goto exit;
530         }
531 
532         for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
533             if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
534                 /* data sector, so read a sector to flush */
535                 ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
536                                             data, 1, false);
537                 if (ret < 0) {
538                     goto exit;
539                 }
540                 if (sectors_read != 1) {
541                     ret = -EINVAL;
542                     goto exit;
543                 }
544                 vhdx_log_data_le_import(data);
545             }
546 
547             ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
548             if (ret < 0) {
549                 goto exit;
550             }
551         }
552         if (file_length < desc_entries->hdr.last_file_offset) {
553             new_file_size = desc_entries->hdr.last_file_offset;
554             if (new_file_size % (1024*1024)) {
555                 /* round up to nearest 1MB boundary */
556                 new_file_size = QEMU_ALIGN_UP(new_file_size, MiB);
557                 if (new_file_size > INT64_MAX) {
558                     ret = -EINVAL;
559                     goto exit;
560                 }
561                 ret = bdrv_truncate(bs->file, new_file_size, PREALLOC_MODE_OFF,
562                                     NULL);
563                 if (ret < 0) {
564                     goto exit;
565                 }
566             }
567         }
568         qemu_vfree(desc_entries);
569         desc_entries = NULL;
570     }
571 
572     ret = bdrv_flush(bs);
573     if (ret < 0) {
574         goto exit;
575     }
576     /* once the log is fully flushed, indicate that we have an empty log
577      * now.  This also sets the log guid to 0, to indicate an empty log */
578     vhdx_log_reset(bs, s);
579 
580 exit:
581     qemu_vfree(data);
582     qemu_vfree(desc_entries);
583     return ret;
584 }
585 
586 static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
587                                    VHDXLogEntries *log, uint64_t seq,
588                                    bool *valid, VHDXLogEntryHeader *entry)
589 {
590     int ret = 0;
591     VHDXLogEntryHeader hdr;
592     void *buffer = NULL;
593     uint32_t i, desc_sectors, total_sectors, crc;
594     uint32_t sectors_read = 0;
595     VHDXLogDescEntries *desc_buffer = NULL;
596 
597     *valid = false;
598 
599     ret = vhdx_log_peek_hdr(bs, log, &hdr);
600     if (ret < 0) {
601         goto inc_and_exit;
602     }
603 
604     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
605         goto inc_and_exit;
606     }
607 
608     if (seq > 0) {
609         if (hdr.sequence_number != seq + 1) {
610             goto inc_and_exit;
611         }
612     }
613 
614     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
615 
616     /* Read all log sectors, and calculate log checksum */
617 
618     total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
619 
620 
621     /* read_desc() will increment the read idx */
622     ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
623     if (ret < 0) {
624         goto free_and_exit;
625     }
626 
627     crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer,
628                             desc_sectors * VHDX_LOG_SECTOR_SIZE, 4);
629     crc ^= 0xffffffff;
630 
631     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
632     if (total_sectors > desc_sectors) {
633         for (i = 0; i < total_sectors - desc_sectors; i++) {
634             sectors_read = 0;
635             ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer,
636                                         1, false);
637             if (ret < 0 || sectors_read != 1) {
638                 goto free_and_exit;
639             }
640             crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1);
641             crc ^= 0xffffffff;
642         }
643     }
644     crc ^= 0xffffffff;
645     if (crc != hdr.checksum) {
646         goto free_and_exit;
647     }
648 
649     *valid = true;
650     *entry = hdr;
651     goto free_and_exit;
652 
653 inc_and_exit:
654     log->read = vhdx_log_inc_idx(log->read, log->length);
655 
656 free_and_exit:
657     qemu_vfree(buffer);
658     qemu_vfree(desc_buffer);
659     return ret;
660 }
661 
662 /* Search through the log circular buffer, and find the valid, active
663  * log sequence, if any exists
664  * */
665 static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s,
666                            VHDXLogSequence *logs)
667 {
668     int ret = 0;
669     uint32_t tail;
670     bool seq_valid = false;
671     VHDXLogSequence candidate = { 0 };
672     VHDXLogEntryHeader hdr = { 0 };
673     VHDXLogEntries curr_log;
674 
675     memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries));
676     curr_log.write = curr_log.length;   /* assume log is full */
677     curr_log.read = 0;
678 
679 
680     /* now we will go through the whole log sector by sector, until
681      * we find a valid, active log sequence, or reach the end of the
682      * log buffer */
683     for (;;) {
684         uint64_t curr_seq = 0;
685         VHDXLogSequence current = { 0 };
686 
687         tail = curr_log.read;
688 
689         ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
690                                       &seq_valid, &hdr);
691         if (ret < 0) {
692             goto exit;
693         }
694 
695         if (seq_valid) {
696             current.valid     = true;
697             current.log       = curr_log;
698             current.log.read  = tail;
699             current.log.write = curr_log.read;
700             current.count     = 1;
701             current.hdr       = hdr;
702 
703 
704             for (;;) {
705                 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
706                                               &seq_valid, &hdr);
707                 if (ret < 0) {
708                     goto exit;
709                 }
710                 if (seq_valid == false) {
711                     break;
712                 }
713                 current.log.write = curr_log.read;
714                 current.count++;
715 
716                 curr_seq = hdr.sequence_number;
717             }
718         }
719 
720         if (current.valid) {
721             if (candidate.valid == false ||
722                 current.hdr.sequence_number > candidate.hdr.sequence_number) {
723                 candidate = current;
724             }
725         }
726 
727         if (curr_log.read < tail) {
728             break;
729         }
730     }
731 
732     *logs = candidate;
733 
734     if (candidate.valid) {
735         /* this is the next sequence number, for writes */
736         s->log.sequence = candidate.hdr.sequence_number + 1;
737     }
738 
739 
740 exit:
741     return ret;
742 }
743 
744 /* Parse the replay log.  Per the VHDX spec, if the log is present
745  * it must be replayed prior to opening the file, even read-only.
746  *
747  * If read-only, we must replay the log in RAM (or refuse to open
748  * a dirty VHDX file read-only) */
749 int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
750                    Error **errp)
751 {
752     int ret = 0;
753     VHDXHeader *hdr;
754     VHDXLogSequence logs = { 0 };
755 
756     hdr = s->headers[s->curr_header];
757 
758     *flushed = false;
759 
760     /* s->log.hdr is freed in vhdx_close() */
761     if (s->log.hdr == NULL) {
762         s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader));
763     }
764 
765     s->log.offset = hdr->log_offset;
766     s->log.length = hdr->log_length;
767 
768     if (s->log.offset < VHDX_LOG_MIN_SIZE ||
769         s->log.offset % VHDX_LOG_MIN_SIZE) {
770         ret = -EINVAL;
771         goto exit;
772     }
773 
774     /* per spec, only log version of 0 is supported */
775     if (hdr->log_version != 0) {
776         ret = -EINVAL;
777         goto exit;
778     }
779 
780     /* If either the log guid, or log length is zero,
781      * then a replay log is not present */
782     if (guid_eq(hdr->log_guid, zero_guid)) {
783         goto exit;
784     }
785 
786     if (hdr->log_length == 0) {
787         goto exit;
788     }
789 
790     if (hdr->log_length % VHDX_LOG_MIN_SIZE) {
791         ret = -EINVAL;
792         goto exit;
793     }
794 
795 
796     /* The log is present, we need to find if and where there is an active
797      * sequence of valid entries present in the log.  */
798 
799     ret = vhdx_log_search(bs, s, &logs);
800     if (ret < 0) {
801         goto exit;
802     }
803 
804     if (logs.valid) {
805         if (bs->read_only) {
806             bdrv_refresh_filename(bs);
807             ret = -EPERM;
808             error_setg(errp,
809                        "VHDX image file '%s' opened read-only, but "
810                        "contains a log that needs to be replayed",
811                        bs->filename);
812             error_append_hint(errp,  "To replay the log, run:\n"
813                               "qemu-img check -r all '%s'\n",
814                               bs->filename);
815             goto exit;
816         }
817         /* now flush the log */
818         ret = vhdx_log_flush(bs, s, &logs);
819         if (ret < 0) {
820             goto exit;
821         }
822         *flushed = true;
823     }
824 
825 
826 exit:
827     return ret;
828 }
829 
830 
831 
832 static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc,
833                                       VHDXLogDataSector *sector, void *data,
834                                       uint64_t seq)
835 {
836     /* 8 + 4084 + 4 = 4096, 1 log sector */
837     memcpy(&desc->leading_bytes, data, 8);
838     data += 8;
839     desc->leading_bytes = cpu_to_le64(desc->leading_bytes);
840     memcpy(sector->data, data, 4084);
841     data += 4084;
842     memcpy(&desc->trailing_bytes, data, 4);
843     desc->trailing_bytes = cpu_to_le32(desc->trailing_bytes);
844     data += 4;
845 
846     sector->sequence_high  = (uint32_t) (seq >> 32);
847     sector->sequence_low   = (uint32_t) (seq & 0xffffffff);
848     sector->data_signature = VHDX_LOG_DATA_SIGNATURE;
849 
850     vhdx_log_desc_le_export(desc);
851     vhdx_log_data_le_export(sector);
852 }
853 
854 
855 static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
856                           void *data, uint32_t length, uint64_t offset)
857 {
858     int ret = 0;
859     void *buffer = NULL;
860     void *merged_sector = NULL;
861     void *data_tmp, *sector_write;
862     unsigned int i;
863     int sector_offset;
864     uint32_t desc_sectors, sectors, total_length;
865     uint32_t sectors_written = 0;
866     uint32_t aligned_length;
867     uint32_t leading_length = 0;
868     uint32_t trailing_length = 0;
869     uint32_t partial_sectors = 0;
870     uint32_t bytes_written = 0;
871     uint64_t file_offset;
872     int64_t file_length;
873     VHDXHeader *header;
874     VHDXLogEntryHeader new_hdr;
875     VHDXLogDescriptor *new_desc = NULL;
876     VHDXLogDataSector *data_sector = NULL;
877     MSGUID new_guid = { 0 };
878 
879     header = s->headers[s->curr_header];
880 
881     /* need to have offset read data, and be on 4096 byte boundary */
882 
883     if (length > header->log_length) {
884         /* no log present.  we could create a log here instead of failing */
885         ret = -EINVAL;
886         goto exit;
887     }
888 
889     if (guid_eq(header->log_guid, zero_guid)) {
890         vhdx_guid_generate(&new_guid);
891         vhdx_update_headers(bs, s, false, &new_guid);
892     } else {
893         /* currently, we require that the log be flushed after
894          * every write. */
895         ret = -ENOTSUP;
896         goto exit;
897     }
898 
899     /* 0 is an invalid sequence number, but may also represent the first
900      * log write (or a wrapped seq) */
901     if (s->log.sequence == 0) {
902         s->log.sequence = 1;
903     }
904 
905     sector_offset = offset % VHDX_LOG_SECTOR_SIZE;
906     file_offset = QEMU_ALIGN_DOWN(offset, VHDX_LOG_SECTOR_SIZE);
907 
908     aligned_length = length;
909 
910     /* add in the unaligned head and tail bytes */
911     if (sector_offset) {
912         leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset);
913         leading_length = leading_length > length ? length : leading_length;
914         aligned_length -= leading_length;
915         partial_sectors++;
916     }
917 
918     sectors = aligned_length / VHDX_LOG_SECTOR_SIZE;
919     trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE);
920     if (trailing_length) {
921         partial_sectors++;
922     }
923 
924     sectors += partial_sectors;
925 
926     file_length = bdrv_getlength(bs->file->bs);
927     if (file_length < 0) {
928         ret = file_length;
929         goto exit;
930     }
931 
932     /* sectors is now how many sectors the data itself takes, not
933      * including the header and descriptor metadata */
934 
935     new_hdr = (VHDXLogEntryHeader) {
936                 .signature           = VHDX_LOG_SIGNATURE,
937                 .tail                = s->log.tail,
938                 .sequence_number     = s->log.sequence,
939                 .descriptor_count    = sectors,
940                 .reserved            = 0,
941                 .flushed_file_offset = file_length,
942                 .last_file_offset    = file_length,
943                 .log_guid            = header->log_guid,
944               };
945 
946 
947     desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count);
948 
949     total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE;
950     new_hdr.entry_length = total_length;
951 
952     vhdx_log_entry_hdr_le_export(&new_hdr);
953 
954     buffer = qemu_blockalign(bs, total_length);
955     memcpy(buffer, &new_hdr, sizeof(new_hdr));
956 
957     new_desc = buffer + sizeof(new_hdr);
958     data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE);
959     data_tmp = data;
960 
961     /* All log sectors are 4KB, so for any partial sectors we must
962      * merge the data with preexisting data from the final file
963      * destination */
964     merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
965 
966     for (i = 0; i < sectors; i++) {
967         new_desc->signature       = VHDX_LOG_DESC_SIGNATURE;
968         new_desc->sequence_number = s->log.sequence;
969         new_desc->file_offset     = file_offset;
970 
971         if (i == 0 && leading_length) {
972             /* partial sector at the front of the buffer */
973             ret = bdrv_pread(bs->file, file_offset, merged_sector,
974                              VHDX_LOG_SECTOR_SIZE);
975             if (ret < 0) {
976                 goto exit;
977             }
978             memcpy(merged_sector + sector_offset, data_tmp, leading_length);
979             bytes_written = leading_length;
980             sector_write = merged_sector;
981         } else if (i == sectors - 1 && trailing_length) {
982             /* partial sector at the end of the buffer */
983             ret = bdrv_pread(bs->file,
984                             file_offset,
985                             merged_sector + trailing_length,
986                             VHDX_LOG_SECTOR_SIZE - trailing_length);
987             if (ret < 0) {
988                 goto exit;
989             }
990             memcpy(merged_sector, data_tmp, trailing_length);
991             bytes_written = trailing_length;
992             sector_write = merged_sector;
993         } else {
994             bytes_written = VHDX_LOG_SECTOR_SIZE;
995             sector_write = data_tmp;
996         }
997 
998         /* populate the raw sector data into the proper structures,
999          * as well as update the descriptor, and convert to proper
1000          * endianness */
1001         vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write,
1002                                   s->log.sequence);
1003 
1004         data_tmp += bytes_written;
1005         data_sector++;
1006         new_desc++;
1007         file_offset += VHDX_LOG_SECTOR_SIZE;
1008     }
1009 
1010     /* checksum covers entire entry, from the log header through the
1011      * last data sector */
1012     vhdx_update_checksum(buffer, total_length,
1013                          offsetof(VHDXLogEntryHeader, checksum));
1014 
1015     /* now write to the log */
1016     ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
1017                                  desc_sectors + sectors);
1018     if (ret < 0) {
1019         goto exit;
1020     }
1021 
1022     if (sectors_written != desc_sectors + sectors) {
1023         /* instead of failing, we could flush the log here */
1024         ret = -EINVAL;
1025         goto exit;
1026     }
1027 
1028     s->log.sequence++;
1029     /* write new tail */
1030     s->log.tail = s->log.write;
1031 
1032 exit:
1033     qemu_vfree(buffer);
1034     qemu_vfree(merged_sector);
1035     return ret;
1036 }
1037 
1038 /* Perform a log write, and then immediately flush the entire log */
1039 int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
1040                              void *data, uint32_t length, uint64_t offset)
1041 {
1042     int ret = 0;
1043     VHDXLogSequence logs = { .valid = true,
1044                              .count = 1,
1045                              .hdr = { 0 } };
1046 
1047 
1048     /* Make sure data written (new and/or changed blocks) is stable
1049      * on disk, before creating log entry */
1050     ret = bdrv_flush(bs);
1051     if (ret < 0) {
1052         goto exit;
1053     }
1054 
1055     ret = vhdx_log_write(bs, s, data, length, offset);
1056     if (ret < 0) {
1057         goto exit;
1058     }
1059     logs.log = s->log;
1060 
1061     /* Make sure log is stable on disk */
1062     ret = bdrv_flush(bs);
1063     if (ret < 0) {
1064         goto exit;
1065     }
1066 
1067     ret = vhdx_log_flush(bs, s, &logs);
1068     if (ret < 0) {
1069         goto exit;
1070     }
1071 
1072     s->log = logs.log;
1073 
1074 exit:
1075     return ret;
1076 }
1077 
1078