xref: /openbmc/qemu/block/vhdx-log.c (revision 934676c7b726fe9cccac02a7a4fa23c1e6a4c91e)
1 /*
2  * Block driver for Hyper-V VHDX Images
3  *
4  * Copyright (c) 2013 Red Hat, Inc.,
5  *
6  * Authors:
7  *  Jeff Cody <jcody@redhat.com>
8  *
9  *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012
10  *  by Microsoft:
11  *      https://www.microsoft.com/en-us/download/details.aspx?id=34750
12  *
13  * This file covers the functionality of the metadata log writing, parsing, and
14  * replay.
15  *
16  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
17  * See the COPYING.LIB file in the top-level directory.
18  *
19  */
20 
21 #include "qemu/osdep.h"
22 #include "qapi/error.h"
23 #include "block/block-io.h"
24 #include "block/block_int.h"
25 #include "qemu/error-report.h"
26 #include "qemu/bswap.h"
27 #include "qemu/memalign.h"
28 #include "vhdx.h"
29 
30 
31 typedef struct VHDXLogSequence {
32     bool valid;
33     uint32_t count;
34     VHDXLogEntries log;
35     VHDXLogEntryHeader hdr;
36 } VHDXLogSequence;
37 
38 typedef struct VHDXLogDescEntries {
39     VHDXLogEntryHeader hdr;
40     VHDXLogDescriptor desc[];
41 } VHDXLogDescEntries;
42 
43 static const MSGUID zero_guid = { 0 };
44 
45 /* The log located on the disk is circular buffer containing
46  * sectors of 4096 bytes each.
47  *
48  * It is assumed for the read/write functions below that the
49  * circular buffer scheme uses a 'one sector open' to indicate
50  * the buffer is full.  Given the validation methods used for each
51  * sector, this method should be compatible with other methods that
52  * do not waste a sector.
53  */
54 
55 
56 /* Allow peeking at the hdr entry at the beginning of the current
57  * read index, without advancing the read index */
58 static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
59                              VHDXLogEntryHeader *hdr)
60 {
61     int ret = 0;
62     uint64_t offset;
63     uint32_t read;
64 
65     assert(hdr != NULL);
66 
67     /* peek is only supported on sector boundaries */
68     if (log->read % VHDX_LOG_SECTOR_SIZE) {
69         ret = -EFAULT;
70         goto exit;
71     }
72 
73     read = log->read;
74     /* we are guaranteed that a) log sectors are 4096 bytes,
75      * and b) the log length is a multiple of 1MB. So, there
76      * is always a round number of sectors in the buffer */
77     if ((read + sizeof(VHDXLogEntryHeader)) > log->length) {
78         read = 0;
79     }
80 
81     if (read == log->write) {
82         ret = -EINVAL;
83         goto exit;
84     }
85 
86     offset = log->offset + read;
87 
88     ret = bdrv_pread(bs->file, offset, sizeof(VHDXLogEntryHeader), hdr, 0);
89     if (ret < 0) {
90         goto exit;
91     }
92     vhdx_log_entry_hdr_le_import(hdr);
93 
94 exit:
95     return ret;
96 }
97 
98 /* Index increment for log, based on sector boundaries */
99 static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)
100 {
101     idx += VHDX_LOG_SECTOR_SIZE;
102     /* we are guaranteed that a) log sectors are 4096 bytes,
103      * and b) the log length is a multiple of 1MB. So, there
104      * is always a round number of sectors in the buffer */
105     return idx >= length ? 0 : idx;
106 }
107 
108 
109 /* Reset the log to empty */
110 static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
111 {
112     MSGUID guid = { 0 };
113     s->log.read = s->log.write = 0;
114     /* a log guid of 0 indicates an empty log to any parser of v0
115      * VHDX logs */
116     vhdx_update_headers(bs, s, false, &guid);
117 }
118 
119 /* Reads num_sectors from the log (all log sectors are 4096 bytes),
120  * into buffer 'buffer'.  Upon return, *sectors_read will contain
121  * the number of sectors successfully read.
122  *
123  * It is assumed that 'buffer' is already allocated, and of sufficient
124  * size (i.e. >= 4096*num_sectors).
125  *
126  * If 'peek' is true, then the tail (read) pointer for the circular buffer is
127  * not modified.
128  *
129  * 0 is returned on success, -errno otherwise.  */
130 static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
131                                  uint32_t *sectors_read, void *buffer,
132                                  uint32_t num_sectors, bool peek)
133 {
134     int ret = 0;
135     uint64_t offset;
136     uint32_t read;
137 
138     read = log->read;
139 
140     *sectors_read = 0;
141     while (num_sectors) {
142         if (read == log->write) {
143             /* empty */
144             break;
145         }
146         offset = log->offset + read;
147 
148         ret = bdrv_pread(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer, 0);
149         if (ret < 0) {
150             goto exit;
151         }
152         read = vhdx_log_inc_idx(read, log->length);
153 
154         *sectors_read = *sectors_read + 1;
155         num_sectors--;
156     }
157 
158 exit:
159     if (!peek) {
160         log->read = read;
161     }
162     return ret;
163 }
164 
165 /* Writes num_sectors to the log (all log sectors are 4096 bytes),
166  * from buffer 'buffer'.  Upon return, *sectors_written will contain
167  * the number of sectors successfully written.
168  *
169  * It is assumed that 'buffer' is at least 4096*num_sectors large.
170  *
171  * 0 is returned on success, -errno otherwise */
172 static int coroutine_fn GRAPH_RDLOCK
173 vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
174                        uint32_t *sectors_written, void *buffer,
175                        uint32_t num_sectors)
176 {
177     int ret = 0;
178     uint64_t offset;
179     uint32_t write;
180     void *buffer_tmp;
181     BDRVVHDXState *s = bs->opaque;
182 
183     ret = vhdx_user_visible_write(bs, s);
184     if (ret < 0) {
185         goto exit;
186     }
187 
188     write = log->write;
189 
190     buffer_tmp = buffer;
191     while (num_sectors) {
192 
193         offset = log->offset + write;
194         write = vhdx_log_inc_idx(write, log->length);
195         if (write == log->read) {
196             /* full */
197             break;
198         }
199         ret = bdrv_co_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp, 0);
200         if (ret < 0) {
201             goto exit;
202         }
203         buffer_tmp += VHDX_LOG_SECTOR_SIZE;
204 
205         log->write = write;
206         *sectors_written = *sectors_written + 1;
207         num_sectors--;
208     }
209 
210 exit:
211     return ret;
212 }
213 
214 
215 /* Validates a log entry header */
216 static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
217                                   BDRVVHDXState *s)
218 {
219     int valid = false;
220 
221     if (hdr->signature != VHDX_LOG_SIGNATURE) {
222         goto exit;
223     }
224 
225     /* if the individual entry length is larger than the whole log
226      * buffer, that is obviously invalid */
227     if (log->length < hdr->entry_length) {
228         goto exit;
229     }
230 
231     /* length of entire entry must be in units of 4KB (log sector size) */
232     if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) {
233         goto exit;
234     }
235 
236     /* per spec, sequence # must be > 0 */
237     if (hdr->sequence_number == 0) {
238         goto exit;
239     }
240 
241     /* log entries are only valid if they match the file-wide log guid
242      * found in the active header */
243     if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) {
244         goto exit;
245     }
246 
247     if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) {
248         goto exit;
249     }
250 
251     valid = true;
252 
253 exit:
254     return valid;
255 }
256 
257 /*
258  * Given a log header, this will validate that the descriptors and the
259  * corresponding data sectors (if applicable)
260  *
261  * Validation consists of:
262  *      1. Making sure the sequence numbers matches the entry header
263  *      2. Verifying a valid signature ('zero' or 'desc' for descriptors)
264  *      3. File offset field is a multiple of 4KB
265  *      4. If a data descriptor, the corresponding data sector
266  *         has its signature ('data') and matching sequence number
267  *
268  * @desc: the data buffer containing the descriptor
269  * @hdr:  the log entry header
270  *
271  * Returns true if valid
272  */
273 static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
274                                    VHDXLogEntryHeader *hdr)
275 {
276     bool ret = false;
277 
278     if (desc->sequence_number != hdr->sequence_number) {
279         goto exit;
280     }
281     if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) {
282         goto exit;
283     }
284 
285     if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
286         if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
287             /* valid */
288             ret = true;
289         }
290     } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
291             /* valid */
292             ret = true;
293     }
294 
295 exit:
296     return ret;
297 }
298 
299 
300 /* Prior to sector data for a log entry, there is the header
301  * and the descriptors referenced in the header:
302  *
303  * [] = 4KB sector
304  *
305  * [ hdr, desc ][   desc   ][ ... ][ data ][ ... ]
306  *
307  * The first sector in a log entry has a 64 byte header, and
308  * up to 126 32-byte descriptors.  If more descriptors than
309  * 126 are required, then subsequent sectors can have up to 128
310  * descriptors.  Each sector is 4KB.  Data follows the descriptor
311  * sectors.
312  *
313  * This will return the number of sectors needed to encompass
314  * the passed number of descriptors in desc_cnt.
315  *
316  * This will never return 0, even if desc_cnt is 0.
317  */
318 static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
319 {
320     uint32_t desc_sectors;
321 
322     desc_cnt += 2; /* account for header in first sector */
323     desc_sectors = desc_cnt / 128;
324     if (desc_cnt % 128) {
325         desc_sectors++;
326     }
327 
328     return desc_sectors;
329 }
330 
331 
332 /* Reads the log header, and subsequent descriptors (if any).  This
333  * will allocate all the space for buffer, which must be NULL when
334  * passed into this function. Each descriptor will also be validated,
335  * and error returned if any are invalid. */
336 static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
337                               VHDXLogEntries *log, VHDXLogDescEntries **buffer,
338                               bool convert_endian)
339 {
340     int ret = 0;
341     uint32_t desc_sectors;
342     uint32_t sectors_read;
343     VHDXLogEntryHeader hdr;
344     VHDXLogDescEntries *desc_entries = NULL;
345     VHDXLogDescriptor desc;
346     int i;
347 
348     assert(*buffer == NULL);
349 
350     ret = vhdx_log_peek_hdr(bs, log, &hdr);
351     if (ret < 0) {
352         goto exit;
353     }
354 
355     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
356         ret = -EINVAL;
357         goto exit;
358     }
359 
360     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
361     desc_entries = qemu_try_blockalign(bs->file->bs,
362                                        desc_sectors * VHDX_LOG_SECTOR_SIZE);
363     if (desc_entries == NULL) {
364         ret = -ENOMEM;
365         goto exit;
366     }
367 
368     ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
369                                 desc_sectors, false);
370     if (ret < 0) {
371         goto free_and_exit;
372     }
373     if (sectors_read != desc_sectors) {
374         ret = -EINVAL;
375         goto free_and_exit;
376     }
377 
378     /* put in proper endianness, and validate each desc */
379     for (i = 0; i < hdr.descriptor_count; i++) {
380         desc = desc_entries->desc[i];
381         vhdx_log_desc_le_import(&desc);
382         if (convert_endian) {
383             desc_entries->desc[i] = desc;
384         }
385         if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
386             ret = -EINVAL;
387             goto free_and_exit;
388         }
389     }
390     if (convert_endian) {
391         desc_entries->hdr = hdr;
392     }
393 
394     *buffer = desc_entries;
395     goto exit;
396 
397 free_and_exit:
398     qemu_vfree(desc_entries);
399 exit:
400     return ret;
401 }
402 
403 
404 /* Flushes the descriptor described by desc to the VHDX image file.
405  * If the descriptor is a data descriptor, than 'data' must be non-NULL,
406  * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
407  * written.
408  *
409  * Verification is performed to make sure the sequence numbers of a data
410  * descriptor match the sequence number in the desc.
411  *
412  * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
413  * In this case, it should be noted that zeroes are written to disk, and the
414  * image file is not extended as a sparse file.  */
415 static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
416                                VHDXLogDataSector *data)
417 {
418     int ret = 0;
419     uint64_t seq, file_offset;
420     uint32_t offset = 0;
421     void *buffer = NULL;
422     uint64_t count = 1;
423     int i;
424 
425     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
426 
427     if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
428         /* data sector */
429         if (data == NULL) {
430             ret = -EFAULT;
431             goto exit;
432         }
433 
434         /* The sequence number of the data sector must match that
435          * in the descriptor */
436         seq = data->sequence_high;
437         seq <<= 32;
438         seq |= data->sequence_low & 0xffffffff;
439 
440         if (seq != desc->sequence_number) {
441             ret = -EINVAL;
442             goto exit;
443         }
444 
445         /* Each data sector is in total 4096 bytes, however the first
446          * 8 bytes, and last 4 bytes, are located in the descriptor */
447         memcpy(buffer, &desc->leading_bytes, 8);
448         offset += 8;
449 
450         memcpy(buffer+offset, data->data, 4084);
451         offset += 4084;
452 
453         memcpy(buffer+offset, &desc->trailing_bytes, 4);
454 
455     } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
456         /* write 'count' sectors of sector */
457         memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
458         count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
459     } else {
460         error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
461                       desc->signature);
462         ret = -EINVAL;
463         goto exit;
464     }
465 
466     file_offset = desc->file_offset;
467 
468     /* count is only > 1 if we are writing zeroes */
469     for (i = 0; i < count; i++) {
470         ret = bdrv_pwrite_sync(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
471                                buffer, 0);
472         if (ret < 0) {
473             goto exit;
474         }
475         file_offset += VHDX_LOG_SECTOR_SIZE;
476     }
477 
478 exit:
479     qemu_vfree(buffer);
480     return ret;
481 }
482 
483 /* Flush the entire log (as described by 'logs') to the VHDX image
484  * file, and then set the log to 'empty' status once complete.
485  *
486  * The log entries should be validate prior to flushing */
487 static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
488                           VHDXLogSequence *logs)
489 {
490     int ret = 0;
491     int i;
492     uint32_t cnt, sectors_read;
493     uint64_t new_file_size;
494     void *data = NULL;
495     int64_t file_length;
496     VHDXLogDescEntries *desc_entries = NULL;
497     VHDXLogEntryHeader hdr_tmp = { 0 };
498 
499     cnt = logs->count;
500 
501     data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
502 
503     ret = vhdx_user_visible_write(bs, s);
504     if (ret < 0) {
505         goto exit;
506     }
507 
508     /* each iteration represents one log sequence, which may span multiple
509      * sectors */
510     while (cnt--) {
511         ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp);
512         if (ret < 0) {
513             goto exit;
514         }
515         file_length = bdrv_getlength(bs->file->bs);
516         if (file_length < 0) {
517             ret = file_length;
518             goto exit;
519         }
520         /* if the log shows a FlushedFileOffset larger than our current file
521          * size, then that means the file has been truncated / corrupted, and
522          * we must refused to open it / use it */
523         if (hdr_tmp.flushed_file_offset > file_length) {
524             ret = -EINVAL;
525             goto exit;
526         }
527 
528         ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
529         if (ret < 0) {
530             goto exit;
531         }
532 
533         for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
534             if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
535                 /* data sector, so read a sector to flush */
536                 ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
537                                             data, 1, false);
538                 if (ret < 0) {
539                     goto exit;
540                 }
541                 if (sectors_read != 1) {
542                     ret = -EINVAL;
543                     goto exit;
544                 }
545                 vhdx_log_data_le_import(data);
546             }
547 
548             ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
549             if (ret < 0) {
550                 goto exit;
551             }
552         }
553         if (file_length < desc_entries->hdr.last_file_offset) {
554             new_file_size = desc_entries->hdr.last_file_offset;
555             if (new_file_size % (1 * MiB)) {
556                 /* round up to nearest 1MB boundary */
557                 new_file_size = QEMU_ALIGN_UP(new_file_size, MiB);
558                 if (new_file_size > INT64_MAX) {
559                     ret = -EINVAL;
560                     goto exit;
561                 }
562                 ret = bdrv_truncate(bs->file, new_file_size, false,
563                                     PREALLOC_MODE_OFF, 0, NULL);
564                 if (ret < 0) {
565                     goto exit;
566                 }
567             }
568         }
569         qemu_vfree(desc_entries);
570         desc_entries = NULL;
571     }
572 
573     ret = bdrv_flush(bs);
574     if (ret < 0) {
575         goto exit;
576     }
577     /* once the log is fully flushed, indicate that we have an empty log
578      * now.  This also sets the log guid to 0, to indicate an empty log */
579     vhdx_log_reset(bs, s);
580 
581 exit:
582     qemu_vfree(data);
583     qemu_vfree(desc_entries);
584     return ret;
585 }
586 
587 static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
588                                    VHDXLogEntries *log, uint64_t seq,
589                                    bool *valid, VHDXLogEntryHeader *entry)
590 {
591     int ret = 0;
592     VHDXLogEntryHeader hdr;
593     void *buffer = NULL;
594     uint32_t i, desc_sectors, total_sectors, crc;
595     uint32_t sectors_read = 0;
596     VHDXLogDescEntries *desc_buffer = NULL;
597 
598     *valid = false;
599 
600     ret = vhdx_log_peek_hdr(bs, log, &hdr);
601     if (ret < 0) {
602         goto inc_and_exit;
603     }
604 
605     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
606         goto inc_and_exit;
607     }
608 
609     if (seq > 0) {
610         if (hdr.sequence_number != seq + 1) {
611             goto inc_and_exit;
612         }
613     }
614 
615     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
616 
617     /* Read all log sectors, and calculate log checksum */
618 
619     total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
620 
621 
622     /* read_desc() will increment the read idx */
623     ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
624     if (ret < 0) {
625         goto free_and_exit;
626     }
627 
628     crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer,
629                             desc_sectors * VHDX_LOG_SECTOR_SIZE, 4);
630     crc ^= 0xffffffff;
631 
632     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
633     if (total_sectors > desc_sectors) {
634         for (i = 0; i < total_sectors - desc_sectors; i++) {
635             sectors_read = 0;
636             ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer,
637                                         1, false);
638             if (ret < 0 || sectors_read != 1) {
639                 goto free_and_exit;
640             }
641             crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1);
642             crc ^= 0xffffffff;
643         }
644     }
645     crc ^= 0xffffffff;
646     if (crc != hdr.checksum) {
647         goto free_and_exit;
648     }
649 
650     *valid = true;
651     *entry = hdr;
652     goto free_and_exit;
653 
654 inc_and_exit:
655     log->read = vhdx_log_inc_idx(log->read, log->length);
656 
657 free_and_exit:
658     qemu_vfree(buffer);
659     qemu_vfree(desc_buffer);
660     return ret;
661 }
662 
663 /* Search through the log circular buffer, and find the valid, active
664  * log sequence, if any exists
665  * */
666 static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s,
667                            VHDXLogSequence *logs)
668 {
669     int ret = 0;
670     uint32_t tail;
671     bool seq_valid = false;
672     VHDXLogSequence candidate = { 0 };
673     VHDXLogEntryHeader hdr = { 0 };
674     VHDXLogEntries curr_log;
675 
676     memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries));
677     curr_log.write = curr_log.length;   /* assume log is full */
678     curr_log.read = 0;
679 
680 
681     /* now we will go through the whole log sector by sector, until
682      * we find a valid, active log sequence, or reach the end of the
683      * log buffer */
684     for (;;) {
685         uint64_t curr_seq = 0;
686         VHDXLogSequence current = { 0 };
687 
688         tail = curr_log.read;
689 
690         ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
691                                       &seq_valid, &hdr);
692         if (ret < 0) {
693             goto exit;
694         }
695 
696         if (seq_valid) {
697             current.valid     = true;
698             current.log       = curr_log;
699             current.log.read  = tail;
700             current.log.write = curr_log.read;
701             current.count     = 1;
702             current.hdr       = hdr;
703 
704 
705             for (;;) {
706                 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
707                                               &seq_valid, &hdr);
708                 if (ret < 0) {
709                     goto exit;
710                 }
711                 if (seq_valid == false) {
712                     break;
713                 }
714                 current.log.write = curr_log.read;
715                 current.count++;
716 
717                 curr_seq = hdr.sequence_number;
718             }
719         }
720 
721         if (current.valid) {
722             if (candidate.valid == false ||
723                 current.hdr.sequence_number > candidate.hdr.sequence_number) {
724                 candidate = current;
725             }
726         }
727 
728         if (curr_log.read < tail) {
729             break;
730         }
731     }
732 
733     *logs = candidate;
734 
735     if (candidate.valid) {
736         /* this is the next sequence number, for writes */
737         s->log.sequence = candidate.hdr.sequence_number + 1;
738     }
739 
740 
741 exit:
742     return ret;
743 }
744 
745 /* Parse the replay log.  Per the VHDX spec, if the log is present
746  * it must be replayed prior to opening the file, even read-only.
747  *
748  * If read-only, we must replay the log in RAM (or refuse to open
749  * a dirty VHDX file read-only) */
750 int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
751                    Error **errp)
752 {
753     int ret = 0;
754     VHDXHeader *hdr;
755     VHDXLogSequence logs = { 0 };
756 
757     hdr = s->headers[s->curr_header];
758 
759     *flushed = false;
760 
761     /* s->log.hdr is freed in vhdx_close() */
762     if (s->log.hdr == NULL) {
763         s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader));
764     }
765 
766     s->log.offset = hdr->log_offset;
767     s->log.length = hdr->log_length;
768 
769     if (s->log.offset < VHDX_LOG_MIN_SIZE ||
770         s->log.offset % VHDX_LOG_MIN_SIZE) {
771         ret = -EINVAL;
772         goto exit;
773     }
774 
775     /* per spec, only log version of 0 is supported */
776     if (hdr->log_version != 0) {
777         ret = -EINVAL;
778         goto exit;
779     }
780 
781     /* If either the log guid, or log length is zero,
782      * then a replay log is not present */
783     if (guid_eq(hdr->log_guid, zero_guid)) {
784         goto exit;
785     }
786 
787     if (hdr->log_length == 0) {
788         goto exit;
789     }
790 
791     if (hdr->log_length % VHDX_LOG_MIN_SIZE) {
792         ret = -EINVAL;
793         goto exit;
794     }
795 
796 
797     /* The log is present, we need to find if and where there is an active
798      * sequence of valid entries present in the log.  */
799 
800     ret = vhdx_log_search(bs, s, &logs);
801     if (ret < 0) {
802         goto exit;
803     }
804 
805     if (logs.valid) {
806         if (bdrv_is_read_only(bs)) {
807             bdrv_refresh_filename(bs);
808             ret = -EPERM;
809             error_setg(errp,
810                        "VHDX image file '%s' opened read-only, but "
811                        "contains a log that needs to be replayed",
812                        bs->filename);
813             error_append_hint(errp,  "To replay the log, run:\n"
814                               "qemu-img check -r all '%s'\n",
815                               bs->filename);
816             goto exit;
817         }
818         /* now flush the log */
819         ret = vhdx_log_flush(bs, s, &logs);
820         if (ret < 0) {
821             goto exit;
822         }
823         *flushed = true;
824     }
825 
826 
827 exit:
828     return ret;
829 }
830 
831 
832 
833 static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc,
834                                       VHDXLogDataSector *sector, void *data,
835                                       uint64_t seq)
836 {
837     /* 8 + 4084 + 4 = 4096, 1 log sector */
838     memcpy(&desc->leading_bytes, data, 8);
839     data += 8;
840     desc->leading_bytes = cpu_to_le64(desc->leading_bytes);
841     memcpy(sector->data, data, 4084);
842     data += 4084;
843     memcpy(&desc->trailing_bytes, data, 4);
844     desc->trailing_bytes = cpu_to_le32(desc->trailing_bytes);
845     data += 4;
846 
847     sector->sequence_high  = (uint32_t) (seq >> 32);
848     sector->sequence_low   = (uint32_t) (seq & 0xffffffff);
849     sector->data_signature = VHDX_LOG_DATA_SIGNATURE;
850 
851     vhdx_log_desc_le_export(desc);
852     vhdx_log_data_le_export(sector);
853 }
854 
855 
856 static int coroutine_fn GRAPH_RDLOCK
857 vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
858                void *data, uint32_t length, uint64_t offset)
859 {
860     int ret = 0;
861     void *buffer = NULL;
862     void *merged_sector = NULL;
863     void *data_tmp, *sector_write;
864     unsigned int i;
865     int sector_offset;
866     uint32_t desc_sectors, sectors, total_length;
867     uint32_t sectors_written = 0;
868     uint32_t aligned_length;
869     uint32_t leading_length = 0;
870     uint32_t trailing_length = 0;
871     uint32_t partial_sectors = 0;
872     uint32_t bytes_written = 0;
873     uint64_t file_offset;
874     int64_t file_length;
875     VHDXHeader *header;
876     VHDXLogEntryHeader new_hdr;
877     VHDXLogDescriptor *new_desc = NULL;
878     VHDXLogDataSector *data_sector = NULL;
879     MSGUID new_guid = { 0 };
880 
881     header = s->headers[s->curr_header];
882 
883     /* need to have offset read data, and be on 4096 byte boundary */
884 
885     if (length > header->log_length) {
886         /* no log present.  we could create a log here instead of failing */
887         ret = -EINVAL;
888         goto exit;
889     }
890 
891     if (guid_eq(header->log_guid, zero_guid)) {
892         vhdx_guid_generate(&new_guid);
893         vhdx_update_headers(bs, s, false, &new_guid);
894     } else {
895         /* currently, we require that the log be flushed after
896          * every write. */
897         ret = -ENOTSUP;
898         goto exit;
899     }
900 
901     /* 0 is an invalid sequence number, but may also represent the first
902      * log write (or a wrapped seq) */
903     if (s->log.sequence == 0) {
904         s->log.sequence = 1;
905     }
906 
907     sector_offset = offset % VHDX_LOG_SECTOR_SIZE;
908     file_offset = QEMU_ALIGN_DOWN(offset, VHDX_LOG_SECTOR_SIZE);
909 
910     aligned_length = length;
911 
912     /* add in the unaligned head and tail bytes */
913     if (sector_offset) {
914         leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset);
915         leading_length = leading_length > length ? length : leading_length;
916         aligned_length -= leading_length;
917         partial_sectors++;
918     }
919 
920     sectors = aligned_length / VHDX_LOG_SECTOR_SIZE;
921     trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE);
922     if (trailing_length) {
923         partial_sectors++;
924     }
925 
926     sectors += partial_sectors;
927 
928     file_length = bdrv_co_getlength(bs->file->bs);
929     if (file_length < 0) {
930         ret = file_length;
931         goto exit;
932     }
933 
934     /* sectors is now how many sectors the data itself takes, not
935      * including the header and descriptor metadata */
936 
937     new_hdr = (VHDXLogEntryHeader) {
938                 .signature           = VHDX_LOG_SIGNATURE,
939                 .tail                = s->log.tail,
940                 .sequence_number     = s->log.sequence,
941                 .descriptor_count    = sectors,
942                 .reserved            = 0,
943                 .flushed_file_offset = file_length,
944                 .last_file_offset    = file_length,
945                 .log_guid            = header->log_guid,
946               };
947 
948 
949     desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count);
950 
951     total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE;
952     new_hdr.entry_length = total_length;
953 
954     vhdx_log_entry_hdr_le_export(&new_hdr);
955 
956     buffer = qemu_blockalign(bs, total_length);
957     memcpy(buffer, &new_hdr, sizeof(new_hdr));
958 
959     new_desc = buffer + sizeof(new_hdr);
960     data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE);
961     data_tmp = data;
962 
963     /* All log sectors are 4KB, so for any partial sectors we must
964      * merge the data with preexisting data from the final file
965      * destination */
966     merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
967 
968     for (i = 0; i < sectors; i++) {
969         new_desc->signature       = VHDX_LOG_DESC_SIGNATURE;
970         new_desc->sequence_number = s->log.sequence;
971         new_desc->file_offset     = file_offset;
972 
973         if (i == 0 && leading_length) {
974             /* partial sector at the front of the buffer */
975             ret = bdrv_co_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
976                                 merged_sector, 0);
977             if (ret < 0) {
978                 goto exit;
979             }
980             memcpy(merged_sector + sector_offset, data_tmp, leading_length);
981             bytes_written = leading_length;
982             sector_write = merged_sector;
983         } else if (i == sectors - 1 && trailing_length) {
984             /* partial sector at the end of the buffer */
985             ret = bdrv_co_pread(bs->file, file_offset + trailing_length,
986                                 VHDX_LOG_SECTOR_SIZE - trailing_length,
987                                 merged_sector + trailing_length, 0);
988             if (ret < 0) {
989                 goto exit;
990             }
991             memcpy(merged_sector, data_tmp, trailing_length);
992             bytes_written = trailing_length;
993             sector_write = merged_sector;
994         } else {
995             bytes_written = VHDX_LOG_SECTOR_SIZE;
996             sector_write = data_tmp;
997         }
998 
999         /* populate the raw sector data into the proper structures,
1000          * as well as update the descriptor, and convert to proper
1001          * endianness */
1002         vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write,
1003                                   s->log.sequence);
1004 
1005         data_tmp += bytes_written;
1006         data_sector++;
1007         new_desc++;
1008         file_offset += VHDX_LOG_SECTOR_SIZE;
1009     }
1010 
1011     /* checksum covers entire entry, from the log header through the
1012      * last data sector */
1013     vhdx_update_checksum(buffer, total_length,
1014                          offsetof(VHDXLogEntryHeader, checksum));
1015 
1016     /* now write to the log */
1017     ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
1018                                  desc_sectors + sectors);
1019     if (ret < 0) {
1020         goto exit;
1021     }
1022 
1023     if (sectors_written != desc_sectors + sectors) {
1024         /* instead of failing, we could flush the log here */
1025         ret = -EINVAL;
1026         goto exit;
1027     }
1028 
1029     s->log.sequence++;
1030     /* write new tail */
1031     s->log.tail = s->log.write;
1032 
1033 exit:
1034     qemu_vfree(buffer);
1035     qemu_vfree(merged_sector);
1036     return ret;
1037 }
1038 
1039 /* Perform a log write, and then immediately flush the entire log */
1040 int coroutine_fn
1041 vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
1042                          void *data, uint32_t length, uint64_t offset)
1043 {
1044     int ret = 0;
1045     VHDXLogSequence logs = { .valid = true,
1046                              .count = 1,
1047                              .hdr = { 0 } };
1048 
1049 
1050     /* Make sure data written (new and/or changed blocks) is stable
1051      * on disk, before creating log entry */
1052     ret = bdrv_co_flush(bs);
1053     if (ret < 0) {
1054         goto exit;
1055     }
1056 
1057     ret = vhdx_log_write(bs, s, data, length, offset);
1058     if (ret < 0) {
1059         goto exit;
1060     }
1061     logs.log = s->log;
1062 
1063     /* Make sure log is stable on disk */
1064     ret = bdrv_co_flush(bs);
1065     if (ret < 0) {
1066         goto exit;
1067     }
1068 
1069     ret = vhdx_log_flush(bs, s, &logs);
1070     if (ret < 0) {
1071         goto exit;
1072     }
1073 
1074     s->log = logs.log;
1075 
1076 exit:
1077     return ret;
1078 }
1079 
1080