xref: /openbmc/qemu/block/vhdx-log.c (revision 2df1eb27)
1 /*
2  * Block driver for Hyper-V VHDX Images
3  *
4  * Copyright (c) 2013 Red Hat, Inc.,
5  *
6  * Authors:
7  *  Jeff Cody <jcody@redhat.com>
8  *
9  *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012
10  *  by Microsoft:
11  *      https://www.microsoft.com/en-us/download/details.aspx?id=34750
12  *
13  * This file covers the functionality of the metadata log writing, parsing, and
14  * replay.
15  *
16  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
17  * See the COPYING.LIB file in the top-level directory.
18  *
19  */
20 
21 #include "qemu/osdep.h"
22 #include "qapi/error.h"
23 #include "block/block-io.h"
24 #include "block/block_int.h"
25 #include "qemu/error-report.h"
26 #include "qemu/bswap.h"
27 #include "qemu/memalign.h"
28 #include "vhdx.h"
29 
30 
31 typedef struct VHDXLogSequence {
32     bool valid;
33     uint32_t count;
34     VHDXLogEntries log;
35     VHDXLogEntryHeader hdr;
36 } VHDXLogSequence;
37 
38 typedef struct VHDXLogDescEntries {
39     VHDXLogEntryHeader hdr;
40     VHDXLogDescriptor desc[];
41 } VHDXLogDescEntries;
42 
43 static const MSGUID zero_guid = { 0 };
44 
/* The log located on the disk is a circular buffer containing
 * sectors of 4096 bytes each.
 *
 * It is assumed for the read/write functions below that the
 * circular buffer scheme uses a 'one sector open' to indicate
 * the buffer is full.  Given the validation methods used for each
 * sector, this method should be compatible with other methods that
 * do not waste a sector.
 */
54 
55 
56 /* Allow peeking at the hdr entry at the beginning of the current
57  * read index, without advancing the read index */
58 static int GRAPH_RDLOCK
59 vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
60                   VHDXLogEntryHeader *hdr)
61 {
62     int ret = 0;
63     uint64_t offset;
64     uint32_t read;
65 
66     assert(hdr != NULL);
67 
68     /* peek is only supported on sector boundaries */
69     if (log->read % VHDX_LOG_SECTOR_SIZE) {
70         ret = -EFAULT;
71         goto exit;
72     }
73 
74     read = log->read;
75     /* we are guaranteed that a) log sectors are 4096 bytes,
76      * and b) the log length is a multiple of 1MB. So, there
77      * is always a round number of sectors in the buffer */
78     if ((read + sizeof(VHDXLogEntryHeader)) > log->length) {
79         read = 0;
80     }
81 
82     if (read == log->write) {
83         ret = -EINVAL;
84         goto exit;
85     }
86 
87     offset = log->offset + read;
88 
89     ret = bdrv_pread(bs->file, offset, sizeof(VHDXLogEntryHeader), hdr, 0);
90     if (ret < 0) {
91         goto exit;
92     }
93     vhdx_log_entry_hdr_le_import(hdr);
94 
95 exit:
96     return ret;
97 }
98 
99 /* Index increment for log, based on sector boundaries */
100 static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)
101 {
102     idx += VHDX_LOG_SECTOR_SIZE;
103     /* we are guaranteed that a) log sectors are 4096 bytes,
104      * and b) the log length is a multiple of 1MB. So, there
105      * is always a round number of sectors in the buffer */
106     return idx >= length ? 0 : idx;
107 }
108 
109 
110 /* Reset the log to empty */
111 static void GRAPH_RDLOCK vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
112 {
113     MSGUID guid = { 0 };
114     s->log.read = s->log.write = 0;
115     /* a log guid of 0 indicates an empty log to any parser of v0
116      * VHDX logs */
117     vhdx_update_headers(bs, s, false, &guid);
118 }
119 
120 /* Reads num_sectors from the log (all log sectors are 4096 bytes),
121  * into buffer 'buffer'.  Upon return, *sectors_read will contain
122  * the number of sectors successfully read.
123  *
124  * It is assumed that 'buffer' is already allocated, and of sufficient
125  * size (i.e. >= 4096*num_sectors).
126  *
127  * If 'peek' is true, then the tail (read) pointer for the circular buffer is
128  * not modified.
129  *
130  * 0 is returned on success, -errno otherwise.  */
131 static int GRAPH_RDLOCK
132 vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
133                       uint32_t *sectors_read, void *buffer,
134                       uint32_t num_sectors, bool peek)
135 {
136     int ret = 0;
137     uint64_t offset;
138     uint32_t read;
139 
140     read = log->read;
141 
142     *sectors_read = 0;
143     while (num_sectors) {
144         if (read == log->write) {
145             /* empty */
146             break;
147         }
148         offset = log->offset + read;
149 
150         ret = bdrv_pread(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer, 0);
151         if (ret < 0) {
152             goto exit;
153         }
154         read = vhdx_log_inc_idx(read, log->length);
155 
156         *sectors_read = *sectors_read + 1;
157         num_sectors--;
158     }
159 
160 exit:
161     if (!peek) {
162         log->read = read;
163     }
164     return ret;
165 }
166 
167 /* Writes num_sectors to the log (all log sectors are 4096 bytes),
168  * from buffer 'buffer'.  Upon return, *sectors_written will contain
169  * the number of sectors successfully written.
170  *
171  * It is assumed that 'buffer' is at least 4096*num_sectors large.
172  *
173  * 0 is returned on success, -errno otherwise */
174 static int coroutine_fn GRAPH_RDLOCK
175 vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
176                        uint32_t *sectors_written, void *buffer,
177                        uint32_t num_sectors)
178 {
179     int ret = 0;
180     uint64_t offset;
181     uint32_t write;
182     void *buffer_tmp;
183     BDRVVHDXState *s = bs->opaque;
184 
185     ret = vhdx_user_visible_write(bs, s);
186     if (ret < 0) {
187         goto exit;
188     }
189 
190     write = log->write;
191 
192     buffer_tmp = buffer;
193     while (num_sectors) {
194 
195         offset = log->offset + write;
196         write = vhdx_log_inc_idx(write, log->length);
197         if (write == log->read) {
198             /* full */
199             break;
200         }
201         ret = bdrv_co_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp, 0);
202         if (ret < 0) {
203             goto exit;
204         }
205         buffer_tmp += VHDX_LOG_SECTOR_SIZE;
206 
207         log->write = write;
208         *sectors_written = *sectors_written + 1;
209         num_sectors--;
210     }
211 
212 exit:
213     return ret;
214 }
215 
216 
217 /* Validates a log entry header */
218 static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
219                                   BDRVVHDXState *s)
220 {
221     int valid = false;
222 
223     if (hdr->signature != VHDX_LOG_SIGNATURE) {
224         goto exit;
225     }
226 
227     /* if the individual entry length is larger than the whole log
228      * buffer, that is obviously invalid */
229     if (log->length < hdr->entry_length) {
230         goto exit;
231     }
232 
233     /* length of entire entry must be in units of 4KB (log sector size) */
234     if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) {
235         goto exit;
236     }
237 
238     /* per spec, sequence # must be > 0 */
239     if (hdr->sequence_number == 0) {
240         goto exit;
241     }
242 
243     /* log entries are only valid if they match the file-wide log guid
244      * found in the active header */
245     if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) {
246         goto exit;
247     }
248 
249     if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) {
250         goto exit;
251     }
252 
253     valid = true;
254 
255 exit:
256     return valid;
257 }
258 
259 /*
260  * Given a log header, this will validate that the descriptors and the
261  * corresponding data sectors (if applicable)
262  *
263  * Validation consists of:
264  *      1. Making sure the sequence numbers matches the entry header
265  *      2. Verifying a valid signature ('zero' or 'desc' for descriptors)
266  *      3. File offset field is a multiple of 4KB
267  *      4. If a data descriptor, the corresponding data sector
268  *         has its signature ('data') and matching sequence number
269  *
270  * @desc: the data buffer containing the descriptor
271  * @hdr:  the log entry header
272  *
273  * Returns true if valid
274  */
275 static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
276                                    VHDXLogEntryHeader *hdr)
277 {
278     bool ret = false;
279 
280     if (desc->sequence_number != hdr->sequence_number) {
281         goto exit;
282     }
283     if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) {
284         goto exit;
285     }
286 
287     if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
288         if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
289             /* valid */
290             ret = true;
291         }
292     } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
293             /* valid */
294             ret = true;
295     }
296 
297 exit:
298     return ret;
299 }
300 
301 
/* A log entry starts with its header and the descriptors the header
 * refers to; the data sectors come afterwards:
 *
 * [] = 4KB sector
 *
 * [ hdr, desc ][   desc   ][ ... ][ data ][ ... ]
 *
 * The 64 byte header takes the room of two 32-byte descriptors, so the
 * first sector holds up to 126 descriptors, and every following
 * descriptor sector holds up to 128.
 *
 * Returns the number of 4KB sectors needed to hold the header plus
 * desc_cnt descriptors.  For desc_cnt == 0 this is 1 (the header sector).
 */
static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
{
    /* descriptor-sized slots needed, counting the header as two */
    uint32_t slots = desc_cnt + 2;
    uint32_t full_sectors = slots / 128;
    uint32_t leftover = slots % 128;

    return leftover ? full_sectors + 1 : full_sectors;
}
332 
333 
334 /* Reads the log header, and subsequent descriptors (if any).  This
335  * will allocate all the space for buffer, which must be NULL when
336  * passed into this function. Each descriptor will also be validated,
337  * and error returned if any are invalid. */
338 static int GRAPH_RDLOCK
339 vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, VHDXLogEntries *log,
340                    VHDXLogDescEntries **buffer, bool convert_endian)
341 {
342     int ret = 0;
343     uint32_t desc_sectors;
344     uint32_t sectors_read;
345     VHDXLogEntryHeader hdr;
346     VHDXLogDescEntries *desc_entries = NULL;
347     VHDXLogDescriptor desc;
348     int i;
349 
350     assert(*buffer == NULL);
351 
352     ret = vhdx_log_peek_hdr(bs, log, &hdr);
353     if (ret < 0) {
354         goto exit;
355     }
356 
357     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
358         ret = -EINVAL;
359         goto exit;
360     }
361 
362     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
363     desc_entries = qemu_try_blockalign(bs->file->bs,
364                                        desc_sectors * VHDX_LOG_SECTOR_SIZE);
365     if (desc_entries == NULL) {
366         ret = -ENOMEM;
367         goto exit;
368     }
369 
370     ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
371                                 desc_sectors, false);
372     if (ret < 0) {
373         goto free_and_exit;
374     }
375     if (sectors_read != desc_sectors) {
376         ret = -EINVAL;
377         goto free_and_exit;
378     }
379 
380     /* put in proper endianness, and validate each desc */
381     for (i = 0; i < hdr.descriptor_count; i++) {
382         desc = desc_entries->desc[i];
383         vhdx_log_desc_le_import(&desc);
384         if (convert_endian) {
385             desc_entries->desc[i] = desc;
386         }
387         if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
388             ret = -EINVAL;
389             goto free_and_exit;
390         }
391     }
392     if (convert_endian) {
393         desc_entries->hdr = hdr;
394     }
395 
396     *buffer = desc_entries;
397     goto exit;
398 
399 free_and_exit:
400     qemu_vfree(desc_entries);
401 exit:
402     return ret;
403 }
404 
405 
406 /* Flushes the descriptor described by desc to the VHDX image file.
407  * If the descriptor is a data descriptor, than 'data' must be non-NULL,
408  * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
409  * written.
410  *
411  * Verification is performed to make sure the sequence numbers of a data
412  * descriptor match the sequence number in the desc.
413  *
414  * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
415  * In this case, it should be noted that zeroes are written to disk, and the
416  * image file is not extended as a sparse file.  */
417 static int GRAPH_RDLOCK
418 vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
419                     VHDXLogDataSector *data)
420 {
421     int ret = 0;
422     uint64_t seq, file_offset;
423     uint32_t offset = 0;
424     void *buffer = NULL;
425     uint64_t count = 1;
426     int i;
427 
428     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
429 
430     if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
431         /* data sector */
432         if (data == NULL) {
433             ret = -EFAULT;
434             goto exit;
435         }
436 
437         /* The sequence number of the data sector must match that
438          * in the descriptor */
439         seq = data->sequence_high;
440         seq <<= 32;
441         seq |= data->sequence_low & 0xffffffff;
442 
443         if (seq != desc->sequence_number) {
444             ret = -EINVAL;
445             goto exit;
446         }
447 
448         /* Each data sector is in total 4096 bytes, however the first
449          * 8 bytes, and last 4 bytes, are located in the descriptor */
450         memcpy(buffer, &desc->leading_bytes, 8);
451         offset += 8;
452 
453         memcpy(buffer+offset, data->data, 4084);
454         offset += 4084;
455 
456         memcpy(buffer+offset, &desc->trailing_bytes, 4);
457 
458     } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
459         /* write 'count' sectors of sector */
460         memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
461         count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
462     } else {
463         error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
464                       desc->signature);
465         ret = -EINVAL;
466         goto exit;
467     }
468 
469     file_offset = desc->file_offset;
470 
471     /* count is only > 1 if we are writing zeroes */
472     for (i = 0; i < count; i++) {
473         ret = bdrv_pwrite_sync(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
474                                buffer, 0);
475         if (ret < 0) {
476             goto exit;
477         }
478         file_offset += VHDX_LOG_SECTOR_SIZE;
479     }
480 
481 exit:
482     qemu_vfree(buffer);
483     return ret;
484 }
485 
486 /* Flush the entire log (as described by 'logs') to the VHDX image
487  * file, and then set the log to 'empty' status once complete.
488  *
489  * The log entries should be validate prior to flushing */
490 static int GRAPH_RDLOCK
491 vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, VHDXLogSequence *logs)
492 {
493     int ret = 0;
494     int i;
495     uint32_t cnt, sectors_read;
496     uint64_t new_file_size;
497     void *data = NULL;
498     int64_t file_length;
499     VHDXLogDescEntries *desc_entries = NULL;
500     VHDXLogEntryHeader hdr_tmp = { 0 };
501 
502     cnt = logs->count;
503 
504     data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
505 
506     ret = vhdx_user_visible_write(bs, s);
507     if (ret < 0) {
508         goto exit;
509     }
510 
511     /* each iteration represents one log sequence, which may span multiple
512      * sectors */
513     while (cnt--) {
514         ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp);
515         if (ret < 0) {
516             goto exit;
517         }
518         file_length = bdrv_getlength(bs->file->bs);
519         if (file_length < 0) {
520             ret = file_length;
521             goto exit;
522         }
523         /* if the log shows a FlushedFileOffset larger than our current file
524          * size, then that means the file has been truncated / corrupted, and
525          * we must refused to open it / use it */
526         if (hdr_tmp.flushed_file_offset > file_length) {
527             ret = -EINVAL;
528             goto exit;
529         }
530 
531         ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
532         if (ret < 0) {
533             goto exit;
534         }
535 
536         for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
537             if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
538                 /* data sector, so read a sector to flush */
539                 ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
540                                             data, 1, false);
541                 if (ret < 0) {
542                     goto exit;
543                 }
544                 if (sectors_read != 1) {
545                     ret = -EINVAL;
546                     goto exit;
547                 }
548                 vhdx_log_data_le_import(data);
549             }
550 
551             ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
552             if (ret < 0) {
553                 goto exit;
554             }
555         }
556         if (file_length < desc_entries->hdr.last_file_offset) {
557             new_file_size = desc_entries->hdr.last_file_offset;
558             if (new_file_size % (1 * MiB)) {
559                 /* round up to nearest 1MB boundary */
560                 new_file_size = QEMU_ALIGN_UP(new_file_size, MiB);
561                 if (new_file_size > INT64_MAX) {
562                     ret = -EINVAL;
563                     goto exit;
564                 }
565                 ret = bdrv_truncate(bs->file, new_file_size, false,
566                                     PREALLOC_MODE_OFF, 0, NULL);
567                 if (ret < 0) {
568                     goto exit;
569                 }
570             }
571         }
572         qemu_vfree(desc_entries);
573         desc_entries = NULL;
574     }
575 
576     ret = bdrv_flush(bs);
577     if (ret < 0) {
578         goto exit;
579     }
580     /* once the log is fully flushed, indicate that we have an empty log
581      * now.  This also sets the log guid to 0, to indicate an empty log */
582     vhdx_log_reset(bs, s);
583 
584 exit:
585     qemu_vfree(data);
586     qemu_vfree(desc_entries);
587     return ret;
588 }
589 
590 static int GRAPH_RDLOCK
591 vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
592                         VHDXLogEntries *log, uint64_t seq,
593                         bool *valid, VHDXLogEntryHeader *entry)
594 {
595     int ret = 0;
596     VHDXLogEntryHeader hdr;
597     void *buffer = NULL;
598     uint32_t i, desc_sectors, total_sectors, crc;
599     uint32_t sectors_read = 0;
600     VHDXLogDescEntries *desc_buffer = NULL;
601 
602     *valid = false;
603 
604     ret = vhdx_log_peek_hdr(bs, log, &hdr);
605     if (ret < 0) {
606         goto inc_and_exit;
607     }
608 
609     if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
610         goto inc_and_exit;
611     }
612 
613     if (seq > 0) {
614         if (hdr.sequence_number != seq + 1) {
615             goto inc_and_exit;
616         }
617     }
618 
619     desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
620 
621     /* Read all log sectors, and calculate log checksum */
622 
623     total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
624 
625 
626     /* read_desc() will increment the read idx */
627     ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
628     if (ret < 0) {
629         goto free_and_exit;
630     }
631 
632     crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer,
633                             desc_sectors * VHDX_LOG_SECTOR_SIZE, 4);
634     crc ^= 0xffffffff;
635 
636     buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
637     if (total_sectors > desc_sectors) {
638         for (i = 0; i < total_sectors - desc_sectors; i++) {
639             sectors_read = 0;
640             ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer,
641                                         1, false);
642             if (ret < 0 || sectors_read != 1) {
643                 goto free_and_exit;
644             }
645             crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1);
646             crc ^= 0xffffffff;
647         }
648     }
649     crc ^= 0xffffffff;
650     if (crc != hdr.checksum) {
651         goto free_and_exit;
652     }
653 
654     *valid = true;
655     *entry = hdr;
656     goto free_and_exit;
657 
658 inc_and_exit:
659     log->read = vhdx_log_inc_idx(log->read, log->length);
660 
661 free_and_exit:
662     qemu_vfree(buffer);
663     qemu_vfree(desc_buffer);
664     return ret;
665 }
666 
667 /* Search through the log circular buffer, and find the valid, active
668  * log sequence, if any exists
669  * */
670 static int GRAPH_RDLOCK
671 vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, VHDXLogSequence *logs)
672 {
673     int ret = 0;
674     uint32_t tail;
675     bool seq_valid = false;
676     VHDXLogSequence candidate = { 0 };
677     VHDXLogEntryHeader hdr = { 0 };
678     VHDXLogEntries curr_log;
679 
680     memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries));
681     curr_log.write = curr_log.length;   /* assume log is full */
682     curr_log.read = 0;
683 
684 
685     /* now we will go through the whole log sector by sector, until
686      * we find a valid, active log sequence, or reach the end of the
687      * log buffer */
688     for (;;) {
689         uint64_t curr_seq = 0;
690         VHDXLogSequence current = { 0 };
691 
692         tail = curr_log.read;
693 
694         ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
695                                       &seq_valid, &hdr);
696         if (ret < 0) {
697             goto exit;
698         }
699 
700         if (seq_valid) {
701             current.valid     = true;
702             current.log       = curr_log;
703             current.log.read  = tail;
704             current.log.write = curr_log.read;
705             current.count     = 1;
706             current.hdr       = hdr;
707 
708 
709             for (;;) {
710                 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
711                                               &seq_valid, &hdr);
712                 if (ret < 0) {
713                     goto exit;
714                 }
715                 if (seq_valid == false) {
716                     break;
717                 }
718                 current.log.write = curr_log.read;
719                 current.count++;
720 
721                 curr_seq = hdr.sequence_number;
722             }
723         }
724 
725         if (current.valid) {
726             if (candidate.valid == false ||
727                 current.hdr.sequence_number > candidate.hdr.sequence_number) {
728                 candidate = current;
729             }
730         }
731 
732         if (curr_log.read < tail) {
733             break;
734         }
735     }
736 
737     *logs = candidate;
738 
739     if (candidate.valid) {
740         /* this is the next sequence number, for writes */
741         s->log.sequence = candidate.hdr.sequence_number + 1;
742     }
743 
744 
745 exit:
746     return ret;
747 }
748 
749 /* Parse the replay log.  Per the VHDX spec, if the log is present
750  * it must be replayed prior to opening the file, even read-only.
751  *
752  * If read-only, we must replay the log in RAM (or refuse to open
753  * a dirty VHDX file read-only) */
754 int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
755                    Error **errp)
756 {
757     int ret = 0;
758     VHDXHeader *hdr;
759     VHDXLogSequence logs = { 0 };
760 
761     hdr = s->headers[s->curr_header];
762 
763     *flushed = false;
764 
765     /* s->log.hdr is freed in vhdx_close() */
766     if (s->log.hdr == NULL) {
767         s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader));
768     }
769 
770     s->log.offset = hdr->log_offset;
771     s->log.length = hdr->log_length;
772 
773     if (s->log.offset < VHDX_LOG_MIN_SIZE ||
774         s->log.offset % VHDX_LOG_MIN_SIZE) {
775         ret = -EINVAL;
776         goto exit;
777     }
778 
779     /* per spec, only log version of 0 is supported */
780     if (hdr->log_version != 0) {
781         ret = -EINVAL;
782         goto exit;
783     }
784 
785     /* If either the log guid, or log length is zero,
786      * then a replay log is not present */
787     if (guid_eq(hdr->log_guid, zero_guid)) {
788         goto exit;
789     }
790 
791     if (hdr->log_length == 0) {
792         goto exit;
793     }
794 
795     if (hdr->log_length % VHDX_LOG_MIN_SIZE) {
796         ret = -EINVAL;
797         goto exit;
798     }
799 
800 
801     /* The log is present, we need to find if and where there is an active
802      * sequence of valid entries present in the log.  */
803 
804     ret = vhdx_log_search(bs, s, &logs);
805     if (ret < 0) {
806         goto exit;
807     }
808 
809     if (logs.valid) {
810         if (bdrv_is_read_only(bs)) {
811             bdrv_refresh_filename(bs);
812             ret = -EPERM;
813             error_setg(errp,
814                        "VHDX image file '%s' opened read-only, but "
815                        "contains a log that needs to be replayed",
816                        bs->filename);
817             error_append_hint(errp,  "To replay the log, run:\n"
818                               "qemu-img check -r all '%s'\n",
819                               bs->filename);
820             goto exit;
821         }
822         /* now flush the log */
823         ret = vhdx_log_flush(bs, s, &logs);
824         if (ret < 0) {
825             goto exit;
826         }
827         *flushed = true;
828     }
829 
830 
831 exit:
832     return ret;
833 }
834 
835 
836 
837 static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc,
838                                       VHDXLogDataSector *sector, void *data,
839                                       uint64_t seq)
840 {
841     /* 8 + 4084 + 4 = 4096, 1 log sector */
842     memcpy(&desc->leading_bytes, data, 8);
843     data += 8;
844     desc->leading_bytes = cpu_to_le64(desc->leading_bytes);
845     memcpy(sector->data, data, 4084);
846     data += 4084;
847     memcpy(&desc->trailing_bytes, data, 4);
848     desc->trailing_bytes = cpu_to_le32(desc->trailing_bytes);
849     data += 4;
850 
851     sector->sequence_high  = (uint32_t) (seq >> 32);
852     sector->sequence_low   = (uint32_t) (seq & 0xffffffff);
853     sector->data_signature = VHDX_LOG_DATA_SIGNATURE;
854 
855     vhdx_log_desc_le_export(desc);
856     vhdx_log_data_le_export(sector);
857 }
858 
859 
860 static int coroutine_fn GRAPH_RDLOCK
861 vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
862                void *data, uint32_t length, uint64_t offset)
863 {
864     int ret = 0;
865     void *buffer = NULL;
866     void *merged_sector = NULL;
867     void *data_tmp, *sector_write;
868     unsigned int i;
869     int sector_offset;
870     uint32_t desc_sectors, sectors, total_length;
871     uint32_t sectors_written = 0;
872     uint32_t aligned_length;
873     uint32_t leading_length = 0;
874     uint32_t trailing_length = 0;
875     uint32_t partial_sectors = 0;
876     uint32_t bytes_written = 0;
877     uint64_t file_offset;
878     int64_t file_length;
879     VHDXHeader *header;
880     VHDXLogEntryHeader new_hdr;
881     VHDXLogDescriptor *new_desc = NULL;
882     VHDXLogDataSector *data_sector = NULL;
883     MSGUID new_guid = { 0 };
884 
885     header = s->headers[s->curr_header];
886 
887     /* need to have offset read data, and be on 4096 byte boundary */
888 
889     if (length > header->log_length) {
890         /* no log present.  we could create a log here instead of failing */
891         ret = -EINVAL;
892         goto exit;
893     }
894 
895     if (guid_eq(header->log_guid, zero_guid)) {
896         vhdx_guid_generate(&new_guid);
897         vhdx_update_headers(bs, s, false, &new_guid);
898     } else {
899         /* currently, we require that the log be flushed after
900          * every write. */
901         ret = -ENOTSUP;
902         goto exit;
903     }
904 
905     /* 0 is an invalid sequence number, but may also represent the first
906      * log write (or a wrapped seq) */
907     if (s->log.sequence == 0) {
908         s->log.sequence = 1;
909     }
910 
911     sector_offset = offset % VHDX_LOG_SECTOR_SIZE;
912     file_offset = QEMU_ALIGN_DOWN(offset, VHDX_LOG_SECTOR_SIZE);
913 
914     aligned_length = length;
915 
916     /* add in the unaligned head and tail bytes */
917     if (sector_offset) {
918         leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset);
919         leading_length = leading_length > length ? length : leading_length;
920         aligned_length -= leading_length;
921         partial_sectors++;
922     }
923 
924     sectors = aligned_length / VHDX_LOG_SECTOR_SIZE;
925     trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE);
926     if (trailing_length) {
927         partial_sectors++;
928     }
929 
930     sectors += partial_sectors;
931 
932     file_length = bdrv_co_getlength(bs->file->bs);
933     if (file_length < 0) {
934         ret = file_length;
935         goto exit;
936     }
937 
938     /* sectors is now how many sectors the data itself takes, not
939      * including the header and descriptor metadata */
940 
941     new_hdr = (VHDXLogEntryHeader) {
942                 .signature           = VHDX_LOG_SIGNATURE,
943                 .tail                = s->log.tail,
944                 .sequence_number     = s->log.sequence,
945                 .descriptor_count    = sectors,
946                 .reserved            = 0,
947                 .flushed_file_offset = file_length,
948                 .last_file_offset    = file_length,
949                 .log_guid            = header->log_guid,
950               };
951 
952 
953     desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count);
954 
955     total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE;
956     new_hdr.entry_length = total_length;
957 
958     vhdx_log_entry_hdr_le_export(&new_hdr);
959 
960     buffer = qemu_blockalign(bs, total_length);
961     memcpy(buffer, &new_hdr, sizeof(new_hdr));
962 
963     new_desc = buffer + sizeof(new_hdr);
964     data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE);
965     data_tmp = data;
966 
967     /* All log sectors are 4KB, so for any partial sectors we must
968      * merge the data with preexisting data from the final file
969      * destination */
970     merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
971 
972     for (i = 0; i < sectors; i++) {
973         new_desc->signature       = VHDX_LOG_DESC_SIGNATURE;
974         new_desc->sequence_number = s->log.sequence;
975         new_desc->file_offset     = file_offset;
976 
977         if (i == 0 && leading_length) {
978             /* partial sector at the front of the buffer */
979             ret = bdrv_co_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
980                                 merged_sector, 0);
981             if (ret < 0) {
982                 goto exit;
983             }
984             memcpy(merged_sector + sector_offset, data_tmp, leading_length);
985             bytes_written = leading_length;
986             sector_write = merged_sector;
987         } else if (i == sectors - 1 && trailing_length) {
988             /* partial sector at the end of the buffer */
989             ret = bdrv_co_pread(bs->file, file_offset + trailing_length,
990                                 VHDX_LOG_SECTOR_SIZE - trailing_length,
991                                 merged_sector + trailing_length, 0);
992             if (ret < 0) {
993                 goto exit;
994             }
995             memcpy(merged_sector, data_tmp, trailing_length);
996             bytes_written = trailing_length;
997             sector_write = merged_sector;
998         } else {
999             bytes_written = VHDX_LOG_SECTOR_SIZE;
1000             sector_write = data_tmp;
1001         }
1002 
1003         /* populate the raw sector data into the proper structures,
1004          * as well as update the descriptor, and convert to proper
1005          * endianness */
1006         vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write,
1007                                   s->log.sequence);
1008 
1009         data_tmp += bytes_written;
1010         data_sector++;
1011         new_desc++;
1012         file_offset += VHDX_LOG_SECTOR_SIZE;
1013     }
1014 
1015     /* checksum covers entire entry, from the log header through the
1016      * last data sector */
1017     vhdx_update_checksum(buffer, total_length,
1018                          offsetof(VHDXLogEntryHeader, checksum));
1019 
1020     /* now write to the log */
1021     ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
1022                                  desc_sectors + sectors);
1023     if (ret < 0) {
1024         goto exit;
1025     }
1026 
1027     if (sectors_written != desc_sectors + sectors) {
1028         /* instead of failing, we could flush the log here */
1029         ret = -EINVAL;
1030         goto exit;
1031     }
1032 
1033     s->log.sequence++;
1034     /* write new tail */
1035     s->log.tail = s->log.write;
1036 
1037 exit:
1038     qemu_vfree(buffer);
1039     qemu_vfree(merged_sector);
1040     return ret;
1041 }
1042 
1043 /* Perform a log write, and then immediately flush the entire log */
1044 int coroutine_fn
1045 vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
1046                          void *data, uint32_t length, uint64_t offset)
1047 {
1048     int ret = 0;
1049     VHDXLogSequence logs = { .valid = true,
1050                              .count = 1,
1051                              .hdr = { 0 } };
1052 
1053 
1054     /* Make sure data written (new and/or changed blocks) is stable
1055      * on disk, before creating log entry */
1056     ret = bdrv_co_flush(bs);
1057     if (ret < 0) {
1058         goto exit;
1059     }
1060 
1061     ret = vhdx_log_write(bs, s, data, length, offset);
1062     if (ret < 0) {
1063         goto exit;
1064     }
1065     logs.log = s->log;
1066 
1067     /* Make sure log is stable on disk */
1068     ret = bdrv_co_flush(bs);
1069     if (ret < 0) {
1070         goto exit;
1071     }
1072 
1073     ret = vhdx_log_flush(bs, s, &logs);
1074     if (ret < 0) {
1075         goto exit;
1076     }
1077 
1078     s->log = logs.log;
1079 
1080 exit:
1081     return ret;
1082 }
1083 
1084