xref: /openbmc/qemu/tests/unit/test-bdrv-drain.c (revision 46627f41)
1 /*
2  * Block node draining tests
3  *
4  * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "block/block.h"
27 #include "block/blockjob_int.h"
28 #include "sysemu/block-backend.h"
29 #include "qapi/error.h"
30 #include "qemu/main-loop.h"
31 #include "iothread.h"
32 
/*
 * Completion signal shared by the iothread tests: set by
 * test_iothread_drain_entry() and test_iothread_aio_cb(), reset and waited
 * on by test_iothread_common().
 */
33 static QemuEvent done_event;
34 
/* Per-node private state of the "test" block driver (reached via bs->opaque). */
35 typedef struct BDRVTestState {
    /* Incremented by bdrv_test_co_drain_begin(), decremented by
     * bdrv_test_co_drain_end(). */
36     int drain_count;
    /* When non-NULL, bdrv_test_co_preadv() schedules a BH in this context
     * that wakes the request coroutine before the read returns. */
37     AioContext *bh_indirection_ctx;
    /* When true, bdrv_test_co_drain_begin() calls qemu_co_sleep_ns(). */
38     bool sleep_in_drain_begin;
39 } BDRVTestState;
40 
41 static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
42 {
43     BDRVTestState *s = bs->opaque;
44     s->drain_count++;
45     if (s->sleep_in_drain_begin) {
46         qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
47     }
48 }
49 
50 static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
51 {
52     BDRVTestState *s = bs->opaque;
53     s->drain_count--;
54 }
55 
56 static void bdrv_test_close(BlockDriverState *bs)
57 {
58     BDRVTestState *s = bs->opaque;
59     g_assert_cmpint(s->drain_count, >, 0);
60 }
61 
62 static void co_reenter_bh(void *opaque)
63 {
64     aio_co_wake(opaque);
65 }
66 
67 static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
68                                             uint64_t offset, uint64_t bytes,
69                                             QEMUIOVector *qiov, int flags)
70 {
71     BDRVTestState *s = bs->opaque;
72 
73     /* We want this request to stay until the polling loop in drain waits for
74      * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
75      * first and polls its result, too, but it shouldn't accidentally complete
76      * this request yet. */
77     qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
78 
79     if (s->bh_indirection_ctx) {
80         aio_bh_schedule_oneshot(s->bh_indirection_ctx, co_reenter_bh,
81                                 qemu_coroutine_self());
82         qemu_coroutine_yield();
83     }
84 
85     return 0;
86 }
87 
88 static int bdrv_test_change_backing_file(BlockDriverState *bs,
89                                          const char *backing_file,
90                                          const char *backing_fmt)
91 {
92     return 0;
93 }
94 
/*
 * Minimal block driver used by the tests in this file. Its callbacks count
 * drain_begin/drain_end invocations in BDRVTestState and provide a read
 * implementation that stays in flight for a short time.
 */
95 static BlockDriver bdrv_test = {
96     .format_name            = "test",
97     .instance_size          = sizeof(BDRVTestState),
98 
99     .bdrv_close             = bdrv_test_close,
100     .bdrv_co_preadv         = bdrv_test_co_preadv,
101 
    /* Drain callbacks maintain BDRVTestState.drain_count */
102     .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
103     .bdrv_co_drain_end      = bdrv_test_co_drain_end,
104 
105     .bdrv_child_perm        = bdrv_default_perms,
106 
    /* No-op that returns 0; see bdrv_test_change_backing_file() */
107     .bdrv_change_backing_file = bdrv_test_change_backing_file,
108 };
109 
/*
 * AIO completion callback: @opaque points to an int that receives the
 * request's return value @ret.
 */
static void aio_ret_cb(void *opaque, int ret)
{
    *(int *)opaque = ret;
}
115 
/* Arguments and completion flag shared between call_in_coroutine() and
 * call_in_coroutine_entry(). */
116 typedef struct CallInCoroutineData {
    /* Function to run inside the coroutine */
117     void (*entry)(void);
    /* Set to true by call_in_coroutine_entry() once entry() has returned */
118     bool done;
119 } CallInCoroutineData;
120 
121 static coroutine_fn void call_in_coroutine_entry(void *opaque)
122 {
123     CallInCoroutineData *data = opaque;
124 
125     data->entry();
126     data->done = true;
127 }
128 
129 static void call_in_coroutine(void (*entry)(void))
130 {
131     Coroutine *co;
132     CallInCoroutineData data = {
133         .entry  = entry,
134         .done   = false,
135     };
136 
137     co = qemu_coroutine_create(call_in_coroutine_entry, &data);
138     qemu_coroutine_enter(co);
139     while (!data.done) {
140         aio_poll(qemu_get_aio_context(), true);
141     }
142 }
143 
/* Selects which drain API do_drain_begin()/do_drain_end() call. */
144 enum drain_type {
    /* bdrv_drain_all_begin() / bdrv_drain_all_end() */
145     BDRV_DRAIN_ALL,
    /* bdrv_drained_begin() / bdrv_drained_end() */
146     BDRV_DRAIN,
    /* bdrv_subtree_drained_begin() / bdrv_subtree_drained_end() */
147     BDRV_SUBTREE_DRAIN,
    /* Number of drain types; used as the loop bound in test_nested() */
148     DRAIN_TYPE_MAX,
149 };
150 
151 static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
152 {
153     switch (drain_type) {
154     case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
155     case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
156     case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_begin(bs); break;
157     default:                    g_assert_not_reached();
158     }
159 }
160 
161 static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
162 {
163     switch (drain_type) {
164     case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
165     case BDRV_DRAIN:            bdrv_drained_end(bs); break;
166     case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_end(bs); break;
167     default:                    g_assert_not_reached();
168     }
169 }
170 
171 static void do_drain_begin_unlocked(enum drain_type drain_type, BlockDriverState *bs)
172 {
173     if (drain_type != BDRV_DRAIN_ALL) {
174         aio_context_acquire(bdrv_get_aio_context(bs));
175     }
176     do_drain_begin(drain_type, bs);
177     if (drain_type != BDRV_DRAIN_ALL) {
178         aio_context_release(bdrv_get_aio_context(bs));
179     }
180 }
181 
182 static void do_drain_end_unlocked(enum drain_type drain_type, BlockDriverState *bs)
183 {
184     if (drain_type != BDRV_DRAIN_ALL) {
185         aio_context_acquire(bdrv_get_aio_context(bs));
186     }
187     do_drain_end(drain_type, bs);
188     if (drain_type != BDRV_DRAIN_ALL) {
189         aio_context_release(bdrv_get_aio_context(bs));
190     }
191 }
192 
/*
 * Checks that the test driver's drain callbacks are invoked for @drain_type,
 * first with no I/O pending and then while a read request is in flight.
 *
 * @recursive: whether the backing node is expected to see the drain
 *             callbacks as well.
 */
193 static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
194 {
195     BlockBackend *blk;
196     BlockDriverState *bs, *backing;
197     BDRVTestState *s, *backing_s;
198     BlockAIOCB *acb;
199     int aio_ret;
200 
    /* Zero-length read vector: only the request's lifetime matters here */
201     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
202 
    /* Build the graph: blk -> bs ("test-node") -> backing */
203     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
204     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
205                               &error_abort);
206     s = bs->opaque;
207     blk_insert_bs(blk, bs, &error_abort);
208 
209     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
210     backing_s = backing->opaque;
211     bdrv_set_backing_hd(bs, backing, &error_abort);
212 
213     /* Simple drain begin/end pair, check that the driver CBs are called */
214     g_assert_cmpint(s->drain_count, ==, 0);
215     g_assert_cmpint(backing_s->drain_count, ==, 0);
216 
217     do_drain_begin(drain_type, bs);
218 
219     g_assert_cmpint(s->drain_count, ==, 1);
220     g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
221 
222     do_drain_end(drain_type, bs);
223 
224     g_assert_cmpint(s->drain_count, ==, 0);
225     g_assert_cmpint(backing_s->drain_count, ==, 0);
226 
227     /* Now do the same while a request is pending */
228     aio_ret = -EINPROGRESS;
229     acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
230     g_assert(acb != NULL);
231     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
232 
233     g_assert_cmpint(s->drain_count, ==, 0);
234     g_assert_cmpint(backing_s->drain_count, ==, 0);
235 
236     do_drain_begin(drain_type, bs);
237 
    /* The request must have completed by the time drain_begin returns */
238     g_assert_cmpint(aio_ret, ==, 0);
239     g_assert_cmpint(s->drain_count, ==, 1);
240     g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
241 
242     do_drain_end(drain_type, bs);
243 
244     g_assert_cmpint(s->drain_count, ==, 0);
245     g_assert_cmpint(backing_s->drain_count, ==, 0);
246 
247     bdrv_unref(backing);
248     bdrv_unref(bs);
249     blk_unref(blk);
250 }
251 
252 static void test_drv_cb_drain_all(void)
253 {
254     test_drv_cb_common(BDRV_DRAIN_ALL, true);
255 }
256 
257 static void test_drv_cb_drain(void)
258 {
259     test_drv_cb_common(BDRV_DRAIN, false);
260 }
261 
262 static void test_drv_cb_drain_subtree(void)
263 {
264     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
265 }
266 
267 static void test_drv_cb_co_drain_all(void)
268 {
269     call_in_coroutine(test_drv_cb_drain_all);
270 }
271 
272 static void test_drv_cb_co_drain(void)
273 {
274     call_in_coroutine(test_drv_cb_drain);
275 }
276 
277 static void test_drv_cb_co_drain_subtree(void)
278 {
279     call_in_coroutine(test_drv_cb_drain_subtree);
280 }
281 
/*
 * Checks that a drain begin/end pair of @drain_type raises and then restores
 * quiesce_counter on the drained node.
 *
 * @recursive: whether the backing node's quiesce_counter is expected to be
 *             raised as well.
 */
282 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
283 {
284     BlockBackend *blk;
285     BlockDriverState *bs, *backing;
286 
    /* Build the graph: blk -> bs ("test-node") -> backing */
287     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
288     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
289                               &error_abort);
290     blk_insert_bs(blk, bs, &error_abort);
291 
292     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
293     bdrv_set_backing_hd(bs, backing, &error_abort);
294 
295     g_assert_cmpint(bs->quiesce_counter, ==, 0);
296     g_assert_cmpint(backing->quiesce_counter, ==, 0);
297 
298     do_drain_begin(drain_type, bs);
299 
300     g_assert_cmpint(bs->quiesce_counter, ==, 1);
301     g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
302 
303     do_drain_end(drain_type, bs);
304 
    /* Both counters must be back at zero once the section has ended */
305     g_assert_cmpint(bs->quiesce_counter, ==, 0);
306     g_assert_cmpint(backing->quiesce_counter, ==, 0);
307 
308     bdrv_unref(backing);
309     bdrv_unref(bs);
310     blk_unref(blk);
311 }
312 
313 static void test_quiesce_drain_all(void)
314 {
315     test_quiesce_common(BDRV_DRAIN_ALL, true);
316 }
317 
318 static void test_quiesce_drain(void)
319 {
320     test_quiesce_common(BDRV_DRAIN, false);
321 }
322 
323 static void test_quiesce_drain_subtree(void)
324 {
325     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
326 }
327 
328 static void test_quiesce_co_drain_all(void)
329 {
330     call_in_coroutine(test_quiesce_drain_all);
331 }
332 
333 static void test_quiesce_co_drain(void)
334 {
335     call_in_coroutine(test_quiesce_drain);
336 }
337 
338 static void test_quiesce_co_drain_subtree(void)
339 {
340     call_in_coroutine(test_quiesce_drain_subtree);
341 }
342 
/*
 * Nests every combination of (outer, inner) drain types on the same node and
 * checks quiesce_counter and the driver's drain_count on both the node and
 * its backing node while nested and after both sections have ended.
 */
343 static void test_nested(void)
344 {
345     BlockBackend *blk;
346     BlockDriverState *bs, *backing;
347     BDRVTestState *s, *backing_s;
348     enum drain_type outer, inner;
349 
    /* Build the graph: blk -> bs ("test-node") -> backing */
350     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
351     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
352                               &error_abort);
353     s = bs->opaque;
354     blk_insert_bs(blk, bs, &error_abort);
355 
356     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
357     backing_s = backing->opaque;
358     bdrv_set_backing_hd(bs, backing, &error_abort);
359 
360     for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
361         for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
            /* Expected drains on the backing node: one for each of the two
             * sections whose type is anything other than plain BDRV_DRAIN */
362             int backing_quiesce = (outer != BDRV_DRAIN) +
363                                   (inner != BDRV_DRAIN);
364 
365             g_assert_cmpint(bs->quiesce_counter, ==, 0);
366             g_assert_cmpint(backing->quiesce_counter, ==, 0);
367             g_assert_cmpint(s->drain_count, ==, 0);
368             g_assert_cmpint(backing_s->drain_count, ==, 0);
369 
370             do_drain_begin(outer, bs);
371             do_drain_begin(inner, bs);
372 
            /* bs itself is drained by both sections, whatever their type */
373             g_assert_cmpint(bs->quiesce_counter, ==, 2);
374             g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
375             g_assert_cmpint(s->drain_count, ==, 2);
376             g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
377 
            /* End the sections in reverse order of their begin calls */
378             do_drain_end(inner, bs);
379             do_drain_end(outer, bs);
380 
381             g_assert_cmpint(bs->quiesce_counter, ==, 0);
382             g_assert_cmpint(backing->quiesce_counter, ==, 0);
383             g_assert_cmpint(s->drain_count, ==, 0);
384             g_assert_cmpint(backing_s->drain_count, ==, 0);
385         }
386     }
387 
388     bdrv_unref(backing);
389     bdrv_unref(bs);
390     blk_unref(blk);
391 }
392 
/*
 * Two nodes (A and B) share one backing node. Subtree drains started on
 * either parent are expected to be counted on all three nodes, and the
 * counters are expected to return to zero once both sections have ended.
 */
393 static void test_multiparent(void)
394 {
395     BlockBackend *blk_a, *blk_b;
396     BlockDriverState *bs_a, *bs_b, *backing;
397     BDRVTestState *a_s, *b_s, *backing_s;
398 
399     blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
400     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
401                                 &error_abort);
402     a_s = bs_a->opaque;
403     blk_insert_bs(blk_a, bs_a, &error_abort);
404 
405     blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
406     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
407                                 &error_abort);
408     b_s = bs_b->opaque;
409     blk_insert_bs(blk_b, bs_b, &error_abort);
410 
    /* The same backing node is attached below both A and B */
411     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
412     backing_s = backing->opaque;
413     bdrv_set_backing_hd(bs_a, backing, &error_abort);
414     bdrv_set_backing_hd(bs_b, backing, &error_abort);
415 
416     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
417     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
418     g_assert_cmpint(backing->quiesce_counter, ==, 0);
419     g_assert_cmpint(a_s->drain_count, ==, 0);
420     g_assert_cmpint(b_s->drain_count, ==, 0);
421     g_assert_cmpint(backing_s->drain_count, ==, 0);
422 
423     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
424 
    /* Draining A's subtree is expected to show up on B as well */
425     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
426     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
427     g_assert_cmpint(backing->quiesce_counter, ==, 1);
428     g_assert_cmpint(a_s->drain_count, ==, 1);
429     g_assert_cmpint(b_s->drain_count, ==, 1);
430     g_assert_cmpint(backing_s->drain_count, ==, 1);
431 
432     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
433 
434     g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
435     g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
436     g_assert_cmpint(backing->quiesce_counter, ==, 2);
437     g_assert_cmpint(a_s->drain_count, ==, 2);
438     g_assert_cmpint(b_s->drain_count, ==, 2);
439     g_assert_cmpint(backing_s->drain_count, ==, 2);
440 
441     do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
442 
443     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
444     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
445     g_assert_cmpint(backing->quiesce_counter, ==, 1);
446     g_assert_cmpint(a_s->drain_count, ==, 1);
447     g_assert_cmpint(b_s->drain_count, ==, 1);
448     g_assert_cmpint(backing_s->drain_count, ==, 1);
449 
450     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
451 
452     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
453     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
454     g_assert_cmpint(backing->quiesce_counter, ==, 0);
455     g_assert_cmpint(a_s->drain_count, ==, 0);
456     g_assert_cmpint(b_s->drain_count, ==, 0);
457     g_assert_cmpint(backing_s->drain_count, ==, 0);
458 
459     bdrv_unref(backing);
460     bdrv_unref(bs_a);
461     bdrv_unref(bs_b);
462     blk_unref(blk_a);
463     blk_unref(blk_b);
464 }
465 
/*
 * Attaches and detaches a shared backing node while subtree drains are
 * active on two separate parents, and checks the counters each node is
 * expected to have after every graph change.
 */
466 static void test_graph_change_drain_subtree(void)
467 {
468     BlockBackend *blk_a, *blk_b;
469     BlockDriverState *bs_a, *bs_b, *backing;
470     BDRVTestState *a_s, *b_s, *backing_s;
471 
472     blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
473     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
474                                 &error_abort);
475     a_s = bs_a->opaque;
476     blk_insert_bs(blk_a, bs_a, &error_abort);
477 
478     blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
479     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
480                                 &error_abort);
481     b_s = bs_b->opaque;
482     blk_insert_bs(blk_b, bs_b, &error_abort);
483 
    /* Initially only A has the backing node; B is a standalone node */
484     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
485     backing_s = backing->opaque;
486     bdrv_set_backing_hd(bs_a, backing, &error_abort);
487 
488     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
489     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
490     g_assert_cmpint(backing->quiesce_counter, ==, 0);
491     g_assert_cmpint(a_s->drain_count, ==, 0);
492     g_assert_cmpint(b_s->drain_count, ==, 0);
493     g_assert_cmpint(backing_s->drain_count, ==, 0);
494 
    /* Three subtree drains on A and two on B, while they are unconnected */
495     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
496     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
497     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
498     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
499     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
500 
    /* Connecting B to the shared backing node: all nodes are expected to
     * carry the sum of both parents' drains (3 + 2) */
501     bdrv_set_backing_hd(bs_b, backing, &error_abort);
502     g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
503     g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
504     g_assert_cmpint(backing->quiesce_counter, ==, 5);
505     g_assert_cmpint(a_s->drain_count, ==, 5);
506     g_assert_cmpint(b_s->drain_count, ==, 5);
507     g_assert_cmpint(backing_s->drain_count, ==, 5);
508 
    /* Detaching again: A and backing are expected to drop back to A's three
     * drains, B to its own two */
509     bdrv_set_backing_hd(bs_b, NULL, &error_abort);
510     g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
511     g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
512     g_assert_cmpint(backing->quiesce_counter, ==, 3);
513     g_assert_cmpint(a_s->drain_count, ==, 3);
514     g_assert_cmpint(b_s->drain_count, ==, 2);
515     g_assert_cmpint(backing_s->drain_count, ==, 3);
516 
    /* Re-attach so that the drains are ended on the connected graph */
517     bdrv_set_backing_hd(bs_b, backing, &error_abort);
518     g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
519     g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
520     g_assert_cmpint(backing->quiesce_counter, ==, 5);
521     g_assert_cmpint(a_s->drain_count, ==, 5);
522     g_assert_cmpint(b_s->drain_count, ==, 5);
523     g_assert_cmpint(backing_s->drain_count, ==, 5);
524 
525     do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
526     do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
527     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
528     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
529     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
530 
531     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
532     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
533     g_assert_cmpint(backing->quiesce_counter, ==, 0);
534     g_assert_cmpint(a_s->drain_count, ==, 0);
535     g_assert_cmpint(b_s->drain_count, ==, 0);
536     g_assert_cmpint(backing_s->drain_count, ==, 0);
537 
538     bdrv_unref(backing);
539     bdrv_unref(bs_a);
540     bdrv_unref(bs_b);
541     blk_unref(blk_a);
542     blk_unref(blk_b);
543 }
544 
/*
 * Creates one node and deletes another inside a bdrv_drain_all_begin()/end()
 * section, checking that a node created during the section starts out
 * drained and that all counters are back at zero after the section ends.
 */
545 static void test_graph_change_drain_all(void)
546 {
547     BlockBackend *blk_a, *blk_b;
548     BlockDriverState *bs_a, *bs_b;
549     BDRVTestState *a_s, *b_s;
550 
551     /* Create node A with a BlockBackend */
552     blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
553     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
554                                 &error_abort);
555     a_s = bs_a->opaque;
556     blk_insert_bs(blk_a, bs_a, &error_abort);
557 
558     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
559     g_assert_cmpint(a_s->drain_count, ==, 0);
560 
561     /* Call bdrv_drain_all_begin() */
562     bdrv_drain_all_begin();
563 
564     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
565     g_assert_cmpint(a_s->drain_count, ==, 1);
566 
567     /* Create node B with a BlockBackend */
568     blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
569     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
570                                 &error_abort);
571     b_s = bs_b->opaque;
572     blk_insert_bs(blk_b, bs_b, &error_abort);
573 
    /* B was created inside the drained section and must already be drained */
574     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
575     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
576     g_assert_cmpint(a_s->drain_count, ==, 1);
577     g_assert_cmpint(b_s->drain_count, ==, 1);
578 
579     /* Unref and finally delete node A */
580     blk_unref(blk_a);
581 
    /* bs_a is still dereferenced here; the bdrv_unref() below drops the
     * reference this test holds on it */
582     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
583     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
584     g_assert_cmpint(a_s->drain_count, ==, 1);
585     g_assert_cmpint(b_s->drain_count, ==, 1);
586 
587     bdrv_unref(bs_a);
588 
589     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
590     g_assert_cmpint(b_s->drain_count, ==, 1);
591 
592     /* End the drained section */
593     bdrv_drain_all_end();
594 
595     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
596     g_assert_cmpint(b_s->drain_count, ==, 0);
597     g_assert_cmpint(qemu_get_aio_context()->external_disable_cnt, ==, 0);
598 
599     bdrv_unref(bs_b);
600     blk_unref(blk_b);
601 }
602 
/* Arguments handed to the BHs scheduled by test_iothread_common(). */
603 struct test_iothread_data {
    /* Node to drain (or to flush, in test_iothread_main_thread_bh()) */
604     BlockDriverState *bs;
    /* Drain API to use in test_iothread_drain_entry() */
605     enum drain_type drain_type;
    /* Points to the pending request's result, written by its AIO callback */
606     int *aio_ret;
607 };
608 
609 static void test_iothread_drain_entry(void *opaque)
610 {
611     struct test_iothread_data *data = opaque;
612 
613     aio_context_acquire(bdrv_get_aio_context(data->bs));
614     do_drain_begin(data->drain_type, data->bs);
615     g_assert_cmpint(*data->aio_ret, ==, 0);
616     do_drain_end(data->drain_type, data->bs);
617     aio_context_release(bdrv_get_aio_context(data->bs));
618 
619     qemu_event_set(&done_event);
620 }
621 
622 static void test_iothread_aio_cb(void *opaque, int ret)
623 {
624     int *aio_ret = opaque;
625     *aio_ret = ret;
626     qemu_event_set(&done_event);
627 }
628 
629 static void test_iothread_main_thread_bh(void *opaque)
630 {
631     struct test_iothread_data *data = opaque;
632 
633     /* Test that the AioContext is not yet locked in a random BH that is
634      * executed during drain, otherwise this would deadlock. */
635     aio_context_acquire(bdrv_get_aio_context(data->bs));
636     bdrv_flush(data->bs);
637     aio_context_release(bdrv_get_aio_context(data->bs));
638 }
639 
640 /*
641  * Starts an AIO request on a BDS that runs in the AioContext of iothread 1.
642  * The request involves a BH on iothread 2 before it can complete.
643  *
644  * @drain_thread = 0 means that do_drain_begin/end are called from the main
645  * thread, @drain_thread = 1 means that they are called from iothread 1. Drain
646  * for this BDS cannot be called from iothread 2 because only the main thread
647  * may do cross-AioContext polling.
648  */
649 static void test_iothread_common(enum drain_type drain_type, int drain_thread)
650 {
651     BlockBackend *blk;
652     BlockDriverState *bs;
653     BDRVTestState *s;
654     BlockAIOCB *acb;
655     int aio_ret;
656     struct test_iothread_data data;
657 
658     IOThread *a = iothread_new();
659     IOThread *b = iothread_new();
660     AioContext *ctx_a = iothread_get_aio_context(a);
661     AioContext *ctx_b = iothread_get_aio_context(b);
662 
    /* Zero-length read vector: only the request's lifetime matters here */
663     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
664 
665     /* bdrv_drain_all() may only be called from the main loop thread */
666     if (drain_type == BDRV_DRAIN_ALL && drain_thread != 0) {
667         goto out;
668     }
669 
670     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
671     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
672                               &error_abort);
673     s = bs->opaque;
674     blk_insert_bs(blk, bs, &error_abort);
675     blk_set_disable_request_queuing(blk, true);
676 
    /* Move the BlockBackend (and its node) into iothread a's context */
677     blk_set_aio_context(blk, ctx_a, &error_abort);
678     aio_context_acquire(ctx_a);
679 
    /* Make bdrv_test_co_preadv() bounce through a BH in iothread b */
680     s->bh_indirection_ctx = ctx_b;
681 
682     aio_ret = -EINPROGRESS;
683     qemu_event_reset(&done_event);
684 
    /* When the main thread drains, the AIO callback itself signals
     * done_event; otherwise test_iothread_drain_entry() does */
685     if (drain_thread == 0) {
686         acb = blk_aio_preadv(blk, 0, &qiov, 0, test_iothread_aio_cb, &aio_ret);
687     } else {
688         acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
689     }
690     g_assert(acb != NULL);
691     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
692 
693     aio_context_release(ctx_a);
694 
695     data = (struct test_iothread_data) {
696         .bs         = bs,
697         .drain_type = drain_type,
698         .aio_ret    = &aio_ret,
699     };
700 
701     switch (drain_thread) {
702     case 0:
        /* Drain from the main thread; ctx_a is taken except for
         * BDRV_DRAIN_ALL */
703         if (drain_type != BDRV_DRAIN_ALL) {
704             aio_context_acquire(ctx_a);
705         }
706 
707         aio_bh_schedule_oneshot(ctx_a, test_iothread_main_thread_bh, &data);
708 
709         /* The request is running on the IOThread a. Draining its block device
710          * will make sure that it has completed as far as the BDS is concerned,
711          * but the drain in this thread can continue immediately after
712          * bdrv_dec_in_flight() and aio_ret might be assigned only slightly
713          * later. */
714         do_drain_begin(drain_type, bs);
715         g_assert_cmpint(bs->in_flight, ==, 0);
716 
        /* Drop ctx_a while waiting for the AIO callback to set done_event */
717         if (drain_type != BDRV_DRAIN_ALL) {
718             aio_context_release(ctx_a);
719         }
720         qemu_event_wait(&done_event);
721         if (drain_type != BDRV_DRAIN_ALL) {
722             aio_context_acquire(ctx_a);
723         }
724 
725         g_assert_cmpint(aio_ret, ==, 0);
726         do_drain_end(drain_type, bs);
727 
728         if (drain_type != BDRV_DRAIN_ALL) {
729             aio_context_release(ctx_a);
730         }
731         break;
732     case 1:
        /* Drain from iothread a via a BH and wait for it to finish */
733         aio_bh_schedule_oneshot(ctx_a, test_iothread_drain_entry, &data);
734         qemu_event_wait(&done_event);
735         break;
736     default:
737         g_assert_not_reached();
738     }
739 
    /* Move the BlockBackend back to the main context before tearing down */
740     aio_context_acquire(ctx_a);
741     blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
742     aio_context_release(ctx_a);
743 
744     bdrv_unref(bs);
745     blk_unref(blk);
746 
747 out:
    /* Both iothreads are joined on every path, including the early exit */
748     iothread_join(a);
749     iothread_join(b);
750 }
751 
752 static void test_iothread_drain_all(void)
753 {
754     test_iothread_common(BDRV_DRAIN_ALL, 0);
755     test_iothread_common(BDRV_DRAIN_ALL, 1);
756 }
757 
758 static void test_iothread_drain(void)
759 {
760     test_iothread_common(BDRV_DRAIN, 0);
761     test_iothread_common(BDRV_DRAIN, 1);
762 }
763 
764 static void test_iothread_drain_subtree(void)
765 {
766     test_iothread_common(BDRV_SUBTREE_DRAIN, 0);
767     test_iothread_common(BDRV_SUBTREE_DRAIN, 1);
768 }
769 
770 
/* Block job used by the blockjob drain tests; the callbacks recover it from
 * a Job pointer with container_of(job, TestBlockJob, common.job). */
771 typedef struct TestBlockJob {
772     BlockJob common;
    /* Value returned by test_job_run() */
773     int run_ret;
    /* Value returned by test_job_prepare() */
774     int prepare_ret;
    /* Set to true by test_job_run() when it starts executing */
775     bool running;
    /* Set by test_job_complete(); ends the loop in test_job_run() */
776     bool should_complete;
777 } TestBlockJob;
777 } TestBlockJob;
778 
779 static int test_job_prepare(Job *job)
780 {
781     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
782 
783     /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */
784     blk_flush(s->common.blk);
785     return s->prepare_ret;
786 }
787 
788 static void test_job_commit(Job *job)
789 {
790     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
791 
792     /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */
793     blk_flush(s->common.blk);
794 }
795 
796 static void test_job_abort(Job *job)
797 {
798     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
799 
800     /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */
801     blk_flush(s->common.blk);
802 }
803 
804 static int coroutine_fn test_job_run(Job *job, Error **errp)
805 {
806     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
807 
808     /* We are running the actual job code past the pause point in
809      * job_co_entry(). */
810     s->running = true;
811 
812     job_transition_to_ready(&s->common.job);
813     while (!s->should_complete) {
814         /* Avoid job_sleep_ns() because it marks the job as !busy. We want to
815          * emulate some actual activity (probably some I/O) here so that drain
816          * has to wait for this activity to stop. */
817         qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
818 
819         job_pause_point(&s->common.job);
820     }
821 
822     return s->run_ret;
823 }
824 
825 static void test_job_complete(Job *job, Error **errp)
826 {
827     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
828     s->should_complete = true;
829 }
830 
/* Job driver for TestBlockJob: wires the test_job_*() callbacks defined above
 * to the generic block_job_free()/block_job_user_resume() helpers. */
831 BlockJobDriver test_job_driver = {
832     .job_driver = {
833         .instance_size  = sizeof(TestBlockJob),
834         .free           = block_job_free,
835         .user_resume    = block_job_user_resume,
836         .run            = test_job_run,
837         .complete       = test_job_complete,
        /* prepare/commit/abort each call blk_flush() on the job's blk */
838         .prepare        = test_job_prepare,
839         .commit         = test_job_commit,
840         .abort          = test_job_abort,
841     },
842 };
843 
/* Outcome that test_blockjob_common_drain_node() configures on the job. */
844 enum test_job_result {
    /* Leave run_ret and prepare_ret untouched */
845     TEST_JOB_SUCCESS,
    /* Set run_ret to -EIO */
846     TEST_JOB_FAIL_RUN,
    /* Set prepare_ret to -EIO */
847     TEST_JOB_FAIL_PREPARE,
848 };
849 
/* Which node of the source chain test_blockjob_common_drain_node() drains. */
850 enum test_job_drain_node {
    /* The job's source node ("source") */
851     TEST_JOB_DRAIN_SRC,
    /* The source's backing node ("source-backing") */
852     TEST_JOB_DRAIN_SRC_CHILD,
    /* The node on top of the source ("source-overlay") */
853     TEST_JOB_DRAIN_SRC_PARENT,
854 };
855 
856 static void test_blockjob_common_drain_node(enum drain_type drain_type,
857                                             bool use_iothread,
858                                             enum test_job_result result,
859                                             enum test_job_drain_node drain_node)
860 {
861     BlockBackend *blk_src, *blk_target;
862     BlockDriverState *src, *src_backing, *src_overlay, *target, *drain_bs;
863     BlockJob *job;
864     TestBlockJob *tjob;
865     IOThread *iothread = NULL;
866     AioContext *ctx;
867     int ret;
868 
869     src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
870                                &error_abort);
871     src_backing = bdrv_new_open_driver(&bdrv_test, "source-backing",
872                                        BDRV_O_RDWR, &error_abort);
873     src_overlay = bdrv_new_open_driver(&bdrv_test, "source-overlay",
874                                        BDRV_O_RDWR, &error_abort);
875 
876     bdrv_set_backing_hd(src_overlay, src, &error_abort);
877     bdrv_unref(src);
878     bdrv_set_backing_hd(src, src_backing, &error_abort);
879     bdrv_unref(src_backing);
880 
881     blk_src = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
882     blk_insert_bs(blk_src, src_overlay, &error_abort);
883 
884     switch (drain_node) {
885     case TEST_JOB_DRAIN_SRC:
886         drain_bs = src;
887         break;
888     case TEST_JOB_DRAIN_SRC_CHILD:
889         drain_bs = src_backing;
890         break;
891     case TEST_JOB_DRAIN_SRC_PARENT:
892         drain_bs = src_overlay;
893         break;
894     default:
895         g_assert_not_reached();
896     }
897 
898     if (use_iothread) {
899         iothread = iothread_new();
900         ctx = iothread_get_aio_context(iothread);
901         blk_set_aio_context(blk_src, ctx, &error_abort);
902     } else {
903         ctx = qemu_get_aio_context();
904     }
905 
906     target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
907                                   &error_abort);
908     blk_target = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
909     blk_insert_bs(blk_target, target, &error_abort);
910     blk_set_allow_aio_context_change(blk_target, true);
911 
912     aio_context_acquire(ctx);
913     tjob = block_job_create("job0", &test_job_driver, NULL, src,
914                             0, BLK_PERM_ALL,
915                             0, 0, NULL, NULL, &error_abort);
916     job = &tjob->common;
917     block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
918 
919     switch (result) {
920     case TEST_JOB_SUCCESS:
921         break;
922     case TEST_JOB_FAIL_RUN:
923         tjob->run_ret = -EIO;
924         break;
925     case TEST_JOB_FAIL_PREPARE:
926         tjob->prepare_ret = -EIO;
927         break;
928     }
929 
930     job_start(&job->job);
931     aio_context_release(ctx);
932 
933     if (use_iothread) {
934         /* job_co_entry() is run in the I/O thread, wait for the actual job
935          * code to start (we don't want to catch the job in the pause point in
936          * job_co_entry(). */
937         while (!tjob->running) {
938             aio_poll(qemu_get_aio_context(), false);
939         }
940     }
941 
942     g_assert_cmpint(job->job.pause_count, ==, 0);
943     g_assert_false(job->job.paused);
944     g_assert_true(tjob->running);
945     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
946 
947     do_drain_begin_unlocked(drain_type, drain_bs);
948 
949     if (drain_type == BDRV_DRAIN_ALL) {
950         /* bdrv_drain_all() drains both src and target */
951         g_assert_cmpint(job->job.pause_count, ==, 2);
952     } else {
953         g_assert_cmpint(job->job.pause_count, ==, 1);
954     }
955     g_assert_true(job->job.paused);
956     g_assert_false(job->job.busy); /* The job is paused */
957 
958     do_drain_end_unlocked(drain_type, drain_bs);
959 
960     if (use_iothread) {
961         /* paused is reset in the I/O thread, wait for it */
962         while (job->job.paused) {
963             aio_poll(qemu_get_aio_context(), false);
964         }
965     }
966 
967     g_assert_cmpint(job->job.pause_count, ==, 0);
968     g_assert_false(job->job.paused);
969     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
970 
971     do_drain_begin_unlocked(drain_type, target);
972 
973     if (drain_type == BDRV_DRAIN_ALL) {
974         /* bdrv_drain_all() drains both src and target */
975         g_assert_cmpint(job->job.pause_count, ==, 2);
976     } else {
977         g_assert_cmpint(job->job.pause_count, ==, 1);
978     }
979     g_assert_true(job->job.paused);
980     g_assert_false(job->job.busy); /* The job is paused */
981 
982     do_drain_end_unlocked(drain_type, target);
983 
984     if (use_iothread) {
985         /* paused is reset in the I/O thread, wait for it */
986         while (job->job.paused) {
987             aio_poll(qemu_get_aio_context(), false);
988         }
989     }
990 
991     g_assert_cmpint(job->job.pause_count, ==, 0);
992     g_assert_false(job->job.paused);
993     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
994 
995     aio_context_acquire(ctx);
996     ret = job_complete_sync(&job->job, &error_abort);
997     g_assert_cmpint(ret, ==, (result == TEST_JOB_SUCCESS ? 0 : -EIO));
998 
999     if (use_iothread) {
1000         blk_set_aio_context(blk_src, qemu_get_aio_context(), &error_abort);
1001         assert(blk_get_aio_context(blk_target) == qemu_get_aio_context());
1002     }
1003     aio_context_release(ctx);
1004 
1005     blk_unref(blk_src);
1006     blk_unref(blk_target);
1007     bdrv_unref(src_overlay);
1008     bdrv_unref(target);
1009 
1010     if (iothread) {
1011         iothread_join(iothread);
1012     }
1013 }
1014 
1015 static void test_blockjob_common(enum drain_type drain_type, bool use_iothread,
1016                                  enum test_job_result result)
1017 {
1018     test_blockjob_common_drain_node(drain_type, use_iothread, result,
1019                                     TEST_JOB_DRAIN_SRC);
1020     test_blockjob_common_drain_node(drain_type, use_iothread, result,
1021                                     TEST_JOB_DRAIN_SRC_CHILD);
1022     if (drain_type == BDRV_SUBTREE_DRAIN) {
1023         test_blockjob_common_drain_node(drain_type, use_iothread, result,
1024                                         TEST_JOB_DRAIN_SRC_PARENT);
1025     }
1026 }
1027 
1028 static void test_blockjob_drain_all(void)
1029 {
1030     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_SUCCESS);
1031 }
1032 
1033 static void test_blockjob_drain(void)
1034 {
1035     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_SUCCESS);
1036 }
1037 
1038 static void test_blockjob_drain_subtree(void)
1039 {
1040     test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_SUCCESS);
1041 }
1042 
1043 static void test_blockjob_error_drain_all(void)
1044 {
1045     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_RUN);
1046     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_PREPARE);
1047 }
1048 
1049 static void test_blockjob_error_drain(void)
1050 {
1051     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_RUN);
1052     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_PREPARE);
1053 }
1054 
1055 static void test_blockjob_error_drain_subtree(void)
1056 {
1057     test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_RUN);
1058     test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_PREPARE);
1059 }
1060 
1061 static void test_blockjob_iothread_drain_all(void)
1062 {
1063     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_SUCCESS);
1064 }
1065 
1066 static void test_blockjob_iothread_drain(void)
1067 {
1068     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_SUCCESS);
1069 }
1070 
1071 static void test_blockjob_iothread_drain_subtree(void)
1072 {
1073     test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_SUCCESS);
1074 }
1075 
1076 static void test_blockjob_iothread_error_drain_all(void)
1077 {
1078     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_RUN);
1079     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_PREPARE);
1080 }
1081 
1082 static void test_blockjob_iothread_error_drain(void)
1083 {
1084     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_RUN);
1085     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_PREPARE);
1086 }
1087 
1088 static void test_blockjob_iothread_error_drain_subtree(void)
1089 {
1090     test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_RUN);
1091     test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_PREPARE);
1092 }
1093 
1094 
1095 typedef struct BDRVTestTopState {
1096     BdrvChild *wait_child;
1097 } BDRVTestTopState;
1098 
1099 static void bdrv_test_top_close(BlockDriverState *bs)
1100 {
1101     BdrvChild *c, *next_c;
1102     QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
1103         bdrv_unref_child(bs, c);
1104     }
1105 }
1106 
1107 static int coroutine_fn bdrv_test_top_co_preadv(BlockDriverState *bs,
1108                                                 uint64_t offset, uint64_t bytes,
1109                                                 QEMUIOVector *qiov, int flags)
1110 {
1111     BDRVTestTopState *tts = bs->opaque;
1112     return bdrv_co_preadv(tts->wait_child, offset, bytes, qiov, flags);
1113 }
1114 
1115 static BlockDriver bdrv_test_top_driver = {
1116     .format_name            = "test_top_driver",
1117     .instance_size          = sizeof(BDRVTestTopState),
1118 
1119     .bdrv_close             = bdrv_test_top_close,
1120     .bdrv_co_preadv         = bdrv_test_top_co_preadv,
1121 
1122     .bdrv_child_perm        = bdrv_default_perms,
1123 };
1124 
1125 typedef struct TestCoDeleteByDrainData {
1126     BlockBackend *blk;
1127     bool detach_instead_of_delete;
1128     bool done;
1129 } TestCoDeleteByDrainData;
1130 
1131 static void coroutine_fn test_co_delete_by_drain(void *opaque)
1132 {
1133     TestCoDeleteByDrainData *dbdd = opaque;
1134     BlockBackend *blk = dbdd->blk;
1135     BlockDriverState *bs = blk_bs(blk);
1136     BDRVTestTopState *tts = bs->opaque;
1137     void *buffer = g_malloc(65536);
1138     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buffer, 65536);
1139 
1140     /* Pretend some internal write operation from parent to child.
1141      * Important: We have to read from the child, not from the parent!
1142      * Draining works by first propagating it all up the tree to the
1143      * root and then waiting for drainage from root to the leaves
1144      * (protocol nodes).  If we have a request waiting on the root,
1145      * everything will be drained before we go back down the tree, but
1146      * we do not want that.  We want to be in the middle of draining
1147      * when this following requests returns. */
1148     bdrv_co_preadv(tts->wait_child, 0, 65536, &qiov, 0);
1149 
1150     g_assert_cmpint(bs->refcnt, ==, 1);
1151 
1152     if (!dbdd->detach_instead_of_delete) {
1153         blk_unref(blk);
1154     } else {
1155         BdrvChild *c, *next_c;
1156         QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
1157             bdrv_unref_child(bs, c);
1158         }
1159     }
1160 
1161     dbdd->done = true;
1162     g_free(buffer);
1163 }
1164 
1165 /**
1166  * Test what happens when some BDS has some children, you drain one of
1167  * them and this results in the BDS being deleted.
1168  *
1169  * If @detach_instead_of_delete is set, the BDS is not going to be
1170  * deleted but will only detach all of its children.
1171  */
1172 static void do_test_delete_by_drain(bool detach_instead_of_delete,
1173                                     enum drain_type drain_type)
1174 {
1175     BlockBackend *blk;
1176     BlockDriverState *bs, *child_bs, *null_bs;
1177     BDRVTestTopState *tts;
1178     TestCoDeleteByDrainData dbdd;
1179     Coroutine *co;
1180 
1181     bs = bdrv_new_open_driver(&bdrv_test_top_driver, "top", BDRV_O_RDWR,
1182                               &error_abort);
1183     bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
1184     tts = bs->opaque;
1185 
1186     null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
1187                         &error_abort);
1188     bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds,
1189                       BDRV_CHILD_DATA, &error_abort);
1190 
1191     /* This child will be the one to pass to requests through to, and
1192      * it will stall until a drain occurs */
1193     child_bs = bdrv_new_open_driver(&bdrv_test, "child", BDRV_O_RDWR,
1194                                     &error_abort);
1195     child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
1196     /* Takes our reference to child_bs */
1197     tts->wait_child = bdrv_attach_child(bs, child_bs, "wait-child",
1198                                         &child_of_bds,
1199                                         BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY,
1200                                         &error_abort);
1201 
1202     /* This child is just there to be deleted
1203      * (for detach_instead_of_delete == true) */
1204     null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
1205                         &error_abort);
1206     bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds, BDRV_CHILD_DATA,
1207                       &error_abort);
1208 
1209     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
1210     blk_insert_bs(blk, bs, &error_abort);
1211 
1212     /* Referenced by blk now */
1213     bdrv_unref(bs);
1214 
1215     g_assert_cmpint(bs->refcnt, ==, 1);
1216     g_assert_cmpint(child_bs->refcnt, ==, 1);
1217     g_assert_cmpint(null_bs->refcnt, ==, 1);
1218 
1219 
1220     dbdd = (TestCoDeleteByDrainData){
1221         .blk = blk,
1222         .detach_instead_of_delete = detach_instead_of_delete,
1223         .done = false,
1224     };
1225     co = qemu_coroutine_create(test_co_delete_by_drain, &dbdd);
1226     qemu_coroutine_enter(co);
1227 
1228     /* Drain the child while the read operation is still pending.
1229      * This should result in the operation finishing and
1230      * test_co_delete_by_drain() resuming.  Thus, @bs will be deleted
1231      * and the coroutine will exit while this drain operation is still
1232      * in progress. */
1233     switch (drain_type) {
1234     case BDRV_DRAIN:
1235         bdrv_ref(child_bs);
1236         bdrv_drain(child_bs);
1237         bdrv_unref(child_bs);
1238         break;
1239     case BDRV_SUBTREE_DRAIN:
1240         /* Would have to ref/unref bs here for !detach_instead_of_delete, but
1241          * then the whole test becomes pointless because the graph changes
1242          * don't occur during the drain any more. */
1243         assert(detach_instead_of_delete);
1244         bdrv_subtree_drained_begin(bs);
1245         bdrv_subtree_drained_end(bs);
1246         break;
1247     case BDRV_DRAIN_ALL:
1248         bdrv_drain_all_begin();
1249         bdrv_drain_all_end();
1250         break;
1251     default:
1252         g_assert_not_reached();
1253     }
1254 
1255     while (!dbdd.done) {
1256         aio_poll(qemu_get_aio_context(), true);
1257     }
1258 
1259     if (detach_instead_of_delete) {
1260         /* Here, the reference has not passed over to the coroutine,
1261          * so we have to delete the BB ourselves */
1262         blk_unref(blk);
1263     }
1264 }
1265 
1266 static void test_delete_by_drain(void)
1267 {
1268     do_test_delete_by_drain(false, BDRV_DRAIN);
1269 }
1270 
1271 static void test_detach_by_drain_all(void)
1272 {
1273     do_test_delete_by_drain(true, BDRV_DRAIN_ALL);
1274 }
1275 
1276 static void test_detach_by_drain(void)
1277 {
1278     do_test_delete_by_drain(true, BDRV_DRAIN);
1279 }
1280 
1281 static void test_detach_by_drain_subtree(void)
1282 {
1283     do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
1284 }
1285 
1286 
1287 struct detach_by_parent_data {
1288     BlockDriverState *parent_b;
1289     BdrvChild *child_b;
1290     BlockDriverState *c;
1291     BdrvChild *child_c;
1292     bool by_parent_cb;
1293 };
1294 static struct detach_by_parent_data detach_by_parent_data;
1295 
1296 static void detach_indirect_bh(void *opaque)
1297 {
1298     struct detach_by_parent_data *data = opaque;
1299 
1300     bdrv_unref_child(data->parent_b, data->child_b);
1301 
1302     bdrv_ref(data->c);
1303     data->child_c = bdrv_attach_child(data->parent_b, data->c, "PB-C",
1304                                       &child_of_bds, BDRV_CHILD_DATA,
1305                                       &error_abort);
1306 }
1307 
1308 static void detach_by_parent_aio_cb(void *opaque, int ret)
1309 {
1310     struct detach_by_parent_data *data = &detach_by_parent_data;
1311 
1312     g_assert_cmpint(ret, ==, 0);
1313     if (data->by_parent_cb) {
1314         detach_indirect_bh(data);
1315     }
1316 }
1317 
1318 static void detach_by_driver_cb_drained_begin(BdrvChild *child)
1319 {
1320     aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
1321                             detach_indirect_bh, &detach_by_parent_data);
1322     child_of_bds.drained_begin(child);
1323 }
1324 
1325 static BdrvChildClass detach_by_driver_cb_class;
1326 
1327 /*
1328  * Initial graph:
1329  *
1330  * PA     PB
1331  *    \ /   \
1332  *     A     B     C
1333  *
1334  * by_parent_cb == true:  Test that parent callbacks don't poll
1335  *
1336  *     PA has a pending write request whose callback changes the child nodes of
1337  *     PB: It removes B and adds C instead. The subtree of PB is drained, which
1338  *     will indirectly drain the write request, too.
1339  *
1340  * by_parent_cb == false: Test that bdrv_drain_invoke() doesn't poll
1341  *
1342  *     PA's BdrvChildClass has a .drained_begin callback that schedules a BH
1343  *     that does the same graph change. If bdrv_drain_invoke() calls it, the
1344  *     state is messed up, but if it is only polled in the single
1345  *     BDRV_POLL_WHILE() at the end of the drain, this should work fine.
1346  */
1347 static void test_detach_indirect(bool by_parent_cb)
1348 {
1349     BlockBackend *blk;
1350     BlockDriverState *parent_a, *parent_b, *a, *b, *c;
1351     BdrvChild *child_a, *child_b;
1352     BlockAIOCB *acb;
1353 
1354     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
1355 
1356     if (!by_parent_cb) {
1357         detach_by_driver_cb_class = child_of_bds;
1358         detach_by_driver_cb_class.drained_begin =
1359             detach_by_driver_cb_drained_begin;
1360     }
1361 
1362     /* Create all involved nodes */
1363     parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
1364                                     &error_abort);
1365     parent_b = bdrv_new_open_driver(&bdrv_test, "parent-b", 0,
1366                                     &error_abort);
1367 
1368     a = bdrv_new_open_driver(&bdrv_test, "a", BDRV_O_RDWR, &error_abort);
1369     b = bdrv_new_open_driver(&bdrv_test, "b", BDRV_O_RDWR, &error_abort);
1370     c = bdrv_new_open_driver(&bdrv_test, "c", BDRV_O_RDWR, &error_abort);
1371 
1372     /* blk is a BB for parent-a */
1373     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
1374     blk_insert_bs(blk, parent_a, &error_abort);
1375     bdrv_unref(parent_a);
1376 
1377     /* If we want to get bdrv_drain_invoke() to call aio_poll(), the driver
1378      * callback must not return immediately. */
1379     if (!by_parent_cb) {
1380         BDRVTestState *s = parent_a->opaque;
1381         s->sleep_in_drain_begin = true;
1382     }
1383 
1384     /* Set child relationships */
1385     bdrv_ref(b);
1386     bdrv_ref(a);
1387     child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_of_bds,
1388                                 BDRV_CHILD_DATA, &error_abort);
1389     child_a = bdrv_attach_child(parent_b, a, "PB-A", &child_of_bds,
1390                                 BDRV_CHILD_COW, &error_abort);
1391 
1392     bdrv_ref(a);
1393     bdrv_attach_child(parent_a, a, "PA-A",
1394                       by_parent_cb ? &child_of_bds : &detach_by_driver_cb_class,
1395                       BDRV_CHILD_DATA, &error_abort);
1396 
1397     g_assert_cmpint(parent_a->refcnt, ==, 1);
1398     g_assert_cmpint(parent_b->refcnt, ==, 1);
1399     g_assert_cmpint(a->refcnt, ==, 3);
1400     g_assert_cmpint(b->refcnt, ==, 2);
1401     g_assert_cmpint(c->refcnt, ==, 1);
1402 
1403     g_assert(QLIST_FIRST(&parent_b->children) == child_a);
1404     g_assert(QLIST_NEXT(child_a, next) == child_b);
1405     g_assert(QLIST_NEXT(child_b, next) == NULL);
1406 
1407     /* Start the evil write request */
1408     detach_by_parent_data = (struct detach_by_parent_data) {
1409         .parent_b = parent_b,
1410         .child_b = child_b,
1411         .c = c,
1412         .by_parent_cb = by_parent_cb,
1413     };
1414     acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
1415     g_assert(acb != NULL);
1416 
1417     /* Drain and check the expected result */
1418     bdrv_subtree_drained_begin(parent_b);
1419 
1420     g_assert(detach_by_parent_data.child_c != NULL);
1421 
1422     g_assert_cmpint(parent_a->refcnt, ==, 1);
1423     g_assert_cmpint(parent_b->refcnt, ==, 1);
1424     g_assert_cmpint(a->refcnt, ==, 3);
1425     g_assert_cmpint(b->refcnt, ==, 1);
1426     g_assert_cmpint(c->refcnt, ==, 2);
1427 
1428     g_assert(QLIST_FIRST(&parent_b->children) == detach_by_parent_data.child_c);
1429     g_assert(QLIST_NEXT(detach_by_parent_data.child_c, next) == child_a);
1430     g_assert(QLIST_NEXT(child_a, next) == NULL);
1431 
1432     g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
1433     g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
1434     g_assert_cmpint(a->quiesce_counter, ==, 1);
1435     g_assert_cmpint(b->quiesce_counter, ==, 0);
1436     g_assert_cmpint(c->quiesce_counter, ==, 1);
1437 
1438     bdrv_subtree_drained_end(parent_b);
1439 
1440     bdrv_unref(parent_b);
1441     blk_unref(blk);
1442 
1443     g_assert_cmpint(a->refcnt, ==, 1);
1444     g_assert_cmpint(b->refcnt, ==, 1);
1445     g_assert_cmpint(c->refcnt, ==, 1);
1446     bdrv_unref(a);
1447     bdrv_unref(b);
1448     bdrv_unref(c);
1449 }
1450 
/* Graph change is made by the request completion callback of the parent. */
static void test_detach_by_parent_cb(void)
{
    const bool by_parent_cb = true;

    test_detach_indirect(by_parent_cb);
}
1455 
/* Graph change is made by a BH scheduled from a .drained_begin callback. */
static void test_detach_by_driver_cb(void)
{
    const bool by_parent_cb = false;

    test_detach_indirect(by_parent_cb);
}
1460 
1461 static void test_append_to_drained(void)
1462 {
1463     BlockBackend *blk;
1464     BlockDriverState *base, *overlay;
1465     BDRVTestState *base_s, *overlay_s;
1466 
1467     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
1468     base = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
1469     base_s = base->opaque;
1470     blk_insert_bs(blk, base, &error_abort);
1471 
1472     overlay = bdrv_new_open_driver(&bdrv_test, "overlay", BDRV_O_RDWR,
1473                                    &error_abort);
1474     overlay_s = overlay->opaque;
1475 
1476     do_drain_begin(BDRV_DRAIN, base);
1477     g_assert_cmpint(base->quiesce_counter, ==, 1);
1478     g_assert_cmpint(base_s->drain_count, ==, 1);
1479     g_assert_cmpint(base->in_flight, ==, 0);
1480 
1481     bdrv_append(overlay, base, &error_abort);
1482     g_assert_cmpint(base->in_flight, ==, 0);
1483     g_assert_cmpint(overlay->in_flight, ==, 0);
1484 
1485     g_assert_cmpint(base->quiesce_counter, ==, 1);
1486     g_assert_cmpint(base_s->drain_count, ==, 1);
1487     g_assert_cmpint(overlay->quiesce_counter, ==, 1);
1488     g_assert_cmpint(overlay_s->drain_count, ==, 1);
1489 
1490     do_drain_end(BDRV_DRAIN, base);
1491 
1492     g_assert_cmpint(base->quiesce_counter, ==, 0);
1493     g_assert_cmpint(base_s->drain_count, ==, 0);
1494     g_assert_cmpint(overlay->quiesce_counter, ==, 0);
1495     g_assert_cmpint(overlay_s->drain_count, ==, 0);
1496 
1497     bdrv_unref(overlay);
1498     bdrv_unref(base);
1499     blk_unref(blk);
1500 }
1501 
1502 static void test_set_aio_context(void)
1503 {
1504     BlockDriverState *bs;
1505     IOThread *a = iothread_new();
1506     IOThread *b = iothread_new();
1507     AioContext *ctx_a = iothread_get_aio_context(a);
1508     AioContext *ctx_b = iothread_get_aio_context(b);
1509 
1510     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
1511                               &error_abort);
1512 
1513     bdrv_drained_begin(bs);
1514     bdrv_try_set_aio_context(bs, ctx_a, &error_abort);
1515 
1516     aio_context_acquire(ctx_a);
1517     bdrv_drained_end(bs);
1518 
1519     bdrv_drained_begin(bs);
1520     bdrv_try_set_aio_context(bs, ctx_b, &error_abort);
1521     aio_context_release(ctx_a);
1522     aio_context_acquire(ctx_b);
1523     bdrv_try_set_aio_context(bs, qemu_get_aio_context(), &error_abort);
1524     aio_context_release(ctx_b);
1525     bdrv_drained_end(bs);
1526 
1527     bdrv_unref(bs);
1528     iothread_join(a);
1529     iothread_join(b);
1530 }
1531 
1532 
1533 typedef struct TestDropBackingBlockJob {
1534     BlockJob common;
1535     bool should_complete;
1536     bool *did_complete;
1537     BlockDriverState *detach_also;
1538 } TestDropBackingBlockJob;
1539 
1540 static int coroutine_fn test_drop_backing_job_run(Job *job, Error **errp)
1541 {
1542     TestDropBackingBlockJob *s =
1543         container_of(job, TestDropBackingBlockJob, common.job);
1544 
1545     while (!s->should_complete) {
1546         job_sleep_ns(job, 0);
1547     }
1548 
1549     return 0;
1550 }
1551 
1552 static void test_drop_backing_job_commit(Job *job)
1553 {
1554     TestDropBackingBlockJob *s =
1555         container_of(job, TestDropBackingBlockJob, common.job);
1556 
1557     bdrv_set_backing_hd(blk_bs(s->common.blk), NULL, &error_abort);
1558     bdrv_set_backing_hd(s->detach_also, NULL, &error_abort);
1559 
1560     *s->did_complete = true;
1561 }
1562 
1563 static const BlockJobDriver test_drop_backing_job_driver = {
1564     .job_driver = {
1565         .instance_size  = sizeof(TestDropBackingBlockJob),
1566         .free           = block_job_free,
1567         .user_resume    = block_job_user_resume,
1568         .run            = test_drop_backing_job_run,
1569         .commit         = test_drop_backing_job_commit,
1570     }
1571 };
1572 
1573 /**
1574  * Creates a child node with three parent nodes on it, and then runs a
1575  * block job on the final one, parent-node-2.
1576  *
1577  * The job is then asked to complete before a section where the child
1578  * is drained.
1579  *
1580  * Ending this section will undrain the child's parents, first
1581  * parent-node-2, then parent-node-1, then parent-node-0 -- the parent
1582  * list is in reverse order of how they were added.  Ending the drain
1583  * on parent-node-2 will resume the job, thus completing it and
1584  * scheduling job_exit().
1585  *
1586  * Ending the drain on parent-node-1 will poll the AioContext, which
1587  * lets job_exit() and thus test_drop_backing_job_commit() run.  That
1588  * function first removes the child as parent-node-2's backing file.
1589  *
1590  * In old (and buggy) implementations, there are two problems with
1591  * that:
1592  * (A) bdrv_drain_invoke() polls for every node that leaves the
1593  *     drained section.  This means that job_exit() is scheduled
1594  *     before the child has left the drained section.  Its
1595  *     quiesce_counter is therefore still 1 when it is removed from
1596  *     parent-node-2.
1597  *
1598  * (B) bdrv_replace_child_noperm() calls drained_end() on the old
1599  *     child's parents as many times as the child is quiesced.  This
1600  *     means it will call drained_end() on parent-node-2 once.
1601  *     Because parent-node-2 is no longer quiesced at this point, this
1602  *     will fail.
1603  *
1604  * bdrv_replace_child_noperm() therefore must call drained_end() on
1605  * the parent only if it really is still drained because the child is
1606  * drained.
1607  *
1608  * If removing child from parent-node-2 was successful (as it should
1609  * be), test_drop_backing_job_commit() will then also remove the child
1610  * from parent-node-0.
1611  *
1612  * With an old version of our drain infrastructure ((A) above), that
1613  * resulted in the following flow:
1614  *
1615  * 1. child attempts to leave its drained section.  The call recurses
1616  *    to its parents.
1617  *
1618  * 2. parent-node-2 leaves the drained section.  Polling in
1619  *    bdrv_drain_invoke() will schedule job_exit().
1620  *
1621  * 3. parent-node-1 leaves the drained section.  Polling in
1622  *    bdrv_drain_invoke() will run job_exit(), thus disconnecting
1623  *    parent-node-0 from the child node.
1624  *
1625  * 4. bdrv_parent_drained_end() uses a QLIST_FOREACH_SAFE() loop to
1626  *    iterate over the parents.  Thus, it now accesses the BdrvChild
1627  *    object that used to connect parent-node-0 and the child node.
1628  *    However, that object no longer exists, so it accesses a dangling
1629  *    pointer.
1630  *
1631  * The solution is to only poll once when running a bdrv_drained_end()
1632  * operation, specifically at the end when all drained_end()
1633  * operations for all involved nodes have been scheduled.
1634  * Note that this also solves (A) above, thus hiding (B).
1635  */
1636 static void test_blockjob_commit_by_drained_end(void)
1637 {
1638     BlockDriverState *bs_child, *bs_parents[3];
1639     TestDropBackingBlockJob *job;
1640     bool job_has_completed = false;
1641     int i;
1642 
1643     bs_child = bdrv_new_open_driver(&bdrv_test, "child-node", BDRV_O_RDWR,
1644                                     &error_abort);
1645 
1646     for (i = 0; i < 3; i++) {
1647         char name[32];
1648         snprintf(name, sizeof(name), "parent-node-%i", i);
1649         bs_parents[i] = bdrv_new_open_driver(&bdrv_test, name, BDRV_O_RDWR,
1650                                              &error_abort);
1651         bdrv_set_backing_hd(bs_parents[i], bs_child, &error_abort);
1652     }
1653 
1654     job = block_job_create("job", &test_drop_backing_job_driver, NULL,
1655                            bs_parents[2], 0, BLK_PERM_ALL, 0, 0, NULL, NULL,
1656                            &error_abort);
1657 
1658     job->detach_also = bs_parents[0];
1659     job->did_complete = &job_has_completed;
1660 
1661     job_start(&job->common.job);
1662 
1663     job->should_complete = true;
1664     bdrv_drained_begin(bs_child);
1665     g_assert(!job_has_completed);
1666     bdrv_drained_end(bs_child);
1667     g_assert(job_has_completed);
1668 
1669     bdrv_unref(bs_parents[0]);
1670     bdrv_unref(bs_parents[1]);
1671     bdrv_unref(bs_parents[2]);
1672     bdrv_unref(bs_child);
1673 }
1674 
1675 
1676 typedef struct TestSimpleBlockJob {
1677     BlockJob common;
1678     bool should_complete;
1679     bool *did_complete;
1680 } TestSimpleBlockJob;
1681 
1682 static int coroutine_fn test_simple_job_run(Job *job, Error **errp)
1683 {
1684     TestSimpleBlockJob *s = container_of(job, TestSimpleBlockJob, common.job);
1685 
1686     while (!s->should_complete) {
1687         job_sleep_ns(job, 0);
1688     }
1689 
1690     return 0;
1691 }
1692 
1693 static void test_simple_job_clean(Job *job)
1694 {
1695     TestSimpleBlockJob *s = container_of(job, TestSimpleBlockJob, common.job);
1696     *s->did_complete = true;
1697 }
1698 
1699 static const BlockJobDriver test_simple_job_driver = {
1700     .job_driver = {
1701         .instance_size  = sizeof(TestSimpleBlockJob),
1702         .free           = block_job_free,
1703         .user_resume    = block_job_user_resume,
1704         .run            = test_simple_job_run,
1705         .clean          = test_simple_job_clean,
1706     },
1707 };
1708 
1709 static int drop_intermediate_poll_update_filename(BdrvChild *child,
1710                                                   BlockDriverState *new_base,
1711                                                   const char *filename,
1712                                                   Error **errp)
1713 {
1714     /*
1715      * We are free to poll here, which may change the block graph, if
1716      * it is not drained.
1717      */
1718 
1719     /* If the job is not drained: Complete it, schedule job_exit() */
1720     aio_poll(qemu_get_current_aio_context(), false);
1721     /* If the job is not drained: Run job_exit(), finish the job */
1722     aio_poll(qemu_get_current_aio_context(), false);
1723 
1724     return 0;
1725 }
1726 
1727 /**
1728  * Test a poll in the midst of bdrv_drop_intermediate().
1729  *
1730  * bdrv_drop_intermediate() calls BdrvChildClass.update_filename(),
1731  * which can yield or poll.  This may lead to graph changes, unless
1732  * the whole subtree in question is drained.
1733  *
1734  * We test this on the following graph:
1735  *
1736  *                    Job
1737  *
1738  *                     |
1739  *                  job-node
1740  *                     |
1741  *                     v
1742  *
1743  *                  job-node
1744  *
1745  *                     |
1746  *                  backing
1747  *                     |
1748  *                     v
1749  *
1750  * node-2 --chain--> node-1 --chain--> node-0
1751  *
1752  * We drop node-1 with bdrv_drop_intermediate(top=node-1, base=node-0).
1753  *
1754  * This first updates node-2's backing filename by invoking
1755  * drop_intermediate_poll_update_filename(), which polls twice.  This
1756  * causes the job to finish, which in turns causes the job-node to be
1757  * deleted.
1758  *
1759  * bdrv_drop_intermediate() uses a QLIST_FOREACH_SAFE() loop, so it
1760  * already has a pointer to the BdrvChild edge between job-node and
1761  * node-1.  When it tries to handle that edge, we probably get a
1762  * segmentation fault because the object no longer exists.
1763  *
1764  *
1765  * The solution is for bdrv_drop_intermediate() to drain top's
1766  * subtree.  This prevents graph changes from happening just because
1767  * BdrvChildClass.update_filename() yields or polls.  Thus, the block
1768  * job is paused during that drained section and must finish before or
1769  * after.
1770  *
1771  * (In addition, bdrv_replace_child() must keep the job paused.)
1772  */
1773 static void test_drop_intermediate_poll(void)
1774 {
1775     static BdrvChildClass chain_child_class;
1776     BlockDriverState *chain[3];
1777     TestSimpleBlockJob *job;
1778     BlockDriverState *job_node;
1779     bool job_has_completed = false;
1780     int i;
1781     int ret;
1782 
1783     chain_child_class = child_of_bds;
1784     chain_child_class.update_filename = drop_intermediate_poll_update_filename;
1785 
1786     for (i = 0; i < 3; i++) {
1787         char name[32];
1788         snprintf(name, 32, "node-%i", i);
1789 
1790         chain[i] = bdrv_new_open_driver(&bdrv_test, name, 0, &error_abort);
1791     }
1792 
1793     job_node = bdrv_new_open_driver(&bdrv_test, "job-node", BDRV_O_RDWR,
1794                                     &error_abort);
1795     bdrv_set_backing_hd(job_node, chain[1], &error_abort);
1796 
1797     /*
1798      * Establish the chain last, so the chain links are the first
1799      * elements in the BDS.parents lists
1800      */
1801     for (i = 0; i < 3; i++) {
1802         if (i) {
1803             /* Takes the reference to chain[i - 1] */
1804             chain[i]->backing = bdrv_attach_child(chain[i], chain[i - 1],
1805                                                   "chain", &chain_child_class,
1806                                                   BDRV_CHILD_COW, &error_abort);
1807         }
1808     }
1809 
1810     job = block_job_create("job", &test_simple_job_driver, NULL, job_node,
1811                            0, BLK_PERM_ALL, 0, 0, NULL, NULL, &error_abort);
1812 
1813     /* The job has a reference now */
1814     bdrv_unref(job_node);
1815 
1816     job->did_complete = &job_has_completed;
1817 
1818     job_start(&job->common.job);
1819     job->should_complete = true;
1820 
1821     g_assert(!job_has_completed);
1822     ret = bdrv_drop_intermediate(chain[1], chain[0], NULL);
1823     g_assert(ret == 0);
1824     g_assert(job_has_completed);
1825 
1826     bdrv_unref(chain[2]);
1827 }
1828 
1829 
1830 typedef struct BDRVReplaceTestState {
1831     bool was_drained;
1832     bool was_undrained;
1833     bool has_read;
1834 
1835     int drain_count;
1836 
1837     bool yield_before_read;
1838     Coroutine *io_co;
1839     Coroutine *drain_co;
1840 } BDRVReplaceTestState;
1841 
1842 static void bdrv_replace_test_close(BlockDriverState *bs)
1843 {
1844 }
1845 
1846 /**
1847  * If @bs has a backing file:
1848  *   Yield if .yield_before_read is true (and wait for drain_begin to
1849  *   wake us up).
1850  *   Forward the read to bs->backing.  Set .has_read to true.
1851  *   If drain_begin has woken us, wake it in turn.
1852  *
1853  * Otherwise:
1854  *   Set .has_read to true and return success.
1855  */
1856 static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs,
1857                                                     uint64_t offset,
1858                                                     uint64_t bytes,
1859                                                     QEMUIOVector *qiov,
1860                                                     int flags)
1861 {
1862     BDRVReplaceTestState *s = bs->opaque;
1863 
1864     if (bs->backing) {
1865         int ret;
1866 
1867         g_assert(!s->drain_count);
1868 
1869         s->io_co = qemu_coroutine_self();
1870         if (s->yield_before_read) {
1871             s->yield_before_read = false;
1872             qemu_coroutine_yield();
1873         }
1874         s->io_co = NULL;
1875 
1876         ret = bdrv_co_preadv(bs->backing, offset, bytes, qiov, 0);
1877         s->has_read = true;
1878 
1879         /* Wake up drain_co if it runs */
1880         if (s->drain_co) {
1881             aio_co_wake(s->drain_co);
1882         }
1883 
1884         return ret;
1885     }
1886 
1887     s->has_read = true;
1888     return 0;
1889 }
1890 
1891 /**
1892  * If .drain_count is 0, wake up .io_co if there is one; and set
1893  * .was_drained.
1894  * Increment .drain_count.
1895  */
1896 static void coroutine_fn bdrv_replace_test_co_drain_begin(BlockDriverState *bs)
1897 {
1898     BDRVReplaceTestState *s = bs->opaque;
1899 
1900     if (!s->drain_count) {
1901         /* Keep waking io_co up until it is done */
1902         s->drain_co = qemu_coroutine_self();
1903         while (s->io_co) {
1904             aio_co_wake(s->io_co);
1905             s->io_co = NULL;
1906             qemu_coroutine_yield();
1907         }
1908         s->drain_co = NULL;
1909 
1910         s->was_drained = true;
1911     }
1912     s->drain_count++;
1913 }
1914 
1915 /**
1916  * Reduce .drain_count, set .was_undrained once it reaches 0.
1917  * If .drain_count reaches 0 and the node has a backing file, issue a
1918  * read request.
1919  */
1920 static void coroutine_fn bdrv_replace_test_co_drain_end(BlockDriverState *bs)
1921 {
1922     BDRVReplaceTestState *s = bs->opaque;
1923 
1924     g_assert(s->drain_count > 0);
1925     if (!--s->drain_count) {
1926         int ret;
1927 
1928         s->was_undrained = true;
1929 
1930         if (bs->backing) {
1931             char data;
1932             QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
1933 
1934             /* Queue a read request post-drain */
1935             ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
1936             g_assert(ret >= 0);
1937         }
1938     }
1939 }
1940 
1941 static BlockDriver bdrv_replace_test = {
1942     .format_name            = "replace_test",
1943     .instance_size          = sizeof(BDRVReplaceTestState),
1944 
1945     .bdrv_close             = bdrv_replace_test_close,
1946     .bdrv_co_preadv         = bdrv_replace_test_co_preadv,
1947 
1948     .bdrv_co_drain_begin    = bdrv_replace_test_co_drain_begin,
1949     .bdrv_co_drain_end      = bdrv_replace_test_co_drain_end,
1950 
1951     .bdrv_child_perm        = bdrv_default_perms,
1952 };
1953 
1954 static void coroutine_fn test_replace_child_mid_drain_read_co(void *opaque)
1955 {
1956     int ret;
1957     char data;
1958 
1959     ret = blk_co_pread(opaque, 0, 1, &data, 0);
1960     g_assert(ret >= 0);
1961 }
1962 
1963 /**
1964  * We test two things:
1965  * (1) bdrv_replace_child_noperm() must not undrain the parent if both
1966  *     children are drained.
1967  * (2) bdrv_replace_child_noperm() must never flush I/O requests to a
1968  *     drained child.  If the old child is drained, it must flush I/O
1969  *     requests after the new one has been attached.  If the new child
1970  *     is drained, it must flush I/O requests before the old one is
1971  *     detached.
1972  *
1973  * To do so, we create one parent node and two child nodes; then
1974  * attach one of the children (old_child_bs) to the parent, then
1975  * drain both old_child_bs and new_child_bs according to
1976  * old_drain_count and new_drain_count, respectively, and finally
1977  * we invoke bdrv_replace_node() to replace old_child_bs by
1978  * new_child_bs.
1979  *
1980  * The test block driver we use here (bdrv_replace_test) has a read
1981  * function that:
1982  * - For the parent node, can optionally yield, and then forwards the
1983  *   read to bdrv_preadv(),
1984  * - For the child node, just returns immediately.
1985  *
1986  * If the read yields, the drain_begin function will wake it up.
1987  *
1988  * The drain_end function issues a read on the parent once it is fully
1989  * undrained (which simulates requests starting to come in again).
1990  */
1991 static void do_test_replace_child_mid_drain(int old_drain_count,
1992                                             int new_drain_count)
1993 {
1994     BlockBackend *parent_blk;
1995     BlockDriverState *parent_bs;
1996     BlockDriverState *old_child_bs, *new_child_bs;
1997     BDRVReplaceTestState *parent_s;
1998     BDRVReplaceTestState *old_child_s, *new_child_s;
1999     Coroutine *io_co;
2000     int i;
2001 
2002     parent_bs = bdrv_new_open_driver(&bdrv_replace_test, "parent", 0,
2003                                      &error_abort);
2004     parent_s = parent_bs->opaque;
2005 
2006     parent_blk = blk_new(qemu_get_aio_context(),
2007                          BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
2008     blk_insert_bs(parent_blk, parent_bs, &error_abort);
2009 
2010     old_child_bs = bdrv_new_open_driver(&bdrv_replace_test, "old-child", 0,
2011                                         &error_abort);
2012     new_child_bs = bdrv_new_open_driver(&bdrv_replace_test, "new-child", 0,
2013                                         &error_abort);
2014     old_child_s = old_child_bs->opaque;
2015     new_child_s = new_child_bs->opaque;
2016 
2017     /* So that we can read something */
2018     parent_bs->total_sectors = 1;
2019     old_child_bs->total_sectors = 1;
2020     new_child_bs->total_sectors = 1;
2021 
2022     bdrv_ref(old_child_bs);
2023     parent_bs->backing = bdrv_attach_child(parent_bs, old_child_bs, "child",
2024                                            &child_of_bds, BDRV_CHILD_COW,
2025                                            &error_abort);
2026 
2027     for (i = 0; i < old_drain_count; i++) {
2028         bdrv_drained_begin(old_child_bs);
2029     }
2030     for (i = 0; i < new_drain_count; i++) {
2031         bdrv_drained_begin(new_child_bs);
2032     }
2033 
2034     if (!old_drain_count) {
2035         /*
2036          * Start a read operation that will yield, so it will not
2037          * complete before the node is drained.
2038          */
2039         parent_s->yield_before_read = true;
2040         io_co = qemu_coroutine_create(test_replace_child_mid_drain_read_co,
2041                                       parent_blk);
2042         qemu_coroutine_enter(io_co);
2043     }
2044 
2045     /* If we have started a read operation, it should have yielded */
2046     g_assert(!parent_s->has_read);
2047 
2048     /* Reset drained status so we can see what bdrv_replace_node() does */
2049     parent_s->was_drained = false;
2050     parent_s->was_undrained = false;
2051 
2052     g_assert(parent_bs->quiesce_counter == old_drain_count);
2053     bdrv_replace_node(old_child_bs, new_child_bs, &error_abort);
2054     g_assert(parent_bs->quiesce_counter == new_drain_count);
2055 
2056     if (!old_drain_count && !new_drain_count) {
2057         /*
2058          * From undrained to undrained drains and undrains the parent,
2059          * because bdrv_replace_node() contains a drained section for
2060          * @old_child_bs.
2061          */
2062         g_assert(parent_s->was_drained && parent_s->was_undrained);
2063     } else if (!old_drain_count && new_drain_count) {
2064         /*
2065          * From undrained to drained should drain the parent and keep
2066          * it that way.
2067          */
2068         g_assert(parent_s->was_drained && !parent_s->was_undrained);
2069     } else if (old_drain_count && !new_drain_count) {
2070         /*
2071          * From drained to undrained should undrain the parent and
2072          * keep it that way.
2073          */
2074         g_assert(!parent_s->was_drained && parent_s->was_undrained);
2075     } else /* if (old_drain_count && new_drain_count) */ {
2076         /*
2077          * From drained to drained must not undrain the parent at any
2078          * point
2079          */
2080         g_assert(!parent_s->was_drained && !parent_s->was_undrained);
2081     }
2082 
2083     if (!old_drain_count || !new_drain_count) {
2084         /*
2085          * If !old_drain_count, we have started a read request before
2086          * bdrv_replace_node().  If !new_drain_count, the parent must
2087          * have been undrained at some point, and
2088          * bdrv_replace_test_co_drain_end() starts a read request
2089          * then.
2090          */
2091         g_assert(parent_s->has_read);
2092     } else {
2093         /*
2094          * If the parent was never undrained, there is no way to start
2095          * a read request.
2096          */
2097         g_assert(!parent_s->has_read);
2098     }
2099 
2100     /* A drained child must have not received any request */
2101     g_assert(!(old_drain_count && old_child_s->has_read));
2102     g_assert(!(new_drain_count && new_child_s->has_read));
2103 
2104     for (i = 0; i < new_drain_count; i++) {
2105         bdrv_drained_end(new_child_bs);
2106     }
2107     for (i = 0; i < old_drain_count; i++) {
2108         bdrv_drained_end(old_child_bs);
2109     }
2110 
2111     /*
2112      * By now, bdrv_replace_test_co_drain_end() must have been called
2113      * at some point while the new child was attached to the parent.
2114      */
2115     g_assert(parent_s->has_read);
2116     g_assert(new_child_s->has_read);
2117 
2118     blk_unref(parent_blk);
2119     bdrv_unref(parent_bs);
2120     bdrv_unref(old_child_bs);
2121     bdrv_unref(new_child_bs);
2122 }
2123 
/*
 * Run do_test_replace_child_mid_drain() for every combination of
 * old/new drain count in {0, 1}, in the order (0,0), (0,1), (1,0), (1,1).
 */
static void test_replace_child_mid_drain(void)
{
    int combo;

    for (combo = 0; combo < 4; combo++) {
        int old_drain_count = combo / 2;
        int new_drain_count = combo % 2;

        do_test_replace_child_mid_drain(old_drain_count, new_drain_count);
    }
}
2134 
2135 int main(int argc, char **argv)
2136 {
2137     int ret;
2138 
2139     bdrv_init();
2140     qemu_init_main_loop(&error_abort);
2141 
2142     g_test_init(&argc, &argv, NULL);
2143     qemu_event_init(&done_event, false);
2144 
2145     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
2146     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
2147     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
2148                     test_drv_cb_drain_subtree);
2149 
2150     g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
2151                     test_drv_cb_co_drain_all);
2152     g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
2153     g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
2154                     test_drv_cb_co_drain_subtree);
2155 
2156 
2157     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
2158     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
2159     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
2160                     test_quiesce_drain_subtree);
2161 
2162     g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
2163                     test_quiesce_co_drain_all);
2164     g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
2165     g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
2166                     test_quiesce_co_drain_subtree);
2167 
2168     g_test_add_func("/bdrv-drain/nested", test_nested);
2169     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
2170 
2171     g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
2172                     test_graph_change_drain_subtree);
2173     g_test_add_func("/bdrv-drain/graph-change/drain_all",
2174                     test_graph_change_drain_all);
2175 
2176     g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
2177     g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
2178     g_test_add_func("/bdrv-drain/iothread/drain_subtree",
2179                     test_iothread_drain_subtree);
2180 
2181     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
2182     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
2183     g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
2184                     test_blockjob_drain_subtree);
2185 
2186     g_test_add_func("/bdrv-drain/blockjob/error/drain_all",
2187                     test_blockjob_error_drain_all);
2188     g_test_add_func("/bdrv-drain/blockjob/error/drain",
2189                     test_blockjob_error_drain);
2190     g_test_add_func("/bdrv-drain/blockjob/error/drain_subtree",
2191                     test_blockjob_error_drain_subtree);
2192 
2193     g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all",
2194                     test_blockjob_iothread_drain_all);
2195     g_test_add_func("/bdrv-drain/blockjob/iothread/drain",
2196                     test_blockjob_iothread_drain);
2197     g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree",
2198                     test_blockjob_iothread_drain_subtree);
2199 
2200     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_all",
2201                     test_blockjob_iothread_error_drain_all);
2202     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain",
2203                     test_blockjob_iothread_error_drain);
2204     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_subtree",
2205                     test_blockjob_iothread_error_drain_subtree);
2206 
2207     g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
2208     g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
2209     g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
2210     g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
2211     g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
2212     g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
2213 
2214     g_test_add_func("/bdrv-drain/attach/drain", test_append_to_drained);
2215 
2216     g_test_add_func("/bdrv-drain/set_aio_context", test_set_aio_context);
2217 
2218     g_test_add_func("/bdrv-drain/blockjob/commit_by_drained_end",
2219                     test_blockjob_commit_by_drained_end);
2220 
2221     g_test_add_func("/bdrv-drain/bdrv_drop_intermediate/poll",
2222                     test_drop_intermediate_poll);
2223 
2224     g_test_add_func("/bdrv-drain/replace_child/mid-drain",
2225                     test_replace_child_mid_drain);
2226 
2227     ret = g_test_run();
2228     qemu_event_destroy(&done_event);
2229     return ret;
2230 }
2231