xref: /openbmc/qemu/tests/unit/test-bdrv-drain.c (revision a05ca2d4163139c5f2e5488c36326f725a11a6d0)
1 /*
2  * Block node draining tests
3  *
4  * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "block/block.h"
27 #include "block/blockjob_int.h"
28 #include "sysemu/block-backend.h"
29 #include "qapi/error.h"
30 #include "qemu/main-loop.h"
31 #include "iothread.h"
32 
33 static QemuEvent done_event;
34 
35 typedef struct BDRVTestState {
36     int drain_count;
37     AioContext *bh_indirection_ctx;
38     bool sleep_in_drain_begin;
39 } BDRVTestState;
40 
41 static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
42 {
43     BDRVTestState *s = bs->opaque;
44     s->drain_count++;
45     if (s->sleep_in_drain_begin) {
46         qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
47     }
48 }
49 
50 static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
51 {
52     BDRVTestState *s = bs->opaque;
53     s->drain_count--;
54 }
55 
56 static void bdrv_test_close(BlockDriverState *bs)
57 {
58     BDRVTestState *s = bs->opaque;
59     g_assert_cmpint(s->drain_count, >, 0);
60 }
61 
62 static void co_reenter_bh(void *opaque)
63 {
64     aio_co_wake(opaque);
65 }
66 
67 static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
68                                             uint64_t offset, uint64_t bytes,
69                                             QEMUIOVector *qiov, int flags)
70 {
71     BDRVTestState *s = bs->opaque;
72 
73     /* We want this request to stay until the polling loop in drain waits for
74      * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
75      * first and polls its result, too, but it shouldn't accidentally complete
76      * this request yet. */
77     qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
78 
79     if (s->bh_indirection_ctx) {
80         aio_bh_schedule_oneshot(s->bh_indirection_ctx, co_reenter_bh,
81                                 qemu_coroutine_self());
82         qemu_coroutine_yield();
83     }
84 
85     return 0;
86 }
87 
88 static int bdrv_test_change_backing_file(BlockDriverState *bs,
89                                          const char *backing_file,
90                                          const char *backing_fmt)
91 {
92     return 0;
93 }
94 
95 static BlockDriver bdrv_test = {
96     .format_name            = "test",
97     .instance_size          = sizeof(BDRVTestState),
98     .supports_backing       = true,
99 
100     .bdrv_close             = bdrv_test_close,
101     .bdrv_co_preadv         = bdrv_test_co_preadv,
102 
103     .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
104     .bdrv_co_drain_end      = bdrv_test_co_drain_end,
105 
106     .bdrv_child_perm        = bdrv_default_perms,
107 
108     .bdrv_change_backing_file = bdrv_test_change_backing_file,
109 };
110 
/*
 * AIO completion callback: stores @ret in the int that @opaque points to.
 */
static void aio_ret_cb(void *opaque, int ret)
{
    *(int *)opaque = ret;
}
116 
/*
 * Argument block handed to call_in_coroutine_entry().
 */
typedef struct CallInCoroutineData {
    /* Function to run inside the coroutine */
    void (*entry)(void);
    /* Set to true once entry() has returned */
    bool done;
} CallInCoroutineData;
121 
122 static coroutine_fn void call_in_coroutine_entry(void *opaque)
123 {
124     CallInCoroutineData *data = opaque;
125 
126     data->entry();
127     data->done = true;
128 }
129 
130 static void call_in_coroutine(void (*entry)(void))
131 {
132     Coroutine *co;
133     CallInCoroutineData data = {
134         .entry  = entry,
135         .done   = false,
136     };
137 
138     co = qemu_coroutine_create(call_in_coroutine_entry, &data);
139     qemu_coroutine_enter(co);
140     while (!data.done) {
141         aio_poll(qemu_get_aio_context(), true);
142     }
143 }
144 
/*
 * The flavours of drain exercised by the tests.  DRAIN_TYPE_MAX is not a
 * drain type itself; it is the loop bound used to iterate over all of them.
 */
enum drain_type {
    BDRV_DRAIN_ALL,         /* bdrv_drain_all_begin/end */
    BDRV_DRAIN,             /* bdrv_drained_begin/end */
    BDRV_SUBTREE_DRAIN,     /* bdrv_subtree_drained_begin/end */
    DRAIN_TYPE_MAX,
};
151 
152 static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
153 {
154     switch (drain_type) {
155     case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
156     case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
157     case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_begin(bs); break;
158     default:                    g_assert_not_reached();
159     }
160 }
161 
162 static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
163 {
164     switch (drain_type) {
165     case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
166     case BDRV_DRAIN:            bdrv_drained_end(bs); break;
167     case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_end(bs); break;
168     default:                    g_assert_not_reached();
169     }
170 }
171 
172 static void do_drain_begin_unlocked(enum drain_type drain_type, BlockDriverState *bs)
173 {
174     if (drain_type != BDRV_DRAIN_ALL) {
175         aio_context_acquire(bdrv_get_aio_context(bs));
176     }
177     do_drain_begin(drain_type, bs);
178     if (drain_type != BDRV_DRAIN_ALL) {
179         aio_context_release(bdrv_get_aio_context(bs));
180     }
181 }
182 
183 static void do_drain_end_unlocked(enum drain_type drain_type, BlockDriverState *bs)
184 {
185     if (drain_type != BDRV_DRAIN_ALL) {
186         aio_context_acquire(bdrv_get_aio_context(bs));
187     }
188     do_drain_end(drain_type, bs);
189     if (drain_type != BDRV_DRAIN_ALL) {
190         aio_context_release(bdrv_get_aio_context(bs));
191     }
192 }
193 
194 static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
195 {
196     BlockBackend *blk;
197     BlockDriverState *bs, *backing;
198     BDRVTestState *s, *backing_s;
199     BlockAIOCB *acb;
200     int aio_ret;
201 
202     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
203 
204     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
205     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
206                               &error_abort);
207     s = bs->opaque;
208     blk_insert_bs(blk, bs, &error_abort);
209 
210     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
211     backing_s = backing->opaque;
212     bdrv_set_backing_hd(bs, backing, &error_abort);
213 
214     /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
215     g_assert_cmpint(s->drain_count, ==, 0);
216     g_assert_cmpint(backing_s->drain_count, ==, 0);
217 
218     do_drain_begin(drain_type, bs);
219 
220     g_assert_cmpint(s->drain_count, ==, 1);
221     g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
222 
223     do_drain_end(drain_type, bs);
224 
225     g_assert_cmpint(s->drain_count, ==, 0);
226     g_assert_cmpint(backing_s->drain_count, ==, 0);
227 
228     /* Now do the same while a request is pending */
229     aio_ret = -EINPROGRESS;
230     acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
231     g_assert(acb != NULL);
232     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
233 
234     g_assert_cmpint(s->drain_count, ==, 0);
235     g_assert_cmpint(backing_s->drain_count, ==, 0);
236 
237     do_drain_begin(drain_type, bs);
238 
239     g_assert_cmpint(aio_ret, ==, 0);
240     g_assert_cmpint(s->drain_count, ==, 1);
241     g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
242 
243     do_drain_end(drain_type, bs);
244 
245     g_assert_cmpint(s->drain_count, ==, 0);
246     g_assert_cmpint(backing_s->drain_count, ==, 0);
247 
248     bdrv_unref(backing);
249     bdrv_unref(bs);
250     blk_unref(blk);
251 }
252 
253 static void test_drv_cb_drain_all(void)
254 {
255     test_drv_cb_common(BDRV_DRAIN_ALL, true);
256 }
257 
258 static void test_drv_cb_drain(void)
259 {
260     test_drv_cb_common(BDRV_DRAIN, false);
261 }
262 
263 static void test_drv_cb_drain_subtree(void)
264 {
265     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
266 }
267 
268 static void test_drv_cb_co_drain_all(void)
269 {
270     call_in_coroutine(test_drv_cb_drain_all);
271 }
272 
273 static void test_drv_cb_co_drain(void)
274 {
275     call_in_coroutine(test_drv_cb_drain);
276 }
277 
278 static void test_drv_cb_co_drain_subtree(void)
279 {
280     call_in_coroutine(test_drv_cb_drain_subtree);
281 }
282 
283 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
284 {
285     BlockBackend *blk;
286     BlockDriverState *bs, *backing;
287 
288     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
289     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
290                               &error_abort);
291     blk_insert_bs(blk, bs, &error_abort);
292 
293     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
294     bdrv_set_backing_hd(bs, backing, &error_abort);
295 
296     g_assert_cmpint(bs->quiesce_counter, ==, 0);
297     g_assert_cmpint(backing->quiesce_counter, ==, 0);
298 
299     do_drain_begin(drain_type, bs);
300 
301     g_assert_cmpint(bs->quiesce_counter, ==, 1);
302     g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
303 
304     do_drain_end(drain_type, bs);
305 
306     g_assert_cmpint(bs->quiesce_counter, ==, 0);
307     g_assert_cmpint(backing->quiesce_counter, ==, 0);
308 
309     bdrv_unref(backing);
310     bdrv_unref(bs);
311     blk_unref(blk);
312 }
313 
314 static void test_quiesce_drain_all(void)
315 {
316     test_quiesce_common(BDRV_DRAIN_ALL, true);
317 }
318 
319 static void test_quiesce_drain(void)
320 {
321     test_quiesce_common(BDRV_DRAIN, false);
322 }
323 
324 static void test_quiesce_drain_subtree(void)
325 {
326     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
327 }
328 
329 static void test_quiesce_co_drain_all(void)
330 {
331     call_in_coroutine(test_quiesce_drain_all);
332 }
333 
334 static void test_quiesce_co_drain(void)
335 {
336     call_in_coroutine(test_quiesce_drain);
337 }
338 
339 static void test_quiesce_co_drain_subtree(void)
340 {
341     call_in_coroutine(test_quiesce_drain_subtree);
342 }
343 
344 static void test_nested(void)
345 {
346     BlockBackend *blk;
347     BlockDriverState *bs, *backing;
348     BDRVTestState *s, *backing_s;
349     enum drain_type outer, inner;
350 
351     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
352     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
353                               &error_abort);
354     s = bs->opaque;
355     blk_insert_bs(blk, bs, &error_abort);
356 
357     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
358     backing_s = backing->opaque;
359     bdrv_set_backing_hd(bs, backing, &error_abort);
360 
361     for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
362         for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
363             int backing_quiesce = (outer != BDRV_DRAIN) +
364                                   (inner != BDRV_DRAIN);
365 
366             g_assert_cmpint(bs->quiesce_counter, ==, 0);
367             g_assert_cmpint(backing->quiesce_counter, ==, 0);
368             g_assert_cmpint(s->drain_count, ==, 0);
369             g_assert_cmpint(backing_s->drain_count, ==, 0);
370 
371             do_drain_begin(outer, bs);
372             do_drain_begin(inner, bs);
373 
374             g_assert_cmpint(bs->quiesce_counter, ==, 2);
375             g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
376             g_assert_cmpint(s->drain_count, ==, 2);
377             g_assert_cmpint(backing_s->drain_count, ==, backing_quiesce);
378 
379             do_drain_end(inner, bs);
380             do_drain_end(outer, bs);
381 
382             g_assert_cmpint(bs->quiesce_counter, ==, 0);
383             g_assert_cmpint(backing->quiesce_counter, ==, 0);
384             g_assert_cmpint(s->drain_count, ==, 0);
385             g_assert_cmpint(backing_s->drain_count, ==, 0);
386         }
387     }
388 
389     bdrv_unref(backing);
390     bdrv_unref(bs);
391     blk_unref(blk);
392 }
393 
394 static void test_multiparent(void)
395 {
396     BlockBackend *blk_a, *blk_b;
397     BlockDriverState *bs_a, *bs_b, *backing;
398     BDRVTestState *a_s, *b_s, *backing_s;
399 
400     blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
401     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
402                                 &error_abort);
403     a_s = bs_a->opaque;
404     blk_insert_bs(blk_a, bs_a, &error_abort);
405 
406     blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
407     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
408                                 &error_abort);
409     b_s = bs_b->opaque;
410     blk_insert_bs(blk_b, bs_b, &error_abort);
411 
412     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
413     backing_s = backing->opaque;
414     bdrv_set_backing_hd(bs_a, backing, &error_abort);
415     bdrv_set_backing_hd(bs_b, backing, &error_abort);
416 
417     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
418     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
419     g_assert_cmpint(backing->quiesce_counter, ==, 0);
420     g_assert_cmpint(a_s->drain_count, ==, 0);
421     g_assert_cmpint(b_s->drain_count, ==, 0);
422     g_assert_cmpint(backing_s->drain_count, ==, 0);
423 
424     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
425 
426     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
427     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
428     g_assert_cmpint(backing->quiesce_counter, ==, 1);
429     g_assert_cmpint(a_s->drain_count, ==, 1);
430     g_assert_cmpint(b_s->drain_count, ==, 1);
431     g_assert_cmpint(backing_s->drain_count, ==, 1);
432 
433     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
434 
435     g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
436     g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
437     g_assert_cmpint(backing->quiesce_counter, ==, 2);
438     g_assert_cmpint(a_s->drain_count, ==, 2);
439     g_assert_cmpint(b_s->drain_count, ==, 2);
440     g_assert_cmpint(backing_s->drain_count, ==, 2);
441 
442     do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
443 
444     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
445     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
446     g_assert_cmpint(backing->quiesce_counter, ==, 1);
447     g_assert_cmpint(a_s->drain_count, ==, 1);
448     g_assert_cmpint(b_s->drain_count, ==, 1);
449     g_assert_cmpint(backing_s->drain_count, ==, 1);
450 
451     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
452 
453     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
454     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
455     g_assert_cmpint(backing->quiesce_counter, ==, 0);
456     g_assert_cmpint(a_s->drain_count, ==, 0);
457     g_assert_cmpint(b_s->drain_count, ==, 0);
458     g_assert_cmpint(backing_s->drain_count, ==, 0);
459 
460     bdrv_unref(backing);
461     bdrv_unref(bs_a);
462     bdrv_unref(bs_b);
463     blk_unref(blk_a);
464     blk_unref(blk_b);
465 }
466 
467 static void test_graph_change_drain_subtree(void)
468 {
469     BlockBackend *blk_a, *blk_b;
470     BlockDriverState *bs_a, *bs_b, *backing;
471     BDRVTestState *a_s, *b_s, *backing_s;
472 
473     blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
474     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
475                                 &error_abort);
476     a_s = bs_a->opaque;
477     blk_insert_bs(blk_a, bs_a, &error_abort);
478 
479     blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
480     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
481                                 &error_abort);
482     b_s = bs_b->opaque;
483     blk_insert_bs(blk_b, bs_b, &error_abort);
484 
485     backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
486     backing_s = backing->opaque;
487     bdrv_set_backing_hd(bs_a, backing, &error_abort);
488 
489     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
490     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
491     g_assert_cmpint(backing->quiesce_counter, ==, 0);
492     g_assert_cmpint(a_s->drain_count, ==, 0);
493     g_assert_cmpint(b_s->drain_count, ==, 0);
494     g_assert_cmpint(backing_s->drain_count, ==, 0);
495 
496     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
497     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
498     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
499     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
500     do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
501 
502     bdrv_set_backing_hd(bs_b, backing, &error_abort);
503     g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
504     g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
505     g_assert_cmpint(backing->quiesce_counter, ==, 5);
506     g_assert_cmpint(a_s->drain_count, ==, 5);
507     g_assert_cmpint(b_s->drain_count, ==, 5);
508     g_assert_cmpint(backing_s->drain_count, ==, 5);
509 
510     bdrv_set_backing_hd(bs_b, NULL, &error_abort);
511     g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
512     g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
513     g_assert_cmpint(backing->quiesce_counter, ==, 3);
514     g_assert_cmpint(a_s->drain_count, ==, 3);
515     g_assert_cmpint(b_s->drain_count, ==, 2);
516     g_assert_cmpint(backing_s->drain_count, ==, 3);
517 
518     bdrv_set_backing_hd(bs_b, backing, &error_abort);
519     g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
520     g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
521     g_assert_cmpint(backing->quiesce_counter, ==, 5);
522     g_assert_cmpint(a_s->drain_count, ==, 5);
523     g_assert_cmpint(b_s->drain_count, ==, 5);
524     g_assert_cmpint(backing_s->drain_count, ==, 5);
525 
526     do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
527     do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
528     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
529     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
530     do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
531 
532     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
533     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
534     g_assert_cmpint(backing->quiesce_counter, ==, 0);
535     g_assert_cmpint(a_s->drain_count, ==, 0);
536     g_assert_cmpint(b_s->drain_count, ==, 0);
537     g_assert_cmpint(backing_s->drain_count, ==, 0);
538 
539     bdrv_unref(backing);
540     bdrv_unref(bs_a);
541     bdrv_unref(bs_b);
542     blk_unref(blk_a);
543     blk_unref(blk_b);
544 }
545 
546 static void test_graph_change_drain_all(void)
547 {
548     BlockBackend *blk_a, *blk_b;
549     BlockDriverState *bs_a, *bs_b;
550     BDRVTestState *a_s, *b_s;
551 
552     /* Create node A with a BlockBackend */
553     blk_a = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
554     bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
555                                 &error_abort);
556     a_s = bs_a->opaque;
557     blk_insert_bs(blk_a, bs_a, &error_abort);
558 
559     g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
560     g_assert_cmpint(a_s->drain_count, ==, 0);
561 
562     /* Call bdrv_drain_all_begin() */
563     bdrv_drain_all_begin();
564 
565     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
566     g_assert_cmpint(a_s->drain_count, ==, 1);
567 
568     /* Create node B with a BlockBackend */
569     blk_b = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
570     bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
571                                 &error_abort);
572     b_s = bs_b->opaque;
573     blk_insert_bs(blk_b, bs_b, &error_abort);
574 
575     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
576     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
577     g_assert_cmpint(a_s->drain_count, ==, 1);
578     g_assert_cmpint(b_s->drain_count, ==, 1);
579 
580     /* Unref and finally delete node A */
581     blk_unref(blk_a);
582 
583     g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
584     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
585     g_assert_cmpint(a_s->drain_count, ==, 1);
586     g_assert_cmpint(b_s->drain_count, ==, 1);
587 
588     bdrv_unref(bs_a);
589 
590     g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
591     g_assert_cmpint(b_s->drain_count, ==, 1);
592 
593     /* End the drained section */
594     bdrv_drain_all_end();
595 
596     g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
597     g_assert_cmpint(b_s->drain_count, ==, 0);
598     g_assert_cmpint(qemu_get_aio_context()->external_disable_cnt, ==, 0);
599 
600     bdrv_unref(bs_b);
601     blk_unref(blk_b);
602 }
603 
604 struct test_iothread_data {
605     BlockDriverState *bs;
606     enum drain_type drain_type;
607     int *aio_ret;
608 };
609 
610 static void test_iothread_drain_entry(void *opaque)
611 {
612     struct test_iothread_data *data = opaque;
613 
614     aio_context_acquire(bdrv_get_aio_context(data->bs));
615     do_drain_begin(data->drain_type, data->bs);
616     g_assert_cmpint(*data->aio_ret, ==, 0);
617     do_drain_end(data->drain_type, data->bs);
618     aio_context_release(bdrv_get_aio_context(data->bs));
619 
620     qemu_event_set(&done_event);
621 }
622 
623 static void test_iothread_aio_cb(void *opaque, int ret)
624 {
625     int *aio_ret = opaque;
626     *aio_ret = ret;
627     qemu_event_set(&done_event);
628 }
629 
630 static void test_iothread_main_thread_bh(void *opaque)
631 {
632     struct test_iothread_data *data = opaque;
633 
634     /* Test that the AioContext is not yet locked in a random BH that is
635      * executed during drain, otherwise this would deadlock. */
636     aio_context_acquire(bdrv_get_aio_context(data->bs));
637     bdrv_flush(data->bs);
638     aio_context_release(bdrv_get_aio_context(data->bs));
639 }
640 
641 /*
642  * Starts an AIO request on a BDS that runs in the AioContext of iothread 1.
643  * The request involves a BH on iothread 2 before it can complete.
644  *
645  * @drain_thread = 0 means that do_drain_begin/end are called from the main
646  * thread, @drain_thread = 1 means that they are called from iothread 1. Drain
647  * for this BDS cannot be called from iothread 2 because only the main thread
648  * may do cross-AioContext polling.
649  */
650 static void test_iothread_common(enum drain_type drain_type, int drain_thread)
651 {
652     BlockBackend *blk;
653     BlockDriverState *bs;
654     BDRVTestState *s;
655     BlockAIOCB *acb;
656     int aio_ret;
657     struct test_iothread_data data;
658 
659     IOThread *a = iothread_new();
660     IOThread *b = iothread_new();
661     AioContext *ctx_a = iothread_get_aio_context(a);
662     AioContext *ctx_b = iothread_get_aio_context(b);
663 
664     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
665 
666     /* bdrv_drain_all() may only be called from the main loop thread */
667     if (drain_type == BDRV_DRAIN_ALL && drain_thread != 0) {
668         goto out;
669     }
670 
671     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
672     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
673                               &error_abort);
674     s = bs->opaque;
675     blk_insert_bs(blk, bs, &error_abort);
676     blk_set_disable_request_queuing(blk, true);
677 
678     blk_set_aio_context(blk, ctx_a, &error_abort);
679     aio_context_acquire(ctx_a);
680 
681     s->bh_indirection_ctx = ctx_b;
682 
683     aio_ret = -EINPROGRESS;
684     qemu_event_reset(&done_event);
685 
686     if (drain_thread == 0) {
687         acb = blk_aio_preadv(blk, 0, &qiov, 0, test_iothread_aio_cb, &aio_ret);
688     } else {
689         acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
690     }
691     g_assert(acb != NULL);
692     g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
693 
694     aio_context_release(ctx_a);
695 
696     data = (struct test_iothread_data) {
697         .bs         = bs,
698         .drain_type = drain_type,
699         .aio_ret    = &aio_ret,
700     };
701 
702     switch (drain_thread) {
703     case 0:
704         if (drain_type != BDRV_DRAIN_ALL) {
705             aio_context_acquire(ctx_a);
706         }
707 
708         aio_bh_schedule_oneshot(ctx_a, test_iothread_main_thread_bh, &data);
709 
710         /* The request is running on the IOThread a. Draining its block device
711          * will make sure that it has completed as far as the BDS is concerned,
712          * but the drain in this thread can continue immediately after
713          * bdrv_dec_in_flight() and aio_ret might be assigned only slightly
714          * later. */
715         do_drain_begin(drain_type, bs);
716         g_assert_cmpint(bs->in_flight, ==, 0);
717 
718         if (drain_type != BDRV_DRAIN_ALL) {
719             aio_context_release(ctx_a);
720         }
721         qemu_event_wait(&done_event);
722         if (drain_type != BDRV_DRAIN_ALL) {
723             aio_context_acquire(ctx_a);
724         }
725 
726         g_assert_cmpint(aio_ret, ==, 0);
727         do_drain_end(drain_type, bs);
728 
729         if (drain_type != BDRV_DRAIN_ALL) {
730             aio_context_release(ctx_a);
731         }
732         break;
733     case 1:
734         aio_bh_schedule_oneshot(ctx_a, test_iothread_drain_entry, &data);
735         qemu_event_wait(&done_event);
736         break;
737     default:
738         g_assert_not_reached();
739     }
740 
741     aio_context_acquire(ctx_a);
742     blk_set_aio_context(blk, qemu_get_aio_context(), &error_abort);
743     aio_context_release(ctx_a);
744 
745     bdrv_unref(bs);
746     blk_unref(blk);
747 
748 out:
749     iothread_join(a);
750     iothread_join(b);
751 }
752 
753 static void test_iothread_drain_all(void)
754 {
755     test_iothread_common(BDRV_DRAIN_ALL, 0);
756     test_iothread_common(BDRV_DRAIN_ALL, 1);
757 }
758 
759 static void test_iothread_drain(void)
760 {
761     test_iothread_common(BDRV_DRAIN, 0);
762     test_iothread_common(BDRV_DRAIN, 1);
763 }
764 
765 static void test_iothread_drain_subtree(void)
766 {
767     test_iothread_common(BDRV_SUBTREE_DRAIN, 0);
768     test_iothread_common(BDRV_SUBTREE_DRAIN, 1);
769 }
770 
771 
772 typedef struct TestBlockJob {
773     BlockJob common;
774     int run_ret;
775     int prepare_ret;
776     bool running;
777     bool should_complete;
778 } TestBlockJob;
779 
780 static int test_job_prepare(Job *job)
781 {
782     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
783 
784     /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */
785     blk_flush(s->common.blk);
786     return s->prepare_ret;
787 }
788 
789 static void test_job_commit(Job *job)
790 {
791     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
792 
793     /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */
794     blk_flush(s->common.blk);
795 }
796 
797 static void test_job_abort(Job *job)
798 {
799     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
800 
801     /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */
802     blk_flush(s->common.blk);
803 }
804 
805 static int coroutine_fn test_job_run(Job *job, Error **errp)
806 {
807     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
808 
809     /* We are running the actual job code past the pause point in
810      * job_co_entry(). */
811     s->running = true;
812 
813     job_transition_to_ready(&s->common.job);
814     while (!s->should_complete) {
815         /* Avoid job_sleep_ns() because it marks the job as !busy. We want to
816          * emulate some actual activity (probably some I/O) here so that drain
817          * has to wait for this activity to stop. */
818         qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
819 
820         job_pause_point(&s->common.job);
821     }
822 
823     return s->run_ret;
824 }
825 
826 static void test_job_complete(Job *job, Error **errp)
827 {
828     TestBlockJob *s = container_of(job, TestBlockJob, common.job);
829     s->should_complete = true;
830 }
831 
832 BlockJobDriver test_job_driver = {
833     .job_driver = {
834         .instance_size  = sizeof(TestBlockJob),
835         .free           = block_job_free,
836         .user_resume    = block_job_user_resume,
837         .run            = test_job_run,
838         .complete       = test_job_complete,
839         .prepare        = test_job_prepare,
840         .commit         = test_job_commit,
841         .abort          = test_job_abort,
842     },
843 };
844 
/* Outcome that a block job test configures for its TestBlockJob. */
enum test_job_result {
    TEST_JOB_SUCCESS,           /* neither return value is overridden */
    TEST_JOB_FAIL_RUN,          /* run_ret is set to -EIO */
    TEST_JOB_FAIL_PREPARE,      /* prepare_ret is set to -EIO */
};
850 
/* Which node of the source chain a block job test drains. */
enum test_job_drain_node {
    TEST_JOB_DRAIN_SRC,             /* the job's source node */
    TEST_JOB_DRAIN_SRC_CHILD,       /* the source node's backing node */
    TEST_JOB_DRAIN_SRC_PARENT,      /* the overlay on top of the source */
};
856 
857 static void test_blockjob_common_drain_node(enum drain_type drain_type,
858                                             bool use_iothread,
859                                             enum test_job_result result,
860                                             enum test_job_drain_node drain_node)
861 {
862     BlockBackend *blk_src, *blk_target;
863     BlockDriverState *src, *src_backing, *src_overlay, *target, *drain_bs;
864     BlockJob *job;
865     TestBlockJob *tjob;
866     IOThread *iothread = NULL;
867     AioContext *ctx;
868     int ret;
869 
870     src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
871                                &error_abort);
872     src_backing = bdrv_new_open_driver(&bdrv_test, "source-backing",
873                                        BDRV_O_RDWR, &error_abort);
874     src_overlay = bdrv_new_open_driver(&bdrv_test, "source-overlay",
875                                        BDRV_O_RDWR, &error_abort);
876 
877     bdrv_set_backing_hd(src_overlay, src, &error_abort);
878     bdrv_unref(src);
879     bdrv_set_backing_hd(src, src_backing, &error_abort);
880     bdrv_unref(src_backing);
881 
882     blk_src = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
883     blk_insert_bs(blk_src, src_overlay, &error_abort);
884 
885     switch (drain_node) {
886     case TEST_JOB_DRAIN_SRC:
887         drain_bs = src;
888         break;
889     case TEST_JOB_DRAIN_SRC_CHILD:
890         drain_bs = src_backing;
891         break;
892     case TEST_JOB_DRAIN_SRC_PARENT:
893         drain_bs = src_overlay;
894         break;
895     default:
896         g_assert_not_reached();
897     }
898 
899     if (use_iothread) {
900         iothread = iothread_new();
901         ctx = iothread_get_aio_context(iothread);
902         blk_set_aio_context(blk_src, ctx, &error_abort);
903     } else {
904         ctx = qemu_get_aio_context();
905     }
906 
907     target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
908                                   &error_abort);
909     blk_target = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
910     blk_insert_bs(blk_target, target, &error_abort);
911     blk_set_allow_aio_context_change(blk_target, true);
912 
913     aio_context_acquire(ctx);
914     tjob = block_job_create("job0", &test_job_driver, NULL, src,
915                             0, BLK_PERM_ALL,
916                             0, 0, NULL, NULL, &error_abort);
917     job = &tjob->common;
918     block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
919 
920     switch (result) {
921     case TEST_JOB_SUCCESS:
922         break;
923     case TEST_JOB_FAIL_RUN:
924         tjob->run_ret = -EIO;
925         break;
926     case TEST_JOB_FAIL_PREPARE:
927         tjob->prepare_ret = -EIO;
928         break;
929     }
930 
931     job_start(&job->job);
932     aio_context_release(ctx);
933 
934     if (use_iothread) {
935         /* job_co_entry() is run in the I/O thread, wait for the actual job
936          * code to start (we don't want to catch the job in the pause point in
937          * job_co_entry(). */
938         while (!tjob->running) {
939             aio_poll(qemu_get_aio_context(), false);
940         }
941     }
942 
943     g_assert_cmpint(job->job.pause_count, ==, 0);
944     g_assert_false(job->job.paused);
945     g_assert_true(tjob->running);
946     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
947 
948     do_drain_begin_unlocked(drain_type, drain_bs);
949 
950     if (drain_type == BDRV_DRAIN_ALL) {
951         /* bdrv_drain_all() drains both src and target */
952         g_assert_cmpint(job->job.pause_count, ==, 2);
953     } else {
954         g_assert_cmpint(job->job.pause_count, ==, 1);
955     }
956     g_assert_true(job->job.paused);
957     g_assert_false(job->job.busy); /* The job is paused */
958 
959     do_drain_end_unlocked(drain_type, drain_bs);
960 
961     if (use_iothread) {
962         /* paused is reset in the I/O thread, wait for it */
963         while (job->job.paused) {
964             aio_poll(qemu_get_aio_context(), false);
965         }
966     }
967 
968     g_assert_cmpint(job->job.pause_count, ==, 0);
969     g_assert_false(job->job.paused);
970     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
971 
972     do_drain_begin_unlocked(drain_type, target);
973 
974     if (drain_type == BDRV_DRAIN_ALL) {
975         /* bdrv_drain_all() drains both src and target */
976         g_assert_cmpint(job->job.pause_count, ==, 2);
977     } else {
978         g_assert_cmpint(job->job.pause_count, ==, 1);
979     }
980     g_assert_true(job->job.paused);
981     g_assert_false(job->job.busy); /* The job is paused */
982 
983     do_drain_end_unlocked(drain_type, target);
984 
985     if (use_iothread) {
986         /* paused is reset in the I/O thread, wait for it */
987         while (job->job.paused) {
988             aio_poll(qemu_get_aio_context(), false);
989         }
990     }
991 
992     g_assert_cmpint(job->job.pause_count, ==, 0);
993     g_assert_false(job->job.paused);
994     g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */
995 
996     aio_context_acquire(ctx);
997     ret = job_complete_sync(&job->job, &error_abort);
998     g_assert_cmpint(ret, ==, (result == TEST_JOB_SUCCESS ? 0 : -EIO));
999 
1000     if (use_iothread) {
1001         blk_set_aio_context(blk_src, qemu_get_aio_context(), &error_abort);
1002         assert(blk_get_aio_context(blk_target) == qemu_get_aio_context());
1003     }
1004     aio_context_release(ctx);
1005 
1006     blk_unref(blk_src);
1007     blk_unref(blk_target);
1008     bdrv_unref(src_overlay);
1009     bdrv_unref(target);
1010 
1011     if (iothread) {
1012         iothread_join(iothread);
1013     }
1014 }
1015 
1016 static void test_blockjob_common(enum drain_type drain_type, bool use_iothread,
1017                                  enum test_job_result result)
1018 {
1019     test_blockjob_common_drain_node(drain_type, use_iothread, result,
1020                                     TEST_JOB_DRAIN_SRC);
1021     test_blockjob_common_drain_node(drain_type, use_iothread, result,
1022                                     TEST_JOB_DRAIN_SRC_CHILD);
1023     if (drain_type == BDRV_SUBTREE_DRAIN) {
1024         test_blockjob_common_drain_node(drain_type, use_iothread, result,
1025                                         TEST_JOB_DRAIN_SRC_PARENT);
1026     }
1027 }
1028 
1029 static void test_blockjob_drain_all(void)
1030 {
1031     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_SUCCESS);
1032 }
1033 
1034 static void test_blockjob_drain(void)
1035 {
1036     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_SUCCESS);
1037 }
1038 
1039 static void test_blockjob_drain_subtree(void)
1040 {
1041     test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_SUCCESS);
1042 }
1043 
1044 static void test_blockjob_error_drain_all(void)
1045 {
1046     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_RUN);
1047     test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_PREPARE);
1048 }
1049 
1050 static void test_blockjob_error_drain(void)
1051 {
1052     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_RUN);
1053     test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_PREPARE);
1054 }
1055 
1056 static void test_blockjob_error_drain_subtree(void)
1057 {
1058     test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_RUN);
1059     test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_PREPARE);
1060 }
1061 
1062 static void test_blockjob_iothread_drain_all(void)
1063 {
1064     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_SUCCESS);
1065 }
1066 
1067 static void test_blockjob_iothread_drain(void)
1068 {
1069     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_SUCCESS);
1070 }
1071 
1072 static void test_blockjob_iothread_drain_subtree(void)
1073 {
1074     test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_SUCCESS);
1075 }
1076 
1077 static void test_blockjob_iothread_error_drain_all(void)
1078 {
1079     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_RUN);
1080     test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_PREPARE);
1081 }
1082 
1083 static void test_blockjob_iothread_error_drain(void)
1084 {
1085     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_RUN);
1086     test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_PREPARE);
1087 }
1088 
1089 static void test_blockjob_iothread_error_drain_subtree(void)
1090 {
1091     test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_RUN);
1092     test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_PREPARE);
1093 }
1094 
1095 
1096 typedef struct BDRVTestTopState {
1097     BdrvChild *wait_child;
1098 } BDRVTestTopState;
1099 
1100 static void bdrv_test_top_close(BlockDriverState *bs)
1101 {
1102     BdrvChild *c, *next_c;
1103     QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
1104         bdrv_unref_child(bs, c);
1105     }
1106 }
1107 
1108 static int coroutine_fn bdrv_test_top_co_preadv(BlockDriverState *bs,
1109                                                 uint64_t offset, uint64_t bytes,
1110                                                 QEMUIOVector *qiov, int flags)
1111 {
1112     BDRVTestTopState *tts = bs->opaque;
1113     return bdrv_co_preadv(tts->wait_child, offset, bytes, qiov, flags);
1114 }
1115 
1116 static BlockDriver bdrv_test_top_driver = {
1117     .format_name            = "test_top_driver",
1118     .instance_size          = sizeof(BDRVTestTopState),
1119 
1120     .bdrv_close             = bdrv_test_top_close,
1121     .bdrv_co_preadv         = bdrv_test_top_co_preadv,
1122 
1123     .bdrv_child_perm        = bdrv_default_perms,
1124 };
1125 
1126 typedef struct TestCoDeleteByDrainData {
1127     BlockBackend *blk;
1128     bool detach_instead_of_delete;
1129     bool done;
1130 } TestCoDeleteByDrainData;
1131 
1132 static void coroutine_fn test_co_delete_by_drain(void *opaque)
1133 {
1134     TestCoDeleteByDrainData *dbdd = opaque;
1135     BlockBackend *blk = dbdd->blk;
1136     BlockDriverState *bs = blk_bs(blk);
1137     BDRVTestTopState *tts = bs->opaque;
1138     void *buffer = g_malloc(65536);
1139     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buffer, 65536);
1140 
1141     /* Pretend some internal write operation from parent to child.
1142      * Important: We have to read from the child, not from the parent!
1143      * Draining works by first propagating it all up the tree to the
1144      * root and then waiting for drainage from root to the leaves
1145      * (protocol nodes).  If we have a request waiting on the root,
1146      * everything will be drained before we go back down the tree, but
1147      * we do not want that.  We want to be in the middle of draining
1148      * when this following requests returns. */
1149     bdrv_co_preadv(tts->wait_child, 0, 65536, &qiov, 0);
1150 
1151     g_assert_cmpint(bs->refcnt, ==, 1);
1152 
1153     if (!dbdd->detach_instead_of_delete) {
1154         blk_unref(blk);
1155     } else {
1156         BdrvChild *c, *next_c;
1157         QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) {
1158             bdrv_unref_child(bs, c);
1159         }
1160     }
1161 
1162     dbdd->done = true;
1163     g_free(buffer);
1164 }
1165 
1166 /**
1167  * Test what happens when some BDS has some children, you drain one of
1168  * them and this results in the BDS being deleted.
1169  *
1170  * If @detach_instead_of_delete is set, the BDS is not going to be
1171  * deleted but will only detach all of its children.
1172  */
1173 static void do_test_delete_by_drain(bool detach_instead_of_delete,
1174                                     enum drain_type drain_type)
1175 {
1176     BlockBackend *blk;
1177     BlockDriverState *bs, *child_bs, *null_bs;
1178     BDRVTestTopState *tts;
1179     TestCoDeleteByDrainData dbdd;
1180     Coroutine *co;
1181 
1182     bs = bdrv_new_open_driver(&bdrv_test_top_driver, "top", BDRV_O_RDWR,
1183                               &error_abort);
1184     bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
1185     tts = bs->opaque;
1186 
1187     null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
1188                         &error_abort);
1189     bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds,
1190                       BDRV_CHILD_DATA, &error_abort);
1191 
1192     /* This child will be the one to pass to requests through to, and
1193      * it will stall until a drain occurs */
1194     child_bs = bdrv_new_open_driver(&bdrv_test, "child", BDRV_O_RDWR,
1195                                     &error_abort);
1196     child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS;
1197     /* Takes our reference to child_bs */
1198     tts->wait_child = bdrv_attach_child(bs, child_bs, "wait-child",
1199                                         &child_of_bds,
1200                                         BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY,
1201                                         &error_abort);
1202 
1203     /* This child is just there to be deleted
1204      * (for detach_instead_of_delete == true) */
1205     null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
1206                         &error_abort);
1207     bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds, BDRV_CHILD_DATA,
1208                       &error_abort);
1209 
1210     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
1211     blk_insert_bs(blk, bs, &error_abort);
1212 
1213     /* Referenced by blk now */
1214     bdrv_unref(bs);
1215 
1216     g_assert_cmpint(bs->refcnt, ==, 1);
1217     g_assert_cmpint(child_bs->refcnt, ==, 1);
1218     g_assert_cmpint(null_bs->refcnt, ==, 1);
1219 
1220 
1221     dbdd = (TestCoDeleteByDrainData){
1222         .blk = blk,
1223         .detach_instead_of_delete = detach_instead_of_delete,
1224         .done = false,
1225     };
1226     co = qemu_coroutine_create(test_co_delete_by_drain, &dbdd);
1227     qemu_coroutine_enter(co);
1228 
1229     /* Drain the child while the read operation is still pending.
1230      * This should result in the operation finishing and
1231      * test_co_delete_by_drain() resuming.  Thus, @bs will be deleted
1232      * and the coroutine will exit while this drain operation is still
1233      * in progress. */
1234     switch (drain_type) {
1235     case BDRV_DRAIN:
1236         bdrv_ref(child_bs);
1237         bdrv_drain(child_bs);
1238         bdrv_unref(child_bs);
1239         break;
1240     case BDRV_SUBTREE_DRAIN:
1241         /* Would have to ref/unref bs here for !detach_instead_of_delete, but
1242          * then the whole test becomes pointless because the graph changes
1243          * don't occur during the drain any more. */
1244         assert(detach_instead_of_delete);
1245         bdrv_subtree_drained_begin(bs);
1246         bdrv_subtree_drained_end(bs);
1247         break;
1248     case BDRV_DRAIN_ALL:
1249         bdrv_drain_all_begin();
1250         bdrv_drain_all_end();
1251         break;
1252     default:
1253         g_assert_not_reached();
1254     }
1255 
1256     while (!dbdd.done) {
1257         aio_poll(qemu_get_aio_context(), true);
1258     }
1259 
1260     if (detach_instead_of_delete) {
1261         /* Here, the reference has not passed over to the coroutine,
1262          * so we have to delete the BB ourselves */
1263         blk_unref(blk);
1264     }
1265 }
1266 
1267 static void test_delete_by_drain(void)
1268 {
1269     do_test_delete_by_drain(false, BDRV_DRAIN);
1270 }
1271 
1272 static void test_detach_by_drain_all(void)
1273 {
1274     do_test_delete_by_drain(true, BDRV_DRAIN_ALL);
1275 }
1276 
1277 static void test_detach_by_drain(void)
1278 {
1279     do_test_delete_by_drain(true, BDRV_DRAIN);
1280 }
1281 
1282 static void test_detach_by_drain_subtree(void)
1283 {
1284     do_test_delete_by_drain(true, BDRV_SUBTREE_DRAIN);
1285 }
1286 
1287 
1288 struct detach_by_parent_data {
1289     BlockDriverState *parent_b;
1290     BdrvChild *child_b;
1291     BlockDriverState *c;
1292     BdrvChild *child_c;
1293     bool by_parent_cb;
1294 };
1295 static struct detach_by_parent_data detach_by_parent_data;
1296 
1297 static void detach_indirect_bh(void *opaque)
1298 {
1299     struct detach_by_parent_data *data = opaque;
1300 
1301     bdrv_unref_child(data->parent_b, data->child_b);
1302 
1303     bdrv_ref(data->c);
1304     data->child_c = bdrv_attach_child(data->parent_b, data->c, "PB-C",
1305                                       &child_of_bds, BDRV_CHILD_DATA,
1306                                       &error_abort);
1307 }
1308 
1309 static void detach_by_parent_aio_cb(void *opaque, int ret)
1310 {
1311     struct detach_by_parent_data *data = &detach_by_parent_data;
1312 
1313     g_assert_cmpint(ret, ==, 0);
1314     if (data->by_parent_cb) {
1315         detach_indirect_bh(data);
1316     }
1317 }
1318 
1319 static void detach_by_driver_cb_drained_begin(BdrvChild *child)
1320 {
1321     aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
1322                             detach_indirect_bh, &detach_by_parent_data);
1323     child_of_bds.drained_begin(child);
1324 }
1325 
1326 static BdrvChildClass detach_by_driver_cb_class;
1327 
1328 /*
1329  * Initial graph:
1330  *
1331  * PA     PB
1332  *    \ /   \
1333  *     A     B     C
1334  *
1335  * by_parent_cb == true:  Test that parent callbacks don't poll
1336  *
1337  *     PA has a pending write request whose callback changes the child nodes of
1338  *     PB: It removes B and adds C instead. The subtree of PB is drained, which
1339  *     will indirectly drain the write request, too.
1340  *
1341  * by_parent_cb == false: Test that bdrv_drain_invoke() doesn't poll
1342  *
1343  *     PA's BdrvChildClass has a .drained_begin callback that schedules a BH
1344  *     that does the same graph change. If bdrv_drain_invoke() calls it, the
1345  *     state is messed up, but if it is only polled in the single
1346  *     BDRV_POLL_WHILE() at the end of the drain, this should work fine.
1347  */
1348 static void test_detach_indirect(bool by_parent_cb)
1349 {
1350     BlockBackend *blk;
1351     BlockDriverState *parent_a, *parent_b, *a, *b, *c;
1352     BdrvChild *child_a, *child_b;
1353     BlockAIOCB *acb;
1354 
1355     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
1356 
1357     if (!by_parent_cb) {
1358         detach_by_driver_cb_class = child_of_bds;
1359         detach_by_driver_cb_class.drained_begin =
1360             detach_by_driver_cb_drained_begin;
1361     }
1362 
1363     /* Create all involved nodes */
1364     parent_a = bdrv_new_open_driver(&bdrv_test, "parent-a", BDRV_O_RDWR,
1365                                     &error_abort);
1366     parent_b = bdrv_new_open_driver(&bdrv_test, "parent-b", 0,
1367                                     &error_abort);
1368 
1369     a = bdrv_new_open_driver(&bdrv_test, "a", BDRV_O_RDWR, &error_abort);
1370     b = bdrv_new_open_driver(&bdrv_test, "b", BDRV_O_RDWR, &error_abort);
1371     c = bdrv_new_open_driver(&bdrv_test, "c", BDRV_O_RDWR, &error_abort);
1372 
1373     /* blk is a BB for parent-a */
1374     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
1375     blk_insert_bs(blk, parent_a, &error_abort);
1376     bdrv_unref(parent_a);
1377 
1378     /* If we want to get bdrv_drain_invoke() to call aio_poll(), the driver
1379      * callback must not return immediately. */
1380     if (!by_parent_cb) {
1381         BDRVTestState *s = parent_a->opaque;
1382         s->sleep_in_drain_begin = true;
1383     }
1384 
1385     /* Set child relationships */
1386     bdrv_ref(b);
1387     bdrv_ref(a);
1388     child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_of_bds,
1389                                 BDRV_CHILD_DATA, &error_abort);
1390     child_a = bdrv_attach_child(parent_b, a, "PB-A", &child_of_bds,
1391                                 BDRV_CHILD_COW, &error_abort);
1392 
1393     bdrv_ref(a);
1394     bdrv_attach_child(parent_a, a, "PA-A",
1395                       by_parent_cb ? &child_of_bds : &detach_by_driver_cb_class,
1396                       BDRV_CHILD_DATA, &error_abort);
1397 
1398     g_assert_cmpint(parent_a->refcnt, ==, 1);
1399     g_assert_cmpint(parent_b->refcnt, ==, 1);
1400     g_assert_cmpint(a->refcnt, ==, 3);
1401     g_assert_cmpint(b->refcnt, ==, 2);
1402     g_assert_cmpint(c->refcnt, ==, 1);
1403 
1404     g_assert(QLIST_FIRST(&parent_b->children) == child_a);
1405     g_assert(QLIST_NEXT(child_a, next) == child_b);
1406     g_assert(QLIST_NEXT(child_b, next) == NULL);
1407 
1408     /* Start the evil write request */
1409     detach_by_parent_data = (struct detach_by_parent_data) {
1410         .parent_b = parent_b,
1411         .child_b = child_b,
1412         .c = c,
1413         .by_parent_cb = by_parent_cb,
1414     };
1415     acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
1416     g_assert(acb != NULL);
1417 
1418     /* Drain and check the expected result */
1419     bdrv_subtree_drained_begin(parent_b);
1420 
1421     g_assert(detach_by_parent_data.child_c != NULL);
1422 
1423     g_assert_cmpint(parent_a->refcnt, ==, 1);
1424     g_assert_cmpint(parent_b->refcnt, ==, 1);
1425     g_assert_cmpint(a->refcnt, ==, 3);
1426     g_assert_cmpint(b->refcnt, ==, 1);
1427     g_assert_cmpint(c->refcnt, ==, 2);
1428 
1429     g_assert(QLIST_FIRST(&parent_b->children) == detach_by_parent_data.child_c);
1430     g_assert(QLIST_NEXT(detach_by_parent_data.child_c, next) == child_a);
1431     g_assert(QLIST_NEXT(child_a, next) == NULL);
1432 
1433     g_assert_cmpint(parent_a->quiesce_counter, ==, 1);
1434     g_assert_cmpint(parent_b->quiesce_counter, ==, 1);
1435     g_assert_cmpint(a->quiesce_counter, ==, 1);
1436     g_assert_cmpint(b->quiesce_counter, ==, 0);
1437     g_assert_cmpint(c->quiesce_counter, ==, 1);
1438 
1439     bdrv_subtree_drained_end(parent_b);
1440 
1441     bdrv_unref(parent_b);
1442     blk_unref(blk);
1443 
1444     g_assert_cmpint(a->refcnt, ==, 1);
1445     g_assert_cmpint(b->refcnt, ==, 1);
1446     g_assert_cmpint(c->refcnt, ==, 1);
1447     bdrv_unref(a);
1448     bdrv_unref(b);
1449     bdrv_unref(c);
1450 }
1451 
/* Graph change performed by the parent's AIO completion callback. */
static void test_detach_by_parent_cb(void)
{
    const bool by_parent_cb = true;

    test_detach_indirect(by_parent_cb);
}
1456 
/* Graph change performed by a BH scheduled from .drained_begin. */
static void test_detach_by_driver_cb(void)
{
    const bool by_parent_cb = false;

    test_detach_indirect(by_parent_cb);
}
1461 
1462 static void test_append_to_drained(void)
1463 {
1464     BlockBackend *blk;
1465     BlockDriverState *base, *overlay;
1466     BDRVTestState *base_s, *overlay_s;
1467 
1468     blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
1469     base = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
1470     base_s = base->opaque;
1471     blk_insert_bs(blk, base, &error_abort);
1472 
1473     overlay = bdrv_new_open_driver(&bdrv_test, "overlay", BDRV_O_RDWR,
1474                                    &error_abort);
1475     overlay_s = overlay->opaque;
1476 
1477     do_drain_begin(BDRV_DRAIN, base);
1478     g_assert_cmpint(base->quiesce_counter, ==, 1);
1479     g_assert_cmpint(base_s->drain_count, ==, 1);
1480     g_assert_cmpint(base->in_flight, ==, 0);
1481 
1482     bdrv_append(overlay, base, &error_abort);
1483     g_assert_cmpint(base->in_flight, ==, 0);
1484     g_assert_cmpint(overlay->in_flight, ==, 0);
1485 
1486     g_assert_cmpint(base->quiesce_counter, ==, 1);
1487     g_assert_cmpint(base_s->drain_count, ==, 1);
1488     g_assert_cmpint(overlay->quiesce_counter, ==, 1);
1489     g_assert_cmpint(overlay_s->drain_count, ==, 1);
1490 
1491     do_drain_end(BDRV_DRAIN, base);
1492 
1493     g_assert_cmpint(base->quiesce_counter, ==, 0);
1494     g_assert_cmpint(base_s->drain_count, ==, 0);
1495     g_assert_cmpint(overlay->quiesce_counter, ==, 0);
1496     g_assert_cmpint(overlay_s->drain_count, ==, 0);
1497 
1498     bdrv_unref(overlay);
1499     bdrv_unref(base);
1500     blk_unref(blk);
1501 }
1502 
1503 static void test_set_aio_context(void)
1504 {
1505     BlockDriverState *bs;
1506     IOThread *a = iothread_new();
1507     IOThread *b = iothread_new();
1508     AioContext *ctx_a = iothread_get_aio_context(a);
1509     AioContext *ctx_b = iothread_get_aio_context(b);
1510 
1511     bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
1512                               &error_abort);
1513 
1514     bdrv_drained_begin(bs);
1515     bdrv_try_set_aio_context(bs, ctx_a, &error_abort);
1516 
1517     aio_context_acquire(ctx_a);
1518     bdrv_drained_end(bs);
1519 
1520     bdrv_drained_begin(bs);
1521     bdrv_try_set_aio_context(bs, ctx_b, &error_abort);
1522     aio_context_release(ctx_a);
1523     aio_context_acquire(ctx_b);
1524     bdrv_try_set_aio_context(bs, qemu_get_aio_context(), &error_abort);
1525     aio_context_release(ctx_b);
1526     bdrv_drained_end(bs);
1527 
1528     bdrv_unref(bs);
1529     iothread_join(a);
1530     iothread_join(b);
1531 }
1532 
1533 
1534 typedef struct TestDropBackingBlockJob {
1535     BlockJob common;
1536     bool should_complete;
1537     bool *did_complete;
1538     BlockDriverState *detach_also;
1539 } TestDropBackingBlockJob;
1540 
1541 static int coroutine_fn test_drop_backing_job_run(Job *job, Error **errp)
1542 {
1543     TestDropBackingBlockJob *s =
1544         container_of(job, TestDropBackingBlockJob, common.job);
1545 
1546     while (!s->should_complete) {
1547         job_sleep_ns(job, 0);
1548     }
1549 
1550     return 0;
1551 }
1552 
1553 static void test_drop_backing_job_commit(Job *job)
1554 {
1555     TestDropBackingBlockJob *s =
1556         container_of(job, TestDropBackingBlockJob, common.job);
1557 
1558     bdrv_set_backing_hd(blk_bs(s->common.blk), NULL, &error_abort);
1559     bdrv_set_backing_hd(s->detach_also, NULL, &error_abort);
1560 
1561     *s->did_complete = true;
1562 }
1563 
1564 static const BlockJobDriver test_drop_backing_job_driver = {
1565     .job_driver = {
1566         .instance_size  = sizeof(TestDropBackingBlockJob),
1567         .free           = block_job_free,
1568         .user_resume    = block_job_user_resume,
1569         .run            = test_drop_backing_job_run,
1570         .commit         = test_drop_backing_job_commit,
1571     }
1572 };
1573 
1574 /**
1575  * Creates a child node with three parent nodes on it, and then runs a
1576  * block job on the final one, parent-node-2.
1577  *
1578  * The job is then asked to complete before a section where the child
1579  * is drained.
1580  *
1581  * Ending this section will undrain the child's parents, first
1582  * parent-node-2, then parent-node-1, then parent-node-0 -- the parent
1583  * list is in reverse order of how they were added.  Ending the drain
1584  * on parent-node-2 will resume the job, thus completing it and
1585  * scheduling job_exit().
1586  *
1587  * Ending the drain on parent-node-1 will poll the AioContext, which
1588  * lets job_exit() and thus test_drop_backing_job_commit() run.  That
1589  * function first removes the child as parent-node-2's backing file.
1590  *
1591  * In old (and buggy) implementations, there are two problems with
1592  * that:
1593  * (A) bdrv_drain_invoke() polls for every node that leaves the
1594  *     drained section.  This means that job_exit() is scheduled
1595  *     before the child has left the drained section.  Its
1596  *     quiesce_counter is therefore still 1 when it is removed from
1597  *     parent-node-2.
1598  *
1599  * (B) bdrv_replace_child_noperm() calls drained_end() on the old
1600  *     child's parents as many times as the child is quiesced.  This
1601  *     means it will call drained_end() on parent-node-2 once.
1602  *     Because parent-node-2 is no longer quiesced at this point, this
1603  *     will fail.
1604  *
1605  * bdrv_replace_child_noperm() therefore must call drained_end() on
1606  * the parent only if it really is still drained because the child is
1607  * drained.
1608  *
1609  * If removing child from parent-node-2 was successful (as it should
1610  * be), test_drop_backing_job_commit() will then also remove the child
1611  * from parent-node-0.
1612  *
1613  * With an old version of our drain infrastructure ((A) above), that
1614  * resulted in the following flow:
1615  *
1616  * 1. child attempts to leave its drained section.  The call recurses
1617  *    to its parents.
1618  *
1619  * 2. parent-node-2 leaves the drained section.  Polling in
1620  *    bdrv_drain_invoke() will schedule job_exit().
1621  *
1622  * 3. parent-node-1 leaves the drained section.  Polling in
1623  *    bdrv_drain_invoke() will run job_exit(), thus disconnecting
1624  *    parent-node-0 from the child node.
1625  *
1626  * 4. bdrv_parent_drained_end() uses a QLIST_FOREACH_SAFE() loop to
1627  *    iterate over the parents.  Thus, it now accesses the BdrvChild
1628  *    object that used to connect parent-node-0 and the child node.
1629  *    However, that object no longer exists, so it accesses a dangling
1630  *    pointer.
1631  *
1632  * The solution is to only poll once when running a bdrv_drained_end()
1633  * operation, specifically at the end when all drained_end()
1634  * operations for all involved nodes have been scheduled.
1635  * Note that this also solves (A) above, thus hiding (B).
1636  */
1637 static void test_blockjob_commit_by_drained_end(void)
1638 {
1639     BlockDriverState *bs_child, *bs_parents[3];
1640     TestDropBackingBlockJob *job;
1641     bool job_has_completed = false;
1642     int i;
1643 
1644     bs_child = bdrv_new_open_driver(&bdrv_test, "child-node", BDRV_O_RDWR,
1645                                     &error_abort);
1646 
1647     for (i = 0; i < 3; i++) {
1648         char name[32];
1649         snprintf(name, sizeof(name), "parent-node-%i", i);
1650         bs_parents[i] = bdrv_new_open_driver(&bdrv_test, name, BDRV_O_RDWR,
1651                                              &error_abort);
1652         bdrv_set_backing_hd(bs_parents[i], bs_child, &error_abort);
1653     }
1654 
1655     job = block_job_create("job", &test_drop_backing_job_driver, NULL,
1656                            bs_parents[2], 0, BLK_PERM_ALL, 0, 0, NULL, NULL,
1657                            &error_abort);
1658 
1659     job->detach_also = bs_parents[0];
1660     job->did_complete = &job_has_completed;
1661 
1662     job_start(&job->common.job);
1663 
1664     job->should_complete = true;
1665     bdrv_drained_begin(bs_child);
1666     g_assert(!job_has_completed);
1667     bdrv_drained_end(bs_child);
1668     g_assert(job_has_completed);
1669 
1670     bdrv_unref(bs_parents[0]);
1671     bdrv_unref(bs_parents[1]);
1672     bdrv_unref(bs_parents[2]);
1673     bdrv_unref(bs_child);
1674 }
1675 
1676 
1677 typedef struct TestSimpleBlockJob {
1678     BlockJob common;
1679     bool should_complete;
1680     bool *did_complete;
1681 } TestSimpleBlockJob;
1682 
1683 static int coroutine_fn test_simple_job_run(Job *job, Error **errp)
1684 {
1685     TestSimpleBlockJob *s = container_of(job, TestSimpleBlockJob, common.job);
1686 
1687     while (!s->should_complete) {
1688         job_sleep_ns(job, 0);
1689     }
1690 
1691     return 0;
1692 }
1693 
1694 static void test_simple_job_clean(Job *job)
1695 {
1696     TestSimpleBlockJob *s = container_of(job, TestSimpleBlockJob, common.job);
1697     *s->did_complete = true;
1698 }
1699 
1700 static const BlockJobDriver test_simple_job_driver = {
1701     .job_driver = {
1702         .instance_size  = sizeof(TestSimpleBlockJob),
1703         .free           = block_job_free,
1704         .user_resume    = block_job_user_resume,
1705         .run            = test_simple_job_run,
1706         .clean          = test_simple_job_clean,
1707     },
1708 };
1709 
1710 static int drop_intermediate_poll_update_filename(BdrvChild *child,
1711                                                   BlockDriverState *new_base,
1712                                                   const char *filename,
1713                                                   Error **errp)
1714 {
1715     /*
1716      * We are free to poll here, which may change the block graph, if
1717      * it is not drained.
1718      */
1719 
1720     /* If the job is not drained: Complete it, schedule job_exit() */
1721     aio_poll(qemu_get_current_aio_context(), false);
1722     /* If the job is not drained: Run job_exit(), finish the job */
1723     aio_poll(qemu_get_current_aio_context(), false);
1724 
1725     return 0;
1726 }
1727 
1728 /**
1729  * Test a poll in the midst of bdrv_drop_intermediate().
1730  *
1731  * bdrv_drop_intermediate() calls BdrvChildClass.update_filename(),
1732  * which can yield or poll.  This may lead to graph changes, unless
1733  * the whole subtree in question is drained.
1734  *
1735  * We test this on the following graph:
1736  *
1737  *                    Job
1738  *
1739  *                     |
1740  *                  job-node
1741  *                     |
1742  *                     v
1743  *
1744  *                  job-node
1745  *
1746  *                     |
1747  *                  backing
1748  *                     |
1749  *                     v
1750  *
1751  * node-2 --chain--> node-1 --chain--> node-0
1752  *
1753  * We drop node-1 with bdrv_drop_intermediate(top=node-1, base=node-0).
1754  *
1755  * This first updates node-2's backing filename by invoking
1756  * drop_intermediate_poll_update_filename(), which polls twice.  This
1757  * causes the job to finish, which in turns causes the job-node to be
1758  * deleted.
1759  *
1760  * bdrv_drop_intermediate() uses a QLIST_FOREACH_SAFE() loop, so it
1761  * already has a pointer to the BdrvChild edge between job-node and
1762  * node-1.  When it tries to handle that edge, we probably get a
1763  * segmentation fault because the object no longer exists.
1764  *
1765  *
1766  * The solution is for bdrv_drop_intermediate() to drain top's
1767  * subtree.  This prevents graph changes from happening just because
1768  * BdrvChildClass.update_filename() yields or polls.  Thus, the block
1769  * job is paused during that drained section and must finish before or
1770  * after.
1771  *
1772  * (In addition, bdrv_replace_child() must keep the job paused.)
1773  */
1774 static void test_drop_intermediate_poll(void)
1775 {
1776     static BdrvChildClass chain_child_class;
1777     BlockDriverState *chain[3];
1778     TestSimpleBlockJob *job;
1779     BlockDriverState *job_node;
1780     bool job_has_completed = false;
1781     int i;
1782     int ret;
1783 
1784     chain_child_class = child_of_bds;
1785     chain_child_class.update_filename = drop_intermediate_poll_update_filename;
1786 
1787     for (i = 0; i < 3; i++) {
1788         char name[32];
1789         snprintf(name, 32, "node-%i", i);
1790 
1791         chain[i] = bdrv_new_open_driver(&bdrv_test, name, 0, &error_abort);
1792     }
1793 
1794     job_node = bdrv_new_open_driver(&bdrv_test, "job-node", BDRV_O_RDWR,
1795                                     &error_abort);
1796     bdrv_set_backing_hd(job_node, chain[1], &error_abort);
1797 
1798     /*
1799      * Establish the chain last, so the chain links are the first
1800      * elements in the BDS.parents lists
1801      */
1802     for (i = 0; i < 3; i++) {
1803         if (i) {
1804             /* Takes the reference to chain[i - 1] */
1805             chain[i]->backing = bdrv_attach_child(chain[i], chain[i - 1],
1806                                                   "chain", &chain_child_class,
1807                                                   BDRV_CHILD_COW, &error_abort);
1808         }
1809     }
1810 
1811     job = block_job_create("job", &test_simple_job_driver, NULL, job_node,
1812                            0, BLK_PERM_ALL, 0, 0, NULL, NULL, &error_abort);
1813 
1814     /* The job has a reference now */
1815     bdrv_unref(job_node);
1816 
1817     job->did_complete = &job_has_completed;
1818 
1819     job_start(&job->common.job);
1820     job->should_complete = true;
1821 
1822     g_assert(!job_has_completed);
1823     ret = bdrv_drop_intermediate(chain[1], chain[0], NULL);
1824     g_assert(ret == 0);
1825     g_assert(job_has_completed);
1826 
1827     bdrv_unref(chain[2]);
1828 }
1829 
1830 
/* Per-node state of the bdrv_replace_test driver */
typedef struct BDRVReplaceTestState {
    /* Set by drain_begin when it is entered with .drain_count == 0 */
    bool was_drained;
    /* Set by drain_end when .drain_count drops back to 0 */
    bool was_undrained;
    /* Set by the read callback once it has handled a request */
    bool has_read;

    /* Incremented by drain_begin, decremented by drain_end */
    int drain_count;

    /*
     * If set, the next read on a node with a backing child clears the
     * flag and yields before forwarding the request
     */
    bool yield_before_read;
    /*
     * Read coroutine on a node with a backing child, from entering the
     * read callback until it is past its optional yield; NULL otherwise
     */
    Coroutine *io_co;
    /* Coroutine running drain_begin while it wakes up .io_co; else NULL */
    Coroutine *drain_co;
} BDRVReplaceTestState;
1842 
/* Empty .bdrv_close implementation for the bdrv_replace_test driver */
static void bdrv_replace_test_close(BlockDriverState *bs)
{
}
1846 
1847 /**
1848  * If @bs has a backing file:
1849  *   Yield if .yield_before_read is true (and wait for drain_begin to
1850  *   wake us up).
1851  *   Forward the read to bs->backing.  Set .has_read to true.
1852  *   If drain_begin has woken us, wake it in turn.
1853  *
1854  * Otherwise:
1855  *   Set .has_read to true and return success.
1856  */
1857 static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs,
1858                                                     uint64_t offset,
1859                                                     uint64_t bytes,
1860                                                     QEMUIOVector *qiov,
1861                                                     int flags)
1862 {
1863     BDRVReplaceTestState *s = bs->opaque;
1864 
1865     if (bs->backing) {
1866         int ret;
1867 
1868         g_assert(!s->drain_count);
1869 
1870         s->io_co = qemu_coroutine_self();
1871         if (s->yield_before_read) {
1872             s->yield_before_read = false;
1873             qemu_coroutine_yield();
1874         }
1875         s->io_co = NULL;
1876 
1877         ret = bdrv_co_preadv(bs->backing, offset, bytes, qiov, 0);
1878         s->has_read = true;
1879 
1880         /* Wake up drain_co if it runs */
1881         if (s->drain_co) {
1882             aio_co_wake(s->drain_co);
1883         }
1884 
1885         return ret;
1886     }
1887 
1888     s->has_read = true;
1889     return 0;
1890 }
1891 
1892 /**
1893  * If .drain_count is 0, wake up .io_co if there is one; and set
1894  * .was_drained.
1895  * Increment .drain_count.
1896  */
1897 static void coroutine_fn bdrv_replace_test_co_drain_begin(BlockDriverState *bs)
1898 {
1899     BDRVReplaceTestState *s = bs->opaque;
1900 
1901     if (!s->drain_count) {
1902         /* Keep waking io_co up until it is done */
1903         s->drain_co = qemu_coroutine_self();
1904         while (s->io_co) {
1905             aio_co_wake(s->io_co);
1906             s->io_co = NULL;
1907             qemu_coroutine_yield();
1908         }
1909         s->drain_co = NULL;
1910 
1911         s->was_drained = true;
1912     }
1913     s->drain_count++;
1914 }
1915 
1916 /**
1917  * Reduce .drain_count, set .was_undrained once it reaches 0.
1918  * If .drain_count reaches 0 and the node has a backing file, issue a
1919  * read request.
1920  */
1921 static void coroutine_fn bdrv_replace_test_co_drain_end(BlockDriverState *bs)
1922 {
1923     BDRVReplaceTestState *s = bs->opaque;
1924 
1925     g_assert(s->drain_count > 0);
1926     if (!--s->drain_count) {
1927         int ret;
1928 
1929         s->was_undrained = true;
1930 
1931         if (bs->backing) {
1932             char data;
1933             QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
1934 
1935             /* Queue a read request post-drain */
1936             ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
1937             g_assert(ret >= 0);
1938         }
1939     }
1940 }
1941 
/*
 * Test block driver whose read and drain callbacks record what happened
 * to a node in its BDRVReplaceTestState.
 */
static BlockDriver bdrv_replace_test = {
    .format_name            = "replace_test",
    /* bs->opaque is a BDRVReplaceTestState */
    .instance_size          = sizeof(BDRVReplaceTestState),

    .bdrv_close             = bdrv_replace_test_close,
    .bdrv_co_preadv         = bdrv_replace_test_co_preadv,

    .bdrv_co_drain_begin    = bdrv_replace_test_co_drain_begin,
    .bdrv_co_drain_end      = bdrv_replace_test_co_drain_end,

    .bdrv_child_perm        = bdrv_default_perms,
};
1954 
1955 static void coroutine_fn test_replace_child_mid_drain_read_co(void *opaque)
1956 {
1957     int ret;
1958     char data;
1959 
1960     ret = blk_co_pread(opaque, 0, 1, &data, 0);
1961     g_assert(ret >= 0);
1962 }
1963 
1964 /**
1965  * We test two things:
1966  * (1) bdrv_replace_child_noperm() must not undrain the parent if both
1967  *     children are drained.
1968  * (2) bdrv_replace_child_noperm() must never flush I/O requests to a
1969  *     drained child.  If the old child is drained, it must flush I/O
1970  *     requests after the new one has been attached.  If the new child
1971  *     is drained, it must flush I/O requests before the old one is
1972  *     detached.
1973  *
1974  * To do so, we create one parent node and two child nodes; then
1975  * attach one of the children (old_child_bs) to the parent, then
1976  * drain both old_child_bs and new_child_bs according to
1977  * old_drain_count and new_drain_count, respectively, and finally
1978  * we invoke bdrv_replace_node() to replace old_child_bs by
1979  * new_child_bs.
1980  *
1981  * The test block driver we use here (bdrv_replace_test) has a read
1982  * function that:
1983  * - For the parent node, can optionally yield, and then forwards the
1984  *   read to bdrv_preadv(),
1985  * - For the child node, just returns immediately.
1986  *
1987  * If the read yields, the drain_begin function will wake it up.
1988  *
1989  * The drain_end function issues a read on the parent once it is fully
1990  * undrained (which simulates requests starting to come in again).
1991  */
static void do_test_replace_child_mid_drain(int old_drain_count,
                                            int new_drain_count)
{
    BlockBackend *parent_blk;
    BlockDriverState *parent_bs;
    BlockDriverState *old_child_bs, *new_child_bs;
    BDRVReplaceTestState *parent_s;
    BDRVReplaceTestState *old_child_s, *new_child_s;
    Coroutine *io_co;
    int i;

    parent_bs = bdrv_new_open_driver(&bdrv_replace_test, "parent", 0,
                                     &error_abort);
    parent_s = parent_bs->opaque;

    /* The BlockBackend is what the read coroutine below reads through */
    parent_blk = blk_new(qemu_get_aio_context(),
                         BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
    blk_insert_bs(parent_blk, parent_bs, &error_abort);

    old_child_bs = bdrv_new_open_driver(&bdrv_replace_test, "old-child", 0,
                                        &error_abort);
    new_child_bs = bdrv_new_open_driver(&bdrv_replace_test, "new-child", 0,
                                        &error_abort);
    old_child_s = old_child_bs->opaque;
    new_child_s = new_child_bs->opaque;

    /* So that we can read something */
    parent_bs->total_sectors = 1;
    old_child_bs->total_sectors = 1;
    new_child_bs->total_sectors = 1;

    /*
     * Take an extra reference before attaching, so that we still hold
     * one on old_child_bs for the checks and the bdrv_unref() at the end
     */
    bdrv_ref(old_child_bs);
    parent_bs->backing = bdrv_attach_child(parent_bs, old_child_bs, "child",
                                           &child_of_bds, BDRV_CHILD_COW,
                                           &error_abort);

    /* Put both children into the requested drain state */
    for (i = 0; i < old_drain_count; i++) {
        bdrv_drained_begin(old_child_bs);
    }
    for (i = 0; i < new_drain_count; i++) {
        bdrv_drained_begin(new_child_bs);
    }

    if (!old_drain_count) {
        /*
         * Start a read operation that will yield, so it will not
         * complete before the node is drained.
         */
        parent_s->yield_before_read = true;
        io_co = qemu_coroutine_create(test_replace_child_mid_drain_read_co,
                                      parent_blk);
        qemu_coroutine_enter(io_co);
    }

    /* If we have started a read operation, it should have yielded */
    g_assert(!parent_s->has_read);

    /* Reset drained status so we can see what bdrv_replace_node() does */
    parent_s->was_drained = false;
    parent_s->was_undrained = false;

    /*
     * The parent's quiesce counter is expected to equal the drain count
     * of whichever child is currently attached
     */
    g_assert(parent_bs->quiesce_counter == old_drain_count);
    bdrv_replace_node(old_child_bs, new_child_bs, &error_abort);
    g_assert(parent_bs->quiesce_counter == new_drain_count);

    if (!old_drain_count && !new_drain_count) {
        /*
         * From undrained to undrained drains and undrains the parent,
         * because bdrv_replace_node() contains a drained section for
         * @old_child_bs.
         */
        g_assert(parent_s->was_drained && parent_s->was_undrained);
    } else if (!old_drain_count && new_drain_count) {
        /*
         * From undrained to drained should drain the parent and keep
         * it that way.
         */
        g_assert(parent_s->was_drained && !parent_s->was_undrained);
    } else if (old_drain_count && !new_drain_count) {
        /*
         * From drained to undrained should undrain the parent and
         * keep it that way.
         */
        g_assert(!parent_s->was_drained && parent_s->was_undrained);
    } else /* if (old_drain_count && new_drain_count) */ {
        /*
         * From drained to drained must not undrain the parent at any
         * point
         */
        g_assert(!parent_s->was_drained && !parent_s->was_undrained);
    }

    if (!old_drain_count || !new_drain_count) {
        /*
         * If !old_drain_count, we have started a read request before
         * bdrv_replace_node().  If !new_drain_count, the parent must
         * have been undrained at some point, and
         * bdrv_replace_test_co_drain_end() starts a read request
         * then.
         */
        g_assert(parent_s->has_read);
    } else {
        /*
         * If the parent was never undrained, there is no way to start
         * a read request.
         */
        g_assert(!parent_s->has_read);
    }

    /* A drained child must not have received any request */
    g_assert(!(old_drain_count && old_child_s->has_read));
    g_assert(!(new_drain_count && new_child_s->has_read));

    /* End the drained sections started above, new child first */
    for (i = 0; i < new_drain_count; i++) {
        bdrv_drained_end(new_child_bs);
    }
    for (i = 0; i < old_drain_count; i++) {
        bdrv_drained_end(old_child_bs);
    }

    /*
     * By now, bdrv_replace_test_co_drain_end() must have been called
     * at some point while the new child was attached to the parent.
     */
    g_assert(parent_s->has_read);
    g_assert(new_child_s->has_read);

    /* Drop every reference this function still holds */
    blk_unref(parent_blk);
    bdrv_unref(parent_bs);
    bdrv_unref(old_child_bs);
    bdrv_unref(new_child_bs);
}
2124 
/*
 * Run do_test_replace_child_mid_drain() for every combination of the
 * old and the new child being undrained (0) or drained once (1).
 */
static void test_replace_child_mid_drain(void)
{
    int combo;

    /*
     * Bit 1 is the old child's drain count, bit 0 the new child's, so
     * this visits (0,0), (0,1), (1,0), (1,1) in that order.
     */
    for (combo = 0; combo < 4; combo++) {
        do_test_replace_child_mid_drain(combo >> 1, combo & 1);
    }
}
2135 
2136 int main(int argc, char **argv)
2137 {
2138     int ret;
2139 
2140     bdrv_init();
2141     qemu_init_main_loop(&error_abort);
2142 
2143     g_test_init(&argc, &argv, NULL);
2144     qemu_event_init(&done_event, false);
2145 
2146     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
2147     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
2148     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
2149                     test_drv_cb_drain_subtree);
2150 
2151     g_test_add_func("/bdrv-drain/driver-cb/co/drain_all",
2152                     test_drv_cb_co_drain_all);
2153     g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
2154     g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
2155                     test_drv_cb_co_drain_subtree);
2156 
2157 
2158     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
2159     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
2160     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
2161                     test_quiesce_drain_subtree);
2162 
2163     g_test_add_func("/bdrv-drain/quiesce/co/drain_all",
2164                     test_quiesce_co_drain_all);
2165     g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
2166     g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
2167                     test_quiesce_co_drain_subtree);
2168 
2169     g_test_add_func("/bdrv-drain/nested", test_nested);
2170     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
2171 
2172     g_test_add_func("/bdrv-drain/graph-change/drain_subtree",
2173                     test_graph_change_drain_subtree);
2174     g_test_add_func("/bdrv-drain/graph-change/drain_all",
2175                     test_graph_change_drain_all);
2176 
2177     g_test_add_func("/bdrv-drain/iothread/drain_all", test_iothread_drain_all);
2178     g_test_add_func("/bdrv-drain/iothread/drain", test_iothread_drain);
2179     g_test_add_func("/bdrv-drain/iothread/drain_subtree",
2180                     test_iothread_drain_subtree);
2181 
2182     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
2183     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
2184     g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
2185                     test_blockjob_drain_subtree);
2186 
2187     g_test_add_func("/bdrv-drain/blockjob/error/drain_all",
2188                     test_blockjob_error_drain_all);
2189     g_test_add_func("/bdrv-drain/blockjob/error/drain",
2190                     test_blockjob_error_drain);
2191     g_test_add_func("/bdrv-drain/blockjob/error/drain_subtree",
2192                     test_blockjob_error_drain_subtree);
2193 
2194     g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all",
2195                     test_blockjob_iothread_drain_all);
2196     g_test_add_func("/bdrv-drain/blockjob/iothread/drain",
2197                     test_blockjob_iothread_drain);
2198     g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree",
2199                     test_blockjob_iothread_drain_subtree);
2200 
2201     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_all",
2202                     test_blockjob_iothread_error_drain_all);
2203     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain",
2204                     test_blockjob_iothread_error_drain);
2205     g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_subtree",
2206                     test_blockjob_iothread_error_drain_subtree);
2207 
2208     g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain);
2209     g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all);
2210     g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain);
2211     g_test_add_func("/bdrv-drain/detach/drain_subtree", test_detach_by_drain_subtree);
2212     g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
2213     g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
2214 
2215     g_test_add_func("/bdrv-drain/attach/drain", test_append_to_drained);
2216 
2217     g_test_add_func("/bdrv-drain/set_aio_context", test_set_aio_context);
2218 
2219     g_test_add_func("/bdrv-drain/blockjob/commit_by_drained_end",
2220                     test_blockjob_commit_by_drained_end);
2221 
2222     g_test_add_func("/bdrv-drain/bdrv_drop_intermediate/poll",
2223                     test_drop_intermediate_poll);
2224 
2225     g_test_add_func("/bdrv-drain/replace_child/mid-drain",
2226                     test_replace_child_mid_drain);
2227 
2228     ret = g_test_run();
2229     qemu_event_destroy(&done_event);
2230     return ret;
2231 }
2232