1 /*
2 * QEMU coroutines
3 *
4 * Copyright IBM, Corp. 2011
5 *
6 * Authors:
7 * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
8 * Kevin Wolf <kwolf@redhat.com>
9 *
10 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11 * See the COPYING.LIB file in the top-level directory.
12 *
13 */
14
15 #include "qemu/osdep.h"
16 #include "trace.h"
17 #include "qemu/thread.h"
18 #include "qemu/atomic.h"
19 #include "qemu/coroutine_int.h"
20 #include "qemu/coroutine-tls.h"
21 #include "qemu/cutils.h"
22 #include "block/aio.h"
23
/*
 * Upper bound on the number of coroutines stored in one CoroutinePoolBatch.
 * It is also the initial value of global_pool_max_size.
 */
enum {
    COROUTINE_POOL_BATCH_MAX_SIZE = 128,
};
27
28 /*
29 * Coroutine creation and deletion is expensive so a pool of unused coroutines
30 * is kept as a cache. When the pool has coroutines available, they are
31 * recycled instead of creating new ones from scratch. Coroutines are added to
32 * the pool upon termination.
33 *
34 * The pool is global but each thread maintains a small local pool to avoid
35 * global pool contention. Threads fetch and return batches of coroutines from
36 * the global pool to maintain their local pool. The local pool holds up to two
37 * batches whereas the maximum size of the global pool is controlled by the
38 * qemu_coroutine_inc_pool_size() API.
39 *
40 * .-----------------------------------.
41 * | Batch 1 | Batch 2 | Batch 3 | ... | global_pool
42 * `-----------------------------------'
43 *
44 * .-------------------.
45 * | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches)
46 * `-------------------'
47 */
typedef struct CoroutinePoolBatch {
    /* Batches are kept in a list */
    QSLIST_ENTRY(CoroutinePoolBatch) next;

    /* This batch holds up to @COROUTINE_POOL_BATCH_MAX_SIZE coroutines */
    QSLIST_HEAD(, Coroutine) list;

    /* Number of coroutines currently linked on @list */
    unsigned int size;
} CoroutinePoolBatch;

/* A pool is a singly-linked list of batches; used for global and local pools */
typedef QSLIST_HEAD(, CoroutinePoolBatch) CoroutinePool;
58
/*
 * Host operating system limit on number of pooled coroutines. Assigned once
 * by the qemu_coroutine_init() constructor and only read afterwards.
 */
static unsigned int global_pool_hard_max_size;

static QemuMutex global_pool_lock; /* protects the following variables */
/* List of batches available to every thread */
static CoroutinePool global_pool = QSLIST_HEAD_INITIALIZER(global_pool);
/* Total number of coroutines (not batches) currently in global_pool */
static unsigned int global_pool_size;
/* Soft limit, adjusted by qemu_coroutine_{inc,dec}_pool_size() */
static unsigned int global_pool_max_size = COROUTINE_POOL_BATCH_MAX_SIZE;

/* Per-thread cache of batches, accessed through get_ptr_local_pool() */
QEMU_DEFINE_STATIC_CO_TLS(CoroutinePool, local_pool);
/* Per-thread notifier used to free local_pool when the thread exits */
QEMU_DEFINE_STATIC_CO_TLS(Notifier, local_pool_cleanup_notifier);
69
coroutine_pool_batch_new(void)70 static CoroutinePoolBatch *coroutine_pool_batch_new(void)
71 {
72 CoroutinePoolBatch *batch = g_new(CoroutinePoolBatch, 1);
73
74 QSLIST_INIT(&batch->list);
75 batch->size = 0;
76 return batch;
77 }
78
/*
 * Destroy every coroutine still stored in @batch and then free the batch
 * itself. The batch must already be unlinked from its pool.
 */
static void coroutine_pool_batch_delete(CoroutinePoolBatch *batch)
{
    Coroutine *victim;

    /* Pop from the front until the list is drained */
    while ((victim = QSLIST_FIRST(&batch->list)) != NULL) {
        QSLIST_REMOVE_HEAD(&batch->list, pool_next);
        qemu_coroutine_delete(victim);
    }

    g_free(batch);
}
90
/*
 * Thread-exit notifier callback: release every batch held in the calling
 * thread's local pool. Neither @n nor @value is used.
 */
static void local_pool_cleanup(Notifier *n, void *value)
{
    CoroutinePool *pool = get_ptr_local_pool();
    CoroutinePoolBatch *head;

    /* Unlink and destroy batches one at a time until the pool is empty */
    while ((head = QSLIST_FIRST(pool)) != NULL) {
        QSLIST_REMOVE_HEAD(pool, next);
        coroutine_pool_batch_delete(head);
    }
}
102
103 /* Ensure the atexit notifier is registered */
local_pool_cleanup_init_once(void)104 static void local_pool_cleanup_init_once(void)
105 {
106 Notifier *notifier = get_ptr_local_pool_cleanup_notifier();
107 if (!notifier->notify) {
108 notifier->notify = local_pool_cleanup;
109 qemu_thread_atexit_add(notifier);
110 }
111 }
112
113 /* Helper to get the next unused coroutine from the local pool */
coroutine_pool_get_local(void)114 static Coroutine *coroutine_pool_get_local(void)
115 {
116 CoroutinePool *local_pool = get_ptr_local_pool();
117 CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
118 Coroutine *co;
119
120 if (unlikely(!batch)) {
121 return NULL;
122 }
123
124 co = QSLIST_FIRST(&batch->list);
125 QSLIST_REMOVE_HEAD(&batch->list, pool_next);
126 batch->size--;
127
128 if (batch->size == 0) {
129 QSLIST_REMOVE_HEAD(local_pool, next);
130 coroutine_pool_batch_delete(batch);
131 }
132 return co;
133 }
134
135 /* Get the next batch from the global pool */
coroutine_pool_refill_local(void)136 static void coroutine_pool_refill_local(void)
137 {
138 CoroutinePool *local_pool = get_ptr_local_pool();
139 CoroutinePoolBatch *batch = NULL;
140
141 WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
142 batch = QSLIST_FIRST(&global_pool);
143
144 if (batch) {
145 QSLIST_REMOVE_HEAD(&global_pool, next);
146 global_pool_size -= batch->size;
147 }
148 }
149
150 if (batch) {
151 QSLIST_INSERT_HEAD(local_pool, batch, next);
152 local_pool_cleanup_init_once();
153 }
154 }
155
156 /* Add a batch of coroutines to the global pool */
coroutine_pool_put_global(CoroutinePoolBatch * batch)157 static void coroutine_pool_put_global(CoroutinePoolBatch *batch)
158 {
159 WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
160 unsigned int max = MIN(global_pool_max_size,
161 global_pool_hard_max_size);
162
163 if (global_pool_size < max) {
164 QSLIST_INSERT_HEAD(&global_pool, batch, next);
165
166 /* Overshooting the max pool size is allowed */
167 global_pool_size += batch->size;
168 return;
169 }
170 }
171
172 /* The global pool was full, so throw away this batch */
173 coroutine_pool_batch_delete(batch);
174 }
175
176 /* Get the next unused coroutine from the pool or return NULL */
coroutine_pool_get(void)177 static Coroutine *coroutine_pool_get(void)
178 {
179 Coroutine *co;
180
181 co = coroutine_pool_get_local();
182 if (!co) {
183 coroutine_pool_refill_local();
184 co = coroutine_pool_get_local();
185 }
186 return co;
187 }
188
/*
 * Return @co to the calling thread's local pool so it can be recycled.
 *
 * The coroutine always goes into the batch at the head of the local pool.
 * When that batch is full, a new head batch is started; if the local pool
 * already held a second batch, the full one is pushed to the global pool
 * first.
 */
static void coroutine_pool_put(Coroutine *co)
{
    CoroutinePool *pool = get_ptr_local_pool();
    CoroutinePoolBatch *head = QSLIST_FIRST(pool);

    if (unlikely(head == NULL)) {
        /* First use in this thread (or the pool was drained) */
        head = coroutine_pool_batch_new();
        QSLIST_INSERT_HEAD(pool, head, next);
        local_pool_cleanup_init_once();
    }

    if (unlikely(head->size >= COROUTINE_POOL_BATCH_MAX_SIZE)) {
        /* Is the local pool full? */
        if (QSLIST_NEXT(head, next) != NULL) {
            QSLIST_REMOVE_HEAD(pool, next);
            coroutine_pool_put_global(head);
        }

        head = coroutine_pool_batch_new();
        QSLIST_INSERT_HEAD(pool, head, next);
    }

    QSLIST_INSERT_HEAD(&head->list, co, pool_next);
    head->size += 1;
}
216
qemu_coroutine_create(CoroutineEntry * entry,void * opaque)217 Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
218 {
219 Coroutine *co = NULL;
220
221 if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
222 co = coroutine_pool_get();
223 }
224
225 if (!co) {
226 co = qemu_coroutine_new();
227 }
228
229 co->entry = entry;
230 co->entry_arg = opaque;
231 QSIMPLEQ_INIT(&co->co_queue_wakeup);
232 return co;
233 }
234
/*
 * Dispose of a terminated coroutine: clear its caller, then either recycle it
 * through the pool (CONFIG_COROUTINE_POOL) or destroy it outright.
 */
static void coroutine_delete(Coroutine *co)
{
    co->caller = NULL;

    if (!IS_ENABLED(CONFIG_COROUTINE_POOL)) {
        qemu_coroutine_delete(co);
        return;
    }

    coroutine_pool_put(co);
}
245
/*
 * Transfer control to @co, recording @ctx as the AioContext it runs in.
 *
 * After @co yields or terminates, every coroutine that was placed on its
 * co_queue_wakeup list is entered in turn (and theirs after them) before
 * this function returns. A coroutine that terminates is handed to
 * coroutine_delete().
 *
 * The process is aborted if a coroutine to be entered is marked as scheduled
 * or already has a caller (recursive entry).
 */
void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
{
    /* Work list of coroutines still to be entered; starts with just @co */
    QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);
    Coroutine *from = qemu_coroutine_self();

    QSIMPLEQ_INSERT_TAIL(&pending, co, co_queue_next);

    /* Run co and any queued coroutines */
    while (!QSIMPLEQ_EMPTY(&pending)) {
        Coroutine *to = QSIMPLEQ_FIRST(&pending);
        CoroutineAction ret;

        /*
         * Read to before to->scheduled; pairs with qatomic_cmpxchg in
         * qemu_co_sleep(), aio_co_schedule() etc.
         */
        smp_read_barrier_depends();

        const char *scheduled = qatomic_read(&to->scheduled);

        QSIMPLEQ_REMOVE_HEAD(&pending, co_queue_next);

        trace_qemu_aio_coroutine_enter(ctx, from, to, to->entry_arg);

        /* if the Coroutine has already been scheduled, entering it again will
         * cause us to enter it twice, potentially even after the coroutine has
         * been deleted */
        if (scheduled) {
            fprintf(stderr,
                    "%s: Co-routine was already scheduled in '%s'\n",
                    __func__, scheduled);
            abort();
        }

        /* A non-NULL caller means the coroutine is currently entered */
        if (to->caller) {
            fprintf(stderr, "Co-routine re-entered recursively\n");
            abort();
        }

        to->caller = from;
        to->ctx = ctx;

        /* Store to->ctx before anything that stores to.  Matches
         * barrier in aio_co_wake and qemu_co_mutex_wake.
         */
        smp_wmb();

        /* Returns here once @to yields or terminates */
        ret = qemu_coroutine_switch(from, to, COROUTINE_ENTER);

        /* Queued coroutines are run depth-first; previously pending coroutines
         * run after those queued more recently.
         */
        QSIMPLEQ_PREPEND(&pending, &to->co_queue_wakeup);

        switch (ret) {
        case COROUTINE_YIELD:
            break;
        case COROUTINE_TERMINATE:
            /* A coroutine must not finish while still holding locks */
            assert(!to->locks_held);
            trace_qemu_coroutine_terminate(to);
            coroutine_delete(to);
            break;
        default:
            abort();
        }
    }
}
313
/* Enter @co using the AioContext reported by qemu_get_current_aio_context(). */
void qemu_coroutine_enter(Coroutine *co)
{
    AioContext *current_ctx = qemu_get_current_aio_context();

    qemu_aio_coroutine_enter(current_ctx, co);
}
318
/* Enter @co unless qemu_coroutine_entered() reports it as already entered. */
void qemu_coroutine_enter_if_inactive(Coroutine *co)
{
    if (qemu_coroutine_entered(co)) {
        return;
    }

    qemu_coroutine_enter(co);
}
325
qemu_coroutine_yield(void)326 void coroutine_fn qemu_coroutine_yield(void)
327 {
328 Coroutine *self = qemu_coroutine_self();
329 Coroutine *to = self->caller;
330
331 trace_qemu_coroutine_yield(self, to);
332
333 if (!to) {
334 fprintf(stderr, "Co-routine is yielding to no one\n");
335 abort();
336 }
337
338 self->caller = NULL;
339 qemu_coroutine_switch(self, to, COROUTINE_YIELD);
340 }
341
/* Returns: true while @co has a caller recorded, i.e. it is currently entered. */
bool qemu_coroutine_entered(Coroutine *co)
{
    return co->caller != NULL;
}
346
qemu_coroutine_get_aio_context(Coroutine * co)347 AioContext *qemu_coroutine_get_aio_context(Coroutine *co)
348 {
349 return co->ctx;
350 }
351
qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)352 void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)
353 {
354 QEMU_LOCK_GUARD(&global_pool_lock);
355 global_pool_max_size += additional_pool_size;
356 }
357
qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)358 void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)
359 {
360 QEMU_LOCK_GUARD(&global_pool_lock);
361 global_pool_max_size -= removing_pool_size;
362 }
363
get_global_pool_hard_max_size(void)364 static unsigned int get_global_pool_hard_max_size(void)
365 {
366 #ifdef __linux__
367 g_autofree char *contents = NULL;
368 int max_map_count;
369
370 /*
371 * Linux processes can have up to max_map_count virtual memory areas
372 * (VMAs). mmap(2), mprotect(2), etc fail with ENOMEM beyond this limit. We
373 * must limit the coroutine pool to a safe size to avoid running out of
374 * VMAs.
375 */
376 if (g_file_get_contents("/proc/sys/vm/max_map_count", &contents, NULL,
377 NULL) &&
378 qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) {
379 /*
380 * This is an upper bound that avoids exceeding max_map_count. Leave a
381 * fixed amount for non-coroutine users like library dependencies,
382 * vhost-user, etc. Each coroutine takes up 2 VMAs so halve the
383 * remaining amount.
384 */
385 if (max_map_count > 5000) {
386 return (max_map_count - 5000) / 2;
387 } else {
388 /* Disable the global pool but threads still have local pools */
389 return 0;
390 }
391 }
392 #endif
393
394 return UINT_MAX;
395 }
396
qemu_coroutine_init(void)397 static void __attribute__((constructor)) qemu_coroutine_init(void)
398 {
399 qemu_mutex_init(&global_pool_lock);
400 global_pool_hard_max_size = get_global_pool_hard_max_size();
401 }
402