xref: /openbmc/linux/kernel/workqueue.c (revision 4800cd83)
1 /*
2  * kernel/workqueue.c - generic async execution with shared worker pool
3  *
4  * Copyright (C) 2002		Ingo Molnar
5  *
6  *   Derived from the taskqueue/keventd code by:
7  *     David Woodhouse <dwmw2@infradead.org>
8  *     Andrew Morton
9  *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
10  *     Theodore Ts'o <tytso@mit.edu>
11  *
12  * Made to use alloc_percpu by Christoph Lameter.
13  *
14  * Copyright (C) 2010		SUSE Linux Products GmbH
15  * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
16  *
17  * This is the generic async execution mechanism.  Work items are
18  * executed in process context.  The worker pool is shared and
19  * automatically managed.  There is one worker pool for each CPU and
20  * one extra for works which are better served by workers which are
21  * not bound to any specific CPU.
22  *
23  * Please read Documentation/workqueue.txt for details.
24  */
25 
26 #include <linux/module.h>
27 #include <linux/kernel.h>
28 #include <linux/sched.h>
29 #include <linux/init.h>
30 #include <linux/signal.h>
31 #include <linux/completion.h>
32 #include <linux/workqueue.h>
33 #include <linux/slab.h>
34 #include <linux/cpu.h>
35 #include <linux/notifier.h>
36 #include <linux/kthread.h>
37 #include <linux/hardirq.h>
38 #include <linux/mempolicy.h>
39 #include <linux/freezer.h>
40 #include <linux/kallsyms.h>
41 #include <linux/debug_locks.h>
42 #include <linux/lockdep.h>
43 #include <linux/idr.h>
44 
45 #include "workqueue_sched.h"
46 
47 enum {
48 	/* global_cwq flags */
49 	GCWQ_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
50 	GCWQ_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
51 	GCWQ_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
52 	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
53 	GCWQ_HIGHPRI_PENDING	= 1 << 4,	/* highpri works on queue */
54 
55 	/* worker flags */
56 	WORKER_STARTED		= 1 << 0,	/* started */
57 	WORKER_DIE		= 1 << 1,	/* die die die */
58 	WORKER_IDLE		= 1 << 2,	/* is idle */
59 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
60 	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
61 	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
62 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
63 	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
64 
65 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
66 				  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
67 
68 	/* gcwq->trustee_state */
69 	TRUSTEE_START		= 0,		/* start */
70 	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
71 	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
72 	TRUSTEE_RELEASE		= 3,		/* release workers */
73 	TRUSTEE_DONE		= 4,		/* trustee is done */
74 
75 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
76 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
77 	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
78 
79 	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
80 	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
81 
82 	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
83 						/* call for help after 10ms
84 						   (min two ticks) */
85 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
86 	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */
87 	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
88 
89 	/*
90 	 * Rescue workers are used only in emergencies and shared by
91 	 * all cpus.  Give them nice -20.
92 	 */
93 	RESCUER_NICE_LEVEL	= -20,
94 };
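/*
 * A worked example of the timeout arithmetic above, for the two common
 * HZ values: with HZ=1000, MAYDAY_INITIAL_TIMEOUT is 1000/100 = 10
 * ticks (10ms); with HZ=100, 100/100 = 1 tick falls below the two tick
 * minimum and is clamped to 2 ticks (20ms).  MAYDAY_INTERVAL is HZ/10,
 * i.e. 100ms regardless of HZ.
 */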
95 
96 /*
97  * Structure fields follow one of the following exclusion rules.
98  *
99  * I: Modifiable by initialization/destruction paths and read-only for
100  *    everyone else.
101  *
102  * P: Preemption protected.  Disabling preemption is enough and should
103  *    only be modified and accessed from the local cpu.
104  *
105  * L: gcwq->lock protected.  Access with gcwq->lock held.
106  *
107  * X: During normal operation, modification requires gcwq->lock and
108  *    should be done only from local cpu.  Either disabling preemption
109  *    on local cpu or grabbing gcwq->lock is enough for read access.
110  *    If GCWQ_DISASSOCIATED is set, it's identical to L.
111  *
112  * F: wq->flush_mutex protected.
113  *
114  * W: workqueue_lock protected.
115  */
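/*
 * A minimal sketch (not from this file) of how the annotations above
 * are honored in practice: an L: field such as gcwq->nr_workers is only
 * touched with gcwq->lock held, while an I: field such as worker->gcwq
 * may be read locklessly once the worker is set up.  example_count_workers
 * is a hypothetical helper used only for illustration.
 *
 *	static int example_count_workers(struct global_cwq *gcwq)
 *	{
 *		int n;
 *
 *		spin_lock_irq(&gcwq->lock);
 *		n = gcwq->nr_workers;		// L: gcwq->lock held
 *		spin_unlock_irq(&gcwq->lock);
 *		return n;
 *	}
 */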
116 
117 struct global_cwq;
118 
119 /*
120  * The poor guys doing the actual heavy lifting.  All on-duty workers
121  * are either serving the manager role, on the idle list or busy hash.
122  */
123 struct worker {
124 	/* on idle list while idle, on busy hash table while busy */
125 	union {
126 		struct list_head	entry;	/* L: while idle */
127 		struct hlist_node	hentry;	/* L: while busy */
128 	};
129 
130 	struct work_struct	*current_work;	/* L: work being processed */
131 	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
132 	struct list_head	scheduled;	/* L: scheduled works */
133 	struct task_struct	*task;		/* I: worker task */
134 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
135 	/* 64 bytes boundary on 64bit, 32 on 32bit */
136 	unsigned long		last_active;	/* L: last active timestamp */
137 	unsigned int		flags;		/* X: flags */
138 	int			id;		/* I: worker id */
139 	struct work_struct	rebind_work;	/* L: rebind worker to cpu */
140 };
141 
142 /*
143  * Global per-cpu workqueue.  There's one and only one for each cpu
144  * and all works are queued and processed here regardless of their
145  * target workqueues.
146  */
147 struct global_cwq {
148 	spinlock_t		lock;		/* the gcwq lock */
149 	struct list_head	worklist;	/* L: list of pending works */
150 	unsigned int		cpu;		/* I: the associated cpu */
151 	unsigned int		flags;		/* L: GCWQ_* flags */
152 
153 	int			nr_workers;	/* L: total number of workers */
154 	int			nr_idle;	/* L: currently idle ones */
155 
156 	/* workers are chained either in the idle_list or busy_hash */
157 	struct list_head	idle_list;	/* X: list of idle workers */
158 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
159 						/* L: hash of busy workers */
160 
161 	struct timer_list	idle_timer;	/* L: worker idle timeout */
162 	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
163 
164 	struct ida		worker_ida;	/* L: for worker IDs */
165 
166 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
167 	unsigned int		trustee_state;	/* L: trustee state */
168 	wait_queue_head_t	trustee_wait;	/* trustee wait */
169 	struct worker		*first_idle;	/* L: first idle worker */
170 } ____cacheline_aligned_in_smp;
171 
172 /*
173  * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
174  * work_struct->data are used for flags and thus cwqs need to be
175  * aligned to at least 1 << WORK_STRUCT_FLAG_BITS bytes.
176  */
177 struct cpu_workqueue_struct {
178 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
179 	struct workqueue_struct *wq;		/* I: the owning workqueue */
180 	int			work_color;	/* L: current color */
181 	int			flush_color;	/* L: flushing color */
182 	int			nr_in_flight[WORK_NR_COLORS];
183 						/* L: nr of in_flight works */
184 	int			nr_active;	/* L: nr of active works */
185 	int			max_active;	/* L: max active works */
186 	struct list_head	delayed_works;	/* L: delayed works */
187 };
188 
189 /*
190  * Structure used to wait for workqueue flush.
191  */
192 struct wq_flusher {
193 	struct list_head	list;		/* F: list of flushers */
194 	int			flush_color;	/* F: flush color waiting for */
195 	struct completion	done;		/* flush completion */
196 };
197 
198 /*
199  * All cpumasks are assumed to be always set on UP and thus can't be
200  * used to determine whether there's something to be done.
201  */
202 #ifdef CONFIG_SMP
203 typedef cpumask_var_t mayday_mask_t;
204 #define mayday_test_and_set_cpu(cpu, mask)	\
205 	cpumask_test_and_set_cpu((cpu), (mask))
206 #define mayday_clear_cpu(cpu, mask)		cpumask_clear_cpu((cpu), (mask))
207 #define for_each_mayday_cpu(cpu, mask)		for_each_cpu((cpu), (mask))
208 #define alloc_mayday_mask(maskp, gfp)		zalloc_cpumask_var((maskp), (gfp))
209 #define free_mayday_mask(mask)			free_cpumask_var((mask))
210 #else
211 typedef unsigned long mayday_mask_t;
212 #define mayday_test_and_set_cpu(cpu, mask)	test_and_set_bit(0, &(mask))
213 #define mayday_clear_cpu(cpu, mask)		clear_bit(0, &(mask))
214 #define for_each_mayday_cpu(cpu, mask)		if ((cpu) = 0, (mask))
215 #define alloc_mayday_mask(maskp, gfp)		true
216 #define free_mayday_mask(mask)			do { } while (0)
217 #endif
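/*
 * For illustration, the abstraction above is used later in this file
 * roughly as follows (wq is a workqueue_struct allocated with kzalloc,
 * so the UP variant starts out zeroed):
 *
 *	if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))	// alloc_workqueue()
 *		goto err;
 *	mayday_test_and_set_cpu(cpu, wq->mayday_mask);		// send_mayday()
 *	for_each_mayday_cpu(cpu, wq->mayday_mask)		// rescuer
 *		mayday_clear_cpu(cpu, wq->mayday_mask);
 *	free_mayday_mask(wq->mayday_mask);			// destroy_workqueue()
 */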
218 
219 /*
220  * The externally visible workqueue abstraction is an array of
221  * per-CPU workqueues:
222  */
223 struct workqueue_struct {
224 	unsigned int		flags;		/* I: WQ_* flags */
225 	union {
226 		struct cpu_workqueue_struct __percpu	*pcpu;
227 		struct cpu_workqueue_struct		*single;
228 		unsigned long				v;
229 	} cpu_wq;				/* I: cwq's */
230 	struct list_head	list;		/* W: list of all workqueues */
231 
232 	struct mutex		flush_mutex;	/* protects wq flushing */
233 	int			work_color;	/* F: current work color */
234 	int			flush_color;	/* F: current flush color */
235 	atomic_t		nr_cwqs_to_flush; /* flush in progress */
236 	struct wq_flusher	*first_flusher;	/* F: first flusher */
237 	struct list_head	flusher_queue;	/* F: flush waiters */
238 	struct list_head	flusher_overflow; /* F: flush overflow list */
239 
240 	mayday_mask_t		mayday_mask;	/* cpus requesting rescue */
241 	struct worker		*rescuer;	/* I: rescue worker */
242 
243 	int			saved_max_active; /* W: saved cwq max_active */
244 	const char		*name;		/* I: workqueue name */
245 #ifdef CONFIG_LOCKDEP
246 	struct lockdep_map	lockdep_map;
247 #endif
248 };
249 
250 struct workqueue_struct *system_wq __read_mostly;
251 struct workqueue_struct *system_long_wq __read_mostly;
252 struct workqueue_struct *system_nrt_wq __read_mostly;
253 struct workqueue_struct *system_unbound_wq __read_mostly;
254 EXPORT_SYMBOL_GPL(system_wq);
255 EXPORT_SYMBOL_GPL(system_long_wq);
256 EXPORT_SYMBOL_GPL(system_nrt_wq);
257 EXPORT_SYMBOL_GPL(system_unbound_wq);
258 
259 #define CREATE_TRACE_POINTS
260 #include <trace/events/workqueue.h>
261 
262 #define for_each_busy_worker(worker, i, pos, gcwq)			\
263 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
264 		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
265 
266 static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
267 				  unsigned int sw)
268 {
269 	if (cpu < nr_cpu_ids) {
270 		if (sw & 1) {
271 			cpu = cpumask_next(cpu, mask);
272 			if (cpu < nr_cpu_ids)
273 				return cpu;
274 		}
275 		if (sw & 2)
276 			return WORK_CPU_UNBOUND;
277 	}
278 	return WORK_CPU_NONE;
279 }
280 
281 static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
282 				struct workqueue_struct *wq)
283 {
284 	return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
285 }
286 
287 /*
288  * CPU iterators
289  *
290  * An extra gcwq is defined for an invalid cpu number
291  * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
292  * specific CPU.  The following iterators are similar to
293  * for_each_*_cpu() iterators but also considers the unbound gcwq.
294  *
295  * for_each_gcwq_cpu()		: possible CPUs + WORK_CPU_UNBOUND
296  * for_each_online_gcwq_cpu()	: online CPUs + WORK_CPU_UNBOUND
297  * for_each_cwq_cpu()		: possible CPUs for bound workqueues,
298  *				  WORK_CPU_UNBOUND for unbound workqueues
299  */
300 #define for_each_gcwq_cpu(cpu)						\
301 	for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);		\
302 	     (cpu) < WORK_CPU_NONE;					\
303 	     (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
304 
305 #define for_each_online_gcwq_cpu(cpu)					\
306 	for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);		\
307 	     (cpu) < WORK_CPU_NONE;					\
308 	     (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
309 
310 #define for_each_cwq_cpu(cpu, wq)					\
311 	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));	\
312 	     (cpu) < WORK_CPU_NONE;					\
313 	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
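/*
 * An illustrative sketch (not from this file) of the iterators above:
 * walk every gcwq, including the unbound one, and take its lock.  This
 * is the pattern the freeze/thaw paths later in this file rely on.
 * example_walk_gcwqs is a hypothetical name.
 *
 *	static void example_walk_gcwqs(void)
 *	{
 *		unsigned int cpu;
 *
 *		for_each_gcwq_cpu(cpu) {
 *			struct global_cwq *gcwq = get_gcwq(cpu);
 *
 *			spin_lock_irq(&gcwq->lock);
 *			// inspect or update gcwq state here
 *			spin_unlock_irq(&gcwq->lock);
 *		}
 *	}
 */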
314 
315 #ifdef CONFIG_DEBUG_OBJECTS_WORK
316 
317 static struct debug_obj_descr work_debug_descr;
318 
319 /*
320  * fixup_init is called when:
321  * - an active object is initialized
322  */
323 static int work_fixup_init(void *addr, enum debug_obj_state state)
324 {
325 	struct work_struct *work = addr;
326 
327 	switch (state) {
328 	case ODEBUG_STATE_ACTIVE:
329 		cancel_work_sync(work);
330 		debug_object_init(work, &work_debug_descr);
331 		return 1;
332 	default:
333 		return 0;
334 	}
335 }
336 
337 /*
338  * fixup_activate is called when:
339  * - an active object is activated
340  * - an unknown object is activated (might be a statically initialized object)
341  */
342 static int work_fixup_activate(void *addr, enum debug_obj_state state)
343 {
344 	struct work_struct *work = addr;
345 
346 	switch (state) {
347 
348 	case ODEBUG_STATE_NOTAVAILABLE:
349 		/*
350 		 * This is not really a fixup. The work struct was
351 		 * statically initialized. We just make sure that it
352 		 * is tracked in the object tracker.
353 		 */
354 		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
355 			debug_object_init(work, &work_debug_descr);
356 			debug_object_activate(work, &work_debug_descr);
357 			return 0;
358 		}
359 		WARN_ON_ONCE(1);
360 		return 0;
361 
362 	case ODEBUG_STATE_ACTIVE:
363 		WARN_ON(1);
364 
365 	default:
366 		return 0;
367 	}
368 }
369 
370 /*
371  * fixup_free is called when:
372  * - an active object is freed
373  */
374 static int work_fixup_free(void *addr, enum debug_obj_state state)
375 {
376 	struct work_struct *work = addr;
377 
378 	switch (state) {
379 	case ODEBUG_STATE_ACTIVE:
380 		cancel_work_sync(work);
381 		debug_object_free(work, &work_debug_descr);
382 		return 1;
383 	default:
384 		return 0;
385 	}
386 }
387 
388 static struct debug_obj_descr work_debug_descr = {
389 	.name		= "work_struct",
390 	.fixup_init	= work_fixup_init,
391 	.fixup_activate	= work_fixup_activate,
392 	.fixup_free	= work_fixup_free,
393 };
394 
395 static inline void debug_work_activate(struct work_struct *work)
396 {
397 	debug_object_activate(work, &work_debug_descr);
398 }
399 
400 static inline void debug_work_deactivate(struct work_struct *work)
401 {
402 	debug_object_deactivate(work, &work_debug_descr);
403 }
404 
405 void __init_work(struct work_struct *work, int onstack)
406 {
407 	if (onstack)
408 		debug_object_init_on_stack(work, &work_debug_descr);
409 	else
410 		debug_object_init(work, &work_debug_descr);
411 }
412 EXPORT_SYMBOL_GPL(__init_work);
413 
414 void destroy_work_on_stack(struct work_struct *work)
415 {
416 	debug_object_free(work, &work_debug_descr);
417 }
418 EXPORT_SYMBOL_GPL(destroy_work_on_stack);
419 
420 #else
421 static inline void debug_work_activate(struct work_struct *work) { }
422 static inline void debug_work_deactivate(struct work_struct *work) { }
423 #endif
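/*
 * A minimal usage sketch for on-stack work items covered by the debug
 * object hooks above (illustrative only; example_sync_fn and
 * example_run_onstack_work are made-up names):
 *
 *	static void example_sync_fn(struct work_struct *work)
 *	{
 *		// runs in process context
 *	}
 *
 *	static void example_run_onstack_work(void)
 *	{
 *		struct work_struct work;
 *
 *		INIT_WORK_ONSTACK(&work, example_sync_fn);
 *		schedule_work(&work);
 *		flush_work(&work);		// must finish before returning
 *		destroy_work_on_stack(&work);
 *	}
 */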
424 
425 /* Serializes the accesses to the list of workqueues. */
426 static DEFINE_SPINLOCK(workqueue_lock);
427 static LIST_HEAD(workqueues);
428 static bool workqueue_freezing;		/* W: have wqs started freezing? */
429 
430 /*
431  * The almighty global cpu workqueues.  nr_running is the only field
432  * which is expected to be used frequently by other cpus via
433  * try_to_wake_up().  Put it in a separate cacheline.
434  */
435 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
436 static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
437 
438 /*
439  * Global cpu workqueue and nr_running counter for unbound gcwq.  The
440  * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
441  * workers have WORKER_UNBOUND set.
442  */
443 static struct global_cwq unbound_global_cwq;
444 static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
445 
446 static int worker_thread(void *__worker);
447 
448 static struct global_cwq *get_gcwq(unsigned int cpu)
449 {
450 	if (cpu != WORK_CPU_UNBOUND)
451 		return &per_cpu(global_cwq, cpu);
452 	else
453 		return &unbound_global_cwq;
454 }
455 
456 static atomic_t *get_gcwq_nr_running(unsigned int cpu)
457 {
458 	if (cpu != WORK_CPU_UNBOUND)
459 		return &per_cpu(gcwq_nr_running, cpu);
460 	else
461 		return &unbound_gcwq_nr_running;
462 }
463 
464 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
465 					    struct workqueue_struct *wq)
466 {
467 	if (!(wq->flags & WQ_UNBOUND)) {
468 		if (likely(cpu < nr_cpu_ids)) {
469 #ifdef CONFIG_SMP
470 			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
471 #else
472 			return wq->cpu_wq.single;
473 #endif
474 		}
475 	} else if (likely(cpu == WORK_CPU_UNBOUND))
476 		return wq->cpu_wq.single;
477 	return NULL;
478 }
479 
480 static unsigned int work_color_to_flags(int color)
481 {
482 	return color << WORK_STRUCT_COLOR_SHIFT;
483 }
484 
485 static int get_work_color(struct work_struct *work)
486 {
487 	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
488 		((1 << WORK_STRUCT_COLOR_BITS) - 1);
489 }
490 
491 static int work_next_color(int color)
492 {
493 	return (color + 1) % WORK_NR_COLORS;
494 }
495 
496 /*
497  * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
498  * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
499  * cleared and the work data contains the cpu number it was last on.
500  *
501  * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
502  * cwq, cpu or clear work->data.  These functions should only be
503  * called while the work is owned - ie. while the PENDING bit is set.
504  *
505  * get_work_[g]cwq() can be used to obtain the gcwq or cwq
506  * corresponding to a work.  gcwq is available once the work has been
507  * queued anywhere after initialization.  cwq is available only from
508  * queueing until execution starts.
509  */
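/*
 * Schematically, the two encodings of work->data described above are:
 *
 *	queued:    data = (unsigned long)cwq | flags (WORK_STRUCT_CWQ set)
 *	executing: data = cpu << WORK_STRUCT_FLAG_BITS | flags (CWQ clear)
 *
 * The low WORK_STRUCT_FLAG_BITS are always flag bits, which is why cwqs
 * must be sufficiently aligned (see the cpu_workqueue_struct comment).
 */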
510 static inline void set_work_data(struct work_struct *work, unsigned long data,
511 				 unsigned long flags)
512 {
513 	BUG_ON(!work_pending(work));
514 	atomic_long_set(&work->data, data | flags | work_static(work));
515 }
516 
517 static void set_work_cwq(struct work_struct *work,
518 			 struct cpu_workqueue_struct *cwq,
519 			 unsigned long extra_flags)
520 {
521 	set_work_data(work, (unsigned long)cwq,
522 		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
523 }
524 
525 static void set_work_cpu(struct work_struct *work, unsigned int cpu)
526 {
527 	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
528 }
529 
530 static void clear_work_data(struct work_struct *work)
531 {
532 	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
533 }
534 
535 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
536 {
537 	unsigned long data = atomic_long_read(&work->data);
538 
539 	if (data & WORK_STRUCT_CWQ)
540 		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
541 	else
542 		return NULL;
543 }
544 
545 static struct global_cwq *get_work_gcwq(struct work_struct *work)
546 {
547 	unsigned long data = atomic_long_read(&work->data);
548 	unsigned int cpu;
549 
550 	if (data & WORK_STRUCT_CWQ)
551 		return ((struct cpu_workqueue_struct *)
552 			(data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
553 
554 	cpu = data >> WORK_STRUCT_FLAG_BITS;
555 	if (cpu == WORK_CPU_NONE)
556 		return NULL;
557 
558 	BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
559 	return get_gcwq(cpu);
560 }
561 
562 /*
563  * Policy functions.  These define the policies on how the global
564  * worker pool is managed.  Unless noted otherwise, these functions
565  * assume that they're being called with gcwq->lock held.
566  */
567 
568 static bool __need_more_worker(struct global_cwq *gcwq)
569 {
570 	return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
571 		gcwq->flags & GCWQ_HIGHPRI_PENDING;
572 }
573 
574 /*
575  * Need to wake up a worker?  Called from anything but currently
576  * running workers.
577  */
578 static bool need_more_worker(struct global_cwq *gcwq)
579 {
580 	return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
581 }
582 
583 /* Can I start working?  Called from busy but !running workers. */
584 static bool may_start_working(struct global_cwq *gcwq)
585 {
586 	return gcwq->nr_idle;
587 }
588 
589 /* Do I need to keep working?  Called from currently running workers. */
590 static bool keep_working(struct global_cwq *gcwq)
591 {
592 	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
593 
594 	return !list_empty(&gcwq->worklist) &&
595 		(atomic_read(nr_running) <= 1 ||
596 		 gcwq->flags & GCWQ_HIGHPRI_PENDING);
597 }
598 
599 /* Do we need a new worker?  Called from manager. */
600 static bool need_to_create_worker(struct global_cwq *gcwq)
601 {
602 	return need_more_worker(gcwq) && !may_start_working(gcwq);
603 }
604 
605 /* Do I need to be the manager? */
606 static bool need_to_manage_workers(struct global_cwq *gcwq)
607 {
608 	return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
609 }
610 
611 /* Do we have too many workers and should some go away? */
612 static bool too_many_workers(struct global_cwq *gcwq)
613 {
614 	bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
615 	int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
616 	int nr_busy = gcwq->nr_workers - nr_idle;
617 
618 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
619 }
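/*
 * A worked example of the ratio above: with 10 workers of which 6 are
 * idle (manager included), nr_busy is 4 and (6 - 2) * 4 = 16 >= 4, so
 * too_many_workers() returns true and the manager may start reaping
 * idle workers; with 3 idle out of 10, (3 - 2) * 4 = 4 < 7 and the
 * pool is left alone.
 */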
620 
621 /*
622  * Wake up functions.
623  */
624 
625 /* Return the first worker.  Safe with preemption disabled */
626 static struct worker *first_worker(struct global_cwq *gcwq)
627 {
628 	if (unlikely(list_empty(&gcwq->idle_list)))
629 		return NULL;
630 
631 	return list_first_entry(&gcwq->idle_list, struct worker, entry);
632 }
633 
634 /**
635  * wake_up_worker - wake up an idle worker
636  * @gcwq: gcwq to wake worker for
637  *
638  * Wake up the first idle worker of @gcwq.
639  *
640  * CONTEXT:
641  * spin_lock_irq(gcwq->lock).
642  */
643 static void wake_up_worker(struct global_cwq *gcwq)
644 {
645 	struct worker *worker = first_worker(gcwq);
646 
647 	if (likely(worker))
648 		wake_up_process(worker->task);
649 }
650 
651 /**
652  * wq_worker_waking_up - a worker is waking up
653  * @task: task waking up
654  * @cpu: CPU @task is waking up to
655  *
656  * This function is called during try_to_wake_up() when a worker is
657  * being awoken.
658  *
659  * CONTEXT:
660  * spin_lock_irq(rq->lock)
661  */
662 void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
663 {
664 	struct worker *worker = kthread_data(task);
665 
666 	if (!(worker->flags & WORKER_NOT_RUNNING))
667 		atomic_inc(get_gcwq_nr_running(cpu));
668 }
669 
670 /**
671  * wq_worker_sleeping - a worker is going to sleep
672  * @task: task going to sleep
673  * @cpu: CPU in question, must be the current CPU number
674  *
675  * This function is called during schedule() when a busy worker is
676  * going to sleep.  A worker on the same cpu can be woken up by
677  * returning a pointer to its task.
678  *
679  * CONTEXT:
680  * spin_lock_irq(rq->lock)
681  *
682  * RETURNS:
683  * Worker task on @cpu to wake up, %NULL if none.
684  */
685 struct task_struct *wq_worker_sleeping(struct task_struct *task,
686 				       unsigned int cpu)
687 {
688 	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
689 	struct global_cwq *gcwq = get_gcwq(cpu);
690 	atomic_t *nr_running = get_gcwq_nr_running(cpu);
691 
692 	if (worker->flags & WORKER_NOT_RUNNING)
693 		return NULL;
694 
695 	/* this can only happen on the local cpu */
696 	BUG_ON(cpu != raw_smp_processor_id());
697 
698 	/*
699 	 * The counterpart of the following dec_and_test, implied mb,
700 	 * worklist not empty test sequence is in insert_work().
701 	 * Please read comment there.
702 	 *
703 	 * NOT_RUNNING is clear.  This means that trustee is not in
704 	 * charge and we're running on the local cpu w/ rq lock held
705 	 * and preemption disabled, which in turn means that nobody else
706 	 * could be manipulating idle_list, so dereferencing idle_list
707 	 * without gcwq lock is safe.
708 	 */
709 	if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
710 		to_wakeup = first_worker(gcwq);
711 	return to_wakeup ? to_wakeup->task : NULL;
712 }
713 
714 /**
715  * worker_set_flags - set worker flags and adjust nr_running accordingly
716  * @worker: self
717  * @flags: flags to set
718  * @wakeup: wakeup an idle worker if necessary
719  *
720  * Set @flags in @worker->flags and adjust nr_running accordingly.  If
721  * nr_running becomes zero and @wakeup is %true, an idle worker is
722  * woken up.
723  *
724  * CONTEXT:
725  * spin_lock_irq(gcwq->lock)
726  */
727 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
728 				    bool wakeup)
729 {
730 	struct global_cwq *gcwq = worker->gcwq;
731 
732 	WARN_ON_ONCE(worker->task != current);
733 
734 	/*
735 	 * If transitioning into NOT_RUNNING, adjust nr_running and
736 	 * wake up an idle worker as necessary if requested by
737 	 * @wakeup.
738 	 */
739 	if ((flags & WORKER_NOT_RUNNING) &&
740 	    !(worker->flags & WORKER_NOT_RUNNING)) {
741 		atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
742 
743 		if (wakeup) {
744 			if (atomic_dec_and_test(nr_running) &&
745 			    !list_empty(&gcwq->worklist))
746 				wake_up_worker(gcwq);
747 		} else
748 			atomic_dec(nr_running);
749 	}
750 
751 	worker->flags |= flags;
752 }
753 
754 /**
755  * worker_clr_flags - clear worker flags and adjust nr_running accordingly
756  * @worker: self
757  * @flags: flags to clear
758  *
759  * Clear @flags in @worker->flags and adjust nr_running accordingly.
760  *
761  * CONTEXT:
762  * spin_lock_irq(gcwq->lock)
763  */
764 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
765 {
766 	struct global_cwq *gcwq = worker->gcwq;
767 	unsigned int oflags = worker->flags;
768 
769 	WARN_ON_ONCE(worker->task != current);
770 
771 	worker->flags &= ~flags;
772 
773 	/*
774 	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
775 	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
776 	 * of multiple flags, not a single flag.
777 	 */
778 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
779 		if (!(worker->flags & WORKER_NOT_RUNNING))
780 			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
781 }
782 
783 /**
784  * busy_worker_head - return the busy hash head for a work
785  * @gcwq: gcwq of interest
786  * @work: work to be hashed
787  *
788  * Return hash head of @gcwq for @work.
789  *
790  * CONTEXT:
791  * spin_lock_irq(gcwq->lock).
792  *
793  * RETURNS:
794  * Pointer to the hash head.
795  */
796 static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
797 					   struct work_struct *work)
798 {
799 	const int base_shift = ilog2(sizeof(struct work_struct));
800 	unsigned long v = (unsigned long)work;
801 
802 	/* simple shift and fold hash, do we need something better? */
803 	v >>= base_shift;
804 	v += v >> BUSY_WORKER_HASH_ORDER;
805 	v &= BUSY_WORKER_HASH_MASK;
806 
807 	return &gcwq->busy_hash[v];
808 }
809 
810 /**
811  * __find_worker_executing_work - find worker which is executing a work
812  * @gcwq: gcwq of interest
813  * @bwh: hash head as returned by busy_worker_head()
814  * @work: work to find worker for
815  *
816  * Find a worker which is executing @work on @gcwq.  @bwh should be
817  * the hash head obtained by calling busy_worker_head() with the same
818  * work.
819  *
820  * CONTEXT:
821  * spin_lock_irq(gcwq->lock).
822  *
823  * RETURNS:
824  * Pointer to worker which is executing @work if found, NULL
825  * otherwise.
826  */
827 static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
828 						   struct hlist_head *bwh,
829 						   struct work_struct *work)
830 {
831 	struct worker *worker;
832 	struct hlist_node *tmp;
833 
834 	hlist_for_each_entry(worker, tmp, bwh, hentry)
835 		if (worker->current_work == work)
836 			return worker;
837 	return NULL;
838 }
839 
840 /**
841  * find_worker_executing_work - find worker which is executing a work
842  * @gcwq: gcwq of interest
843  * @work: work to find worker for
844  *
845  * Find a worker which is executing @work on @gcwq.  This function is
846  * identical to __find_worker_executing_work() except that this
847  * function calculates @bwh itself.
848  *
849  * CONTEXT:
850  * spin_lock_irq(gcwq->lock).
851  *
852  * RETURNS:
853  * Pointer to worker which is executing @work if found, NULL
854  * otherwise.
855  */
856 static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
857 						 struct work_struct *work)
858 {
859 	return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
860 					    work);
861 }
862 
863 /**
864  * gcwq_determine_ins_pos - find insertion position
865  * @gcwq: gcwq of interest
866  * @cwq: cwq a work is being queued for
867  *
868  * A work for @cwq is about to be queued on @gcwq, determine insertion
869  * position for the work.  If @cwq is for HIGHPRI wq, the work is
870  * queued at the head of the queue but in FIFO order with respect to
871  * other HIGHPRI works; otherwise, at the end of the queue.  This
872  * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
873  * there are HIGHPRI works pending.
874  *
875  * CONTEXT:
876  * spin_lock_irq(gcwq->lock).
877  *
878  * RETURNS:
879  * Pointer to insertion position.
880  */
881 static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
882 					       struct cpu_workqueue_struct *cwq)
883 {
884 	struct work_struct *twork;
885 
886 	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
887 		return &gcwq->worklist;
888 
889 	list_for_each_entry(twork, &gcwq->worklist, entry) {
890 		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
891 
892 		if (!(tcwq->wq->flags & WQ_HIGHPRI))
893 			break;
894 	}
895 
896 	gcwq->flags |= GCWQ_HIGHPRI_PENDING;
897 	return &twork->entry;
898 }
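/*
 * To illustrate the resulting ordering: if the worklist holds
 * H1 H2 N1 N2 (H* queued through a WQ_HIGHPRI wq, N* through normal
 * ones), a newly queued highpri work H3 is inserted in front of N1,
 * giving H1 H2 H3 N1 N2; highpri works remain FIFO among themselves
 * but always precede normal works.
 */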
899 
900 /**
901  * insert_work - insert a work into gcwq
902  * @cwq: cwq @work belongs to
903  * @work: work to insert
904  * @head: insertion point
905  * @extra_flags: extra WORK_STRUCT_* flags to set
906  *
907  * Insert @work which belongs to @cwq into @gcwq after @head.
908  * @extra_flags is or'd to work_struct flags.
909  *
910  * CONTEXT:
911  * spin_lock_irq(gcwq->lock).
912  */
913 static void insert_work(struct cpu_workqueue_struct *cwq,
914 			struct work_struct *work, struct list_head *head,
915 			unsigned int extra_flags)
916 {
917 	struct global_cwq *gcwq = cwq->gcwq;
918 
919 	/* we own @work, set data and link */
920 	set_work_cwq(work, cwq, extra_flags);
921 
922 	/*
923 	 * Ensure that we get the right work->data if we see the
924 	 * result of list_add() below, see try_to_grab_pending().
925 	 */
926 	smp_wmb();
927 
928 	list_add_tail(&work->entry, head);
929 
930 	/*
931 	 * Ensure either wq_worker_sleeping() sees the above
932 	 * list_add_tail() or we see zero nr_running to avoid workers
933 	 * lying around lazily while there are works to be processed.
934 	 */
935 	smp_mb();
936 
937 	if (__need_more_worker(gcwq))
938 		wake_up_worker(gcwq);
939 }
940 
941 /*
942  * Test whether @work is being queued from another work executing on the
943  * same workqueue.  This is rather expensive and should only be used from
944  * cold paths.
945  */
946 static bool is_chained_work(struct workqueue_struct *wq)
947 {
948 	unsigned long flags;
949 	unsigned int cpu;
950 
951 	for_each_gcwq_cpu(cpu) {
952 		struct global_cwq *gcwq = get_gcwq(cpu);
953 		struct worker *worker;
954 		struct hlist_node *pos;
955 		int i;
956 
957 		spin_lock_irqsave(&gcwq->lock, flags);
958 		for_each_busy_worker(worker, i, pos, gcwq) {
959 			if (worker->task != current)
960 				continue;
961 			spin_unlock_irqrestore(&gcwq->lock, flags);
962 			/*
963 			 * I'm @worker, no locking necessary.  See if @work
964 			 * is headed to the same workqueue.
965 			 */
966 			return worker->current_cwq->wq == wq;
967 		}
968 		spin_unlock_irqrestore(&gcwq->lock, flags);
969 	}
970 	return false;
971 }
972 
973 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
974 			 struct work_struct *work)
975 {
976 	struct global_cwq *gcwq;
977 	struct cpu_workqueue_struct *cwq;
978 	struct list_head *worklist;
979 	unsigned int work_flags;
980 	unsigned long flags;
981 
982 	debug_work_activate(work);
983 
984 	/* if dying, only works from the same workqueue are allowed */
985 	if (unlikely(wq->flags & WQ_DYING) &&
986 	    WARN_ON_ONCE(!is_chained_work(wq)))
987 		return;
988 
989 	/* determine gcwq to use */
990 	if (!(wq->flags & WQ_UNBOUND)) {
991 		struct global_cwq *last_gcwq;
992 
993 		if (unlikely(cpu == WORK_CPU_UNBOUND))
994 			cpu = raw_smp_processor_id();
995 
996 		/*
997 		 * It's multi cpu.  If @wq is non-reentrant and @work
998 		 * was previously on a different cpu, it might still
999 		 * be running there, in which case the work needs to
1000 		 * be queued on that cpu to guarantee non-reentrance.
1001 		 */
1002 		gcwq = get_gcwq(cpu);
1003 		if (wq->flags & WQ_NON_REENTRANT &&
1004 		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
1005 			struct worker *worker;
1006 
1007 			spin_lock_irqsave(&last_gcwq->lock, flags);
1008 
1009 			worker = find_worker_executing_work(last_gcwq, work);
1010 
1011 			if (worker && worker->current_cwq->wq == wq)
1012 				gcwq = last_gcwq;
1013 			else {
1014 				/* meh... not running there, queue here */
1015 				spin_unlock_irqrestore(&last_gcwq->lock, flags);
1016 				spin_lock_irqsave(&gcwq->lock, flags);
1017 			}
1018 		} else
1019 			spin_lock_irqsave(&gcwq->lock, flags);
1020 	} else {
1021 		gcwq = get_gcwq(WORK_CPU_UNBOUND);
1022 		spin_lock_irqsave(&gcwq->lock, flags);
1023 	}
1024 
1025 	/* gcwq determined, get cwq and queue */
1026 	cwq = get_cwq(gcwq->cpu, wq);
1027 	trace_workqueue_queue_work(cpu, cwq, work);
1028 
1029 	BUG_ON(!list_empty(&work->entry));
1030 
1031 	cwq->nr_in_flight[cwq->work_color]++;
1032 	work_flags = work_color_to_flags(cwq->work_color);
1033 
1034 	if (likely(cwq->nr_active < cwq->max_active)) {
1035 		trace_workqueue_activate_work(work);
1036 		cwq->nr_active++;
1037 		worklist = gcwq_determine_ins_pos(gcwq, cwq);
1038 	} else {
1039 		work_flags |= WORK_STRUCT_DELAYED;
1040 		worklist = &cwq->delayed_works;
1041 	}
1042 
1043 	insert_work(cwq, work, worklist, work_flags);
1044 
1045 	spin_unlock_irqrestore(&gcwq->lock, flags);
1046 }
1047 
1048 /**
1049  * queue_work - queue work on a workqueue
1050  * @wq: workqueue to use
1051  * @work: work to queue
1052  *
1053  * Returns 0 if @work was already on a queue, non-zero otherwise.
1054  *
1055  * We queue the work to the CPU on which it was submitted, but if the CPU dies
1056  * it can be processed by another CPU.
1057  */
1058 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
1059 {
1060 	int ret;
1061 
1062 	ret = queue_work_on(get_cpu(), wq, work);
1063 	put_cpu();
1064 
1065 	return ret;
1066 }
1067 EXPORT_SYMBOL_GPL(queue_work);
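/*
 * A minimal caller-side sketch (illustrative only; example_wq_fn and
 * example_work are hypothetical names):
 *
 *	static void example_wq_fn(struct work_struct *work)
 *	{
 *		pr_info("example work running in process context\n");
 *	}
 *	static DECLARE_WORK(example_work, example_wq_fn);
 *
 *	// from process or irq context; a second call while the first is
 *	// still pending returns 0 and the work runs only once
 *	queue_work(system_wq, &example_work);
 */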
1068 
1069 /**
1070  * queue_work_on - queue work on specific cpu
1071  * @cpu: CPU number to execute work on
1072  * @wq: workqueue to use
1073  * @work: work to queue
1074  *
1075  * Returns 0 if @work was already on a queue, non-zero otherwise.
1076  *
1077  * We queue the work to a specific CPU, the caller must ensure it
1078  * can't go away.
1079  */
1080 int
1081 queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1082 {
1083 	int ret = 0;
1084 
1085 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1086 		__queue_work(cpu, wq, work);
1087 		ret = 1;
1088 	}
1089 	return ret;
1090 }
1091 EXPORT_SYMBOL_GPL(queue_work_on);
1092 
1093 static void delayed_work_timer_fn(unsigned long __data)
1094 {
1095 	struct delayed_work *dwork = (struct delayed_work *)__data;
1096 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1097 
1098 	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1099 }
1100 
1101 /**
1102  * queue_delayed_work - queue work on a workqueue after delay
1103  * @wq: workqueue to use
1104  * @dwork: delayable work to queue
1105  * @delay: number of jiffies to wait before queueing
1106  *
1107  * Returns 0 if @work was already on a queue, non-zero otherwise.
1108  */
1109 int queue_delayed_work(struct workqueue_struct *wq,
1110 			struct delayed_work *dwork, unsigned long delay)
1111 {
1112 	if (delay == 0)
1113 		return queue_work(wq, &dwork->work);
1114 
1115 	return queue_delayed_work_on(-1, wq, dwork, delay);
1116 }
1117 EXPORT_SYMBOL_GPL(queue_delayed_work);
1118 
1119 /**
1120  * queue_delayed_work_on - queue work on specific CPU after delay
1121  * @cpu: CPU number to execute work on
1122  * @wq: workqueue to use
1123  * @dwork: work to queue
1124  * @delay: number of jiffies to wait before queueing
1125  *
1126  * Returns 0 if @work was already on a queue, non-zero otherwise.
1127  */
1128 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1129 			struct delayed_work *dwork, unsigned long delay)
1130 {
1131 	int ret = 0;
1132 	struct timer_list *timer = &dwork->timer;
1133 	struct work_struct *work = &dwork->work;
1134 
1135 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1136 		unsigned int lcpu;
1137 
1138 		BUG_ON(timer_pending(timer));
1139 		BUG_ON(!list_empty(&work->entry));
1140 
1141 		timer_stats_timer_set_start_info(&dwork->timer);
1142 
1143 		/*
1144 		 * This stores cwq for the moment, for the timer_fn.
1145 		 * Note that the work's gcwq is preserved to allow
1146 		 * reentrance detection for delayed works.
1147 		 */
1148 		if (!(wq->flags & WQ_UNBOUND)) {
1149 			struct global_cwq *gcwq = get_work_gcwq(work);
1150 
1151 			if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1152 				lcpu = gcwq->cpu;
1153 			else
1154 				lcpu = raw_smp_processor_id();
1155 		} else
1156 			lcpu = WORK_CPU_UNBOUND;
1157 
1158 		set_work_cwq(work, get_cwq(lcpu, wq), 0);
1159 
1160 		timer->expires = jiffies + delay;
1161 		timer->data = (unsigned long)dwork;
1162 		timer->function = delayed_work_timer_fn;
1163 
1164 		if (unlikely(cpu >= 0))
1165 			add_timer_on(timer, cpu);
1166 		else
1167 			add_timer(timer);
1168 		ret = 1;
1169 	}
1170 	return ret;
1171 }
1172 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
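/*
 * A minimal delayed-work sketch (illustrative only; example_dwork_fn
 * and example_dwork are hypothetical names):
 *
 *	static void example_dwork_fn(struct work_struct *work)
 *	{
 *		struct delayed_work *dwork = to_delayed_work(work);
 *
 *		// requeue ourselves to run again in about a second
 *		queue_delayed_work(system_wq, dwork, HZ);
 *	}
 *	static DECLARE_DELAYED_WORK(example_dwork, example_dwork_fn);
 *
 *	// first queueing, fires after roughly one second
 *	queue_delayed_work(system_wq, &example_dwork, HZ);
 */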
1173 
1174 /**
1175  * worker_enter_idle - enter idle state
1176  * @worker: worker which is entering idle state
1177  *
1178  * @worker is entering idle state.  Update stats and idle timer if
1179  * necessary.
1180  *
1181  * LOCKING:
1182  * spin_lock_irq(gcwq->lock).
1183  */
1184 static void worker_enter_idle(struct worker *worker)
1185 {
1186 	struct global_cwq *gcwq = worker->gcwq;
1187 
1188 	BUG_ON(worker->flags & WORKER_IDLE);
1189 	BUG_ON(!list_empty(&worker->entry) &&
1190 	       (worker->hentry.next || worker->hentry.pprev));
1191 
1192 	/* can't use worker_set_flags(), also called from start_worker() */
1193 	worker->flags |= WORKER_IDLE;
1194 	gcwq->nr_idle++;
1195 	worker->last_active = jiffies;
1196 
1197 	/* idle_list is LIFO */
1198 	list_add(&worker->entry, &gcwq->idle_list);
1199 
1200 	if (likely(!(worker->flags & WORKER_ROGUE))) {
1201 		if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1202 			mod_timer(&gcwq->idle_timer,
1203 				  jiffies + IDLE_WORKER_TIMEOUT);
1204 	} else
1205 		wake_up_all(&gcwq->trustee_wait);
1206 
1207 	/* sanity check nr_running */
1208 	WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1209 		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1210 }
1211 
1212 /**
1213  * worker_leave_idle - leave idle state
1214  * @worker: worker which is leaving idle state
1215  *
1216  * @worker is leaving idle state.  Update stats.
1217  *
1218  * LOCKING:
1219  * spin_lock_irq(gcwq->lock).
1220  */
1221 static void worker_leave_idle(struct worker *worker)
1222 {
1223 	struct global_cwq *gcwq = worker->gcwq;
1224 
1225 	BUG_ON(!(worker->flags & WORKER_IDLE));
1226 	worker_clr_flags(worker, WORKER_IDLE);
1227 	gcwq->nr_idle--;
1228 	list_del_init(&worker->entry);
1229 }
1230 
1231 /**
1232  * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1233  * @worker: self
1234  *
1235  * Works which are scheduled while the cpu is online must at least be
1236  * scheduled to a worker which is bound to the cpu so that if they are
1237  * flushed from cpu callbacks while cpu is going down, they are
1238  * guaranteed to execute on the cpu.
1239  *
1240  * This function is to be used by rogue workers and rescuers to bind
1241  * themselves to the target cpu and may race with cpu going down or
1242  * coming online.  kthread_bind() can't be used because it may put the
1243  * worker to an already dead cpu and set_cpus_allowed_ptr() can't be used
1244  * verbatim as it's best effort and blocking and gcwq may be
1245  * [dis]associated in the meantime.
1246  *
1247  * This function tries set_cpus_allowed(), locks gcwq and verifies
1248  * the binding against GCWQ_DISASSOCIATED which is set during
1249  * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1250  * idle state or fetches works without dropping lock, it can guarantee
1251  * the scheduling requirement described in the first paragraph.
1252  *
1253  * CONTEXT:
1254  * Might sleep.  Called without any lock but returns with gcwq->lock
1255  * held.
1256  *
1257  * RETURNS:
1258  * %true if the associated gcwq is online (@worker is successfully
1259  * bound), %false if offline.
1260  */
1261 static bool worker_maybe_bind_and_lock(struct worker *worker)
1262 __acquires(&gcwq->lock)
1263 {
1264 	struct global_cwq *gcwq = worker->gcwq;
1265 	struct task_struct *task = worker->task;
1266 
1267 	while (true) {
1268 		/*
1269 		 * The following call may fail, succeed or succeed
1270 		 * without actually migrating the task to the cpu if
1271 		 * it races with cpu hotunplug operation.  Verify
1272 		 * against GCWQ_DISASSOCIATED.
1273 		 */
1274 		if (!(gcwq->flags & GCWQ_DISASSOCIATED))
1275 			set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
1276 
1277 		spin_lock_irq(&gcwq->lock);
1278 		if (gcwq->flags & GCWQ_DISASSOCIATED)
1279 			return false;
1280 		if (task_cpu(task) == gcwq->cpu &&
1281 		    cpumask_equal(&current->cpus_allowed,
1282 				  get_cpu_mask(gcwq->cpu)))
1283 			return true;
1284 		spin_unlock_irq(&gcwq->lock);
1285 
1286 		/* CPU has come up in between, retry migration */
1287 		cpu_relax();
1288 	}
1289 }
1290 
1291 /*
1292  * Function for worker->rebind_work used to rebind rogue busy workers
1293  * to the associated cpu which is coming back online.  This is
1294  * scheduled by cpu up but can race with other cpu hotplug operations
1295  * and may be executed twice without intervening cpu down.
1296  */
1297 static void worker_rebind_fn(struct work_struct *work)
1298 {
1299 	struct worker *worker = container_of(work, struct worker, rebind_work);
1300 	struct global_cwq *gcwq = worker->gcwq;
1301 
1302 	if (worker_maybe_bind_and_lock(worker))
1303 		worker_clr_flags(worker, WORKER_REBIND);
1304 
1305 	spin_unlock_irq(&gcwq->lock);
1306 }
1307 
1308 static struct worker *alloc_worker(void)
1309 {
1310 	struct worker *worker;
1311 
1312 	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1313 	if (worker) {
1314 		INIT_LIST_HEAD(&worker->entry);
1315 		INIT_LIST_HEAD(&worker->scheduled);
1316 		INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1317 		/* on creation a worker is in !idle && prep state */
1318 		worker->flags = WORKER_PREP;
1319 	}
1320 	return worker;
1321 }
1322 
1323 /**
1324  * create_worker - create a new workqueue worker
1325  * @gcwq: gcwq the new worker will belong to
1326  * @bind: whether to set affinity to @cpu or not
1327  *
1328  * Create a new worker which is bound to @gcwq.  The returned worker
1329  * can be started by calling start_worker() or destroyed using
1330  * destroy_worker().
1331  *
1332  * CONTEXT:
1333  * Might sleep.  Does GFP_KERNEL allocations.
1334  *
1335  * RETURNS:
1336  * Pointer to the newly created worker.
1337  */
1338 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1339 {
1340 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1341 	struct worker *worker = NULL;
1342 	int id = -1;
1343 
1344 	spin_lock_irq(&gcwq->lock);
1345 	while (ida_get_new(&gcwq->worker_ida, &id)) {
1346 		spin_unlock_irq(&gcwq->lock);
1347 		if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1348 			goto fail;
1349 		spin_lock_irq(&gcwq->lock);
1350 	}
1351 	spin_unlock_irq(&gcwq->lock);
1352 
1353 	worker = alloc_worker();
1354 	if (!worker)
1355 		goto fail;
1356 
1357 	worker->gcwq = gcwq;
1358 	worker->id = id;
1359 
1360 	if (!on_unbound_cpu)
1361 		worker->task = kthread_create(worker_thread, worker,
1362 					      "kworker/%u:%d", gcwq->cpu, id);
1363 	else
1364 		worker->task = kthread_create(worker_thread, worker,
1365 					      "kworker/u:%d", id);
1366 	if (IS_ERR(worker->task))
1367 		goto fail;
1368 
1369 	/*
1370 	 * A rogue worker will become a regular one if CPU comes
1371 	 * online later on.  Make sure every worker has
1372 	 * PF_THREAD_BOUND set.
1373 	 */
1374 	if (bind && !on_unbound_cpu)
1375 		kthread_bind(worker->task, gcwq->cpu);
1376 	else {
1377 		worker->task->flags |= PF_THREAD_BOUND;
1378 		if (on_unbound_cpu)
1379 			worker->flags |= WORKER_UNBOUND;
1380 	}
1381 
1382 	return worker;
1383 fail:
1384 	if (id >= 0) {
1385 		spin_lock_irq(&gcwq->lock);
1386 		ida_remove(&gcwq->worker_ida, id);
1387 		spin_unlock_irq(&gcwq->lock);
1388 	}
1389 	kfree(worker);
1390 	return NULL;
1391 }
1392 
1393 /**
1394  * start_worker - start a newly created worker
1395  * @worker: worker to start
1396  *
1397  * Make the gcwq aware of @worker and start it.
1398  *
1399  * CONTEXT:
1400  * spin_lock_irq(gcwq->lock).
1401  */
1402 static void start_worker(struct worker *worker)
1403 {
1404 	worker->flags |= WORKER_STARTED;
1405 	worker->gcwq->nr_workers++;
1406 	worker_enter_idle(worker);
1407 	wake_up_process(worker->task);
1408 }
1409 
1410 /**
1411  * destroy_worker - destroy a workqueue worker
1412  * @worker: worker to be destroyed
1413  *
1414  * Destroy @worker and adjust @gcwq stats accordingly.
1415  *
1416  * CONTEXT:
1417  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1418  */
1419 static void destroy_worker(struct worker *worker)
1420 {
1421 	struct global_cwq *gcwq = worker->gcwq;
1422 	int id = worker->id;
1423 
1424 	/* sanity check frenzy */
1425 	BUG_ON(worker->current_work);
1426 	BUG_ON(!list_empty(&worker->scheduled));
1427 
1428 	if (worker->flags & WORKER_STARTED)
1429 		gcwq->nr_workers--;
1430 	if (worker->flags & WORKER_IDLE)
1431 		gcwq->nr_idle--;
1432 
1433 	list_del_init(&worker->entry);
1434 	worker->flags |= WORKER_DIE;
1435 
1436 	spin_unlock_irq(&gcwq->lock);
1437 
1438 	kthread_stop(worker->task);
1439 	kfree(worker);
1440 
1441 	spin_lock_irq(&gcwq->lock);
1442 	ida_remove(&gcwq->worker_ida, id);
1443 }
1444 
1445 static void idle_worker_timeout(unsigned long __gcwq)
1446 {
1447 	struct global_cwq *gcwq = (void *)__gcwq;
1448 
1449 	spin_lock_irq(&gcwq->lock);
1450 
1451 	if (too_many_workers(gcwq)) {
1452 		struct worker *worker;
1453 		unsigned long expires;
1454 
1455 		/* idle_list is kept in LIFO order, check the last one */
1456 		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1457 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1458 
1459 		if (time_before(jiffies, expires))
1460 			mod_timer(&gcwq->idle_timer, expires);
1461 		else {
1462 			/* it's been idle for too long, wake up manager */
1463 			gcwq->flags |= GCWQ_MANAGE_WORKERS;
1464 			wake_up_worker(gcwq);
1465 		}
1466 	}
1467 
1468 	spin_unlock_irq(&gcwq->lock);
1469 }
1470 
1471 static bool send_mayday(struct work_struct *work)
1472 {
1473 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1474 	struct workqueue_struct *wq = cwq->wq;
1475 	unsigned int cpu;
1476 
1477 	if (!(wq->flags & WQ_RESCUER))
1478 		return false;
1479 
1480 	/* mayday mayday mayday */
1481 	cpu = cwq->gcwq->cpu;
1482 	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1483 	if (cpu == WORK_CPU_UNBOUND)
1484 		cpu = 0;
1485 	if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1486 		wake_up_process(wq->rescuer->task);
1487 	return true;
1488 }
1489 
1490 static void gcwq_mayday_timeout(unsigned long __gcwq)
1491 {
1492 	struct global_cwq *gcwq = (void *)__gcwq;
1493 	struct work_struct *work;
1494 
1495 	spin_lock_irq(&gcwq->lock);
1496 
1497 	if (need_to_create_worker(gcwq)) {
1498 		/*
1499 		 * We've been trying to create a new worker but
1500 		 * haven't been successful.  We might be hitting an
1501 		 * allocation deadlock.  Send distress signals to
1502 		 * rescuers.
1503 		 */
1504 		list_for_each_entry(work, &gcwq->worklist, entry)
1505 			send_mayday(work);
1506 	}
1507 
1508 	spin_unlock_irq(&gcwq->lock);
1509 
1510 	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1511 }
1512 
1513 /**
1514  * maybe_create_worker - create a new worker if necessary
1515  * @gcwq: gcwq to create a new worker for
1516  *
1517  * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
1518  * have at least one idle worker on return from this function.  If
1519  * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1520  * sent to all rescuers with works scheduled on @gcwq to resolve
1521  * possible allocation deadlock.
1522  *
1523  * On return, need_to_create_worker() is guaranteed to be false and
1524  * may_start_working() true.
1525  *
1526  * LOCKING:
1527  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1528  * multiple times.  Does GFP_KERNEL allocations.  Called only from
1529  * manager.
1530  *
1531  * RETURNS:
1532  * false if no action was taken and gcwq->lock stayed locked, true
1533  * otherwise.
1534  */
1535 static bool maybe_create_worker(struct global_cwq *gcwq)
1536 __releases(&gcwq->lock)
1537 __acquires(&gcwq->lock)
1538 {
1539 	if (!need_to_create_worker(gcwq))
1540 		return false;
1541 restart:
1542 	spin_unlock_irq(&gcwq->lock);
1543 
1544 	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1545 	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1546 
1547 	while (true) {
1548 		struct worker *worker;
1549 
1550 		worker = create_worker(gcwq, true);
1551 		if (worker) {
1552 			del_timer_sync(&gcwq->mayday_timer);
1553 			spin_lock_irq(&gcwq->lock);
1554 			start_worker(worker);
1555 			BUG_ON(need_to_create_worker(gcwq));
1556 			return true;
1557 		}
1558 
1559 		if (!need_to_create_worker(gcwq))
1560 			break;
1561 
1562 		__set_current_state(TASK_INTERRUPTIBLE);
1563 		schedule_timeout(CREATE_COOLDOWN);
1564 
1565 		if (!need_to_create_worker(gcwq))
1566 			break;
1567 	}
1568 
1569 	del_timer_sync(&gcwq->mayday_timer);
1570 	spin_lock_irq(&gcwq->lock);
1571 	if (need_to_create_worker(gcwq))
1572 		goto restart;
1573 	return true;
1574 }
1575 
1576 /**
1577  * maybe_destroy_workers - destroy workers which have been idle for a while
1578  * @gcwq: gcwq to destroy workers for
1579  *
1580  * Destroy @gcwq workers which have been idle for longer than
1581  * IDLE_WORKER_TIMEOUT.
1582  *
1583  * LOCKING:
1584  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1585  * multiple times.  Called only from manager.
1586  *
1587  * RETURNS:
1588  * false if no action was taken and gcwq->lock stayed locked, true
1589  * otherwise.
1590  */
1591 static bool maybe_destroy_workers(struct global_cwq *gcwq)
1592 {
1593 	bool ret = false;
1594 
1595 	while (too_many_workers(gcwq)) {
1596 		struct worker *worker;
1597 		unsigned long expires;
1598 
1599 		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1600 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1601 
1602 		if (time_before(jiffies, expires)) {
1603 			mod_timer(&gcwq->idle_timer, expires);
1604 			break;
1605 		}
1606 
1607 		destroy_worker(worker);
1608 		ret = true;
1609 	}
1610 
1611 	return ret;
1612 }
1613 
1614 /**
1615  * manage_workers - manage worker pool
1616  * @worker: self
1617  *
1618  * Assume the manager role and manage gcwq worker pool @worker belongs
1619  * to.  At any given time, there can be only zero or one manager per
1620  * gcwq.  The exclusion is handled automatically by this function.
1621  *
1622  * The caller can safely start processing works on false return.  On
1623  * true return, it's guaranteed that need_to_create_worker() is false
1624  * and may_start_working() is true.
1625  *
1626  * CONTEXT:
1627  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1628  * multiple times.  Does GFP_KERNEL allocations.
1629  *
1630  * RETURNS:
1631  * false if no action was taken and gcwq->lock stayed locked, true if
1632  * some action was taken.
1633  */
1634 static bool manage_workers(struct worker *worker)
1635 {
1636 	struct global_cwq *gcwq = worker->gcwq;
1637 	bool ret = false;
1638 
1639 	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1640 		return ret;
1641 
1642 	gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1643 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
1644 
1645 	/*
1646 	 * Destroy and then create so that may_start_working() is true
1647 	 * on return.
1648 	 */
1649 	ret |= maybe_destroy_workers(gcwq);
1650 	ret |= maybe_create_worker(gcwq);
1651 
1652 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1653 
1654 	/*
1655 	 * The trustee might be waiting to take over the manager
1656 	 * position, tell it we're done.
1657 	 */
1658 	if (unlikely(gcwq->trustee))
1659 		wake_up_all(&gcwq->trustee_wait);
1660 
1661 	return ret;
1662 }
1663 
1664 /**
1665  * move_linked_works - move linked works to a list
1666  * @work: start of series of works to be scheduled
1667  * @head: target list to append @work to
1668  * @nextp: out parameter for nested worklist walking
1669  *
1670  * Schedule linked works starting from @work to @head.  Work series to
1671  * be scheduled starts at @work and includes any consecutive work with
1672  * WORK_STRUCT_LINKED set in its predecessor.
1673  *
1674  * If @nextp is not NULL, it's updated to point to the next work of
1675  * the last scheduled work.  This allows move_linked_works() to be
1676  * nested inside outer list_for_each_entry_safe().
1677  *
1678  * CONTEXT:
1679  * spin_lock_irq(gcwq->lock).
1680  */
1681 static void move_linked_works(struct work_struct *work, struct list_head *head,
1682 			      struct work_struct **nextp)
1683 {
1684 	struct work_struct *n;
1685 
1686 	/*
1687 	 * Linked worklist will always end before the end of the list,
1688 	 * use NULL for list head.
1689 	 */
1690 	list_for_each_entry_safe_from(work, n, NULL, entry) {
1691 		list_move_tail(&work->entry, head);
1692 		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1693 			break;
1694 	}
1695 
1696 	/*
1697 	 * If we're already inside safe list traversal and have moved
1698 	 * multiple works to the scheduled queue, the next position
1699 	 * needs to be updated.
1700 	 */
1701 	if (nextp)
1702 		*nextp = n;
1703 }
1704 
1705 static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1706 {
1707 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
1708 						    struct work_struct, entry);
1709 	struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1710 
1711 	trace_workqueue_activate_work(work);
1712 	move_linked_works(work, pos, NULL);
1713 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1714 	cwq->nr_active++;
1715 }
1716 
1717 /**
1718  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1719  * @cwq: cwq of interest
1720  * @color: color of work which left the queue
1721  * @delayed: for a delayed work
1722  *
1723  * A work either has completed or is removed from pending queue,
1724  * decrement nr_in_flight of its cwq and handle workqueue flushing.
1725  *
1726  * CONTEXT:
1727  * spin_lock_irq(gcwq->lock).
1728  */
1729 static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1730 				 bool delayed)
1731 {
1732 	/* ignore uncolored works */
1733 	if (color == WORK_NO_COLOR)
1734 		return;
1735 
1736 	cwq->nr_in_flight[color]--;
1737 
1738 	if (!delayed) {
1739 		cwq->nr_active--;
1740 		if (!list_empty(&cwq->delayed_works)) {
1741 			/* one down, submit a delayed one */
1742 			if (cwq->nr_active < cwq->max_active)
1743 				cwq_activate_first_delayed(cwq);
1744 		}
1745 	}
1746 
1747 	/* is flush in progress and are we at the flushing tip? */
1748 	if (likely(cwq->flush_color != color))
1749 		return;
1750 
1751 	/* are there still in-flight works? */
1752 	if (cwq->nr_in_flight[color])
1753 		return;
1754 
1755 	/* this cwq is done, clear flush_color */
1756 	cwq->flush_color = -1;
1757 
1758 	/*
1759 	 * If this was the last cwq, wake up the first flusher.  It
1760 	 * will handle the rest.
1761 	 */
1762 	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1763 		complete(&cwq->wq->first_flusher->done);
1764 }
1765 
1766 /**
1767  * process_one_work - process single work
1768  * @worker: self
1769  * @work: work to process
1770  *
1771  * Process @work.  This function contains all the logic necessary to
1772  * process a single work including synchronization against and
1773  * interaction with other workers on the same cpu, queueing and
1774  * flushing.  As long as the context requirement is met, any worker can
1775  * call this function to process a work.
1776  *
1777  * CONTEXT:
1778  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1779  */
1780 static void process_one_work(struct worker *worker, struct work_struct *work)
1781 __releases(&gcwq->lock)
1782 __acquires(&gcwq->lock)
1783 {
1784 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1785 	struct global_cwq *gcwq = cwq->gcwq;
1786 	struct hlist_head *bwh = busy_worker_head(gcwq, work);
1787 	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1788 	work_func_t f = work->func;
1789 	int work_color;
1790 	struct worker *collision;
1791 #ifdef CONFIG_LOCKDEP
1792 	/*
1793 	 * It is permissible to free the struct work_struct from
1794 	 * inside the function that is called from it, this we need to
1795 	 * take into account for lockdep too.  To avoid bogus "held
1796 	 * lock freed" warnings as well as problems when looking into
1797 	 * work->lockdep_map, make a copy and use that here.
1798 	 */
1799 	struct lockdep_map lockdep_map = work->lockdep_map;
1800 #endif
1801 	/*
1802 	 * A single work shouldn't be executed concurrently by
1803 	 * multiple workers on a single cpu.  Check whether anyone is
1804 	 * already processing the work.  If so, defer the work to the
1805 	 * currently executing one.
1806 	 */
1807 	collision = __find_worker_executing_work(gcwq, bwh, work);
1808 	if (unlikely(collision)) {
1809 		move_linked_works(work, &collision->scheduled, NULL);
1810 		return;
1811 	}
1812 
1813 	/* claim and process */
1814 	debug_work_deactivate(work);
1815 	hlist_add_head(&worker->hentry, bwh);
1816 	worker->current_work = work;
1817 	worker->current_cwq = cwq;
1818 	work_color = get_work_color(work);
1819 
1820 	/* record the current cpu number in the work data and dequeue */
1821 	set_work_cpu(work, gcwq->cpu);
1822 	list_del_init(&work->entry);
1823 
1824 	/*
1825 	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1826 	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1827 	 */
1828 	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1829 		struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1830 						struct work_struct, entry);
1831 
1832 		if (!list_empty(&gcwq->worklist) &&
1833 		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1834 			wake_up_worker(gcwq);
1835 		else
1836 			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1837 	}
1838 
1839 	/*
1840 	 * CPU intensive works don't participate in concurrency
1841 	 * management.  They're the scheduler's responsibility.
1842 	 */
1843 	if (unlikely(cpu_intensive))
1844 		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1845 
1846 	spin_unlock_irq(&gcwq->lock);
1847 
1848 	work_clear_pending(work);
1849 	lock_map_acquire_read(&cwq->wq->lockdep_map);
1850 	lock_map_acquire(&lockdep_map);
1851 	trace_workqueue_execute_start(work);
1852 	f(work);
1853 	/*
1854 	 * While we must be careful to not use "work" after this, the trace
1855 	 * point will only record its address.
1856 	 */
1857 	trace_workqueue_execute_end(work);
1858 	lock_map_release(&lockdep_map);
1859 	lock_map_release(&cwq->wq->lockdep_map);
1860 
1861 	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1862 		printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1863 		       "%s/0x%08x/%d\n",
1864 		       current->comm, preempt_count(), task_pid_nr(current));
1865 		printk(KERN_ERR "    last function: ");
1866 		print_symbol("%s\n", (unsigned long)f);
1867 		debug_show_held_locks(current);
1868 		dump_stack();
1869 	}
1870 
1871 	spin_lock_irq(&gcwq->lock);
1872 
1873 	/* clear cpu intensive status */
1874 	if (unlikely(cpu_intensive))
1875 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1876 
1877 	/* we're done with it, release */
1878 	hlist_del_init(&worker->hentry);
1879 	worker->current_work = NULL;
1880 	worker->current_cwq = NULL;
1881 	cwq_dec_nr_in_flight(cwq, work_color, false);
1882 }
1883 
1884 /**
1885  * process_scheduled_works - process scheduled works
1886  * @worker: self
1887  *
1888  * Process all scheduled works.  Please note that the scheduled list
1889  * may change while processing a work, so this function repeatedly
1890  * fetches a work from the top and executes it.
1891  *
1892  * CONTEXT:
1893  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1894  * multiple times.
1895  */
1896 static void process_scheduled_works(struct worker *worker)
1897 {
1898 	while (!list_empty(&worker->scheduled)) {
1899 		struct work_struct *work = list_first_entry(&worker->scheduled,
1900 						struct work_struct, entry);
1901 		process_one_work(worker, work);
1902 	}
1903 }
1904 
1905 /**
1906  * worker_thread - the worker thread function
1907  * @__worker: self
1908  *
1909  * The gcwq worker thread function.  There's a single dynamic pool of
1910  * these per cpu.  These workers process all works regardless of
1911  * their specific target workqueue.  The only exception is works which
1912  * belong to workqueues with a rescuer which will be explained in
1913  * rescuer_thread().
1914  */
1915 static int worker_thread(void *__worker)
1916 {
1917 	struct worker *worker = __worker;
1918 	struct global_cwq *gcwq = worker->gcwq;
1919 
1920 	/* tell the scheduler that this is a workqueue worker */
1921 	worker->task->flags |= PF_WQ_WORKER;
1922 woke_up:
1923 	spin_lock_irq(&gcwq->lock);
1924 
1925 	/* DIE can be set only while we're idle, checking here is enough */
1926 	if (worker->flags & WORKER_DIE) {
1927 		spin_unlock_irq(&gcwq->lock);
1928 		worker->task->flags &= ~PF_WQ_WORKER;
1929 		return 0;
1930 	}
1931 
1932 	worker_leave_idle(worker);
1933 recheck:
1934 	/* no more worker necessary? */
1935 	if (!need_more_worker(gcwq))
1936 		goto sleep;
1937 
1938 	/* do we need to manage? */
1939 	if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1940 		goto recheck;
1941 
1942 	/*
1943 	 * ->scheduled list can only be filled while a worker is
1944 	 * preparing to process a work or actually processing it.
1945 	 * Make sure nobody diddled with it while I was sleeping.
1946 	 */
1947 	BUG_ON(!list_empty(&worker->scheduled));
1948 
1949 	/*
1950 	 * When control reaches this point, we're guaranteed to have
1951 	 * at least one idle worker or that someone else has already
1952 	 * assumed the manager role.
1953 	 */
1954 	worker_clr_flags(worker, WORKER_PREP);
1955 
1956 	do {
1957 		struct work_struct *work =
1958 			list_first_entry(&gcwq->worklist,
1959 					 struct work_struct, entry);
1960 
1961 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1962 			/* optimization path, not strictly necessary */
1963 			process_one_work(worker, work);
1964 			if (unlikely(!list_empty(&worker->scheduled)))
1965 				process_scheduled_works(worker);
1966 		} else {
1967 			move_linked_works(work, &worker->scheduled, NULL);
1968 			process_scheduled_works(worker);
1969 		}
1970 	} while (keep_working(gcwq));
1971 
1972 	worker_set_flags(worker, WORKER_PREP, false);
1973 sleep:
1974 	if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1975 		goto recheck;
1976 
1977 	/*
1978 	 * gcwq->lock is held and there's no work to process and no
1979 	 * need to manage, sleep.  Workers are woken up only while
1980 	 * holding gcwq->lock or from local cpu, so setting the
1981 	 * current state before releasing gcwq->lock is enough to
1982 	 * prevent losing any event.
1983 	 */
1984 	worker_enter_idle(worker);
1985 	__set_current_state(TASK_INTERRUPTIBLE);
1986 	spin_unlock_irq(&gcwq->lock);
1987 	schedule();
1988 	goto woke_up;
1989 }
1990 
1991 /**
1992  * rescuer_thread - the rescuer thread function
1993  * @__wq: the associated workqueue
1994  *
1995  * Workqueue rescuer thread function.  There's one rescuer for each
1996  * workqueue which has WQ_RESCUER set.
1997  *
1998  * Regular work processing on a gcwq may block trying to create a new
1999  * worker, which uses a GFP_KERNEL allocation that has a slight chance
2000  * of developing into a deadlock if some works currently on the same
2001  * queue need to be processed to satisfy the GFP_KERNEL allocation.
2002  * This is the problem the rescuer solves.
2003  *
2004  * When such a condition is possible, the gcwq summons the rescuers of
2005  * all workqueues which have works queued on the gcwq and lets them
2006  * process those works so that forward progress can be guaranteed.
2007  *
2008  * This should happen rarely.
2009  */
2010 static int rescuer_thread(void *__wq)
2011 {
2012 	struct workqueue_struct *wq = __wq;
2013 	struct worker *rescuer = wq->rescuer;
2014 	struct list_head *scheduled = &rescuer->scheduled;
2015 	bool is_unbound = wq->flags & WQ_UNBOUND;
2016 	unsigned int cpu;
2017 
2018 	set_user_nice(current, RESCUER_NICE_LEVEL);
2019 repeat:
2020 	set_current_state(TASK_INTERRUPTIBLE);
2021 
2022 	if (kthread_should_stop())
2023 		return 0;
2024 
2025 	/*
2026 	 * See whether any cpu is asking for help.  Unbound
2027 	 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
2028 	 */
2029 	for_each_mayday_cpu(cpu, wq->mayday_mask) {
2030 		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2031 		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2032 		struct global_cwq *gcwq = cwq->gcwq;
2033 		struct work_struct *work, *n;
2034 
2035 		__set_current_state(TASK_RUNNING);
2036 		mayday_clear_cpu(cpu, wq->mayday_mask);
2037 
2038 		/* migrate to the target cpu if possible */
2039 		rescuer->gcwq = gcwq;
2040 		worker_maybe_bind_and_lock(rescuer);
2041 
2042 		/*
2043 		 * Slurp in all works issued via this workqueue and
2044 		 * process'em.
2045 		 */
2046 		BUG_ON(!list_empty(&rescuer->scheduled));
2047 		list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2048 			if (get_work_cwq(work) == cwq)
2049 				move_linked_works(work, scheduled, &n);
2050 
2051 		process_scheduled_works(rescuer);
2052 
2053 		/*
2054 		 * Leave this gcwq.  If keep_working() is %true, notify a
2055 		 * regular worker; otherwise, we end up with 0 concurrency
2056 		 * and stalling the execution.
2057 		 */
2058 		if (keep_working(gcwq))
2059 			wake_up_worker(gcwq);
2060 
2061 		spin_unlock_irq(&gcwq->lock);
2062 	}
2063 
2064 	schedule();
2065 	goto repeat;
2066 }
2067 
2068 struct wq_barrier {
2069 	struct work_struct	work;
2070 	struct completion	done;
2071 };
2072 
2073 static void wq_barrier_func(struct work_struct *work)
2074 {
2075 	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2076 	complete(&barr->done);
2077 }
2078 
2079 /**
2080  * insert_wq_barrier - insert a barrier work
2081  * @cwq: cwq to insert barrier into
2082  * @barr: wq_barrier to insert
2083  * @target: target work to attach @barr to
2084  * @worker: worker currently executing @target, NULL if @target is not executing
2085  *
2086  * @barr is linked to @target such that @barr is completed only after
2087  * @target finishes execution.  Please note that the ordering
2088  * guarantee is observed only with respect to @target and on the local
2089  * cpu.
2090  *
2091  * Currently, a queued barrier can't be canceled.  This is because
2092  * try_to_grab_pending() can't determine whether the work to be
2093  * grabbed is at the head of the queue and thus can't clear the LINKED
2094  * flag of the previous work, while there must be a valid next work
2095  * after a work with the LINKED flag set.
2096  *
2097  * Note that when @worker is non-NULL, @target may be modified
2098  * underneath us, so we can't reliably determine cwq from @target.
2099  *
2100  * CONTEXT:
2101  * spin_lock_irq(gcwq->lock).
2102  */
2103 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2104 			      struct wq_barrier *barr,
2105 			      struct work_struct *target, struct worker *worker)
2106 {
2107 	struct list_head *head;
2108 	unsigned int linked = 0;
2109 
2110 	/*
2111 	 * debugobject calls are safe here even with gcwq->lock locked
2112 	 * as we know for sure that this will not trigger any of the
2113 	 * checks and call back into the fixup functions where we
2114 	 * might deadlock.
2115 	 */
2116 	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2117 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2118 	init_completion(&barr->done);
2119 
2120 	/*
2121 	 * If @target is currently being executed, schedule the
2122 	 * barrier to the worker; otherwise, put it after @target.
2123 	 */
2124 	if (worker)
2125 		head = worker->scheduled.next;
2126 	else {
2127 		unsigned long *bits = work_data_bits(target);
2128 
2129 		head = target->entry.next;
2130 		/* there can already be other linked works, inherit and set */
2131 		linked = *bits & WORK_STRUCT_LINKED;
2132 		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
2133 	}
2134 
2135 	debug_work_activate(&barr->work);
2136 	insert_work(cwq, &barr->work, head,
2137 		    work_color_to_flags(WORK_NO_COLOR) | linked);
2138 }
2139 
2140 /**
2141  * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2142  * @wq: workqueue being flushed
2143  * @flush_color: new flush color, < 0 for no-op
2144  * @work_color: new work color, < 0 for no-op
2145  *
2146  * Prepare cwqs for workqueue flushing.
2147  *
2148  * If @flush_color is non-negative, flush_color on all cwqs should be
2149  * -1.  If no cwq has in-flight commands at the specified color, all
2150  * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
2151  * has in-flight commands, its cwq->flush_color is set to
2152  * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2153  * wakeup logic is armed and %true is returned.
2154  *
2155  * The caller should have initialized @wq->first_flusher prior to
2156  * calling this function with non-negative @flush_color.  If
2157  * @flush_color is negative, no flush color update is done and %false
2158  * is returned.
2159  *
2160  * If @work_color is non-negative, all cwqs should have the same
2161  * work_color which is previous to @work_color and all will be
2162  * advanced to @work_color.
2163  *
2164  * CONTEXT:
2165  * mutex_lock(wq->flush_mutex).
2166  *
2167  * RETURNS:
2168  * %true if @flush_color >= 0 and there's something to flush.  %false
2169  * otherwise.
2170  */
2171 static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2172 				      int flush_color, int work_color)
2173 {
2174 	bool wait = false;
2175 	unsigned int cpu;
2176 
2177 	if (flush_color >= 0) {
2178 		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
2179 		atomic_set(&wq->nr_cwqs_to_flush, 1);
2180 	}
2181 
2182 	for_each_cwq_cpu(cpu, wq) {
2183 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2184 		struct global_cwq *gcwq = cwq->gcwq;
2185 
2186 		spin_lock_irq(&gcwq->lock);
2187 
2188 		if (flush_color >= 0) {
2189 			BUG_ON(cwq->flush_color != -1);
2190 
2191 			if (cwq->nr_in_flight[flush_color]) {
2192 				cwq->flush_color = flush_color;
2193 				atomic_inc(&wq->nr_cwqs_to_flush);
2194 				wait = true;
2195 			}
2196 		}
2197 
2198 		if (work_color >= 0) {
2199 			BUG_ON(work_color != work_next_color(cwq->work_color));
2200 			cwq->work_color = work_color;
2201 		}
2202 
2203 		spin_unlock_irq(&gcwq->lock);
2204 	}
2205 
2206 	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2207 		complete(&wq->first_flusher->done);
2208 
2209 	return wait;
2210 }
2211 
2212 /**
2213  * flush_workqueue - ensure that any scheduled work has run to completion.
2214  * @wq: workqueue to flush
2215  *
2216  * Forces execution of the workqueue and blocks until its completion.
2217  * This is typically used in driver shutdown handlers.
2218  *
2219  * We sleep until all works which were queued on entry have been handled,
2220  * but we are not livelocked by new incoming ones.
2221  */
2222 void flush_workqueue(struct workqueue_struct *wq)
2223 {
2224 	struct wq_flusher this_flusher = {
2225 		.list = LIST_HEAD_INIT(this_flusher.list),
2226 		.flush_color = -1,
2227 		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2228 	};
2229 	int next_color;
2230 
2231 	lock_map_acquire(&wq->lockdep_map);
2232 	lock_map_release(&wq->lockdep_map);
2233 
2234 	mutex_lock(&wq->flush_mutex);
2235 
2236 	/*
2237 	 * Start-to-wait phase
2238 	 */
2239 	next_color = work_next_color(wq->work_color);
2240 
2241 	if (next_color != wq->flush_color) {
2242 		/*
2243 		 * Color space is not full.  The current work_color
2244 		 * becomes our flush_color and work_color is advanced
2245 		 * by one.
2246 		 */
2247 		BUG_ON(!list_empty(&wq->flusher_overflow));
2248 		this_flusher.flush_color = wq->work_color;
2249 		wq->work_color = next_color;
2250 
2251 		if (!wq->first_flusher) {
2252 			/* no flush in progress, become the first flusher */
2253 			BUG_ON(wq->flush_color != this_flusher.flush_color);
2254 
2255 			wq->first_flusher = &this_flusher;
2256 
2257 			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2258 						       wq->work_color)) {
2259 				/* nothing to flush, done */
2260 				wq->flush_color = next_color;
2261 				wq->first_flusher = NULL;
2262 				goto out_unlock;
2263 			}
2264 		} else {
2265 			/* wait in queue */
2266 			BUG_ON(wq->flush_color == this_flusher.flush_color);
2267 			list_add_tail(&this_flusher.list, &wq->flusher_queue);
2268 			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2269 		}
2270 	} else {
2271 		/*
2272 		 * Oops, color space is full, wait on overflow queue.
2273 		 * The next flush completion will assign us
2274 		 * flush_color and transfer to flusher_queue.
2275 		 */
2276 		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2277 	}
2278 
2279 	mutex_unlock(&wq->flush_mutex);
2280 
2281 	wait_for_completion(&this_flusher.done);
2282 
2283 	/*
2284 	 * Wake-up-and-cascade phase
2285 	 *
2286 	 * First flushers are responsible for cascading flushes and
2287 	 * handling overflow.  Non-first flushers can simply return.
2288 	 */
2289 	if (wq->first_flusher != &this_flusher)
2290 		return;
2291 
2292 	mutex_lock(&wq->flush_mutex);
2293 
2294 	/* we might have raced, check again with mutex held */
2295 	if (wq->first_flusher != &this_flusher)
2296 		goto out_unlock;
2297 
2298 	wq->first_flusher = NULL;
2299 
2300 	BUG_ON(!list_empty(&this_flusher.list));
2301 	BUG_ON(wq->flush_color != this_flusher.flush_color);
2302 
2303 	while (true) {
2304 		struct wq_flusher *next, *tmp;
2305 
2306 		/* complete all the flushers sharing the current flush color */
2307 		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2308 			if (next->flush_color != wq->flush_color)
2309 				break;
2310 			list_del_init(&next->list);
2311 			complete(&next->done);
2312 		}
2313 
2314 		BUG_ON(!list_empty(&wq->flusher_overflow) &&
2315 		       wq->flush_color != work_next_color(wq->work_color));
2316 
2317 		/* this flush_color is finished, advance by one */
2318 		wq->flush_color = work_next_color(wq->flush_color);
2319 
2320 		/* one color has been freed, handle overflow queue */
2321 		if (!list_empty(&wq->flusher_overflow)) {
2322 			/*
2323 			 * Assign the same color to all overflowed
2324 			 * flushers, advance work_color and append to
2325 			 * flusher_queue.  This is the start-to-wait
2326 			 * phase for these overflowed flushers.
2327 			 */
2328 			list_for_each_entry(tmp, &wq->flusher_overflow, list)
2329 				tmp->flush_color = wq->work_color;
2330 
2331 			wq->work_color = work_next_color(wq->work_color);
2332 
2333 			list_splice_tail_init(&wq->flusher_overflow,
2334 					      &wq->flusher_queue);
2335 			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2336 		}
2337 
2338 		if (list_empty(&wq->flusher_queue)) {
2339 			BUG_ON(wq->flush_color != wq->work_color);
2340 			break;
2341 		}
2342 
2343 		/*
2344 		 * Need to flush more colors.  Make the next flusher
2345 		 * the new first flusher and arm cwqs.
2346 		 */
2347 		BUG_ON(wq->flush_color == wq->work_color);
2348 		BUG_ON(wq->flush_color != next->flush_color);
2349 
2350 		list_del_init(&next->list);
2351 		wq->first_flusher = next;
2352 
2353 		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2354 			break;
2355 
2356 		/*
2357 		 * Meh... this color is already done, clear first
2358 		 * flusher and repeat cascading.
2359 		 */
2360 		wq->first_flusher = NULL;
2361 	}
2362 
2363 out_unlock:
2364 	mutex_unlock(&wq->flush_mutex);
2365 }
2366 EXPORT_SYMBOL_GPL(flush_workqueue);
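
/*
 * Illustrative usage sketch (the "mydrv" structure, its "wq" member and
 * "dma_buf" are hypothetical): a driver shutdown handler typically
 * flushes its workqueue before freeing resources the queued works may
 * still touch.
 *
 *	static void mydrv_shutdown(struct mydrv *drv)
 *	{
 *		flush_workqueue(drv->wq);
 *		kfree(drv->dma_buf);
 *	}
 */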
2367 
2368 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2369 			     bool wait_executing)
2370 {
2371 	struct worker *worker = NULL;
2372 	struct global_cwq *gcwq;
2373 	struct cpu_workqueue_struct *cwq;
2374 
2375 	might_sleep();
2376 	gcwq = get_work_gcwq(work);
2377 	if (!gcwq)
2378 		return false;
2379 
2380 	spin_lock_irq(&gcwq->lock);
2381 	if (!list_empty(&work->entry)) {
2382 		/*
2383 		 * See the comment near try_to_grab_pending()->smp_rmb().
2384 		 * If it was re-queued to a different gcwq under us, we
2385 		 * are not going to wait.
2386 		 */
2387 		smp_rmb();
2388 		cwq = get_work_cwq(work);
2389 		if (unlikely(!cwq || gcwq != cwq->gcwq))
2390 			goto already_gone;
2391 	} else if (wait_executing) {
2392 		worker = find_worker_executing_work(gcwq, work);
2393 		if (!worker)
2394 			goto already_gone;
2395 		cwq = worker->current_cwq;
2396 	} else
2397 		goto already_gone;
2398 
2399 	insert_wq_barrier(cwq, barr, work, worker);
2400 	spin_unlock_irq(&gcwq->lock);
2401 
2402 	/*
2403 	 * If @max_active is 1 or rescuer is in use, flushing another work
2404 	 * item on the same workqueue may lead to deadlock.  Make sure the
2405 	 * flusher is not running on the same workqueue by verifying write
2406 	 * access.
2407 	 */
2408 	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2409 		lock_map_acquire(&cwq->wq->lockdep_map);
2410 	else
2411 		lock_map_acquire_read(&cwq->wq->lockdep_map);
2412 	lock_map_release(&cwq->wq->lockdep_map);
2413 
2414 	return true;
2415 already_gone:
2416 	spin_unlock_irq(&gcwq->lock);
2417 	return false;
2418 }
2419 
2420 /**
2421  * flush_work - wait for a work to finish executing the last queueing instance
2422  * @work: the work to flush
2423  *
2424  * Wait until @work has finished execution.  This function considers
2425  * only the last queueing instance of @work.  If @work has been
2426  * enqueued across different CPUs on a non-reentrant workqueue or on
2427  * multiple workqueues, @work might still be executing on return on
2428  * some of the CPUs from earlier queueing.
2429  *
2430  * If @work was queued only on a non-reentrant, ordered or unbound
2431  * workqueue, @work is guaranteed to be idle on return if it hasn't
2432  * been requeued since flush started.
2433  *
2434  * RETURNS:
2435  * %true if flush_work() waited for the work to finish execution,
2436  * %false if it was already idle.
2437  */
2438 bool flush_work(struct work_struct *work)
2439 {
2440 	struct wq_barrier barr;
2441 
2442 	if (start_flush_work(work, &barr, true)) {
2443 		wait_for_completion(&barr.done);
2444 		destroy_work_on_stack(&barr.work);
2445 		return true;
2446 	} else
2447 		return false;
2448 }
2449 EXPORT_SYMBOL_GPL(flush_work);
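
/*
 * Illustrative usage sketch ("mydrv" and its members are hypothetical):
 * wait for the last queueing instance of a specific work item before
 * touching the state it operates on.
 *
 *	queue_work(system_wq, &drv->reset_work);
 *	...
 *	if (flush_work(&drv->reset_work))
 *		pr_debug("mydrv: waited for reset_work to finish\n");
 */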
2450 
2451 static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2452 {
2453 	struct wq_barrier barr;
2454 	struct worker *worker;
2455 
2456 	spin_lock_irq(&gcwq->lock);
2457 
2458 	worker = find_worker_executing_work(gcwq, work);
2459 	if (unlikely(worker))
2460 		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2461 
2462 	spin_unlock_irq(&gcwq->lock);
2463 
2464 	if (unlikely(worker)) {
2465 		wait_for_completion(&barr.done);
2466 		destroy_work_on_stack(&barr.work);
2467 		return true;
2468 	} else
2469 		return false;
2470 }
2471 
2472 static bool wait_on_work(struct work_struct *work)
2473 {
2474 	bool ret = false;
2475 	int cpu;
2476 
2477 	might_sleep();
2478 
2479 	lock_map_acquire(&work->lockdep_map);
2480 	lock_map_release(&work->lockdep_map);
2481 
2482 	for_each_gcwq_cpu(cpu)
2483 		ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2484 	return ret;
2485 }
2486 
2487 /**
2488  * flush_work_sync - wait until a work has finished execution
2489  * @work: the work to flush
2490  *
2491  * Wait until @work has finished execution.  On return, it's
2492  * guaranteed that all queueing instances of @work which happened
2493  * before this function is called are finished.  In other words, if
2494  * @work hasn't been requeued since this function was called, @work is
2495  * guaranteed to be idle on return.
2496  *
2497  * RETURNS:
2498  * %true if flush_work_sync() waited for the work to finish execution,
2499  * %false if it was already idle.
2500  */
2501 bool flush_work_sync(struct work_struct *work)
2502 {
2503 	struct wq_barrier barr;
2504 	bool pending, waited;
2505 
2506 	/* we'll wait for executions separately, queue barr only if pending */
2507 	pending = start_flush_work(work, &barr, false);
2508 
2509 	/* wait for executions to finish */
2510 	waited = wait_on_work(work);
2511 
2512 	/* wait for the pending one */
2513 	if (pending) {
2514 		wait_for_completion(&barr.done);
2515 		destroy_work_on_stack(&barr.work);
2516 	}
2517 
2518 	return pending || waited;
2519 }
2520 EXPORT_SYMBOL_GPL(flush_work_sync);
2521 
2522 /*
2523  * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2524  * so this work can't be re-armed in any way.
2525  */
2526 static int try_to_grab_pending(struct work_struct *work)
2527 {
2528 	struct global_cwq *gcwq;
2529 	int ret = -1;
2530 
2531 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2532 		return 0;
2533 
2534 	/*
2535 	 * The queueing is in progress, or it is already queued. Try to
2536 	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2537 	 */
2538 	gcwq = get_work_gcwq(work);
2539 	if (!gcwq)
2540 		return ret;
2541 
2542 	spin_lock_irq(&gcwq->lock);
2543 	if (!list_empty(&work->entry)) {
2544 		/*
2545 		 * This work is queued, but perhaps we locked the wrong gcwq.
2546 		 * In that case we must see the new value after rmb(), see
2547 		 * insert_work()->wmb().
2548 		 */
2549 		smp_rmb();
2550 		if (gcwq == get_work_gcwq(work)) {
2551 			debug_work_deactivate(work);
2552 			list_del_init(&work->entry);
2553 			cwq_dec_nr_in_flight(get_work_cwq(work),
2554 				get_work_color(work),
2555 				*work_data_bits(work) & WORK_STRUCT_DELAYED);
2556 			ret = 1;
2557 		}
2558 	}
2559 	spin_unlock_irq(&gcwq->lock);
2560 
2561 	return ret;
2562 }
2563 
2564 static bool __cancel_work_timer(struct work_struct *work,
2565 				struct timer_list *timer)
2566 {
2567 	int ret;
2568 
2569 	do {
2570 		ret = (timer && likely(del_timer(timer)));
2571 		if (!ret)
2572 			ret = try_to_grab_pending(work);
2573 		wait_on_work(work);
2574 	} while (unlikely(ret < 0));
2575 
2576 	clear_work_data(work);
2577 	return ret;
2578 }
2579 
2580 /**
2581  * cancel_work_sync - cancel a work and wait for it to finish
2582  * @work: the work to cancel
2583  *
2584  * Cancel @work and wait for its execution to finish.  This function
2585  * can be used even if the work re-queues itself or migrates to
2586  * another workqueue.  On return from this function, @work is
2587  * guaranteed to be not pending or executing on any CPU.
2588  *
2589  * cancel_work_sync(&delayed_work->work) must not be used for
2590  * delayed_work's.  Use cancel_delayed_work_sync() instead.
2591  *
2592  * The caller must ensure that the workqueue on which @work was last
2593  * queued can't be destroyed before this function returns.
2594  *
2595  * RETURNS:
2596  * %true if @work was pending, %false otherwise.
2597  */
2598 bool cancel_work_sync(struct work_struct *work)
2599 {
2600 	return __cancel_work_timer(work, NULL);
2601 }
2602 EXPORT_SYMBOL_GPL(cancel_work_sync);
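
/*
 * Illustrative usage sketch ("mydrv" and its members are hypothetical):
 * a device remove path cancels its work item so it is neither pending
 * nor running once the device structure is torn down.
 *
 *	static void mydrv_remove(struct mydrv *drv)
 *	{
 *		cancel_work_sync(&drv->event_work);
 *		free_irq(drv->irq, drv);
 *	}
 */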
2603 
2604 /**
2605  * flush_delayed_work - wait for a dwork to finish executing the last queueing
2606  * @dwork: the delayed work to flush
2607  *
2608  * Delayed timer is cancelled and the pending work is queued for
2609  * immediate execution.  Like flush_work(), this function only
2610  * considers the last queueing instance of @dwork.
2611  *
2612  * RETURNS:
2613  * %true if flush_work() waited for the work to finish execution,
2614  * %false if it was already idle.
2615  */
2616 bool flush_delayed_work(struct delayed_work *dwork)
2617 {
2618 	if (del_timer_sync(&dwork->timer))
2619 		__queue_work(raw_smp_processor_id(),
2620 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
2621 	return flush_work(&dwork->work);
2622 }
2623 EXPORT_SYMBOL(flush_delayed_work);
2624 
2625 /**
2626  * flush_delayed_work_sync - wait for a dwork to finish
2627  * @dwork: the delayed work to flush
2628  *
2629  * Delayed timer is cancelled and the pending work is queued for
2630  * execution immediately.  Other than timer handling, its behavior
2631  * is identical to flush_work_sync().
2632  *
2633  * RETURNS:
2634  * %true if flush_work_sync() waited for the work to finish execution,
2635  * %false if it was already idle.
2636  */
2637 bool flush_delayed_work_sync(struct delayed_work *dwork)
2638 {
2639 	if (del_timer_sync(&dwork->timer))
2640 		__queue_work(raw_smp_processor_id(),
2641 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
2642 	return flush_work_sync(&dwork->work);
2643 }
2644 EXPORT_SYMBOL(flush_delayed_work_sync);
2645 
2646 /**
2647  * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2648  * @dwork: the delayed work to cancel
2649  *
2650  * This is cancel_work_sync() for delayed works.
2651  *
2652  * RETURNS:
2653  * %true if @dwork was pending, %false otherwise.
2654  */
2655 bool cancel_delayed_work_sync(struct delayed_work *dwork)
2656 {
2657 	return __cancel_work_timer(&dwork->work, &dwork->timer);
2658 }
2659 EXPORT_SYMBOL(cancel_delayed_work_sync);
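
/*
 * Illustrative usage sketch ("drv->poll_work" is hypothetical): stop a
 * delayed work which re-arms itself from its own work function; on
 * return it is neither pending nor executing.
 *
 *	cancel_delayed_work_sync(&drv->poll_work);
 */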
2660 
2661 /**
2662  * schedule_work - put work task in global workqueue
2663  * @work: job to be done
2664  *
2665  * Returns zero if @work was already on the kernel-global workqueue and
2666  * non-zero otherwise.
2667  *
2668  * This puts a job in the kernel-global workqueue if it was not already
2669  * queued and leaves it in the same position on the kernel-global
2670  * workqueue otherwise.
2671  */
2672 int schedule_work(struct work_struct *work)
2673 {
2674 	return queue_work(system_wq, work);
2675 }
2676 EXPORT_SYMBOL(schedule_work);
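
/*
 * Illustrative usage sketch (the "mydrv" interrupt handler is
 * hypothetical): defer the sleeping part of interrupt processing to the
 * kernel-global workqueue.
 *
 *	static irqreturn_t mydrv_irq(int irq, void *data)
 *	{
 *		struct mydrv *drv = data;
 *
 *		schedule_work(&drv->event_work);
 *		return IRQ_HANDLED;
 *	}
 */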
2677 
2678 /**
2679  * schedule_work_on - put work task on a specific cpu
2680  * @cpu: cpu to put the work task on
2681  * @work: job to be done
2682  *
2683  * This puts a job on a specific cpu.
2684  */
2685 int schedule_work_on(int cpu, struct work_struct *work)
2686 {
2687 	return queue_work_on(cpu, system_wq, work);
2688 }
2689 EXPORT_SYMBOL(schedule_work_on);
2690 
2691 /**
2692  * schedule_delayed_work - put work task in global workqueue after delay
2693  * @dwork: job to be done
2694  * @delay: number of jiffies to wait or 0 for immediate execution
2695  *
2696  * After waiting for a given time this puts a job in the kernel-global
2697  * workqueue.
2698  */
2699 int schedule_delayed_work(struct delayed_work *dwork,
2700 					unsigned long delay)
2701 {
2702 	return queue_delayed_work(system_wq, dwork, delay);
2703 }
2704 EXPORT_SYMBOL(schedule_delayed_work);
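
/*
 * Illustrative usage sketch ("mydrv" and mydrv_check_status() are
 * hypothetical): a work function which re-arms itself to poll hardware
 * once per second.
 *
 *	static void mydrv_poll(struct work_struct *work)
 *	{
 *		struct mydrv *drv = container_of(work, struct mydrv,
 *						 poll_work.work);
 *
 *		mydrv_check_status(drv);
 *		schedule_delayed_work(&drv->poll_work, HZ);
 *	}
 */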
2705 
2706 /**
2707  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2708  * @cpu: cpu to use
2709  * @dwork: job to be done
2710  * @delay: number of jiffies to wait
2711  *
2712  * After waiting for a given time this puts a job in the kernel-global
2713  * workqueue on the specified CPU.
2714  */
2715 int schedule_delayed_work_on(int cpu,
2716 			struct delayed_work *dwork, unsigned long delay)
2717 {
2718 	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2719 }
2720 EXPORT_SYMBOL(schedule_delayed_work_on);
2721 
2722 /**
2723  * schedule_on_each_cpu - execute a function synchronously on each online CPU
2724  * @func: the function to call
2725  *
2726  * schedule_on_each_cpu() executes @func on each online CPU using the
2727  * system workqueue and blocks until all CPUs have completed.
2728  * schedule_on_each_cpu() is very slow.
2729  *
2730  * RETURNS:
2731  * 0 on success, -errno on failure.
2732  */
2733 int schedule_on_each_cpu(work_func_t func)
2734 {
2735 	int cpu;
2736 	struct work_struct __percpu *works;
2737 
2738 	works = alloc_percpu(struct work_struct);
2739 	if (!works)
2740 		return -ENOMEM;
2741 
2742 	get_online_cpus();
2743 
2744 	for_each_online_cpu(cpu) {
2745 		struct work_struct *work = per_cpu_ptr(works, cpu);
2746 
2747 		INIT_WORK(work, func);
2748 		schedule_work_on(cpu, work);
2749 	}
2750 
2751 	for_each_online_cpu(cpu)
2752 		flush_work(per_cpu_ptr(works, cpu));
2753 
2754 	put_online_cpus();
2755 	free_percpu(works);
2756 	return 0;
2757 }
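
/*
 * Illustrative usage sketch (mydrv_drain_cpu() and
 * mydrv_flush_local_cache() are hypothetical): run a per-cpu drain
 * synchronously on every online CPU and check the result.
 *
 *	static void mydrv_drain_cpu(struct work_struct *unused)
 *	{
 *		mydrv_flush_local_cache();
 *	}
 *	...
 *	if (schedule_on_each_cpu(mydrv_drain_cpu))
 *		pr_warning("mydrv: per-cpu drain failed\n");
 */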
2758 
2759 /**
2760  * flush_scheduled_work - ensure that any scheduled work has run to completion.
2761  *
2762  * Forces execution of the kernel-global workqueue and blocks until its
2763  * completion.
2764  *
2765  * Think twice before calling this function!  It's very easy to get into
2766  * trouble if you don't take great care.  Either of the following situations
2767  * will lead to deadlock:
2768  *
2769  *	One of the work items currently on the workqueue needs to acquire
2770  *	a lock held by your code or its caller.
2771  *
2772  *	Your code is running in the context of a work routine.
2773  *
2774  * They will be detected by lockdep when they occur, but the first might not
2775  * occur very often.  It depends on what work items are on the workqueue and
2776  * what locks they need, which you have no control over.
2777  *
2778  * In most situations flushing the entire workqueue is overkill; you merely
2779  * need to know that a particular work item isn't queued and isn't running.
2780  * In such cases you should use cancel_delayed_work_sync() or
2781  * cancel_work_sync() instead.
2782  */
2783 void flush_scheduled_work(void)
2784 {
2785 	flush_workqueue(system_wq);
2786 }
2787 EXPORT_SYMBOL(flush_scheduled_work);
2788 
2789 /**
2790  * execute_in_process_context - reliably execute the routine with user context
2791  * @fn:		the function to execute
2792  * @ew:		guaranteed storage for the execute work structure (must
2793  *		be available when the work executes)
2794  *
2795  * Executes the function immediately if process context is available,
2796  * otherwise schedules the function for delayed execution.
2797  *
2798  * Returns:	0 - function was executed
2799  *		1 - function was scheduled for execution
2800  */
2801 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2802 {
2803 	if (!in_interrupt()) {
2804 		fn(&ew->work);
2805 		return 0;
2806 	}
2807 
2808 	INIT_WORK(&ew->work, fn);
2809 	schedule_work(&ew->work);
2810 
2811 	return 1;
2812 }
2813 EXPORT_SYMBOL_GPL(execute_in_process_context);
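
/*
 * Illustrative usage sketch (mydrv_release() and the embedded
 * execute_work "release_ew" are hypothetical): release a resource which
 * needs process context, from a path that may be in interrupt context.
 *
 *	execute_in_process_context(mydrv_release, &drv->release_ew);
 */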
2814 
2815 int keventd_up(void)
2816 {
2817 	return system_wq != NULL;
2818 }
2819 
2820 static int alloc_cwqs(struct workqueue_struct *wq)
2821 {
2822 	/*
2823 	 * cwqs are force-aligned according to WORK_STRUCT_FLAG_BITS.
2824 	 * Make sure that the alignment isn't lower than that of
2825 	 * unsigned long long.
2826 	 */
2827 	const size_t size = sizeof(struct cpu_workqueue_struct);
2828 	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2829 				   __alignof__(unsigned long long));
2830 #ifdef CONFIG_SMP
2831 	bool percpu = !(wq->flags & WQ_UNBOUND);
2832 #else
2833 	bool percpu = false;
2834 #endif
2835 
2836 	if (percpu)
2837 		wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2838 	else {
2839 		void *ptr;
2840 
2841 		/*
2842 		 * Allocate enough room to align cwq and put an extra
2843 		 * pointer at the end pointing back to the originally
2844 		 * allocated pointer which will be used for freeing.
2845 		 */
2846 		ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2847 		if (ptr) {
2848 			wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2849 			*(void **)(wq->cpu_wq.single + 1) = ptr;
2850 		}
2851 	}
2852 
2853 	/* just in case, make sure it's actually aligned
2854 	 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2855 	 */
2856 	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2857 	return wq->cpu_wq.v ? 0 : -ENOMEM;
2858 }
2859 
2860 static void free_cwqs(struct workqueue_struct *wq)
2861 {
2862 #ifdef CONFIG_SMP
2863 	bool percpu = !(wq->flags & WQ_UNBOUND);
2864 #else
2865 	bool percpu = false;
2866 #endif
2867 
2868 	if (percpu)
2869 		free_percpu(wq->cpu_wq.pcpu);
2870 	else if (wq->cpu_wq.single) {
2871 		/* the pointer to free is stored right after the cwq */
2872 		kfree(*(void **)(wq->cpu_wq.single + 1));
2873 	}
2874 }
2875 
2876 static int wq_clamp_max_active(int max_active, unsigned int flags,
2877 			       const char *name)
2878 {
2879 	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
2880 
2881 	if (max_active < 1 || max_active > lim)
2882 		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
2883 		       "is out of range, clamping between %d and %d\n",
2884 		       max_active, name, 1, lim);
2885 
2886 	return clamp_val(max_active, 1, lim);
2887 }
2888 
2889 struct workqueue_struct *__alloc_workqueue_key(const char *name,
2890 					       unsigned int flags,
2891 					       int max_active,
2892 					       struct lock_class_key *key,
2893 					       const char *lock_name)
2894 {
2895 	struct workqueue_struct *wq;
2896 	unsigned int cpu;
2897 
2898 	/*
2899 	 * Workqueues which may be used during memory reclaim should
2900 	 * have a rescuer to guarantee forward progress.
2901 	 */
2902 	if (flags & WQ_MEM_RECLAIM)
2903 		flags |= WQ_RESCUER;
2904 
2905 	/*
2906 	 * Unbound workqueues aren't concurrency managed and should be
2907 	 * dispatched to workers immediately.
2908 	 */
2909 	if (flags & WQ_UNBOUND)
2910 		flags |= WQ_HIGHPRI;
2911 
2912 	max_active = max_active ?: WQ_DFL_ACTIVE;
2913 	max_active = wq_clamp_max_active(max_active, flags, name);
2914 
2915 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2916 	if (!wq)
2917 		goto err;
2918 
2919 	wq->flags = flags;
2920 	wq->saved_max_active = max_active;
2921 	mutex_init(&wq->flush_mutex);
2922 	atomic_set(&wq->nr_cwqs_to_flush, 0);
2923 	INIT_LIST_HEAD(&wq->flusher_queue);
2924 	INIT_LIST_HEAD(&wq->flusher_overflow);
2925 
2926 	wq->name = name;
2927 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2928 	INIT_LIST_HEAD(&wq->list);
2929 
2930 	if (alloc_cwqs(wq) < 0)
2931 		goto err;
2932 
2933 	for_each_cwq_cpu(cpu, wq) {
2934 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2935 		struct global_cwq *gcwq = get_gcwq(cpu);
2936 
2937 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
2938 		cwq->gcwq = gcwq;
2939 		cwq->wq = wq;
2940 		cwq->flush_color = -1;
2941 		cwq->max_active = max_active;
2942 		INIT_LIST_HEAD(&cwq->delayed_works);
2943 	}
2944 
2945 	if (flags & WQ_RESCUER) {
2946 		struct worker *rescuer;
2947 
2948 		if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2949 			goto err;
2950 
2951 		wq->rescuer = rescuer = alloc_worker();
2952 		if (!rescuer)
2953 			goto err;
2954 
2955 		rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2956 		if (IS_ERR(rescuer->task))
2957 			goto err;
2958 
2959 		rescuer->task->flags |= PF_THREAD_BOUND;
2960 		wake_up_process(rescuer->task);
2961 	}
2962 
2963 	/*
2964 	 * workqueue_lock protects global freeze state and workqueues
2965 	 * list.  Grab it, set max_active accordingly and add the new
2966 	 * workqueue to workqueues list.
2967 	 */
2968 	spin_lock(&workqueue_lock);
2969 
2970 	if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
2971 		for_each_cwq_cpu(cpu, wq)
2972 			get_cwq(cpu, wq)->max_active = 0;
2973 
2974 	list_add(&wq->list, &workqueues);
2975 
2976 	spin_unlock(&workqueue_lock);
2977 
2978 	return wq;
2979 err:
2980 	if (wq) {
2981 		free_cwqs(wq);
2982 		free_mayday_mask(wq->mayday_mask);
2983 		kfree(wq->rescuer);
2984 		kfree(wq);
2985 	}
2986 	return NULL;
2987 }
2988 EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
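
/*
 * Illustrative usage sketch ("mydrv_wq" is hypothetical): this function
 * is normally reached through the alloc_workqueue() wrapper.  A
 * workqueue used on the memory reclaim path gets WQ_MEM_RECLAIM so a
 * rescuer is created for it.
 *
 *	mydrv_wq = alloc_workqueue("mydrv", WQ_MEM_RECLAIM, 1);
 *	if (!mydrv_wq)
 *		return -ENOMEM;
 *	...
 *	destroy_workqueue(mydrv_wq);
 */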
2989 
2990 /**
2991  * destroy_workqueue - safely terminate a workqueue
2992  * @wq: target workqueue
2993  *
2994  * Safely destroy a workqueue. All work currently pending will be done first.
2995  */
2996 void destroy_workqueue(struct workqueue_struct *wq)
2997 {
2998 	unsigned int flush_cnt = 0;
2999 	unsigned int cpu;
3000 
3001 	/*
3002 	 * Mark @wq dying and drain all pending works.  Once WQ_DYING is
3003 	 * set, only chain queueing is allowed.  IOW, only currently
3004 	 * pending or running work items on @wq can queue further work
3005 	 * items on it.  @wq is flushed repeatedly until it becomes empty.
3006 	 * The number of flushing is detemined by the depth of chaining and
3007 	 * should be relatively short.  Whine if it takes too long.
3008 	 */
3009 	wq->flags |= WQ_DYING;
3010 reflush:
3011 	flush_workqueue(wq);
3012 
3013 	for_each_cwq_cpu(cpu, wq) {
3014 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3015 
3016 		if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3017 			continue;
3018 
3019 		if (++flush_cnt == 10 ||
3020 		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3021 			printk(KERN_WARNING "workqueue %s: flush on "
3022 			       "destruction isn't complete after %u tries\n",
3023 			       wq->name, flush_cnt);
3024 		goto reflush;
3025 	}
3026 
3027 	/*
3028 	 * wq list is used to freeze wq, remove from list after
3029 	 * flushing is complete in case freeze races us.
3030 	 */
3031 	spin_lock(&workqueue_lock);
3032 	list_del(&wq->list);
3033 	spin_unlock(&workqueue_lock);
3034 
3035 	/* sanity check */
3036 	for_each_cwq_cpu(cpu, wq) {
3037 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3038 		int i;
3039 
3040 		for (i = 0; i < WORK_NR_COLORS; i++)
3041 			BUG_ON(cwq->nr_in_flight[i]);
3042 		BUG_ON(cwq->nr_active);
3043 		BUG_ON(!list_empty(&cwq->delayed_works));
3044 	}
3045 
3046 	if (wq->flags & WQ_RESCUER) {
3047 		kthread_stop(wq->rescuer->task);
3048 		free_mayday_mask(wq->mayday_mask);
3049 		kfree(wq->rescuer);
3050 	}
3051 
3052 	free_cwqs(wq);
3053 	kfree(wq);
3054 }
3055 EXPORT_SYMBOL_GPL(destroy_workqueue);
3056 
3057 /**
3058  * workqueue_set_max_active - adjust max_active of a workqueue
3059  * @wq: target workqueue
3060  * @max_active: new max_active value.
3061  *
3062  * Set max_active of @wq to @max_active.
3063  *
3064  * CONTEXT:
3065  * Don't call from IRQ context.
3066  */
3067 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3068 {
3069 	unsigned int cpu;
3070 
3071 	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3072 
3073 	spin_lock(&workqueue_lock);
3074 
3075 	wq->saved_max_active = max_active;
3076 
3077 	for_each_cwq_cpu(cpu, wq) {
3078 		struct global_cwq *gcwq = get_gcwq(cpu);
3079 
3080 		spin_lock_irq(&gcwq->lock);
3081 
3082 		if (!(wq->flags & WQ_FREEZABLE) ||
3083 		    !(gcwq->flags & GCWQ_FREEZING))
3084 			get_cwq(gcwq->cpu, wq)->max_active = max_active;
3085 
3086 		spin_unlock_irq(&gcwq->lock);
3087 	}
3088 
3089 	spin_unlock(&workqueue_lock);
3090 }
3091 EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3092 
3093 /**
3094  * workqueue_congested - test whether a workqueue is congested
3095  * @cpu: CPU in question
3096  * @wq: target workqueue
3097  *
3098  * Test whether @wq's cpu workqueue for @cpu is congested.  There is
3099  * no synchronization around this function and the test result is
3100  * unreliable and only useful as advisory hints or for debugging.
3101  *
3102  * RETURNS:
3103  * %true if congested, %false otherwise.
3104  */
3105 bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3106 {
3107 	struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3108 
3109 	return !list_empty(&cwq->delayed_works);
3110 }
3111 EXPORT_SYMBOL_GPL(workqueue_congested);
3112 
3113 /**
3114  * work_cpu - return the last known associated cpu for @work
3115  * @work: the work of interest
3116  *
3117  * RETURNS:
3118  * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
3119  */
3120 unsigned int work_cpu(struct work_struct *work)
3121 {
3122 	struct global_cwq *gcwq = get_work_gcwq(work);
3123 
3124 	return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3125 }
3126 EXPORT_SYMBOL_GPL(work_cpu);
3127 
3128 /**
3129  * work_busy - test whether a work is currently pending or running
3130  * @work: the work to be tested
3131  *
3132  * Test whether @work is currently pending or running.  There is no
3133  * synchronization around this function and the test result is
3134  * unreliable and only useful as advisory hints or for debugging.
3135  * Especially for reentrant wqs, the pending state might hide the
3136  * running state.
3137  *
3138  * RETURNS:
3139  * OR'd bitmask of WORK_BUSY_* bits.
3140  */
3141 unsigned int work_busy(struct work_struct *work)
3142 {
3143 	struct global_cwq *gcwq = get_work_gcwq(work);
3144 	unsigned long flags;
3145 	unsigned int ret = 0;
3146 
3147 	if (!gcwq)
3148 		return false;
3149 
3150 	spin_lock_irqsave(&gcwq->lock, flags);
3151 
3152 	if (work_pending(work))
3153 		ret |= WORK_BUSY_PENDING;
3154 	if (find_worker_executing_work(gcwq, work))
3155 		ret |= WORK_BUSY_RUNNING;
3156 
3157 	spin_unlock_irqrestore(&gcwq->lock, flags);
3158 
3159 	return ret;
3160 }
3161 EXPORT_SYMBOL_GPL(work_busy);
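
/*
 * Illustrative usage sketch ("drv->event_work" is hypothetical): the
 * result is advisory only, e.g. for a debugfs or debug printout.
 *
 *	unsigned int busy = work_busy(&drv->event_work);
 *
 *	pr_debug("event_work %s pending, %s running\n",
 *		 busy & WORK_BUSY_PENDING ? "is" : "not",
 *		 busy & WORK_BUSY_RUNNING ? "is" : "not");
 */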
3162 
3163 /*
3164  * CPU hotplug.
3165  *
3166  * There are two challenges in supporting CPU hotplug.  Firstly, there
3167  * are a lot of assumptions on strong associations among work, cwq and
3168  * gcwq which make migrating pending and scheduled works very
3169  * difficult to implement without impacting hot paths.  Secondly,
3170  * gcwqs serve a mix of short, long and very long running works,
3171  * making blocked draining impractical.
3172  *
3173  * This is solved by allowing a gcwq to be detached from CPU, running
3174  * it with unbound (rogue) workers and allowing it to be reattached
3175  * later if the cpu comes back online.  A separate thread is created
3176  * to govern a gcwq in such state and is called the trustee of the
3177  * gcwq.
3178  *
3179  * Trustee states and their descriptions.
3180  *
3181  * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
3182  *		new trustee is started with this state.
3183  *
3184  * IN_CHARGE	Once started, trustee will enter this state after
3185  *		assuming the manager role and making all existing
3186  *		workers rogue.  DOWN_PREPARE waits for trustee to
3187  *		enter this state.  After reaching IN_CHARGE, trustee
3188  *		tries to execute the pending worklist until it's empty
3189  *		and the state is set to BUTCHER, or the state is set
3190  *		to RELEASE.
3191  *
3192  * BUTCHER	Command state which is set by the cpu callback after
3193  *		the cpu has gone down.  Once this state is set, the trustee
3194  *		knows that there will be no new works on the worklist
3195  *		and once the worklist is empty it can proceed to
3196  *		killing idle workers.
3197  *
3198  * RELEASE	Command state which is set by the cpu callback if the
3199  *		cpu down has been canceled or it has come online
3200  *		again.  After recognizing this state, trustee stops
3201  *		trying to drain or butcher and clears ROGUE, rebinds
3202  *		all remaining workers back to the cpu and releases
3203  *		manager role.
3204  *
3205  * DONE		Trustee will enter this state after BUTCHER or RELEASE
3206  *		is complete.
3207  *
3208  *          trustee                 CPU                draining
3209  *         took over                down               complete
3210  * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3211  *                        |                     |                  ^
3212  *                        | CPU is back online  v   return workers |
3213  *                         ----------------> RELEASE --------------
3214  */
3215 
3216 /**
3217  * trustee_wait_event_timeout - timed event wait for trustee
3218  * @cond: condition to wait for
3219  * @timeout: timeout in jiffies
3220  *
3221  * wait_event_timeout() for trustee to use.  Handles locking and
3222  * checks for RELEASE request.
3223  *
3224  * CONTEXT:
3225  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3226  * multiple times.  To be used by trustee.
3227  *
3228  * RETURNS:
3229  * Positive indicating left time if @cond is satisfied, 0 if timed
3230  * out, -1 if canceled.
3231  */
3232 #define trustee_wait_event_timeout(cond, timeout) ({			\
3233 	long __ret = (timeout);						\
3234 	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
3235 	       __ret) {							\
3236 		spin_unlock_irq(&gcwq->lock);				\
3237 		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
3238 			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
3239 			__ret);						\
3240 		spin_lock_irq(&gcwq->lock);				\
3241 	}								\
3242 	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
3243 })
3244 
3245 /**
3246  * trustee_wait_event - event wait for trustee
3247  * @cond: condition to wait for
3248  *
3249  * wait_event() for trustee to use.  Automatically handles locking and
3250  * checks for CANCEL request.
3251  *
3252  * CONTEXT:
3253  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3254  * multiple times.  To be used by trustee.
3255  *
3256  * RETURNS:
3257  * 0 if @cond is satisfied, -1 if canceled.
3258  */
3259 #define trustee_wait_event(cond) ({					\
3260 	long __ret1;							\
3261 	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3262 	__ret1 < 0 ? -1 : 0;						\
3263 })
3264 
3265 static int __cpuinit trustee_thread(void *__gcwq)
3266 {
3267 	struct global_cwq *gcwq = __gcwq;
3268 	struct worker *worker;
3269 	struct work_struct *work;
3270 	struct hlist_node *pos;
3271 	long rc;
3272 	int i;
3273 
3274 	BUG_ON(gcwq->cpu != smp_processor_id());
3275 
3276 	spin_lock_irq(&gcwq->lock);
3277 	/*
3278 	 * Claim the manager position and make all workers rogue.
3279 	 * Trustee must be bound to the target cpu and can't be
3280 	 * cancelled.
3281 	 */
3282 	BUG_ON(gcwq->cpu != smp_processor_id());
3283 	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3284 	BUG_ON(rc < 0);
3285 
3286 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
3287 
3288 	list_for_each_entry(worker, &gcwq->idle_list, entry)
3289 		worker->flags |= WORKER_ROGUE;
3290 
3291 	for_each_busy_worker(worker, i, pos, gcwq)
3292 		worker->flags |= WORKER_ROGUE;
3293 
3294 	/*
3295 	 * Call schedule() so that we cross rq->lock and thus can
3296 	 * guarantee sched callbacks see the rogue flag.  This is
3297 	 * necessary as scheduler callbacks may be invoked from other
3298 	 * cpus.
3299 	 */
3300 	spin_unlock_irq(&gcwq->lock);
3301 	schedule();
3302 	spin_lock_irq(&gcwq->lock);
3303 
3304 	/*
3305 	 * Sched callbacks are disabled now.  Zap nr_running.  After
3306 	 * this, nr_running stays zero and need_more_worker() and
3307 	 * keep_working() are always true as long as the worklist is
3308 	 * not empty.
3309 	 */
3310 	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3311 
3312 	spin_unlock_irq(&gcwq->lock);
3313 	del_timer_sync(&gcwq->idle_timer);
3314 	spin_lock_irq(&gcwq->lock);
3315 
3316 	/*
3317 	 * We're now in charge.  Notify and proceed to drain.  We need
3318 	 * to keep the gcwq running during the whole CPU down
3319 	 * procedure as other cpu hotunplug callbacks may need to
3320 	 * flush currently running tasks.
3321 	 */
3322 	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3323 	wake_up_all(&gcwq->trustee_wait);
3324 
3325 	/*
3326 	 * The original cpu is in the process of dying and may go away
3327 	 * anytime now.  When that happens, we and all workers would
3328 	 * be migrated to other cpus.  Try draining any left work.  We
3329 	 * want to get it over with ASAP - spam rescuers, wake up as
3330 	 * many idlers as necessary and create new ones till the
3331 	 * worklist is empty.  Note that if the gcwq is frozen, there
3332 	 * may be frozen works in freezable cwqs.  Don't declare
3333 	 * completion while frozen.
3334 	 */
3335 	while (gcwq->nr_workers != gcwq->nr_idle ||
3336 	       gcwq->flags & GCWQ_FREEZING ||
3337 	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3338 		int nr_works = 0;
3339 
3340 		list_for_each_entry(work, &gcwq->worklist, entry) {
3341 			send_mayday(work);
3342 			nr_works++;
3343 		}
3344 
3345 		list_for_each_entry(worker, &gcwq->idle_list, entry) {
3346 			if (!nr_works--)
3347 				break;
3348 			wake_up_process(worker->task);
3349 		}
3350 
3351 		if (need_to_create_worker(gcwq)) {
3352 			spin_unlock_irq(&gcwq->lock);
3353 			worker = create_worker(gcwq, false);
3354 			spin_lock_irq(&gcwq->lock);
3355 			if (worker) {
3356 				worker->flags |= WORKER_ROGUE;
3357 				start_worker(worker);
3358 			}
3359 		}
3360 
3361 		/* give a breather */
3362 		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3363 			break;
3364 	}
3365 
3366 	/*
3367 	 * Either all works have been scheduled and cpu is down, or
3368 	 * cpu down has already been canceled.  Wait for and butcher
3369 	 * all workers till we're canceled.
3370 	 */
3371 	do {
3372 		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3373 		while (!list_empty(&gcwq->idle_list))
3374 			destroy_worker(list_first_entry(&gcwq->idle_list,
3375 							struct worker, entry));
3376 	} while (gcwq->nr_workers && rc >= 0);
3377 
3378 	/*
3379 	 * At this point, either draining has completed and no worker
3380 	 * is left, or cpu down has been canceled or the cpu is being
3381 	 * brought back up.  There shouldn't be any idle one left.
3382 	 * Tell the remaining busy ones to rebind once they finish their
3383 	 * currently scheduled works by scheduling the rebind_work.
3384 	 */
3385 	WARN_ON(!list_empty(&gcwq->idle_list));
3386 
3387 	for_each_busy_worker(worker, i, pos, gcwq) {
3388 		struct work_struct *rebind_work = &worker->rebind_work;
3389 
3390 		/*
3391 		 * Rebind_work may race with future cpu hotplug
3392 		 * operations.  Use a separate flag to mark that
3393 		 * rebinding is scheduled.
3394 		 */
3395 		worker->flags |= WORKER_REBIND;
3396 		worker->flags &= ~WORKER_ROGUE;
3397 
3398 		/* queue rebind_work, wq doesn't matter, use the default one */
3399 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3400 				     work_data_bits(rebind_work)))
3401 			continue;
3402 
3403 		debug_work_activate(rebind_work);
3404 		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3405 			    worker->scheduled.next,
3406 			    work_color_to_flags(WORK_NO_COLOR));
3407 	}
3408 
3409 	/* relinquish manager role */
3410 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3411 
3412 	/* notify completion */
3413 	gcwq->trustee = NULL;
3414 	gcwq->trustee_state = TRUSTEE_DONE;
3415 	wake_up_all(&gcwq->trustee_wait);
3416 	spin_unlock_irq(&gcwq->lock);
3417 	return 0;
3418 }
3419 
3420 /**
3421  * wait_trustee_state - wait for trustee to enter the specified state
3422  * @gcwq: gcwq the trustee of interest belongs to
3423  * @state: target state to wait for
3424  *
3425  * Wait for the trustee to reach @state.  DONE is already matched.
3426  *
3427  * CONTEXT:
3428  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3429  * multiple times.  To be used by cpu_callback.
3430  */
3431 static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3432 __releases(&gcwq->lock)
3433 __acquires(&gcwq->lock)
3434 {
3435 	if (!(gcwq->trustee_state == state ||
3436 	      gcwq->trustee_state == TRUSTEE_DONE)) {
3437 		spin_unlock_irq(&gcwq->lock);
3438 		__wait_event(gcwq->trustee_wait,
3439 			     gcwq->trustee_state == state ||
3440 			     gcwq->trustee_state == TRUSTEE_DONE);
3441 		spin_lock_irq(&gcwq->lock);
3442 	}
3443 }
3444 
3445 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3446 						unsigned long action,
3447 						void *hcpu)
3448 {
3449 	unsigned int cpu = (unsigned long)hcpu;
3450 	struct global_cwq *gcwq = get_gcwq(cpu);
3451 	struct task_struct *new_trustee = NULL;
3452 	struct worker *uninitialized_var(new_worker);
3453 	unsigned long flags;
3454 
3455 	action &= ~CPU_TASKS_FROZEN;
3456 
3457 	switch (action) {
3458 	case CPU_DOWN_PREPARE:
3459 		new_trustee = kthread_create(trustee_thread, gcwq,
3460 					     "workqueue_trustee/%d", cpu);
3461 		if (IS_ERR(new_trustee))
3462 			return notifier_from_errno(PTR_ERR(new_trustee));
3463 		kthread_bind(new_trustee, cpu);
3464 		/* fall through */
3465 	case CPU_UP_PREPARE:
3466 		BUG_ON(gcwq->first_idle);
3467 		new_worker = create_worker(gcwq, false);
3468 		if (!new_worker) {
3469 			if (new_trustee)
3470 				kthread_stop(new_trustee);
3471 			return NOTIFY_BAD;
3472 		}
3473 	}
3474 
3475 	/* some are called w/ irq disabled, don't disturb irq status */
3476 	spin_lock_irqsave(&gcwq->lock, flags);
3477 
3478 	switch (action) {
3479 	case CPU_DOWN_PREPARE:
3480 		/* initialize trustee and tell it to acquire the gcwq */
3481 		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3482 		gcwq->trustee = new_trustee;
3483 		gcwq->trustee_state = TRUSTEE_START;
3484 		wake_up_process(gcwq->trustee);
3485 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3486 		/* fall through */
3487 	case CPU_UP_PREPARE:
3488 		BUG_ON(gcwq->first_idle);
3489 		gcwq->first_idle = new_worker;
3490 		break;
3491 
3492 	case CPU_DYING:
3493 		/*
3494 		 * Before this, the trustee and all workers except for
3495 		 * the ones which are still executing works from
3496 		 * before the last CPU down must be on the cpu.  After
3497 		 * this, they'll all be dispersed from the cpu.
3498 		 */
3499 		gcwq->flags |= GCWQ_DISASSOCIATED;
3500 		break;
3501 
3502 	case CPU_POST_DEAD:
3503 		gcwq->trustee_state = TRUSTEE_BUTCHER;
3504 		/* fall through */
3505 	case CPU_UP_CANCELED:
3506 		destroy_worker(gcwq->first_idle);
3507 		gcwq->first_idle = NULL;
3508 		break;
3509 
3510 	case CPU_DOWN_FAILED:
3511 	case CPU_ONLINE:
3512 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
3513 		if (gcwq->trustee_state != TRUSTEE_DONE) {
3514 			gcwq->trustee_state = TRUSTEE_RELEASE;
3515 			wake_up_process(gcwq->trustee);
3516 			wait_trustee_state(gcwq, TRUSTEE_DONE);
3517 		}
3518 
3519 		/*
3520 		 * Trustee is done and there might be no worker left.
3521 		 * Put the first_idle in and request a real manager to
3522 		 * take a look.
3523 		 */
3524 		spin_unlock_irq(&gcwq->lock);
3525 		kthread_bind(gcwq->first_idle->task, cpu);
3526 		spin_lock_irq(&gcwq->lock);
3527 		gcwq->flags |= GCWQ_MANAGE_WORKERS;
3528 		start_worker(gcwq->first_idle);
3529 		gcwq->first_idle = NULL;
3530 		break;
3531 	}
3532 
3533 	spin_unlock_irqrestore(&gcwq->lock, flags);
3534 
3535 	return notifier_from_errno(0);
3536 }
3537 
3538 #ifdef CONFIG_SMP
3539 
3540 struct work_for_cpu {
3541 	struct completion completion;
3542 	long (*fn)(void *);
3543 	void *arg;
3544 	long ret;
3545 };
3546 
3547 static int do_work_for_cpu(void *_wfc)
3548 {
3549 	struct work_for_cpu *wfc = _wfc;
3550 	wfc->ret = wfc->fn(wfc->arg);
3551 	complete(&wfc->completion);
3552 	return 0;
3553 }
3554 
3555 /**
3556  * work_on_cpu - run a function in thread context on a particular cpu
3557  * @cpu: the cpu to run on
3558  * @fn: the function to run
3559  * @arg: the function arg
3560  *
3561  * This will return the value @fn returns.
3562  * It is up to the caller to ensure that the cpu doesn't go offline.
3563  * The caller must not hold any locks which would prevent @fn from completing.
3564  */
3565 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3566 {
3567 	struct task_struct *sub_thread;
3568 	struct work_for_cpu wfc = {
3569 		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
3570 		.fn = fn,
3571 		.arg = arg,
3572 	};
3573 
3574 	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
3575 	if (IS_ERR(sub_thread))
3576 		return PTR_ERR(sub_thread);
3577 	kthread_bind(sub_thread, cpu);
3578 	wake_up_process(sub_thread);
3579 	wait_for_completion(&wfc.completion);
3580 	return wfc.ret;
3581 }
3582 EXPORT_SYMBOL_GPL(work_on_cpu);
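
/*
 * Illustrative sketch only (not part of workqueue.c): a minimal caller of
 * work_on_cpu().  The helper runs in a kernel thread bound to the requested
 * cpu and its return value is passed back to the caller.  The function names
 * below are hypothetical and exist only for this sketch.
 */
#if 0
static long example_read_node(void *arg)
{
	/* executes on the cpu passed to work_on_cpu() */
	return cpu_to_node(smp_processor_id());
}

static long example_node_of_cpu(unsigned int cpu)
{
	/* sleeps until the helper thread completes on @cpu */
	return work_on_cpu(cpu, example_read_node, NULL);
}
#endif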
3583 #endif /* CONFIG_SMP */
3584 
3585 #ifdef CONFIG_FREEZER
3586 
3587 /**
3588  * freeze_workqueues_begin - begin freezing workqueues
3589  *
3590  * Start freezing workqueues.  After this function returns, all freezable
3591  * workqueues will queue new works to their delayed_works list instead of
3592  * gcwq->worklist.
3593  *
3594  * CONTEXT:
3595  * Grabs and releases workqueue_lock and gcwq->lock's.
3596  */
3597 void freeze_workqueues_begin(void)
3598 {
3599 	unsigned int cpu;
3600 
3601 	spin_lock(&workqueue_lock);
3602 
3603 	BUG_ON(workqueue_freezing);
3604 	workqueue_freezing = true;
3605 
3606 	for_each_gcwq_cpu(cpu) {
3607 		struct global_cwq *gcwq = get_gcwq(cpu);
3608 		struct workqueue_struct *wq;
3609 
3610 		spin_lock_irq(&gcwq->lock);
3611 
3612 		BUG_ON(gcwq->flags & GCWQ_FREEZING);
3613 		gcwq->flags |= GCWQ_FREEZING;
3614 
3615 		list_for_each_entry(wq, &workqueues, list) {
3616 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3617 
3618 			if (cwq && wq->flags & WQ_FREEZABLE)
3619 				cwq->max_active = 0;
3620 		}
3621 
3622 		spin_unlock_irq(&gcwq->lock);
3623 	}
3624 
3625 	spin_unlock(&workqueue_lock);
3626 }
3627 
3628 /**
3629  * freeze_workqueues_busy - are freezable workqueues still busy?
3630  *
3631  * Check whether freezing is complete.  This function must be called
3632  * between freeze_workqueues_begin() and thaw_workqueues().
3633  *
3634  * CONTEXT:
3635  * Grabs and releases workqueue_lock.
3636  *
3637  * RETURNS:
3638  * %true if some freezable workqueues are still busy.  %false if freezing
3639  * is complete.
3640  */
3641 bool freeze_workqueues_busy(void)
3642 {
3643 	unsigned int cpu;
3644 	bool busy = false;
3645 
3646 	spin_lock(&workqueue_lock);
3647 
3648 	BUG_ON(!workqueue_freezing);
3649 
3650 	for_each_gcwq_cpu(cpu) {
3651 		struct workqueue_struct *wq;
3652 		/*
3653 		 * nr_active is monotonically decreasing.  It's safe
3654 		 * to peek without lock.
3655 		 */
3656 		list_for_each_entry(wq, &workqueues, list) {
3657 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3658 
3659 			if (!cwq || !(wq->flags & WQ_FREEZABLE))
3660 				continue;
3661 
3662 			BUG_ON(cwq->nr_active < 0);
3663 			if (cwq->nr_active) {
3664 				busy = true;
3665 				goto out_unlock;
3666 			}
3667 		}
3668 	}
3669 out_unlock:
3670 	spin_unlock(&workqueue_lock);
3671 	return busy;
3672 }
3673 
3674 /**
3675  * thaw_workqueues - thaw workqueues
3676  *
3677  * Thaw workqueues.  Normal queueing is restored and all collected
3678  * frozen works are transferred to their respective gcwq worklists.
3679  *
3680  * CONTEXT:
3681  * Grabs and releases workqueue_lock and gcwq->lock's.
3682  */
3683 void thaw_workqueues(void)
3684 {
3685 	unsigned int cpu;
3686 
3687 	spin_lock(&workqueue_lock);
3688 
3689 	if (!workqueue_freezing)
3690 		goto out_unlock;
3691 
3692 	for_each_gcwq_cpu(cpu) {
3693 		struct global_cwq *gcwq = get_gcwq(cpu);
3694 		struct workqueue_struct *wq;
3695 
3696 		spin_lock_irq(&gcwq->lock);
3697 
3698 		BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3699 		gcwq->flags &= ~GCWQ_FREEZING;
3700 
3701 		list_for_each_entry(wq, &workqueues, list) {
3702 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3703 
3704 			if (!cwq || !(wq->flags & WQ_FREEZABLE))
3705 				continue;
3706 
3707 			/* restore max_active and repopulate worklist */
3708 			cwq->max_active = wq->saved_max_active;
3709 
3710 			while (!list_empty(&cwq->delayed_works) &&
3711 			       cwq->nr_active < cwq->max_active)
3712 				cwq_activate_first_delayed(cwq);
3713 		}
3714 
3715 		wake_up_worker(gcwq);
3716 
3717 		spin_unlock_irq(&gcwq->lock);
3718 	}
3719 
3720 	workqueue_freezing = false;
3721 out_unlock:
3722 	spin_unlock(&workqueue_lock);
3723 }
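
/*
 * Illustrative sketch only (not part of workqueue.c): the expected calling
 * sequence for the three freezer hooks above, roughly what the suspend path
 * does.  example_freeze_thaw() is a hypothetical name used for this sketch.
 */
#if 0
static void example_freeze_thaw(void)
{
	freeze_workqueues_begin();		/* stop freezable wqs from running new works */

	while (freeze_workqueues_busy())	/* wait for in-flight works to drain */
		msleep(10);

	/* ... system is suspended / image is created here ... */

	thaw_workqueues();			/* restore normal queueing */
}
#endif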
3724 #endif /* CONFIG_FREEZER */
3725 
3726 static int __init init_workqueues(void)
3727 {
3728 	unsigned int cpu;
3729 	int i;
3730 
3731 	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3732 
3733 	/* initialize gcwqs */
3734 	for_each_gcwq_cpu(cpu) {
3735 		struct global_cwq *gcwq = get_gcwq(cpu);
3736 
3737 		spin_lock_init(&gcwq->lock);
3738 		INIT_LIST_HEAD(&gcwq->worklist);
3739 		gcwq->cpu = cpu;
3740 		gcwq->flags |= GCWQ_DISASSOCIATED;
3741 
3742 		INIT_LIST_HEAD(&gcwq->idle_list);
3743 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3744 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3745 
3746 		init_timer_deferrable(&gcwq->idle_timer);
3747 		gcwq->idle_timer.function = idle_worker_timeout;
3748 		gcwq->idle_timer.data = (unsigned long)gcwq;
3749 
3750 		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3751 			    (unsigned long)gcwq);
3752 
3753 		ida_init(&gcwq->worker_ida);
3754 
3755 		gcwq->trustee_state = TRUSTEE_DONE;
3756 		init_waitqueue_head(&gcwq->trustee_wait);
3757 	}
3758 
3759 	/* create the initial worker */
3760 	for_each_online_gcwq_cpu(cpu) {
3761 		struct global_cwq *gcwq = get_gcwq(cpu);
3762 		struct worker *worker;
3763 
3764 		if (cpu != WORK_CPU_UNBOUND)
3765 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
3766 		worker = create_worker(gcwq, true);
3767 		BUG_ON(!worker);
3768 		spin_lock_irq(&gcwq->lock);
3769 		start_worker(worker);
3770 		spin_unlock_irq(&gcwq->lock);
3771 	}
3772 
3773 	system_wq = alloc_workqueue("events", 0, 0);
3774 	system_long_wq = alloc_workqueue("events_long", 0, 0);
3775 	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3776 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3777 					    WQ_UNBOUND_MAX_ACTIVE);
3778 	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3779 	       !system_unbound_wq);
3780 	return 0;
3781 }
3782 early_initcall(init_workqueues);
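
/*
 * Illustrative sketch only (not part of workqueue.c): queueing a work item
 * on the default system_wq created above.  example_work_fn, example_work and
 * example_submit are hypothetical names used for this sketch.
 */
#if 0
static void example_work_fn(struct work_struct *work)
{
	pr_info("executed in process context on the shared worker pool\n");
}

static DECLARE_WORK(example_work, example_work_fn);

static void example_submit(void)
{
	/* schedule_work(&example_work) is equivalent for system_wq */
	queue_work(system_wq, &example_work);
}
#endif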
3783