xref: /openbmc/linux/kernel/sched/membarrier.c (revision 6cc23ed2)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

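/*
 * Illustrative sketch (not from this file): user space typically probes
 * which of the commands above are supported with MEMBARRIER_CMD_QUERY
 * before relying on them, along the lines of:
 *
 *	int mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);
 *
 *	if (mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED))
 *		... use the private expedited commands ...
 *
 * With the libc syscall(2) wrapper, an unimplemented system call shows up
 * as a return of -1 with errno set to ENOSYS.
 */
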
static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing, but be paranoid. */
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

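/*
 * Reset this mm's membarrier state at exec time and keep the current
 * runqueue's cached membarrier_state in sync with it.
 */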
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

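/*
 * Issue a memory barrier on every CPU currently running a thread of any
 * process registered for MEMBARRIER_CMD_GLOBAL_EXPEDITED, not only the
 * caller's process, by sending an IPI to each such CPU.
 */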
static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags)
{
	int cpu;
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

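/*
 * Usage sketch (illustrative, not from this file): private expedited
 * membarrier is typically used for asymmetric fencing. The hot path in
 * each thread relies on a plain compiler barrier (or relaxed atomics)
 * only, while the cold path issues:
 *
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
 *
 * which, per the ordering table in the sys_membarrier() comment below,
 * pairs with barrier() on every other running thread of the process.
 */
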
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that
	 * all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue
	 * scheduling between threads which are users of @mm has its
	 * membarrier state updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

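/*
 * Usage sketch (illustrative, not from this file, assuming the libc
 * syscall(2) wrapper and the command definitions from
 * <linux/membarrier.h>): a process registers once, then issues expedited
 * barriers as often as needed:
 *
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0);
 *	...
 *	syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
 *
 * Issuing MEMBARRIER_CMD_PRIVATE_EXPEDITED without prior registration
 * fails with -EPERM, as checked in membarrier_private_expedited() above.
 */
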
/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
377