xref: /openbmc/linux/kernel/sched/membarrier.c (revision 4f727ecefefbd180de10e25b3e74c03dce3f1e75)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4  *
5  * membarrier system call
6  */
7 #include "sched.h"
8 
/*
 * Commands that are only offered when the architecture selects
 * CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE. Without it the mask is empty,
 * so these commands drop out of MEMBARRIER_CMD_BITMASK below.
 */
#ifndef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |			\
	 MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#endif

/*
 * Bitwise OR of every command in enum membarrier_cmd, with the
 * exception of MEMBARRIER_CMD_QUERY.
 */
#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL |					\
	 MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
	 MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
	 MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
	 MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
	 MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
27 
/*
 * ipi_mb - cross-call handler that issues a full memory barrier.
 * @info: unused.
 *
 * Passed to smp_call_function_single()/smp_call_function_many() by the
 * expedited membarrier commands in this file.
 */
static void ipi_mb(void *info)
{
	/* An IPI should already be serializing; the barrier is belt and braces. */
	smp_mb();
}
32 
33 static int membarrier_global_expedited(void)
34 {
35 	int cpu;
36 	bool fallback = false;
37 	cpumask_var_t tmpmask;
38 
39 	if (num_online_cpus() == 1)
40 		return 0;
41 
42 	/*
43 	 * Matches memory barriers around rq->curr modification in
44 	 * scheduler.
45 	 */
46 	smp_mb();	/* system call entry is not a mb. */
47 
48 	/*
49 	 * Expedited membarrier commands guarantee that they won't
50 	 * block, hence the GFP_NOWAIT allocation flag and fallback
51 	 * implementation.
52 	 */
53 	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
54 		/* Fallback for OOM. */
55 		fallback = true;
56 	}
57 
58 	cpus_read_lock();
59 	for_each_online_cpu(cpu) {
60 		struct task_struct *p;
61 
62 		/*
63 		 * Skipping the current CPU is OK even through we can be
64 		 * migrated at any point. The current CPU, at the point
65 		 * where we read raw_smp_processor_id(), is ensured to
66 		 * be in program order with respect to the caller
67 		 * thread. Therefore, we can skip this CPU from the
68 		 * iteration.
69 		 */
70 		if (cpu == raw_smp_processor_id())
71 			continue;
72 
73 		rcu_read_lock();
74 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
75 		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
76 				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
77 			if (!fallback)
78 				__cpumask_set_cpu(cpu, tmpmask);
79 			else
80 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
81 		}
82 		rcu_read_unlock();
83 	}
84 	if (!fallback) {
85 		preempt_disable();
86 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
87 		preempt_enable();
88 		free_cpumask_var(tmpmask);
89 	}
90 	cpus_read_unlock();
91 
92 	/*
93 	 * Memory barrier on the caller thread _after_ we finished
94 	 * waiting for the last IPI. Matches memory barriers around
95 	 * rq->curr modification in scheduler.
96 	 */
97 	smp_mb();	/* exit from system call is not a mb */
98 	return 0;
99 }
100 
101 static int membarrier_private_expedited(int flags)
102 {
103 	int cpu;
104 	bool fallback = false;
105 	cpumask_var_t tmpmask;
106 
107 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
108 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
109 			return -EINVAL;
110 		if (!(atomic_read(&current->mm->membarrier_state) &
111 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
112 			return -EPERM;
113 	} else {
114 		if (!(atomic_read(&current->mm->membarrier_state) &
115 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
116 			return -EPERM;
117 	}
118 
119 	if (num_online_cpus() == 1)
120 		return 0;
121 
122 	/*
123 	 * Matches memory barriers around rq->curr modification in
124 	 * scheduler.
125 	 */
126 	smp_mb();	/* system call entry is not a mb. */
127 
128 	/*
129 	 * Expedited membarrier commands guarantee that they won't
130 	 * block, hence the GFP_NOWAIT allocation flag and fallback
131 	 * implementation.
132 	 */
133 	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
134 		/* Fallback for OOM. */
135 		fallback = true;
136 	}
137 
138 	cpus_read_lock();
139 	for_each_online_cpu(cpu) {
140 		struct task_struct *p;
141 
142 		/*
143 		 * Skipping the current CPU is OK even through we can be
144 		 * migrated at any point. The current CPU, at the point
145 		 * where we read raw_smp_processor_id(), is ensured to
146 		 * be in program order with respect to the caller
147 		 * thread. Therefore, we can skip this CPU from the
148 		 * iteration.
149 		 */
150 		if (cpu == raw_smp_processor_id())
151 			continue;
152 		rcu_read_lock();
153 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
154 		if (p && p->mm == current->mm) {
155 			if (!fallback)
156 				__cpumask_set_cpu(cpu, tmpmask);
157 			else
158 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
159 		}
160 		rcu_read_unlock();
161 	}
162 	if (!fallback) {
163 		preempt_disable();
164 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
165 		preempt_enable();
166 		free_cpumask_var(tmpmask);
167 	}
168 	cpus_read_unlock();
169 
170 	/*
171 	 * Memory barrier on the caller thread _after_ we finished
172 	 * waiting for the last IPI. Matches memory barriers around
173 	 * rq->curr modification in scheduler.
174 	 */
175 	smp_mb();	/* exit from system call is not a mb */
176 
177 	return 0;
178 }
179 
180 static int membarrier_register_global_expedited(void)
181 {
182 	struct task_struct *p = current;
183 	struct mm_struct *mm = p->mm;
184 
185 	if (atomic_read(&mm->membarrier_state) &
186 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
187 		return 0;
188 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
189 	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
190 		/*
191 		 * For single mm user, single threaded process, we can
192 		 * simply issue a memory barrier after setting
193 		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
194 		 * no memory access following registration is reordered
195 		 * before registration.
196 		 */
197 		smp_mb();
198 	} else {
199 		/*
200 		 * For multi-mm user threads, we need to ensure all
201 		 * future scheduler executions will observe the new
202 		 * thread flag state for this mm.
203 		 */
204 		synchronize_rcu();
205 	}
206 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
207 		  &mm->membarrier_state);
208 
209 	return 0;
210 }
211 
212 static int membarrier_register_private_expedited(int flags)
213 {
214 	struct task_struct *p = current;
215 	struct mm_struct *mm = p->mm;
216 	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
217 
218 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
219 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
220 			return -EINVAL;
221 		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
222 	}
223 
224 	/*
225 	 * We need to consider threads belonging to different thread
226 	 * groups, which use the same mm. (CLONE_VM but not
227 	 * CLONE_THREAD).
228 	 */
229 	if (atomic_read(&mm->membarrier_state) & state)
230 		return 0;
231 	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
232 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
233 		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
234 			  &mm->membarrier_state);
235 	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
236 		/*
237 		 * Ensure all future scheduler executions will observe the
238 		 * new thread flag state for this process.
239 		 */
240 		synchronize_rcu();
241 	}
242 	atomic_or(state, &mm->membarrier_state);
243 
244 	return 0;
245 }
246 
247 /**
248  * sys_membarrier - issue memory barriers on a set of threads
249  * @cmd:   Takes command values defined in enum membarrier_cmd.
250  * @flags: Currently needs to be 0. For future extensions.
251  *
252  * If this system call is not implemented, -ENOSYS is returned. If the
253  * command specified does not exist, not available on the running
254  * kernel, or if the command argument is invalid, this system call
255  * returns -EINVAL. For a given command, with flags argument set to 0,
256  * this system call is guaranteed to always return the same value until
257  * reboot.
258  *
259  * All memory accesses performed in program order from each targeted thread
260  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
261  * the semantic "barrier()" to represent a compiler barrier forcing memory
262  * accesses to be performed in program order across the barrier, and
263  * smp_mb() to represent explicit memory barriers forcing full memory
264  * ordering across the barrier, we have the following ordering table for
265  * each pair of barrier(), sys_membarrier() and smp_mb():
266  *
267  * The pair ordering is detailed as (O: ordered, X: not ordered):
268  *
269  *                        barrier()   smp_mb() sys_membarrier()
270  *        barrier()          X           X            O
271  *        smp_mb()           X           O            O
272  *        sys_membarrier()   O           O            O
273  */
274 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
275 {
276 	if (unlikely(flags))
277 		return -EINVAL;
278 	switch (cmd) {
279 	case MEMBARRIER_CMD_QUERY:
280 	{
281 		int cmd_mask = MEMBARRIER_CMD_BITMASK;
282 
283 		if (tick_nohz_full_enabled())
284 			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
285 		return cmd_mask;
286 	}
287 	case MEMBARRIER_CMD_GLOBAL:
288 		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
289 		if (tick_nohz_full_enabled())
290 			return -EINVAL;
291 		if (num_online_cpus() > 1)
292 			synchronize_rcu();
293 		return 0;
294 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
295 		return membarrier_global_expedited();
296 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
297 		return membarrier_register_global_expedited();
298 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
299 		return membarrier_private_expedited(0);
300 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
301 		return membarrier_register_private_expedited(0);
302 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
303 		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
304 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
305 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
306 	default:
307 		return -EINVAL;
308 	}
309 }
310