// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags)
{
	int cpu;
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that
	 * all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue
	 * scheduling between threads which are users of @mm has its
	 * membarrier state updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}
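
/*
 * Note on the registration scheme implemented above: the command-specific
 * state bit is set in mm->membarrier_state first, the runqueues' copies of
 * that state are then brought in sync by sync_runqueues_membarrier_state(),
 * and only after that is the matching *_READY bit published. The private
 * expedited commands test the corresponding *_READY bit and return -EPERM
 * until registration has completed; the global expedited command instead
 * consults the per-runqueue membarrier_state when building its IPI mask.
 */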

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
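
/*
 * Illustrative user-space usage (not part of this file): a minimal sketch,
 * assuming a kernel exposing this syscall and the command definitions from
 * the <linux/membarrier.h> UAPI header. No libc wrapper is assumed, so the
 * sketch goes through syscall(2); on failure syscall() returns -1 and sets
 * errno (e.g. EPERM if the process has not registered yet).
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		long mask;
 *
 *		// Query the commands supported by the running kernel.
 *		mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);
 *		if (mask < 0 ||
 *		    !(mask & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED))
 *			return 1;
 *
 *		// Register the process: required once per mm before the
 *		// private expedited command may be used.
 *		if (syscall(__NR_membarrier,
 *			    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
 *			return 1;
 *
 *		// Issue memory barriers on all other threads of this
 *		// process that are running at this point.
 *		if (syscall(__NR_membarrier,
 *			    MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
 *			return 1;
 *		printf("membarrier private expedited completed\n");
 *		return 0;
 *	}
 */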