// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses performed on remote CPUs before the
	 * IPI become visible to membarrier()'s caller -- see scenario B in
	 * the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}

static void ipi_rseq(void *info)
{
	/*
	 * Ensure that all stores done by the calling thread are visible
	 * to the current task before the current task resumes. We could
	 * probably optimize this away on most architectures, but by the
	 * time we've already sent an IPI, the cost of the extra smp_mb()
	 * is negligible.
	 */
	smp_mb();
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}
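/*
 * Illustrative userspace sketch (not part of the kernel build): probing the
 * command mask derived from MEMBARRIER_CMD_BITMASK above, via
 * MEMBARRIER_CMD_QUERY, before relying on an expedited command. The
 * membarrier() wrapper is a local helper assumed here because glibc does
 * not provide one; error handling is kept minimal.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	static int membarrier(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	int main(void)
 *	{
 *		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
 *
 *		if (mask < 0)
 *			return 1;	// membarrier() not implemented
 *		if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)
 *			printf("private expedited membarrier supported\n");
 *		return 0;
 *	}
 */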
static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}
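/*
 * Illustrative userspace sketch (not part of the kernel build): the
 * register-then-use pattern served by membarrier_global_expedited() above
 * and membarrier_register_global_expedited() below. Registration sets
 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the mm, so that later
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED calls, possibly issued by other
 * processes, IPI the CPUs currently running this mm. The membarrier()
 * wrapper is the local helper from the sketch above; handle_error() is a
 * placeholder.
 *
 *	// Once, at process start-up:
 *	if (membarrier(MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0, 0))
 *		handle_error();
 *
 *	// Later, wherever a barrier over all registered processes is needed:
 *	if (membarrier(MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0, 0))
 *		handle_error();
 */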
static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	if (cpu_id >= 0) {
		/*
		 * smp_call_function_single() will call ipi_func() if cpu_id
		 * is the calling CPU.
		 */
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	} else {
		/*
		 * For regular membarrier, we can save a few cycles by
		 * skipping the current cpu -- we're about to do smp_mb()
		 * below, and if we migrate to a different cpu, this cpu
		 * and the new cpu will execute a full barrier in the
		 * scheduler.
		 *
		 * For SYNC_CORE, we do need a barrier on the current cpu --
		 * otherwise, if we are migrated and replaced by a different
		 * task in the same mm just before, during, or after
		 * membarrier, we will end up with some thread in the mm
		 * running without a core sync.
		 *
		 * For RSEQ, don't rseq_preempt() the caller. User code
		 * is not supposed to issue syscalls at all from inside an
		 * rseq critical section.
		 */
		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
			preempt_disable();
			smp_call_function_many(tmpmask, ipi_func, NULL, true);
			preempt_enable();
		} else {
			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
		}
	}

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}
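/*
 * Illustrative userspace sketch (not part of the kernel build): the
 * SYNC_CORE flavour handled by membarrier_private_expedited() above. A
 * typical user is a JIT: after writing new instructions it issues
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE so that every thread of the
 * process goes through a core-serializing instruction before it can execute
 * the freshly written code. write_jit_code(), publish_code(),
 * handle_error() and the membarrier() wrapper are placeholders for this
 * sketch.
 *
 *	// Once, at start-up:
 *	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
 *
 *	// After writing/patching executable memory:
 *	write_jit_code(buf, len);
 *	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
 *		handle_error();
 *	publish_code(buf);	// other threads may now jump to buf
 */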
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that
	 * all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue
	 * scheduling between threads which are users of @mm has its
	 * membarrier state updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}
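/*
 * Illustrative userspace sketch (not part of the kernel build): the RSEQ
 * flavour with MEMBARRIER_CMD_FLAG_CPU, which membarrier_private_expedited()
 * services via ipi_rseq() on a single CPU. The caller can use it to
 * interrupt (restart) an rseq critical section that might currently be
 * running on one specific CPU. target_cpu, handle_error() and the
 * membarrier() wrapper are placeholders for this sketch.
 *
 *	// Once, at start-up:
 *	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0);
 *
 *	// Interrupt any rseq critical section running on target_cpu:
 *	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
 *		       MEMBARRIER_CMD_FLAG_CPU, target_cpu))
 *		handle_error();
 */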
/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          RSEQ CS should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
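/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * barrier()/sys_membarrier() pairing in the ordering table above: the
 * fast-path thread uses only compiler barriers, while the slow-path thread
 * pays for the expedited membarrier. Per that table, if the slow path reads
 * flag == 1 before the membarrier, the read of data after the membarrier
 * observes data == 1. Assumes MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED was
 * issued earlier; the membarrier() wrapper and handle_error() are the same
 * placeholders as in the earlier sketches, and plain volatile accesses stand
 * in for proper atomics to keep the sketch short.
 *
 *	static volatile int data, flag;
 *
 *	#define barrier()	__asm__ __volatile__("" ::: "memory")
 *
 *	void fast_path(void)		// runs frequently, stays cheap
 *	{
 *		data = 1;
 *		barrier();
 *		flag = 1;
 *	}
 *
 *	void slow_path(void)		// runs rarely, may pay for IPIs
 *	{
 *		int f = flag;
 *
 *		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0))
 *			handle_error();
 *		if (f)
 *			assert(data == 1);
 *	}
 */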