// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_rseq(void *info)
{
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		if (cpu_id == raw_smp_processor_id())
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			/*
			 * Skipping the current CPU is OK even though we can be
			 * migrated at any point. The current CPU, at the point
			 * where we read raw_smp_processor_id(), is ensured to
			 * be in program order with respect to the caller
			 * thread. Therefore, we can skip this CPU from the
			 * iteration.
			 */
			if (cpu == raw_smp_processor_id())
				continue;
			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	preempt_disable();
	if (cpu_id >= 0)
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	else
		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
	preempt_enable();

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          RSEQ CS should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
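
/*
 * For reference, a minimal userspace sketch of how the register/expedited
 * command pair handled above is typically driven. This is illustrative only
 * and not part of the kernel build: it invokes the raw syscall through
 * syscall(2) rather than assuming a libc wrapper, and the membarrier()
 * helper below is a hypothetical convenience, not an existing API. The
 * command constants come from the UAPI header <linux/membarrier.h>.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	// Hypothetical helper wrapping the raw system call.
 *	static int membarrier(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	int main(void)
 *	{
 *		// Register once per process; until this succeeds the
 *		// expedited command below fails with -EPERM.
 *		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
 *			err(1, "membarrier register");
 *
 *		// Order memory accesses against every thread of this
 *		// process currently running on a CPU.
 *		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0))
 *			err(1, "membarrier");
 *		return 0;
 *	}
 */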