// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags)
{
	int cpu;
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that
	 * all @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier().
 * If we use the semantic "barrier()" to represent a compiler barrier forcing
 * memory accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb()   sys_membarrier()
 *        barrier()          X           X              O
 *        smp_mb()           X           O              O
 *        sys_membarrier()   O           O              O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
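/*
 * A minimal userspace usage sketch for the API documented above, assuming
 * only the raw syscall(2) wrapper and the uapi constants from
 * <linux/membarrier.h> (no libc membarrier() wrapper). It illustrates the
 * register-then-use requirement enforced by the -EPERM checks in
 * membarrier_private_expedited(), and how the expedited command acts as the
 * slow path of an asymmetric fence, pairing with plain program order on the
 * other threads (the "O" entries in the ordering table). This is a sketch of
 * one possible caller under those assumptions, not a definitive client.
 */
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

static int membarrier(int cmd, int flags)
{
	/* Go through syscall(2) directly; no glibc wrapper is assumed. */
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int supported;

	/* MEMBARRIER_CMD_QUERY returns a bitmask of supported commands. */
	supported = membarrier(MEMBARRIER_CMD_QUERY, 0);
	if (supported < 0 ||
	    !(supported & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier unsupported\n");
		return EXIT_FAILURE;
	}

	/*
	 * Register first: until MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY is
	 * set on this mm, MEMBARRIER_CMD_PRIVATE_EXPEDITED returns -EPERM.
	 */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		return EXIT_FAILURE;

	/*
	 * Slow path of an asymmetric fence: when this call returns, every
	 * thread of this process that was running on another CPU has
	 * executed a full memory barrier, so fast-path threads that relied
	 * only on program order / compiler barriers are ordered with
	 * respect to the caller.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}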