// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from a bitwise OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
        (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		\
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK					\
        (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
        | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED		\
        | MEMBARRIER_CMD_PRIVATE_EXPEDITED			\
        | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		\
        | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
        smp_mb();       /* IPIs should be serializing but paranoid. */
}

static int membarrier_global_expedited(void)
{
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;

        if (num_online_cpus() == 1)
                return 0;

        /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */

        /*
         * Expedited membarrier commands guarantee that they won't
         * block, hence the GFP_NOWAIT allocation flag and fallback
         * implementation.
         */
        if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
                /* Fallback for OOM. */
                fallback = true;
        }

        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;

                /*
                 * Skipping the current CPU is OK even though we can be
                 * migrated at any point. The current CPU, at the point
                 * where we read raw_smp_processor_id(), is ensured to
                 * be in program order with respect to the caller
                 * thread. Therefore, we can skip this CPU from the
                 * iteration.
                 */
                if (cpu == raw_smp_processor_id())
                        continue;

                rcu_read_lock();
                p = task_rcu_dereference(&cpu_rq(cpu)->curr);
                if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
                                   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
                        if (!fallback)
                                __cpumask_set_cpu(cpu, tmpmask);
                        else
                                smp_call_function_single(cpu, ipi_mb, NULL, 1);
                }
                rcu_read_unlock();
        }
        if (!fallback) {
                preempt_disable();
                smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
                preempt_enable();
                free_cpumask_var(tmpmask);
        }
        cpus_read_unlock();

        /*
         * Memory barrier on the caller thread _after_ we finished
         * waiting for the last IPI. Matches memory barriers around
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
        return 0;
}
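
/*
 * Illustrative userspace sketch (not part of this file): how a process would
 * typically drive the global expedited pair implemented above. It registers
 * once with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, then issues
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED whenever it needs every CPU currently
 * running a thread of a registered process to execute a full memory barrier.
 * The membarrier() wrapper below is a hypothetical helper around syscall(2),
 * not something defined by this file.
 *
 *      #include <linux/membarrier.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *      #include <stdlib.h>
 *
 *      static int membarrier(int cmd, int flags)
 *      {
 *              return syscall(__NR_membarrier, cmd, flags);
 *      }
 *
 *      static void my_init(void)
 *      {
 *              if (membarrier(MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0))
 *                      abort();
 *      }
 *
 *      static void global_barrier(void)
 *      {
 *              if (membarrier(MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0))
 *                      abort();
 *      }
 */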

static int membarrier_private_expedited(int flags)
{
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;

        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                if (!(atomic_read(&current->mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
        } else {
                if (!(atomic_read(&current->mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
                        return -EPERM;
        }

        if (num_online_cpus() == 1)
                return 0;

        /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */

        /*
         * Expedited membarrier commands guarantee that they won't
         * block, hence the GFP_NOWAIT allocation flag and fallback
         * implementation.
         */
        if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
                /* Fallback for OOM. */
                fallback = true;
        }

        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;

                /*
                 * Skipping the current CPU is OK even though we can be
                 * migrated at any point. The current CPU, at the point
                 * where we read raw_smp_processor_id(), is ensured to
                 * be in program order with respect to the caller
                 * thread. Therefore, we can skip this CPU from the
                 * iteration.
                 */
                if (cpu == raw_smp_processor_id())
                        continue;
                rcu_read_lock();
                p = task_rcu_dereference(&cpu_rq(cpu)->curr);
                if (p && p->mm == current->mm) {
                        if (!fallback)
                                __cpumask_set_cpu(cpu, tmpmask);
                        else
                                smp_call_function_single(cpu, ipi_mb, NULL, 1);
                }
                rcu_read_unlock();
        }
        if (!fallback) {
                preempt_disable();
                smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
                preempt_enable();
                free_cpumask_var(tmpmask);
        }
        cpus_read_unlock();

        /*
         * Memory barrier on the caller thread _after_ we finished
         * waiting for the last IPI. Matches memory barriers around
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */

        return 0;
}

static int membarrier_register_global_expedited(void)
{
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;

        if (atomic_read(&mm->membarrier_state) &
            MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
                return 0;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
        if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
                /*
                 * For a single-mm-user, single-threaded process, we can
                 * simply issue a memory barrier after setting
                 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
                 * no memory access following registration is reordered
                 * before registration.
                 */
                smp_mb();
        } else {
                /*
                 * For a mm with multiple users or a multi-threaded
                 * process, we need to ensure all future scheduler
                 * executions will observe the new membarrier state for
                 * this mm.
                 */
                synchronize_rcu();
        }
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
                  &mm->membarrier_state);

        return 0;
}
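
/*
 * Illustrative userspace sketch (not part of this file): the private
 * expedited commands must be preceded by registration, otherwise
 * membarrier_private_expedited() above returns -EPERM. A library would
 * typically register once per process, then use the expedited command on
 * its slow path. membarrier() names a hypothetical syscall(2) wrapper like
 * the one sketched earlier in this file.
 *
 *      static void library_init(void)
 *      {
 *              if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
 *                      abort();
 *      }
 *
 *      static void barrier_all_threads_of_this_process(void)
 *      {
 *              if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
 *                      abort();
 *      }
 */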

static int membarrier_register_private_expedited(int flags)
{
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
        int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;

        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
        }

        /*
         * We need to consider threads belonging to different thread
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
        if (atomic_read(&mm->membarrier_state) & state)
                return 0;
        atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
                atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
                          &mm->membarrier_state);
        if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
                /*
                 * Ensure all future scheduler executions will observe the
                 * new thread flag state for this process.
                 */
                synchronize_rcu();
        }
        atomic_or(state, &mm->membarrier_state);

        return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * this system call is guaranteed to always return the same value until
 * reboot.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
        if (unlikely(flags))
                return -EINVAL;
        switch (cmd) {
        case MEMBARRIER_CMD_QUERY:
        {
                int cmd_mask = MEMBARRIER_CMD_BITMASK;

                if (tick_nohz_full_enabled())
                        cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
                return cmd_mask;
        }
        case MEMBARRIER_CMD_GLOBAL:
                /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
                if (tick_nohz_full_enabled())
                        return -EINVAL;
                if (num_online_cpus() > 1)
                        synchronize_rcu();
                return 0;
        case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
                return membarrier_global_expedited();
        case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
                return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
                return membarrier_private_expedited(0);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
                return membarrier_register_private_expedited(0);
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        default:
                return -EINVAL;
        }
}
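
/*
 * Illustrative userspace sketch (not part of this file): because
 * MEMBARRIER_CMD_QUERY returns the bitmask of supported commands, and
 * because MEMBARRIER_CMD_GLOBAL is masked out on nohz_full kernels (see
 * above), callers are expected to probe before relying on a given command.
 * membarrier() again names a hypothetical syscall(2) wrapper, and
 * use_fallback_barrier() stands in for whatever fallback the caller has.
 *
 *      int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);
 *
 *      if (mask < 0 || !(mask & MEMBARRIER_CMD_GLOBAL))
 *              use_fallback_barrier();
 *      else
 *              membarrier(MEMBARRIER_CMD_GLOBAL, 0);
 */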