1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Author: Andrei Vagin <avagin@openvz.org> 4 * Author: Dmitry Safonov <dima@arista.com> 5 */ 6 7 #include <linux/time_namespace.h> 8 #include <linux/user_namespace.h> 9 #include <linux/sched/signal.h> 10 #include <linux/sched/task.h> 11 #include <linux/seq_file.h> 12 #include <linux/proc_ns.h> 13 #include <linux/export.h> 14 #include <linux/time.h> 15 #include <linux/slab.h> 16 #include <linux/cred.h> 17 #include <linux/err.h> 18 #include <linux/mm.h> 19 20 #include <vdso/datapage.h> 21 22 ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, 23 struct timens_offsets *ns_offsets) 24 { 25 ktime_t offset; 26 27 switch (clockid) { 28 case CLOCK_MONOTONIC: 29 offset = timespec64_to_ktime(ns_offsets->monotonic); 30 break; 31 case CLOCK_BOOTTIME: 32 case CLOCK_BOOTTIME_ALARM: 33 offset = timespec64_to_ktime(ns_offsets->boottime); 34 break; 35 default: 36 return tim; 37 } 38 39 /* 40 * Check that @tim value is in [offset, KTIME_MAX + offset] 41 * and subtract offset. 42 */ 43 if (tim < offset) { 44 /* 45 * User can specify @tim *absolute* value - if it's lesser than 46 * the time namespace's offset - it's already expired. 47 */ 48 tim = 0; 49 } else { 50 tim = ktime_sub(tim, offset); 51 if (unlikely(tim > KTIME_MAX)) 52 tim = KTIME_MAX; 53 } 54 55 return tim; 56 } 57 58 static struct ucounts *inc_time_namespaces(struct user_namespace *ns) 59 { 60 return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); 61 } 62 63 static void dec_time_namespaces(struct ucounts *ucounts) 64 { 65 dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); 66 } 67 68 /** 69 * clone_time_ns - Clone a time namespace 70 * @user_ns: User namespace which owns a new namespace. 71 * @old_ns: Namespace to clone 72 * 73 * Clone @old_ns and set the clone refcount to 1 74 * 75 * Return: The new namespace or ERR_PTR. 76 */ 77 static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, 78 struct time_namespace *old_ns) 79 { 80 struct time_namespace *ns; 81 struct ucounts *ucounts; 82 int err; 83 84 err = -ENOSPC; 85 ucounts = inc_time_namespaces(user_ns); 86 if (!ucounts) 87 goto fail; 88 89 err = -ENOMEM; 90 ns = kmalloc(sizeof(*ns), GFP_KERNEL); 91 if (!ns) 92 goto fail_dec; 93 94 kref_init(&ns->kref); 95 96 ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 97 if (!ns->vvar_page) 98 goto fail_free; 99 100 err = ns_alloc_inum(&ns->ns); 101 if (err) 102 goto fail_free_page; 103 104 ns->ucounts = ucounts; 105 ns->ns.ops = &timens_operations; 106 ns->user_ns = get_user_ns(user_ns); 107 ns->offsets = old_ns->offsets; 108 ns->frozen_offsets = false; 109 return ns; 110 111 fail_free_page: 112 __free_page(ns->vvar_page); 113 fail_free: 114 kfree(ns); 115 fail_dec: 116 dec_time_namespaces(ucounts); 117 fail: 118 return ERR_PTR(err); 119 } 120 121 /** 122 * copy_time_ns - Create timens_for_children from @old_ns 123 * @flags: Cloning flags 124 * @user_ns: User namespace which owns a new namespace. 125 * @old_ns: Namespace to clone 126 * 127 * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; 128 * adds a refcounter to @old_ns otherwise. 129 * 130 * Return: timens_for_children namespace or ERR_PTR. 131 */ 132 struct time_namespace *copy_time_ns(unsigned long flags, 133 struct user_namespace *user_ns, struct time_namespace *old_ns) 134 { 135 if (!(flags & CLONE_NEWTIME)) 136 return get_time_ns(old_ns); 137 138 return clone_time_ns(user_ns, old_ns); 139 } 140 141 static struct timens_offset offset_from_ts(struct timespec64 off) 142 { 143 struct timens_offset ret; 144 145 ret.sec = off.tv_sec; 146 ret.nsec = off.tv_nsec; 147 148 return ret; 149 } 150 151 /* 152 * A time namespace VVAR page has the same layout as the VVAR page which 153 * contains the system wide VDSO data. 154 * 155 * For a normal task the VVAR pages are installed in the normal ordering: 156 * VVAR 157 * PVCLOCK 158 * HVCLOCK 159 * TIMENS <- Not really required 160 * 161 * Now for a timens task the pages are installed in the following order: 162 * TIMENS 163 * PVCLOCK 164 * HVCLOCK 165 * VVAR 166 * 167 * The check for vdso_data->clock_mode is in the unlikely path of 168 * the seq begin magic. So for the non-timens case most of the time 169 * 'seq' is even, so the branch is not taken. 170 * 171 * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check 172 * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the 173 * update to finish and for 'seq' to become even anyway. 174 * 175 * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces 176 * the time namespace handling path. 177 */ 178 static void timens_setup_vdso_data(struct vdso_data *vdata, 179 struct time_namespace *ns) 180 { 181 struct timens_offset *offset = vdata->offset; 182 struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); 183 struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); 184 185 vdata->seq = 1; 186 vdata->clock_mode = VCLOCK_TIMENS; 187 offset[CLOCK_MONOTONIC] = monotonic; 188 offset[CLOCK_MONOTONIC_RAW] = monotonic; 189 offset[CLOCK_MONOTONIC_COARSE] = monotonic; 190 offset[CLOCK_BOOTTIME] = boottime; 191 offset[CLOCK_BOOTTIME_ALARM] = boottime; 192 } 193 194 /* 195 * Protects possibly multiple offsets writers racing each other 196 * and tasks entering the namespace. 197 */ 198 static DEFINE_MUTEX(offset_lock); 199 200 static void timens_set_vvar_page(struct task_struct *task, 201 struct time_namespace *ns) 202 { 203 struct vdso_data *vdata; 204 unsigned int i; 205 206 if (ns == &init_time_ns) 207 return; 208 209 /* Fast-path, taken by every task in namespace except the first. */ 210 if (likely(ns->frozen_offsets)) 211 return; 212 213 mutex_lock(&offset_lock); 214 /* Nothing to-do: vvar_page has been already initialized. */ 215 if (ns->frozen_offsets) 216 goto out; 217 218 ns->frozen_offsets = true; 219 vdata = arch_get_vdso_data(page_address(ns->vvar_page)); 220 221 for (i = 0; i < CS_BASES; i++) 222 timens_setup_vdso_data(&vdata[i], ns); 223 224 out: 225 mutex_unlock(&offset_lock); 226 } 227 228 void free_time_ns(struct kref *kref) 229 { 230 struct time_namespace *ns; 231 232 ns = container_of(kref, struct time_namespace, kref); 233 dec_time_namespaces(ns->ucounts); 234 put_user_ns(ns->user_ns); 235 ns_free_inum(&ns->ns); 236 __free_page(ns->vvar_page); 237 kfree(ns); 238 } 239 240 static struct time_namespace *to_time_ns(struct ns_common *ns) 241 { 242 return container_of(ns, struct time_namespace, ns); 243 } 244 245 static struct ns_common *timens_get(struct task_struct *task) 246 { 247 struct time_namespace *ns = NULL; 248 struct nsproxy *nsproxy; 249 250 task_lock(task); 251 nsproxy = task->nsproxy; 252 if (nsproxy) { 253 ns = nsproxy->time_ns; 254 get_time_ns(ns); 255 } 256 task_unlock(task); 257 258 return ns ? &ns->ns : NULL; 259 } 260 261 static struct ns_common *timens_for_children_get(struct task_struct *task) 262 { 263 struct time_namespace *ns = NULL; 264 struct nsproxy *nsproxy; 265 266 task_lock(task); 267 nsproxy = task->nsproxy; 268 if (nsproxy) { 269 ns = nsproxy->time_ns_for_children; 270 get_time_ns(ns); 271 } 272 task_unlock(task); 273 274 return ns ? &ns->ns : NULL; 275 } 276 277 static void timens_put(struct ns_common *ns) 278 { 279 put_time_ns(to_time_ns(ns)); 280 } 281 282 static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) 283 { 284 struct time_namespace *ns = to_time_ns(new); 285 int err; 286 287 if (!current_is_single_threaded()) 288 return -EUSERS; 289 290 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 291 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 292 return -EPERM; 293 294 timens_set_vvar_page(current, ns); 295 296 err = vdso_join_timens(current, ns); 297 if (err) 298 return err; 299 300 get_time_ns(ns); 301 put_time_ns(nsproxy->time_ns); 302 nsproxy->time_ns = ns; 303 304 get_time_ns(ns); 305 put_time_ns(nsproxy->time_ns_for_children); 306 nsproxy->time_ns_for_children = ns; 307 return 0; 308 } 309 310 int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) 311 { 312 struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; 313 struct time_namespace *ns = to_time_ns(nsc); 314 int err; 315 316 /* create_new_namespaces() already incremented the ref counter */ 317 if (nsproxy->time_ns == nsproxy->time_ns_for_children) 318 return 0; 319 320 timens_set_vvar_page(tsk, ns); 321 322 err = vdso_join_timens(tsk, ns); 323 if (err) 324 return err; 325 326 get_time_ns(ns); 327 put_time_ns(nsproxy->time_ns); 328 nsproxy->time_ns = ns; 329 330 return 0; 331 } 332 333 static struct user_namespace *timens_owner(struct ns_common *ns) 334 { 335 return to_time_ns(ns)->user_ns; 336 } 337 338 static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) 339 { 340 seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); 341 } 342 343 void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) 344 { 345 struct ns_common *ns; 346 struct time_namespace *time_ns; 347 348 ns = timens_for_children_get(p); 349 if (!ns) 350 return; 351 time_ns = to_time_ns(ns); 352 353 show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); 354 show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); 355 put_time_ns(time_ns); 356 } 357 358 int proc_timens_set_offset(struct file *file, struct task_struct *p, 359 struct proc_timens_offset *offsets, int noffsets) 360 { 361 struct ns_common *ns; 362 struct time_namespace *time_ns; 363 struct timespec64 tp; 364 int i, err; 365 366 ns = timens_for_children_get(p); 367 if (!ns) 368 return -ESRCH; 369 time_ns = to_time_ns(ns); 370 371 if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { 372 put_time_ns(time_ns); 373 return -EPERM; 374 } 375 376 for (i = 0; i < noffsets; i++) { 377 struct proc_timens_offset *off = &offsets[i]; 378 379 switch (off->clockid) { 380 case CLOCK_MONOTONIC: 381 ktime_get_ts64(&tp); 382 break; 383 case CLOCK_BOOTTIME: 384 ktime_get_boottime_ts64(&tp); 385 break; 386 default: 387 err = -EINVAL; 388 goto out; 389 } 390 391 err = -ERANGE; 392 393 if (off->val.tv_sec > KTIME_SEC_MAX || 394 off->val.tv_sec < -KTIME_SEC_MAX) 395 goto out; 396 397 tp = timespec64_add(tp, off->val); 398 /* 399 * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is 400 * still unreachable. 401 */ 402 if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) 403 goto out; 404 } 405 406 mutex_lock(&offset_lock); 407 if (time_ns->frozen_offsets) { 408 err = -EACCES; 409 goto out_unlock; 410 } 411 412 err = 0; 413 /* Don't report errors after this line */ 414 for (i = 0; i < noffsets; i++) { 415 struct proc_timens_offset *off = &offsets[i]; 416 struct timespec64 *offset = NULL; 417 418 switch (off->clockid) { 419 case CLOCK_MONOTONIC: 420 offset = &time_ns->offsets.monotonic; 421 break; 422 case CLOCK_BOOTTIME: 423 offset = &time_ns->offsets.boottime; 424 break; 425 } 426 427 *offset = off->val; 428 } 429 430 out_unlock: 431 mutex_unlock(&offset_lock); 432 out: 433 put_time_ns(time_ns); 434 435 return err; 436 } 437 438 const struct proc_ns_operations timens_operations = { 439 .name = "time", 440 .type = CLONE_NEWTIME, 441 .get = timens_get, 442 .put = timens_put, 443 .install = timens_install, 444 .owner = timens_owner, 445 }; 446 447 const struct proc_ns_operations timens_for_children_operations = { 448 .name = "time_for_children", 449 .type = CLONE_NEWTIME, 450 .get = timens_for_children_get, 451 .put = timens_put, 452 .install = timens_install, 453 .owner = timens_owner, 454 }; 455 456 struct time_namespace init_time_ns = { 457 .kref = KREF_INIT(3), 458 .user_ns = &init_user_ns, 459 .ns.inum = PROC_TIME_INIT_INO, 460 .ns.ops = &timens_operations, 461 .frozen_offsets = true, 462 }; 463 464 static int __init time_ns_init(void) 465 { 466 return 0; 467 } 468 subsys_initcall(time_ns_init); 469