1 /* 2 * device_cgroup.c - device cgroup subsystem 3 * 4 * Copyright 2007 IBM Corp 5 */ 6 7 #include <linux/device_cgroup.h> 8 #include <linux/cgroup.h> 9 #include <linux/ctype.h> 10 #include <linux/list.h> 11 #include <linux/uaccess.h> 12 #include <linux/seq_file.h> 13 #include <linux/slab.h> 14 #include <linux/rcupdate.h> 15 #include <linux/mutex.h> 16 17 #define ACC_MKNOD 1 18 #define ACC_READ 2 19 #define ACC_WRITE 4 20 #define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) 21 22 #define DEV_BLOCK 1 23 #define DEV_CHAR 2 24 #define DEV_ALL 4 /* this represents all devices */ 25 26 static DEFINE_MUTEX(devcgroup_mutex); 27 28 /* 29 * exception list locking rules: 30 * hold devcgroup_mutex for update/read. 31 * hold rcu_read_lock() for read. 32 */ 33 34 struct dev_exception_item { 35 u32 major, minor; 36 short type; 37 short access; 38 struct list_head list; 39 struct rcu_head rcu; 40 }; 41 42 struct dev_cgroup { 43 struct cgroup_subsys_state css; 44 struct list_head exceptions; 45 enum { 46 DEVCG_DEFAULT_ALLOW, 47 DEVCG_DEFAULT_DENY, 48 } behavior; 49 }; 50 51 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 52 { 53 return container_of(s, struct dev_cgroup, css); 54 } 55 56 static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) 57 { 58 return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); 59 } 60 61 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 62 { 63 return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); 64 } 65 66 struct cgroup_subsys devices_subsys; 67 68 static int devcgroup_can_attach(struct cgroup *new_cgrp, 69 struct cgroup_taskset *set) 70 { 71 struct task_struct *task = cgroup_taskset_first(set); 72 73 if (current != task && !capable(CAP_SYS_ADMIN)) 74 return -EPERM; 75 return 0; 76 } 77 78 /* 79 * called under devcgroup_mutex 80 */ 81 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) 82 { 83 struct dev_exception_item *ex, *tmp, *new; 84 85 lockdep_assert_held(&devcgroup_mutex); 86 87 list_for_each_entry(ex, orig, list) { 88 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 89 if (!new) 90 goto free_and_exit; 91 list_add_tail(&new->list, dest); 92 } 93 94 return 0; 95 96 free_and_exit: 97 list_for_each_entry_safe(ex, tmp, dest, list) { 98 list_del(&ex->list); 99 kfree(ex); 100 } 101 return -ENOMEM; 102 } 103 104 /* 105 * called under devcgroup_mutex 106 */ 107 static int dev_exception_add(struct dev_cgroup *dev_cgroup, 108 struct dev_exception_item *ex) 109 { 110 struct dev_exception_item *excopy, *walk; 111 112 lockdep_assert_held(&devcgroup_mutex); 113 114 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 115 if (!excopy) 116 return -ENOMEM; 117 118 list_for_each_entry(walk, &dev_cgroup->exceptions, list) { 119 if (walk->type != ex->type) 120 continue; 121 if (walk->major != ex->major) 122 continue; 123 if (walk->minor != ex->minor) 124 continue; 125 126 walk->access |= ex->access; 127 kfree(excopy); 128 excopy = NULL; 129 } 130 131 if (excopy != NULL) 132 list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); 133 return 0; 134 } 135 136 /* 137 * called under devcgroup_mutex 138 */ 139 static void dev_exception_rm(struct dev_cgroup *dev_cgroup, 140 struct dev_exception_item *ex) 141 { 142 struct dev_exception_item *walk, *tmp; 143 144 lockdep_assert_held(&devcgroup_mutex); 145 146 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { 147 if (walk->type != ex->type) 148 continue; 149 if (walk->major != ex->major) 150 continue; 151 if (walk->minor != ex->minor) 152 continue; 153 154 walk->access &= ~ex->access; 155 if (!walk->access) { 156 list_del_rcu(&walk->list); 157 kfree_rcu(walk, rcu); 158 } 159 } 160 } 161 162 static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) 163 { 164 struct dev_exception_item *ex, *tmp; 165 166 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { 167 list_del_rcu(&ex->list); 168 kfree_rcu(ex, rcu); 169 } 170 } 171 172 /** 173 * dev_exception_clean - frees all entries of the exception list 174 * @dev_cgroup: dev_cgroup with the exception list to be cleaned 175 * 176 * called under devcgroup_mutex 177 */ 178 static void dev_exception_clean(struct dev_cgroup *dev_cgroup) 179 { 180 lockdep_assert_held(&devcgroup_mutex); 181 182 __dev_exception_clean(dev_cgroup); 183 } 184 185 /* 186 * called from kernel/cgroup.c with cgroup_lock() held. 187 */ 188 static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) 189 { 190 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; 191 struct cgroup *parent_cgroup; 192 int ret; 193 194 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); 195 if (!dev_cgroup) 196 return ERR_PTR(-ENOMEM); 197 INIT_LIST_HEAD(&dev_cgroup->exceptions); 198 parent_cgroup = cgroup->parent; 199 200 if (parent_cgroup == NULL) 201 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; 202 else { 203 parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup); 204 mutex_lock(&devcgroup_mutex); 205 ret = dev_exceptions_copy(&dev_cgroup->exceptions, 206 &parent_dev_cgroup->exceptions); 207 dev_cgroup->behavior = parent_dev_cgroup->behavior; 208 mutex_unlock(&devcgroup_mutex); 209 if (ret) { 210 kfree(dev_cgroup); 211 return ERR_PTR(ret); 212 } 213 } 214 215 return &dev_cgroup->css; 216 } 217 218 static void devcgroup_css_free(struct cgroup *cgroup) 219 { 220 struct dev_cgroup *dev_cgroup; 221 222 dev_cgroup = cgroup_to_devcgroup(cgroup); 223 __dev_exception_clean(dev_cgroup); 224 kfree(dev_cgroup); 225 } 226 227 #define DEVCG_ALLOW 1 228 #define DEVCG_DENY 2 229 #define DEVCG_LIST 3 230 231 #define MAJMINLEN 13 232 #define ACCLEN 4 233 234 static void set_access(char *acc, short access) 235 { 236 int idx = 0; 237 memset(acc, 0, ACCLEN); 238 if (access & ACC_READ) 239 acc[idx++] = 'r'; 240 if (access & ACC_WRITE) 241 acc[idx++] = 'w'; 242 if (access & ACC_MKNOD) 243 acc[idx++] = 'm'; 244 } 245 246 static char type_to_char(short type) 247 { 248 if (type == DEV_ALL) 249 return 'a'; 250 if (type == DEV_CHAR) 251 return 'c'; 252 if (type == DEV_BLOCK) 253 return 'b'; 254 return 'X'; 255 } 256 257 static void set_majmin(char *str, unsigned m) 258 { 259 if (m == ~0) 260 strcpy(str, "*"); 261 else 262 sprintf(str, "%u", m); 263 } 264 265 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, 266 struct seq_file *m) 267 { 268 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); 269 struct dev_exception_item *ex; 270 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 271 272 rcu_read_lock(); 273 /* 274 * To preserve the compatibility: 275 * - Only show the "all devices" when the default policy is to allow 276 * - List the exceptions in case the default policy is to deny 277 * This way, the file remains as a "whitelist of devices" 278 */ 279 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 280 set_access(acc, ACC_MASK); 281 set_majmin(maj, ~0); 282 set_majmin(min, ~0); 283 seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL), 284 maj, min, acc); 285 } else { 286 list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { 287 set_access(acc, ex->access); 288 set_majmin(maj, ex->major); 289 set_majmin(min, ex->minor); 290 seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), 291 maj, min, acc); 292 } 293 } 294 rcu_read_unlock(); 295 296 return 0; 297 } 298 299 /** 300 * may_access - verifies if a new exception is part of what is allowed 301 * by a dev cgroup based on the default policy + 302 * exceptions. This is used to make sure a child cgroup 303 * won't have more privileges than its parent or to 304 * verify if a certain access is allowed. 305 * @dev_cgroup: dev cgroup to be tested against 306 * @refex: new exception 307 */ 308 static bool may_access(struct dev_cgroup *dev_cgroup, 309 struct dev_exception_item *refex) 310 { 311 struct dev_exception_item *ex; 312 bool match = false; 313 314 rcu_lockdep_assert(rcu_read_lock_held() || 315 lockdep_is_held(&devcgroup_mutex), 316 "device_cgroup::may_access() called without proper synchronization"); 317 318 list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) { 319 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) 320 continue; 321 if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR)) 322 continue; 323 if (ex->major != ~0 && ex->major != refex->major) 324 continue; 325 if (ex->minor != ~0 && ex->minor != refex->minor) 326 continue; 327 if (refex->access & (~ex->access)) 328 continue; 329 match = true; 330 break; 331 } 332 333 /* 334 * In two cases we'll consider this new exception valid: 335 * - the dev cgroup has its default policy to deny + exception list: 336 * the new exception *should* match the exceptions 337 * - the dev cgroup has its default policy to allow + exception list: 338 * the new exception should *not* match any of the exceptions 339 */ 340 if (dev_cgroup->behavior == DEVCG_DEFAULT_DENY) { 341 if (match) 342 return true; 343 } else { 344 if (!match) 345 return true; 346 } 347 return false; 348 } 349 350 /* 351 * parent_has_perm: 352 * when adding a new allow rule to a device exception list, the rule 353 * must be allowed in the parent device 354 */ 355 static int parent_has_perm(struct dev_cgroup *childcg, 356 struct dev_exception_item *ex) 357 { 358 struct cgroup *pcg = childcg->css.cgroup->parent; 359 struct dev_cgroup *parent; 360 361 if (!pcg) 362 return 1; 363 parent = cgroup_to_devcgroup(pcg); 364 return may_access(parent, ex); 365 } 366 367 /** 368 * may_allow_all - checks if it's possible to change the behavior to 369 * allow based on parent's rules. 370 * @parent: device cgroup's parent 371 * returns: != 0 in case it's allowed, 0 otherwise 372 */ 373 static inline int may_allow_all(struct dev_cgroup *parent) 374 { 375 if (!parent) 376 return 1; 377 return parent->behavior == DEVCG_DEFAULT_ALLOW; 378 } 379 380 /* 381 * Modify the exception list using allow/deny rules. 382 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 383 * so we can give a container CAP_MKNOD to let it create devices but not 384 * modify the exception list. 385 * It seems likely we'll want to add a CAP_CONTAINER capability to allow 386 * us to also grant CAP_SYS_ADMIN to containers without giving away the 387 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN 388 * 389 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting 390 * new access is only allowed if you're in the top-level cgroup, or your 391 * parent cgroup has the access you're asking for. 392 */ 393 static int devcgroup_update_access(struct dev_cgroup *devcgroup, 394 int filetype, const char *buffer) 395 { 396 const char *b; 397 char temp[12]; /* 11 + 1 characters needed for a u32 */ 398 int count, rc; 399 struct dev_exception_item ex; 400 struct cgroup *p = devcgroup->css.cgroup; 401 struct dev_cgroup *parent = NULL; 402 403 if (!capable(CAP_SYS_ADMIN)) 404 return -EPERM; 405 406 if (p->parent) 407 parent = cgroup_to_devcgroup(p->parent); 408 409 memset(&ex, 0, sizeof(ex)); 410 b = buffer; 411 412 switch (*b) { 413 case 'a': 414 switch (filetype) { 415 case DEVCG_ALLOW: 416 if (!may_allow_all(parent)) 417 return -EPERM; 418 dev_exception_clean(devcgroup); 419 devcgroup->behavior = DEVCG_DEFAULT_ALLOW; 420 if (!parent) 421 break; 422 423 rc = dev_exceptions_copy(&devcgroup->exceptions, 424 &parent->exceptions); 425 if (rc) 426 return rc; 427 break; 428 case DEVCG_DENY: 429 dev_exception_clean(devcgroup); 430 devcgroup->behavior = DEVCG_DEFAULT_DENY; 431 break; 432 default: 433 return -EINVAL; 434 } 435 return 0; 436 case 'b': 437 ex.type = DEV_BLOCK; 438 break; 439 case 'c': 440 ex.type = DEV_CHAR; 441 break; 442 default: 443 return -EINVAL; 444 } 445 b++; 446 if (!isspace(*b)) 447 return -EINVAL; 448 b++; 449 if (*b == '*') { 450 ex.major = ~0; 451 b++; 452 } else if (isdigit(*b)) { 453 memset(temp, 0, sizeof(temp)); 454 for (count = 0; count < sizeof(temp) - 1; count++) { 455 temp[count] = *b; 456 b++; 457 if (!isdigit(*b)) 458 break; 459 } 460 rc = kstrtou32(temp, 10, &ex.major); 461 if (rc) 462 return -EINVAL; 463 } else { 464 return -EINVAL; 465 } 466 if (*b != ':') 467 return -EINVAL; 468 b++; 469 470 /* read minor */ 471 if (*b == '*') { 472 ex.minor = ~0; 473 b++; 474 } else if (isdigit(*b)) { 475 memset(temp, 0, sizeof(temp)); 476 for (count = 0; count < sizeof(temp) - 1; count++) { 477 temp[count] = *b; 478 b++; 479 if (!isdigit(*b)) 480 break; 481 } 482 rc = kstrtou32(temp, 10, &ex.minor); 483 if (rc) 484 return -EINVAL; 485 } else { 486 return -EINVAL; 487 } 488 if (!isspace(*b)) 489 return -EINVAL; 490 for (b++, count = 0; count < 3; count++, b++) { 491 switch (*b) { 492 case 'r': 493 ex.access |= ACC_READ; 494 break; 495 case 'w': 496 ex.access |= ACC_WRITE; 497 break; 498 case 'm': 499 ex.access |= ACC_MKNOD; 500 break; 501 case '\n': 502 case '\0': 503 count = 3; 504 break; 505 default: 506 return -EINVAL; 507 } 508 } 509 510 switch (filetype) { 511 case DEVCG_ALLOW: 512 if (!parent_has_perm(devcgroup, &ex)) 513 return -EPERM; 514 /* 515 * If the default policy is to allow by default, try to remove 516 * an matching exception instead. And be silent about it: we 517 * don't want to break compatibility 518 */ 519 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 520 dev_exception_rm(devcgroup, &ex); 521 return 0; 522 } 523 return dev_exception_add(devcgroup, &ex); 524 case DEVCG_DENY: 525 /* 526 * If the default policy is to deny by default, try to remove 527 * an matching exception instead. And be silent about it: we 528 * don't want to break compatibility 529 */ 530 if (devcgroup->behavior == DEVCG_DEFAULT_DENY) { 531 dev_exception_rm(devcgroup, &ex); 532 return 0; 533 } 534 return dev_exception_add(devcgroup, &ex); 535 default: 536 return -EINVAL; 537 } 538 return 0; 539 } 540 541 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 542 const char *buffer) 543 { 544 int retval; 545 546 mutex_lock(&devcgroup_mutex); 547 retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), 548 cft->private, buffer); 549 mutex_unlock(&devcgroup_mutex); 550 return retval; 551 } 552 553 static struct cftype dev_cgroup_files[] = { 554 { 555 .name = "allow", 556 .write_string = devcgroup_access_write, 557 .private = DEVCG_ALLOW, 558 }, 559 { 560 .name = "deny", 561 .write_string = devcgroup_access_write, 562 .private = DEVCG_DENY, 563 }, 564 { 565 .name = "list", 566 .read_seq_string = devcgroup_seq_read, 567 .private = DEVCG_LIST, 568 }, 569 { } /* terminate */ 570 }; 571 572 struct cgroup_subsys devices_subsys = { 573 .name = "devices", 574 .can_attach = devcgroup_can_attach, 575 .css_alloc = devcgroup_css_alloc, 576 .css_free = devcgroup_css_free, 577 .subsys_id = devices_subsys_id, 578 .base_cftypes = dev_cgroup_files, 579 580 /* 581 * While devices cgroup has the rudimentary hierarchy support which 582 * checks the parent's restriction, it doesn't properly propagates 583 * config changes in ancestors to their descendents. A child 584 * should only be allowed to add more restrictions to the parent's 585 * configuration. Fix it and remove the following. 586 */ 587 .broken_hierarchy = true, 588 }; 589 590 /** 591 * __devcgroup_check_permission - checks if an inode operation is permitted 592 * @dev_cgroup: the dev cgroup to be tested against 593 * @type: device type 594 * @major: device major number 595 * @minor: device minor number 596 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD 597 * 598 * returns 0 on success, -EPERM case the operation is not permitted 599 */ 600 static int __devcgroup_check_permission(short type, u32 major, u32 minor, 601 short access) 602 { 603 struct dev_cgroup *dev_cgroup; 604 struct dev_exception_item ex; 605 int rc; 606 607 memset(&ex, 0, sizeof(ex)); 608 ex.type = type; 609 ex.major = major; 610 ex.minor = minor; 611 ex.access = access; 612 613 rcu_read_lock(); 614 dev_cgroup = task_devcgroup(current); 615 rc = may_access(dev_cgroup, &ex); 616 rcu_read_unlock(); 617 618 if (!rc) 619 return -EPERM; 620 621 return 0; 622 } 623 624 int __devcgroup_inode_permission(struct inode *inode, int mask) 625 { 626 short type, access = 0; 627 628 if (S_ISBLK(inode->i_mode)) 629 type = DEV_BLOCK; 630 if (S_ISCHR(inode->i_mode)) 631 type = DEV_CHAR; 632 if (mask & MAY_WRITE) 633 access |= ACC_WRITE; 634 if (mask & MAY_READ) 635 access |= ACC_READ; 636 637 return __devcgroup_check_permission(type, imajor(inode), iminor(inode), 638 access); 639 } 640 641 int devcgroup_inode_mknod(int mode, dev_t dev) 642 { 643 short type; 644 645 if (!S_ISBLK(mode) && !S_ISCHR(mode)) 646 return 0; 647 648 if (S_ISBLK(mode)) 649 type = DEV_BLOCK; 650 else 651 type = DEV_CHAR; 652 653 return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), 654 ACC_MKNOD); 655 656 } 657