1 /* 2 * device_cgroup.c - device cgroup subsystem 3 * 4 * Copyright 2007 IBM Corp 5 */ 6 7 #include <linux/device_cgroup.h> 8 #include <linux/cgroup.h> 9 #include <linux/ctype.h> 10 #include <linux/list.h> 11 #include <linux/uaccess.h> 12 #include <linux/seq_file.h> 13 #include <linux/slab.h> 14 #include <linux/rcupdate.h> 15 #include <linux/mutex.h> 16 17 #define ACC_MKNOD 1 18 #define ACC_READ 2 19 #define ACC_WRITE 4 20 #define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) 21 22 #define DEV_BLOCK 1 23 #define DEV_CHAR 2 24 #define DEV_ALL 4 /* this represents all devices */ 25 26 static DEFINE_MUTEX(devcgroup_mutex); 27 28 /* 29 * exception list locking rules: 30 * hold devcgroup_mutex for update/read. 31 * hold rcu_read_lock() for read. 32 */ 33 34 struct dev_exception_item { 35 u32 major, minor; 36 short type; 37 short access; 38 struct list_head list; 39 struct rcu_head rcu; 40 }; 41 42 struct dev_cgroup { 43 struct cgroup_subsys_state css; 44 struct list_head exceptions; 45 enum { 46 DEVCG_DEFAULT_ALLOW, 47 DEVCG_DEFAULT_DENY, 48 } behavior; 49 }; 50 51 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 52 { 53 return container_of(s, struct dev_cgroup, css); 54 } 55 56 static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) 57 { 58 return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); 59 } 60 61 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 62 { 63 return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); 64 } 65 66 struct cgroup_subsys devices_subsys; 67 68 static int devcgroup_can_attach(struct cgroup *new_cgrp, 69 struct cgroup_taskset *set) 70 { 71 struct task_struct *task = cgroup_taskset_first(set); 72 73 if (current != task && !capable(CAP_SYS_ADMIN)) 74 return -EPERM; 75 return 0; 76 } 77 78 /* 79 * called under devcgroup_mutex 80 */ 81 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) 82 { 83 struct dev_exception_item *ex, *tmp, *new; 84 85 lockdep_assert_held(&devcgroup_mutex); 86 87 list_for_each_entry(ex, orig, list) { 88 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 89 if (!new) 90 goto free_and_exit; 91 list_add_tail(&new->list, dest); 92 } 93 94 return 0; 95 96 free_and_exit: 97 list_for_each_entry_safe(ex, tmp, dest, list) { 98 list_del(&ex->list); 99 kfree(ex); 100 } 101 return -ENOMEM; 102 } 103 104 /* 105 * called under devcgroup_mutex 106 */ 107 static int dev_exception_add(struct dev_cgroup *dev_cgroup, 108 struct dev_exception_item *ex) 109 { 110 struct dev_exception_item *excopy, *walk; 111 112 lockdep_assert_held(&devcgroup_mutex); 113 114 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 115 if (!excopy) 116 return -ENOMEM; 117 118 list_for_each_entry(walk, &dev_cgroup->exceptions, list) { 119 if (walk->type != ex->type) 120 continue; 121 if (walk->major != ex->major) 122 continue; 123 if (walk->minor != ex->minor) 124 continue; 125 126 walk->access |= ex->access; 127 kfree(excopy); 128 excopy = NULL; 129 } 130 131 if (excopy != NULL) 132 list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); 133 return 0; 134 } 135 136 /* 137 * called under devcgroup_mutex 138 */ 139 static void dev_exception_rm(struct dev_cgroup *dev_cgroup, 140 struct dev_exception_item *ex) 141 { 142 struct dev_exception_item *walk, *tmp; 143 144 lockdep_assert_held(&devcgroup_mutex); 145 146 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { 147 if (walk->type != ex->type) 148 continue; 149 if (walk->major != ex->major) 150 continue; 151 if (walk->minor != ex->minor) 152 continue; 153 154 walk->access &= ~ex->access; 155 if (!walk->access) { 156 list_del_rcu(&walk->list); 157 kfree_rcu(walk, rcu); 158 } 159 } 160 } 161 162 /** 163 * dev_exception_clean - frees all entries of the exception list 164 * @dev_cgroup: dev_cgroup with the exception list to be cleaned 165 * 166 * called under devcgroup_mutex 167 */ 168 static void dev_exception_clean(struct dev_cgroup *dev_cgroup) 169 { 170 struct dev_exception_item *ex, *tmp; 171 172 lockdep_assert_held(&devcgroup_mutex); 173 174 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { 175 list_del_rcu(&ex->list); 176 kfree_rcu(ex, rcu); 177 } 178 } 179 180 /* 181 * called from kernel/cgroup.c with cgroup_lock() held. 182 */ 183 static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) 184 { 185 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; 186 struct cgroup *parent_cgroup; 187 int ret; 188 189 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); 190 if (!dev_cgroup) 191 return ERR_PTR(-ENOMEM); 192 INIT_LIST_HEAD(&dev_cgroup->exceptions); 193 parent_cgroup = cgroup->parent; 194 195 if (parent_cgroup == NULL) 196 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; 197 else { 198 parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup); 199 mutex_lock(&devcgroup_mutex); 200 ret = dev_exceptions_copy(&dev_cgroup->exceptions, 201 &parent_dev_cgroup->exceptions); 202 dev_cgroup->behavior = parent_dev_cgroup->behavior; 203 mutex_unlock(&devcgroup_mutex); 204 if (ret) { 205 kfree(dev_cgroup); 206 return ERR_PTR(ret); 207 } 208 } 209 210 return &dev_cgroup->css; 211 } 212 213 static void devcgroup_css_free(struct cgroup *cgroup) 214 { 215 struct dev_cgroup *dev_cgroup; 216 217 dev_cgroup = cgroup_to_devcgroup(cgroup); 218 dev_exception_clean(dev_cgroup); 219 kfree(dev_cgroup); 220 } 221 222 #define DEVCG_ALLOW 1 223 #define DEVCG_DENY 2 224 #define DEVCG_LIST 3 225 226 #define MAJMINLEN 13 227 #define ACCLEN 4 228 229 static void set_access(char *acc, short access) 230 { 231 int idx = 0; 232 memset(acc, 0, ACCLEN); 233 if (access & ACC_READ) 234 acc[idx++] = 'r'; 235 if (access & ACC_WRITE) 236 acc[idx++] = 'w'; 237 if (access & ACC_MKNOD) 238 acc[idx++] = 'm'; 239 } 240 241 static char type_to_char(short type) 242 { 243 if (type == DEV_ALL) 244 return 'a'; 245 if (type == DEV_CHAR) 246 return 'c'; 247 if (type == DEV_BLOCK) 248 return 'b'; 249 return 'X'; 250 } 251 252 static void set_majmin(char *str, unsigned m) 253 { 254 if (m == ~0) 255 strcpy(str, "*"); 256 else 257 sprintf(str, "%u", m); 258 } 259 260 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, 261 struct seq_file *m) 262 { 263 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); 264 struct dev_exception_item *ex; 265 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 266 267 rcu_read_lock(); 268 /* 269 * To preserve the compatibility: 270 * - Only show the "all devices" when the default policy is to allow 271 * - List the exceptions in case the default policy is to deny 272 * This way, the file remains as a "whitelist of devices" 273 */ 274 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 275 set_access(acc, ACC_MASK); 276 set_majmin(maj, ~0); 277 set_majmin(min, ~0); 278 seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL), 279 maj, min, acc); 280 } else { 281 list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { 282 set_access(acc, ex->access); 283 set_majmin(maj, ex->major); 284 set_majmin(min, ex->minor); 285 seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), 286 maj, min, acc); 287 } 288 } 289 rcu_read_unlock(); 290 291 return 0; 292 } 293 294 /** 295 * may_access - verifies if a new exception is part of what is allowed 296 * by a dev cgroup based on the default policy + 297 * exceptions. This is used to make sure a child cgroup 298 * won't have more privileges than its parent or to 299 * verify if a certain access is allowed. 300 * @dev_cgroup: dev cgroup to be tested against 301 * @refex: new exception 302 */ 303 static int may_access(struct dev_cgroup *dev_cgroup, 304 struct dev_exception_item *refex) 305 { 306 struct dev_exception_item *ex; 307 bool match = false; 308 309 rcu_lockdep_assert(rcu_read_lock_held() || 310 lockdep_is_held(&devcgroup_mutex), 311 "device_cgroup::may_access() called without proper synchronization"); 312 313 list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) { 314 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) 315 continue; 316 if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR)) 317 continue; 318 if (ex->major != ~0 && ex->major != refex->major) 319 continue; 320 if (ex->minor != ~0 && ex->minor != refex->minor) 321 continue; 322 if (refex->access & (~ex->access)) 323 continue; 324 match = true; 325 break; 326 } 327 328 /* 329 * In two cases we'll consider this new exception valid: 330 * - the dev cgroup has its default policy to allow + exception list: 331 * the new exception should *not* match any of the exceptions 332 * (behavior == DEVCG_DEFAULT_ALLOW, !match) 333 * - the dev cgroup has its default policy to deny + exception list: 334 * the new exception *should* match the exceptions 335 * (behavior == DEVCG_DEFAULT_DENY, match) 336 */ 337 if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match) 338 return 1; 339 return 0; 340 } 341 342 /* 343 * parent_has_perm: 344 * when adding a new allow rule to a device exception list, the rule 345 * must be allowed in the parent device 346 */ 347 static int parent_has_perm(struct dev_cgroup *childcg, 348 struct dev_exception_item *ex) 349 { 350 struct cgroup *pcg = childcg->css.cgroup->parent; 351 struct dev_cgroup *parent; 352 353 if (!pcg) 354 return 1; 355 parent = cgroup_to_devcgroup(pcg); 356 return may_access(parent, ex); 357 } 358 359 /** 360 * may_allow_all - checks if it's possible to change the behavior to 361 * allow based on parent's rules. 362 * @parent: device cgroup's parent 363 * returns: != 0 in case it's allowed, 0 otherwise 364 */ 365 static inline int may_allow_all(struct dev_cgroup *parent) 366 { 367 if (!parent) 368 return 1; 369 return parent->behavior == DEVCG_DEFAULT_ALLOW; 370 } 371 372 /* 373 * Modify the exception list using allow/deny rules. 374 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 375 * so we can give a container CAP_MKNOD to let it create devices but not 376 * modify the exception list. 377 * It seems likely we'll want to add a CAP_CONTAINER capability to allow 378 * us to also grant CAP_SYS_ADMIN to containers without giving away the 379 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN 380 * 381 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting 382 * new access is only allowed if you're in the top-level cgroup, or your 383 * parent cgroup has the access you're asking for. 384 */ 385 static int devcgroup_update_access(struct dev_cgroup *devcgroup, 386 int filetype, const char *buffer) 387 { 388 const char *b; 389 char temp[12]; /* 11 + 1 characters needed for a u32 */ 390 int count, rc; 391 struct dev_exception_item ex; 392 struct cgroup *p = devcgroup->css.cgroup; 393 struct dev_cgroup *parent = NULL; 394 395 if (!capable(CAP_SYS_ADMIN)) 396 return -EPERM; 397 398 if (p->parent) 399 parent = cgroup_to_devcgroup(p->parent); 400 401 memset(&ex, 0, sizeof(ex)); 402 b = buffer; 403 404 switch (*b) { 405 case 'a': 406 switch (filetype) { 407 case DEVCG_ALLOW: 408 if (!may_allow_all(parent)) 409 return -EPERM; 410 dev_exception_clean(devcgroup); 411 devcgroup->behavior = DEVCG_DEFAULT_ALLOW; 412 if (!parent) 413 break; 414 415 rc = dev_exceptions_copy(&devcgroup->exceptions, 416 &parent->exceptions); 417 if (rc) 418 return rc; 419 break; 420 case DEVCG_DENY: 421 dev_exception_clean(devcgroup); 422 devcgroup->behavior = DEVCG_DEFAULT_DENY; 423 break; 424 default: 425 return -EINVAL; 426 } 427 return 0; 428 case 'b': 429 ex.type = DEV_BLOCK; 430 break; 431 case 'c': 432 ex.type = DEV_CHAR; 433 break; 434 default: 435 return -EINVAL; 436 } 437 b++; 438 if (!isspace(*b)) 439 return -EINVAL; 440 b++; 441 if (*b == '*') { 442 ex.major = ~0; 443 b++; 444 } else if (isdigit(*b)) { 445 memset(temp, 0, sizeof(temp)); 446 for (count = 0; count < sizeof(temp) - 1; count++) { 447 temp[count] = *b; 448 b++; 449 if (!isdigit(*b)) 450 break; 451 } 452 rc = kstrtou32(temp, 10, &ex.major); 453 if (rc) 454 return -EINVAL; 455 } else { 456 return -EINVAL; 457 } 458 if (*b != ':') 459 return -EINVAL; 460 b++; 461 462 /* read minor */ 463 if (*b == '*') { 464 ex.minor = ~0; 465 b++; 466 } else if (isdigit(*b)) { 467 memset(temp, 0, sizeof(temp)); 468 for (count = 0; count < sizeof(temp) - 1; count++) { 469 temp[count] = *b; 470 b++; 471 if (!isdigit(*b)) 472 break; 473 } 474 rc = kstrtou32(temp, 10, &ex.minor); 475 if (rc) 476 return -EINVAL; 477 } else { 478 return -EINVAL; 479 } 480 if (!isspace(*b)) 481 return -EINVAL; 482 for (b++, count = 0; count < 3; count++, b++) { 483 switch (*b) { 484 case 'r': 485 ex.access |= ACC_READ; 486 break; 487 case 'w': 488 ex.access |= ACC_WRITE; 489 break; 490 case 'm': 491 ex.access |= ACC_MKNOD; 492 break; 493 case '\n': 494 case '\0': 495 count = 3; 496 break; 497 default: 498 return -EINVAL; 499 } 500 } 501 502 switch (filetype) { 503 case DEVCG_ALLOW: 504 if (!parent_has_perm(devcgroup, &ex)) 505 return -EPERM; 506 /* 507 * If the default policy is to allow by default, try to remove 508 * an matching exception instead. And be silent about it: we 509 * don't want to break compatibility 510 */ 511 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 512 dev_exception_rm(devcgroup, &ex); 513 return 0; 514 } 515 return dev_exception_add(devcgroup, &ex); 516 case DEVCG_DENY: 517 /* 518 * If the default policy is to deny by default, try to remove 519 * an matching exception instead. And be silent about it: we 520 * don't want to break compatibility 521 */ 522 if (devcgroup->behavior == DEVCG_DEFAULT_DENY) { 523 dev_exception_rm(devcgroup, &ex); 524 return 0; 525 } 526 return dev_exception_add(devcgroup, &ex); 527 default: 528 return -EINVAL; 529 } 530 return 0; 531 } 532 533 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 534 const char *buffer) 535 { 536 int retval; 537 538 mutex_lock(&devcgroup_mutex); 539 retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), 540 cft->private, buffer); 541 mutex_unlock(&devcgroup_mutex); 542 return retval; 543 } 544 545 static struct cftype dev_cgroup_files[] = { 546 { 547 .name = "allow", 548 .write_string = devcgroup_access_write, 549 .private = DEVCG_ALLOW, 550 }, 551 { 552 .name = "deny", 553 .write_string = devcgroup_access_write, 554 .private = DEVCG_DENY, 555 }, 556 { 557 .name = "list", 558 .read_seq_string = devcgroup_seq_read, 559 .private = DEVCG_LIST, 560 }, 561 { } /* terminate */ 562 }; 563 564 struct cgroup_subsys devices_subsys = { 565 .name = "devices", 566 .can_attach = devcgroup_can_attach, 567 .css_alloc = devcgroup_css_alloc, 568 .css_free = devcgroup_css_free, 569 .subsys_id = devices_subsys_id, 570 .base_cftypes = dev_cgroup_files, 571 572 /* 573 * While devices cgroup has the rudimentary hierarchy support which 574 * checks the parent's restriction, it doesn't properly propagates 575 * config changes in ancestors to their descendents. A child 576 * should only be allowed to add more restrictions to the parent's 577 * configuration. Fix it and remove the following. 578 */ 579 .broken_hierarchy = true, 580 }; 581 582 /** 583 * __devcgroup_check_permission - checks if an inode operation is permitted 584 * @dev_cgroup: the dev cgroup to be tested against 585 * @type: device type 586 * @major: device major number 587 * @minor: device minor number 588 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD 589 * 590 * returns 0 on success, -EPERM case the operation is not permitted 591 */ 592 static int __devcgroup_check_permission(short type, u32 major, u32 minor, 593 short access) 594 { 595 struct dev_cgroup *dev_cgroup; 596 struct dev_exception_item ex; 597 int rc; 598 599 memset(&ex, 0, sizeof(ex)); 600 ex.type = type; 601 ex.major = major; 602 ex.minor = minor; 603 ex.access = access; 604 605 rcu_read_lock(); 606 dev_cgroup = task_devcgroup(current); 607 rc = may_access(dev_cgroup, &ex); 608 rcu_read_unlock(); 609 610 if (!rc) 611 return -EPERM; 612 613 return 0; 614 } 615 616 int __devcgroup_inode_permission(struct inode *inode, int mask) 617 { 618 short type, access = 0; 619 620 if (S_ISBLK(inode->i_mode)) 621 type = DEV_BLOCK; 622 if (S_ISCHR(inode->i_mode)) 623 type = DEV_CHAR; 624 if (mask & MAY_WRITE) 625 access |= ACC_WRITE; 626 if (mask & MAY_READ) 627 access |= ACC_READ; 628 629 return __devcgroup_check_permission(type, imajor(inode), iminor(inode), 630 access); 631 } 632 633 int devcgroup_inode_mknod(int mode, dev_t dev) 634 { 635 short type; 636 637 if (!S_ISBLK(mode) && !S_ISCHR(mode)) 638 return 0; 639 640 if (S_ISBLK(mode)) 641 type = DEV_BLOCK; 642 else 643 type = DEV_CHAR; 644 645 return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), 646 ACC_MKNOD); 647 648 } 649