1 /* 2 * This file is part of UBIFS. 3 * 4 * Copyright (C) 2006-2008 Nokia Corporation. 5 * 6 * This program is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 as published by 8 * the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 * more details. 14 * 15 * You should have received a copy of the GNU General Public License along with 16 * this program; if not, write to the Free Software Foundation, Inc., 51 17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * 19 * Authors: Artem Bityutskiy (Битюцкий Артём) 20 * Adrian Hunter 21 */ 22 23 /* 24 * This file implements VFS file and inode operations of regular files, device 25 * nodes and symlinks as well as address space operations. 26 * 27 * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the 28 * page is dirty and is used for budgeting purposes - dirty pages should not be 29 * budgeted. The PG_checked flag is set if full budgeting is required for the 30 * page e.g., when it corresponds to a file hole or it is just beyond the file 31 * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to 32 * fail in this function, and the budget is released in 'ubifs_write_end()'. So 33 * the PG_private and PG_checked flags carry the information about how the page 34 * was budgeted, to make it possible to release the budget properly. 35 * 36 * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations 37 * we implement. However, this is not true for '->writepage()', which might be 38 * called with 'i_mutex' unlocked. For example, when pdflush is performing 39 * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the 40 * inode has 'I_LOCK' flag in this case. 
At "normal" work-paths 'i_mutex' is 41 * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim 42 * path'. So, in '->writepage()' we are only guaranteed that the page is 43 * locked. 44 * 45 * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., 46 * readahead path does not have it locked ("sys_read -> generic_file_aio_read 47 * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is 48 * not set as well. However, UBIFS disables readahead. 49 * 50 * This, for example means that there might be 2 concurrent '->writepage()' 51 * calls for the same inode, but different inode dirty pages. 52 */ 53 54 #include "ubifs.h" 55 #include <linux/mount.h> 56 #include <linux/namei.h> 57 58 static int read_block(struct inode *inode, void *addr, unsigned int block, 59 struct ubifs_data_node *dn) 60 { 61 struct ubifs_info *c = inode->i_sb->s_fs_info; 62 int err, len, out_len; 63 union ubifs_key key; 64 unsigned int dlen; 65 66 data_key_init(c, &key, inode->i_ino, block); 67 err = ubifs_tnc_lookup(c, &key, dn); 68 if (err) { 69 if (err == -ENOENT) 70 /* Not found, so it must be a hole */ 71 memset(addr, 0, UBIFS_BLOCK_SIZE); 72 return err; 73 } 74 75 ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum); 76 77 len = le32_to_cpu(dn->size); 78 if (len <= 0 || len > UBIFS_BLOCK_SIZE) 79 goto dump; 80 81 dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; 82 out_len = UBIFS_BLOCK_SIZE; 83 err = ubifs_decompress(&dn->data, dlen, addr, &out_len, 84 le16_to_cpu(dn->compr_type)); 85 if (err || len != out_len) 86 goto dump; 87 88 /* 89 * Data length can be less than a full block, even for blocks that are 90 * not the last in the file (e.g., as a result of making a hole and 91 * appending data). Ensure that the remainder is zeroed out. 
92 */ 93 if (len < UBIFS_BLOCK_SIZE) 94 memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); 95 96 return 0; 97 98 dump: 99 ubifs_err("bad data node (block %u, inode %lu)", 100 block, inode->i_ino); 101 dbg_dump_node(c, dn); 102 return -EINVAL; 103 } 104 105 static int do_readpage(struct page *page) 106 { 107 void *addr; 108 int err = 0, i; 109 unsigned int block, beyond; 110 struct ubifs_data_node *dn; 111 struct inode *inode = page->mapping->host; 112 loff_t i_size = i_size_read(inode); 113 114 dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", 115 inode->i_ino, page->index, i_size, page->flags); 116 ubifs_assert(!PageChecked(page)); 117 ubifs_assert(!PagePrivate(page)); 118 119 addr = kmap(page); 120 121 block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; 122 beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; 123 if (block >= beyond) { 124 /* Reading beyond inode */ 125 SetPageChecked(page); 126 memset(addr, 0, PAGE_CACHE_SIZE); 127 goto out; 128 } 129 130 dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); 131 if (!dn) { 132 err = -ENOMEM; 133 goto error; 134 } 135 136 i = 0; 137 while (1) { 138 int ret; 139 140 if (block >= beyond) { 141 /* Reading beyond inode */ 142 err = -ENOENT; 143 memset(addr, 0, UBIFS_BLOCK_SIZE); 144 } else { 145 ret = read_block(inode, addr, block, dn); 146 if (ret) { 147 err = ret; 148 if (err != -ENOENT) 149 break; 150 } 151 } 152 if (++i >= UBIFS_BLOCKS_PER_PAGE) 153 break; 154 block += 1; 155 addr += UBIFS_BLOCK_SIZE; 156 } 157 if (err) { 158 if (err == -ENOENT) { 159 /* Not found, so it must be a hole */ 160 SetPageChecked(page); 161 dbg_gen("hole"); 162 goto out_free; 163 } 164 ubifs_err("cannot read page %lu of inode %lu, error %d", 165 page->index, inode->i_ino, err); 166 goto error; 167 } 168 169 out_free: 170 kfree(dn); 171 out: 172 SetPageUptodate(page); 173 ClearPageError(page); 174 flush_dcache_page(page); 175 kunmap(page); 176 return 0; 177 178 error: 179 kfree(dn); 180 ClearPageUptodate(page); 181 SetPageError(page); 
182 flush_dcache_page(page); 183 kunmap(page); 184 return err; 185 } 186 187 /** 188 * release_new_page_budget - release budget of a new page. 189 * @c: UBIFS file-system description object 190 * 191 * This is a helper function which releases budget corresponding to the budget 192 * of one new page of data. 193 */ 194 static void release_new_page_budget(struct ubifs_info *c) 195 { 196 struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; 197 198 ubifs_release_budget(c, &req); 199 } 200 201 /** 202 * release_existing_page_budget - release budget of an existing page. 203 * @c: UBIFS file-system description object 204 * 205 * This is a helper function which releases budget corresponding to the budget 206 * of changing one one page of data which already exists on the flash media. 207 */ 208 static void release_existing_page_budget(struct ubifs_info *c) 209 { 210 struct ubifs_budget_req req = { .dd_growth = c->page_budget}; 211 212 ubifs_release_budget(c, &req); 213 } 214 215 static int write_begin_slow(struct address_space *mapping, 216 loff_t pos, unsigned len, struct page **pagep) 217 { 218 struct inode *inode = mapping->host; 219 struct ubifs_info *c = inode->i_sb->s_fs_info; 220 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 221 struct ubifs_budget_req req = { .new_page = 1 }; 222 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 223 struct page *page; 224 225 dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", 226 inode->i_ino, pos, len, inode->i_size); 227 228 /* 229 * At the slow path we have to budget before locking the page, because 230 * budgeting may force write-back, which would wait on locked pages and 231 * deadlock if we had the page locked. At this point we do not know 232 * anything about the page, so assume that this is a new page which is 233 * written to a hole. This corresponds to largest budget. Later the 234 * budget will be amended if this is not true. 
235 */ 236 if (appending) 237 /* We are appending data, budget for inode change */ 238 req.dirtied_ino = 1; 239 240 err = ubifs_budget_space(c, &req); 241 if (unlikely(err)) 242 return err; 243 244 page = __grab_cache_page(mapping, index); 245 if (unlikely(!page)) { 246 ubifs_release_budget(c, &req); 247 return -ENOMEM; 248 } 249 250 if (!PageUptodate(page)) { 251 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 252 SetPageChecked(page); 253 else { 254 err = do_readpage(page); 255 if (err) { 256 unlock_page(page); 257 page_cache_release(page); 258 return err; 259 } 260 } 261 262 SetPageUptodate(page); 263 ClearPageError(page); 264 } 265 266 if (PagePrivate(page)) 267 /* 268 * The page is dirty, which means it was budgeted twice: 269 * o first time the budget was allocated by the task which 270 * made the page dirty and set the PG_private flag; 271 * o and then we budgeted for it for the second time at the 272 * very beginning of this function. 273 * 274 * So what we have to do is to release the page budget we 275 * allocated. 276 */ 277 release_new_page_budget(c); 278 else if (!PageChecked(page)) 279 /* 280 * We are changing a page which already exists on the media. 281 * This means that changing the page does not make the amount 282 * of indexing information larger, and this part of the budget 283 * which we have already acquired may be released. 284 */ 285 ubifs_convert_page_budget(c); 286 287 if (appending) { 288 struct ubifs_inode *ui = ubifs_inode(inode); 289 290 /* 291 * 'ubifs_write_end()' is optimized from the fast-path part of 292 * 'ubifs_write_begin()' and expects the @ui_mutex to be locked 293 * if data is appended. 294 */ 295 mutex_lock(&ui->ui_mutex); 296 if (ui->dirty) 297 /* 298 * The inode is dirty already, so we may free the 299 * budget we allocated. 300 */ 301 ubifs_release_dirty_inode_budget(c, ui); 302 } 303 304 *pagep = page; 305 return 0; 306 } 307 308 /** 309 * allocate_budget - allocate budget for 'ubifs_write_begin()'. 
310 * @c: UBIFS file-system description object 311 * @page: page to allocate budget for 312 * @ui: UBIFS inode object the page belongs to 313 * @appending: non-zero if the page is appended 314 * 315 * This is a helper function for 'ubifs_write_begin()' which allocates budget 316 * for the operation. The budget is allocated differently depending on whether 317 * this is appending, whether the page is dirty or not, and so on. This 318 * function leaves the @ui->ui_mutex locked in case of appending. Returns zero 319 * in case of success and %-ENOSPC in case of failure. 320 */ 321 static int allocate_budget(struct ubifs_info *c, struct page *page, 322 struct ubifs_inode *ui, int appending) 323 { 324 struct ubifs_budget_req req = { .fast = 1 }; 325 326 if (PagePrivate(page)) { 327 if (!appending) 328 /* 329 * The page is dirty and we are not appending, which 330 * means no budget is needed at all. 331 */ 332 return 0; 333 334 mutex_lock(&ui->ui_mutex); 335 if (ui->dirty) 336 /* 337 * The page is dirty and we are appending, so the inode 338 * has to be marked as dirty. However, it is already 339 * dirty, so we do not need any budget. We may return, 340 * but @ui->ui_mutex hast to be left locked because we 341 * should prevent write-back from flushing the inode 342 * and freeing the budget. The lock will be released in 343 * 'ubifs_write_end()'. 344 */ 345 return 0; 346 347 /* 348 * The page is dirty, we are appending, the inode is clean, so 349 * we need to budget the inode change. 350 */ 351 req.dirtied_ino = 1; 352 } else { 353 if (PageChecked(page)) 354 /* 355 * The page corresponds to a hole and does not 356 * exist on the media. So changing it makes 357 * make the amount of indexing information 358 * larger, and we have to budget for a new 359 * page. 360 */ 361 req.new_page = 1; 362 else 363 /* 364 * Not a hole, the change will not add any new 365 * indexing information, budget for page 366 * change. 
367 */ 368 req.dirtied_page = 1; 369 370 if (appending) { 371 mutex_lock(&ui->ui_mutex); 372 if (!ui->dirty) 373 /* 374 * The inode is clean but we will have to mark 375 * it as dirty because we are appending. This 376 * needs a budget. 377 */ 378 req.dirtied_ino = 1; 379 } 380 } 381 382 return ubifs_budget_space(c, &req); 383 } 384 385 /* 386 * This function is called when a page of data is going to be written. Since 387 * the page of data will not necessarily go to the flash straight away, UBIFS 388 * has to reserve space on the media for it, which is done by means of 389 * budgeting. 390 * 391 * This is the hot-path of the file-system and we are trying to optimize it as 392 * much as possible. For this reasons it is split on 2 parts - slow and fast. 393 * 394 * There many budgeting cases: 395 * o a new page is appended - we have to budget for a new page and for 396 * changing the inode; however, if the inode is already dirty, there is 397 * no need to budget for it; 398 * o an existing clean page is changed - we have budget for it; if the page 399 * does not exist on the media (a hole), we have to budget for a new 400 * page; otherwise, we may budget for changing an existing page; the 401 * difference between these cases is that changing an existing page does 402 * not introduce anything new to the FS indexing information, so it does 403 * not grow, and smaller budget is acquired in this case; 404 * o an existing dirty page is changed - no need to budget at all, because 405 * the page budget has been acquired by earlier, when the page has been 406 * marked dirty. 407 * 408 * UBIFS budgeting sub-system may force write-back if it thinks there is no 409 * space to reserve. This imposes some locking restrictions and makes it 410 * impossible to take into account the above cases, and makes it impossible to 411 * optimize budgeting. 
412 * 413 * The solution for this is that the fast path of 'ubifs_write_begin()' assumes 414 * there is a plenty of flash space and the budget will be acquired quickly, 415 * without forcing write-back. The slow path does not make this assumption. 416 */ 417 static int ubifs_write_begin(struct file *file, struct address_space *mapping, 418 loff_t pos, unsigned len, unsigned flags, 419 struct page **pagep, void **fsdata) 420 { 421 struct inode *inode = mapping->host; 422 struct ubifs_info *c = inode->i_sb->s_fs_info; 423 struct ubifs_inode *ui = ubifs_inode(inode); 424 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 425 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 426 struct page *page; 427 428 429 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 430 431 if (unlikely(c->ro_media)) 432 return -EROFS; 433 434 /* Try out the fast-path part first */ 435 page = __grab_cache_page(mapping, index); 436 if (unlikely(!page)) 437 return -ENOMEM; 438 439 if (!PageUptodate(page)) { 440 /* The page is not loaded from the flash */ 441 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 442 /* 443 * We change whole page so no need to load it. But we 444 * have to set the @PG_checked flag to make the further 445 * code the page is new. This might be not true, but it 446 * is better to budget more that to read the page from 447 * the media. 448 */ 449 SetPageChecked(page); 450 else { 451 err = do_readpage(page); 452 if (err) { 453 unlock_page(page); 454 page_cache_release(page); 455 return err; 456 } 457 } 458 459 SetPageUptodate(page); 460 ClearPageError(page); 461 } 462 463 err = allocate_budget(c, page, ui, appending); 464 if (unlikely(err)) { 465 ubifs_assert(err == -ENOSPC); 466 /* 467 * Budgeting failed which means it would have to force 468 * write-back but didn't, because we set the @fast flag in the 469 * request. Write-back cannot be done now, while we have the 470 * page locked, because it would deadlock. 
Unlock and free 471 * everything and fall-back to slow-path. 472 */ 473 if (appending) { 474 ubifs_assert(mutex_is_locked(&ui->ui_mutex)); 475 mutex_unlock(&ui->ui_mutex); 476 } 477 unlock_page(page); 478 page_cache_release(page); 479 480 return write_begin_slow(mapping, pos, len, pagep); 481 } 482 483 /* 484 * Whee, we aquired budgeting quickly - without involving 485 * garbage-collection, committing or forceing write-back. We return 486 * with @ui->ui_mutex locked if we are appending pages, and unlocked 487 * otherwise. This is an optimization (slightly hacky though). 488 */ 489 *pagep = page; 490 return 0; 491 492 } 493 494 /** 495 * cancel_budget - cancel budget. 496 * @c: UBIFS file-system description object 497 * @page: page to cancel budget for 498 * @ui: UBIFS inode object the page belongs to 499 * @appending: non-zero if the page is appended 500 * 501 * This is a helper function for a page write operation. It unlocks the 502 * @ui->ui_mutex in case of appending. 503 */ 504 static void cancel_budget(struct ubifs_info *c, struct page *page, 505 struct ubifs_inode *ui, int appending) 506 { 507 if (appending) { 508 if (!ui->dirty) 509 ubifs_release_dirty_inode_budget(c, ui); 510 mutex_unlock(&ui->ui_mutex); 511 } 512 if (!PagePrivate(page)) { 513 if (PageChecked(page)) 514 release_new_page_budget(c); 515 else 516 release_existing_page_budget(c); 517 } 518 } 519 520 static int ubifs_write_end(struct file *file, struct address_space *mapping, 521 loff_t pos, unsigned len, unsigned copied, 522 struct page *page, void *fsdata) 523 { 524 struct inode *inode = mapping->host; 525 struct ubifs_inode *ui = ubifs_inode(inode); 526 struct ubifs_info *c = inode->i_sb->s_fs_info; 527 loff_t end_pos = pos + len; 528 int appending = !!(end_pos > inode->i_size); 529 530 dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", 531 inode->i_ino, pos, page->index, len, copied, inode->i_size); 532 533 if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { 534 /* 535 
* VFS copied less data to the page that it intended and 536 * declared in its '->write_begin()' call via the @len 537 * argument. If the page was not up-to-date, and @len was 538 * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did 539 * not load it from the media (for optimization reasons). This 540 * means that part of the page contains garbage. So read the 541 * page now. 542 */ 543 dbg_gen("copied %d instead of %d, read page and repeat", 544 copied, len); 545 cancel_budget(c, page, ui, appending); 546 547 /* 548 * Return 0 to force VFS to repeat the whole operation, or the 549 * error code if 'do_readpage()' failes. 550 */ 551 copied = do_readpage(page); 552 goto out; 553 } 554 555 if (!PagePrivate(page)) { 556 SetPagePrivate(page); 557 atomic_long_inc(&c->dirty_pg_cnt); 558 __set_page_dirty_nobuffers(page); 559 } 560 561 if (appending) { 562 i_size_write(inode, end_pos); 563 ui->ui_size = end_pos; 564 /* 565 * Note, we do not set @I_DIRTY_PAGES (which means that the 566 * inode has dirty pages), this has been done in 567 * '__set_page_dirty_nobuffers()'. 
568 */ 569 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 570 ubifs_assert(mutex_is_locked(&ui->ui_mutex)); 571 mutex_unlock(&ui->ui_mutex); 572 } 573 574 out: 575 unlock_page(page); 576 page_cache_release(page); 577 return copied; 578 } 579 580 static int ubifs_readpage(struct file *file, struct page *page) 581 { 582 do_readpage(page); 583 unlock_page(page); 584 return 0; 585 } 586 587 static int do_writepage(struct page *page, int len) 588 { 589 int err = 0, i, blen; 590 unsigned int block; 591 void *addr; 592 union ubifs_key key; 593 struct inode *inode = page->mapping->host; 594 struct ubifs_info *c = inode->i_sb->s_fs_info; 595 596 #ifdef UBIFS_DEBUG 597 spin_lock(&ui->ui_lock); 598 ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); 599 spin_unlock(&ui->ui_lock); 600 #endif 601 602 /* Update radix tree tags */ 603 set_page_writeback(page); 604 605 addr = kmap(page); 606 block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; 607 i = 0; 608 while (len) { 609 blen = min_t(int, len, UBIFS_BLOCK_SIZE); 610 data_key_init(c, &key, inode->i_ino, block); 611 err = ubifs_jnl_write_data(c, inode, &key, addr, blen); 612 if (err) 613 break; 614 if (++i >= UBIFS_BLOCKS_PER_PAGE) 615 break; 616 block += 1; 617 addr += blen; 618 len -= blen; 619 } 620 if (err) { 621 SetPageError(page); 622 ubifs_err("cannot write page %lu of inode %lu, error %d", 623 page->index, inode->i_ino, err); 624 ubifs_ro_mode(c, err); 625 } 626 627 ubifs_assert(PagePrivate(page)); 628 if (PageChecked(page)) 629 release_new_page_budget(c); 630 else 631 release_existing_page_budget(c); 632 633 atomic_long_dec(&c->dirty_pg_cnt); 634 ClearPagePrivate(page); 635 ClearPageChecked(page); 636 637 kunmap(page); 638 unlock_page(page); 639 end_page_writeback(page); 640 return err; 641 } 642 643 /* 644 * When writing-back dirty inodes, VFS first writes-back pages belonging to the 645 * inode, then the inode itself. For UBIFS this may cause a problem. 
Consider a
 * situation when we have an inode with size 0, then a megabyte of data is
 * appended to the inode, then write-back starts and flushes some amount of the
 * dirty pages, the journal becomes full, commit happens and finishes, and then
 * an unclean reboot happens. When the file system is mounted next time, the
 * inode size would still be 0, but there would be many pages which are beyond
 * the inode size, they would be indexed and consume flash space. Because the
 * journal has been committed, the replay would not be able to detect this
 * situation and correct the inode size. This means UBIFS would have to scan
 * whole index and correct all inode sizes, which is long and unacceptable.
 *
 * To prevent situations like this, UBIFS writes pages back only if they are
 * within last synchronized inode size, i.e. the size which has been
 * written to the flash media last time. Otherwise, UBIFS forces inode
 * write-back, thus making sure the on-flash inode contains current inode size,
 * and then keeps writing pages back.
 *
 * Some locking issues explanation. 'ubifs_writepage()' first is called with
 * the page locked, and it locks @ui_mutex. However, write-back does not take
 * inode @i_mutex, which means other VFS operations may be run on this inode at
 * the same time. And the problematic one is truncation to smaller size, from
 * where we have to call 'vmtruncate()', which first changes @inode->i_size,
 * then drops the truncated pages. And while dropping the pages, it takes the
 * page lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
 * means that @inode->i_size is changed while @ui_mutex is unlocked.
 *
 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
 * inode size.
How do we do this if @inode->i_size may became smaller while we 674 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 675 * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size 676 * internally and updates it under @ui_mutex. 677 * 678 * Q: why we do not worry that if we race with truncation, we may end up with a 679 * situation when the inode is truncated while we are in the middle of 680 * 'do_writepage()', so we do write beyond inode size? 681 * A: If we are in the middle of 'do_writepage()', truncation would be locked 682 * on the page lock and it would not write the truncated inode node to the 683 * journal before we have finished. 684 */ 685 static int ubifs_writepage(struct page *page, struct writeback_control *wbc) 686 { 687 struct inode *inode = page->mapping->host; 688 struct ubifs_inode *ui = ubifs_inode(inode); 689 loff_t i_size = i_size_read(inode), synced_i_size; 690 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 691 int err, len = i_size & (PAGE_CACHE_SIZE - 1); 692 void *kaddr; 693 694 dbg_gen("ino %lu, pg %lu, pg flags %#lx", 695 inode->i_ino, page->index, page->flags); 696 ubifs_assert(PagePrivate(page)); 697 698 /* Is the page fully outside @i_size? (truncate in progress) */ 699 if (page->index > end_index || (page->index == end_index && !len)) { 700 err = 0; 701 goto out_unlock; 702 } 703 704 spin_lock(&ui->ui_lock); 705 synced_i_size = ui->synced_i_size; 706 spin_unlock(&ui->ui_lock); 707 708 /* Is the page fully inside @i_size? 
*/ 709 if (page->index < end_index) { 710 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { 711 err = inode->i_sb->s_op->write_inode(inode, 1); 712 if (err) 713 goto out_unlock; 714 /* 715 * The inode has been written, but the write-buffer has 716 * not been synchronized, so in case of an unclean 717 * reboot we may end up with some pages beyond inode 718 * size, but they would be in the journal (because 719 * commit flushes write buffers) and recovery would deal 720 * with this. 721 */ 722 } 723 return do_writepage(page, PAGE_CACHE_SIZE); 724 } 725 726 /* 727 * The page straddles @i_size. It must be zeroed out on each and every 728 * writepage invocation because it may be mmapped. "A file is mapped 729 * in multiples of the page size. For a file that is not a multiple of 730 * the page size, the remaining memory is zeroed when mapped, and 731 * writes to that region are not written out to the file." 732 */ 733 kaddr = kmap_atomic(page, KM_USER0); 734 memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); 735 flush_dcache_page(page); 736 kunmap_atomic(kaddr, KM_USER0); 737 738 if (i_size > synced_i_size) { 739 err = inode->i_sb->s_op->write_inode(inode, 1); 740 if (err) 741 goto out_unlock; 742 } 743 744 return do_writepage(page, len); 745 746 out_unlock: 747 unlock_page(page); 748 return err; 749 } 750 751 /** 752 * do_attr_changes - change inode attributes. 
753 * @inode: inode to change attributes for 754 * @attr: describes attributes to change 755 */ 756 static void do_attr_changes(struct inode *inode, const struct iattr *attr) 757 { 758 if (attr->ia_valid & ATTR_UID) 759 inode->i_uid = attr->ia_uid; 760 if (attr->ia_valid & ATTR_GID) 761 inode->i_gid = attr->ia_gid; 762 if (attr->ia_valid & ATTR_ATIME) 763 inode->i_atime = timespec_trunc(attr->ia_atime, 764 inode->i_sb->s_time_gran); 765 if (attr->ia_valid & ATTR_MTIME) 766 inode->i_mtime = timespec_trunc(attr->ia_mtime, 767 inode->i_sb->s_time_gran); 768 if (attr->ia_valid & ATTR_CTIME) 769 inode->i_ctime = timespec_trunc(attr->ia_ctime, 770 inode->i_sb->s_time_gran); 771 if (attr->ia_valid & ATTR_MODE) { 772 umode_t mode = attr->ia_mode; 773 774 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 775 mode &= ~S_ISGID; 776 inode->i_mode = mode; 777 } 778 } 779 780 /** 781 * do_truncation - truncate an inode. 782 * @c: UBIFS file-system description object 783 * @inode: inode to truncate 784 * @attr: inode attribute changes description 785 * 786 * This function implements VFS '->setattr()' call when the inode is truncated 787 * to a smaller size. Returns zero in case of success and a negative error code 788 * in case of failure. 789 */ 790 static int do_truncation(struct ubifs_info *c, struct inode *inode, 791 const struct iattr *attr) 792 { 793 int err; 794 struct ubifs_budget_req req; 795 loff_t old_size = inode->i_size, new_size = attr->ia_size; 796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1); 797 struct ubifs_inode *ui = ubifs_inode(inode); 798 799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); 800 memset(&req, 0, sizeof(struct ubifs_budget_req)); 801 802 /* 803 * If this is truncation to a smaller size, and we do not truncate on a 804 * block boundary, budget for changing one data block, because the last 805 * block will be re-written. 
806 */ 807 if (new_size & (UBIFS_BLOCK_SIZE - 1)) 808 req.dirtied_page = 1; 809 810 req.dirtied_ino = 1; 811 /* A funny way to budget for truncation node */ 812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; 813 err = ubifs_budget_space(c, &req); 814 if (err) 815 return err; 816 817 err = vmtruncate(inode, new_size); 818 if (err) 819 goto out_budg; 820 821 if (offset) { 822 pgoff_t index = new_size >> PAGE_CACHE_SHIFT; 823 struct page *page; 824 825 page = find_lock_page(inode->i_mapping, index); 826 if (page) { 827 if (PageDirty(page)) { 828 /* 829 * 'ubifs_jnl_truncate()' will try to truncate 830 * the last data node, but it contains 831 * out-of-date data because the page is dirty. 832 * Write the page now, so that 833 * 'ubifs_jnl_truncate()' will see an already 834 * truncated (and up to date) data node. 835 */ 836 ubifs_assert(PagePrivate(page)); 837 838 clear_page_dirty_for_io(page); 839 if (UBIFS_BLOCKS_PER_PAGE_SHIFT) 840 offset = new_size & 841 (PAGE_CACHE_SIZE - 1); 842 err = do_writepage(page, offset); 843 page_cache_release(page); 844 if (err) 845 goto out_budg; 846 /* 847 * We could now tell 'ubifs_jnl_truncate()' not 848 * to read the last block. 849 */ 850 } else { 851 /* 852 * We could 'kmap()' the page and pass the data 853 * to 'ubifs_jnl_truncate()' to save it from 854 * having to read it. 855 */ 856 unlock_page(page); 857 page_cache_release(page); 858 } 859 } 860 } 861 862 mutex_lock(&ui->ui_mutex); 863 ui->ui_size = inode->i_size; 864 /* Truncation changes inode [mc]time */ 865 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 866 /* The other attributes may be changed at the same time as well */ 867 do_attr_changes(inode, attr); 868 869 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 870 mutex_unlock(&ui->ui_mutex); 871 out_budg: 872 ubifs_release_budget(c, &req); 873 return err; 874 } 875 876 /** 877 * do_setattr - change inode attributes. 
878 * @c: UBIFS file-system description object 879 * @inode: inode to change attributes for 880 * @attr: inode attribute changes description 881 * 882 * This function implements VFS '->setattr()' call for all cases except 883 * truncations to smaller size. Returns zero in case of success and a negative 884 * error code in case of failure. 885 */ 886 static int do_setattr(struct ubifs_info *c, struct inode *inode, 887 const struct iattr *attr) 888 { 889 int err, release; 890 loff_t new_size = attr->ia_size; 891 struct ubifs_inode *ui = ubifs_inode(inode); 892 struct ubifs_budget_req req = { .dirtied_ino = 1, 893 .dirtied_ino_d = ui->data_len }; 894 895 err = ubifs_budget_space(c, &req); 896 if (err) 897 return err; 898 899 if (attr->ia_valid & ATTR_SIZE) { 900 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 901 err = vmtruncate(inode, new_size); 902 if (err) 903 goto out; 904 } 905 906 mutex_lock(&ui->ui_mutex); 907 if (attr->ia_valid & ATTR_SIZE) { 908 /* Truncation changes inode [mc]time */ 909 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 910 /* 'vmtruncate()' changed @i_size, update @ui_size */ 911 ui->ui_size = inode->i_size; 912 } 913 914 do_attr_changes(inode, attr); 915 916 release = ui->dirty; 917 if (attr->ia_valid & ATTR_SIZE) 918 /* 919 * Inode length changed, so we have to make sure 920 * @I_DIRTY_DATASYNC is set. 
921 */ 922 __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); 923 else 924 mark_inode_dirty_sync(inode); 925 mutex_unlock(&ui->ui_mutex); 926 927 if (release) 928 ubifs_release_budget(c, &req); 929 if (IS_SYNC(inode)) 930 err = inode->i_sb->s_op->write_inode(inode, 1); 931 return err; 932 933 out: 934 ubifs_release_budget(c, &req); 935 return err; 936 } 937 938 int ubifs_setattr(struct dentry *dentry, struct iattr *attr) 939 { 940 int err; 941 struct inode *inode = dentry->d_inode; 942 struct ubifs_info *c = inode->i_sb->s_fs_info; 943 944 dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); 945 err = inode_change_ok(inode, attr); 946 if (err) 947 return err; 948 949 err = dbg_check_synced_i_size(inode); 950 if (err) 951 return err; 952 953 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) 954 /* Truncation to a smaller size */ 955 err = do_truncation(c, inode, attr); 956 else 957 err = do_setattr(c, inode, attr); 958 959 return err; 960 } 961 962 static void ubifs_invalidatepage(struct page *page, unsigned long offset) 963 { 964 struct inode *inode = page->mapping->host; 965 struct ubifs_info *c = inode->i_sb->s_fs_info; 966 967 ubifs_assert(PagePrivate(page)); 968 if (offset) 969 /* Partial page remains dirty */ 970 return; 971 972 if (PageChecked(page)) 973 release_new_page_budget(c); 974 else 975 release_existing_page_budget(c); 976 977 atomic_long_dec(&c->dirty_pg_cnt); 978 ClearPagePrivate(page); 979 ClearPageChecked(page); 980 } 981 982 static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) 983 { 984 struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); 985 986 nd_set_link(nd, ui->data); 987 return NULL; 988 } 989 990 int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 991 { 992 struct inode *inode = dentry->d_inode; 993 struct ubifs_info *c = inode->i_sb->s_fs_info; 994 int err; 995 996 dbg_gen("syncing inode %lu", inode->i_ino); 997 998 /* 999 * VFS has already synchronized 
dirty pages for this inode. Synchronize 1000 * the inode unless this is a 'datasync()' call. 1001 */ 1002 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { 1003 err = inode->i_sb->s_op->write_inode(inode, 1); 1004 if (err) 1005 return err; 1006 } 1007 1008 /* 1009 * Nodes related to this inode may still sit in a write-buffer. Flush 1010 * them. 1011 */ 1012 err = ubifs_sync_wbufs_by_inode(c, inode); 1013 if (err) 1014 return err; 1015 1016 return 0; 1017 } 1018 1019 /** 1020 * mctime_update_needed - check if mtime or ctime update is needed. 1021 * @inode: the inode to do the check for 1022 * @now: current time 1023 * 1024 * This helper function checks if the inode mtime/ctime should be updated or 1025 * not. If current values of the time-stamps are within the UBIFS inode time 1026 * granularity, they are not updated. This is an optimization. 1027 */ 1028 static inline int mctime_update_needed(const struct inode *inode, 1029 const struct timespec *now) 1030 { 1031 if (!timespec_equal(&inode->i_mtime, now) || 1032 !timespec_equal(&inode->i_ctime, now)) 1033 return 1; 1034 return 0; 1035 } 1036 1037 /** 1038 * update_ctime - update mtime and ctime of an inode. 1039 * @c: UBIFS file-system description object 1040 * @inode: inode to update 1041 * 1042 * This function updates mtime and ctime of the inode if it is not equivalent to 1043 * current time. Returns zero in case of success and a negative error code in 1044 * case of failure. 
1045 */ 1046 static int update_mctime(struct ubifs_info *c, struct inode *inode) 1047 { 1048 struct timespec now = ubifs_current_time(inode); 1049 struct ubifs_inode *ui = ubifs_inode(inode); 1050 1051 if (mctime_update_needed(inode, &now)) { 1052 int err, release; 1053 struct ubifs_budget_req req = { .dirtied_ino = 1, 1054 .dirtied_ino_d = ui->data_len }; 1055 1056 err = ubifs_budget_space(c, &req); 1057 if (err) 1058 return err; 1059 1060 mutex_lock(&ui->ui_mutex); 1061 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1062 release = ui->dirty; 1063 mark_inode_dirty_sync(inode); 1064 mutex_unlock(&ui->ui_mutex); 1065 if (release) 1066 ubifs_release_budget(c, &req); 1067 } 1068 1069 return 0; 1070 } 1071 1072 static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, 1073 unsigned long nr_segs, loff_t pos) 1074 { 1075 int err; 1076 ssize_t ret; 1077 struct inode *inode = iocb->ki_filp->f_mapping->host; 1078 struct ubifs_info *c = inode->i_sb->s_fs_info; 1079 1080 err = update_mctime(c, inode); 1081 if (err) 1082 return err; 1083 1084 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 1085 if (ret < 0) 1086 return ret; 1087 1088 if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { 1089 err = ubifs_sync_wbufs_by_inode(c, inode); 1090 if (err) 1091 return err; 1092 } 1093 1094 return ret; 1095 } 1096 1097 static int ubifs_set_page_dirty(struct page *page) 1098 { 1099 int ret; 1100 1101 ret = __set_page_dirty_nobuffers(page); 1102 /* 1103 * An attempt to dirty a page without budgeting for it - should not 1104 * happen. 1105 */ 1106 ubifs_assert(ret == 0); 1107 return ret; 1108 } 1109 1110 static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) 1111 { 1112 /* 1113 * An attempt to release a dirty page without budgeting for it - should 1114 * not happen. 
1115 */ 1116 if (PageWriteback(page)) 1117 return 0; 1118 ubifs_assert(PagePrivate(page)); 1119 ubifs_assert(0); 1120 ClearPagePrivate(page); 1121 ClearPageChecked(page); 1122 return 1; 1123 } 1124 1125 /* 1126 * mmap()d file has taken write protection fault and is being made 1127 * writable. UBIFS must ensure page is budgeted for. 1128 */ 1129 static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 1130 { 1131 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1132 struct ubifs_info *c = inode->i_sb->s_fs_info; 1133 struct timespec now = ubifs_current_time(inode); 1134 struct ubifs_budget_req req = { .new_page = 1 }; 1135 int err, update_time; 1136 1137 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, 1138 i_size_read(inode)); 1139 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1140 1141 if (unlikely(c->ro_media)) 1142 return -EROFS; 1143 1144 /* 1145 * We have not locked @page so far so we may budget for changing the 1146 * page. Note, we cannot do this after we locked the page, because 1147 * budgeting may cause write-back which would cause deadlock. 1148 * 1149 * At the moment we do not know whether the page is dirty or not, so we 1150 * assume that it is not and budget for a new page. We could look at 1151 * the @PG_private flag and figure this out, but we may race with write 1152 * back and the page state may change by the time we lock it, so this 1153 * would need additional care. We do not bother with this at the 1154 * moment, although it might be good idea to do. Instead, we allocate 1155 * budget for a new page and amend it later on if the page was in fact 1156 * dirty. 1157 * 1158 * The budgeting-related logic of this function is similar to what we 1159 * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there 1160 * for more comments. 
1161 */ 1162 update_time = mctime_update_needed(inode, &now); 1163 if (update_time) 1164 /* 1165 * We have to change inode time stamp which requires extra 1166 * budgeting. 1167 */ 1168 req.dirtied_ino = 1; 1169 1170 err = ubifs_budget_space(c, &req); 1171 if (unlikely(err)) { 1172 if (err == -ENOSPC) 1173 ubifs_warn("out of space for mmapped file " 1174 "(inode number %lu)", inode->i_ino); 1175 return err; 1176 } 1177 1178 lock_page(page); 1179 if (unlikely(page->mapping != inode->i_mapping || 1180 page_offset(page) > i_size_read(inode))) { 1181 /* Page got truncated out from underneath us */ 1182 err = -EINVAL; 1183 goto out_unlock; 1184 } 1185 1186 if (PagePrivate(page)) 1187 release_new_page_budget(c); 1188 else { 1189 if (!PageChecked(page)) 1190 ubifs_convert_page_budget(c); 1191 SetPagePrivate(page); 1192 atomic_long_inc(&c->dirty_pg_cnt); 1193 __set_page_dirty_nobuffers(page); 1194 } 1195 1196 if (update_time) { 1197 int release; 1198 struct ubifs_inode *ui = ubifs_inode(inode); 1199 1200 mutex_lock(&ui->ui_mutex); 1201 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1202 release = ui->dirty; 1203 mark_inode_dirty_sync(inode); 1204 mutex_unlock(&ui->ui_mutex); 1205 if (release) 1206 ubifs_release_dirty_inode_budget(c, ui); 1207 } 1208 1209 unlock_page(page); 1210 return 0; 1211 1212 out_unlock: 1213 unlock_page(page); 1214 ubifs_release_budget(c, &req); 1215 return err; 1216 } 1217 1218 static struct vm_operations_struct ubifs_file_vm_ops = { 1219 .fault = filemap_fault, 1220 .page_mkwrite = ubifs_vm_page_mkwrite, 1221 }; 1222 1223 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1224 { 1225 int err; 1226 1227 /* 'generic_file_mmap()' takes care of NOMMU case */ 1228 err = generic_file_mmap(file, vma); 1229 if (err) 1230 return err; 1231 vma->vm_ops = &ubifs_file_vm_ops; 1232 return 0; 1233 } 1234 1235 struct address_space_operations ubifs_file_address_operations = { 1236 .readpage = ubifs_readpage, 1237 .writepage = 
ubifs_writepage, 1238 .write_begin = ubifs_write_begin, 1239 .write_end = ubifs_write_end, 1240 .invalidatepage = ubifs_invalidatepage, 1241 .set_page_dirty = ubifs_set_page_dirty, 1242 .releasepage = ubifs_releasepage, 1243 }; 1244 1245 struct inode_operations ubifs_file_inode_operations = { 1246 .setattr = ubifs_setattr, 1247 .getattr = ubifs_getattr, 1248 #ifdef CONFIG_UBIFS_FS_XATTR 1249 .setxattr = ubifs_setxattr, 1250 .getxattr = ubifs_getxattr, 1251 .listxattr = ubifs_listxattr, 1252 .removexattr = ubifs_removexattr, 1253 #endif 1254 }; 1255 1256 struct inode_operations ubifs_symlink_inode_operations = { 1257 .readlink = generic_readlink, 1258 .follow_link = ubifs_follow_link, 1259 .setattr = ubifs_setattr, 1260 .getattr = ubifs_getattr, 1261 }; 1262 1263 struct file_operations ubifs_file_operations = { 1264 .llseek = generic_file_llseek, 1265 .read = do_sync_read, 1266 .write = do_sync_write, 1267 .aio_read = generic_file_aio_read, 1268 .aio_write = ubifs_aio_write, 1269 .mmap = ubifs_file_mmap, 1270 .fsync = ubifs_fsync, 1271 .unlocked_ioctl = ubifs_ioctl, 1272 .splice_read = generic_file_splice_read, 1273 #ifdef CONFIG_COMPAT 1274 .compat_ioctl = ubifs_compat_ioctl, 1275 #endif 1276 }; 1277