/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = list_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			page_cache_release(page);
			continue;
		}
		page_cache_release(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			put_pages_list(pages);
			break;
		}
		task_io_account_read(PAGE_CACHE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);

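/*
 * Example usage (an illustrative sketch only, not code used by this file):
 * a filesystem whose ->readpage() needs no extra context could drive
 * read_cache_pages() with a thin filler wrapper, passing the struct file as
 * the private data.  "example_filler" and "page_list" are made-up names:
 *
 *	static int example_filler(void *data, struct page *page)
 *	{
 *		struct file *filp = data;
 *
 *		return filp->f_mapping->a_ops->readpage(filp, page);
 *	}
 *
 *	err = read_cache_pages(mapping, &page_list, example_filler, filp);
 */
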
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	unsigned page_idx;
	int ret;

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
		}
		page_cache_release(page);
	}
	ret = 0;
out:
	return ret;
}

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
 * all the pages first, then submits them all for I/O.  This avoids the very
 * bad behaviour which would occur if page allocations are causing VM
 * writeback.  We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 *
 * do_page_cache_readahead() returns -1 if it encountered request queue
 * congestion.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page)
			continue;

		page = page_cache_alloc_cold(mapping);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ret++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
		pgoff_t offset, unsigned long nr_to_read)
{
	int ret = 0;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	while (nr_to_read) {
		int err;

		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		err = __do_page_cache_readahead(mapping, filp,
						offset, this_chunk, 0);
		if (err < 0) {
			ret = err;
			break;
		}
		ret += err;
		offset += this_chunk;
		nr_to_read -= this_chunk;
	}
	return ret;
}

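/*
 * Worked example of the chunking above, assuming PAGE_CACHE_SIZE is 4096:
 * this_chunk starts out as 2MB / 4KB = 512 pages, so a forced readahead of
 * nr_to_read = 768 pages (3MB) is issued as two calls to
 * __do_page_cache_readahead(), first 512 pages and then the remaining 256,
 * so that at most 2MB worth of freshly allocated pages is pinned at once.
 */
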
/*
 * This version skips the IO if the queue is read-congested, and will tell the
 * block layer to abandon the readahead if request allocation would block.
 *
 * force_page_cache_readahead() will ignore queue congestion and will block on
 * request queues.
 */
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read)
{
	if (bdi_read_congested(mapping->backing_dev_info))
		return -1;

	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
}

/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
static unsigned long ra_submit(struct file_ra_state *ra,
		       struct address_space *mapping, struct file *filp)
{
	int actual;

	actual = __do_page_cache_readahead(mapping, filp,
					ra->start, ra->size, ra->async_size);

	return actual;
}

/*
 * Set the initial window size: round the request up to the next power of two,
 * then scale it up (x4 for small requests, x2 for medium ones) and cap it at
 * max.  With a 128k (32 page) max readahead, a 1-2 page read starts with a
 * 16k window, 3-8 pages start with 32k-64k, and anything larger starts at
 * the full 128k.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
						unsigned long max)
{
	unsigned long cur = ra->size;
	unsigned long newsize;

	if (cur < max / 16)
		newsize = 4 * cur;
	else
		newsize = 2 * cur;

	return min(newsize, max);
}

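/*
 * Worked example of the two sizing helpers above, assuming 4k pages and the
 * default 128k (max = 32 pages) readahead: a sequential reader issuing 4k
 * reads gets an initial window of get_init_ra_size(1, 32) = 4 pages (16k),
 * which get_next_ra_size() then ramps up on each subsequent window to 8, 16
 * and finally 32 pages, where it stays capped at max.
 */
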
/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * invalidate each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as a
 * readahead indicator. The flag won't be set on already cached pages, to
 * avoid the readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
		   bool hit_readahead_marker, pgoff_t offset,
		   unsigned long req_size)
{
	int	max = ra->ra_pages;	/* max readahead pages */
	pgoff_t prev_offset;
	int	sequential;

	/*
	 * It's the expected callback offset, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
			offset == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
	sequential = offset - prev_offset <= 1UL || req_size > max;

	/*
	 * Standalone, small read.
	 * Read as-is, and do not pollute the readahead state.
	 */
	if (!hit_readahead_marker && !sequential) {
		return __do_page_cache_readahead(mapping, filp,
						offset, req_size, 0);
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals the
	 * readahead size.  Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = radix_tree_next_hole(&mapping->page_tree, offset,
						max + 1);
		rcu_read_unlock();

		if (!start || start - offset > max)
			return 0;

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * It may be one of
	 *  - first read on start of file
	 *  - sequential cache miss
	 *  - oversize random read
	 * Start readahead for it.
	 */
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	return ra_submit(ra, mapping, filp);
}

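/*
 * Worked example of the state machine above, assuming 4k pages and the
 * default 128k window (max = 32): the first 4k read at offset 0 misses the
 * cache, takes the "first read on start of file" path and sets start = 0,
 * size = 4, async_size = 3, so page 1 gets marked with PG_readahead.  When
 * the application later reads page 1, the marker is hit, the expected-offset
 * test (offset == start + size - async_size) matches, and the window is
 * pushed forward to start = 4, size = 8, async_size = 8, and so on until
 * size reaches max.
 */
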
/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
 */
void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_async_readahead() should be called when a page is used which
 * has the PG_readahead flag; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
void
page_cache_async_readahead(struct address_space *mapping,
			   struct file_ra_state *ra, struct file *filp,
			   struct page *page, pgoff_t offset,
			   unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (bdi_read_congested(mapping->backing_dev_info))
		return;

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);

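/*
 * Example call pattern (an illustrative sketch of a generic read loop, not
 * the actual generic file read code): a cache miss triggers the synchronous
 * entry point, while a hit on a PG_readahead-marked page triggers the
 * asynchronous one:
 *
 *	page = find_get_page(mapping, index);
 *	if (!page) {
 *		page_cache_sync_readahead(mapping, ra, filp,
 *					  index, last_index - index);
 *		page = find_get_page(mapping, index);
 *	} else if (PageReadahead(page)) {
 *		page_cache_async_readahead(mapping, ra, filp,
 *					   page, index, last_index - index);
 *	}
 */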