// SPDX-License-Identifier: GPL-2.0
/*
 * Request reply cache. This is currently a global cache, but this may
 * change in the future and be a per-client cache.
 *
 * This code is heavily inspired by the 44BSD implementation, although
 * it does things a bit differently.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */

#include <linux/sunrpc/svc_xprt.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sunrpc/addr.h>
#include <linux/highmem.h>
#include <linux/log2.h>
#include <linux/hash.h>
#include <net/checksum.h>

#include "nfsd.h"
#include "cache.h"

#define NFSDDBG_FACILITY	NFSDDBG_REPCACHE

/*
 * We use this value to determine the number of hash buckets from the max
 * cache size, the idea being that when the cache is at its maximum number
 * of entries, then this should be the average number of entries per bucket.
 */
#define TARGET_BUCKET_SIZE	64

struct nfsd_drc_bucket {
	struct rb_root rb_head;
	struct list_head lru_head;
	spinlock_t cache_lock;
};

static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc);

/*
 * Put a cap on the size of the DRC based on the amount of available
 * low memory in the machine.
 *
 *  64MB:    8192
 * 128MB:   11585
 * 256MB:   16384
 * 512MB:   23170
 *   1GB:   32768
 *   2GB:   46340
 *   4GB:   65536
 *   8GB:   92681
 *  16GB:  131072
 *
 * ...with a hard cap of 256k entries. In the worst case, each entry will be
 * ~1k, so the above numbers should give a rough max of the amount of memory
 * used in k.
 *
 * XXX: these limits are per-container, so memory used will increase
 * linearly with number of containers. Maybe that's OK.
 */
static unsigned int
nfsd_cache_size_limit(void)
{
	unsigned int limit;
	unsigned long low_pages = totalram_pages() - totalhigh_pages();

	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
	return min_t(unsigned int, limit, 256*1024);
}
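/*
 * Worked example, assuming 4K pages (PAGE_SHIFT == 12): with 1GB of low
 * memory, low_pages is about 262144, int_sqrt() of that is 512, so the
 * limit works out to (16 * 512) << 2 == 32768 entries, matching the
 * table above.
 */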

/*
 * Compute the number of hash buckets we need. Divide the max cachesize by
 * the "target" max bucket size, and round up to next power of two.
 */
static unsigned int
nfsd_hashsize(unsigned int limit)
{
	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}

static u32
nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
{
	return hash_32(be32_to_cpu(xid), nn->maskbits);
}

static struct svc_cacherep *
nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
			struct nfsd_net *nn)
{
	struct svc_cacherep *rp;

	rp = kmem_cache_alloc(nn->drc_slab, GFP_KERNEL);
	if (rp) {
		rp->c_state = RC_UNUSED;
		rp->c_type = RC_NOCACHE;
		RB_CLEAR_NODE(&rp->c_node);
		INIT_LIST_HEAD(&rp->c_lru);

		memset(&rp->c_key, 0, sizeof(rp->c_key));
		rp->c_key.k_xid = rqstp->rq_xid;
		rp->c_key.k_proc = rqstp->rq_proc;
		rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp));
		rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp)));
		rp->c_key.k_prot = rqstp->rq_prot;
		rp->c_key.k_vers = rqstp->rq_vers;
		rp->c_key.k_len = rqstp->rq_arg.len;
		rp->c_key.k_csum = csum;
	}
	return rp;
}

static void
nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
				struct nfsd_net *nn)
{
	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
		nn->drc_mem_usage -= rp->c_replvec.iov_len;
		kfree(rp->c_replvec.iov_base);
	}
	if (rp->c_state != RC_UNUSED) {
		rb_erase(&rp->c_node, &b->rb_head);
		list_del(&rp->c_lru);
		atomic_dec(&nn->num_drc_entries);
		nn->drc_mem_usage -= sizeof(*rp);
	}
	kmem_cache_free(nn->drc_slab, rp);
}

static void
nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
			struct nfsd_net *nn)
{
	spin_lock(&b->cache_lock);
	nfsd_reply_cache_free_locked(b, rp, nn);
	spin_unlock(&b->cache_lock);
}

int nfsd_reply_cache_init(struct nfsd_net *nn)
{
	unsigned int hashsize;
	unsigned int i;
	int status = 0;

	nn->max_drc_entries = nfsd_cache_size_limit();
	atomic_set(&nn->num_drc_entries, 0);
	hashsize = nfsd_hashsize(nn->max_drc_entries);
	nn->maskbits = ilog2(hashsize);

	nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
	nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
	nn->nfsd_reply_cache_shrinker.seeks = 1;
	status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
	if (status)
		goto out_nomem;

	nn->drc_slab = kmem_cache_create("nfsd_drc",
				sizeof(struct svc_cacherep), 0, 0, NULL);
	if (!nn->drc_slab)
		goto out_shrinker;

	nn->drc_hashtbl = kcalloc(hashsize,
				sizeof(*nn->drc_hashtbl), GFP_KERNEL);
	if (!nn->drc_hashtbl) {
		nn->drc_hashtbl = vzalloc(array_size(hashsize,
						sizeof(*nn->drc_hashtbl)));
		if (!nn->drc_hashtbl)
			goto out_slab;
	}

	for (i = 0; i < hashsize; i++) {
		INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
		spin_lock_init(&nn->drc_hashtbl[i].cache_lock);
	}
	nn->drc_hashsize = hashsize;

	return 0;
out_slab:
	kmem_cache_destroy(nn->drc_slab);
out_shrinker:
	unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
out_nomem:
	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
	return -ENOMEM;
}
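
/*
 * Tear down a per-net reply cache: unregister the shrinker, free every
 * cached entry, then release the hash table and the cacherep slab.
 */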
void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
{
	struct svc_cacherep *rp;
	unsigned int i;

	unregister_shrinker(&nn->nfsd_reply_cache_shrinker);

	for (i = 0; i < nn->drc_hashsize; i++) {
		struct list_head *head = &nn->drc_hashtbl[i].lru_head;

		while (!list_empty(head)) {
			rp = list_first_entry(head, struct svc_cacherep, c_lru);
			nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i],
									rp, nn);
		}
	}

	kvfree(nn->drc_hashtbl);
	nn->drc_hashtbl = NULL;
	nn->drc_hashsize = 0;

	kmem_cache_destroy(nn->drc_slab);
	nn->drc_slab = NULL;
}

/*
 * Move cache entry to end of LRU list, and queue the cleaner to run if it's
 * not already scheduled.
 */
static void
lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
	rp->c_timestamp = jiffies;
	list_move_tail(&rp->c_lru, &b->lru_head);
}

static long
prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
{
	struct svc_cacherep *rp, *tmp;
	long freed = 0;

	list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
		/*
		 * Don't free entries attached to calls that are still
		 * in-progress, but do keep scanning the list.
		 */
		if (rp->c_state == RC_INPROG)
			continue;
		if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries &&
		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
			break;
		nfsd_reply_cache_free_locked(b, rp, nn);
		freed++;
	}
	return freed;
}

/*
 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
 * Also prune the oldest ones when the total exceeds the max number of entries.
 */
static long
prune_cache_entries(struct nfsd_net *nn)
{
	unsigned int i;
	long freed = 0;

	for (i = 0; i < nn->drc_hashsize; i++) {
		struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i];

		if (list_empty(&b->lru_head))
			continue;
		spin_lock(&b->cache_lock);
		freed += prune_bucket(b, nn);
		spin_unlock(&b->cache_lock);
	}
	return freed;
}

static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct nfsd_net *nn = container_of(shrink,
				struct nfsd_net, nfsd_reply_cache_shrinker);

	return atomic_read(&nn->num_drc_entries);
}

static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct nfsd_net *nn = container_of(shrink,
				struct nfsd_net, nfsd_reply_cache_shrinker);

	return prune_cache_entries(nn);
}

/*
 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
 */
static __wsum
nfsd_cache_csum(struct svc_rqst *rqstp)
{
	int idx;
	unsigned int base;
	__wsum csum;
	struct xdr_buf *buf = &rqstp->rq_arg;
	const unsigned char *p = buf->head[0].iov_base;
	size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
				RC_CSUMLEN);
	size_t len = min(buf->head[0].iov_len, csum_len);

	/* rq_arg.head first */
	csum = csum_partial(p, len, 0);
	csum_len -= len;

	/* Continue into page array */
	idx = buf->page_base / PAGE_SIZE;
	base = buf->page_base & ~PAGE_MASK;
	while (csum_len) {
		p = page_address(buf->pages[idx]) + base;
		len = min_t(size_t, PAGE_SIZE - base, csum_len);
		csum = csum_partial(p, len, csum);
		csum_len -= len;
		base = 0;
		++idx;
	}
	return csum;
}
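
/*
 * Compare a lookup key against an existing cache entry. A matching XID
 * with a different payload checksum is counted as a payload miss for the
 * reply_cache_stats file.
 */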
static int
nfsd_cache_key_cmp(const struct svc_cacherep *key,
			const struct svc_cacherep *rp, struct nfsd_net *nn)
{
	if (key->c_key.k_xid == rp->c_key.k_xid &&
	    key->c_key.k_csum != rp->c_key.k_csum)
		++nn->payload_misses;

	return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
}

/*
 * Search the request hash for an entry that matches the given rqstp.
 * Must be called with cache_lock held. Returns the found entry or
 * inserts an empty key on failure.
 */
static struct svc_cacherep *
nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
			struct nfsd_net *nn)
{
	struct svc_cacherep	*rp, *ret = key;
	struct rb_node		**p = &b->rb_head.rb_node,
				*parent = NULL;
	unsigned int		entries = 0;
	int cmp;

	while (*p != NULL) {
		++entries;
		parent = *p;
		rp = rb_entry(parent, struct svc_cacherep, c_node);

		cmp = nfsd_cache_key_cmp(key, rp, nn);
		if (cmp < 0)
			p = &parent->rb_left;
		else if (cmp > 0)
			p = &parent->rb_right;
		else {
			ret = rp;
			goto out;
		}
	}
	rb_link_node(&key->c_node, parent, p);
	rb_insert_color(&key->c_node, &b->rb_head);
out:
	/* tally hash chain length stats */
	if (entries > nn->longest_chain) {
		nn->longest_chain = entries;
		nn->longest_chain_cachesize = atomic_read(&nn->num_drc_entries);
	} else if (entries == nn->longest_chain) {
		/* prefer to keep the smallest cachesize possible here */
		nn->longest_chain_cachesize = min_t(unsigned int,
				nn->longest_chain_cachesize,
				atomic_read(&nn->num_drc_entries));
	}

	lru_put_end(b, ret);
	return ret;
}
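
/*
 * A cache entry moves through three states: it is allocated RC_UNUSED,
 * marked RC_INPROG once nfsd_cache_lookup() inserts it for a new call,
 * and set to RC_DONE by nfsd_cache_update() when the reply is cached.
 * prune_bucket() never frees RC_INPROG entries.
 */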

/*
 * Try to find an entry matching the current call in the cache. When none
 * is found, we try to grab the oldest expired entry off the LRU list. If
 * a suitable one isn't there, then drop the cache_lock and allocate a
 * new one, then search again in case one got inserted while this thread
 * didn't hold the lock.
 */
int
nfsd_cache_lookup(struct svc_rqst *rqstp)
{
	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
	struct svc_cacherep	*rp, *found;
	__be32			xid = rqstp->rq_xid;
	__wsum			csum;
	u32 hash = nfsd_cache_hash(xid, nn);
	struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
	int type = rqstp->rq_cachetype;
	int rtn = RC_DOIT;

	rqstp->rq_cacherep = NULL;
	if (type == RC_NOCACHE) {
		nfsdstats.rcnocache++;
		return rtn;
	}

	csum = nfsd_cache_csum(rqstp);

	/*
	 * Since the common case is a cache miss followed by an insert,
	 * preallocate an entry.
	 */
	rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
	if (!rp) {
		dprintk("nfsd: unable to allocate DRC entry!\n");
		return rtn;
	}

	spin_lock(&b->cache_lock);
	found = nfsd_cache_insert(b, rp, nn);
	if (found != rp) {
		nfsd_reply_cache_free_locked(NULL, rp, nn);
		rp = found;
		goto found_entry;
	}

	nfsdstats.rcmisses++;
	rqstp->rq_cacherep = rp;
	rp->c_state = RC_INPROG;

	atomic_inc(&nn->num_drc_entries);
	nn->drc_mem_usage += sizeof(*rp);

	/* go ahead and prune the cache */
	prune_bucket(b, nn);
out:
	spin_unlock(&b->cache_lock);
	return rtn;

found_entry:
	/* We found a matching entry which is either in progress or done. */
	nfsdstats.rchits++;
	rtn = RC_DROPIT;

	/* Request being processed */
	if (rp->c_state == RC_INPROG)
		goto out;

	/* From the hall of fame of impractical attacks:
	 * Is this a user who tries to snoop on the cache? */
	rtn = RC_DOIT;
	if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
		goto out;

	/* Compose RPC reply header */
	switch (rp->c_type) {
	case RC_NOCACHE:
		break;
	case RC_REPLSTAT:
		svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat);
		rtn = RC_REPLY;
		break;
	case RC_REPLBUFF:
		if (!nfsd_cache_append(rqstp, &rp->c_replvec))
			goto out;	/* should not happen */
		rtn = RC_REPLY;
		break;
	default:
		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
		nfsd_reply_cache_free_locked(b, rp, nn);
	}

	goto out;
}

/*
 * Update a cache entry. This is called from nfsd_dispatch when
 * the procedure has been executed and the complete reply is in
 * rqstp->rq_res.
 *
 * We're copying around data here rather than swapping buffers because
 * the toplevel loop requires max-sized buffers, which would be a waste
 * of memory for a cache with a max reply size of 100 bytes (diropokres).
 *
 * If we should start to use different types of cache entries tailored
 * specifically for attrstat and fh's, we may save even more space.
 *
 * Also note that a cachetype of RC_NOCACHE can legally be passed when
 * nfsd failed to encode a reply that otherwise would have been cached.
 * In this case, nfsd_cache_update is called with statp == NULL.
 */
void
nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
	struct svc_cacherep *rp = rqstp->rq_cacherep;
	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
	u32		hash;
	struct nfsd_drc_bucket *b;
	int		len;
	size_t		bufsize = 0;

	if (!rp)
		return;

	hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
	b = &nn->drc_hashtbl[hash];

	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
	len >>= 2;

	/* Don't cache excessive amounts of data and XDR failures */
	if (!statp || len > (256 >> 2)) {
		nfsd_reply_cache_free(b, rp, nn);
		return;
	}

	switch (cachetype) {
	case RC_REPLSTAT:
		if (len != 1)
			printk("nfsd: RC_REPLSTAT/reply len %d!\n", len);
		rp->c_replstat = *statp;
		break;
	case RC_REPLBUFF:
		cachv = &rp->c_replvec;
		bufsize = len << 2;
		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
		if (!cachv->iov_base) {
			nfsd_reply_cache_free(b, rp, nn);
			return;
		}
		cachv->iov_len = bufsize;
		memcpy(cachv->iov_base, statp, bufsize);
		break;
	case RC_NOCACHE:
		nfsd_reply_cache_free(b, rp, nn);
		return;
	}
	spin_lock(&b->cache_lock);
	nn->drc_mem_usage += bufsize;
	lru_put_end(b, rp);
	rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
	rp->c_type = cachetype;
	rp->c_state = RC_DONE;
	spin_unlock(&b->cache_lock);
	return;
}

/*
 * Copy cached reply to current reply buffer. Should always fit.
 * FIXME as reply is in a page, we should just attach the page, and
 * keep a refcount....
 */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
	struct kvec	*vec = &rqstp->rq_res.head[0];

	if (vec->iov_len + data->iov_len > PAGE_SIZE) {
		printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
				data->iov_len);
		return 0;
	}
	memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
	vec->iov_len += data->iov_len;
	return 1;
}

/*
 * Note that fields may be added, removed or reordered in the future. Programs
 * scraping this file for info should test the labels to ensure they're
 * getting the correct field.
 */
static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
	struct nfsd_net *nn = m->private;

	seq_printf(m, "max entries: %u\n", nn->max_drc_entries);
	seq_printf(m, "num entries: %u\n",
			atomic_read(&nn->num_drc_entries));
	seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits);
	seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage);
	seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
	seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses);
	seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache);
	seq_printf(m, "payload misses: %u\n", nn->payload_misses);
	seq_printf(m, "longest chain len: %u\n", nn->longest_chain);
	seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
	return 0;
}
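
/*
 * Open handler for the reply_cache_stats file: resolve the per-net
 * nfsd_net from the superblock's s_fs_info so the statistics above are
 * reported per network namespace.
 */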
int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
{
	struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info,
								nfsd_net_id);

	return single_open(file, nfsd_reply_cache_stats_show, nn);
}