// SPDX-License-Identifier: GPL-2.0-only
/* Event cache for netfilter. */

/*
 * (C) 2005 Harald Welte <laforge@gnumonks.org>
 * (C) 2005 Patrick McHardy <kaber@trash.net>
 * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>

static DEFINE_MUTEX(nf_ct_ecache_mutex);

#define ECACHE_RETRY_WAIT (HZ/10)
#define ECACHE_STACK_ALLOC (256 / sizeof(void *))

enum retry_state {
	STATE_CONGESTED,
	STATE_RESTART,
	STATE_DONE,
};

static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
{
	struct nf_conn *refs[ECACHE_STACK_ALLOC];
	enum retry_state ret = STATE_DONE;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int evicted = 0;

	spin_lock(&pcpu->lock);

	hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
		struct nf_conntrack_ecache *e;

		if (!nf_ct_is_confirmed(ct))
			continue;

		/* This ecache access is safe because the ct is on the
		 * pcpu dying list and we hold the spinlock -- the entry
		 * cannot be freed until after the lock is released.
		 *
		 * This is true even if ct has a refcount of 0: the
		 * cpu that is about to free the entry must remove it
		 * from the dying list and needs the lock to do so.
		 */
		e = nf_ct_ecache_find(ct);
		if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
			continue;

		/* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
		 * the worker owns this entry: the ct will remain valid
		 * until the worker puts its ct reference.
		 */
		if (nf_conntrack_event(IPCT_DESTROY, ct)) {
			ret = STATE_CONGESTED;
			break;
		}

		e->state = NFCT_ECACHE_DESTROY_SENT;
		refs[evicted] = ct;

		if (++evicted >= ARRAY_SIZE(refs)) {
			ret = STATE_RESTART;
			break;
		}
	}

	spin_unlock(&pcpu->lock);

	/* can't nf_ct_put() while holding the lock */
	while (evicted)
		nf_ct_put(refs[--evicted]);

	return ret;
}

static void ecache_work(struct work_struct *work)
{
	struct netns_ct *ctnet =
		container_of(work, struct netns_ct, ecache_dwork.work);
	int cpu, delay = -1;
	struct ct_pcpu *pcpu;

	local_bh_disable();

	for_each_possible_cpu(cpu) {
		enum retry_state ret;

		pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);

		ret = ecache_work_evict_list(pcpu);

		switch (ret) {
		case STATE_CONGESTED:
			delay = ECACHE_RETRY_WAIT;
			goto out;
		case STATE_RESTART:
			delay = 0;
			break;
		case STATE_DONE:
			break;
		}
	}

out:
	local_bh_enable();

	ctnet->ecache_dwork_pending = delay > 0;
	if (delay >= 0)
		schedule_delayed_work(&ctnet->ecache_dwork, delay);
}
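
/* Scheduling sketch (illustrative): nothing in this file arms the worker
 * on the hot path. The inline helpers in <net/netfilter/nf_conntrack_ecache.h>
 * drive ecache_dwork, roughly:
 *
 *	nf_conntrack_ecache_delayed_work(net);	arms the delayed work when
 *						a destroy event could not be
 *						delivered to the listener
 *	nf_conntrack_ecache_work(net);		kicks pending work so the
 *						backlog is retried at once
 *
 * Exact helper names and trigger points depend on the header version in
 * use.
 */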

int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
				  u32 portid, int report)
{
	int ret = 0;
	struct net *net = nf_ct_net(ct);
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify)
		goto out_unlock;

	e = nf_ct_ecache_find(ct);
	if (!e)
		goto out_unlock;

	if (nf_ct_is_confirmed(ct)) {
		struct nf_ct_event item = {
			.ct = ct,
			.portid = e->portid ? e->portid : portid,
			.report = report
		};
		/* Is this a resend of a destroy event? If so, skip the
		 * missed events.
		 */
		unsigned long missed = e->portid ? 0 : e->missed;

		if (!((eventmask | missed) & e->ctmask))
			goto out_unlock;

		ret = notify->fcn(eventmask | missed, &item);
		if (unlikely(ret < 0 || missed)) {
			spin_lock_bh(&ct->lock);
			if (ret < 0) {
				/* This is a destroy event that has been
				 * triggered by a process, we store the PORTID
				 * to include it in the retransmission.
				 */
				if (eventmask & (1 << IPCT_DESTROY)) {
					if (e->portid == 0 && portid != 0)
						e->portid = portid;
					e->state = NFCT_ECACHE_DESTROY_FAIL;
				} else {
					e->missed |= eventmask;
				}
			} else {
				e->missed &= ~missed;
			}
			spin_unlock_bh(&ct->lock);
		}
	}
out_unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
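
/* Usage sketch (illustrative): callers do not normally invoke
 * nf_conntrack_eventmask_report() directly; they go through the inline
 * wrappers in <net/netfilter/nf_conntrack_ecache.h>, which expand to
 * roughly:
 *
 *	nf_conntrack_event(IPCT_DESTROY, ct);
 *		-> nf_conntrack_eventmask_report(1 << IPCT_DESTROY, ct, 0, 0);
 *	nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report);
 *		-> nf_conntrack_eventmask_report(1 << IPCT_DESTROY, ct,
 *						 portid, report);
 *
 * The exact wrapper bodies (ecache/sysctl checks before the call) depend
 * on the header version in use.
 */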

/* Deliver cached events and clear the cache entry - must be called with
 * locally disabled softirqs.
 */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned long events, missed;
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	int ret;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (notify == NULL)
		goto out_unlock;

	e = nf_ct_ecache_find(ct);
	if (e == NULL)
		goto out_unlock;

	events = xchg(&e->cache, 0);

	if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
		goto out_unlock;

	/* We make a copy of the missed event cache without taking
	 * the lock, thus we may send missed events twice. However,
	 * this does not harm and it happens very rarely.
	 */
	missed = e->missed;

	if (!((events | missed) & e->ctmask))
		goto out_unlock;

	item.ct = ct;
	item.portid = 0;
	item.report = 0;

	ret = notify->fcn(events | missed, &item);

	if (likely(ret == 0 && !missed))
		goto out_unlock;

	spin_lock_bh(&ct->lock);
	if (ret < 0)
		e->missed |= events;
	else
		e->missed &= ~missed;
	spin_unlock_bh(&ct->lock);

out_unlock:
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);

void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
			       struct nf_conntrack_expect *exp,
			       u32 portid, int report)
{
	struct net *net = nf_ct_exp_net(exp);
	struct nf_exp_event_notifier *notify;
	struct nf_conntrack_ecache *e;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_expect_event_cb);
	if (!notify)
		goto out_unlock;

	e = nf_ct_ecache_find(exp->master);
	if (!e)
		goto out_unlock;

	if (e->expmask & (1 << event)) {
		struct nf_exp_event item = {
			.exp = exp,
			.portid = portid,
			.report = report
		};
		notify->fcn(1 << event, &item);
	}
out_unlock:
	rcu_read_unlock();
}

int nf_conntrack_register_notifier(struct net *net,
				   struct nf_ct_event_notifier *new)
{
	int ret;
	struct nf_ct_event_notifier *notify;

	mutex_lock(&nf_ct_ecache_mutex);
	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
					   lockdep_is_held(&nf_ct_ecache_mutex));
	if (notify != NULL) {
		ret = -EBUSY;
		goto out_unlock;
	}
	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
	ret = 0;

out_unlock:
	mutex_unlock(&nf_ct_ecache_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);

void nf_conntrack_unregister_notifier(struct net *net,
				      struct nf_ct_event_notifier *new)
{
	struct nf_ct_event_notifier *notify;

	mutex_lock(&nf_ct_ecache_mutex);
	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
					   lockdep_is_held(&nf_ct_ecache_mutex));
	BUG_ON(notify != new);
	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
	mutex_unlock(&nf_ct_ecache_mutex);
	/* synchronize_rcu() is called from ctnetlink_exit. */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);

int nf_ct_expect_register_notifier(struct net *net,
				   struct nf_exp_event_notifier *new)
{
	int ret;
	struct nf_exp_event_notifier *notify;

	mutex_lock(&nf_ct_ecache_mutex);
	notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
					   lockdep_is_held(&nf_ct_ecache_mutex));
	if (notify != NULL) {
		ret = -EBUSY;
		goto out_unlock;
	}
	rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
	ret = 0;

out_unlock:
	mutex_unlock(&nf_ct_ecache_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);

void nf_ct_expect_unregister_notifier(struct net *net,
				      struct nf_exp_event_notifier *new)
{
	struct nf_exp_event_notifier *notify;

	mutex_lock(&nf_ct_ecache_mutex);
	notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
					   lockdep_is_held(&nf_ct_ecache_mutex));
	BUG_ON(notify != new);
	RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
	mutex_unlock(&nf_ct_ecache_mutex);
	/* synchronize_rcu() is called from ctnetlink_exit. */
}
EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
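
/* Registration sketch (illustrative): in-tree, ctnetlink is the user of
 * these hooks and installs one notifier per event type, roughly:
 *
 *	static struct nf_ct_event_notifier ctnl_notifier = {
 *		.fcn = ctnetlink_conntrack_event,
 *	};
 *	...
 *	ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
 *
 * A second registration fails with -EBUSY: only one conntrack event
 * notifier and one expectation event notifier can be installed per netns.
 */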

#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;

static const struct nf_ct_ext_type event_extend = {
	.len	= sizeof(struct nf_conntrack_ecache),
	.align	= __alignof__(struct nf_conntrack_ecache),
	.id	= NF_CT_EXT_ECACHE,
};

void nf_conntrack_ecache_pernet_init(struct net *net)
{
	net->ct.sysctl_events = nf_ct_events;
	INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
}

void nf_conntrack_ecache_pernet_fini(struct net *net)
{
	cancel_delayed_work_sync(&net->ct.ecache_dwork);
}

int nf_conntrack_ecache_init(void)
{
	int ret = nf_ct_extend_register(&event_extend);

	if (ret < 0)
		pr_err("Unable to register event extension\n");

	BUILD_BUG_ON(__IPCT_MAX >= 16);	/* ctmask, missed use u16 */

	return ret;
}

void nf_conntrack_ecache_fini(void)
{
	nf_ct_extend_unregister(&event_extend);
}
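
/* Attachment sketch (illustrative): the ecache extension registered above
 * is added to each new conntrack by the core, gated on the pernet
 * sysctl_events toggle set up in nf_conntrack_ecache_pernet_init().
 * The helper lives in <net/netfilter/nf_conntrack_ecache.h> and is used
 * roughly like:
 *
 *	if (net->ct.sysctl_events)
 *		nf_ct_ecache_ext_add(ct, ctmask, expmask, GFP_ATOMIC);
 *
 * where zero ctmask/expmask default to "all events". Exact call sites
 * and defaults depend on the nf_conntrack_core.c version in use.
 */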