1 // SPDX-License-Identifier: GPL-2.0
2 /* Generic nexthop implementation
3 *
4 * Copyright (c) 2017-19 Cumulus Networks
5 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
6 */
7
8 #include <linux/nexthop.h>
9 #include <linux/rtnetlink.h>
10 #include <linux/slab.h>
11 #include <linux/vmalloc.h>
12 #include <net/arp.h>
13 #include <net/ipv6_stubs.h>
14 #include <net/lwtunnel.h>
15 #include <net/ndisc.h>
16 #include <net/nexthop.h>
17 #include <net/route.h>
18 #include <net/sock.h>
19
20 #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ)
21 #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */
22
23 static void remove_nexthop(struct net *net, struct nexthop *nh,
24 struct nl_info *nlinfo);
25
26 #define NH_DEV_HASHBITS 8
27 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
28
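/* Netlink attribute policies used to validate new/get/dump nexthop
 * requests and resilient-group bucket requests.
 */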
29 static const struct nla_policy rtm_nh_policy_new[] = {
30 [NHA_ID] = { .type = NLA_U32 },
31 [NHA_GROUP] = { .type = NLA_BINARY },
32 [NHA_GROUP_TYPE] = { .type = NLA_U16 },
33 [NHA_BLACKHOLE] = { .type = NLA_FLAG },
34 [NHA_OIF] = { .type = NLA_U32 },
35 [NHA_GATEWAY] = { .type = NLA_BINARY },
36 [NHA_ENCAP_TYPE] = { .type = NLA_U16 },
37 [NHA_ENCAP] = { .type = NLA_NESTED },
38 [NHA_FDB] = { .type = NLA_FLAG },
39 [NHA_RES_GROUP] = { .type = NLA_NESTED },
40 };
41
42 static const struct nla_policy rtm_nh_policy_get[] = {
43 [NHA_ID] = { .type = NLA_U32 },
44 };
45
46 static const struct nla_policy rtm_nh_policy_dump[] = {
47 [NHA_OIF] = { .type = NLA_U32 },
48 [NHA_GROUPS] = { .type = NLA_FLAG },
49 [NHA_MASTER] = { .type = NLA_U32 },
50 [NHA_FDB] = { .type = NLA_FLAG },
51 };
52
53 static const struct nla_policy rtm_nh_res_policy_new[] = {
54 [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 },
55 [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 },
56 [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 },
57 };
58
59 static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
60 [NHA_ID] = { .type = NLA_U32 },
61 [NHA_OIF] = { .type = NLA_U32 },
62 [NHA_MASTER] = { .type = NLA_U32 },
63 [NHA_RES_BUCKET] = { .type = NLA_NESTED },
64 };
65
66 static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
67 [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 },
68 };
69
70 static const struct nla_policy rtm_nh_policy_get_bucket[] = {
71 [NHA_ID] = { .type = NLA_U32 },
72 [NHA_RES_BUCKET] = { .type = NLA_NESTED },
73 };
74
75 static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
76 [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 },
77 };
78
static bool nexthop_notifiers_is_empty(struct net *net)
80 {
81 return !net->nexthop.notifier_chain.head;
82 }
83
84 static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
86 const struct nh_info *nhi)
87 {
88 nh_info->dev = nhi->fib_nhc.nhc_dev;
89 nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
90 if (nh_info->gw_family == AF_INET)
91 nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
92 else if (nh_info->gw_family == AF_INET6)
93 nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
94
95 nh_info->is_reject = nhi->reject_nh;
96 nh_info->is_fdb = nhi->fdb_nh;
97 nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
98 }
99
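/* Allocate and fill the notifier payload for a single (non-group) nexthop.
 * The payload is released with nh_notifier_single_info_fini().
 */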
static int nh_notifier_single_info_init(struct nh_notifier_info *info,
101 const struct nexthop *nh)
102 {
103 struct nh_info *nhi = rtnl_dereference(nh->nh_info);
104
105 info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
106 info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
107 if (!info->nh)
108 return -ENOMEM;
109
110 __nh_notifier_single_info_init(info->nh, nhi);
111
112 return 0;
113 }
114
static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
116 {
117 kfree(info->nh);
118 }
119
static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
121 struct nh_group *nhg)
122 {
123 u16 num_nh = nhg->num_nh;
124 int i;
125
126 info->type = NH_NOTIFIER_INFO_TYPE_GRP;
127 info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
128 GFP_KERNEL);
129 if (!info->nh_grp)
130 return -ENOMEM;
131
132 info->nh_grp->num_nh = num_nh;
133 info->nh_grp->is_fdb = nhg->fdb_nh;
134
135 for (i = 0; i < num_nh; i++) {
136 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
137 struct nh_info *nhi;
138
139 nhi = rtnl_dereference(nhge->nh->nh_info);
140 info->nh_grp->nh_entries[i].id = nhge->nh->id;
141 info->nh_grp->nh_entries[i].weight = nhge->weight;
142 __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
143 nhi);
144 }
145
146 return 0;
147 }
148
static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
150 struct nh_group *nhg)
151 {
152 struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
153 u16 num_nh_buckets = res_table->num_nh_buckets;
154 unsigned long size;
155 u16 i;
156
157 info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
158 size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
159 info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
160 __GFP_NOWARN);
161 if (!info->nh_res_table)
162 return -ENOMEM;
163
164 info->nh_res_table->num_nh_buckets = num_nh_buckets;
165
166 for (i = 0; i < num_nh_buckets; i++) {
167 struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
168 struct nh_grp_entry *nhge;
169 struct nh_info *nhi;
170
171 nhge = rtnl_dereference(bucket->nh_entry);
172 nhi = rtnl_dereference(nhge->nh->nh_info);
173 __nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
174 nhi);
175 }
176
177 return 0;
178 }
179
static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
181 const struct nexthop *nh)
182 {
183 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
184
185 if (nhg->hash_threshold)
186 return nh_notifier_mpath_info_init(info, nhg);
187 else if (nhg->resilient)
188 return nh_notifier_res_table_info_init(info, nhg);
189 return -EINVAL;
190 }
191
static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
193 const struct nexthop *nh)
194 {
195 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
196
197 if (nhg->hash_threshold)
198 kfree(info->nh_grp);
199 else if (nhg->resilient)
200 vfree(info->nh_res_table);
201 }
202
static int nh_notifier_info_init(struct nh_notifier_info *info,
204 const struct nexthop *nh)
205 {
206 info->id = nh->id;
207
208 if (nh->is_group)
209 return nh_notifier_grp_info_init(info, nh);
210 else
211 return nh_notifier_single_info_init(info, nh);
212 }
213
static void nh_notifier_info_fini(struct nh_notifier_info *info,
215 const struct nexthop *nh)
216 {
217 if (nh->is_group)
218 nh_notifier_grp_info_fini(info, nh);
219 else
220 nh_notifier_single_info_fini(info);
221 }
222
static int call_nexthop_notifiers(struct net *net,
224 enum nexthop_event_type event_type,
225 struct nexthop *nh,
226 struct netlink_ext_ack *extack)
227 {
228 struct nh_notifier_info info = {
229 .net = net,
230 .extack = extack,
231 };
232 int err;
233
234 ASSERT_RTNL();
235
236 if (nexthop_notifiers_is_empty(net))
237 return 0;
238
239 err = nh_notifier_info_init(&info, nh);
240 if (err) {
241 NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
242 return err;
243 }
244
245 err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
246 event_type, &info);
247 nh_notifier_info_fini(&info, nh);
248
249 return notifier_to_errno(err);
250 }
251
252 static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
254 bool force, unsigned int *p_idle_timer_ms)
255 {
256 struct nh_res_table *res_table;
257 struct nh_group *nhg;
258 struct nexthop *nh;
259 int err = 0;
260
261 /* When 'force' is false, nexthop bucket replacement is performed
262 * because the bucket was deemed to be idle. In this case, capable
263 * listeners can choose to perform an atomic replacement: The bucket is
264 * only replaced if it is inactive. However, if the idle timer interval
265 * is smaller than the interval in which a listener is querying
266 * buckets' activity from the device, then atomic replacement should
267 * not be tried. Pass the idle timer value to listeners, so that they
268 * could determine which type of replacement to perform.
269 */
270 if (force) {
271 *p_idle_timer_ms = 0;
272 return 0;
273 }
274
275 rcu_read_lock();
276
277 nh = nexthop_find_by_id(info->net, info->id);
278 if (!nh) {
279 err = -EINVAL;
280 goto out;
281 }
282
283 nhg = rcu_dereference(nh->nh_grp);
284 res_table = rcu_dereference(nhg->res_table);
285 *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
286
287 out:
288 rcu_read_unlock();
289
290 return err;
291 }
292
static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
294 u16 bucket_index, bool force,
295 struct nh_info *oldi,
296 struct nh_info *newi)
297 {
298 unsigned int idle_timer_ms;
299 int err;
300
301 err = nh_notifier_res_bucket_idle_timer_get(info, force,
302 &idle_timer_ms);
303 if (err)
304 return err;
305
306 info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
307 info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
308 GFP_KERNEL);
309 if (!info->nh_res_bucket)
310 return -ENOMEM;
311
312 info->nh_res_bucket->bucket_index = bucket_index;
313 info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
314 info->nh_res_bucket->force = force;
315 __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
316 __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
317 return 0;
318 }
319
static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
321 {
322 kfree(info->nh_res_bucket);
323 }
324
static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
326 u16 bucket_index, bool force,
327 struct nh_info *oldi,
328 struct nh_info *newi,
329 struct netlink_ext_ack *extack)
330 {
331 struct nh_notifier_info info = {
332 .net = net,
333 .extack = extack,
334 .id = nhg_id,
335 };
336 int err;
337
338 if (nexthop_notifiers_is_empty(net))
339 return 0;
340
341 err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
342 oldi, newi);
343 if (err)
344 return err;
345
346 err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
347 NEXTHOP_EVENT_BUCKET_REPLACE, &info);
348 nh_notifier_res_bucket_info_fini(&info);
349
350 return notifier_to_errno(err);
351 }
352
353 /* There are three users of RES_TABLE, and NHs etc. referenced from there:
354 *
355 * 1) a collection of callbacks for NH maintenance. This operates under
356 * RTNL,
357 * 2) the delayed work that gradually balances the resilient table,
358 * 3) and nexthop_select_path(), operating under RCU.
359 *
360 * Both the delayed work and the RTNL block are writers, and need to
361 * maintain mutual exclusion. Since there are only two and well-known
362 * writers for each table, the RTNL code can make sure it has exclusive
363 * access thus:
364 *
365 * - Have the DW operate without locking;
366 * - synchronously cancel the DW;
367 * - do the writing;
368 * - if the write was not actually a delete, call upkeep, which schedules
369 * DW again if necessary.
370 *
371 * The functions that are always called from the RTNL context use
372 * rtnl_dereference(). The functions that can also be called from the DW do
373 * a raw dereference and rely on the above mutual exclusion scheme.
374 */
375 #define nh_res_dereference(p) (rcu_dereference_raw(p))
376
static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
378 u16 bucket_index, bool force,
379 struct nexthop *old_nh,
380 struct nexthop *new_nh,
381 struct netlink_ext_ack *extack)
382 {
383 struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
384 struct nh_info *newi = nh_res_dereference(new_nh->nh_info);
385
386 return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
387 force, oldi, newi, extack);
388 }
389
static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
391 struct netlink_ext_ack *extack)
392 {
393 struct nh_notifier_info info = {
394 .net = net,
395 .extack = extack,
396 };
397 struct nh_group *nhg;
398 int err;
399
400 ASSERT_RTNL();
401
402 if (nexthop_notifiers_is_empty(net))
403 return 0;
404
405 /* At this point, the nexthop buckets are still not populated. Only
406 * emit a notification with the logical nexthops, so that a listener
407 * could potentially veto it in case of unsupported configuration.
408 */
409 nhg = rtnl_dereference(nh->nh_grp);
410 err = nh_notifier_mpath_info_init(&info, nhg);
411 if (err) {
412 NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
413 return err;
414 }
415
416 err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
417 NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
418 &info);
419 kfree(info.nh_grp);
420
421 return notifier_to_errno(err);
422 }
423
static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
425 enum nexthop_event_type event_type,
426 struct nexthop *nh,
427 struct netlink_ext_ack *extack)
428 {
429 struct nh_notifier_info info = {
430 .net = net,
431 .extack = extack,
432 };
433 int err;
434
435 err = nh_notifier_info_init(&info, nh);
436 if (err)
437 return err;
438
439 err = nb->notifier_call(nb, event_type, &info);
440 nh_notifier_info_fini(&info, nh);
441
442 return notifier_to_errno(err);
443 }
444
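/* Fold an ifindex down to NH_DEV_HASHBITS bits for the per-netns
 * device hash table.
 */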
static unsigned int nh_dev_hashfn(unsigned int val)
446 {
447 unsigned int mask = NH_DEV_HASHSIZE - 1;
448
449 return (val ^
450 (val >> NH_DEV_HASHBITS) ^
451 (val >> (NH_DEV_HASHBITS * 2))) & mask;
452 }
453
static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
455 {
456 struct net_device *dev = nhi->fib_nhc.nhc_dev;
457 struct hlist_head *head;
458 unsigned int hash;
459
460 WARN_ON(!dev);
461
462 hash = nh_dev_hashfn(dev->ifindex);
463 head = &net->nexthop.devhash[hash];
464 hlist_add_head(&nhi->dev_hash, head);
465 }
466
static void nexthop_free_group(struct nexthop *nh)
468 {
469 struct nh_group *nhg;
470 int i;
471
472 nhg = rcu_dereference_raw(nh->nh_grp);
473 for (i = 0; i < nhg->num_nh; ++i) {
474 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
475
476 WARN_ON(!list_empty(&nhge->nh_list));
477 nexthop_put(nhge->nh);
478 }
479
480 WARN_ON(nhg->spare == nhg);
481
482 if (nhg->resilient)
483 vfree(rcu_dereference_raw(nhg->res_table));
484
485 kfree(nhg->spare);
486 kfree(nhg);
487 }
488
static void nexthop_free_single(struct nexthop *nh)
490 {
491 struct nh_info *nhi;
492
493 nhi = rcu_dereference_raw(nh->nh_info);
494 switch (nhi->family) {
495 case AF_INET:
496 fib_nh_release(nh->net, &nhi->fib_nh);
497 break;
498 case AF_INET6:
499 ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
500 break;
501 }
502 kfree(nhi);
503 }
504
void nexthop_free_rcu(struct rcu_head *head)
506 {
507 struct nexthop *nh = container_of(head, struct nexthop, rcu);
508
509 if (nh->is_group)
510 nexthop_free_group(nh);
511 else
512 nexthop_free_single(nh);
513
514 kfree(nh);
515 }
516 EXPORT_SYMBOL_GPL(nexthop_free_rcu);
517
static struct nexthop *nexthop_alloc(void)
519 {
520 struct nexthop *nh;
521
522 nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
523 if (nh) {
524 INIT_LIST_HEAD(&nh->fi_list);
525 INIT_LIST_HEAD(&nh->f6i_list);
526 INIT_LIST_HEAD(&nh->grp_list);
527 INIT_LIST_HEAD(&nh->fdb_list);
528 }
529 return nh;
530 }
531
static struct nh_group *nexthop_grp_alloc(u16 num_nh)
533 {
534 struct nh_group *nhg;
535
536 nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
537 if (nhg)
538 nhg->num_nh = num_nh;
539
540 return nhg;
541 }
542
543 static void nh_res_table_upkeep_dw(struct work_struct *work);
544
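/* The bucket array of a resilient group can be large (the bucket count is
 * a u16), so the table is vmalloc'ed. Buckets start out unoccupied; the
 * upkeep work and the underweight-entries list are initialized here.
 */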
545 static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
547 {
548 const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
549 struct nh_res_table *res_table;
550 unsigned long size;
551
552 size = struct_size(res_table, nh_buckets, num_nh_buckets);
553 res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
554 if (!res_table)
555 return NULL;
556
557 res_table->net = net;
558 res_table->nhg_id = nhg_id;
559 INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
560 INIT_LIST_HEAD(&res_table->uw_nh_entries);
561 res_table->idle_timer = cfg->nh_grp_res_idle_timer;
562 res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
563 res_table->num_nh_buckets = num_nh_buckets;
564 return res_table;
565 }
566
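/* Bump the per-netns nexthop sequence number, skipping the value 0. */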
static void nh_base_seq_inc(struct net *net)
568 {
569 while (++net->nexthop.seq == 0)
570 ;
571 }
572
573 /* no reference taken; rcu lock or rtnl must be held */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
575 {
576 struct rb_node **pp, *parent = NULL, *next;
577
578 pp = &net->nexthop.rb_root.rb_node;
579 while (1) {
580 struct nexthop *nh;
581
582 next = rcu_dereference_raw(*pp);
583 if (!next)
584 break;
585 parent = next;
586
587 nh = rb_entry(parent, struct nexthop, rb_node);
588 if (id < nh->id)
589 pp = &next->rb_left;
590 else if (id > nh->id)
591 pp = &next->rb_right;
592 else
593 return nh;
594 }
595 return NULL;
596 }
597 EXPORT_SYMBOL_GPL(nexthop_find_by_id);
598
599 /* used for auto id allocation; called with rtnl held */
static u32 nh_find_unused_id(struct net *net)
601 {
602 u32 id_start = net->nexthop.last_id_allocated;
603
604 while (1) {
605 net->nexthop.last_id_allocated++;
606 if (net->nexthop.last_id_allocated == id_start)
607 break;
608
609 if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
610 return net->nexthop.last_id_allocated;
611 }
612 return 0;
613 }
614
static void nh_res_time_set_deadline(unsigned long next_time,
616 unsigned long *deadline)
617 {
618 if (time_before(next_time, *deadline))
619 *deadline = next_time;
620 }
621
static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
623 {
624 if (list_empty(&res_table->uw_nh_entries))
625 return 0;
626 return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
627 }
628
static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
630 {
631 struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
632 struct nlattr *nest;
633
634 nest = nla_nest_start(skb, NHA_RES_GROUP);
635 if (!nest)
636 return -EMSGSIZE;
637
638 if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
639 res_table->num_nh_buckets) ||
640 nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
641 jiffies_to_clock_t(res_table->idle_timer)) ||
642 nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
643 jiffies_to_clock_t(res_table->unbalanced_timer)) ||
644 nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
645 nh_res_table_unbalanced_time(res_table),
646 NHA_RES_GROUP_PAD))
647 goto nla_put_failure;
648
649 nla_nest_end(skb, nest);
650 return 0;
651
652 nla_put_failure:
653 nla_nest_cancel(skb, nest);
654 return -EMSGSIZE;
655 }
656
static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
658 {
659 struct nexthop_grp *p;
660 size_t len = nhg->num_nh * sizeof(*p);
661 struct nlattr *nla;
662 u16 group_type = 0;
663 int i;
664
665 if (nhg->hash_threshold)
666 group_type = NEXTHOP_GRP_TYPE_MPATH;
667 else if (nhg->resilient)
668 group_type = NEXTHOP_GRP_TYPE_RES;
669
670 if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
671 goto nla_put_failure;
672
673 nla = nla_reserve(skb, NHA_GROUP, len);
674 if (!nla)
675 goto nla_put_failure;
676
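	/* Weights are carried zero-based in struct nexthop_grp, so a nexthop
	 * with internal weight N is dumped as N - 1.
	 */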
677 p = nla_data(nla);
678 for (i = 0; i < nhg->num_nh; ++i) {
679 *p++ = (struct nexthop_grp) {
680 .id = nhg->nh_entries[i].nh->id,
681 .weight = nhg->nh_entries[i].weight - 1,
682 };
683 }
684
685 if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
686 goto nla_put_failure;
687
688 return 0;
689
690 nla_put_failure:
691 return -EMSGSIZE;
692 }
693
static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
695 int event, u32 portid, u32 seq, unsigned int nlflags)
696 {
697 struct fib6_nh *fib6_nh;
698 struct fib_nh *fib_nh;
699 struct nlmsghdr *nlh;
700 struct nh_info *nhi;
701 struct nhmsg *nhm;
702
703 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
704 if (!nlh)
705 return -EMSGSIZE;
706
707 nhm = nlmsg_data(nlh);
708 nhm->nh_family = AF_UNSPEC;
709 nhm->nh_flags = nh->nh_flags;
710 nhm->nh_protocol = nh->protocol;
711 nhm->nh_scope = 0;
712 nhm->resvd = 0;
713
714 if (nla_put_u32(skb, NHA_ID, nh->id))
715 goto nla_put_failure;
716
717 if (nh->is_group) {
718 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
719
720 if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
721 goto nla_put_failure;
722 if (nla_put_nh_group(skb, nhg))
723 goto nla_put_failure;
724 goto out;
725 }
726
727 nhi = rtnl_dereference(nh->nh_info);
728 nhm->nh_family = nhi->family;
729 if (nhi->reject_nh) {
730 if (nla_put_flag(skb, NHA_BLACKHOLE))
731 goto nla_put_failure;
732 goto out;
733 } else if (nhi->fdb_nh) {
734 if (nla_put_flag(skb, NHA_FDB))
735 goto nla_put_failure;
736 } else {
737 const struct net_device *dev;
738
739 dev = nhi->fib_nhc.nhc_dev;
740 if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
741 goto nla_put_failure;
742 }
743
744 nhm->nh_scope = nhi->fib_nhc.nhc_scope;
745 switch (nhi->family) {
746 case AF_INET:
747 fib_nh = &nhi->fib_nh;
748 if (fib_nh->fib_nh_gw_family &&
749 nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
750 goto nla_put_failure;
751 break;
752
753 case AF_INET6:
754 fib6_nh = &nhi->fib6_nh;
755 if (fib6_nh->fib_nh_gw_family &&
756 nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
757 goto nla_put_failure;
758 break;
759 }
760
761 if (nhi->fib_nhc.nhc_lwtstate &&
762 lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
763 NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
764 goto nla_put_failure;
765
766 out:
767 nlmsg_end(skb, nlh);
768 return 0;
769
770 nla_put_failure:
771 nlmsg_cancel(skb, nlh);
772 return -EMSGSIZE;
773 }
774
static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
776 {
777 return nla_total_size(0) + /* NHA_RES_GROUP */
778 nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */
779 nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */
780 nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */
781 nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
782 }
783
static size_t nh_nlmsg_size_grp(struct nexthop *nh)
785 {
786 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
787 size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
788 size_t tot = nla_total_size(sz) +
789 nla_total_size(2); /* NHA_GROUP_TYPE */
790
791 if (nhg->resilient)
792 tot += nh_nlmsg_size_grp_res(nhg);
793
794 return tot;
795 }
796
static size_t nh_nlmsg_size_single(struct nexthop *nh)
798 {
799 struct nh_info *nhi = rtnl_dereference(nh->nh_info);
800 size_t sz;
801
802 /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
803 * are mutually exclusive
804 */
805 sz = nla_total_size(4); /* NHA_OIF */
806
807 switch (nhi->family) {
808 case AF_INET:
809 if (nhi->fib_nh.fib_nh_gw_family)
810 sz += nla_total_size(4); /* NHA_GATEWAY */
811 break;
812
813 case AF_INET6:
814 /* NHA_GATEWAY */
815 if (nhi->fib6_nh.fib_nh_gw_family)
816 sz += nla_total_size(sizeof(const struct in6_addr));
817 break;
818 }
819
820 if (nhi->fib_nhc.nhc_lwtstate) {
821 sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
822 sz += nla_total_size(2); /* NHA_ENCAP_TYPE */
823 }
824
825 return sz;
826 }
827
static size_t nh_nlmsg_size(struct nexthop *nh)
829 {
830 size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
831
832 sz += nla_total_size(4); /* NHA_ID */
833
834 if (nh->is_group)
835 sz += nh_nlmsg_size_grp(nh);
836 else
837 sz += nh_nlmsg_size_single(nh);
838
839 return sz;
840 }
841
static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
843 {
844 unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
845 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
846 struct sk_buff *skb;
847 int err = -ENOBUFS;
848
849 skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
850 if (!skb)
851 goto errout;
852
853 err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
854 if (err < 0) {
855 /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
856 WARN_ON(err == -EMSGSIZE);
857 kfree_skb(skb);
858 goto errout;
859 }
860
861 rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
862 info->nlh, gfp_any());
863 return;
864 errout:
865 if (err < 0)
866 rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
867 }
868
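/* A bucket's last-used time is kept as jiffies in an atomic_long so the
 * RCU fast path can update it without taking a lock.
 */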
static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
870 {
871 return (unsigned long)atomic_long_read(&bucket->used_time);
872 }
873
874 static unsigned long
nh_res_bucket_idle_point(const struct nh_res_table *res_table,
876 const struct nh_res_bucket *bucket,
877 unsigned long now)
878 {
879 unsigned long time = nh_res_bucket_used_time(bucket);
880
881 /* Bucket was not used since it was migrated. The idle time is now. */
882 if (time == bucket->migrated_time)
883 return now;
884
885 return time + res_table->idle_timer;
886 }
887
888 static unsigned long
nh_res_table_unb_point(const struct nh_res_table *res_table)
890 {
891 return res_table->unbalanced_since + res_table->unbalanced_timer;
892 }
893
static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
895 struct nh_res_bucket *bucket)
896 {
897 unsigned long now = jiffies;
898
899 atomic_long_set(&bucket->used_time, (long)now);
900 bucket->migrated_time = now;
901 }
902
static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
904 {
905 atomic_long_set(&bucket->used_time, (long)jiffies);
906 }
907
static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
909 {
910 unsigned long used_time = nh_res_bucket_used_time(bucket);
911
912 return jiffies_delta_to_clock_t(jiffies - used_time);
913 }
914
static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
916 struct nh_res_bucket *bucket, u16 bucket_index,
917 int event, u32 portid, u32 seq,
918 unsigned int nlflags,
919 struct netlink_ext_ack *extack)
920 {
921 struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
922 struct nlmsghdr *nlh;
923 struct nlattr *nest;
924 struct nhmsg *nhm;
925
926 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
927 if (!nlh)
928 return -EMSGSIZE;
929
930 nhm = nlmsg_data(nlh);
931 nhm->nh_family = AF_UNSPEC;
932 nhm->nh_flags = bucket->nh_flags;
933 nhm->nh_protocol = nh->protocol;
934 nhm->nh_scope = 0;
935 nhm->resvd = 0;
936
937 if (nla_put_u32(skb, NHA_ID, nh->id))
938 goto nla_put_failure;
939
940 nest = nla_nest_start(skb, NHA_RES_BUCKET);
941 if (!nest)
942 goto nla_put_failure;
943
944 if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
945 nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
946 nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
947 nh_res_bucket_idle_time(bucket),
948 NHA_RES_BUCKET_PAD))
949 goto nla_put_failure_nest;
950
951 nla_nest_end(skb, nest);
952 nlmsg_end(skb, nlh);
953 return 0;
954
955 nla_put_failure_nest:
956 nla_nest_cancel(skb, nest);
957 nla_put_failure:
958 nlmsg_cancel(skb, nlh);
959 return -EMSGSIZE;
960 }
961
static void nexthop_bucket_notify(struct nh_res_table *res_table,
963 u16 bucket_index)
964 {
965 struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
966 struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
967 struct nexthop *nh = nhge->nh_parent;
968 struct sk_buff *skb;
969 int err = -ENOBUFS;
970
971 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
972 if (!skb)
973 goto errout;
974
975 err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
976 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
977 NULL);
978 if (err < 0) {
979 kfree_skb(skb);
980 goto errout;
981 }
982
983 rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
984 return;
985 errout:
986 if (err < 0)
987 rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
988 }
989
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
991 bool *is_fdb, struct netlink_ext_ack *extack)
992 {
993 if (nh->is_group) {
994 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
995
996 /* Nesting groups within groups is not supported. */
997 if (nhg->hash_threshold) {
998 NL_SET_ERR_MSG(extack,
999 "Hash-threshold group can not be a nexthop within a group");
1000 return false;
1001 }
1002 if (nhg->resilient) {
1003 NL_SET_ERR_MSG(extack,
1004 "Resilient group can not be a nexthop within a group");
1005 return false;
1006 }
1007 *is_fdb = nhg->fdb_nh;
1008 } else {
1009 struct nh_info *nhi = rtnl_dereference(nh->nh_info);
1010
1011 if (nhi->reject_nh && npaths > 1) {
1012 NL_SET_ERR_MSG(extack,
1013 "Blackhole nexthop can not be used in a group with more than 1 path");
1014 return false;
1015 }
1016 *is_fdb = nhi->fdb_nh;
1017 }
1018
1019 return true;
1020 }
1021
static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
1023 struct netlink_ext_ack *extack)
1024 {
1025 struct nh_info *nhi;
1026
1027 nhi = rtnl_dereference(nh->nh_info);
1028
1029 if (!nhi->fdb_nh) {
1030 NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
1031 return -EINVAL;
1032 }
1033
1034 if (*nh_family == AF_UNSPEC) {
1035 *nh_family = nhi->family;
1036 } else if (*nh_family != nhi->family) {
1037 NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
1038 return -EINVAL;
1039 }
1040
1041 return 0;
1042 }
1043
static int nh_check_attr_group(struct net *net,
1045 struct nlattr *tb[], size_t tb_size,
1046 u16 nh_grp_type, struct netlink_ext_ack *extack)
1047 {
1048 unsigned int len = nla_len(tb[NHA_GROUP]);
1049 u8 nh_family = AF_UNSPEC;
1050 struct nexthop_grp *nhg;
1051 unsigned int i, j;
1052 u8 nhg_fdb = 0;
1053
1054 if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
1055 NL_SET_ERR_MSG(extack,
1056 "Invalid length for nexthop group attribute");
1057 return -EINVAL;
1058 }
1059
1060 /* convert len to number of nexthop ids */
1061 len /= sizeof(*nhg);
1062
1063 nhg = nla_data(tb[NHA_GROUP]);
1064 for (i = 0; i < len; ++i) {
1065 if (nhg[i].resvd1 || nhg[i].resvd2) {
1066 NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
1067 return -EINVAL;
1068 }
1069 if (nhg[i].weight > 254) {
1070 NL_SET_ERR_MSG(extack, "Invalid value for weight");
1071 return -EINVAL;
1072 }
1073 for (j = i + 1; j < len; ++j) {
1074 if (nhg[i].id == nhg[j].id) {
1075 NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
1076 return -EINVAL;
1077 }
1078 }
1079 }
1080
1081 if (tb[NHA_FDB])
1082 nhg_fdb = 1;
1083 nhg = nla_data(tb[NHA_GROUP]);
1084 for (i = 0; i < len; ++i) {
1085 struct nexthop *nh;
1086 bool is_fdb_nh;
1087
1088 nh = nexthop_find_by_id(net, nhg[i].id);
1089 if (!nh) {
1090 NL_SET_ERR_MSG(extack, "Invalid nexthop id");
1091 return -EINVAL;
1092 }
1093 if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
1094 return -EINVAL;
1095
1096 if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
1097 return -EINVAL;
1098
1099 if (!nhg_fdb && is_fdb_nh) {
1100 NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
1101 return -EINVAL;
1102 }
1103 }
1104 for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
1105 if (!tb[i])
1106 continue;
1107 switch (i) {
1108 case NHA_FDB:
1109 continue;
1110 case NHA_RES_GROUP:
1111 if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
1112 continue;
1113 break;
1114 }
1115 NL_SET_ERR_MSG(extack,
1116 "No other attributes can be set in nexthop groups");
1117 return -EINVAL;
1118 }
1119
1120 return 0;
1121 }
1122
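/* A gateway nexthop is considered "good" when its neighbour entry is in a
 * NUD_VALID state; if no neighbour entry exists yet, it is optimistically
 * treated as good.
 */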
static bool ipv6_good_nh(const struct fib6_nh *nh)
1124 {
1125 int state = NUD_REACHABLE;
1126 struct neighbour *n;
1127
1128 rcu_read_lock();
1129
1130 n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
1131 if (n)
1132 state = READ_ONCE(n->nud_state);
1133
1134 rcu_read_unlock();
1135
1136 return !!(state & NUD_VALID);
1137 }
1138
static bool ipv4_good_nh(const struct fib_nh *nh)
1140 {
1141 int state = NUD_REACHABLE;
1142 struct neighbour *n;
1143
1144 rcu_read_lock();
1145
1146 n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
1147 (__force u32)nh->fib_nh_gw4);
1148 if (n)
1149 state = READ_ONCE(n->nud_state);
1150
1151 rcu_read_unlock();
1152
1153 return !!(state & NUD_VALID);
1154 }
1155
static bool nexthop_is_good_nh(const struct nexthop *nh)
1157 {
1158 struct nh_info *nhi = rcu_dereference(nh->nh_info);
1159
1160 switch (nhi->family) {
1161 case AF_INET:
1162 return ipv4_good_nh(&nhi->fib_nh);
1163 case AF_INET6:
1164 return ipv6_good_nh(&nhi->fib6_nh);
1165 }
1166
1167 return false;
1168 }
1169
static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
1171 {
1172 int i;
1173
1174 for (i = 0; i < nhg->num_nh; i++) {
1175 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1176
1177 if (hash > atomic_read(&nhge->hthr.upper_bound))
1178 continue;
1179
1180 return nhge->nh;
1181 }
1182
1183 WARN_ON_ONCE(1);
1184 return NULL;
1185 }
1186
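/* Hash-threshold selection: skip entries whose nexthop is not currently
 * good, and return the first remaining entry whose upper bound covers the
 * flow hash; otherwise fall back to the first good entry, or to entry 0.
 */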
static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
1188 {
1189 struct nexthop *rc = NULL;
1190 int i;
1191
1192 if (nhg->fdb_nh)
1193 return nexthop_select_path_fdb(nhg, hash);
1194
1195 for (i = 0; i < nhg->num_nh; ++i) {
1196 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1197
		/* Nexthops in a group are always checked for being "good";
		 * this behavior does not depend on a sysctl.
		 */
1201 if (!nexthop_is_good_nh(nhge->nh))
1202 continue;
1203
1204 if (!rc)
1205 rc = nhge->nh;
1206
1207 if (hash > atomic_read(&nhge->hthr.upper_bound))
1208 continue;
1209
1210 return nhge->nh;
1211 }
1212
1213 return rc ? : nhg->nh_entries[0].nh;
1214 }
1215
static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
1217 {
1218 struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
1219 u16 bucket_index = hash % res_table->num_nh_buckets;
1220 struct nh_res_bucket *bucket;
1221 struct nh_grp_entry *nhge;
1222
1223 /* nexthop_select_path() is expected to return a non-NULL value, so
1224 * skip protocol validation and just hand out whatever there is.
1225 */
1226 bucket = &res_table->nh_buckets[bucket_index];
1227 nh_res_bucket_set_busy(bucket);
1228 nhge = rcu_dereference(bucket->nh_entry);
1229 return nhge->nh;
1230 }
1231
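/* RCU fast path used on forwarding: map a flow hash to one nexthop of the
 * group, or return the nexthop itself when it is not a group.
 */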
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
1233 {
1234 struct nh_group *nhg;
1235
1236 if (!nh->is_group)
1237 return nh;
1238
1239 nhg = rcu_dereference(nh->nh_grp);
1240 if (nhg->hash_threshold)
1241 return nexthop_select_path_hthr(nhg, hash);
1242 else if (nhg->resilient)
1243 return nexthop_select_path_res(nhg, hash);
1244
1245 /* Unreachable. */
1246 return NULL;
1247 }
1248 EXPORT_SYMBOL_GPL(nexthop_select_path);
1249
int nexthop_for_each_fib6_nh(struct nexthop *nh,
1251 int (*cb)(struct fib6_nh *nh, void *arg),
1252 void *arg)
1253 {
1254 struct nh_info *nhi;
1255 int err;
1256
1257 if (nh->is_group) {
1258 struct nh_group *nhg;
1259 int i;
1260
1261 nhg = rcu_dereference_rtnl(nh->nh_grp);
1262 for (i = 0; i < nhg->num_nh; i++) {
1263 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1264
1265 nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
1266 err = cb(&nhi->fib6_nh, arg);
1267 if (err)
1268 return err;
1269 }
1270 } else {
1271 nhi = rcu_dereference_rtnl(nh->nh_info);
1272 err = cb(&nhi->fib6_nh, arg);
1273 if (err)
1274 return err;
1275 }
1276
1277 return 0;
1278 }
1279 EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
1280
static int check_src_addr(const struct in6_addr *saddr,
1282 struct netlink_ext_ack *extack)
1283 {
1284 if (!ipv6_addr_any(saddr)) {
1285 NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
1286 return -EINVAL;
1287 }
1288 return 0;
1289 }
1290
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
1292 struct netlink_ext_ack *extack)
1293 {
1294 struct nh_info *nhi;
1295 bool is_fdb_nh;
1296
1297 /* fib6_src is unique to a fib6_info and limits the ability to cache
1298 * routes in fib6_nh within a nexthop that is potentially shared
1299 * across multiple fib entries. If the config wants to use source
1300 * routing it can not use nexthop objects. mlxsw also does not allow
1301 * fib6_src on routes.
1302 */
1303 if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
1304 return -EINVAL;
1305
1306 if (nh->is_group) {
1307 struct nh_group *nhg;
1308
1309 nhg = rtnl_dereference(nh->nh_grp);
1310 if (nhg->has_v4)
1311 goto no_v4_nh;
1312 is_fdb_nh = nhg->fdb_nh;
1313 } else {
1314 nhi = rtnl_dereference(nh->nh_info);
1315 if (nhi->family == AF_INET)
1316 goto no_v4_nh;
1317 is_fdb_nh = nhi->fdb_nh;
1318 }
1319
1320 if (is_fdb_nh) {
1321 NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
1322 return -EINVAL;
1323 }
1324
1325 return 0;
1326 no_v4_nh:
1327 NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
1328 return -EINVAL;
1329 }
1330 EXPORT_SYMBOL_GPL(fib6_check_nexthop);
1331
1332 /* if existing nexthop has ipv6 routes linked to it, need
1333 * to verify this new spec works with ipv6
1334 */
static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
1336 struct netlink_ext_ack *extack)
1337 {
1338 struct fib6_info *f6i;
1339
1340 if (list_empty(&old->f6i_list))
1341 return 0;
1342
1343 list_for_each_entry(f6i, &old->f6i_list, nh_list) {
1344 if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
1345 return -EINVAL;
1346 }
1347
1348 return fib6_check_nexthop(new, NULL, extack);
1349 }
1350
static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
1352 struct netlink_ext_ack *extack)
1353 {
1354 if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
1355 NL_SET_ERR_MSG(extack,
1356 "Route with host scope can not have a gateway");
1357 return -EINVAL;
1358 }
1359
1360 if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
1361 NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
1362 return -EINVAL;
1363 }
1364
1365 return 0;
1366 }
1367
1368 /* Invoked by fib add code to verify nexthop by id is ok with
1369 * config for prefix; parts of fib_check_nh not done when nexthop
1370 * object is used.
1371 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
1373 struct netlink_ext_ack *extack)
1374 {
1375 struct nh_info *nhi;
1376 int err = 0;
1377
1378 if (nh->is_group) {
1379 struct nh_group *nhg;
1380
1381 nhg = rtnl_dereference(nh->nh_grp);
1382 if (nhg->fdb_nh) {
1383 NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
1384 err = -EINVAL;
1385 goto out;
1386 }
1387
1388 if (scope == RT_SCOPE_HOST) {
1389 NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
1390 err = -EINVAL;
1391 goto out;
1392 }
1393
1394 /* all nexthops in a group have the same scope */
1395 nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
1396 err = nexthop_check_scope(nhi, scope, extack);
1397 } else {
1398 nhi = rtnl_dereference(nh->nh_info);
1399 if (nhi->fdb_nh) {
1400 NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
1401 err = -EINVAL;
1402 goto out;
1403 }
1404 err = nexthop_check_scope(nhi, scope, extack);
1405 }
1406
1407 out:
1408 return err;
1409 }
1410
static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
1412 struct netlink_ext_ack *extack)
1413 {
1414 struct fib_info *fi;
1415
1416 list_for_each_entry(fi, &old->fi_list, nh_list) {
1417 int err;
1418
1419 err = fib_check_nexthop(new, fi->fib_scope, extack);
1420 if (err)
1421 return err;
1422 }
1423 return 0;
1424 }
1425
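/* A resilient group entry is "overweight" (ow) when it currently owns more
 * buckets than its weight asks for, and "underweight" (uw) when it owns
 * fewer.
 */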
static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
1427 {
1428 return nhge->res.count_buckets == nhge->res.wants_buckets;
1429 }
1430
static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
1432 {
1433 return nhge->res.count_buckets > nhge->res.wants_buckets;
1434 }
1435
static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
1437 {
1438 return nhge->res.count_buckets < nhge->res.wants_buckets;
1439 }
1440
static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
1442 {
1443 return list_empty(&res_table->uw_nh_entries);
1444 }
1445
static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
1447 {
1448 struct nh_grp_entry *nhge;
1449
1450 if (bucket->occupied) {
1451 nhge = nh_res_dereference(bucket->nh_entry);
1452 nhge->res.count_buckets--;
1453 bucket->occupied = false;
1454 }
1455 }
1456
static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
1458 struct nh_grp_entry *nhge)
1459 {
1460 nh_res_bucket_unset_nh(bucket);
1461
1462 bucket->occupied = true;
1463 rcu_assign_pointer(bucket->nh_entry, nhge);
1464 nhge->res.count_buckets++;
1465 }
1466
static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
1468 struct nh_res_bucket *bucket,
1469 unsigned long *deadline, bool *force)
1470 {
1471 unsigned long now = jiffies;
1472 struct nh_grp_entry *nhge;
1473 unsigned long idle_point;
1474
1475 if (!bucket->occupied) {
1476 /* The bucket is not occupied, its NHGE pointer is either
1477 * NULL or obsolete. We _have to_ migrate: set force.
1478 */
1479 *force = true;
1480 return true;
1481 }
1482
1483 nhge = nh_res_dereference(bucket->nh_entry);
1484
1485 /* If the bucket is populated by an underweight or balanced
1486 * nexthop, do not migrate.
1487 */
1488 if (!nh_res_nhge_is_ow(nhge))
1489 return false;
1490
	/* At this point we know that the bucket is populated with an
	 * overweight nexthop. It needs to be migrated to a new nexthop if
	 * the idle timer or the unbalanced timer has expired.
	 */
1495
1496 idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
1497 if (time_after_eq(now, idle_point)) {
1498 /* The bucket is idle. We _can_ migrate: unset force. */
1499 *force = false;
1500 return true;
1501 }
1502
1503 /* Unbalanced timer of 0 means "never force". */
1504 if (res_table->unbalanced_timer) {
1505 unsigned long unb_point;
1506
1507 unb_point = nh_res_table_unb_point(res_table);
1508 if (time_after(now, unb_point)) {
1509 /* The bucket is not idle, but the unbalanced timer
1510 * expired. We _can_ migrate, but set force anyway,
1511 * so that drivers know to ignore activity reports
1512 * from the HW.
1513 */
1514 *force = true;
1515 return true;
1516 }
1517
1518 nh_res_time_set_deadline(unb_point, deadline);
1519 }
1520
1521 nh_res_time_set_deadline(idle_point, deadline);
1522 return false;
1523 }
1524
static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
1526 u16 bucket_index, bool notify,
1527 bool notify_nl, bool force)
1528 {
1529 struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
1530 struct nh_grp_entry *new_nhge;
1531 struct netlink_ext_ack extack;
1532 int err;
1533
1534 new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
1535 struct nh_grp_entry,
1536 res.uw_nh_entry);
1537 if (WARN_ON_ONCE(!new_nhge))
1538 /* If this function is called, "bucket" is either not
1539 * occupied, or it belongs to a next hop that is
1540 * overweight. In either case, there ought to be a
1541 * corresponding underweight next hop.
1542 */
1543 return false;
1544
1545 if (notify) {
1546 struct nh_grp_entry *old_nhge;
1547
1548 old_nhge = nh_res_dereference(bucket->nh_entry);
1549 err = call_nexthop_res_bucket_notifiers(res_table->net,
1550 res_table->nhg_id,
1551 bucket_index, force,
1552 old_nhge->nh,
1553 new_nhge->nh, &extack);
1554 if (err) {
1555 pr_err_ratelimited("%s\n", extack._msg);
1556 if (!force)
1557 return false;
1558 /* It is not possible to veto a forced replacement, so
1559 * just clear the hardware flags from the nexthop
1560 * bucket to indicate to user space that this bucket is
1561 * not correctly populated in hardware.
1562 */
1563 bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
1564 }
1565 }
1566
1567 nh_res_bucket_set_nh(bucket, new_nhge);
1568 nh_res_bucket_set_idle(res_table, bucket);
1569
1570 if (notify_nl)
1571 nexthop_bucket_notify(res_table, bucket_index);
1572
1573 if (nh_res_nhge_is_balanced(new_nhge))
1574 list_del(&new_nhge->res.uw_nh_entry);
1575 return true;
1576 }
1577
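/* Lower bound on how soon the upkeep work is rescheduled while the table
 * is still unbalanced.
 */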
1578 #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
1579
static void nh_res_table_upkeep(struct nh_res_table *res_table,
1581 bool notify, bool notify_nl)
1582 {
1583 unsigned long now = jiffies;
1584 unsigned long deadline;
1585 u16 i;
1586
1587 /* Deadline is the next time that upkeep should be run. It is the
1588 * earliest time at which one of the buckets might be migrated.
1589 * Start at the most pessimistic estimate: either unbalanced_timer
1590 * from now, or if there is none, idle_timer from now. For each
1591 * encountered time point, call nh_res_time_set_deadline() to
1592 * refine the estimate.
1593 */
1594 if (res_table->unbalanced_timer)
1595 deadline = now + res_table->unbalanced_timer;
1596 else
1597 deadline = now + res_table->idle_timer;
1598
1599 for (i = 0; i < res_table->num_nh_buckets; i++) {
1600 struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
1601 bool force;
1602
1603 if (nh_res_bucket_should_migrate(res_table, bucket,
1604 &deadline, &force)) {
1605 if (!nh_res_bucket_migrate(res_table, i, notify,
1606 notify_nl, force)) {
1607 unsigned long idle_point;
1608
1609 /* A driver can override the migration
1610 * decision if the HW reports that the
1611 * bucket is actually not idle. Therefore
1612 * remark the bucket as busy again and
1613 * update the deadline.
1614 */
1615 nh_res_bucket_set_busy(bucket);
1616 idle_point = nh_res_bucket_idle_point(res_table,
1617 bucket,
1618 now);
1619 nh_res_time_set_deadline(idle_point, &deadline);
1620 }
1621 }
1622 }
1623
1624 /* If the group is still unbalanced, schedule the next upkeep to
1625 * either the deadline computed above, or the minimum deadline,
1626 * whichever comes later.
1627 */
1628 if (!nh_res_table_is_balanced(res_table)) {
1629 unsigned long now = jiffies;
1630 unsigned long min_deadline;
1631
1632 min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
1633 if (time_before(deadline, min_deadline))
1634 deadline = min_deadline;
1635
1636 queue_delayed_work(system_power_efficient_wq,
1637 &res_table->upkeep_dw, deadline - now);
1638 }
1639 }
1640
static void nh_res_table_upkeep_dw(struct work_struct *work)
1642 {
1643 struct delayed_work *dw = to_delayed_work(work);
1644 struct nh_res_table *res_table;
1645
1646 res_table = container_of(dw, struct nh_res_table, upkeep_dw);
1647 nh_res_table_upkeep(res_table, true, true);
1648 }
1649
static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
1651 {
1652 cancel_delayed_work_sync(&res_table->upkeep_dw);
1653 }
1654
static void nh_res_group_rebalance(struct nh_group *nhg,
1656 struct nh_res_table *res_table)
1657 {
1658 int prev_upper_bound = 0;
1659 int total = 0;
1660 int w = 0;
1661 int i;
1662
1663 INIT_LIST_HEAD(&res_table->uw_nh_entries);
1664
1665 for (i = 0; i < nhg->num_nh; ++i)
1666 total += nhg->nh_entries[i].weight;
1667
1668 for (i = 0; i < nhg->num_nh; ++i) {
1669 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1670 int upper_bound;
1671
1672 w += nhge->weight;
1673 upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
1674 total);
1675 nhge->res.wants_buckets = upper_bound - prev_upper_bound;
1676 prev_upper_bound = upper_bound;
1677
1678 if (nh_res_nhge_is_uw(nhge)) {
1679 if (list_empty(&res_table->uw_nh_entries))
1680 res_table->unbalanced_since = jiffies;
1681 list_add(&nhge->res.uw_nh_entry,
1682 &res_table->uw_nh_entries);
1683 }
1684 }
1685 }
1686
1687 /* Migrate buckets in res_table so that they reference NHGE's from NHG with
1688 * the right NH ID. Set those buckets that do not have a corresponding NHGE
1689 * entry in NHG as not occupied.
1690 */
static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
1692 struct nh_group *nhg)
1693 {
1694 u16 i;
1695
1696 for (i = 0; i < res_table->num_nh_buckets; i++) {
1697 struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
1698 u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
1699 bool found = false;
1700 int j;
1701
1702 for (j = 0; j < nhg->num_nh; j++) {
1703 struct nh_grp_entry *nhge = &nhg->nh_entries[j];
1704
1705 if (nhge->nh->id == id) {
1706 nh_res_bucket_set_nh(bucket, nhge);
1707 found = true;
1708 break;
1709 }
1710 }
1711
1712 if (!found)
1713 nh_res_bucket_unset_nh(bucket);
1714 }
1715 }
1716
static void replace_nexthop_grp_res(struct nh_group *oldg,
1718 struct nh_group *newg)
1719 {
1720 /* For NH group replacement, the new NHG might only have a stub
1721 * hash table with 0 buckets, because the number of buckets was not
1722 * specified. For NH removal, oldg and newg both reference the same
1723 * res_table. So in any case, in the following, we want to work
1724 * with oldg->res_table.
1725 */
1726 struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
1727 unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
1728 bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
1729
1730 nh_res_table_cancel_upkeep(old_res_table);
1731 nh_res_table_migrate_buckets(old_res_table, newg);
1732 nh_res_group_rebalance(newg, old_res_table);
1733 if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
1734 old_res_table->unbalanced_since = prev_unbalanced_since;
1735 nh_res_table_upkeep(old_res_table, true, false);
1736 }
1737
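/* Recompute hash-threshold upper bounds: each entry receives a slice of
 * the 31-bit hash space proportional to its weight.
 */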
static void nh_hthr_group_rebalance(struct nh_group *nhg)
1739 {
1740 int total = 0;
1741 int w = 0;
1742 int i;
1743
1744 for (i = 0; i < nhg->num_nh; ++i)
1745 total += nhg->nh_entries[i].weight;
1746
1747 for (i = 0; i < nhg->num_nh; ++i) {
1748 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1749 int upper_bound;
1750
1751 w += nhge->weight;
1752 upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
1753 atomic_set(&nhge->hthr.upper_bound, upper_bound);
1754 }
1755 }
1756
static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
1758 struct nl_info *nlinfo)
1759 {
1760 struct nh_grp_entry *nhges, *new_nhges;
1761 struct nexthop *nhp = nhge->nh_parent;
1762 struct netlink_ext_ack extack;
1763 struct nexthop *nh = nhge->nh;
1764 struct nh_group *nhg, *newg;
1765 int i, j, err;
1766
1767 WARN_ON(!nh);
1768
1769 nhg = rtnl_dereference(nhp->nh_grp);
1770 newg = nhg->spare;
1771
1772 /* last entry, keep it visible and remove the parent */
1773 if (nhg->num_nh == 1) {
1774 remove_nexthop(net, nhp, nlinfo);
1775 return;
1776 }
1777
1778 newg->has_v4 = false;
1779 newg->is_multipath = nhg->is_multipath;
1780 newg->hash_threshold = nhg->hash_threshold;
1781 newg->resilient = nhg->resilient;
1782 newg->fdb_nh = nhg->fdb_nh;
1783 newg->num_nh = nhg->num_nh;
1784
1785 /* copy old entries to new except the one getting removed */
1786 nhges = nhg->nh_entries;
1787 new_nhges = newg->nh_entries;
1788 for (i = 0, j = 0; i < nhg->num_nh; ++i) {
1789 struct nh_info *nhi;
1790
1791 /* current nexthop getting removed */
1792 if (nhg->nh_entries[i].nh == nh) {
1793 newg->num_nh--;
1794 continue;
1795 }
1796
1797 nhi = rtnl_dereference(nhges[i].nh->nh_info);
1798 if (nhi->family == AF_INET)
1799 newg->has_v4 = true;
1800
1801 list_del(&nhges[i].nh_list);
1802 new_nhges[j].nh_parent = nhges[i].nh_parent;
1803 new_nhges[j].nh = nhges[i].nh;
1804 new_nhges[j].weight = nhges[i].weight;
1805 list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
1806 j++;
1807 }
1808
1809 if (newg->hash_threshold)
1810 nh_hthr_group_rebalance(newg);
1811 else if (newg->resilient)
1812 replace_nexthop_grp_res(nhg, newg);
1813
1814 rcu_assign_pointer(nhp->nh_grp, newg);
1815
1816 list_del(&nhge->nh_list);
1817 nexthop_put(nhge->nh);
1818
1819 /* Removal of a NH from a resilient group is notified through
1820 * bucket notifications.
1821 */
1822 if (newg->hash_threshold) {
1823 err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
1824 &extack);
1825 if (err)
1826 pr_err("%s\n", extack._msg);
1827 }
1828
1829 if (nlinfo)
1830 nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
1831 }
1832
1833 static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
1834 struct nl_info *nlinfo)
1835 {
1836 struct nh_grp_entry *nhge, *tmp;
1837
1838 list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
1839 remove_nh_grp_entry(net, nhge, nlinfo);
1840
1841 /* make sure all see the newly published array before releasing rtnl */
1842 synchronize_net();
1843 }
1844
1845 static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
1846 {
1847 struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
1848 struct nh_res_table *res_table;
1849 int i, num_nh = nhg->num_nh;
1850
1851 for (i = 0; i < num_nh; ++i) {
1852 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1853
1854 if (WARN_ON(!nhge->nh))
1855 continue;
1856
1857 list_del_init(&nhge->nh_list);
1858 }
1859
1860 if (nhg->resilient) {
1861 res_table = rtnl_dereference(nhg->res_table);
1862 nh_res_table_cancel_upkeep(res_table);
1863 }
1864 }
1865
1866 /* not called for nexthop replace */
1867 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
1868 {
1869 struct fib6_info *f6i, *tmp;
1870 bool do_flush = false;
1871 struct fib_info *fi;
1872
1873 list_for_each_entry(fi, &nh->fi_list, nh_list) {
1874 fi->fib_flags |= RTNH_F_DEAD;
1875 do_flush = true;
1876 }
1877 if (do_flush)
1878 fib_flush(net);
1879
1880 /* ip6_del_rt removes the entry from this list hence the _safe */
1881 list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
1882 /* __ip6_del_rt does a release, so do a hold here */
1883 fib6_info_hold(f6i);
1884 ipv6_stub->ip6_del_rt(net, f6i,
1885 !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
1886 }
1887 }
1888
1889 static void __remove_nexthop(struct net *net, struct nexthop *nh,
1890 struct nl_info *nlinfo)
1891 {
1892 __remove_nexthop_fib(net, nh);
1893
1894 if (nh->is_group) {
1895 remove_nexthop_group(nh, nlinfo);
1896 } else {
1897 struct nh_info *nhi;
1898
1899 nhi = rtnl_dereference(nh->nh_info);
1900 if (nhi->fib_nhc.nhc_dev)
1901 hlist_del(&nhi->dev_hash);
1902
1903 remove_nexthop_from_groups(net, nh, nlinfo);
1904 }
1905 }
1906
1907 static void remove_nexthop(struct net *net, struct nexthop *nh,
1908 struct nl_info *nlinfo)
1909 {
1910 call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
1911
1912 /* remove from the tree */
1913 rb_erase(&nh->rb_node, &net->nexthop.rb_root);
1914
1915 if (nlinfo)
1916 nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
1917
1918 __remove_nexthop(net, nh, nlinfo);
1919 nh_base_seq_inc(net);
1920
1921 nexthop_put(nh);
1922 }
1923
1924 /* if any FIB entries reference this nexthop, any dst entries
1925 * need to be regenerated
1926 */
1927 static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
1928 struct nexthop *replaced_nh)
1929 {
1930 struct fib6_info *f6i;
1931 struct nh_group *nhg;
1932 int i;
1933
1934 if (!list_empty(&nh->fi_list))
1935 rt_cache_flush(net);
1936
1937 list_for_each_entry(f6i, &nh->f6i_list, nh_list)
1938 ipv6_stub->fib6_update_sernum(net, f6i);
1939
1940 /* if an IPv6 group was replaced, we have to release all old
1941 * dsts to make sure all refcounts are released
1942 */
1943 if (!replaced_nh->is_group)
1944 return;
1945
1946 nhg = rtnl_dereference(replaced_nh->nh_grp);
1947 for (i = 0; i < nhg->num_nh; i++) {
1948 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1949 struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info);
1950
1951 if (nhi->family == AF_INET6)
1952 ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh);
1953 }
1954 }
1955
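/* Group replace: the entry array of 'new' is published under the id of
 * 'old', while 'old's array is handed to 'new' so that the caller's final
 * __remove_nexthop()/nexthop_put() on 'new' releases the old entries.
 * For resilient groups the existing bucket table is kept and its buckets
 * are migrated to the new entries.
 */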
1956 static int replace_nexthop_grp(struct net *net, struct nexthop *old,
1957 struct nexthop *new, const struct nh_config *cfg,
1958 struct netlink_ext_ack *extack)
1959 {
1960 struct nh_res_table *tmp_table = NULL;
1961 struct nh_res_table *new_res_table;
1962 struct nh_res_table *old_res_table;
1963 struct nh_group *oldg, *newg;
1964 int i, err;
1965
1966 if (!new->is_group) {
1967 NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
1968 return -EINVAL;
1969 }
1970
1971 oldg = rtnl_dereference(old->nh_grp);
1972 newg = rtnl_dereference(new->nh_grp);
1973
1974 if (newg->hash_threshold != oldg->hash_threshold) {
1975 NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
1976 return -EINVAL;
1977 }
1978
1979 if (newg->hash_threshold) {
1980 err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
1981 extack);
1982 if (err)
1983 return err;
1984 } else if (newg->resilient) {
1985 new_res_table = rtnl_dereference(newg->res_table);
1986 old_res_table = rtnl_dereference(oldg->res_table);
1987
1988 /* Accept if num_nh_buckets was not given, but if it was
1989 * given, demand that the value be correct.
1990 */
1991 if (cfg->nh_grp_res_has_num_buckets &&
1992 cfg->nh_grp_res_num_buckets !=
1993 old_res_table->num_nh_buckets) {
1994 NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
1995 return -EINVAL;
1996 }
1997
1998 /* Emit a pre-replace notification so that listeners could veto
1999 * a potentially unsupported configuration. Otherwise,
2000 * individual bucket replacement notifications would need to be
2001 * vetoed, which is something that should only happen if the
2002 * bucket is currently active.
2003 */
2004 err = call_nexthop_res_table_notifiers(net, new, extack);
2005 if (err)
2006 return err;
2007
2008 if (cfg->nh_grp_res_has_idle_timer)
2009 old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
2010 if (cfg->nh_grp_res_has_unbalanced_timer)
2011 old_res_table->unbalanced_timer =
2012 cfg->nh_grp_res_unbalanced_timer;
2013
2014 replace_nexthop_grp_res(oldg, newg);
2015
2016 tmp_table = new_res_table;
2017 rcu_assign_pointer(newg->res_table, old_res_table);
2018 rcu_assign_pointer(newg->spare->res_table, old_res_table);
2019 }
2020
2021 /* update parents - used by nexthop code for cleanup */
2022 for (i = 0; i < newg->num_nh; i++)
2023 newg->nh_entries[i].nh_parent = old;
2024
2025 rcu_assign_pointer(old->nh_grp, newg);
2026
2027 /* Make sure concurrent readers are not using 'oldg' anymore. */
2028 synchronize_net();
2029
2030 if (newg->resilient) {
2031 rcu_assign_pointer(oldg->res_table, tmp_table);
2032 rcu_assign_pointer(oldg->spare->res_table, tmp_table);
2033 }
2034
2035 for (i = 0; i < oldg->num_nh; i++)
2036 oldg->nh_entries[i].nh_parent = new;
2037
2038 rcu_assign_pointer(new->nh_grp, oldg);
2039
2040 return 0;
2041 }
2042
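/* Recompute the has_v4 flag of a group after one of its members changed
 * address family (see replace_nexthop_single()).
 */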
2043 static void nh_group_v4_update(struct nh_group *nhg)
2044 {
2045 struct nh_grp_entry *nhges;
2046 bool has_v4 = false;
2047 int i;
2048
2049 nhges = nhg->nh_entries;
2050 for (i = 0; i < nhg->num_nh; i++) {
2051 struct nh_info *nhi;
2052
2053 nhi = rtnl_dereference(nhges[i].nh->nh_info);
2054 if (nhi->family == AF_INET)
2055 has_v4 = true;
2056 }
2057 nhg->has_v4 = has_v4;
2058 }
2059
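/* Notify listeners about every bucket of a resilient group that currently
 * points at the replaced nexthop.  On failure, already notified buckets
 * are rolled back by re-sending them with the old nexthop info.
 */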
2060 static int replace_nexthop_single_notify_res(struct net *net,
2061 struct nh_res_table *res_table,
2062 struct nexthop *old,
2063 struct nh_info *oldi,
2064 struct nh_info *newi,
2065 struct netlink_ext_ack *extack)
2066 {
2067 u32 nhg_id = res_table->nhg_id;
2068 int err;
2069 u16 i;
2070
2071 for (i = 0; i < res_table->num_nh_buckets; i++) {
2072 struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
2073 struct nh_grp_entry *nhge;
2074
2075 nhge = rtnl_dereference(bucket->nh_entry);
2076 if (nhge->nh == old) {
2077 err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
2078 i, true,
2079 oldi, newi,
2080 extack);
2081 if (err)
2082 goto err_notify;
2083 }
2084 }
2085
2086 return 0;
2087
2088 err_notify:
2089 while (i-- > 0) {
2090 struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
2091 struct nh_grp_entry *nhge;
2092
2093 nhge = rtnl_dereference(bucket->nh_entry);
2094 if (nhge->nh == old)
2095 __call_nexthop_res_bucket_notifiers(net, nhg_id, i,
2096 true, newi, oldi,
2097 extack);
2098 }
2099 return err;
2100 }
2101
2102 static int replace_nexthop_single_notify(struct net *net,
2103 struct nexthop *group_nh,
2104 struct nexthop *old,
2105 struct nh_info *oldi,
2106 struct nh_info *newi,
2107 struct netlink_ext_ack *extack)
2108 {
2109 struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
2110 struct nh_res_table *res_table;
2111
2112 if (nhg->hash_threshold) {
2113 return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
2114 group_nh, extack);
2115 } else if (nhg->resilient) {
2116 res_table = rtnl_dereference(nhg->res_table);
2117 return replace_nexthop_single_notify_res(net, res_table,
2118 old, oldi, newi,
2119 extack);
2120 }
2121
2122 return -EINVAL;
2123 }
2124
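/* Single nexthop replace: 'old' keeps its place in the rbtree and its id;
 * only the nh_info payloads (and protocol/flags) are swapped between
 * 'old' and 'new'.  'new' is subsequently released by the caller.
 */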
2125 static int replace_nexthop_single(struct net *net, struct nexthop *old,
2126 struct nexthop *new,
2127 struct netlink_ext_ack *extack)
2128 {
2129 u8 old_protocol, old_nh_flags;
2130 struct nh_info *oldi, *newi;
2131 struct nh_grp_entry *nhge;
2132 int err;
2133
2134 if (new->is_group) {
2135 NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
2136 return -EINVAL;
2137 }
2138
2139 err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
2140 if (err)
2141 return err;
2142
2143 /* Hardware flags were set on 'old' as 'new' is not in the red-black
2144 * tree. Therefore, inherit the flags from 'old' to 'new'.
2145 */
2146 new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
2147
2148 oldi = rtnl_dereference(old->nh_info);
2149 newi = rtnl_dereference(new->nh_info);
2150
2151 newi->nh_parent = old;
2152 oldi->nh_parent = new;
2153
2154 old_protocol = old->protocol;
2155 old_nh_flags = old->nh_flags;
2156
2157 old->protocol = new->protocol;
2158 old->nh_flags = new->nh_flags;
2159
2160 rcu_assign_pointer(old->nh_info, newi);
2161 rcu_assign_pointer(new->nh_info, oldi);
2162
2163 /* Send a replace notification for all the groups using the nexthop. */
2164 list_for_each_entry(nhge, &old->grp_list, nh_list) {
2165 struct nexthop *nhp = nhge->nh_parent;
2166
2167 err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
2168 extack);
2169 if (err)
2170 goto err_notify;
2171 }
2172
2173 /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
2174 * update IPv4 indication in all the groups using the nexthop.
2175 */
2176 if (oldi->family == AF_INET && newi->family == AF_INET6) {
2177 list_for_each_entry(nhge, &old->grp_list, nh_list) {
2178 struct nexthop *nhp = nhge->nh_parent;
2179 struct nh_group *nhg;
2180
2181 nhg = rtnl_dereference(nhp->nh_grp);
2182 nh_group_v4_update(nhg);
2183 }
2184 }
2185
2186 return 0;
2187
2188 err_notify:
2189 rcu_assign_pointer(new->nh_info, newi);
2190 rcu_assign_pointer(old->nh_info, oldi);
2191 old->nh_flags = old_nh_flags;
2192 old->protocol = old_protocol;
2193 oldi->nh_parent = old;
2194 newi->nh_parent = new;
2195 list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
2196 struct nexthop *nhp = nhge->nh_parent;
2197
2198 replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
2199 }
2200 call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
2201 return err;
2202 }
2203
2204 static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
2205 struct nl_info *info)
2206 {
2207 struct fib6_info *f6i;
2208
2209 if (!list_empty(&nh->fi_list)) {
2210 struct fib_info *fi;
2211
2212 /* expectation is a few fib_info per nexthop and then
2213 * a lot of routes per fib_info. So mark the fib_info
2214 * and then walk the fib tables once
2215 */
2216 list_for_each_entry(fi, &nh->fi_list, nh_list)
2217 fi->nh_updated = true;
2218
2219 fib_info_notify_update(net, info);
2220
2221 list_for_each_entry(fi, &nh->fi_list, nh_list)
2222 fi->nh_updated = false;
2223 }
2224
2225 list_for_each_entry(f6i, &nh->f6i_list, nh_list)
2226 ipv6_stub->fib6_rt_update(net, f6i, info);
2227 }
2228
2229 /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
2230 * linked to this nexthop and for all groups that the nexthop
2231 * is a member of
2232 */
2233 static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
2234 struct nl_info *info)
2235 {
2236 struct nh_grp_entry *nhge;
2237
2238 __nexthop_replace_notify(net, nh, info);
2239
2240 list_for_each_entry(nhge, &nh->grp_list, nh_list)
2241 __nexthop_replace_notify(net, nhge->nh_parent, info);
2242 }
2243
2244 static int replace_nexthop(struct net *net, struct nexthop *old,
2245 struct nexthop *new, const struct nh_config *cfg,
2246 struct netlink_ext_ack *extack)
2247 {
2248 bool new_is_reject = false;
2249 struct nh_grp_entry *nhge;
2250 int err;
2251
2252 /* check that existing FIB entries are ok with the
2253 * new nexthop definition
2254 */
2255 err = fib_check_nh_list(old, new, extack);
2256 if (err)
2257 return err;
2258
2259 err = fib6_check_nh_list(old, new, extack);
2260 if (err)
2261 return err;
2262
2263 if (!new->is_group) {
2264 struct nh_info *nhi = rtnl_dereference(new->nh_info);
2265
2266 new_is_reject = nhi->reject_nh;
2267 }
2268
2269 list_for_each_entry(nhge, &old->grp_list, nh_list) {
2270 /* if new nexthop is a blackhole, any groups using this
2271 * nexthop cannot have more than 1 path
2272 */
2273 if (new_is_reject &&
2274 nexthop_num_path(nhge->nh_parent) > 1) {
2275 NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
2276 return -EINVAL;
2277 }
2278
2279 err = fib_check_nh_list(nhge->nh_parent, new, extack);
2280 if (err)
2281 return err;
2282
2283 err = fib6_check_nh_list(nhge->nh_parent, new, extack);
2284 if (err)
2285 return err;
2286 }
2287
2288 if (old->is_group)
2289 err = replace_nexthop_grp(net, old, new, cfg, extack);
2290 else
2291 err = replace_nexthop_single(net, old, new, extack);
2292
2293 if (!err) {
2294 nh_rt_cache_flush(net, old, new);
2295
2296 __remove_nexthop(net, new, NULL);
2297 nexthop_put(new);
2298 }
2299
2300 return err;
2301 }
2302
2303 /* called with rtnl_lock held */
2304 static int insert_nexthop(struct net *net, struct nexthop *new_nh,
2305 struct nh_config *cfg, struct netlink_ext_ack *extack)
2306 {
2307 struct rb_node **pp, *parent = NULL, *next;
2308 struct rb_root *root = &net->nexthop.rb_root;
2309 bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
2310 bool create = !!(cfg->nlflags & NLM_F_CREATE);
2311 u32 new_id = new_nh->id;
2312 int replace_notify = 0;
2313 int rc = -EEXIST;
2314
2315 pp = &root->rb_node;
2316 while (1) {
2317 struct nexthop *nh;
2318
2319 next = *pp;
2320 if (!next)
2321 break;
2322
2323 parent = next;
2324
2325 nh = rb_entry(parent, struct nexthop, rb_node);
2326 if (new_id < nh->id) {
2327 pp = &next->rb_left;
2328 } else if (new_id > nh->id) {
2329 pp = &next->rb_right;
2330 } else if (replace) {
2331 rc = replace_nexthop(net, nh, new_nh, cfg, extack);
2332 if (!rc) {
2333 new_nh = nh; /* send notification with old nh */
2334 replace_notify = 1;
2335 }
2336 goto out;
2337 } else {
2338 /* id already exists and not a replace */
2339 goto out;
2340 }
2341 }
2342
2343 if (replace && !create) {
2344 NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
2345 rc = -ENOENT;
2346 goto out;
2347 }
2348
2349 if (new_nh->is_group) {
2350 struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
2351 struct nh_res_table *res_table;
2352
2353 if (nhg->resilient) {
2354 res_table = rtnl_dereference(nhg->res_table);
2355
2356 /* Not passing the number of buckets is OK when
2357 * replacing, but not when creating a new group.
2358 */
2359 if (!cfg->nh_grp_res_has_num_buckets) {
2360 NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
2361 rc = -EINVAL;
2362 goto out;
2363 }
2364
2365 nh_res_group_rebalance(nhg, res_table);
2366
2367 /* Do not send bucket notifications, we do full
2368 * notification below.
2369 */
2370 nh_res_table_upkeep(res_table, false, false);
2371 }
2372 }
2373
2374 rb_link_node_rcu(&new_nh->rb_node, parent, pp);
2375 rb_insert_color(&new_nh->rb_node, root);
2376
2377 /* The initial insertion is a full notification for hash-threshold as
2378 * well as resilient groups.
2379 */
2380 rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
2381 if (rc)
2382 rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
2383
2384 out:
2385 if (!rc) {
2386 nh_base_seq_inc(net);
2387 nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
2388 if (replace_notify &&
2389 READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
2390 nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
2391 }
2392
2393 return rc;
2394 }
2395
2396 /* rtnl */
2397 /* remove all nexthops tied to a device being deleted */
2398 static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
2399 {
2400 unsigned int hash = nh_dev_hashfn(dev->ifindex);
2401 struct net *net = dev_net(dev);
2402 struct hlist_head *head = &net->nexthop.devhash[hash];
2403 struct hlist_node *n;
2404 struct nh_info *nhi;
2405
2406 hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
2407 if (nhi->fib_nhc.nhc_dev != dev)
2408 continue;
2409
2410 if (nhi->reject_nh &&
2411 (event == NETDEV_DOWN || event == NETDEV_CHANGE))
2412 continue;
2413
2414 remove_nexthop(net, nhi->nh_parent, NULL);
2415 }
2416 }
2417
2418 /* rtnl; called when net namespace is deleted */
2419 static void flush_all_nexthops(struct net *net)
2420 {
2421 struct rb_root *root = &net->nexthop.rb_root;
2422 struct rb_node *node;
2423 struct nexthop *nh;
2424
2425 while ((node = rb_first(root))) {
2426 nh = rb_entry(node, struct nexthop, rb_node);
2427 remove_nexthop(net, nh, NULL);
2428 cond_resched();
2429 }
2430 }
2431
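/* Build a group nexthop from the NHA_GROUP attribute.  Note that the
 * nexthop_grp netlink encoding carries weight - 1 (0 means weight 1),
 * hence the "+ 1" when the kernel-internal weight is stored below.
 */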
2432 static struct nexthop *nexthop_create_group(struct net *net,
2433 struct nh_config *cfg)
2434 {
2435 struct nlattr *grps_attr = cfg->nh_grp;
2436 struct nexthop_grp *entry = nla_data(grps_attr);
2437 u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
2438 struct nh_group *nhg;
2439 struct nexthop *nh;
2440 int err;
2441 int i;
2442
2443 if (WARN_ON(!num_nh))
2444 return ERR_PTR(-EINVAL);
2445
2446 nh = nexthop_alloc();
2447 if (!nh)
2448 return ERR_PTR(-ENOMEM);
2449
2450 nh->is_group = 1;
2451
2452 nhg = nexthop_grp_alloc(num_nh);
2453 if (!nhg) {
2454 kfree(nh);
2455 return ERR_PTR(-ENOMEM);
2456 }
2457
2458 /* spare group used for removals */
2459 nhg->spare = nexthop_grp_alloc(num_nh);
2460 if (!nhg->spare) {
2461 kfree(nhg);
2462 kfree(nh);
2463 return ERR_PTR(-ENOMEM);
2464 }
2465 nhg->spare->spare = nhg;
2466
2467 for (i = 0; i < nhg->num_nh; ++i) {
2468 struct nexthop *nhe;
2469 struct nh_info *nhi;
2470
2471 nhe = nexthop_find_by_id(net, entry[i].id);
2472 if (!nexthop_get(nhe)) {
2473 err = -ENOENT;
2474 goto out_no_nh;
2475 }
2476
2477 nhi = rtnl_dereference(nhe->nh_info);
2478 if (nhi->family == AF_INET)
2479 nhg->has_v4 = true;
2480
2481 nhg->nh_entries[i].nh = nhe;
2482 nhg->nh_entries[i].weight = entry[i].weight + 1;
2483 list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
2484 nhg->nh_entries[i].nh_parent = nh;
2485 }
2486
2487 if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
2488 nhg->hash_threshold = 1;
2489 nhg->is_multipath = true;
2490 } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
2491 struct nh_res_table *res_table;
2492
2493 res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
2494 if (!res_table) {
2495 err = -ENOMEM;
2496 goto out_no_nh;
2497 }
2498
2499 rcu_assign_pointer(nhg->spare->res_table, res_table);
2500 rcu_assign_pointer(nhg->res_table, res_table);
2501 nhg->resilient = true;
2502 nhg->is_multipath = true;
2503 }
2504
2505 WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);
2506
2507 if (nhg->hash_threshold)
2508 nh_hthr_group_rebalance(nhg);
2509
2510 if (cfg->nh_fdb)
2511 nhg->fdb_nh = 1;
2512
2513 rcu_assign_pointer(nh->nh_grp, nhg);
2514
2515 return nh;
2516
2517 out_no_nh:
2518 for (i--; i >= 0; --i) {
2519 list_del(&nhg->nh_entries[i].nh_list);
2520 nexthop_put(nhg->nh_entries[i].nh);
2521 }
2522
2523 kfree(nhg->spare);
2524 kfree(nhg);
2525 kfree(nh);
2526
2527 return ERR_PTR(err);
2528 }
2529
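/* Initialize the embedded fib_nh from the netlink config.  fib_check_nh()
 * validates the gateway/device and sets nhc_dev; it is skipped for FDB
 * nexthops, which have no associated device.
 */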
2530 static int nh_create_ipv4(struct net *net, struct nexthop *nh,
2531 struct nh_info *nhi, struct nh_config *cfg,
2532 struct netlink_ext_ack *extack)
2533 {
2534 struct fib_nh *fib_nh = &nhi->fib_nh;
2535 struct fib_config fib_cfg = {
2536 .fc_oif = cfg->nh_ifindex,
2537 .fc_gw4 = cfg->gw.ipv4,
2538 .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
2539 .fc_flags = cfg->nh_flags,
2540 .fc_nlinfo = cfg->nlinfo,
2541 .fc_encap = cfg->nh_encap,
2542 .fc_encap_type = cfg->nh_encap_type,
2543 };
2544 u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
2545 int err;
2546
2547 err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
2548 if (err) {
2549 fib_nh_release(net, fib_nh);
2550 goto out;
2551 }
2552
2553 if (nhi->fdb_nh)
2554 goto out;
2555
2556 /* sets nh_dev if successful */
2557 err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
2558 if (!err) {
2559 nh->nh_flags = fib_nh->fib_nh_flags;
2560 fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
2561 !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1);
2562 } else {
2563 fib_nh_release(net, fib_nh);
2564 }
2565 out:
2566 return err;
2567 }
2568
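/* IPv6 counterpart of nh_create_ipv4(), delegating to the ipv6 stub so
 * this code also works when IPv6 is modular or disabled.
 */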
2569 static int nh_create_ipv6(struct net *net, struct nexthop *nh,
2570 struct nh_info *nhi, struct nh_config *cfg,
2571 struct netlink_ext_ack *extack)
2572 {
2573 struct fib6_nh *fib6_nh = &nhi->fib6_nh;
2574 struct fib6_config fib6_cfg = {
2575 .fc_table = l3mdev_fib_table(cfg->dev),
2576 .fc_ifindex = cfg->nh_ifindex,
2577 .fc_gateway = cfg->gw.ipv6,
2578 .fc_flags = cfg->nh_flags,
2579 .fc_nlinfo = cfg->nlinfo,
2580 .fc_encap = cfg->nh_encap,
2581 .fc_encap_type = cfg->nh_encap_type,
2582 .fc_is_fdb = cfg->nh_fdb,
2583 };
2584 int err;
2585
2586 if (!ipv6_addr_any(&cfg->gw.ipv6))
2587 fib6_cfg.fc_flags |= RTF_GATEWAY;
2588
2589 /* sets nh_dev if successful */
2590 err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
2591 extack);
2592 if (err) {
2593 /* IPv6 is not enabled, don't call fib6_nh_release */
2594 if (err == -EAFNOSUPPORT)
2595 goto out;
2596 ipv6_stub->fib6_nh_release(fib6_nh);
2597 } else {
2598 nh->nh_flags = fib6_nh->fib_nh_flags;
2599 }
2600 out:
2601 return err;
2602 }
2603
2604 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
2605 struct netlink_ext_ack *extack)
2606 {
2607 struct nh_info *nhi;
2608 struct nexthop *nh;
2609 int err = 0;
2610
2611 nh = nexthop_alloc();
2612 if (!nh)
2613 return ERR_PTR(-ENOMEM);
2614
2615 nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
2616 if (!nhi) {
2617 kfree(nh);
2618 return ERR_PTR(-ENOMEM);
2619 }
2620
2621 nh->nh_flags = cfg->nh_flags;
2622 nh->net = net;
2623
2624 nhi->nh_parent = nh;
2625 nhi->family = cfg->nh_family;
2626 nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
2627
2628 if (cfg->nh_fdb)
2629 nhi->fdb_nh = 1;
2630
2631 if (cfg->nh_blackhole) {
2632 nhi->reject_nh = 1;
2633 cfg->nh_ifindex = net->loopback_dev->ifindex;
2634 }
2635
2636 switch (cfg->nh_family) {
2637 case AF_INET:
2638 err = nh_create_ipv4(net, nh, nhi, cfg, extack);
2639 break;
2640 case AF_INET6:
2641 err = nh_create_ipv6(net, nh, nhi, cfg, extack);
2642 break;
2643 }
2644
2645 if (err) {
2646 kfree(nhi);
2647 kfree(nh);
2648 return ERR_PTR(err);
2649 }
2650
2651 /* add the entry to the device based hash */
2652 if (!nhi->fdb_nh)
2653 nexthop_devhash_add(net, nhi);
2654
2655 rcu_assign_pointer(nh->nh_info, nhi);
2656
2657 return nh;
2658 }
2659
2660 /* called with rtnl lock held */
2661 static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
2662 struct netlink_ext_ack *extack)
2663 {
2664 struct nexthop *nh;
2665 int err;
2666
2667 if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
2668 NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
2669 return ERR_PTR(-EINVAL);
2670 }
2671
2672 if (!cfg->nh_id) {
2673 cfg->nh_id = nh_find_unused_id(net);
2674 if (!cfg->nh_id) {
2675 NL_SET_ERR_MSG(extack, "No unused id");
2676 return ERR_PTR(-EINVAL);
2677 }
2678 }
2679
2680 if (cfg->nh_grp)
2681 nh = nexthop_create_group(net, cfg);
2682 else
2683 nh = nexthop_create(net, cfg, extack);
2684
2685 if (IS_ERR(nh))
2686 return nh;
2687
2688 refcount_set(&nh->refcnt, 1);
2689 nh->id = cfg->nh_id;
2690 nh->protocol = cfg->nh_protocol;
2691 nh->net = net;
2692
2693 err = insert_nexthop(net, nh, cfg, extack);
2694 if (err) {
2695 __remove_nexthop(net, nh, NULL);
2696 nexthop_put(nh);
2697 nh = ERR_PTR(err);
2698 }
2699
2700 return nh;
2701 }
2702
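/* Parse an optional resilient-group timer attribute given in clock_t
 * (USER_HZ) units and convert it to jiffies, falling back to 'fallback'
 * when the attribute is absent.
 */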
2703 static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
2704 unsigned long *timer_p, bool *has_p,
2705 struct netlink_ext_ack *extack)
2706 {
2707 unsigned long timer;
2708 u32 value;
2709
2710 if (!attr) {
2711 *timer_p = fallback;
2712 *has_p = false;
2713 return 0;
2714 }
2715
2716 value = nla_get_u32(attr);
2717 timer = clock_t_to_jiffies(value);
2718 if (timer == ~0UL) {
2719 NL_SET_ERR_MSG(extack, "Timer value too large");
2720 return -EINVAL;
2721 }
2722
2723 *timer_p = timer;
2724 *has_p = true;
2725 return 0;
2726 }
2727
2728 static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
2729 struct netlink_ext_ack *extack)
2730 {
2731 struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
2732 int err;
2733
2734 if (res) {
2735 err = nla_parse_nested(tb,
2736 ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
2737 res, rtm_nh_res_policy_new, extack);
2738 if (err < 0)
2739 return err;
2740 }
2741
2742 if (tb[NHA_RES_GROUP_BUCKETS]) {
2743 cfg->nh_grp_res_num_buckets =
2744 nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
2745 cfg->nh_grp_res_has_num_buckets = true;
2746 if (!cfg->nh_grp_res_num_buckets) {
2747 NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
2748 return -EINVAL;
2749 }
2750 }
2751
2752 err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
2753 NH_RES_DEFAULT_IDLE_TIMER,
2754 &cfg->nh_grp_res_idle_timer,
2755 &cfg->nh_grp_res_has_idle_timer,
2756 extack);
2757 if (err)
2758 return err;
2759
2760 return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
2761 NH_RES_DEFAULT_UNBALANCED_TIMER,
2762 &cfg->nh_grp_res_unbalanced_timer,
2763 &cfg->nh_grp_res_has_unbalanced_timer,
2764 extack);
2765 }
2766
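/* Translate an RTM_NEWNEXTHOP request into nh_config.  Group, blackhole,
 * FDB and plain device/gateway nexthops are mutually exclusive, which is
 * enforced here before nexthop_add() is called.
 */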
2767 static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
2768 struct nlmsghdr *nlh, struct nh_config *cfg,
2769 struct netlink_ext_ack *extack)
2770 {
2771 struct nhmsg *nhm = nlmsg_data(nlh);
2772 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
2773 int err;
2774
2775 err = nlmsg_parse(nlh, sizeof(*nhm), tb,
2776 ARRAY_SIZE(rtm_nh_policy_new) - 1,
2777 rtm_nh_policy_new, extack);
2778 if (err < 0)
2779 return err;
2780
2781 err = -EINVAL;
2782 if (nhm->resvd || nhm->nh_scope) {
2783 NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
2784 goto out;
2785 }
2786 if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
2787 NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
2788 goto out;
2789 }
2790
2791 switch (nhm->nh_family) {
2792 case AF_INET:
2793 case AF_INET6:
2794 break;
2795 case AF_UNSPEC:
2796 if (tb[NHA_GROUP])
2797 break;
2798 fallthrough;
2799 default:
2800 NL_SET_ERR_MSG(extack, "Invalid address family");
2801 goto out;
2802 }
2803
2804 memset(cfg, 0, sizeof(*cfg));
2805 cfg->nlflags = nlh->nlmsg_flags;
2806 cfg->nlinfo.portid = NETLINK_CB(skb).portid;
2807 cfg->nlinfo.nlh = nlh;
2808 cfg->nlinfo.nl_net = net;
2809
2810 cfg->nh_family = nhm->nh_family;
2811 cfg->nh_protocol = nhm->nh_protocol;
2812 cfg->nh_flags = nhm->nh_flags;
2813
2814 if (tb[NHA_ID])
2815 cfg->nh_id = nla_get_u32(tb[NHA_ID]);
2816
2817 if (tb[NHA_FDB]) {
2818 if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
2819 tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
2820 NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
2821 goto out;
2822 }
2823 if (nhm->nh_flags) {
2824 NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
2825 goto out;
2826 }
2827 cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
2828 }
2829
2830 if (tb[NHA_GROUP]) {
2831 if (nhm->nh_family != AF_UNSPEC) {
2832 NL_SET_ERR_MSG(extack, "Invalid family for group");
2833 goto out;
2834 }
2835 cfg->nh_grp = tb[NHA_GROUP];
2836
2837 cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
2838 if (tb[NHA_GROUP_TYPE])
2839 cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
2840
2841 if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
2842 NL_SET_ERR_MSG(extack, "Invalid group type");
2843 goto out;
2844 }
2845 err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
2846 cfg->nh_grp_type, extack);
2847 if (err)
2848 goto out;
2849
2850 if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
2851 err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
2852 cfg, extack);
2853
2854 /* no other attributes should be set */
2855 goto out;
2856 }
2857
2858 if (tb[NHA_BLACKHOLE]) {
2859 if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
2860 tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
2861 NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
2862 goto out;
2863 }
2864
2865 cfg->nh_blackhole = 1;
2866 err = 0;
2867 goto out;
2868 }
2869
2870 if (!cfg->nh_fdb && !tb[NHA_OIF]) {
2871 NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
2872 goto out;
2873 }
2874
2875 if (!cfg->nh_fdb && tb[NHA_OIF]) {
2876 cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
2877 if (cfg->nh_ifindex)
2878 cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
2879
2880 if (!cfg->dev) {
2881 NL_SET_ERR_MSG(extack, "Invalid device index");
2882 goto out;
2883 } else if (!(cfg->dev->flags & IFF_UP)) {
2884 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2885 err = -ENETDOWN;
2886 goto out;
2887 } else if (!netif_carrier_ok(cfg->dev)) {
2888 NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
2889 err = -ENETDOWN;
2890 goto out;
2891 }
2892 }
2893
2894 err = -EINVAL;
2895 if (tb[NHA_GATEWAY]) {
2896 struct nlattr *gwa = tb[NHA_GATEWAY];
2897
2898 switch (cfg->nh_family) {
2899 case AF_INET:
2900 if (nla_len(gwa) != sizeof(u32)) {
2901 NL_SET_ERR_MSG(extack, "Invalid gateway");
2902 goto out;
2903 }
2904 cfg->gw.ipv4 = nla_get_be32(gwa);
2905 break;
2906 case AF_INET6:
2907 if (nla_len(gwa) != sizeof(struct in6_addr)) {
2908 NL_SET_ERR_MSG(extack, "Invalid gateway");
2909 goto out;
2910 }
2911 cfg->gw.ipv6 = nla_get_in6_addr(gwa);
2912 break;
2913 default:
2914 NL_SET_ERR_MSG(extack,
2915 "Unknown address family for gateway");
2916 goto out;
2917 }
2918 } else {
2919 /* device only nexthop (no gateway) */
2920 if (cfg->nh_flags & RTNH_F_ONLINK) {
2921 NL_SET_ERR_MSG(extack,
2922 "ONLINK flag can not be set for nexthop without a gateway");
2923 goto out;
2924 }
2925 }
2926
2927 if (tb[NHA_ENCAP]) {
2928 cfg->nh_encap = tb[NHA_ENCAP];
2929
2930 if (!tb[NHA_ENCAP_TYPE]) {
2931 NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
2932 goto out;
2933 }
2934
2935 cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
2936 err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
2937 if (err < 0)
2938 goto out;
2939
2940 } else if (tb[NHA_ENCAP_TYPE]) {
2941 NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
2942 goto out;
2943 }
2944
2945
2946 err = 0;
2947 out:
2948 return err;
2949 }
2950
2951 /* rtnl */
2952 static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
2953 struct netlink_ext_ack *extack)
2954 {
2955 struct net *net = sock_net(skb->sk);
2956 struct nh_config cfg;
2957 struct nexthop *nh;
2958 int err;
2959
2960 err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
2961 if (!err) {
2962 nh = nexthop_add(net, &cfg, extack);
2963 if (IS_ERR(nh))
2964 err = PTR_ERR(nh);
2965 }
2966
2967 return err;
2968 }
2969
2970 static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
2971 struct nlattr **tb, u32 *id,
2972 struct netlink_ext_ack *extack)
2973 {
2974 struct nhmsg *nhm = nlmsg_data(nlh);
2975
2976 if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
2977 NL_SET_ERR_MSG(extack, "Invalid values in header");
2978 return -EINVAL;
2979 }
2980
2981 if (!tb[NHA_ID]) {
2982 NL_SET_ERR_MSG(extack, "Nexthop id is missing");
2983 return -EINVAL;
2984 }
2985
2986 *id = nla_get_u32(tb[NHA_ID]);
2987 if (!(*id)) {
2988 NL_SET_ERR_MSG(extack, "Invalid nexthop id");
2989 return -EINVAL;
2990 }
2991
2992 return 0;
2993 }
2994
2995 static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
2996 struct netlink_ext_ack *extack)
2997 {
2998 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
2999 int err;
3000
3001 err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3002 ARRAY_SIZE(rtm_nh_policy_get) - 1,
3003 rtm_nh_policy_get, extack);
3004 if (err < 0)
3005 return err;
3006
3007 return __nh_valid_get_del_req(nlh, tb, id, extack);
3008 }
3009
3010 /* rtnl */
3011 static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
3012 struct netlink_ext_ack *extack)
3013 {
3014 struct net *net = sock_net(skb->sk);
3015 struct nl_info nlinfo = {
3016 .nlh = nlh,
3017 .nl_net = net,
3018 .portid = NETLINK_CB(skb).portid,
3019 };
3020 struct nexthop *nh;
3021 int err;
3022 u32 id;
3023
3024 err = nh_valid_get_del_req(nlh, &id, extack);
3025 if (err)
3026 return err;
3027
3028 nh = nexthop_find_by_id(net, id);
3029 if (!nh)
3030 return -ENOENT;
3031
3032 remove_nexthop(net, nh, &nlinfo);
3033
3034 return 0;
3035 }
3036
3037 /* rtnl */
3038 static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3039 struct netlink_ext_ack *extack)
3040 {
3041 struct net *net = sock_net(in_skb->sk);
3042 struct sk_buff *skb = NULL;
3043 struct nexthop *nh;
3044 int err;
3045 u32 id;
3046
3047 err = nh_valid_get_del_req(nlh, &id, extack);
3048 if (err)
3049 return err;
3050
3051 err = -ENOBUFS;
3052 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3053 if (!skb)
3054 goto out;
3055
3056 err = -ENOENT;
3057 nh = nexthop_find_by_id(net, id);
3058 if (!nh)
3059 goto errout_free;
3060
3061 err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
3062 nlh->nlmsg_seq, 0);
3063 if (err < 0) {
3064 WARN_ON(err == -EMSGSIZE);
3065 goto errout_free;
3066 }
3067
3068 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3069 out:
3070 return err;
3071 errout_free:
3072 kfree_skb(skb);
3073 goto out;
3074 }
3075
3076 struct nh_dump_filter {
3077 u32 nh_id;
3078 int dev_idx;
3079 int master_idx;
3080 bool group_filter;
3081 bool fdb_filter;
3082 u32 res_bucket_nh_id;
3083 };
3084
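/* Return true if 'nh' should be skipped by the current dump based on the
 * requested group/device/master filters and address family.
 */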
3085 static bool nh_dump_filtered(struct nexthop *nh,
3086 struct nh_dump_filter *filter, u8 family)
3087 {
3088 const struct net_device *dev;
3089 const struct nh_info *nhi;
3090
3091 if (filter->group_filter && !nh->is_group)
3092 return true;
3093
3094 if (!filter->dev_idx && !filter->master_idx && !family)
3095 return false;
3096
3097 if (nh->is_group)
3098 return true;
3099
3100 nhi = rtnl_dereference(nh->nh_info);
3101 if (family && nhi->family != family)
3102 return true;
3103
3104 dev = nhi->fib_nhc.nhc_dev;
3105 if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
3106 return true;
3107
3108 if (filter->master_idx) {
3109 struct net_device *master;
3110
3111 if (!dev)
3112 return true;
3113
3114 master = netdev_master_upper_dev_get((struct net_device *)dev);
3115 if (!master || master->ifindex != filter->master_idx)
3116 return true;
3117 }
3118
3119 return false;
3120 }
3121
3122 static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
3123 struct nh_dump_filter *filter,
3124 struct netlink_ext_ack *extack)
3125 {
3126 struct nhmsg *nhm;
3127 u32 idx;
3128
3129 if (tb[NHA_OIF]) {
3130 idx = nla_get_u32(tb[NHA_OIF]);
3131 if (idx > INT_MAX) {
3132 NL_SET_ERR_MSG(extack, "Invalid device index");
3133 return -EINVAL;
3134 }
3135 filter->dev_idx = idx;
3136 }
3137 if (tb[NHA_MASTER]) {
3138 idx = nla_get_u32(tb[NHA_MASTER]);
3139 if (idx > INT_MAX) {
3140 NL_SET_ERR_MSG(extack, "Invalid master device index");
3141 return -EINVAL;
3142 }
3143 filter->master_idx = idx;
3144 }
3145 filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
3146 filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
3147
3148 nhm = nlmsg_data(nlh);
3149 if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
3150 NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
3151 return -EINVAL;
3152 }
3153
3154 return 0;
3155 }
3156
3157 static int nh_valid_dump_req(const struct nlmsghdr *nlh,
3158 struct nh_dump_filter *filter,
3159 struct netlink_callback *cb)
3160 {
3161 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
3162 int err;
3163
3164 err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3165 ARRAY_SIZE(rtm_nh_policy_dump) - 1,
3166 rtm_nh_policy_dump, cb->extack);
3167 if (err < 0)
3168 return err;
3169
3170 return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
3171 }
3172
3173 struct rtm_dump_nh_ctx {
3174 u32 idx;
3175 };
3176
3177 static struct rtm_dump_nh_ctx *
3178 rtm_dump_nh_ctx(struct netlink_callback *cb)
3179 {
3180 struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
3181
3182 BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
3183 return ctx;
3184 }
3185
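/* Walk the nexthop rbtree in id order, resuming from ctx->idx, and invoke
 * nh_cb() for each nexthop.  A callback error (e.g. -EMSGSIZE when the
 * dump skb is full) stops the walk; ctx->idx then records the id at which
 * the next dump invocation resumes.
 */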
3186 static int rtm_dump_walk_nexthops(struct sk_buff *skb,
3187 struct netlink_callback *cb,
3188 struct rb_root *root,
3189 struct rtm_dump_nh_ctx *ctx,
3190 int (*nh_cb)(struct sk_buff *skb,
3191 struct netlink_callback *cb,
3192 struct nexthop *nh, void *data),
3193 void *data)
3194 {
3195 struct rb_node *node;
3196 int s_idx;
3197 int err;
3198
3199 s_idx = ctx->idx;
3200 for (node = rb_first(root); node; node = rb_next(node)) {
3201 struct nexthop *nh;
3202
3203 nh = rb_entry(node, struct nexthop, rb_node);
3204 if (nh->id < s_idx)
3205 continue;
3206
3207 ctx->idx = nh->id;
3208 err = nh_cb(skb, cb, nh, data);
3209 if (err)
3210 return err;
3211 }
3212
3213 return 0;
3214 }
3215
3216 static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
3217 struct nexthop *nh, void *data)
3218 {
3219 struct nhmsg *nhm = nlmsg_data(cb->nlh);
3220 struct nh_dump_filter *filter = data;
3221
3222 if (nh_dump_filtered(nh, filter, nhm->nh_family))
3223 return 0;
3224
3225 return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
3226 NETLINK_CB(cb->skb).portid,
3227 cb->nlh->nlmsg_seq, NLM_F_MULTI);
3228 }
3229
3230 /* rtnl */
3231 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
3232 {
3233 struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
3234 struct net *net = sock_net(skb->sk);
3235 struct rb_root *root = &net->nexthop.rb_root;
3236 struct nh_dump_filter filter = {};
3237 int err;
3238
3239 err = nh_valid_dump_req(cb->nlh, &filter, cb);
3240 if (err < 0)
3241 return err;
3242
3243 err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
3244 &rtm_dump_nexthop_cb, &filter);
3245 if (err < 0) {
3246 if (likely(skb->len))
3247 err = skb->len;
3248 }
3249
3250 cb->seq = net->nexthop.seq;
3251 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
3252 return err;
3253 }
3254
3255 static struct nexthop *
3256 nexthop_find_group_resilient(struct net *net, u32 id,
3257 struct netlink_ext_ack *extack)
3258 {
3259 struct nh_group *nhg;
3260 struct nexthop *nh;
3261
3262 nh = nexthop_find_by_id(net, id);
3263 if (!nh)
3264 return ERR_PTR(-ENOENT);
3265
3266 if (!nh->is_group) {
3267 NL_SET_ERR_MSG(extack, "Not a nexthop group");
3268 return ERR_PTR(-EINVAL);
3269 }
3270
3271 nhg = rtnl_dereference(nh->nh_grp);
3272 if (!nhg->resilient) {
3273 NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
3274 return ERR_PTR(-EINVAL);
3275 }
3276
3277 return nh;
3278 }
3279
3280 static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
3281 struct netlink_ext_ack *extack)
3282 {
3283 u32 idx;
3284
3285 if (attr) {
3286 idx = nla_get_u32(attr);
3287 if (!idx) {
3288 NL_SET_ERR_MSG(extack, "Invalid nexthop id");
3289 return -EINVAL;
3290 }
3291 *nh_id_p = idx;
3292 } else {
3293 *nh_id_p = 0;
3294 }
3295
3296 return 0;
3297 }
3298
3299 static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
3300 struct nh_dump_filter *filter,
3301 struct netlink_callback *cb)
3302 {
3303 struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
3304 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
3305 int err;
3306
3307 err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3308 ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
3309 rtm_nh_policy_dump_bucket, NULL);
3310 if (err < 0)
3311 return err;
3312
3313 err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
3314 if (err)
3315 return err;
3316
3317 if (tb[NHA_RES_BUCKET]) {
3318 size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
3319
3320 err = nla_parse_nested(res_tb, max,
3321 tb[NHA_RES_BUCKET],
3322 rtm_nh_res_bucket_policy_dump,
3323 cb->extack);
3324 if (err < 0)
3325 return err;
3326
3327 err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
3328 &filter->res_bucket_nh_id,
3329 cb->extack);
3330 if (err)
3331 return err;
3332 }
3333
3334 return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
3335 }
3336
3337 struct rtm_dump_res_bucket_ctx {
3338 struct rtm_dump_nh_ctx nh;
3339 u16 bucket_index;
3340 };
3341
3342 static struct rtm_dump_res_bucket_ctx *
3343 rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
3344 {
3345 struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
3346
3347 BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
3348 return ctx;
3349 }
3350
3351 struct rtm_dump_nexthop_bucket_data {
3352 struct rtm_dump_res_bucket_ctx *ctx;
3353 struct nh_dump_filter filter;
3354 };
3355
3356 static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
3357 struct netlink_callback *cb,
3358 struct nexthop *nh,
3359 struct rtm_dump_nexthop_bucket_data *dd)
3360 {
3361 u32 portid = NETLINK_CB(cb->skb).portid;
3362 struct nhmsg *nhm = nlmsg_data(cb->nlh);
3363 struct nh_res_table *res_table;
3364 struct nh_group *nhg;
3365 u16 bucket_index;
3366 int err;
3367
3368 nhg = rtnl_dereference(nh->nh_grp);
3369 res_table = rtnl_dereference(nhg->res_table);
3370 for (bucket_index = dd->ctx->bucket_index;
3371 bucket_index < res_table->num_nh_buckets;
3372 bucket_index++) {
3373 struct nh_res_bucket *bucket;
3374 struct nh_grp_entry *nhge;
3375
3376 bucket = &res_table->nh_buckets[bucket_index];
3377 nhge = rtnl_dereference(bucket->nh_entry);
3378 if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
3379 continue;
3380
3381 if (dd->filter.res_bucket_nh_id &&
3382 dd->filter.res_bucket_nh_id != nhge->nh->id)
3383 continue;
3384
3385 dd->ctx->bucket_index = bucket_index;
3386 err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
3387 RTM_NEWNEXTHOPBUCKET, portid,
3388 cb->nlh->nlmsg_seq, NLM_F_MULTI,
3389 cb->extack);
3390 if (err)
3391 return err;
3392 }
3393
3394 dd->ctx->bucket_index = 0;
3395
3396 return 0;
3397 }
3398
3399 static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
3400 struct netlink_callback *cb,
3401 struct nexthop *nh, void *data)
3402 {
3403 struct rtm_dump_nexthop_bucket_data *dd = data;
3404 struct nh_group *nhg;
3405
3406 if (!nh->is_group)
3407 return 0;
3408
3409 nhg = rtnl_dereference(nh->nh_grp);
3410 if (!nhg->resilient)
3411 return 0;
3412
3413 return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
3414 }
3415
3416 /* rtnl */
3417 static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
3418 struct netlink_callback *cb)
3419 {
3420 struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
3421 struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
3422 struct net *net = sock_net(skb->sk);
3423 struct nexthop *nh;
3424 int err;
3425
3426 err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
3427 if (err)
3428 return err;
3429
3430 if (dd.filter.nh_id) {
3431 nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
3432 cb->extack);
3433 if (IS_ERR(nh))
3434 return PTR_ERR(nh);
3435 err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
3436 } else {
3437 struct rb_root *root = &net->nexthop.rb_root;
3438
3439 err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
3440 &rtm_dump_nexthop_bucket_cb, &dd);
3441 }
3442
3443 if (err < 0) {
3444 if (likely(skb->len))
3445 err = skb->len;
3446 }
3447
3448 cb->seq = net->nexthop.seq;
3449 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
3450 return err;
3451 }
3452
3453 static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
3454 u16 *bucket_index,
3455 struct netlink_ext_ack *extack)
3456 {
3457 struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
3458 int err;
3459
3460 err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
3461 res, rtm_nh_res_bucket_policy_get, extack);
3462 if (err < 0)
3463 return err;
3464
3465 if (!tb[NHA_RES_BUCKET_INDEX]) {
3466 NL_SET_ERR_MSG(extack, "Bucket index is missing");
3467 return -EINVAL;
3468 }
3469
3470 *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
3471 return 0;
3472 }
3473
3474 static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
3475 u32 *id, u16 *bucket_index,
3476 struct netlink_ext_ack *extack)
3477 {
3478 struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
3479 int err;
3480
3481 err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3482 ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
3483 rtm_nh_policy_get_bucket, extack);
3484 if (err < 0)
3485 return err;
3486
3487 err = __nh_valid_get_del_req(nlh, tb, id, extack);
3488 if (err)
3489 return err;
3490
3491 if (!tb[NHA_RES_BUCKET]) {
3492 NL_SET_ERR_MSG(extack, "Bucket information is missing");
3493 return -EINVAL;
3494 }
3495
3496 err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
3497 bucket_index, extack);
3498 if (err)
3499 return err;
3500
3501 return 0;
3502 }
3503
3504 /* rtnl */
3505 static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3506 struct netlink_ext_ack *extack)
3507 {
3508 struct net *net = sock_net(in_skb->sk);
3509 struct nh_res_table *res_table;
3510 struct sk_buff *skb = NULL;
3511 struct nh_group *nhg;
3512 struct nexthop *nh;
3513 u16 bucket_index;
3514 int err;
3515 u32 id;
3516
3517 err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
3518 if (err)
3519 return err;
3520
3521 nh = nexthop_find_group_resilient(net, id, extack);
3522 if (IS_ERR(nh))
3523 return PTR_ERR(nh);
3524
3525 nhg = rtnl_dereference(nh->nh_grp);
3526 res_table = rtnl_dereference(nhg->res_table);
3527 if (bucket_index >= res_table->num_nh_buckets) {
3528 NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
3529 return -ENOENT;
3530 }
3531
3532 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3533 if (!skb)
3534 return -ENOBUFS;
3535
3536 err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
3537 bucket_index, RTM_NEWNEXTHOPBUCKET,
3538 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
3539 0, extack);
3540 if (err < 0) {
3541 WARN_ON(err == -EMSGSIZE);
3542 goto errout_free;
3543 }
3544
3545 return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3546
3547 errout_free:
3548 kfree_skb(skb);
3549 return err;
3550 }
3551
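/* Propagate a device MTU change to the IPv4 nexthops bound to that device
 * by updating their cached PMTU exceptions via fib_nhc_update_mtu().
 */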
3552 static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
3553 {
3554 unsigned int hash = nh_dev_hashfn(dev->ifindex);
3555 struct net *net = dev_net(dev);
3556 struct hlist_head *head = &net->nexthop.devhash[hash];
3557 struct hlist_node *n;
3558 struct nh_info *nhi;
3559
3560 hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
3561 if (nhi->fib_nhc.nhc_dev == dev) {
3562 if (nhi->family == AF_INET)
3563 fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
3564 orig_mtu);
3565 }
3566 }
3567 }
3568
3569 /* rtnl */
3570 static int nh_netdev_event(struct notifier_block *this,
3571 unsigned long event, void *ptr)
3572 {
3573 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3574 struct netdev_notifier_info_ext *info_ext;
3575
3576 switch (event) {
3577 case NETDEV_DOWN:
3578 case NETDEV_UNREGISTER:
3579 nexthop_flush_dev(dev, event);
3580 break;
3581 case NETDEV_CHANGE:
3582 if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
3583 nexthop_flush_dev(dev, event);
3584 break;
3585 case NETDEV_CHANGEMTU:
3586 info_ext = ptr;
3587 nexthop_sync_mtu(dev, info_ext->ext.mtu);
3588 rt_cache_flush(dev_net(dev));
3589 break;
3590 }
3591 return NOTIFY_DONE;
3592 }
3593
3594 static struct notifier_block nh_netdev_notifier = {
3595 .notifier_call = nh_netdev_event,
3596 };
3597
3598 static int nexthops_dump(struct net *net, struct notifier_block *nb,
3599 enum nexthop_event_type event_type,
3600 struct netlink_ext_ack *extack)
3601 {
3602 struct rb_root *root = &net->nexthop.rb_root;
3603 struct rb_node *node;
3604 int err = 0;
3605
3606 for (node = rb_first(root); node; node = rb_next(node)) {
3607 struct nexthop *nh;
3608
3609 nh = rb_entry(node, struct nexthop, rb_node);
3610 err = call_nexthop_notifier(nb, net, event_type, nh, extack);
3611 if (err)
3612 break;
3613 }
3614
3615 return err;
3616 }
3617
3618 int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
3619 struct netlink_ext_ack *extack)
3620 {
3621 int err;
3622
3623 rtnl_lock();
3624 err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
3625 if (err)
3626 goto unlock;
3627 err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
3628 nb);
3629 unlock:
3630 rtnl_unlock();
3631 return err;
3632 }
3633 EXPORT_SYMBOL(register_nexthop_notifier);
3634
3635 int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
3636 {
3637 int err;
3638
3639 rtnl_lock();
3640 err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
3641 nb);
3642 if (err)
3643 goto unlock;
3644 nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
3645 unlock:
3646 rtnl_unlock();
3647 return err;
3648 }
3649 EXPORT_SYMBOL(unregister_nexthop_notifier);
3650
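/* Driver API: reflect the hardware offload/trap state of a nexthop in the
 * RTNH_F_OFFLOAD/RTNH_F_TRAP flags reported to user space.
 */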
3651 void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
3652 {
3653 struct nexthop *nexthop;
3654
3655 rcu_read_lock();
3656
3657 nexthop = nexthop_find_by_id(net, id);
3658 if (!nexthop)
3659 goto out;
3660
3661 nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
3662 if (offload)
3663 nexthop->nh_flags |= RTNH_F_OFFLOAD;
3664 if (trap)
3665 nexthop->nh_flags |= RTNH_F_TRAP;
3666
3667 out:
3668 rcu_read_unlock();
3669 }
3670 EXPORT_SYMBOL(nexthop_set_hw_flags);
3671
3672 void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
3673 bool offload, bool trap)
3674 {
3675 struct nh_res_table *res_table;
3676 struct nh_res_bucket *bucket;
3677 struct nexthop *nexthop;
3678 struct nh_group *nhg;
3679
3680 rcu_read_lock();
3681
3682 nexthop = nexthop_find_by_id(net, id);
3683 if (!nexthop || !nexthop->is_group)
3684 goto out;
3685
3686 nhg = rcu_dereference(nexthop->nh_grp);
3687 if (!nhg->resilient)
3688 goto out;
3689
3690 	res_table = rcu_dereference(nhg->res_table);
3691 	if (bucket_index >= res_table->num_nh_buckets)
3692 		goto out;
3693 
3694 	bucket = &res_table->nh_buckets[bucket_index];
3695 bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
3696 if (offload)
3697 bucket->nh_flags |= RTNH_F_OFFLOAD;
3698 if (trap)
3699 bucket->nh_flags |= RTNH_F_TRAP;
3700
3701 out:
3702 rcu_read_unlock();
3703 }
3704 EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
3705
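/* Driver API: mark buckets that saw traffic in hardware as busy so that
 * the idle timer does not consider them for migration.
 */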
3706 void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
3707 unsigned long *activity)
3708 {
3709 struct nh_res_table *res_table;
3710 struct nexthop *nexthop;
3711 struct nh_group *nhg;
3712 u16 i;
3713
3714 rcu_read_lock();
3715
3716 nexthop = nexthop_find_by_id(net, id);
3717 if (!nexthop || !nexthop->is_group)
3718 goto out;
3719
3720 nhg = rcu_dereference(nexthop->nh_grp);
3721 if (!nhg->resilient)
3722 goto out;
3723
3724 /* Instead of silently ignoring some buckets, demand that the sizes
3725 * be the same.
3726 */
3727 res_table = rcu_dereference(nhg->res_table);
3728 if (num_buckets != res_table->num_nh_buckets)
3729 goto out;
3730
3731 for (i = 0; i < num_buckets; i++) {
3732 if (test_bit(i, activity))
3733 nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
3734 }
3735
3736 out:
3737 rcu_read_unlock();
3738 }
3739 EXPORT_SYMBOL(nexthop_res_grp_activity_update);
3740
3741 static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
3742 {
3743 struct net *net;
3744
3745 rtnl_lock();
3746 list_for_each_entry(net, net_list, exit_list) {
3747 flush_all_nexthops(net);
3748 kfree(net->nexthop.devhash);
3749 }
3750 rtnl_unlock();
3751 }
3752
3753 static int __net_init nexthop_net_init(struct net *net)
3754 {
3755 size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
3756
3757 net->nexthop.rb_root = RB_ROOT;
3758 net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
3759 if (!net->nexthop.devhash)
3760 return -ENOMEM;
3761 BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
3762
3763 return 0;
3764 }
3765
3766 static struct pernet_operations nexthop_net_ops = {
3767 .init = nexthop_net_init,
3768 .exit_batch = nexthop_net_exit_batch,
3769 };
3770
3771 static int __init nexthop_init(void)
3772 {
3773 register_pernet_subsys(&nexthop_net_ops);
3774
3775 register_netdevice_notifier(&nh_netdev_notifier);
3776
3777 rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3778 rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
3779 rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
3780 rtm_dump_nexthop, 0);
3781
3782 rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3783 rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
3784
3785 rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3786 rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
3787
3788 rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
3789 rtm_dump_nexthop_bucket, 0);
3790
3791 return 0;
3792 }
3793 subsys_initcall(nexthop_init);
3794