/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable is needed to protect per-cpu redirect_info between
	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
	 * access to maps strictly require a rcu_read_lock() for protection,
	 * mixing with BH RCU lock doesn't work.
	 */
	preempt_disable();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			skb_reset_mac_header(skb);
			ret = skb_do_redirect(skb);
			if (ret == 0)
				ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	preempt_enable();

	return ret;
}

static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		pr_warn_once("orig_input not set on dst for prog %s\n",
			     bpf->in.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb)
{
	int hh_len = skb_dst(skb)->dev->hard_header_len;

	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
			       NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)
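
/* Sketch of the netlink layout consumed by bpf_build_state() above, for
 * readers wiring this up from userspace:
 *
 *   RTA_ENCAP_TYPE = LWTUNNEL_ENCAP_BPF
 *   RTA_ENCAP (nested)
 *     LWT_BPF_IN / LWT_BPF_OUT / LWT_BPF_XMIT   (nested, at least one)
 *       LWT_BPF_PROG_FD    (u32, fd of a BPF_PROG_TYPE_LWT_* program)
 *       LWT_BPF_PROG_NAME  (NUL-terminated string, up to MAX_PROG_NAME)
 *     LWT_BPF_XMIT_HEADROOM (u32, optional, at most LWT_BPF_MAX_HEADROOM)
 *
 * iproute2 builds such a message from a route command along the lines of
 *
 *   ip route add 10.0.0.0/24 encap bpf xmit obj lwt_prog.o \
 *           section xmit_prog dev eth0
 *
 * where the prefix, object file, section name and device are placeholders
 * only; see ip-route(8) for the exact "encap bpf" grammar.
 */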