/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable is needed to protect per-cpu redirect_info between
	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
	 * access to maps strictly require a rcu_read_lock() for protection,
	 * mixing with BH RCU lock doesn't work.
	 */
	preempt_disable();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			ret = skb_do_redirect(skb);
			if (ret == 0)
				ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	preempt_enable();

	return ret;
}
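
/* The input and output hooks below are invoked in place of the dst's saved
 * orig_input/orig_output handlers. BPF_REDIRECT is not allowed from these
 * program types; only the xmit hook may redirect.
 */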

static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		pr_warn_once("orig_input not set on dst for prog %s\n",
			     bpf->in.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb)
{
	int hh_len = skb_dst(skb)->dev->hard_header_len;

	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the program expanded the header, headroom might
			 * now be too small for the L2 header that is added
			 * later; expand as needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
			       NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}
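
/* Netlink dump path: only the user supplied program name can be put back
 * into the nest; the original program fd cannot be recovered. The same
 * name is what bpf_encap_cmp() below falls back to when comparing states.
 */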

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)
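
/* Example configuration (a sketch, assuming an iproute2 build with LWT BPF
 * support; "lwt_prog.o" and "do_xmit" are illustrative object file and
 * section names):
 *
 *   ip route add 192.168.253.0/24 encap bpf xmit obj lwt_prog.o \
 *           section do_xmit dev veth0
 *
 * iproute2 loads the object and passes the resulting program fd and name
 * in a LWT_BPF_XMIT nest (an optional "headroom" keyword maps to
 * LWT_BPF_XMIT_HEADROOM), which bpf_build_state() above turns into the
 * per-route state used by the hooks in this file.
 */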