1 /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> 2 * 3 * This program is free software; you can redistribute it and/or 4 * modify it under the terms of version 2 of the GNU General Public 5 * License as published by the Free Software Foundation. 6 * 7 * This program is distributed in the hope that it will be useful, but 8 * WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * General Public License for more details. 11 */ 12 13 #include <linux/kernel.h> 14 #include <linux/module.h> 15 #include <linux/skbuff.h> 16 #include <linux/types.h> 17 #include <linux/bpf.h> 18 #include <net/lwtunnel.h> 19 20 struct bpf_lwt_prog { 21 struct bpf_prog *prog; 22 char *name; 23 }; 24 25 struct bpf_lwt { 26 struct bpf_lwt_prog in; 27 struct bpf_lwt_prog out; 28 struct bpf_lwt_prog xmit; 29 int family; 30 }; 31 32 #define MAX_PROG_NAME 256 33 34 static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) 35 { 36 return (struct bpf_lwt *)lwt->data; 37 } 38 39 #define NO_REDIRECT false 40 #define CAN_REDIRECT true 41 42 static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, 43 struct dst_entry *dst, bool can_redirect) 44 { 45 int ret; 46 47 /* Preempt disable is needed to protect per-cpu redirect_info between 48 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and 49 * access to maps strictly require a rcu_read_lock() for protection, 50 * mixing with BH RCU lock doesn't work. 51 */ 52 preempt_disable(); 53 rcu_read_lock(); 54 bpf_compute_data_end(skb); 55 ret = bpf_prog_run_save_cb(lwt->prog, skb); 56 rcu_read_unlock(); 57 58 switch (ret) { 59 case BPF_OK: 60 break; 61 62 case BPF_REDIRECT: 63 if (unlikely(!can_redirect)) { 64 pr_warn_once("Illegal redirect return code in prog %s\n", 65 lwt->name ? : "<unknown>"); 66 ret = BPF_OK; 67 } else { 68 ret = skb_do_redirect(skb); 69 if (ret == 0) 70 ret = BPF_REDIRECT; 71 } 72 break; 73 74 case BPF_DROP: 75 kfree_skb(skb); 76 ret = -EPERM; 77 break; 78 79 default: 80 pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); 81 kfree_skb(skb); 82 ret = -EINVAL; 83 break; 84 } 85 86 preempt_enable(); 87 88 return ret; 89 } 90 91 static int bpf_input(struct sk_buff *skb) 92 { 93 struct dst_entry *dst = skb_dst(skb); 94 struct bpf_lwt *bpf; 95 int ret; 96 97 bpf = bpf_lwt_lwtunnel(dst->lwtstate); 98 if (bpf->in.prog) { 99 ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); 100 if (ret < 0) 101 return ret; 102 } 103 104 if (unlikely(!dst->lwtstate->orig_input)) { 105 pr_warn_once("orig_input not set on dst for prog %s\n", 106 bpf->out.name); 107 kfree_skb(skb); 108 return -EINVAL; 109 } 110 111 return dst->lwtstate->orig_input(skb); 112 } 113 114 static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 115 { 116 struct dst_entry *dst = skb_dst(skb); 117 struct bpf_lwt *bpf; 118 int ret; 119 120 bpf = bpf_lwt_lwtunnel(dst->lwtstate); 121 if (bpf->out.prog) { 122 ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); 123 if (ret < 0) 124 return ret; 125 } 126 127 if (unlikely(!dst->lwtstate->orig_output)) { 128 pr_warn_once("orig_output not set on dst for prog %s\n", 129 bpf->out.name); 130 kfree_skb(skb); 131 return -EINVAL; 132 } 133 134 return dst->lwtstate->orig_output(net, sk, skb); 135 } 136 137 static int xmit_check_hhlen(struct sk_buff *skb) 138 { 139 int hh_len = skb_dst(skb)->dev->hard_header_len; 140 141 if (skb_headroom(skb) < hh_len) { 142 int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); 143 144 if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) 145 return -ENOMEM; 146 } 147 148 return 0; 149 } 150 151 static int bpf_xmit(struct sk_buff *skb) 152 { 153 struct dst_entry *dst = skb_dst(skb); 154 struct bpf_lwt *bpf; 155 156 bpf = bpf_lwt_lwtunnel(dst->lwtstate); 157 if (bpf->xmit.prog) { 158 int ret; 159 160 ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); 161 switch (ret) { 162 case BPF_OK: 163 /* If the header was expanded, headroom might be too 164 * small for L2 header to come, expand as needed. 165 */ 166 ret = xmit_check_hhlen(skb); 167 if (unlikely(ret)) 168 return ret; 169 170 return LWTUNNEL_XMIT_CONTINUE; 171 case BPF_REDIRECT: 172 return LWTUNNEL_XMIT_DONE; 173 default: 174 return ret; 175 } 176 } 177 178 return LWTUNNEL_XMIT_CONTINUE; 179 } 180 181 static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) 182 { 183 if (prog->prog) 184 bpf_prog_put(prog->prog); 185 186 kfree(prog->name); 187 } 188 189 static void bpf_destroy_state(struct lwtunnel_state *lwt) 190 { 191 struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); 192 193 bpf_lwt_prog_destroy(&bpf->in); 194 bpf_lwt_prog_destroy(&bpf->out); 195 bpf_lwt_prog_destroy(&bpf->xmit); 196 } 197 198 static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { 199 [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, 200 [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, 201 .len = MAX_PROG_NAME }, 202 }; 203 204 static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, 205 enum bpf_prog_type type) 206 { 207 struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; 208 struct bpf_prog *p; 209 int ret; 210 u32 fd; 211 212 ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); 213 if (ret < 0) 214 return ret; 215 216 if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) 217 return -EINVAL; 218 219 prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); 220 if (!prog->name) 221 return -ENOMEM; 222 223 fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); 224 p = bpf_prog_get_type(fd, type); 225 if (IS_ERR(p)) 226 return PTR_ERR(p); 227 228 prog->prog = p; 229 230 return 0; 231 } 232 233 static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { 234 [LWT_BPF_IN] = { .type = NLA_NESTED, }, 235 [LWT_BPF_OUT] = { .type = NLA_NESTED, }, 236 [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, 237 [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, 238 }; 239 240 static int bpf_build_state(struct nlattr *nla, 241 unsigned int family, const void *cfg, 242 struct lwtunnel_state **ts) 243 { 244 struct nlattr *tb[LWT_BPF_MAX + 1]; 245 struct lwtunnel_state *newts; 246 struct bpf_lwt *bpf; 247 int ret; 248 249 if (family != AF_INET && family != AF_INET6) 250 return -EAFNOSUPPORT; 251 252 ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); 253 if (ret < 0) 254 return ret; 255 256 if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) 257 return -EINVAL; 258 259 newts = lwtunnel_state_alloc(sizeof(*bpf)); 260 if (!newts) 261 return -ENOMEM; 262 263 newts->type = LWTUNNEL_ENCAP_BPF; 264 bpf = bpf_lwt_lwtunnel(newts); 265 266 if (tb[LWT_BPF_IN]) { 267 newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; 268 ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, 269 BPF_PROG_TYPE_LWT_IN); 270 if (ret < 0) 271 goto errout; 272 } 273 274 if (tb[LWT_BPF_OUT]) { 275 newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; 276 ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, 277 BPF_PROG_TYPE_LWT_OUT); 278 if (ret < 0) 279 goto errout; 280 } 281 282 if (tb[LWT_BPF_XMIT]) { 283 newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; 284 ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, 285 BPF_PROG_TYPE_LWT_XMIT); 286 if (ret < 0) 287 goto errout; 288 } 289 290 if (tb[LWT_BPF_XMIT_HEADROOM]) { 291 u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); 292 293 if (headroom > LWT_BPF_MAX_HEADROOM) { 294 ret = -ERANGE; 295 goto errout; 296 } 297 298 newts->headroom = headroom; 299 } 300 301 bpf->family = family; 302 *ts = newts; 303 304 return 0; 305 306 errout: 307 bpf_destroy_state(newts); 308 kfree(newts); 309 return ret; 310 } 311 312 static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, 313 struct bpf_lwt_prog *prog) 314 { 315 struct nlattr *nest; 316 317 if (!prog->prog) 318 return 0; 319 320 nest = nla_nest_start(skb, attr); 321 if (!nest) 322 return -EMSGSIZE; 323 324 if (prog->name && 325 nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) 326 return -EMSGSIZE; 327 328 return nla_nest_end(skb, nest); 329 } 330 331 static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) 332 { 333 struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); 334 335 if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || 336 bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || 337 bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) 338 return -EMSGSIZE; 339 340 return 0; 341 } 342 343 static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) 344 { 345 int nest_len = nla_total_size(sizeof(struct nlattr)) + 346 nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ 347 0; 348 349 return nest_len + /* LWT_BPF_IN */ 350 nest_len + /* LWT_BPF_OUT */ 351 nest_len + /* LWT_BPF_XMIT */ 352 0; 353 } 354 355 static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) 356 { 357 /* FIXME: 358 * The LWT state is currently rebuilt for delete requests which 359 * results in a new bpf_prog instance. Comparing names for now. 360 */ 361 if (!a->name && !b->name) 362 return 0; 363 364 if (!a->name || !b->name) 365 return 1; 366 367 return strcmp(a->name, b->name); 368 } 369 370 static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) 371 { 372 struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); 373 struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); 374 375 return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || 376 bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || 377 bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); 378 } 379 380 static const struct lwtunnel_encap_ops bpf_encap_ops = { 381 .build_state = bpf_build_state, 382 .destroy_state = bpf_destroy_state, 383 .input = bpf_input, 384 .output = bpf_output, 385 .xmit = bpf_xmit, 386 .fill_encap = bpf_fill_encap_info, 387 .get_encap_size = bpf_encap_nlsize, 388 .cmp_encap = bpf_encap_cmp, 389 .owner = THIS_MODULE, 390 }; 391 392 static int __init bpf_lwt_init(void) 393 { 394 return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); 395 } 396 397 subsys_initcall(bpf_lwt_init) 398