1 /* 2 * Copyright (c) 2007-2013 Nicira, Inc. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of version 2 of the GNU General Public 6 * License as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, but 9 * WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program; if not, write to the Free Software 15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 16 * 02110-1301, USA 17 */ 18 19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 20 21 #include <linux/skbuff.h> 22 #include <linux/in.h> 23 #include <linux/ip.h> 24 #include <linux/openvswitch.h> 25 #include <linux/sctp.h> 26 #include <linux/tcp.h> 27 #include <linux/udp.h> 28 #include <linux/in6.h> 29 #include <linux/if_arp.h> 30 #include <linux/if_vlan.h> 31 #include <net/ip.h> 32 #include <net/ipv6.h> 33 #include <net/checksum.h> 34 #include <net/dsfield.h> 35 #include <net/sctp/checksum.h> 36 37 #include "datapath.h" 38 #include "vport.h" 39 40 static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, 41 const struct nlattr *attr, int len); 42 43 static int make_writable(struct sk_buff *skb, int write_len) 44 { 45 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 46 return 0; 47 48 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 49 } 50 51 /* remove VLAN header from packet and update csum accordingly. */ 52 static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) 53 { 54 struct vlan_hdr *vhdr; 55 int err; 56 57 err = make_writable(skb, VLAN_ETH_HLEN); 58 if (unlikely(err)) 59 return err; 60 61 if (skb->ip_summed == CHECKSUM_COMPLETE) 62 skb->csum = csum_sub(skb->csum, csum_partial(skb->data 63 + (2 * ETH_ALEN), VLAN_HLEN, 0)); 64 65 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 66 *current_tci = vhdr->h_vlan_TCI; 67 68 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 69 __skb_pull(skb, VLAN_HLEN); 70 71 vlan_set_encap_proto(skb, vhdr); 72 skb->mac_header += VLAN_HLEN; 73 skb_reset_mac_len(skb); 74 75 return 0; 76 } 77 78 static int pop_vlan(struct sk_buff *skb) 79 { 80 __be16 tci; 81 int err; 82 83 if (likely(vlan_tx_tag_present(skb))) { 84 skb->vlan_tci = 0; 85 } else { 86 if (unlikely(skb->protocol != htons(ETH_P_8021Q) || 87 skb->len < VLAN_ETH_HLEN)) 88 return 0; 89 90 err = __pop_vlan_tci(skb, &tci); 91 if (err) 92 return err; 93 } 94 /* move next vlan tag to hw accel tag */ 95 if (likely(skb->protocol != htons(ETH_P_8021Q) || 96 skb->len < VLAN_ETH_HLEN)) 97 return 0; 98 99 err = __pop_vlan_tci(skb, &tci); 100 if (unlikely(err)) 101 return err; 102 103 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); 104 return 0; 105 } 106 107 static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) 108 { 109 if (unlikely(vlan_tx_tag_present(skb))) { 110 u16 current_tag; 111 112 /* push down current VLAN tag */ 113 current_tag = vlan_tx_tag_get(skb); 114 115 if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) 116 return -ENOMEM; 117 118 if (skb->ip_summed == CHECKSUM_COMPLETE) 119 skb->csum = csum_add(skb->csum, csum_partial(skb->data 120 + (2 * ETH_ALEN), VLAN_HLEN, 0)); 121 122 } 123 __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); 124 return 0; 125 } 126 127 static int set_eth_addr(struct sk_buff *skb, 128 const struct ovs_key_ethernet *eth_key) 129 { 130 int err; 131 err = make_writable(skb, ETH_HLEN); 132 if (unlikely(err)) 133 return err; 134 135 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); 136 137 ether_addr_copy(eth_hdr(skb)->h_source, eth_key->eth_src); 138 ether_addr_copy(eth_hdr(skb)->h_dest, eth_key->eth_dst); 139 140 ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); 141 142 return 0; 143 } 144 145 static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, 146 __be32 *addr, __be32 new_addr) 147 { 148 int transport_len = skb->len - skb_transport_offset(skb); 149 150 if (nh->protocol == IPPROTO_TCP) { 151 if (likely(transport_len >= sizeof(struct tcphdr))) 152 inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, 153 *addr, new_addr, 1); 154 } else if (nh->protocol == IPPROTO_UDP) { 155 if (likely(transport_len >= sizeof(struct udphdr))) { 156 struct udphdr *uh = udp_hdr(skb); 157 158 if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { 159 inet_proto_csum_replace4(&uh->check, skb, 160 *addr, new_addr, 1); 161 if (!uh->check) 162 uh->check = CSUM_MANGLED_0; 163 } 164 } 165 } 166 167 csum_replace4(&nh->check, *addr, new_addr); 168 skb_clear_hash(skb); 169 *addr = new_addr; 170 } 171 172 static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, 173 __be32 addr[4], const __be32 new_addr[4]) 174 { 175 int transport_len = skb->len - skb_transport_offset(skb); 176 177 if (l4_proto == IPPROTO_TCP) { 178 if (likely(transport_len >= sizeof(struct tcphdr))) 179 inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, 180 addr, new_addr, 1); 181 } else if (l4_proto == IPPROTO_UDP) { 182 if (likely(transport_len >= sizeof(struct udphdr))) { 183 struct udphdr *uh = udp_hdr(skb); 184 185 if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { 186 inet_proto_csum_replace16(&uh->check, skb, 187 addr, new_addr, 1); 188 if (!uh->check) 189 uh->check = CSUM_MANGLED_0; 190 } 191 } 192 } 193 } 194 195 static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, 196 __be32 addr[4], const __be32 new_addr[4], 197 bool recalculate_csum) 198 { 199 if (recalculate_csum) 200 update_ipv6_checksum(skb, l4_proto, addr, new_addr); 201 202 skb_clear_hash(skb); 203 memcpy(addr, new_addr, sizeof(__be32[4])); 204 } 205 206 static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) 207 { 208 nh->priority = tc >> 4; 209 nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); 210 } 211 212 static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) 213 { 214 nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; 215 nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; 216 nh->flow_lbl[2] = fl & 0x000000FF; 217 } 218 219 static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) 220 { 221 csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); 222 nh->ttl = new_ttl; 223 } 224 225 static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) 226 { 227 struct iphdr *nh; 228 int err; 229 230 err = make_writable(skb, skb_network_offset(skb) + 231 sizeof(struct iphdr)); 232 if (unlikely(err)) 233 return err; 234 235 nh = ip_hdr(skb); 236 237 if (ipv4_key->ipv4_src != nh->saddr) 238 set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); 239 240 if (ipv4_key->ipv4_dst != nh->daddr) 241 set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); 242 243 if (ipv4_key->ipv4_tos != nh->tos) 244 ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos); 245 246 if (ipv4_key->ipv4_ttl != nh->ttl) 247 set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); 248 249 return 0; 250 } 251 252 static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) 253 { 254 struct ipv6hdr *nh; 255 int err; 256 __be32 *saddr; 257 __be32 *daddr; 258 259 err = make_writable(skb, skb_network_offset(skb) + 260 sizeof(struct ipv6hdr)); 261 if (unlikely(err)) 262 return err; 263 264 nh = ipv6_hdr(skb); 265 saddr = (__be32 *)&nh->saddr; 266 daddr = (__be32 *)&nh->daddr; 267 268 if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) 269 set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, 270 ipv6_key->ipv6_src, true); 271 272 if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { 273 unsigned int offset = 0; 274 int flags = IP6_FH_F_SKIP_RH; 275 bool recalc_csum = true; 276 277 if (ipv6_ext_hdr(nh->nexthdr)) 278 recalc_csum = ipv6_find_hdr(skb, &offset, 279 NEXTHDR_ROUTING, NULL, 280 &flags) != NEXTHDR_ROUTING; 281 282 set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, 283 ipv6_key->ipv6_dst, recalc_csum); 284 } 285 286 set_ipv6_tc(nh, ipv6_key->ipv6_tclass); 287 set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); 288 nh->hop_limit = ipv6_key->ipv6_hlimit; 289 290 return 0; 291 } 292 293 /* Must follow make_writable() since that can move the skb data. */ 294 static void set_tp_port(struct sk_buff *skb, __be16 *port, 295 __be16 new_port, __sum16 *check) 296 { 297 inet_proto_csum_replace2(check, skb, *port, new_port, 0); 298 *port = new_port; 299 skb_clear_hash(skb); 300 } 301 302 static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) 303 { 304 struct udphdr *uh = udp_hdr(skb); 305 306 if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { 307 set_tp_port(skb, port, new_port, &uh->check); 308 309 if (!uh->check) 310 uh->check = CSUM_MANGLED_0; 311 } else { 312 *port = new_port; 313 skb_clear_hash(skb); 314 } 315 } 316 317 static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) 318 { 319 struct udphdr *uh; 320 int err; 321 322 err = make_writable(skb, skb_transport_offset(skb) + 323 sizeof(struct udphdr)); 324 if (unlikely(err)) 325 return err; 326 327 uh = udp_hdr(skb); 328 if (udp_port_key->udp_src != uh->source) 329 set_udp_port(skb, &uh->source, udp_port_key->udp_src); 330 331 if (udp_port_key->udp_dst != uh->dest) 332 set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); 333 334 return 0; 335 } 336 337 static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) 338 { 339 struct tcphdr *th; 340 int err; 341 342 err = make_writable(skb, skb_transport_offset(skb) + 343 sizeof(struct tcphdr)); 344 if (unlikely(err)) 345 return err; 346 347 th = tcp_hdr(skb); 348 if (tcp_port_key->tcp_src != th->source) 349 set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); 350 351 if (tcp_port_key->tcp_dst != th->dest) 352 set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); 353 354 return 0; 355 } 356 357 static int set_sctp(struct sk_buff *skb, 358 const struct ovs_key_sctp *sctp_port_key) 359 { 360 struct sctphdr *sh; 361 int err; 362 unsigned int sctphoff = skb_transport_offset(skb); 363 364 err = make_writable(skb, sctphoff + sizeof(struct sctphdr)); 365 if (unlikely(err)) 366 return err; 367 368 sh = sctp_hdr(skb); 369 if (sctp_port_key->sctp_src != sh->source || 370 sctp_port_key->sctp_dst != sh->dest) { 371 __le32 old_correct_csum, new_csum, old_csum; 372 373 old_csum = sh->checksum; 374 old_correct_csum = sctp_compute_cksum(skb, sctphoff); 375 376 sh->source = sctp_port_key->sctp_src; 377 sh->dest = sctp_port_key->sctp_dst; 378 379 new_csum = sctp_compute_cksum(skb, sctphoff); 380 381 /* Carry any checksum errors through. */ 382 sh->checksum = old_csum ^ old_correct_csum ^ new_csum; 383 384 skb_clear_hash(skb); 385 } 386 387 return 0; 388 } 389 390 static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) 391 { 392 struct vport *vport; 393 394 if (unlikely(!skb)) 395 return -ENOMEM; 396 397 vport = ovs_vport_rcu(dp, out_port); 398 if (unlikely(!vport)) { 399 kfree_skb(skb); 400 return -ENODEV; 401 } 402 403 ovs_vport_send(vport, skb); 404 return 0; 405 } 406 407 static int output_userspace(struct datapath *dp, struct sk_buff *skb, 408 const struct nlattr *attr) 409 { 410 struct dp_upcall_info upcall; 411 const struct nlattr *a; 412 int rem; 413 414 BUG_ON(!OVS_CB(skb)->pkt_key); 415 416 upcall.cmd = OVS_PACKET_CMD_ACTION; 417 upcall.key = OVS_CB(skb)->pkt_key; 418 upcall.userdata = NULL; 419 upcall.portid = 0; 420 421 for (a = nla_data(attr), rem = nla_len(attr); rem > 0; 422 a = nla_next(a, &rem)) { 423 switch (nla_type(a)) { 424 case OVS_USERSPACE_ATTR_USERDATA: 425 upcall.userdata = a; 426 break; 427 428 case OVS_USERSPACE_ATTR_PID: 429 upcall.portid = nla_get_u32(a); 430 break; 431 } 432 } 433 434 return ovs_dp_upcall(dp, skb, &upcall); 435 } 436 437 static bool last_action(const struct nlattr *a, int rem) 438 { 439 return a->nla_len == rem; 440 } 441 442 static int sample(struct datapath *dp, struct sk_buff *skb, 443 const struct nlattr *attr) 444 { 445 const struct nlattr *acts_list = NULL; 446 const struct nlattr *a; 447 struct sk_buff *sample_skb; 448 int rem; 449 450 for (a = nla_data(attr), rem = nla_len(attr); rem > 0; 451 a = nla_next(a, &rem)) { 452 switch (nla_type(a)) { 453 case OVS_SAMPLE_ATTR_PROBABILITY: 454 if (prandom_u32() >= nla_get_u32(a)) 455 return 0; 456 break; 457 458 case OVS_SAMPLE_ATTR_ACTIONS: 459 acts_list = a; 460 break; 461 } 462 } 463 464 rem = nla_len(acts_list); 465 a = nla_data(acts_list); 466 467 /* Actions list is either empty or only contains a single user-space 468 * action, the latter being a special case as it is the only known 469 * usage of the sample action. 470 * In these special cases don't clone the skb as there are no 471 * side-effects in the nested actions. 472 * Otherwise, clone in case the nested actions have side effects. 473 */ 474 if (likely(rem == 0 || (nla_type(a) == OVS_ACTION_ATTR_USERSPACE && 475 last_action(a, rem)))) { 476 sample_skb = skb; 477 skb_get(skb); 478 } else { 479 sample_skb = skb_clone(skb, GFP_ATOMIC); 480 if (!sample_skb) /* Skip sample action when out of memory. */ 481 return 0; 482 } 483 484 /* Note that do_execute_actions() never consumes skb. 485 * In the case where skb has been cloned above it is the clone that 486 * is consumed. Otherwise the skb_get(skb) call prevents 487 * consumption by do_execute_actions(). Thus, it is safe to simply 488 * return the error code and let the caller (also 489 * do_execute_actions()) free skb on error. 490 */ 491 return do_execute_actions(dp, sample_skb, a, rem); 492 } 493 494 static int execute_set_action(struct sk_buff *skb, 495 const struct nlattr *nested_attr) 496 { 497 int err = 0; 498 499 switch (nla_type(nested_attr)) { 500 case OVS_KEY_ATTR_PRIORITY: 501 skb->priority = nla_get_u32(nested_attr); 502 break; 503 504 case OVS_KEY_ATTR_SKB_MARK: 505 skb->mark = nla_get_u32(nested_attr); 506 break; 507 508 case OVS_KEY_ATTR_IPV4_TUNNEL: 509 OVS_CB(skb)->tun_key = nla_data(nested_attr); 510 break; 511 512 case OVS_KEY_ATTR_ETHERNET: 513 err = set_eth_addr(skb, nla_data(nested_attr)); 514 break; 515 516 case OVS_KEY_ATTR_IPV4: 517 err = set_ipv4(skb, nla_data(nested_attr)); 518 break; 519 520 case OVS_KEY_ATTR_IPV6: 521 err = set_ipv6(skb, nla_data(nested_attr)); 522 break; 523 524 case OVS_KEY_ATTR_TCP: 525 err = set_tcp(skb, nla_data(nested_attr)); 526 break; 527 528 case OVS_KEY_ATTR_UDP: 529 err = set_udp(skb, nla_data(nested_attr)); 530 break; 531 532 case OVS_KEY_ATTR_SCTP: 533 err = set_sctp(skb, nla_data(nested_attr)); 534 break; 535 } 536 537 return err; 538 } 539 540 /* Execute a list of actions against 'skb'. */ 541 static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, 542 const struct nlattr *attr, int len) 543 { 544 /* Every output action needs a separate clone of 'skb', but the common 545 * case is just a single output action, so that doing a clone and 546 * then freeing the original skbuff is wasteful. So the following code 547 * is slightly obscure just to avoid that. */ 548 int prev_port = -1; 549 const struct nlattr *a; 550 int rem; 551 552 for (a = attr, rem = len; rem > 0; 553 a = nla_next(a, &rem)) { 554 int err = 0; 555 556 if (prev_port != -1) { 557 do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); 558 prev_port = -1; 559 } 560 561 switch (nla_type(a)) { 562 case OVS_ACTION_ATTR_OUTPUT: 563 prev_port = nla_get_u32(a); 564 break; 565 566 case OVS_ACTION_ATTR_USERSPACE: 567 output_userspace(dp, skb, a); 568 break; 569 570 case OVS_ACTION_ATTR_PUSH_VLAN: 571 err = push_vlan(skb, nla_data(a)); 572 if (unlikely(err)) /* skb already freed. */ 573 return err; 574 break; 575 576 case OVS_ACTION_ATTR_POP_VLAN: 577 err = pop_vlan(skb); 578 break; 579 580 case OVS_ACTION_ATTR_SET: 581 err = execute_set_action(skb, nla_data(a)); 582 break; 583 584 case OVS_ACTION_ATTR_SAMPLE: 585 err = sample(dp, skb, a); 586 if (unlikely(err)) /* skb already freed. */ 587 return err; 588 break; 589 } 590 591 if (unlikely(err)) { 592 kfree_skb(skb); 593 return err; 594 } 595 } 596 597 if (prev_port != -1) 598 do_output(dp, skb, prev_port); 599 else 600 consume_skb(skb); 601 602 return 0; 603 } 604 605 /* Execute a list of actions against 'skb'. */ 606 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) 607 { 608 struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); 609 610 OVS_CB(skb)->tun_key = NULL; 611 return do_execute_actions(dp, skb, acts->actions, acts->actions_len); 612 } 613