1 /* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the NetFilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Version 1, is capable of handling both version 0 and 1 messages. 9 * Version 0 is the plain old format. 10 * Note Version 0 receivers will just drop Ver 1 messages. 11 * Version 1 is capable of handle IPv6, Persistence data, 12 * time-outs, and firewall marks. 13 * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. 14 * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 15 * 16 * Definitions Message: is a complete datagram 17 * Sync_conn: is a part of a Message 18 * Param Data is an option to a Sync_conn. 19 * 20 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 21 * 22 * ip_vs_sync: sync connection info from master load balancer to backups 23 * through multicast 24 * 25 * Changes: 26 * Alexandre Cassen : Added master & backup support at a time. 27 * Alexandre Cassen : Added SyncID support for incoming sync 28 * messages filtering. 29 * Justin Ossevoort : Fix endian problem on sync message size. 30 * Hans Schillstrom : Added Version 1: i.e. IPv6, 31 * Persistence support, fwmark and time-out. 32 */ 33 34 #define KMSG_COMPONENT "IPVS" 35 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 36 37 #include <linux/module.h> 38 #include <linux/slab.h> 39 #include <linux/inetdevice.h> 40 #include <linux/net.h> 41 #include <linux/completion.h> 42 #include <linux/delay.h> 43 #include <linux/skbuff.h> 44 #include <linux/in.h> 45 #include <linux/igmp.h> /* for ip_mc_join_group */ 46 #include <linux/udp.h> 47 #include <linux/err.h> 48 #include <linux/kthread.h> 49 #include <linux/wait.h> 50 #include <linux/kernel.h> 51 52 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ 53 54 #include <net/ip.h> 55 #include <net/sock.h> 56 57 #include <net/ip_vs.h> 58 59 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 60 #define IP_VS_SYNC_PORT 8848 /* multicast port */ 61 62 #define SYNC_PROTO_VER 1 /* Protocol version in header */ 63 64 static struct lock_class_key __ipvs_sync_key; 65 /* 66 * IPVS sync connection entry 67 * Version 0, i.e. original version. 68 */ 69 struct ip_vs_sync_conn_v0 { 70 __u8 reserved; 71 72 /* Protocol, addresses and port numbers */ 73 __u8 protocol; /* Which protocol (TCP/UDP) */ 74 __be16 cport; 75 __be16 vport; 76 __be16 dport; 77 __be32 caddr; /* client address */ 78 __be32 vaddr; /* virtual address */ 79 __be32 daddr; /* destination address */ 80 81 /* Flags and state transition */ 82 __be16 flags; /* status flags */ 83 __be16 state; /* state info */ 84 85 /* The sequence options start here */ 86 }; 87 88 struct ip_vs_sync_conn_options { 89 struct ip_vs_seq in_seq; /* incoming seq. struct */ 90 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 91 }; 92 93 /* 94 Sync Connection format (sync_conn) 95 96 0 1 2 3 97 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99 | Type | Protocol | Ver. | Size | 100 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101 | Flags | 102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 103 | State | cport | 104 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 105 | vport | dport | 106 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 107 | fwmark | 108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 109 | timeout (in sec.) | 110 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 111 | ... | 112 | IP-Addresses (v4 or v6) | 113 | ... | 114 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 115 Optional Parameters. 116 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 117 | Param. Type | Param. Length | Param. data | 118 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 119 | ... | 120 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 121 | | Param Type | Param. Length | 122 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 123 | Param data | 124 | Last Param data should be padded for 32 bit alignment | 125 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 126 */ 127 128 /* 129 * Type 0, IPv4 sync connection format 130 */ 131 struct ip_vs_sync_v4 { 132 __u8 type; 133 __u8 protocol; /* Which protocol (TCP/UDP) */ 134 __be16 ver_size; /* Version msb 4 bits */ 135 /* Flags and state transition */ 136 __be32 flags; /* status flags */ 137 __be16 state; /* state info */ 138 /* Protocol, addresses and port numbers */ 139 __be16 cport; 140 __be16 vport; 141 __be16 dport; 142 __be32 fwmark; /* Firewall mark from skb */ 143 __be32 timeout; /* cp timeout */ 144 __be32 caddr; /* client address */ 145 __be32 vaddr; /* virtual address */ 146 __be32 daddr; /* destination address */ 147 /* The sequence options start here */ 148 /* PE data padded to 32bit alignment after seq. options */ 149 }; 150 /* 151 * Type 2 messages IPv6 152 */ 153 struct ip_vs_sync_v6 { 154 __u8 type; 155 __u8 protocol; /* Which protocol (TCP/UDP) */ 156 __be16 ver_size; /* Version msb 4 bits */ 157 /* Flags and state transition */ 158 __be32 flags; /* status flags */ 159 __be16 state; /* state info */ 160 /* Protocol, addresses and port numbers */ 161 __be16 cport; 162 __be16 vport; 163 __be16 dport; 164 __be32 fwmark; /* Firewall mark from skb */ 165 __be32 timeout; /* cp timeout */ 166 struct in6_addr caddr; /* client address */ 167 struct in6_addr vaddr; /* virtual address */ 168 struct in6_addr daddr; /* destination address */ 169 /* The sequence options start here */ 170 /* PE data padded to 32bit alignment after seq. options */ 171 }; 172 173 union ip_vs_sync_conn { 174 struct ip_vs_sync_v4 v4; 175 struct ip_vs_sync_v6 v6; 176 }; 177 178 /* Bits in Type field in above */ 179 #define STYPE_INET6 0 180 #define STYPE_F_INET6 (1 << STYPE_INET6) 181 182 #define SVER_SHIFT 12 /* Shift to get version */ 183 #define SVER_MASK 0x0fff /* Mask to strip version */ 184 185 #define IPVS_OPT_SEQ_DATA 1 186 #define IPVS_OPT_PE_DATA 2 187 #define IPVS_OPT_PE_NAME 3 188 #define IPVS_OPT_PARAM 7 189 190 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) 191 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) 192 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) 193 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) 194 195 struct ip_vs_sync_thread_data { 196 struct netns_ipvs *ipvs; 197 struct socket *sock; 198 char *buf; 199 int id; 200 }; 201 202 /* Version 0 definition of packet sizes */ 203 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) 204 #define FULL_CONN_SIZE \ 205 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) 206 207 208 /* 209 The master mulitcasts messages (Datagrams) to the backup load balancers 210 in the following format. 211 212 Version 1: 213 Note, first byte should be Zero, so ver 0 receivers will drop the packet. 214 215 0 1 2 3 216 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 217 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 218 | 0 | SyncID | Size | 219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 220 | Count Conns | Version | Reserved, set to Zero | 221 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 222 | | 223 | IPVS Sync Connection (1) | 224 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 225 | . | 226 ~ . ~ 227 | . | 228 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 229 | | 230 | IPVS Sync Connection (n) | 231 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 232 233 Version 0 Header 234 0 1 2 3 235 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 236 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 237 | Count Conns | SyncID | Size | 238 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 239 | IPVS Sync Connection (1) | 240 */ 241 242 #define SYNC_MESG_HEADER_LEN 4 243 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ 244 245 /* Version 0 header */ 246 struct ip_vs_sync_mesg_v0 { 247 __u8 nr_conns; 248 __u8 syncid; 249 __be16 size; 250 251 /* ip_vs_sync_conn entries start here */ 252 }; 253 254 /* Version 1 header */ 255 struct ip_vs_sync_mesg { 256 __u8 reserved; /* must be zero */ 257 __u8 syncid; 258 __be16 size; 259 __u8 nr_conns; 260 __s8 version; /* SYNC_PROTO_VER */ 261 __u16 spare; 262 /* ip_vs_sync_conn entries start here */ 263 }; 264 265 union ipvs_sockaddr { 266 struct sockaddr_in in; 267 struct sockaddr_in6 in6; 268 }; 269 270 struct ip_vs_sync_buff { 271 struct list_head list; 272 unsigned long firstuse; 273 274 /* pointers for the message data */ 275 struct ip_vs_sync_mesg *mesg; 276 unsigned char *head; 277 unsigned char *end; 278 }; 279 280 /* 281 * Copy of struct ip_vs_seq 282 * From unaligned network order to aligned host order 283 */ 284 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) 285 { 286 ho->init_seq = get_unaligned_be32(&no->init_seq); 287 ho->delta = get_unaligned_be32(&no->delta); 288 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 289 } 290 291 /* 292 * Copy of struct ip_vs_seq 293 * From Aligned host order to unaligned network order 294 */ 295 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 296 { 297 put_unaligned_be32(ho->init_seq, &no->init_seq); 298 put_unaligned_be32(ho->delta, &no->delta); 299 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 300 } 301 302 static inline struct ip_vs_sync_buff * 303 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 304 { 305 struct ip_vs_sync_buff *sb; 306 307 spin_lock_bh(&ipvs->sync_lock); 308 if (list_empty(&ms->sync_queue)) { 309 sb = NULL; 310 __set_current_state(TASK_INTERRUPTIBLE); 311 } else { 312 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 313 list); 314 list_del(&sb->list); 315 ms->sync_queue_len--; 316 if (!ms->sync_queue_len) 317 ms->sync_queue_delay = 0; 318 } 319 spin_unlock_bh(&ipvs->sync_lock); 320 321 return sb; 322 } 323 324 /* 325 * Create a new sync buffer for Version 1 proto. 326 */ 327 static inline struct ip_vs_sync_buff * 328 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 329 { 330 struct ip_vs_sync_buff *sb; 331 332 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 333 return NULL; 334 335 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 336 ipvs->mcfg.sync_maxlen); 337 sb->mesg = kmalloc(len, GFP_ATOMIC); 338 if (!sb->mesg) { 339 kfree(sb); 340 return NULL; 341 } 342 sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ 343 sb->mesg->version = SYNC_PROTO_VER; 344 sb->mesg->syncid = ipvs->mcfg.syncid; 345 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 346 sb->mesg->nr_conns = 0; 347 sb->mesg->spare = 0; 348 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 349 sb->end = (unsigned char *)sb->mesg + len; 350 351 sb->firstuse = jiffies; 352 return sb; 353 } 354 355 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 356 { 357 kfree(sb->mesg); 358 kfree(sb); 359 } 360 361 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 362 struct ipvs_master_sync_state *ms) 363 { 364 struct ip_vs_sync_buff *sb = ms->sync_buff; 365 366 spin_lock(&ipvs->sync_lock); 367 if (ipvs->sync_state & IP_VS_STATE_MASTER && 368 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 369 if (!ms->sync_queue_len) 370 schedule_delayed_work(&ms->master_wakeup_work, 371 max(IPVS_SYNC_SEND_DELAY, 1)); 372 ms->sync_queue_len++; 373 list_add_tail(&sb->list, &ms->sync_queue); 374 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) 375 wake_up_process(ms->master_thread); 376 } else 377 ip_vs_sync_buff_release(sb); 378 spin_unlock(&ipvs->sync_lock); 379 } 380 381 /* 382 * Get the current sync buffer if it has been created for more 383 * than the specified time or the specified time is zero. 384 */ 385 static inline struct ip_vs_sync_buff * 386 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 387 unsigned long time) 388 { 389 struct ip_vs_sync_buff *sb; 390 391 spin_lock_bh(&ipvs->sync_buff_lock); 392 sb = ms->sync_buff; 393 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 394 ms->sync_buff = NULL; 395 __set_current_state(TASK_RUNNING); 396 } else 397 sb = NULL; 398 spin_unlock_bh(&ipvs->sync_buff_lock); 399 return sb; 400 } 401 402 static inline int 403 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 404 { 405 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 406 } 407 408 /* 409 * Create a new sync buffer for Version 0 proto. 410 */ 411 static inline struct ip_vs_sync_buff * 412 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 413 { 414 struct ip_vs_sync_buff *sb; 415 struct ip_vs_sync_mesg_v0 *mesg; 416 417 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 418 return NULL; 419 420 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 421 ipvs->mcfg.sync_maxlen); 422 sb->mesg = kmalloc(len, GFP_ATOMIC); 423 if (!sb->mesg) { 424 kfree(sb); 425 return NULL; 426 } 427 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 428 mesg->nr_conns = 0; 429 mesg->syncid = ipvs->mcfg.syncid; 430 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 431 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 432 sb->end = (unsigned char *)mesg + len; 433 sb->firstuse = jiffies; 434 return sb; 435 } 436 437 /* Check if connection is controlled by persistence */ 438 static inline bool in_persistence(struct ip_vs_conn *cp) 439 { 440 for (cp = cp->control; cp; cp = cp->control) { 441 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 442 return true; 443 } 444 return false; 445 } 446 447 /* Check if conn should be synced. 448 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 449 * - (1) sync_refresh_period: reduce sync rate. Additionally, retry 450 * sync_retries times with period of sync_refresh_period/8 451 * - (2) if both sync_refresh_period and sync_period are 0 send sync only 452 * for state changes or only once when pkts matches sync_threshold 453 * - (3) templates: rate can be reduced only with sync_refresh_period or 454 * with (2) 455 */ 456 static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, 457 struct ip_vs_conn *cp, int pkts) 458 { 459 unsigned long orig = ACCESS_ONCE(cp->sync_endtime); 460 unsigned long now = jiffies; 461 unsigned long n = (now + cp->timeout) & ~3UL; 462 unsigned int sync_refresh_period; 463 int sync_period; 464 int force; 465 466 /* Check if we sync in current state */ 467 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) 468 force = 0; 469 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) 470 return 0; 471 else if (likely(cp->protocol == IPPROTO_TCP)) { 472 if (!((1 << cp->state) & 473 ((1 << IP_VS_TCP_S_ESTABLISHED) | 474 (1 << IP_VS_TCP_S_FIN_WAIT) | 475 (1 << IP_VS_TCP_S_CLOSE) | 476 (1 << IP_VS_TCP_S_CLOSE_WAIT) | 477 (1 << IP_VS_TCP_S_TIME_WAIT)))) 478 return 0; 479 force = cp->state != cp->old_state; 480 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) 481 goto set; 482 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { 483 if (!((1 << cp->state) & 484 ((1 << IP_VS_SCTP_S_ESTABLISHED) | 485 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | 486 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | 487 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | 488 (1 << IP_VS_SCTP_S_CLOSED)))) 489 return 0; 490 force = cp->state != cp->old_state; 491 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) 492 goto set; 493 } else { 494 /* UDP or another protocol with single state */ 495 force = 0; 496 } 497 498 sync_refresh_period = sysctl_sync_refresh_period(ipvs); 499 if (sync_refresh_period > 0) { 500 long diff = n - orig; 501 long min_diff = max(cp->timeout >> 1, 10UL * HZ); 502 503 /* Avoid sync if difference is below sync_refresh_period 504 * and below the half timeout. 505 */ 506 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { 507 int retries = orig & 3; 508 509 if (retries >= sysctl_sync_retries(ipvs)) 510 return 0; 511 if (time_before(now, orig - cp->timeout + 512 (sync_refresh_period >> 3))) 513 return 0; 514 n |= retries + 1; 515 } 516 } 517 sync_period = sysctl_sync_period(ipvs); 518 if (sync_period > 0) { 519 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && 520 pkts % sync_period != sysctl_sync_threshold(ipvs)) 521 return 0; 522 } else if (sync_refresh_period <= 0 && 523 pkts != sysctl_sync_threshold(ipvs)) 524 return 0; 525 526 set: 527 cp->old_state = cp->state; 528 n = cmpxchg(&cp->sync_endtime, orig, n); 529 return n == orig || force; 530 } 531 532 /* 533 * Version 0 , could be switched in by sys_ctl. 534 * Add an ip_vs_conn information into the current sync_buff. 535 */ 536 static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, 537 int pkts) 538 { 539 struct ip_vs_sync_mesg_v0 *m; 540 struct ip_vs_sync_conn_v0 *s; 541 struct ip_vs_sync_buff *buff; 542 struct ipvs_master_sync_state *ms; 543 int id; 544 unsigned int len; 545 546 if (unlikely(cp->af != AF_INET)) 547 return; 548 /* Do not sync ONE PACKET */ 549 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 550 return; 551 552 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 553 return; 554 555 spin_lock_bh(&ipvs->sync_buff_lock); 556 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 557 spin_unlock_bh(&ipvs->sync_buff_lock); 558 return; 559 } 560 561 id = select_master_thread_id(ipvs, cp); 562 ms = &ipvs->ms[id]; 563 buff = ms->sync_buff; 564 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 565 SIMPLE_CONN_SIZE; 566 if (buff) { 567 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 568 /* Send buffer if it is for v1 */ 569 if (buff->head + len > buff->end || !m->nr_conns) { 570 sb_queue_tail(ipvs, ms); 571 ms->sync_buff = NULL; 572 buff = NULL; 573 } 574 } 575 if (!buff) { 576 buff = ip_vs_sync_buff_create_v0(ipvs, len); 577 if (!buff) { 578 spin_unlock_bh(&ipvs->sync_buff_lock); 579 pr_err("ip_vs_sync_buff_create failed.\n"); 580 return; 581 } 582 ms->sync_buff = buff; 583 } 584 585 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 586 s = (struct ip_vs_sync_conn_v0 *) buff->head; 587 588 /* copy members */ 589 s->reserved = 0; 590 s->protocol = cp->protocol; 591 s->cport = cp->cport; 592 s->vport = cp->vport; 593 s->dport = cp->dport; 594 s->caddr = cp->caddr.ip; 595 s->vaddr = cp->vaddr.ip; 596 s->daddr = cp->daddr.ip; 597 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 598 s->state = htons(cp->state); 599 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 600 struct ip_vs_sync_conn_options *opt = 601 (struct ip_vs_sync_conn_options *)&s[1]; 602 memcpy(opt, &cp->in_seq, sizeof(*opt)); 603 } 604 605 m->nr_conns++; 606 m->size = htons(ntohs(m->size) + len); 607 buff->head += len; 608 spin_unlock_bh(&ipvs->sync_buff_lock); 609 610 /* synchronize its controller if it has */ 611 cp = cp->control; 612 if (cp) { 613 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 614 pkts = atomic_add_return(1, &cp->in_pkts); 615 else 616 pkts = sysctl_sync_threshold(ipvs); 617 ip_vs_sync_conn(ipvs, cp, pkts); 618 } 619 } 620 621 /* 622 * Add an ip_vs_conn information into the current sync_buff. 623 * Called by ip_vs_in. 624 * Sending Version 1 messages 625 */ 626 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 627 { 628 struct ip_vs_sync_mesg *m; 629 union ip_vs_sync_conn *s; 630 struct ip_vs_sync_buff *buff; 631 struct ipvs_master_sync_state *ms; 632 int id; 633 __u8 *p; 634 unsigned int len, pe_name_len, pad; 635 636 /* Handle old version of the protocol */ 637 if (sysctl_sync_ver(ipvs) == 0) { 638 ip_vs_sync_conn_v0(ipvs, cp, pkts); 639 return; 640 } 641 /* Do not sync ONE PACKET */ 642 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 643 goto control; 644 sloop: 645 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 646 goto control; 647 648 /* Sanity checks */ 649 pe_name_len = 0; 650 if (cp->pe_data_len) { 651 if (!cp->pe_data || !cp->dest) { 652 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 653 return; 654 } 655 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 656 } 657 658 spin_lock_bh(&ipvs->sync_buff_lock); 659 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 660 spin_unlock_bh(&ipvs->sync_buff_lock); 661 return; 662 } 663 664 id = select_master_thread_id(ipvs, cp); 665 ms = &ipvs->ms[id]; 666 667 #ifdef CONFIG_IP_VS_IPV6 668 if (cp->af == AF_INET6) 669 len = sizeof(struct ip_vs_sync_v6); 670 else 671 #endif 672 len = sizeof(struct ip_vs_sync_v4); 673 674 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 675 len += sizeof(struct ip_vs_sync_conn_options) + 2; 676 677 if (cp->pe_data_len) 678 len += cp->pe_data_len + 2; /* + Param hdr field */ 679 if (pe_name_len) 680 len += pe_name_len + 2; 681 682 /* check if there is a space for this one */ 683 pad = 0; 684 buff = ms->sync_buff; 685 if (buff) { 686 m = buff->mesg; 687 pad = (4 - (size_t) buff->head) & 3; 688 /* Send buffer if it is for v0 */ 689 if (buff->head + len + pad > buff->end || m->reserved) { 690 sb_queue_tail(ipvs, ms); 691 ms->sync_buff = NULL; 692 buff = NULL; 693 pad = 0; 694 } 695 } 696 697 if (!buff) { 698 buff = ip_vs_sync_buff_create(ipvs, len); 699 if (!buff) { 700 spin_unlock_bh(&ipvs->sync_buff_lock); 701 pr_err("ip_vs_sync_buff_create failed.\n"); 702 return; 703 } 704 ms->sync_buff = buff; 705 m = buff->mesg; 706 } 707 708 p = buff->head; 709 buff->head += pad + len; 710 m->size = htons(ntohs(m->size) + pad + len); 711 /* Add ev. padding from prev. sync_conn */ 712 while (pad--) 713 *(p++) = 0; 714 715 s = (union ip_vs_sync_conn *)p; 716 717 /* Set message type & copy members */ 718 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 719 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 720 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 721 s->v4.state = htons(cp->state); 722 s->v4.protocol = cp->protocol; 723 s->v4.cport = cp->cport; 724 s->v4.vport = cp->vport; 725 s->v4.dport = cp->dport; 726 s->v4.fwmark = htonl(cp->fwmark); 727 s->v4.timeout = htonl(cp->timeout / HZ); 728 m->nr_conns++; 729 730 #ifdef CONFIG_IP_VS_IPV6 731 if (cp->af == AF_INET6) { 732 p += sizeof(struct ip_vs_sync_v6); 733 s->v6.caddr = cp->caddr.in6; 734 s->v6.vaddr = cp->vaddr.in6; 735 s->v6.daddr = cp->daddr.in6; 736 } else 737 #endif 738 { 739 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 740 s->v4.caddr = cp->caddr.ip; 741 s->v4.vaddr = cp->vaddr.ip; 742 s->v4.daddr = cp->daddr.ip; 743 } 744 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 745 *(p++) = IPVS_OPT_SEQ_DATA; 746 *(p++) = sizeof(struct ip_vs_sync_conn_options); 747 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 748 p += sizeof(struct ip_vs_seq); 749 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 750 p += sizeof(struct ip_vs_seq); 751 } 752 /* Handle pe data */ 753 if (cp->pe_data_len && cp->pe_data) { 754 *(p++) = IPVS_OPT_PE_DATA; 755 *(p++) = cp->pe_data_len; 756 memcpy(p, cp->pe_data, cp->pe_data_len); 757 p += cp->pe_data_len; 758 if (pe_name_len) { 759 /* Add PE_NAME */ 760 *(p++) = IPVS_OPT_PE_NAME; 761 *(p++) = pe_name_len; 762 memcpy(p, cp->pe->name, pe_name_len); 763 p += pe_name_len; 764 } 765 } 766 767 spin_unlock_bh(&ipvs->sync_buff_lock); 768 769 control: 770 /* synchronize its controller if it has */ 771 cp = cp->control; 772 if (!cp) 773 return; 774 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 775 pkts = atomic_add_return(1, &cp->in_pkts); 776 else 777 pkts = sysctl_sync_threshold(ipvs); 778 goto sloop; 779 } 780 781 /* 782 * fill_param used by version 1 783 */ 784 static inline int 785 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 786 struct ip_vs_conn_param *p, 787 __u8 *pe_data, unsigned int pe_data_len, 788 __u8 *pe_name, unsigned int pe_name_len) 789 { 790 #ifdef CONFIG_IP_VS_IPV6 791 if (af == AF_INET6) 792 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 793 (const union nf_inet_addr *)&sc->v6.caddr, 794 sc->v6.cport, 795 (const union nf_inet_addr *)&sc->v6.vaddr, 796 sc->v6.vport, p); 797 else 798 #endif 799 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 800 (const union nf_inet_addr *)&sc->v4.caddr, 801 sc->v4.cport, 802 (const union nf_inet_addr *)&sc->v4.vaddr, 803 sc->v4.vport, p); 804 /* Handle pe data */ 805 if (pe_data_len) { 806 if (pe_name_len) { 807 char buff[IP_VS_PENAME_MAXLEN+1]; 808 809 memcpy(buff, pe_name, pe_name_len); 810 buff[pe_name_len]=0; 811 p->pe = __ip_vs_pe_getbyname(buff); 812 if (!p->pe) { 813 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 814 buff); 815 return 1; 816 } 817 } else { 818 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 819 return 1; 820 } 821 822 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 823 if (!p->pe_data) { 824 module_put(p->pe->module); 825 return -ENOMEM; 826 } 827 p->pe_data_len = pe_data_len; 828 } 829 return 0; 830 } 831 832 /* 833 * Connection Add / Update. 834 * Common for version 0 and 1 reception of backup sync_conns. 835 * Param: ... 836 * timeout is in sec. 837 */ 838 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 839 unsigned int flags, unsigned int state, 840 unsigned int protocol, unsigned int type, 841 const union nf_inet_addr *daddr, __be16 dport, 842 unsigned long timeout, __u32 fwmark, 843 struct ip_vs_sync_conn_options *opt) 844 { 845 struct ip_vs_dest *dest; 846 struct ip_vs_conn *cp; 847 848 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 849 cp = ip_vs_conn_in_get(param); 850 if (cp && ((cp->dport != dport) || 851 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 852 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 853 ip_vs_conn_expire_now(cp); 854 __ip_vs_conn_put(cp); 855 cp = NULL; 856 } else { 857 /* This is the expiration message for the 858 * connection that was already replaced, so we 859 * just ignore it. 860 */ 861 __ip_vs_conn_put(cp); 862 kfree(param->pe_data); 863 return; 864 } 865 } 866 } else { 867 cp = ip_vs_ct_in_get(param); 868 } 869 870 if (cp) { 871 /* Free pe_data */ 872 kfree(param->pe_data); 873 874 dest = cp->dest; 875 spin_lock_bh(&cp->lock); 876 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 877 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 878 if (flags & IP_VS_CONN_F_INACTIVE) { 879 atomic_dec(&dest->activeconns); 880 atomic_inc(&dest->inactconns); 881 } else { 882 atomic_inc(&dest->activeconns); 883 atomic_dec(&dest->inactconns); 884 } 885 } 886 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 887 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 888 cp->flags = flags; 889 spin_unlock_bh(&cp->lock); 890 if (!dest) 891 ip_vs_try_bind_dest(cp); 892 } else { 893 /* 894 * Find the appropriate destination for the connection. 895 * If it is not found the connection will remain unbound 896 * but still handled. 897 */ 898 rcu_read_lock(); 899 /* This function is only invoked by the synchronization 900 * code. We do not currently support heterogeneous pools 901 * with synchronization, so we can make the assumption that 902 * the svc_af is the same as the dest_af 903 */ 904 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 905 param->vaddr, param->vport, protocol, 906 fwmark, flags); 907 908 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 909 fwmark); 910 rcu_read_unlock(); 911 if (!cp) { 912 kfree(param->pe_data); 913 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 914 return; 915 } 916 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 917 kfree(param->pe_data); 918 } 919 920 if (opt) 921 memcpy(&cp->in_seq, opt, sizeof(*opt)); 922 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 923 cp->state = state; 924 cp->old_state = cp->state; 925 /* 926 * For Ver 0 messages style 927 * - Not possible to recover the right timeout for templates 928 * - can not find the right fwmark 929 * virtual service. If needed, we can do it for 930 * non-fwmark persistent services. 931 * Ver 1 messages style. 932 * - No problem. 933 */ 934 if (timeout) { 935 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 936 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 937 cp->timeout = timeout*HZ; 938 } else { 939 struct ip_vs_proto_data *pd; 940 941 pd = ip_vs_proto_data_get(ipvs, protocol); 942 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 943 cp->timeout = pd->timeout_table[state]; 944 else 945 cp->timeout = (3*60*HZ); 946 } 947 ip_vs_conn_put(cp); 948 } 949 950 /* 951 * Process received multicast message for Version 0 952 */ 953 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 954 const size_t buflen) 955 { 956 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 957 struct ip_vs_sync_conn_v0 *s; 958 struct ip_vs_sync_conn_options *opt; 959 struct ip_vs_protocol *pp; 960 struct ip_vs_conn_param param; 961 char *p; 962 int i; 963 964 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 965 for (i=0; i<m->nr_conns; i++) { 966 unsigned int flags, state; 967 968 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 969 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 970 return; 971 } 972 s = (struct ip_vs_sync_conn_v0 *) p; 973 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 974 flags &= ~IP_VS_CONN_F_HASHED; 975 if (flags & IP_VS_CONN_F_SEQ_MASK) { 976 opt = (struct ip_vs_sync_conn_options *)&s[1]; 977 p += FULL_CONN_SIZE; 978 if (p > buffer+buflen) { 979 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 980 return; 981 } 982 } else { 983 opt = NULL; 984 p += SIMPLE_CONN_SIZE; 985 } 986 987 state = ntohs(s->state); 988 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 989 pp = ip_vs_proto_get(s->protocol); 990 if (!pp) { 991 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 992 s->protocol); 993 continue; 994 } 995 if (state >= pp->num_states) { 996 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 997 pp->name, state); 998 continue; 999 } 1000 } else { 1001 /* protocol in templates is not used for state/timeout */ 1002 if (state > 0) { 1003 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", 1004 state); 1005 state = 0; 1006 } 1007 } 1008 1009 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1010 (const union nf_inet_addr *)&s->caddr, 1011 s->cport, 1012 (const union nf_inet_addr *)&s->vaddr, 1013 s->vport, ¶m); 1014 1015 /* Send timeout as Zero */ 1016 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1017 (union nf_inet_addr *)&s->daddr, s->dport, 1018 0, 0, opt); 1019 } 1020 } 1021 1022 /* 1023 * Handle options 1024 */ 1025 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1026 __u32 *opt_flags, 1027 struct ip_vs_sync_conn_options *opt) 1028 { 1029 struct ip_vs_sync_conn_options *topt; 1030 1031 topt = (struct ip_vs_sync_conn_options *)p; 1032 1033 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1034 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1035 return -EINVAL; 1036 } 1037 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1038 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1039 return -EINVAL; 1040 } 1041 ntoh_seq(&topt->in_seq, &opt->in_seq); 1042 ntoh_seq(&topt->out_seq, &opt->out_seq); 1043 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1044 return 0; 1045 } 1046 1047 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1048 __u8 **data, unsigned int maxlen, 1049 __u32 *opt_flags, __u32 flag) 1050 { 1051 if (plen > maxlen) { 1052 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1053 return -EINVAL; 1054 } 1055 if (*opt_flags & flag) { 1056 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1057 return -EINVAL; 1058 } 1059 *data_len = plen; 1060 *data = p; 1061 *opt_flags |= flag; 1062 return 0; 1063 } 1064 /* 1065 * Process a Version 1 sync. connection 1066 */ 1067 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1068 { 1069 struct ip_vs_sync_conn_options opt; 1070 union ip_vs_sync_conn *s; 1071 struct ip_vs_protocol *pp; 1072 struct ip_vs_conn_param param; 1073 __u32 flags; 1074 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1075 __u8 *pe_data=NULL, *pe_name=NULL; 1076 __u32 opt_flags=0; 1077 int retc=0; 1078 1079 s = (union ip_vs_sync_conn *) p; 1080 1081 if (s->v6.type & STYPE_F_INET6) { 1082 #ifdef CONFIG_IP_VS_IPV6 1083 af = AF_INET6; 1084 p += sizeof(struct ip_vs_sync_v6); 1085 #else 1086 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1087 retc = 10; 1088 goto out; 1089 #endif 1090 } else if (!s->v4.type) { 1091 af = AF_INET; 1092 p += sizeof(struct ip_vs_sync_v4); 1093 } else { 1094 return -10; 1095 } 1096 if (p > msg_end) 1097 return -20; 1098 1099 /* Process optional params check Type & Len. */ 1100 while (p < msg_end) { 1101 int ptype; 1102 int plen; 1103 1104 if (p+2 > msg_end) 1105 return -30; 1106 ptype = *(p++); 1107 plen = *(p++); 1108 1109 if (!plen || ((p + plen) > msg_end)) 1110 return -40; 1111 /* Handle seq option p = param data */ 1112 switch (ptype & ~IPVS_OPT_F_PARAM) { 1113 case IPVS_OPT_SEQ_DATA: 1114 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1115 return -50; 1116 break; 1117 1118 case IPVS_OPT_PE_DATA: 1119 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1120 IP_VS_PEDATA_MAXLEN, &opt_flags, 1121 IPVS_OPT_F_PE_DATA)) 1122 return -60; 1123 break; 1124 1125 case IPVS_OPT_PE_NAME: 1126 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1127 IP_VS_PENAME_MAXLEN, &opt_flags, 1128 IPVS_OPT_F_PE_NAME)) 1129 return -70; 1130 break; 1131 1132 default: 1133 /* Param data mandatory ? */ 1134 if (!(ptype & IPVS_OPT_F_PARAM)) { 1135 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1136 ptype & ~IPVS_OPT_F_PARAM); 1137 retc = 20; 1138 goto out; 1139 } 1140 } 1141 p += plen; /* Next option */ 1142 } 1143 1144 /* Get flags and Mask off unsupported */ 1145 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1146 flags |= IP_VS_CONN_F_SYNC; 1147 state = ntohs(s->v4.state); 1148 1149 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1150 pp = ip_vs_proto_get(s->v4.protocol); 1151 if (!pp) { 1152 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1153 s->v4.protocol); 1154 retc = 30; 1155 goto out; 1156 } 1157 if (state >= pp->num_states) { 1158 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1159 pp->name, state); 1160 retc = 40; 1161 goto out; 1162 } 1163 } else { 1164 /* protocol in templates is not used for state/timeout */ 1165 if (state > 0) { 1166 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", 1167 state); 1168 state = 0; 1169 } 1170 } 1171 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1172 pe_data_len, pe_name, pe_name_len)) { 1173 retc = 50; 1174 goto out; 1175 } 1176 /* If only IPv4, just silent skip IPv6 */ 1177 if (af == AF_INET) 1178 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1179 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1180 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1181 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1182 ); 1183 #ifdef CONFIG_IP_VS_IPV6 1184 else 1185 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1186 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1187 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1188 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1189 ); 1190 #endif 1191 ip_vs_pe_put(param.pe); 1192 return 0; 1193 /* Error exit */ 1194 out: 1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1196 return retc; 1197 1198 } 1199 /* 1200 * Process received multicast message and create the corresponding 1201 * ip_vs_conn entries. 1202 * Handles Version 0 & 1 1203 */ 1204 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1205 const size_t buflen) 1206 { 1207 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1208 __u8 *p, *msg_end; 1209 int i, nr_conns; 1210 1211 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1212 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1213 return; 1214 } 1215 1216 if (buflen != ntohs(m2->size)) { 1217 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1218 return; 1219 } 1220 /* SyncID sanity check */ 1221 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1222 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1223 return; 1224 } 1225 /* Handle version 1 message */ 1226 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1227 && (m2->spare == 0)) { 1228 1229 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1230 nr_conns = m2->nr_conns; 1231 1232 for (i=0; i<nr_conns; i++) { 1233 union ip_vs_sync_conn *s; 1234 unsigned int size; 1235 int retc; 1236 1237 p = msg_end; 1238 if (p + sizeof(s->v4) > buffer+buflen) { 1239 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); 1240 return; 1241 } 1242 s = (union ip_vs_sync_conn *)p; 1243 size = ntohs(s->v4.ver_size) & SVER_MASK; 1244 msg_end = p + size; 1245 /* Basic sanity checks */ 1246 if (msg_end > buffer+buflen) { 1247 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1248 return; 1249 } 1250 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1252 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1253 return; 1254 } 1255 /* Process a single sync_conn */ 1256 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1257 if (retc < 0) { 1258 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1259 retc); 1260 return; 1261 } 1262 /* Make sure we have 32 bit alignment */ 1263 msg_end = p + ((size + 3) & ~3); 1264 } 1265 } else { 1266 /* Old type of message */ 1267 ip_vs_process_message_v0(ipvs, buffer, buflen); 1268 return; 1269 } 1270 } 1271 1272 1273 /* 1274 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1275 */ 1276 static void set_sock_size(struct sock *sk, int mode, int val) 1277 { 1278 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1279 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1280 lock_sock(sk); 1281 if (mode) { 1282 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1283 sysctl_wmem_max); 1284 sk->sk_sndbuf = val * 2; 1285 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1286 } else { 1287 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1288 sysctl_rmem_max); 1289 sk->sk_rcvbuf = val * 2; 1290 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1291 } 1292 release_sock(sk); 1293 } 1294 1295 /* 1296 * Setup loopback of outgoing multicasts on a sending socket 1297 */ 1298 static void set_mcast_loop(struct sock *sk, u_char loop) 1299 { 1300 struct inet_sock *inet = inet_sk(sk); 1301 1302 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1303 lock_sock(sk); 1304 inet->mc_loop = loop ? 1 : 0; 1305 #ifdef CONFIG_IP_VS_IPV6 1306 if (sk->sk_family == AF_INET6) { 1307 struct ipv6_pinfo *np = inet6_sk(sk); 1308 1309 /* IPV6_MULTICAST_LOOP */ 1310 np->mc_loop = loop ? 1 : 0; 1311 } 1312 #endif 1313 release_sock(sk); 1314 } 1315 1316 /* 1317 * Specify TTL for outgoing multicasts on a sending socket 1318 */ 1319 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1320 { 1321 struct inet_sock *inet = inet_sk(sk); 1322 1323 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1324 lock_sock(sk); 1325 inet->mc_ttl = ttl; 1326 #ifdef CONFIG_IP_VS_IPV6 1327 if (sk->sk_family == AF_INET6) { 1328 struct ipv6_pinfo *np = inet6_sk(sk); 1329 1330 /* IPV6_MULTICAST_HOPS */ 1331 np->mcast_hops = ttl; 1332 } 1333 #endif 1334 release_sock(sk); 1335 } 1336 1337 /* Control fragmentation of messages */ 1338 static void set_mcast_pmtudisc(struct sock *sk, int val) 1339 { 1340 struct inet_sock *inet = inet_sk(sk); 1341 1342 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1343 lock_sock(sk); 1344 inet->pmtudisc = val; 1345 #ifdef CONFIG_IP_VS_IPV6 1346 if (sk->sk_family == AF_INET6) { 1347 struct ipv6_pinfo *np = inet6_sk(sk); 1348 1349 /* IPV6_MTU_DISCOVER */ 1350 np->pmtudisc = val; 1351 } 1352 #endif 1353 release_sock(sk); 1354 } 1355 1356 /* 1357 * Specifiy default interface for outgoing multicasts 1358 */ 1359 static int set_mcast_if(struct sock *sk, char *ifname) 1360 { 1361 struct net_device *dev; 1362 struct inet_sock *inet = inet_sk(sk); 1363 struct net *net = sock_net(sk); 1364 1365 dev = __dev_get_by_name(net, ifname); 1366 if (!dev) 1367 return -ENODEV; 1368 1369 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1370 return -EINVAL; 1371 1372 lock_sock(sk); 1373 inet->mc_index = dev->ifindex; 1374 /* inet->mc_addr = 0; */ 1375 #ifdef CONFIG_IP_VS_IPV6 1376 if (sk->sk_family == AF_INET6) { 1377 struct ipv6_pinfo *np = inet6_sk(sk); 1378 1379 /* IPV6_MULTICAST_IF */ 1380 np->mcast_oif = dev->ifindex; 1381 } 1382 #endif 1383 release_sock(sk); 1384 1385 return 0; 1386 } 1387 1388 1389 /* 1390 * Join a multicast group. 1391 * the group is specified by a class D multicast address 224.0.0.0/8 1392 * in the in_addr structure passed in as a parameter. 1393 */ 1394 static int 1395 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) 1396 { 1397 struct net *net = sock_net(sk); 1398 struct ip_mreqn mreq; 1399 struct net_device *dev; 1400 int ret; 1401 1402 memset(&mreq, 0, sizeof(mreq)); 1403 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1404 1405 dev = __dev_get_by_name(net, ifname); 1406 if (!dev) 1407 return -ENODEV; 1408 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1409 return -EINVAL; 1410 1411 mreq.imr_ifindex = dev->ifindex; 1412 1413 lock_sock(sk); 1414 ret = ip_mc_join_group(sk, &mreq); 1415 release_sock(sk); 1416 1417 return ret; 1418 } 1419 1420 #ifdef CONFIG_IP_VS_IPV6 1421 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1422 char *ifname) 1423 { 1424 struct net *net = sock_net(sk); 1425 struct net_device *dev; 1426 int ret; 1427 1428 dev = __dev_get_by_name(net, ifname); 1429 if (!dev) 1430 return -ENODEV; 1431 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1432 return -EINVAL; 1433 1434 lock_sock(sk); 1435 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1436 release_sock(sk); 1437 1438 return ret; 1439 } 1440 #endif 1441 1442 static int bind_mcastif_addr(struct socket *sock, char *ifname) 1443 { 1444 struct net *net = sock_net(sock->sk); 1445 struct net_device *dev; 1446 __be32 addr; 1447 struct sockaddr_in sin; 1448 1449 dev = __dev_get_by_name(net, ifname); 1450 if (!dev) 1451 return -ENODEV; 1452 1453 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1454 if (!addr) 1455 pr_err("You probably need to specify IP address on " 1456 "multicast interface.\n"); 1457 1458 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1459 ifname, &addr); 1460 1461 /* Now bind the socket with the address of multicast interface */ 1462 sin.sin_family = AF_INET; 1463 sin.sin_addr.s_addr = addr; 1464 sin.sin_port = 0; 1465 1466 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1467 } 1468 1469 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1470 struct ipvs_sync_daemon_cfg *c, int id) 1471 { 1472 if (AF_INET6 == c->mcast_af) { 1473 sa->in6 = (struct sockaddr_in6) { 1474 .sin6_family = AF_INET6, 1475 .sin6_port = htons(c->mcast_port + id), 1476 }; 1477 sa->in6.sin6_addr = c->mcast_group.in6; 1478 *salen = sizeof(sa->in6); 1479 } else { 1480 sa->in = (struct sockaddr_in) { 1481 .sin_family = AF_INET, 1482 .sin_port = htons(c->mcast_port + id), 1483 }; 1484 sa->in.sin_addr = c->mcast_group.in; 1485 *salen = sizeof(sa->in); 1486 } 1487 } 1488 1489 /* 1490 * Set up sending multicast socket over UDP 1491 */ 1492 static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) 1493 { 1494 /* multicast addr */ 1495 union ipvs_sockaddr mcast_addr; 1496 struct socket *sock; 1497 int result, salen; 1498 1499 /* First create a socket */ 1500 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1501 IPPROTO_UDP, &sock); 1502 if (result < 0) { 1503 pr_err("Error during creation of socket; terminating\n"); 1504 return ERR_PTR(result); 1505 } 1506 result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); 1507 if (result < 0) { 1508 pr_err("Error setting outbound mcast interface\n"); 1509 goto error; 1510 } 1511 1512 set_mcast_loop(sock->sk, 0); 1513 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1514 /* Allow fragmentation if MTU changes */ 1515 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1516 result = sysctl_sync_sock_size(ipvs); 1517 if (result > 0) 1518 set_sock_size(sock->sk, 1, result); 1519 1520 if (AF_INET == ipvs->mcfg.mcast_af) 1521 result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); 1522 else 1523 result = 0; 1524 if (result < 0) { 1525 pr_err("Error binding address of the mcast interface\n"); 1526 goto error; 1527 } 1528 1529 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1530 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1531 salen, 0); 1532 if (result < 0) { 1533 pr_err("Error connecting to the multicast addr\n"); 1534 goto error; 1535 } 1536 1537 return sock; 1538 1539 error: 1540 sock_release(sock); 1541 return ERR_PTR(result); 1542 } 1543 1544 1545 /* 1546 * Set up receiving multicast socket over UDP 1547 */ 1548 static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, 1549 int ifindex) 1550 { 1551 /* multicast addr */ 1552 union ipvs_sockaddr mcast_addr; 1553 struct socket *sock; 1554 int result, salen; 1555 1556 /* First create a socket */ 1557 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1558 IPPROTO_UDP, &sock); 1559 if (result < 0) { 1560 pr_err("Error during creation of socket; terminating\n"); 1561 return ERR_PTR(result); 1562 } 1563 /* it is equivalent to the REUSEADDR option in user-space */ 1564 sock->sk->sk_reuse = SK_CAN_REUSE; 1565 result = sysctl_sync_sock_size(ipvs); 1566 if (result > 0) 1567 set_sock_size(sock->sk, 0, result); 1568 1569 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1570 sock->sk->sk_bound_dev_if = ifindex; 1571 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1572 if (result < 0) { 1573 pr_err("Error binding to the multicast addr\n"); 1574 goto error; 1575 } 1576 1577 /* join the multicast group */ 1578 #ifdef CONFIG_IP_VS_IPV6 1579 if (ipvs->bcfg.mcast_af == AF_INET6) 1580 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1581 ipvs->bcfg.mcast_ifn); 1582 else 1583 #endif 1584 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1585 ipvs->bcfg.mcast_ifn); 1586 if (result < 0) { 1587 pr_err("Error joining to the multicast group\n"); 1588 goto error; 1589 } 1590 1591 return sock; 1592 1593 error: 1594 sock_release(sock); 1595 return ERR_PTR(result); 1596 } 1597 1598 1599 static int 1600 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1601 { 1602 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1603 struct kvec iov; 1604 int len; 1605 1606 EnterFunction(7); 1607 iov.iov_base = (void *)buffer; 1608 iov.iov_len = length; 1609 1610 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1611 1612 LeaveFunction(7); 1613 return len; 1614 } 1615 1616 static int 1617 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1618 { 1619 int msize; 1620 int ret; 1621 1622 msize = ntohs(msg->size); 1623 1624 ret = ip_vs_send_async(sock, (char *)msg, msize); 1625 if (ret >= 0 || ret == -EAGAIN) 1626 return ret; 1627 pr_err("ip_vs_send_async error %d\n", ret); 1628 return 0; 1629 } 1630 1631 static int 1632 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1633 { 1634 struct msghdr msg = {NULL,}; 1635 struct kvec iov; 1636 int len; 1637 1638 EnterFunction(7); 1639 1640 /* Receive a packet */ 1641 iov.iov_base = buffer; 1642 iov.iov_len = (size_t)buflen; 1643 1644 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); 1645 1646 if (len < 0) 1647 return len; 1648 1649 LeaveFunction(7); 1650 return len; 1651 } 1652 1653 /* Wakeup the master thread for sending */ 1654 static void master_wakeup_work_handler(struct work_struct *work) 1655 { 1656 struct ipvs_master_sync_state *ms = 1657 container_of(work, struct ipvs_master_sync_state, 1658 master_wakeup_work.work); 1659 struct netns_ipvs *ipvs = ms->ipvs; 1660 1661 spin_lock_bh(&ipvs->sync_lock); 1662 if (ms->sync_queue_len && 1663 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1664 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1665 wake_up_process(ms->master_thread); 1666 } 1667 spin_unlock_bh(&ipvs->sync_lock); 1668 } 1669 1670 /* Get next buffer to send */ 1671 static inline struct ip_vs_sync_buff * 1672 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1673 { 1674 struct ip_vs_sync_buff *sb; 1675 1676 sb = sb_dequeue(ipvs, ms); 1677 if (sb) 1678 return sb; 1679 /* Do not delay entries in buffer for more than 2 seconds */ 1680 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1681 } 1682 1683 static int sync_thread_master(void *data) 1684 { 1685 struct ip_vs_sync_thread_data *tinfo = data; 1686 struct netns_ipvs *ipvs = tinfo->ipvs; 1687 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1688 struct sock *sk = tinfo->sock->sk; 1689 struct ip_vs_sync_buff *sb; 1690 1691 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1692 "syncid = %d, id = %d\n", 1693 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1694 1695 for (;;) { 1696 sb = next_sync_buff(ipvs, ms); 1697 if (unlikely(kthread_should_stop())) 1698 break; 1699 if (!sb) { 1700 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1701 continue; 1702 } 1703 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1704 /* (Ab)use interruptible sleep to avoid increasing 1705 * the load avg. 1706 */ 1707 __wait_event_interruptible(*sk_sleep(sk), 1708 sock_writeable(sk) || 1709 kthread_should_stop()); 1710 if (unlikely(kthread_should_stop())) 1711 goto done; 1712 } 1713 ip_vs_sync_buff_release(sb); 1714 } 1715 1716 done: 1717 __set_current_state(TASK_RUNNING); 1718 if (sb) 1719 ip_vs_sync_buff_release(sb); 1720 1721 /* clean up the sync_buff queue */ 1722 while ((sb = sb_dequeue(ipvs, ms))) 1723 ip_vs_sync_buff_release(sb); 1724 __set_current_state(TASK_RUNNING); 1725 1726 /* clean up the current sync_buff */ 1727 sb = get_curr_sync_buff(ipvs, ms, 0); 1728 if (sb) 1729 ip_vs_sync_buff_release(sb); 1730 1731 /* release the sending multicast socket */ 1732 sock_release(tinfo->sock); 1733 kfree(tinfo); 1734 1735 return 0; 1736 } 1737 1738 1739 static int sync_thread_backup(void *data) 1740 { 1741 struct ip_vs_sync_thread_data *tinfo = data; 1742 struct netns_ipvs *ipvs = tinfo->ipvs; 1743 int len; 1744 1745 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1746 "syncid = %d, id = %d\n", 1747 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1748 1749 while (!kthread_should_stop()) { 1750 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 1751 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) 1752 || kthread_should_stop()); 1753 1754 /* do we have data now? */ 1755 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1756 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1757 ipvs->bcfg.sync_maxlen); 1758 if (len <= 0) { 1759 if (len != -EAGAIN) 1760 pr_err("receiving message error\n"); 1761 break; 1762 } 1763 1764 ip_vs_process_message(ipvs, tinfo->buf, len); 1765 } 1766 } 1767 1768 /* release the sending multicast socket */ 1769 sock_release(tinfo->sock); 1770 kfree(tinfo->buf); 1771 kfree(tinfo); 1772 1773 return 0; 1774 } 1775 1776 1777 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1778 int state) 1779 { 1780 struct ip_vs_sync_thread_data *tinfo; 1781 struct task_struct **array = NULL, *task; 1782 struct socket *sock; 1783 struct net_device *dev; 1784 char *name; 1785 int (*threadfn)(void *data); 1786 int id, count, hlen; 1787 int result = -ENOMEM; 1788 u16 mtu, min_mtu; 1789 1790 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1791 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", 1792 sizeof(struct ip_vs_sync_conn_v0)); 1793 1794 if (!ipvs->sync_state) { 1795 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1796 ipvs->threads_mask = count - 1; 1797 } else 1798 count = ipvs->threads_mask + 1; 1799 1800 if (c->mcast_af == AF_UNSPEC) { 1801 c->mcast_af = AF_INET; 1802 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1803 } 1804 if (!c->mcast_port) 1805 c->mcast_port = IP_VS_SYNC_PORT; 1806 if (!c->mcast_ttl) 1807 c->mcast_ttl = 1; 1808 1809 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1810 if (!dev) { 1811 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1812 return -ENODEV; 1813 } 1814 hlen = (AF_INET6 == c->mcast_af) ? 1815 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1816 sizeof(struct iphdr) + sizeof(struct udphdr); 1817 mtu = (state == IP_VS_STATE_BACKUP) ? 1818 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1819 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1820 1821 if (c->sync_maxlen) 1822 c->sync_maxlen = clamp_t(unsigned int, 1823 c->sync_maxlen, min_mtu, 1824 65535 - hlen); 1825 else 1826 c->sync_maxlen = mtu - hlen; 1827 1828 if (state == IP_VS_STATE_MASTER) { 1829 if (ipvs->ms) 1830 return -EEXIST; 1831 1832 ipvs->mcfg = *c; 1833 name = "ipvs-m:%d:%d"; 1834 threadfn = sync_thread_master; 1835 } else if (state == IP_VS_STATE_BACKUP) { 1836 if (ipvs->backup_threads) 1837 return -EEXIST; 1838 1839 ipvs->bcfg = *c; 1840 name = "ipvs-b:%d:%d"; 1841 threadfn = sync_thread_backup; 1842 } else { 1843 return -EINVAL; 1844 } 1845 1846 if (state == IP_VS_STATE_MASTER) { 1847 struct ipvs_master_sync_state *ms; 1848 1849 ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); 1850 if (!ipvs->ms) 1851 goto out; 1852 ms = ipvs->ms; 1853 for (id = 0; id < count; id++, ms++) { 1854 INIT_LIST_HEAD(&ms->sync_queue); 1855 ms->sync_queue_len = 0; 1856 ms->sync_queue_delay = 0; 1857 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1858 master_wakeup_work_handler); 1859 ms->ipvs = ipvs; 1860 } 1861 } else { 1862 array = kzalloc(count * sizeof(struct task_struct *), 1863 GFP_KERNEL); 1864 if (!array) 1865 goto out; 1866 } 1867 1868 tinfo = NULL; 1869 for (id = 0; id < count; id++) { 1870 if (state == IP_VS_STATE_MASTER) 1871 sock = make_send_sock(ipvs, id); 1872 else 1873 sock = make_receive_sock(ipvs, id, dev->ifindex); 1874 if (IS_ERR(sock)) { 1875 result = PTR_ERR(sock); 1876 goto outtinfo; 1877 } 1878 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 1879 if (!tinfo) 1880 goto outsocket; 1881 tinfo->ipvs = ipvs; 1882 tinfo->sock = sock; 1883 if (state == IP_VS_STATE_BACKUP) { 1884 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1885 GFP_KERNEL); 1886 if (!tinfo->buf) 1887 goto outtinfo; 1888 } else { 1889 tinfo->buf = NULL; 1890 } 1891 tinfo->id = id; 1892 1893 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1894 if (IS_ERR(task)) { 1895 result = PTR_ERR(task); 1896 goto outtinfo; 1897 } 1898 tinfo = NULL; 1899 if (state == IP_VS_STATE_MASTER) 1900 ipvs->ms[id].master_thread = task; 1901 else 1902 array[id] = task; 1903 } 1904 1905 /* mark as active */ 1906 1907 if (state == IP_VS_STATE_BACKUP) 1908 ipvs->backup_threads = array; 1909 spin_lock_bh(&ipvs->sync_buff_lock); 1910 ipvs->sync_state |= state; 1911 spin_unlock_bh(&ipvs->sync_buff_lock); 1912 1913 /* increase the module use count */ 1914 ip_vs_use_count_inc(); 1915 1916 return 0; 1917 1918 outsocket: 1919 sock_release(sock); 1920 1921 outtinfo: 1922 if (tinfo) { 1923 sock_release(tinfo->sock); 1924 kfree(tinfo->buf); 1925 kfree(tinfo); 1926 } 1927 count = id; 1928 while (count-- > 0) { 1929 if (state == IP_VS_STATE_MASTER) 1930 kthread_stop(ipvs->ms[count].master_thread); 1931 else 1932 kthread_stop(array[count]); 1933 } 1934 kfree(array); 1935 1936 out: 1937 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1938 kfree(ipvs->ms); 1939 ipvs->ms = NULL; 1940 } 1941 return result; 1942 } 1943 1944 1945 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1946 { 1947 struct task_struct **array; 1948 int id; 1949 int retc = -EINVAL; 1950 1951 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1952 1953 if (state == IP_VS_STATE_MASTER) { 1954 if (!ipvs->ms) 1955 return -ESRCH; 1956 1957 /* 1958 * The lock synchronizes with sb_queue_tail(), so that we don't 1959 * add sync buffers to the queue, when we are already in 1960 * progress of stopping the master sync daemon. 1961 */ 1962 1963 spin_lock_bh(&ipvs->sync_buff_lock); 1964 spin_lock(&ipvs->sync_lock); 1965 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1966 spin_unlock(&ipvs->sync_lock); 1967 spin_unlock_bh(&ipvs->sync_buff_lock); 1968 1969 retc = 0; 1970 for (id = ipvs->threads_mask; id >= 0; id--) { 1971 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1972 int ret; 1973 1974 pr_info("stopping master sync thread %d ...\n", 1975 task_pid_nr(ms->master_thread)); 1976 cancel_delayed_work_sync(&ms->master_wakeup_work); 1977 ret = kthread_stop(ms->master_thread); 1978 if (retc >= 0) 1979 retc = ret; 1980 } 1981 kfree(ipvs->ms); 1982 ipvs->ms = NULL; 1983 } else if (state == IP_VS_STATE_BACKUP) { 1984 if (!ipvs->backup_threads) 1985 return -ESRCH; 1986 1987 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1988 array = ipvs->backup_threads; 1989 retc = 0; 1990 for (id = ipvs->threads_mask; id >= 0; id--) { 1991 int ret; 1992 1993 pr_info("stopping backup sync thread %d ...\n", 1994 task_pid_nr(array[id])); 1995 ret = kthread_stop(array[id]); 1996 if (retc >= 0) 1997 retc = ret; 1998 } 1999 kfree(array); 2000 ipvs->backup_threads = NULL; 2001 } 2002 2003 /* decrease the module use count */ 2004 ip_vs_use_count_dec(); 2005 2006 return retc; 2007 } 2008 2009 /* 2010 * Initialize data struct for each netns 2011 */ 2012 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2013 { 2014 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 2015 spin_lock_init(&ipvs->sync_lock); 2016 spin_lock_init(&ipvs->sync_buff_lock); 2017 return 0; 2018 } 2019 2020 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2021 { 2022 int retc; 2023 2024 mutex_lock(&ipvs->sync_mutex); 2025 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2026 if (retc && retc != -ESRCH) 2027 pr_err("Failed to stop Master Daemon\n"); 2028 2029 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2030 if (retc && retc != -ESRCH) 2031 pr_err("Failed to stop Backup Daemon\n"); 2032 mutex_unlock(&ipvs->sync_mutex); 2033 } 2034