// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>                 /* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>

#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */

#include <net/ip.h>
#include <net/sock.h>

#include <net/ip_vs.h>

#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848          /* multicast port */

#define SYNC_PROTO_VER  1		/* Protocol version in header */

static struct lock_class_key __ipvs_sync_key;
/*
 *	IPVS sync connection entry
 *	Version 0, i.e. original version.
 */
struct ip_vs_sync_conn_v0 {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;       /* Which protocol (TCP/UDP) */
	__be16			cport;
	__be16                  vport;
	__be16                  dport;
	__be32                  caddr;          /* client address */
	__be32                  vaddr;          /* virtual address */
	__be32                  daddr;          /* destination address */

	/* Flags and state transition */
	__be16                  flags;          /* status flags */
	__be16                  state;          /* state info */

	/* The sequence options start here */
};

/*
 * Sequence data.  In a version 0 entry it directly follows
 * struct ip_vs_sync_conn_v0 (see FULL_CONN_SIZE); in a version 1 entry it
 * is the payload of an IPVS_OPT_SEQ_DATA option.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};

/*
  Sync Connection format (sync_conn)

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |    Type       |    Protocol   | Ver.  |        Size           |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             Flags                             |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            State              |         cport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            vport              |         dport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             fwmark                            |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             timeout  (in sec.)                |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                              ...                              |
      |                        IP-Addresses  (v4 or v6)               |
      |                              ...                              |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  Optional Parameters.
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      | Param. Type   | Param. Length |   Param. data                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
      |                              ...                              |
      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                               | Param Type    | Param. Length |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                           Param  data                         |
      |         Last Param data should be padded for 32 bit alignment |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/

/*
 *  Type 0, IPv4 sync connection format
 */
struct ip_vs_sync_v4 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info 	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
/*
 * Type 2 messages IPv6
 */
struct ip_vs_sync_v6 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info 	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

/*
 * Both layouts share every field up to and including 'timeout', so the
 * common fields are accessed through the v4 member regardless of family.
 */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};

/* Bits in Type field in above */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

/* ver_size: the upper 4 bits hold the version, the lower 12 bits the size */
#define SVER_SHIFT		12		/* Shift to get version */
#define SVER_MASK		0x0fff		/* Mask to strip version */

/* Option (parameter) type codes carried in the "Param. Type" byte */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/*
 * Bit flags derived from the codes above.  IPVS_OPT_F_PARAM (bit 6) in a
 * received type byte lets the receiver skip an option it does not know.
 */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))

struct ip_vs_sync_thread_data {
	struct task_struct *task;
	struct netns_ipvs *ipvs;
	struct socket *sock;
	char *buf;
	int id;
};

/* Version 0 definition of packet sizes */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))


/*
  The master multicasts messages (Datagrams) to the backup load balancers
  in the following format.

 Version 1:
  Note, first byte should be Zero, so ver 0 receivers will drop the packet.

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |      0        |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    Version    |    Reserved, set to Zero      |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (1)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                            .                                  |
      ~                            .                                  ~
      |                            .                                  |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (n)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

 Version 0 Header
       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                    IPVS Sync Connection (1)                   |
*/

#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */

/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
	__u8                    nr_conns;
	__u8                    syncid;
	__be16                  size;

	/* ip_vs_sync_conn entries start here */
};

/* Version 1 header */
struct ip_vs_sync_mesg {
	__u8			reserved;	/* must be zero */
	__u8			syncid;
	__be16			size;
	__u8			nr_conns;
	__s8			version;	/* SYNC_PROTO_VER  */
	__u16			spare;
	/* ip_vs_sync_conn entries start here */
};

union ipvs_sockaddr {
	struct sockaddr_in	in;
	struct sockaddr_in6	in6;
};

/*
 * One sync message under construction or queued for sending.
 * 'mesg' is the start of the allocated message, 'head' the next free byte
 * in it and 'end' one past the last allocated byte.
 */
struct ip_vs_sync_buff {
	struct list_head        list;
	unsigned long           firstuse;	/* jiffies at creation */

	/* pointers for the message data */
	struct ip_vs_sync_mesg  *mesg;
	unsigned char           *head;
	unsigned char           *end;
};

/*
Copy of struct ip_vs_seq 285 * From unaligned network order to aligned host order 286 */ 287 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) 288 { 289 memset(ho, 0, sizeof(*ho)); 290 ho->init_seq = get_unaligned_be32(&no->init_seq); 291 ho->delta = get_unaligned_be32(&no->delta); 292 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 293 } 294 295 /* 296 * Copy of struct ip_vs_seq 297 * From Aligned host order to unaligned network order 298 */ 299 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 300 { 301 put_unaligned_be32(ho->init_seq, &no->init_seq); 302 put_unaligned_be32(ho->delta, &no->delta); 303 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 304 } 305 306 static inline struct ip_vs_sync_buff * 307 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 308 { 309 struct ip_vs_sync_buff *sb; 310 311 spin_lock_bh(&ipvs->sync_lock); 312 if (list_empty(&ms->sync_queue)) { 313 sb = NULL; 314 __set_current_state(TASK_INTERRUPTIBLE); 315 } else { 316 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 317 list); 318 list_del(&sb->list); 319 ms->sync_queue_len--; 320 if (!ms->sync_queue_len) 321 ms->sync_queue_delay = 0; 322 } 323 spin_unlock_bh(&ipvs->sync_lock); 324 325 return sb; 326 } 327 328 /* 329 * Create a new sync buffer for Version 1 proto. 330 */ 331 static inline struct ip_vs_sync_buff * 332 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 333 { 334 struct ip_vs_sync_buff *sb; 335 336 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 337 return NULL; 338 339 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 340 ipvs->mcfg.sync_maxlen); 341 sb->mesg = kmalloc(len, GFP_ATOMIC); 342 if (!sb->mesg) { 343 kfree(sb); 344 return NULL; 345 } 346 sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 347 sb->mesg->version = SYNC_PROTO_VER; 348 sb->mesg->syncid = ipvs->mcfg.syncid; 349 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 350 sb->mesg->nr_conns = 0; 351 sb->mesg->spare = 0; 352 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 353 sb->end = (unsigned char *)sb->mesg + len; 354 355 sb->firstuse = jiffies; 356 return sb; 357 } 358 359 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 360 { 361 kfree(sb->mesg); 362 kfree(sb); 363 } 364 365 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 366 struct ipvs_master_sync_state *ms) 367 { 368 struct ip_vs_sync_buff *sb = ms->sync_buff; 369 370 spin_lock(&ipvs->sync_lock); 371 if (ipvs->sync_state & IP_VS_STATE_MASTER && 372 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 373 if (!ms->sync_queue_len) 374 schedule_delayed_work(&ms->master_wakeup_work, 375 max(IPVS_SYNC_SEND_DELAY, 1)); 376 ms->sync_queue_len++; 377 list_add_tail(&sb->list, &ms->sync_queue); 378 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { 379 int id = (int)(ms - ipvs->ms); 380 381 wake_up_process(ipvs->master_tinfo[id].task); 382 } 383 } else 384 ip_vs_sync_buff_release(sb); 385 spin_unlock(&ipvs->sync_lock); 386 } 387 388 /* 389 * Get the current sync buffer if it has been created for more 390 * than the specified time or the specified time is zero. 
391 */ 392 static inline struct ip_vs_sync_buff * 393 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 394 unsigned long time) 395 { 396 struct ip_vs_sync_buff *sb; 397 398 spin_lock_bh(&ipvs->sync_buff_lock); 399 sb = ms->sync_buff; 400 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 401 ms->sync_buff = NULL; 402 __set_current_state(TASK_RUNNING); 403 } else 404 sb = NULL; 405 spin_unlock_bh(&ipvs->sync_buff_lock); 406 return sb; 407 } 408 409 static inline int 410 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 411 { 412 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 413 } 414 415 /* 416 * Create a new sync buffer for Version 0 proto. 417 */ 418 static inline struct ip_vs_sync_buff * 419 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 420 { 421 struct ip_vs_sync_buff *sb; 422 struct ip_vs_sync_mesg_v0 *mesg; 423 424 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 425 return NULL; 426 427 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 428 ipvs->mcfg.sync_maxlen); 429 sb->mesg = kmalloc(len, GFP_ATOMIC); 430 if (!sb->mesg) { 431 kfree(sb); 432 return NULL; 433 } 434 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 435 mesg->nr_conns = 0; 436 mesg->syncid = ipvs->mcfg.syncid; 437 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 438 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 439 sb->end = (unsigned char *)mesg + len; 440 sb->firstuse = jiffies; 441 return sb; 442 } 443 444 /* Check if connection is controlled by persistence */ 445 static inline bool in_persistence(struct ip_vs_conn *cp) 446 { 447 for (cp = cp->control; cp; cp = cp->control) { 448 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 449 return true; 450 } 451 return false; 452 } 453 454 /* Check if conn should be synced. 455 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 456 * - (1) sync_refresh_period: reduce sync rate. 
 *	Additionally, retry
 *	sync_retries times with period of sync_refresh_period/8
 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
 *	for state changes or only once when pkts matches sync_threshold
 * - (3) templates: rate can be reduced only with sync_refresh_period or
 *	with (2)
 *
 * cp->sync_endtime holds the expected end time of the connection with its
 * two low bits reused as a retry counter.
 * Returns non-zero when a sync message should be sent for @cp.
 */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	/* Snapshot used both for the decision and as cmpxchg expected value */
	unsigned long orig = READ_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* Candidate new end time; low 2 bits cleared for the retry counter */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		/* A state change forces a sync even if cmpxchg below loses */
		force = cp->state != cp->old_state;
		/* Changes to non-established states skip the rate limiting */
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* Retry counter kept in the low bits of the old value */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			/* Space retries by sync_refresh_period/8 */
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* Only one of several concurrent callers installs the new end time */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}

/*
 *      Version 0 , could be switched in by sys_ctl.
 *      Add an ip_vs_conn information into the current sync_buff.
 *
 *      Only AF_INET connections without IP_VS_CONN_F_ONE_PACKET are
 *      handled.  After the entry is appended, the controlling connection
 *      (cp->control), if any, is passed to ip_vs_sync_conn().
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	/* Entry size depends on whether sequence data is appended */
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1 */
		/* (byte 0 is nr_conns in v0 but the zero 'reserved' in v1) */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		/* Raw copy of sizeof(*opt) bytes starting at cp->in_seq;
		 * NOTE(review): presumably relies on out_seq directly
		 * following in_seq in struct ip_vs_conn - confirm.
		 */
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_add_return(1, &cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}

/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
631 * Sending Version 1 messages 632 */ 633 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 634 { 635 struct ip_vs_sync_mesg *m; 636 union ip_vs_sync_conn *s; 637 struct ip_vs_sync_buff *buff; 638 struct ipvs_master_sync_state *ms; 639 int id; 640 __u8 *p; 641 unsigned int len, pe_name_len, pad; 642 643 /* Handle old version of the protocol */ 644 if (sysctl_sync_ver(ipvs) == 0) { 645 ip_vs_sync_conn_v0(ipvs, cp, pkts); 646 return; 647 } 648 /* Do not sync ONE PACKET */ 649 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 650 goto control; 651 sloop: 652 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 653 goto control; 654 655 /* Sanity checks */ 656 pe_name_len = 0; 657 if (cp->pe_data_len) { 658 if (!cp->pe_data || !cp->dest) { 659 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 660 return; 661 } 662 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 663 } 664 665 spin_lock_bh(&ipvs->sync_buff_lock); 666 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 667 spin_unlock_bh(&ipvs->sync_buff_lock); 668 return; 669 } 670 671 id = select_master_thread_id(ipvs, cp); 672 ms = &ipvs->ms[id]; 673 674 #ifdef CONFIG_IP_VS_IPV6 675 if (cp->af == AF_INET6) 676 len = sizeof(struct ip_vs_sync_v6); 677 else 678 #endif 679 len = sizeof(struct ip_vs_sync_v4); 680 681 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 682 len += sizeof(struct ip_vs_sync_conn_options) + 2; 683 684 if (cp->pe_data_len) 685 len += cp->pe_data_len + 2; /* + Param hdr field */ 686 if (pe_name_len) 687 len += pe_name_len + 2; 688 689 /* check if there is a space for this one */ 690 pad = 0; 691 buff = ms->sync_buff; 692 if (buff) { 693 m = buff->mesg; 694 pad = (4 - (size_t) buff->head) & 3; 695 /* Send buffer if it is for v0 */ 696 if (buff->head + len + pad > buff->end || m->reserved) { 697 sb_queue_tail(ipvs, ms); 698 ms->sync_buff = NULL; 699 buff = NULL; 700 pad = 0; 701 } 702 } 703 704 if (!buff) { 705 buff = ip_vs_sync_buff_create(ipvs, len); 706 if (!buff) { 707 
spin_unlock_bh(&ipvs->sync_buff_lock); 708 pr_err("ip_vs_sync_buff_create failed.\n"); 709 return; 710 } 711 ms->sync_buff = buff; 712 m = buff->mesg; 713 } 714 715 p = buff->head; 716 buff->head += pad + len; 717 m->size = htons(ntohs(m->size) + pad + len); 718 /* Add ev. padding from prev. sync_conn */ 719 while (pad--) 720 *(p++) = 0; 721 722 s = (union ip_vs_sync_conn *)p; 723 724 /* Set message type & copy members */ 725 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 726 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 727 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 728 s->v4.state = htons(cp->state); 729 s->v4.protocol = cp->protocol; 730 s->v4.cport = cp->cport; 731 s->v4.vport = cp->vport; 732 s->v4.dport = cp->dport; 733 s->v4.fwmark = htonl(cp->fwmark); 734 s->v4.timeout = htonl(cp->timeout / HZ); 735 m->nr_conns++; 736 737 #ifdef CONFIG_IP_VS_IPV6 738 if (cp->af == AF_INET6) { 739 p += sizeof(struct ip_vs_sync_v6); 740 s->v6.caddr = cp->caddr.in6; 741 s->v6.vaddr = cp->vaddr.in6; 742 s->v6.daddr = cp->daddr.in6; 743 } else 744 #endif 745 { 746 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 747 s->v4.caddr = cp->caddr.ip; 748 s->v4.vaddr = cp->vaddr.ip; 749 s->v4.daddr = cp->daddr.ip; 750 } 751 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 752 *(p++) = IPVS_OPT_SEQ_DATA; 753 *(p++) = sizeof(struct ip_vs_sync_conn_options); 754 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 755 p += sizeof(struct ip_vs_seq); 756 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 757 p += sizeof(struct ip_vs_seq); 758 } 759 /* Handle pe data */ 760 if (cp->pe_data_len && cp->pe_data) { 761 *(p++) = IPVS_OPT_PE_DATA; 762 *(p++) = cp->pe_data_len; 763 memcpy(p, cp->pe_data, cp->pe_data_len); 764 p += cp->pe_data_len; 765 if (pe_name_len) { 766 /* Add PE_NAME */ 767 *(p++) = IPVS_OPT_PE_NAME; 768 *(p++) = pe_name_len; 769 memcpy(p, cp->pe->name, pe_name_len); 770 p += pe_name_len; 771 } 772 } 773 774 spin_unlock_bh(&ipvs->sync_buff_lock); 775 776 
control: 777 /* synchronize its controller if it has */ 778 cp = cp->control; 779 if (!cp) 780 return; 781 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 782 pkts = atomic_add_return(1, &cp->in_pkts); 783 else 784 pkts = sysctl_sync_threshold(ipvs); 785 goto sloop; 786 } 787 788 /* 789 * fill_param used by version 1 790 */ 791 static inline int 792 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 793 struct ip_vs_conn_param *p, 794 __u8 *pe_data, unsigned int pe_data_len, 795 __u8 *pe_name, unsigned int pe_name_len) 796 { 797 #ifdef CONFIG_IP_VS_IPV6 798 if (af == AF_INET6) 799 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 800 (const union nf_inet_addr *)&sc->v6.caddr, 801 sc->v6.cport, 802 (const union nf_inet_addr *)&sc->v6.vaddr, 803 sc->v6.vport, p); 804 else 805 #endif 806 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 807 (const union nf_inet_addr *)&sc->v4.caddr, 808 sc->v4.cport, 809 (const union nf_inet_addr *)&sc->v4.vaddr, 810 sc->v4.vport, p); 811 /* Handle pe data */ 812 if (pe_data_len) { 813 if (pe_name_len) { 814 char buff[IP_VS_PENAME_MAXLEN+1]; 815 816 memcpy(buff, pe_name, pe_name_len); 817 buff[pe_name_len]=0; 818 p->pe = __ip_vs_pe_getbyname(buff); 819 if (!p->pe) { 820 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 821 buff); 822 return 1; 823 } 824 } else { 825 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 826 return 1; 827 } 828 829 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 830 if (!p->pe_data) { 831 module_put(p->pe->module); 832 return -ENOMEM; 833 } 834 p->pe_data_len = pe_data_len; 835 } 836 return 0; 837 } 838 839 /* 840 * Connection Add / Update. 841 * Common for version 0 and 1 reception of backup sync_conns. 842 * Param: ... 843 * timeout is in sec. 
844 */ 845 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 846 unsigned int flags, unsigned int state, 847 unsigned int protocol, unsigned int type, 848 const union nf_inet_addr *daddr, __be16 dport, 849 unsigned long timeout, __u32 fwmark, 850 struct ip_vs_sync_conn_options *opt) 851 { 852 struct ip_vs_dest *dest; 853 struct ip_vs_conn *cp; 854 855 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 856 cp = ip_vs_conn_in_get(param); 857 if (cp && ((cp->dport != dport) || 858 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 859 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 860 ip_vs_conn_expire_now(cp); 861 __ip_vs_conn_put(cp); 862 cp = NULL; 863 } else { 864 /* This is the expiration message for the 865 * connection that was already replaced, so we 866 * just ignore it. 867 */ 868 __ip_vs_conn_put(cp); 869 kfree(param->pe_data); 870 return; 871 } 872 } 873 } else { 874 cp = ip_vs_ct_in_get(param); 875 } 876 877 if (cp) { 878 /* Free pe_data */ 879 kfree(param->pe_data); 880 881 dest = cp->dest; 882 spin_lock_bh(&cp->lock); 883 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 884 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 885 if (flags & IP_VS_CONN_F_INACTIVE) { 886 atomic_dec(&dest->activeconns); 887 atomic_inc(&dest->inactconns); 888 } else { 889 atomic_inc(&dest->activeconns); 890 atomic_dec(&dest->inactconns); 891 } 892 } 893 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 894 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 895 cp->flags = flags; 896 spin_unlock_bh(&cp->lock); 897 if (!dest) 898 ip_vs_try_bind_dest(cp); 899 } else { 900 /* 901 * Find the appropriate destination for the connection. 902 * If it is not found the connection will remain unbound 903 * but still handled. 904 */ 905 rcu_read_lock(); 906 /* This function is only invoked by the synchronization 907 * code. 
We do not currently support heterogeneous pools 908 * with synchronization, so we can make the assumption that 909 * the svc_af is the same as the dest_af 910 */ 911 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 912 param->vaddr, param->vport, protocol, 913 fwmark, flags); 914 915 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 916 fwmark); 917 rcu_read_unlock(); 918 if (!cp) { 919 kfree(param->pe_data); 920 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 921 return; 922 } 923 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 924 kfree(param->pe_data); 925 } 926 927 if (opt) { 928 cp->in_seq = opt->in_seq; 929 cp->out_seq = opt->out_seq; 930 } 931 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 932 cp->state = state; 933 cp->old_state = cp->state; 934 /* 935 * For Ver 0 messages style 936 * - Not possible to recover the right timeout for templates 937 * - can not find the right fwmark 938 * virtual service. If needed, we can do it for 939 * non-fwmark persistent services. 940 * Ver 1 messages style. 941 * - No problem. 
942 */ 943 if (timeout) { 944 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 945 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 946 cp->timeout = timeout*HZ; 947 } else { 948 struct ip_vs_proto_data *pd; 949 950 pd = ip_vs_proto_data_get(ipvs, protocol); 951 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 952 cp->timeout = pd->timeout_table[state]; 953 else 954 cp->timeout = (3*60*HZ); 955 } 956 ip_vs_conn_put(cp); 957 } 958 959 /* 960 * Process received multicast message for Version 0 961 */ 962 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 963 const size_t buflen) 964 { 965 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 966 struct ip_vs_sync_conn_v0 *s; 967 struct ip_vs_sync_conn_options *opt; 968 struct ip_vs_protocol *pp; 969 struct ip_vs_conn_param param; 970 char *p; 971 int i; 972 973 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 974 for (i=0; i<m->nr_conns; i++) { 975 unsigned int flags, state; 976 977 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 978 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 979 return; 980 } 981 s = (struct ip_vs_sync_conn_v0 *) p; 982 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 983 flags &= ~IP_VS_CONN_F_HASHED; 984 if (flags & IP_VS_CONN_F_SEQ_MASK) { 985 opt = (struct ip_vs_sync_conn_options *)&s[1]; 986 p += FULL_CONN_SIZE; 987 if (p > buffer+buflen) { 988 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 989 return; 990 } 991 } else { 992 opt = NULL; 993 p += SIMPLE_CONN_SIZE; 994 } 995 996 state = ntohs(s->state); 997 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 998 pp = ip_vs_proto_get(s->protocol); 999 if (!pp) { 1000 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 1001 s->protocol); 1002 continue; 1003 } 1004 if (state >= pp->num_states) { 1005 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1006 pp->name, state); 1007 continue; 1008 } 1009 } else { 1010 if (state >= IP_VS_CTPL_S_LAST) 1011 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", 1012 
state); 1013 } 1014 1015 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1016 (const union nf_inet_addr *)&s->caddr, 1017 s->cport, 1018 (const union nf_inet_addr *)&s->vaddr, 1019 s->vport, ¶m); 1020 1021 /* Send timeout as Zero */ 1022 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1023 (union nf_inet_addr *)&s->daddr, s->dport, 1024 0, 0, opt); 1025 } 1026 } 1027 1028 /* 1029 * Handle options 1030 */ 1031 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1032 __u32 *opt_flags, 1033 struct ip_vs_sync_conn_options *opt) 1034 { 1035 struct ip_vs_sync_conn_options *topt; 1036 1037 topt = (struct ip_vs_sync_conn_options *)p; 1038 1039 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1040 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1041 return -EINVAL; 1042 } 1043 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1044 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1045 return -EINVAL; 1046 } 1047 ntoh_seq(&topt->in_seq, &opt->in_seq); 1048 ntoh_seq(&topt->out_seq, &opt->out_seq); 1049 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1050 return 0; 1051 } 1052 1053 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1054 __u8 **data, unsigned int maxlen, 1055 __u32 *opt_flags, __u32 flag) 1056 { 1057 if (plen > maxlen) { 1058 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1059 return -EINVAL; 1060 } 1061 if (*opt_flags & flag) { 1062 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1063 return -EINVAL; 1064 } 1065 *data_len = plen; 1066 *data = p; 1067 *opt_flags |= flag; 1068 return 0; 1069 } 1070 /* 1071 * Process a Version 1 sync. 
connection 1072 */ 1073 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1074 { 1075 struct ip_vs_sync_conn_options opt; 1076 union ip_vs_sync_conn *s; 1077 struct ip_vs_protocol *pp; 1078 struct ip_vs_conn_param param; 1079 __u32 flags; 1080 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1081 __u8 *pe_data=NULL, *pe_name=NULL; 1082 __u32 opt_flags=0; 1083 int retc=0; 1084 1085 s = (union ip_vs_sync_conn *) p; 1086 1087 if (s->v6.type & STYPE_F_INET6) { 1088 #ifdef CONFIG_IP_VS_IPV6 1089 af = AF_INET6; 1090 p += sizeof(struct ip_vs_sync_v6); 1091 #else 1092 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1093 retc = 10; 1094 goto out; 1095 #endif 1096 } else if (!s->v4.type) { 1097 af = AF_INET; 1098 p += sizeof(struct ip_vs_sync_v4); 1099 } else { 1100 return -10; 1101 } 1102 if (p > msg_end) 1103 return -20; 1104 1105 /* Process optional params check Type & Len. */ 1106 while (p < msg_end) { 1107 int ptype; 1108 int plen; 1109 1110 if (p+2 > msg_end) 1111 return -30; 1112 ptype = *(p++); 1113 plen = *(p++); 1114 1115 if (!plen || ((p + plen) > msg_end)) 1116 return -40; 1117 /* Handle seq option p = param data */ 1118 switch (ptype & ~IPVS_OPT_F_PARAM) { 1119 case IPVS_OPT_SEQ_DATA: 1120 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1121 return -50; 1122 break; 1123 1124 case IPVS_OPT_PE_DATA: 1125 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1126 IP_VS_PEDATA_MAXLEN, &opt_flags, 1127 IPVS_OPT_F_PE_DATA)) 1128 return -60; 1129 break; 1130 1131 case IPVS_OPT_PE_NAME: 1132 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1133 IP_VS_PENAME_MAXLEN, &opt_flags, 1134 IPVS_OPT_F_PE_NAME)) 1135 return -70; 1136 break; 1137 1138 default: 1139 /* Param data mandatory ? 
*/ 1140 if (!(ptype & IPVS_OPT_F_PARAM)) { 1141 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1142 ptype & ~IPVS_OPT_F_PARAM); 1143 retc = 20; 1144 goto out; 1145 } 1146 } 1147 p += plen; /* Next option */ 1148 } 1149 1150 /* Get flags and Mask off unsupported */ 1151 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1152 flags |= IP_VS_CONN_F_SYNC; 1153 state = ntohs(s->v4.state); 1154 1155 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1156 pp = ip_vs_proto_get(s->v4.protocol); 1157 if (!pp) { 1158 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1159 s->v4.protocol); 1160 retc = 30; 1161 goto out; 1162 } 1163 if (state >= pp->num_states) { 1164 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1165 pp->name, state); 1166 retc = 40; 1167 goto out; 1168 } 1169 } else { 1170 if (state >= IP_VS_CTPL_S_LAST) 1171 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", 1172 state); 1173 } 1174 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1175 pe_data_len, pe_name, pe_name_len)) { 1176 retc = 50; 1177 goto out; 1178 } 1179 /* If only IPv4, just silent skip IPv6 */ 1180 if (af == AF_INET) 1181 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1182 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1183 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1184 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1185 ); 1186 #ifdef CONFIG_IP_VS_IPV6 1187 else 1188 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1189 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1190 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1191 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1192 ); 1193 #endif 1194 ip_vs_pe_put(param.pe); 1195 return 0; 1196 /* Error exit */ 1197 out: 1198 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1199 return retc; 1200 1201 } 1202 /* 1203 * Process received multicast message and create the corresponding 1204 * ip_vs_conn entries. 
1205 * Handles Version 0 & 1 1206 */ 1207 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1208 const size_t buflen) 1209 { 1210 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1211 __u8 *p, *msg_end; 1212 int i, nr_conns; 1213 1214 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1215 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1216 return; 1217 } 1218 1219 if (buflen != ntohs(m2->size)) { 1220 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1221 return; 1222 } 1223 /* SyncID sanity check */ 1224 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1225 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1226 return; 1227 } 1228 /* Handle version 1 message */ 1229 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1230 && (m2->spare == 0)) { 1231 1232 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1233 nr_conns = m2->nr_conns; 1234 1235 for (i=0; i<nr_conns; i++) { 1236 union ip_vs_sync_conn *s; 1237 unsigned int size; 1238 int retc; 1239 1240 p = msg_end; 1241 if (p + sizeof(s->v4) > buffer+buflen) { 1242 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); 1243 return; 1244 } 1245 s = (union ip_vs_sync_conn *)p; 1246 size = ntohs(s->v4.ver_size) & SVER_MASK; 1247 msg_end = p + size; 1248 /* Basic sanity checks */ 1249 if (msg_end > buffer+buflen) { 1250 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1251 return; 1252 } 1253 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1254 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1255 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1256 return; 1257 } 1258 /* Process a single sync_conn */ 1259 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1260 if (retc < 0) { 1261 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1262 retc); 1263 return; 1264 } 1265 /* Make sure we have 32 bit alignment */ 1266 msg_end = p + ((size + 3) & ~3); 1267 } 1268 } else { 1269 /* Old type of message */ 1270 ip_vs_process_message_v0(ipvs, 
buffer, buflen); 1271 return; 1272 } 1273 } 1274 1275 1276 /* 1277 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1278 */ 1279 static void set_sock_size(struct sock *sk, int mode, int val) 1280 { 1281 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1282 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1283 lock_sock(sk); 1284 if (mode) { 1285 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1286 sysctl_wmem_max); 1287 sk->sk_sndbuf = val * 2; 1288 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1289 } else { 1290 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1291 sysctl_rmem_max); 1292 sk->sk_rcvbuf = val * 2; 1293 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1294 } 1295 release_sock(sk); 1296 } 1297 1298 /* 1299 * Setup loopback of outgoing multicasts on a sending socket 1300 */ 1301 static void set_mcast_loop(struct sock *sk, u_char loop) 1302 { 1303 struct inet_sock *inet = inet_sk(sk); 1304 1305 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1306 lock_sock(sk); 1307 inet->mc_loop = loop ? 1 : 0; 1308 #ifdef CONFIG_IP_VS_IPV6 1309 if (sk->sk_family == AF_INET6) { 1310 struct ipv6_pinfo *np = inet6_sk(sk); 1311 1312 /* IPV6_MULTICAST_LOOP */ 1313 np->mc_loop = loop ? 
1 : 0; 1314 } 1315 #endif 1316 release_sock(sk); 1317 } 1318 1319 /* 1320 * Specify TTL for outgoing multicasts on a sending socket 1321 */ 1322 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1323 { 1324 struct inet_sock *inet = inet_sk(sk); 1325 1326 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1327 lock_sock(sk); 1328 inet->mc_ttl = ttl; 1329 #ifdef CONFIG_IP_VS_IPV6 1330 if (sk->sk_family == AF_INET6) { 1331 struct ipv6_pinfo *np = inet6_sk(sk); 1332 1333 /* IPV6_MULTICAST_HOPS */ 1334 np->mcast_hops = ttl; 1335 } 1336 #endif 1337 release_sock(sk); 1338 } 1339 1340 /* Control fragmentation of messages */ 1341 static void set_mcast_pmtudisc(struct sock *sk, int val) 1342 { 1343 struct inet_sock *inet = inet_sk(sk); 1344 1345 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1346 lock_sock(sk); 1347 inet->pmtudisc = val; 1348 #ifdef CONFIG_IP_VS_IPV6 1349 if (sk->sk_family == AF_INET6) { 1350 struct ipv6_pinfo *np = inet6_sk(sk); 1351 1352 /* IPV6_MTU_DISCOVER */ 1353 np->pmtudisc = val; 1354 } 1355 #endif 1356 release_sock(sk); 1357 } 1358 1359 /* 1360 * Specifiy default interface for outgoing multicasts 1361 */ 1362 static int set_mcast_if(struct sock *sk, struct net_device *dev) 1363 { 1364 struct inet_sock *inet = inet_sk(sk); 1365 1366 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1367 return -EINVAL; 1368 1369 lock_sock(sk); 1370 inet->mc_index = dev->ifindex; 1371 /* inet->mc_addr = 0; */ 1372 #ifdef CONFIG_IP_VS_IPV6 1373 if (sk->sk_family == AF_INET6) { 1374 struct ipv6_pinfo *np = inet6_sk(sk); 1375 1376 /* IPV6_MULTICAST_IF */ 1377 np->mcast_oif = dev->ifindex; 1378 } 1379 #endif 1380 release_sock(sk); 1381 1382 return 0; 1383 } 1384 1385 1386 /* 1387 * Join a multicast group. 1388 * the group is specified by a class D multicast address 224.0.0.0/8 1389 * in the in_addr structure passed in as a parameter. 
1390 */ 1391 static int 1392 join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) 1393 { 1394 struct ip_mreqn mreq; 1395 int ret; 1396 1397 memset(&mreq, 0, sizeof(mreq)); 1398 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1399 1400 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1401 return -EINVAL; 1402 1403 mreq.imr_ifindex = dev->ifindex; 1404 1405 lock_sock(sk); 1406 ret = ip_mc_join_group(sk, &mreq); 1407 release_sock(sk); 1408 1409 return ret; 1410 } 1411 1412 #ifdef CONFIG_IP_VS_IPV6 1413 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1414 struct net_device *dev) 1415 { 1416 int ret; 1417 1418 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1419 return -EINVAL; 1420 1421 lock_sock(sk); 1422 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1423 release_sock(sk); 1424 1425 return ret; 1426 } 1427 #endif 1428 1429 static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) 1430 { 1431 __be32 addr; 1432 struct sockaddr_in sin; 1433 1434 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1435 if (!addr) 1436 pr_err("You probably need to specify IP address on " 1437 "multicast interface.\n"); 1438 1439 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1440 dev->name, &addr); 1441 1442 /* Now bind the socket with the address of multicast interface */ 1443 sin.sin_family = AF_INET; 1444 sin.sin_addr.s_addr = addr; 1445 sin.sin_port = 0; 1446 1447 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1448 } 1449 1450 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1451 struct ipvs_sync_daemon_cfg *c, int id) 1452 { 1453 if (AF_INET6 == c->mcast_af) { 1454 sa->in6 = (struct sockaddr_in6) { 1455 .sin6_family = AF_INET6, 1456 .sin6_port = htons(c->mcast_port + id), 1457 }; 1458 sa->in6.sin6_addr = c->mcast_group.in6; 1459 *salen = sizeof(sa->in6); 1460 } else { 1461 sa->in = (struct sockaddr_in) { 1462 .sin_family = AF_INET, 1463 
.sin_port = htons(c->mcast_port + id), 1464 }; 1465 sa->in.sin_addr = c->mcast_group.in; 1466 *salen = sizeof(sa->in); 1467 } 1468 } 1469 1470 /* 1471 * Set up sending multicast socket over UDP 1472 */ 1473 static int make_send_sock(struct netns_ipvs *ipvs, int id, 1474 struct net_device *dev, struct socket **sock_ret) 1475 { 1476 /* multicast addr */ 1477 union ipvs_sockaddr mcast_addr; 1478 struct socket *sock; 1479 int result, salen; 1480 1481 /* First create a socket */ 1482 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1483 IPPROTO_UDP, &sock); 1484 if (result < 0) { 1485 pr_err("Error during creation of socket; terminating\n"); 1486 goto error; 1487 } 1488 *sock_ret = sock; 1489 result = set_mcast_if(sock->sk, dev); 1490 if (result < 0) { 1491 pr_err("Error setting outbound mcast interface\n"); 1492 goto error; 1493 } 1494 1495 set_mcast_loop(sock->sk, 0); 1496 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1497 /* Allow fragmentation if MTU changes */ 1498 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1499 result = sysctl_sync_sock_size(ipvs); 1500 if (result > 0) 1501 set_sock_size(sock->sk, 1, result); 1502 1503 if (AF_INET == ipvs->mcfg.mcast_af) 1504 result = bind_mcastif_addr(sock, dev); 1505 else 1506 result = 0; 1507 if (result < 0) { 1508 pr_err("Error binding address of the mcast interface\n"); 1509 goto error; 1510 } 1511 1512 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1513 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1514 salen, 0); 1515 if (result < 0) { 1516 pr_err("Error connecting to the multicast addr\n"); 1517 goto error; 1518 } 1519 1520 return 0; 1521 1522 error: 1523 return result; 1524 } 1525 1526 1527 /* 1528 * Set up receiving multicast socket over UDP 1529 */ 1530 static int make_receive_sock(struct netns_ipvs *ipvs, int id, 1531 struct net_device *dev, struct socket **sock_ret) 1532 { 1533 /* multicast addr */ 1534 union ipvs_sockaddr mcast_addr; 1535 struct socket *sock; 
1536 int result, salen; 1537 1538 /* First create a socket */ 1539 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1540 IPPROTO_UDP, &sock); 1541 if (result < 0) { 1542 pr_err("Error during creation of socket; terminating\n"); 1543 goto error; 1544 } 1545 *sock_ret = sock; 1546 /* it is equivalent to the REUSEADDR option in user-space */ 1547 sock->sk->sk_reuse = SK_CAN_REUSE; 1548 result = sysctl_sync_sock_size(ipvs); 1549 if (result > 0) 1550 set_sock_size(sock->sk, 0, result); 1551 1552 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1553 sock->sk->sk_bound_dev_if = dev->ifindex; 1554 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1555 if (result < 0) { 1556 pr_err("Error binding to the multicast addr\n"); 1557 goto error; 1558 } 1559 1560 /* join the multicast group */ 1561 #ifdef CONFIG_IP_VS_IPV6 1562 if (ipvs->bcfg.mcast_af == AF_INET6) 1563 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1564 dev); 1565 else 1566 #endif 1567 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1568 dev); 1569 if (result < 0) { 1570 pr_err("Error joining to the multicast group\n"); 1571 goto error; 1572 } 1573 1574 return 0; 1575 1576 error: 1577 return result; 1578 } 1579 1580 1581 static int 1582 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1583 { 1584 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1585 struct kvec iov; 1586 int len; 1587 1588 EnterFunction(7); 1589 iov.iov_base = (void *)buffer; 1590 iov.iov_len = length; 1591 1592 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1593 1594 LeaveFunction(7); 1595 return len; 1596 } 1597 1598 static int 1599 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1600 { 1601 int msize; 1602 int ret; 1603 1604 msize = ntohs(msg->size); 1605 1606 ret = ip_vs_send_async(sock, (char *)msg, msize); 1607 if (ret >= 0 || ret == -EAGAIN) 1608 return ret; 1609 
pr_err("ip_vs_send_async error %d\n", ret); 1610 return 0; 1611 } 1612 1613 static int 1614 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1615 { 1616 struct msghdr msg = {NULL,}; 1617 struct kvec iov = {buffer, buflen}; 1618 int len; 1619 1620 EnterFunction(7); 1621 1622 /* Receive a packet */ 1623 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); 1624 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); 1625 if (len < 0) 1626 return len; 1627 1628 LeaveFunction(7); 1629 return len; 1630 } 1631 1632 /* Wakeup the master thread for sending */ 1633 static void master_wakeup_work_handler(struct work_struct *work) 1634 { 1635 struct ipvs_master_sync_state *ms = 1636 container_of(work, struct ipvs_master_sync_state, 1637 master_wakeup_work.work); 1638 struct netns_ipvs *ipvs = ms->ipvs; 1639 1640 spin_lock_bh(&ipvs->sync_lock); 1641 if (ms->sync_queue_len && 1642 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1643 int id = (int)(ms - ipvs->ms); 1644 1645 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1646 wake_up_process(ipvs->master_tinfo[id].task); 1647 } 1648 spin_unlock_bh(&ipvs->sync_lock); 1649 } 1650 1651 /* Get next buffer to send */ 1652 static inline struct ip_vs_sync_buff * 1653 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1654 { 1655 struct ip_vs_sync_buff *sb; 1656 1657 sb = sb_dequeue(ipvs, ms); 1658 if (sb) 1659 return sb; 1660 /* Do not delay entries in buffer for more than 2 seconds */ 1661 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1662 } 1663 1664 static int sync_thread_master(void *data) 1665 { 1666 struct ip_vs_sync_thread_data *tinfo = data; 1667 struct netns_ipvs *ipvs = tinfo->ipvs; 1668 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1669 struct sock *sk = tinfo->sock->sk; 1670 struct ip_vs_sync_buff *sb; 1671 1672 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1673 "syncid = %d, id = %d\n", 1674 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1675 
1676 for (;;) { 1677 sb = next_sync_buff(ipvs, ms); 1678 if (unlikely(kthread_should_stop())) 1679 break; 1680 if (!sb) { 1681 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1682 continue; 1683 } 1684 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1685 /* (Ab)use interruptible sleep to avoid increasing 1686 * the load avg. 1687 */ 1688 __wait_event_interruptible(*sk_sleep(sk), 1689 sock_writeable(sk) || 1690 kthread_should_stop()); 1691 if (unlikely(kthread_should_stop())) 1692 goto done; 1693 } 1694 ip_vs_sync_buff_release(sb); 1695 } 1696 1697 done: 1698 __set_current_state(TASK_RUNNING); 1699 if (sb) 1700 ip_vs_sync_buff_release(sb); 1701 1702 /* clean up the sync_buff queue */ 1703 while ((sb = sb_dequeue(ipvs, ms))) 1704 ip_vs_sync_buff_release(sb); 1705 __set_current_state(TASK_RUNNING); 1706 1707 /* clean up the current sync_buff */ 1708 sb = get_curr_sync_buff(ipvs, ms, 0); 1709 if (sb) 1710 ip_vs_sync_buff_release(sb); 1711 1712 return 0; 1713 } 1714 1715 1716 static int sync_thread_backup(void *data) 1717 { 1718 struct ip_vs_sync_thread_data *tinfo = data; 1719 struct netns_ipvs *ipvs = tinfo->ipvs; 1720 int len; 1721 1722 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1723 "syncid = %d, id = %d\n", 1724 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1725 1726 while (!kthread_should_stop()) { 1727 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 1728 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) 1729 || kthread_should_stop()); 1730 1731 /* do we have data now? 
*/ 1732 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1733 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1734 ipvs->bcfg.sync_maxlen); 1735 if (len <= 0) { 1736 if (len != -EAGAIN) 1737 pr_err("receiving message error\n"); 1738 break; 1739 } 1740 1741 ip_vs_process_message(ipvs, tinfo->buf, len); 1742 } 1743 } 1744 1745 return 0; 1746 } 1747 1748 1749 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1750 int state) 1751 { 1752 struct ip_vs_sync_thread_data *ti = NULL, *tinfo; 1753 struct task_struct *task; 1754 struct net_device *dev; 1755 char *name; 1756 int (*threadfn)(void *data); 1757 int id = 0, count, hlen; 1758 int result = -ENOMEM; 1759 u16 mtu, min_mtu; 1760 1761 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1762 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1763 sizeof(struct ip_vs_sync_conn_v0)); 1764 1765 /* increase the module use count */ 1766 if (!ip_vs_use_count_inc()) 1767 return -ENOPROTOOPT; 1768 1769 /* Do not hold one mutex and then to block on another */ 1770 for (;;) { 1771 rtnl_lock(); 1772 if (mutex_trylock(&ipvs->sync_mutex)) 1773 break; 1774 rtnl_unlock(); 1775 mutex_lock(&ipvs->sync_mutex); 1776 if (rtnl_trylock()) 1777 break; 1778 mutex_unlock(&ipvs->sync_mutex); 1779 } 1780 1781 if (!ipvs->sync_state) { 1782 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1783 ipvs->threads_mask = count - 1; 1784 } else 1785 count = ipvs->threads_mask + 1; 1786 1787 if (c->mcast_af == AF_UNSPEC) { 1788 c->mcast_af = AF_INET; 1789 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1790 } 1791 if (!c->mcast_port) 1792 c->mcast_port = IP_VS_SYNC_PORT; 1793 if (!c->mcast_ttl) 1794 c->mcast_ttl = 1; 1795 1796 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1797 if (!dev) { 1798 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1799 result = -ENODEV; 1800 goto out_early; 1801 } 1802 hlen = (AF_INET6 == c->mcast_af) ? 
1803 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1804 sizeof(struct iphdr) + sizeof(struct udphdr); 1805 mtu = (state == IP_VS_STATE_BACKUP) ? 1806 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1807 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1808 1809 if (c->sync_maxlen) 1810 c->sync_maxlen = clamp_t(unsigned int, 1811 c->sync_maxlen, min_mtu, 1812 65535 - hlen); 1813 else 1814 c->sync_maxlen = mtu - hlen; 1815 1816 if (state == IP_VS_STATE_MASTER) { 1817 result = -EEXIST; 1818 if (ipvs->ms) 1819 goto out_early; 1820 1821 ipvs->mcfg = *c; 1822 name = "ipvs-m:%d:%d"; 1823 threadfn = sync_thread_master; 1824 } else if (state == IP_VS_STATE_BACKUP) { 1825 result = -EEXIST; 1826 if (ipvs->backup_tinfo) 1827 goto out_early; 1828 1829 ipvs->bcfg = *c; 1830 name = "ipvs-b:%d:%d"; 1831 threadfn = sync_thread_backup; 1832 } else { 1833 result = -EINVAL; 1834 goto out_early; 1835 } 1836 1837 if (state == IP_VS_STATE_MASTER) { 1838 struct ipvs_master_sync_state *ms; 1839 1840 result = -ENOMEM; 1841 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1842 if (!ipvs->ms) 1843 goto out; 1844 ms = ipvs->ms; 1845 for (id = 0; id < count; id++, ms++) { 1846 INIT_LIST_HEAD(&ms->sync_queue); 1847 ms->sync_queue_len = 0; 1848 ms->sync_queue_delay = 0; 1849 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1850 master_wakeup_work_handler); 1851 ms->ipvs = ipvs; 1852 } 1853 } 1854 result = -ENOMEM; 1855 ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), 1856 GFP_KERNEL); 1857 if (!ti) 1858 goto out; 1859 1860 for (id = 0; id < count; id++) { 1861 tinfo = &ti[id]; 1862 tinfo->ipvs = ipvs; 1863 if (state == IP_VS_STATE_BACKUP) { 1864 result = -ENOMEM; 1865 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1866 GFP_KERNEL); 1867 if (!tinfo->buf) 1868 goto out; 1869 } 1870 tinfo->id = id; 1871 if (state == IP_VS_STATE_MASTER) 1872 result = make_send_sock(ipvs, id, dev, &tinfo->sock); 1873 else 1874 result = make_receive_sock(ipvs, id, dev, &tinfo->sock); 1875 if (result < 0) 
1876 goto out; 1877 1878 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1879 if (IS_ERR(task)) { 1880 result = PTR_ERR(task); 1881 goto out; 1882 } 1883 tinfo->task = task; 1884 } 1885 1886 /* mark as active */ 1887 1888 if (state == IP_VS_STATE_MASTER) 1889 ipvs->master_tinfo = ti; 1890 else 1891 ipvs->backup_tinfo = ti; 1892 spin_lock_bh(&ipvs->sync_buff_lock); 1893 ipvs->sync_state |= state; 1894 spin_unlock_bh(&ipvs->sync_buff_lock); 1895 1896 mutex_unlock(&ipvs->sync_mutex); 1897 rtnl_unlock(); 1898 1899 return 0; 1900 1901 out: 1902 /* We do not need RTNL lock anymore, release it here so that 1903 * sock_release below can use rtnl_lock to leave the mcast group. 1904 */ 1905 rtnl_unlock(); 1906 id = min(id, count - 1); 1907 if (ti) { 1908 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1909 if (tinfo->task) 1910 kthread_stop(tinfo->task); 1911 } 1912 } 1913 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1914 kfree(ipvs->ms); 1915 ipvs->ms = NULL; 1916 } 1917 mutex_unlock(&ipvs->sync_mutex); 1918 1919 /* No more mutexes, release socks */ 1920 if (ti) { 1921 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1922 if (tinfo->sock) 1923 sock_release(tinfo->sock); 1924 kfree(tinfo->buf); 1925 } 1926 kfree(ti); 1927 } 1928 1929 /* decrease the module use count */ 1930 ip_vs_use_count_dec(); 1931 return result; 1932 1933 out_early: 1934 mutex_unlock(&ipvs->sync_mutex); 1935 rtnl_unlock(); 1936 1937 /* decrease the module use count */ 1938 ip_vs_use_count_dec(); 1939 return result; 1940 } 1941 1942 1943 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1944 { 1945 struct ip_vs_sync_thread_data *ti, *tinfo; 1946 int id; 1947 int retc = -EINVAL; 1948 1949 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1950 1951 mutex_lock(&ipvs->sync_mutex); 1952 if (state == IP_VS_STATE_MASTER) { 1953 retc = -ESRCH; 1954 if (!ipvs->ms) 1955 goto err; 1956 ti = ipvs->master_tinfo; 1957 1958 /* 1959 * The lock synchronizes with sb_queue_tail(), so that we 
don't 1960 * add sync buffers to the queue, when we are already in 1961 * progress of stopping the master sync daemon. 1962 */ 1963 1964 spin_lock_bh(&ipvs->sync_buff_lock); 1965 spin_lock(&ipvs->sync_lock); 1966 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1967 spin_unlock(&ipvs->sync_lock); 1968 spin_unlock_bh(&ipvs->sync_buff_lock); 1969 1970 retc = 0; 1971 for (id = ipvs->threads_mask; id >= 0; id--) { 1972 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1973 int ret; 1974 1975 tinfo = &ti[id]; 1976 pr_info("stopping master sync thread %d ...\n", 1977 task_pid_nr(tinfo->task)); 1978 cancel_delayed_work_sync(&ms->master_wakeup_work); 1979 ret = kthread_stop(tinfo->task); 1980 if (retc >= 0) 1981 retc = ret; 1982 } 1983 kfree(ipvs->ms); 1984 ipvs->ms = NULL; 1985 ipvs->master_tinfo = NULL; 1986 } else if (state == IP_VS_STATE_BACKUP) { 1987 retc = -ESRCH; 1988 if (!ipvs->backup_tinfo) 1989 goto err; 1990 ti = ipvs->backup_tinfo; 1991 1992 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1993 retc = 0; 1994 for (id = ipvs->threads_mask; id >= 0; id--) { 1995 int ret; 1996 1997 tinfo = &ti[id]; 1998 pr_info("stopping backup sync thread %d ...\n", 1999 task_pid_nr(tinfo->task)); 2000 ret = kthread_stop(tinfo->task); 2001 if (retc >= 0) 2002 retc = ret; 2003 } 2004 ipvs->backup_tinfo = NULL; 2005 } else { 2006 goto err; 2007 } 2008 id = ipvs->threads_mask; 2009 mutex_unlock(&ipvs->sync_mutex); 2010 2011 /* No more mutexes, release socks */ 2012 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 2013 if (tinfo->sock) 2014 sock_release(tinfo->sock); 2015 kfree(tinfo->buf); 2016 } 2017 kfree(ti); 2018 2019 /* decrease the module use count */ 2020 ip_vs_use_count_dec(); 2021 return retc; 2022 2023 err: 2024 mutex_unlock(&ipvs->sync_mutex); 2025 return retc; 2026 } 2027 2028 /* 2029 * Initialize data struct for each netns 2030 */ 2031 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2032 { 2033 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 2034 
spin_lock_init(&ipvs->sync_lock); 2035 spin_lock_init(&ipvs->sync_buff_lock); 2036 return 0; 2037 } 2038 2039 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2040 { 2041 int retc; 2042 2043 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2044 if (retc && retc != -ESRCH) 2045 pr_err("Failed to stop Master Daemon\n"); 2046 2047 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2048 if (retc && retc != -ESRCH) 2049 pr_err("Failed to stop Backup Daemon\n"); 2050 } 2051