// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
33 */ 34 35 #define KMSG_COMPONENT "IPVS" 36 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 37 38 #include <linux/module.h> 39 #include <linux/slab.h> 40 #include <linux/inetdevice.h> 41 #include <linux/net.h> 42 #include <linux/completion.h> 43 #include <linux/delay.h> 44 #include <linux/skbuff.h> 45 #include <linux/in.h> 46 #include <linux/igmp.h> /* for ip_mc_join_group */ 47 #include <linux/udp.h> 48 #include <linux/err.h> 49 #include <linux/kthread.h> 50 #include <linux/wait.h> 51 #include <linux/kernel.h> 52 #include <linux/sched/signal.h> 53 54 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ 55 56 #include <net/ip.h> 57 #include <net/sock.h> 58 59 #include <net/ip_vs.h> 60 61 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 62 #define IP_VS_SYNC_PORT 8848 /* multicast port */ 63 64 #define SYNC_PROTO_VER 1 /* Protocol version in header */ 65 66 static struct lock_class_key __ipvs_sync_key; 67 /* 68 * IPVS sync connection entry 69 * Version 0, i.e. original version. 70 */ 71 struct ip_vs_sync_conn_v0 { 72 __u8 reserved; 73 74 /* Protocol, addresses and port numbers */ 75 __u8 protocol; /* Which protocol (TCP/UDP) */ 76 __be16 cport; 77 __be16 vport; 78 __be16 dport; 79 __be32 caddr; /* client address */ 80 __be32 vaddr; /* virtual address */ 81 __be32 daddr; /* destination address */ 82 83 /* Flags and state transition */ 84 __be16 flags; /* status flags */ 85 __be16 state; /* state info */ 86 87 /* The sequence options start here */ 88 }; 89 90 struct ip_vs_sync_conn_options { 91 struct ip_vs_seq in_seq; /* incoming seq. struct */ 92 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 93 }; 94 95 /* 96 Sync Connection format (sync_conn) 97 98 0 1 2 3 99 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 100 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101 | Type | Protocol | Ver. 
| Size | 102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 103 | Flags | 104 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 105 | State | cport | 106 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 107 | vport | dport | 108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 109 | fwmark | 110 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 111 | timeout (in sec.) | 112 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 113 | ... | 114 | IP-Addresses (v4 or v6) | 115 | ... | 116 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 117 Optional Parameters. 118 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 119 | Param. Type | Param. Length | Param. data | 120 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 121 | ... | 122 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 123 | | Param Type | Param. Length | 124 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 125 | Param data | 126 | Last Param data should be padded for 32 bit alignment | 127 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 128 */ 129 130 /* 131 * Type 0, IPv4 sync connection format 132 */ 133 struct ip_vs_sync_v4 { 134 __u8 type; 135 __u8 protocol; /* Which protocol (TCP/UDP) */ 136 __be16 ver_size; /* Version msb 4 bits */ 137 /* Flags and state transition */ 138 __be32 flags; /* status flags */ 139 __be16 state; /* state info */ 140 /* Protocol, addresses and port numbers */ 141 __be16 cport; 142 __be16 vport; 143 __be16 dport; 144 __be32 fwmark; /* Firewall mark from skb */ 145 __be32 timeout; /* cp timeout */ 146 __be32 caddr; /* client address */ 147 __be32 vaddr; /* virtual address */ 148 __be32 daddr; /* destination address */ 149 /* The sequence options start here */ 150 /* PE data padded to 32bit alignment after seq. 
options */ 151 }; 152 /* 153 * Type 2 messages IPv6 154 */ 155 struct ip_vs_sync_v6 { 156 __u8 type; 157 __u8 protocol; /* Which protocol (TCP/UDP) */ 158 __be16 ver_size; /* Version msb 4 bits */ 159 /* Flags and state transition */ 160 __be32 flags; /* status flags */ 161 __be16 state; /* state info */ 162 /* Protocol, addresses and port numbers */ 163 __be16 cport; 164 __be16 vport; 165 __be16 dport; 166 __be32 fwmark; /* Firewall mark from skb */ 167 __be32 timeout; /* cp timeout */ 168 struct in6_addr caddr; /* client address */ 169 struct in6_addr vaddr; /* virtual address */ 170 struct in6_addr daddr; /* destination address */ 171 /* The sequence options start here */ 172 /* PE data padded to 32bit alignment after seq. options */ 173 }; 174 175 union ip_vs_sync_conn { 176 struct ip_vs_sync_v4 v4; 177 struct ip_vs_sync_v6 v6; 178 }; 179 180 /* Bits in Type field in above */ 181 #define STYPE_INET6 0 182 #define STYPE_F_INET6 (1 << STYPE_INET6) 183 184 #define SVER_SHIFT 12 /* Shift to get version */ 185 #define SVER_MASK 0x0fff /* Mask to strip version */ 186 187 #define IPVS_OPT_SEQ_DATA 1 188 #define IPVS_OPT_PE_DATA 2 189 #define IPVS_OPT_PE_NAME 3 190 #define IPVS_OPT_PARAM 7 191 192 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) 193 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) 194 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) 195 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) 196 197 struct ip_vs_sync_thread_data { 198 struct netns_ipvs *ipvs; 199 struct socket *sock; 200 char *buf; 201 int id; 202 }; 203 204 /* Version 0 definition of packet sizes */ 205 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) 206 #define FULL_CONN_SIZE \ 207 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) 208 209 210 /* 211 The master mulitcasts messages (Datagrams) to the backup load balancers 212 in the following format. 
213 214 Version 1: 215 Note, first byte should be Zero, so ver 0 receivers will drop the packet. 216 217 0 1 2 3 218 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 220 | 0 | SyncID | Size | 221 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 222 | Count Conns | Version | Reserved, set to Zero | 223 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 224 | | 225 | IPVS Sync Connection (1) | 226 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 227 | . | 228 ~ . ~ 229 | . | 230 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 231 | | 232 | IPVS Sync Connection (n) | 233 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 234 235 Version 0 Header 236 0 1 2 3 237 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 238 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 239 | Count Conns | SyncID | Size | 240 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 241 | IPVS Sync Connection (1) | 242 */ 243 244 #define SYNC_MESG_HEADER_LEN 4 245 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ 246 247 /* Version 0 header */ 248 struct ip_vs_sync_mesg_v0 { 249 __u8 nr_conns; 250 __u8 syncid; 251 __be16 size; 252 253 /* ip_vs_sync_conn entries start here */ 254 }; 255 256 /* Version 1 header */ 257 struct ip_vs_sync_mesg { 258 __u8 reserved; /* must be zero */ 259 __u8 syncid; 260 __be16 size; 261 __u8 nr_conns; 262 __s8 version; /* SYNC_PROTO_VER */ 263 __u16 spare; 264 /* ip_vs_sync_conn entries start here */ 265 }; 266 267 union ipvs_sockaddr { 268 struct sockaddr_in in; 269 struct sockaddr_in6 in6; 270 }; 271 272 struct ip_vs_sync_buff { 273 struct list_head list; 274 unsigned long firstuse; 275 276 /* pointers for the message data */ 277 struct ip_vs_sync_mesg *mesg; 278 unsigned char *head; 279 unsigned char *end; 280 }; 281 282 /* 283 * 
Copy of struct ip_vs_seq 284 * From unaligned network order to aligned host order 285 */ 286 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) 287 { 288 memset(ho, 0, sizeof(*ho)); 289 ho->init_seq = get_unaligned_be32(&no->init_seq); 290 ho->delta = get_unaligned_be32(&no->delta); 291 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 292 } 293 294 /* 295 * Copy of struct ip_vs_seq 296 * From Aligned host order to unaligned network order 297 */ 298 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 299 { 300 put_unaligned_be32(ho->init_seq, &no->init_seq); 301 put_unaligned_be32(ho->delta, &no->delta); 302 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 303 } 304 305 static inline struct ip_vs_sync_buff * 306 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 307 { 308 struct ip_vs_sync_buff *sb; 309 310 spin_lock_bh(&ipvs->sync_lock); 311 if (list_empty(&ms->sync_queue)) { 312 sb = NULL; 313 __set_current_state(TASK_INTERRUPTIBLE); 314 } else { 315 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 316 list); 317 list_del(&sb->list); 318 ms->sync_queue_len--; 319 if (!ms->sync_queue_len) 320 ms->sync_queue_delay = 0; 321 } 322 spin_unlock_bh(&ipvs->sync_lock); 323 324 return sb; 325 } 326 327 /* 328 * Create a new sync buffer for Version 1 proto. 329 */ 330 static inline struct ip_vs_sync_buff * 331 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 332 { 333 struct ip_vs_sync_buff *sb; 334 335 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 336 return NULL; 337 338 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 339 ipvs->mcfg.sync_maxlen); 340 sb->mesg = kmalloc(len, GFP_ATOMIC); 341 if (!sb->mesg) { 342 kfree(sb); 343 return NULL; 344 } 345 sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 346 sb->mesg->version = SYNC_PROTO_VER; 347 sb->mesg->syncid = ipvs->mcfg.syncid; 348 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 349 sb->mesg->nr_conns = 0; 350 sb->mesg->spare = 0; 351 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 352 sb->end = (unsigned char *)sb->mesg + len; 353 354 sb->firstuse = jiffies; 355 return sb; 356 } 357 358 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 359 { 360 kfree(sb->mesg); 361 kfree(sb); 362 } 363 364 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 365 struct ipvs_master_sync_state *ms) 366 { 367 struct ip_vs_sync_buff *sb = ms->sync_buff; 368 369 spin_lock(&ipvs->sync_lock); 370 if (ipvs->sync_state & IP_VS_STATE_MASTER && 371 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 372 if (!ms->sync_queue_len) 373 schedule_delayed_work(&ms->master_wakeup_work, 374 max(IPVS_SYNC_SEND_DELAY, 1)); 375 ms->sync_queue_len++; 376 list_add_tail(&sb->list, &ms->sync_queue); 377 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) 378 wake_up_process(ms->master_thread); 379 } else 380 ip_vs_sync_buff_release(sb); 381 spin_unlock(&ipvs->sync_lock); 382 } 383 384 /* 385 * Get the current sync buffer if it has been created for more 386 * than the specified time or the specified time is zero. 
387 */ 388 static inline struct ip_vs_sync_buff * 389 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 390 unsigned long time) 391 { 392 struct ip_vs_sync_buff *sb; 393 394 spin_lock_bh(&ipvs->sync_buff_lock); 395 sb = ms->sync_buff; 396 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 397 ms->sync_buff = NULL; 398 __set_current_state(TASK_RUNNING); 399 } else 400 sb = NULL; 401 spin_unlock_bh(&ipvs->sync_buff_lock); 402 return sb; 403 } 404 405 static inline int 406 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 407 { 408 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 409 } 410 411 /* 412 * Create a new sync buffer for Version 0 proto. 413 */ 414 static inline struct ip_vs_sync_buff * 415 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 416 { 417 struct ip_vs_sync_buff *sb; 418 struct ip_vs_sync_mesg_v0 *mesg; 419 420 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 421 return NULL; 422 423 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 424 ipvs->mcfg.sync_maxlen); 425 sb->mesg = kmalloc(len, GFP_ATOMIC); 426 if (!sb->mesg) { 427 kfree(sb); 428 return NULL; 429 } 430 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 431 mesg->nr_conns = 0; 432 mesg->syncid = ipvs->mcfg.syncid; 433 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 434 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 435 sb->end = (unsigned char *)mesg + len; 436 sb->firstuse = jiffies; 437 return sb; 438 } 439 440 /* Check if connection is controlled by persistence */ 441 static inline bool in_persistence(struct ip_vs_conn *cp) 442 { 443 for (cp = cp->control; cp; cp = cp->control) { 444 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 445 return true; 446 } 447 return false; 448 } 449 450 /* Check if conn should be synced. 451 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 452 * - (1) sync_refresh_period: reduce sync rate. 
Additionally, retry 453 * sync_retries times with period of sync_refresh_period/8 454 * - (2) if both sync_refresh_period and sync_period are 0 send sync only 455 * for state changes or only once when pkts matches sync_threshold 456 * - (3) templates: rate can be reduced only with sync_refresh_period or 457 * with (2) 458 */ 459 static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, 460 struct ip_vs_conn *cp, int pkts) 461 { 462 unsigned long orig = READ_ONCE(cp->sync_endtime); 463 unsigned long now = jiffies; 464 unsigned long n = (now + cp->timeout) & ~3UL; 465 unsigned int sync_refresh_period; 466 int sync_period; 467 int force; 468 469 /* Check if we sync in current state */ 470 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) 471 force = 0; 472 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) 473 return 0; 474 else if (likely(cp->protocol == IPPROTO_TCP)) { 475 if (!((1 << cp->state) & 476 ((1 << IP_VS_TCP_S_ESTABLISHED) | 477 (1 << IP_VS_TCP_S_FIN_WAIT) | 478 (1 << IP_VS_TCP_S_CLOSE) | 479 (1 << IP_VS_TCP_S_CLOSE_WAIT) | 480 (1 << IP_VS_TCP_S_TIME_WAIT)))) 481 return 0; 482 force = cp->state != cp->old_state; 483 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) 484 goto set; 485 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { 486 if (!((1 << cp->state) & 487 ((1 << IP_VS_SCTP_S_ESTABLISHED) | 488 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | 489 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | 490 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | 491 (1 << IP_VS_SCTP_S_CLOSED)))) 492 return 0; 493 force = cp->state != cp->old_state; 494 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) 495 goto set; 496 } else { 497 /* UDP or another protocol with single state */ 498 force = 0; 499 } 500 501 sync_refresh_period = sysctl_sync_refresh_period(ipvs); 502 if (sync_refresh_period > 0) { 503 long diff = n - orig; 504 long min_diff = max(cp->timeout >> 1, 10UL * HZ); 505 506 /* Avoid sync if difference is below sync_refresh_period 507 * and below the half 
timeout. 508 */ 509 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { 510 int retries = orig & 3; 511 512 if (retries >= sysctl_sync_retries(ipvs)) 513 return 0; 514 if (time_before(now, orig - cp->timeout + 515 (sync_refresh_period >> 3))) 516 return 0; 517 n |= retries + 1; 518 } 519 } 520 sync_period = sysctl_sync_period(ipvs); 521 if (sync_period > 0) { 522 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && 523 pkts % sync_period != sysctl_sync_threshold(ipvs)) 524 return 0; 525 } else if (!sync_refresh_period && 526 pkts != sysctl_sync_threshold(ipvs)) 527 return 0; 528 529 set: 530 cp->old_state = cp->state; 531 n = cmpxchg(&cp->sync_endtime, orig, n); 532 return n == orig || force; 533 } 534 535 /* 536 * Version 0 , could be switched in by sys_ctl. 537 * Add an ip_vs_conn information into the current sync_buff. 538 */ 539 static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, 540 int pkts) 541 { 542 struct ip_vs_sync_mesg_v0 *m; 543 struct ip_vs_sync_conn_v0 *s; 544 struct ip_vs_sync_buff *buff; 545 struct ipvs_master_sync_state *ms; 546 int id; 547 unsigned int len; 548 549 if (unlikely(cp->af != AF_INET)) 550 return; 551 /* Do not sync ONE PACKET */ 552 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 553 return; 554 555 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 556 return; 557 558 spin_lock_bh(&ipvs->sync_buff_lock); 559 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 560 spin_unlock_bh(&ipvs->sync_buff_lock); 561 return; 562 } 563 564 id = select_master_thread_id(ipvs, cp); 565 ms = &ipvs->ms[id]; 566 buff = ms->sync_buff; 567 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : 568 SIMPLE_CONN_SIZE; 569 if (buff) { 570 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 571 /* Send buffer if it is for v1 */ 572 if (buff->head + len > buff->end || !m->nr_conns) { 573 sb_queue_tail(ipvs, ms); 574 ms->sync_buff = NULL; 575 buff = NULL; 576 } 577 } 578 if (!buff) { 579 buff = ip_vs_sync_buff_create_v0(ipvs, len); 580 if (!buff) { 581 spin_unlock_bh(&ipvs->sync_buff_lock); 582 pr_err("ip_vs_sync_buff_create failed.\n"); 583 return; 584 } 585 ms->sync_buff = buff; 586 } 587 588 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 589 s = (struct ip_vs_sync_conn_v0 *) buff->head; 590 591 /* copy members */ 592 s->reserved = 0; 593 s->protocol = cp->protocol; 594 s->cport = cp->cport; 595 s->vport = cp->vport; 596 s->dport = cp->dport; 597 s->caddr = cp->caddr.ip; 598 s->vaddr = cp->vaddr.ip; 599 s->daddr = cp->daddr.ip; 600 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 601 s->state = htons(cp->state); 602 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 603 struct ip_vs_sync_conn_options *opt = 604 (struct ip_vs_sync_conn_options *)&s[1]; 605 memcpy(opt, &cp->in_seq, sizeof(*opt)); 606 } 607 608 m->nr_conns++; 609 m->size = htons(ntohs(m->size) + len); 610 buff->head += len; 611 spin_unlock_bh(&ipvs->sync_buff_lock); 612 613 /* synchronize its controller if it has */ 614 cp = cp->control; 615 if (cp) { 616 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 617 pkts = atomic_add_return(1, &cp->in_pkts); 618 else 619 pkts = sysctl_sync_threshold(ipvs); 620 ip_vs_sync_conn(ipvs, cp, pkts); 621 } 622 } 623 624 /* 625 * Add an ip_vs_conn information into the current sync_buff. 626 * Called by ip_vs_in. 
627 * Sending Version 1 messages 628 */ 629 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 630 { 631 struct ip_vs_sync_mesg *m; 632 union ip_vs_sync_conn *s; 633 struct ip_vs_sync_buff *buff; 634 struct ipvs_master_sync_state *ms; 635 int id; 636 __u8 *p; 637 unsigned int len, pe_name_len, pad; 638 639 /* Handle old version of the protocol */ 640 if (sysctl_sync_ver(ipvs) == 0) { 641 ip_vs_sync_conn_v0(ipvs, cp, pkts); 642 return; 643 } 644 /* Do not sync ONE PACKET */ 645 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 646 goto control; 647 sloop: 648 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 649 goto control; 650 651 /* Sanity checks */ 652 pe_name_len = 0; 653 if (cp->pe_data_len) { 654 if (!cp->pe_data || !cp->dest) { 655 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 656 return; 657 } 658 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 659 } 660 661 spin_lock_bh(&ipvs->sync_buff_lock); 662 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 663 spin_unlock_bh(&ipvs->sync_buff_lock); 664 return; 665 } 666 667 id = select_master_thread_id(ipvs, cp); 668 ms = &ipvs->ms[id]; 669 670 #ifdef CONFIG_IP_VS_IPV6 671 if (cp->af == AF_INET6) 672 len = sizeof(struct ip_vs_sync_v6); 673 else 674 #endif 675 len = sizeof(struct ip_vs_sync_v4); 676 677 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 678 len += sizeof(struct ip_vs_sync_conn_options) + 2; 679 680 if (cp->pe_data_len) 681 len += cp->pe_data_len + 2; /* + Param hdr field */ 682 if (pe_name_len) 683 len += pe_name_len + 2; 684 685 /* check if there is a space for this one */ 686 pad = 0; 687 buff = ms->sync_buff; 688 if (buff) { 689 m = buff->mesg; 690 pad = (4 - (size_t) buff->head) & 3; 691 /* Send buffer if it is for v0 */ 692 if (buff->head + len + pad > buff->end || m->reserved) { 693 sb_queue_tail(ipvs, ms); 694 ms->sync_buff = NULL; 695 buff = NULL; 696 pad = 0; 697 } 698 } 699 700 if (!buff) { 701 buff = ip_vs_sync_buff_create(ipvs, len); 702 if (!buff) { 703 
spin_unlock_bh(&ipvs->sync_buff_lock); 704 pr_err("ip_vs_sync_buff_create failed.\n"); 705 return; 706 } 707 ms->sync_buff = buff; 708 m = buff->mesg; 709 } 710 711 p = buff->head; 712 buff->head += pad + len; 713 m->size = htons(ntohs(m->size) + pad + len); 714 /* Add ev. padding from prev. sync_conn */ 715 while (pad--) 716 *(p++) = 0; 717 718 s = (union ip_vs_sync_conn *)p; 719 720 /* Set message type & copy members */ 721 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 722 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 723 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 724 s->v4.state = htons(cp->state); 725 s->v4.protocol = cp->protocol; 726 s->v4.cport = cp->cport; 727 s->v4.vport = cp->vport; 728 s->v4.dport = cp->dport; 729 s->v4.fwmark = htonl(cp->fwmark); 730 s->v4.timeout = htonl(cp->timeout / HZ); 731 m->nr_conns++; 732 733 #ifdef CONFIG_IP_VS_IPV6 734 if (cp->af == AF_INET6) { 735 p += sizeof(struct ip_vs_sync_v6); 736 s->v6.caddr = cp->caddr.in6; 737 s->v6.vaddr = cp->vaddr.in6; 738 s->v6.daddr = cp->daddr.in6; 739 } else 740 #endif 741 { 742 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 743 s->v4.caddr = cp->caddr.ip; 744 s->v4.vaddr = cp->vaddr.ip; 745 s->v4.daddr = cp->daddr.ip; 746 } 747 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 748 *(p++) = IPVS_OPT_SEQ_DATA; 749 *(p++) = sizeof(struct ip_vs_sync_conn_options); 750 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 751 p += sizeof(struct ip_vs_seq); 752 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 753 p += sizeof(struct ip_vs_seq); 754 } 755 /* Handle pe data */ 756 if (cp->pe_data_len && cp->pe_data) { 757 *(p++) = IPVS_OPT_PE_DATA; 758 *(p++) = cp->pe_data_len; 759 memcpy(p, cp->pe_data, cp->pe_data_len); 760 p += cp->pe_data_len; 761 if (pe_name_len) { 762 /* Add PE_NAME */ 763 *(p++) = IPVS_OPT_PE_NAME; 764 *(p++) = pe_name_len; 765 memcpy(p, cp->pe->name, pe_name_len); 766 p += pe_name_len; 767 } 768 } 769 770 spin_unlock_bh(&ipvs->sync_buff_lock); 771 772 
control: 773 /* synchronize its controller if it has */ 774 cp = cp->control; 775 if (!cp) 776 return; 777 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 778 pkts = atomic_add_return(1, &cp->in_pkts); 779 else 780 pkts = sysctl_sync_threshold(ipvs); 781 goto sloop; 782 } 783 784 /* 785 * fill_param used by version 1 786 */ 787 static inline int 788 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 789 struct ip_vs_conn_param *p, 790 __u8 *pe_data, unsigned int pe_data_len, 791 __u8 *pe_name, unsigned int pe_name_len) 792 { 793 #ifdef CONFIG_IP_VS_IPV6 794 if (af == AF_INET6) 795 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 796 (const union nf_inet_addr *)&sc->v6.caddr, 797 sc->v6.cport, 798 (const union nf_inet_addr *)&sc->v6.vaddr, 799 sc->v6.vport, p); 800 else 801 #endif 802 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 803 (const union nf_inet_addr *)&sc->v4.caddr, 804 sc->v4.cport, 805 (const union nf_inet_addr *)&sc->v4.vaddr, 806 sc->v4.vport, p); 807 /* Handle pe data */ 808 if (pe_data_len) { 809 if (pe_name_len) { 810 char buff[IP_VS_PENAME_MAXLEN+1]; 811 812 memcpy(buff, pe_name, pe_name_len); 813 buff[pe_name_len]=0; 814 p->pe = __ip_vs_pe_getbyname(buff); 815 if (!p->pe) { 816 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 817 buff); 818 return 1; 819 } 820 } else { 821 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 822 return 1; 823 } 824 825 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 826 if (!p->pe_data) { 827 module_put(p->pe->module); 828 return -ENOMEM; 829 } 830 p->pe_data_len = pe_data_len; 831 } 832 return 0; 833 } 834 835 /* 836 * Connection Add / Update. 837 * Common for version 0 and 1 reception of backup sync_conns. 838 * Param: ... 839 * timeout is in sec. 
840 */ 841 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 842 unsigned int flags, unsigned int state, 843 unsigned int protocol, unsigned int type, 844 const union nf_inet_addr *daddr, __be16 dport, 845 unsigned long timeout, __u32 fwmark, 846 struct ip_vs_sync_conn_options *opt) 847 { 848 struct ip_vs_dest *dest; 849 struct ip_vs_conn *cp; 850 851 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 852 cp = ip_vs_conn_in_get(param); 853 if (cp && ((cp->dport != dport) || 854 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 855 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 856 ip_vs_conn_expire_now(cp); 857 __ip_vs_conn_put(cp); 858 cp = NULL; 859 } else { 860 /* This is the expiration message for the 861 * connection that was already replaced, so we 862 * just ignore it. 863 */ 864 __ip_vs_conn_put(cp); 865 kfree(param->pe_data); 866 return; 867 } 868 } 869 } else { 870 cp = ip_vs_ct_in_get(param); 871 } 872 873 if (cp) { 874 /* Free pe_data */ 875 kfree(param->pe_data); 876 877 dest = cp->dest; 878 spin_lock_bh(&cp->lock); 879 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 880 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 881 if (flags & IP_VS_CONN_F_INACTIVE) { 882 atomic_dec(&dest->activeconns); 883 atomic_inc(&dest->inactconns); 884 } else { 885 atomic_inc(&dest->activeconns); 886 atomic_dec(&dest->inactconns); 887 } 888 } 889 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 890 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 891 cp->flags = flags; 892 spin_unlock_bh(&cp->lock); 893 if (!dest) 894 ip_vs_try_bind_dest(cp); 895 } else { 896 /* 897 * Find the appropriate destination for the connection. 898 * If it is not found the connection will remain unbound 899 * but still handled. 900 */ 901 rcu_read_lock(); 902 /* This function is only invoked by the synchronization 903 * code. 
We do not currently support heterogeneous pools 904 * with synchronization, so we can make the assumption that 905 * the svc_af is the same as the dest_af 906 */ 907 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 908 param->vaddr, param->vport, protocol, 909 fwmark, flags); 910 911 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 912 fwmark); 913 rcu_read_unlock(); 914 if (!cp) { 915 kfree(param->pe_data); 916 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 917 return; 918 } 919 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 920 kfree(param->pe_data); 921 } 922 923 if (opt) { 924 cp->in_seq = opt->in_seq; 925 cp->out_seq = opt->out_seq; 926 } 927 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 928 cp->state = state; 929 cp->old_state = cp->state; 930 /* 931 * For Ver 0 messages style 932 * - Not possible to recover the right timeout for templates 933 * - can not find the right fwmark 934 * virtual service. If needed, we can do it for 935 * non-fwmark persistent services. 936 * Ver 1 messages style. 937 * - No problem. 
938 */ 939 if (timeout) { 940 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 941 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 942 cp->timeout = timeout*HZ; 943 } else { 944 struct ip_vs_proto_data *pd; 945 946 pd = ip_vs_proto_data_get(ipvs, protocol); 947 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 948 cp->timeout = pd->timeout_table[state]; 949 else 950 cp->timeout = (3*60*HZ); 951 } 952 ip_vs_conn_put(cp); 953 } 954 955 /* 956 * Process received multicast message for Version 0 957 */ 958 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 959 const size_t buflen) 960 { 961 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 962 struct ip_vs_sync_conn_v0 *s; 963 struct ip_vs_sync_conn_options *opt; 964 struct ip_vs_protocol *pp; 965 struct ip_vs_conn_param param; 966 char *p; 967 int i; 968 969 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 970 for (i=0; i<m->nr_conns; i++) { 971 unsigned int flags, state; 972 973 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 974 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 975 return; 976 } 977 s = (struct ip_vs_sync_conn_v0 *) p; 978 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 979 flags &= ~IP_VS_CONN_F_HASHED; 980 if (flags & IP_VS_CONN_F_SEQ_MASK) { 981 opt = (struct ip_vs_sync_conn_options *)&s[1]; 982 p += FULL_CONN_SIZE; 983 if (p > buffer+buflen) { 984 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 985 return; 986 } 987 } else { 988 opt = NULL; 989 p += SIMPLE_CONN_SIZE; 990 } 991 992 state = ntohs(s->state); 993 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 994 pp = ip_vs_proto_get(s->protocol); 995 if (!pp) { 996 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 997 s->protocol); 998 continue; 999 } 1000 if (state >= pp->num_states) { 1001 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1002 pp->name, state); 1003 continue; 1004 } 1005 } else { 1006 if (state >= IP_VS_CTPL_S_LAST) 1007 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", 1008 state); 
1009 } 1010 1011 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1012 (const union nf_inet_addr *)&s->caddr, 1013 s->cport, 1014 (const union nf_inet_addr *)&s->vaddr, 1015 s->vport, ¶m); 1016 1017 /* Send timeout as Zero */ 1018 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1019 (union nf_inet_addr *)&s->daddr, s->dport, 1020 0, 0, opt); 1021 } 1022 } 1023 1024 /* 1025 * Handle options 1026 */ 1027 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1028 __u32 *opt_flags, 1029 struct ip_vs_sync_conn_options *opt) 1030 { 1031 struct ip_vs_sync_conn_options *topt; 1032 1033 topt = (struct ip_vs_sync_conn_options *)p; 1034 1035 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1036 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1037 return -EINVAL; 1038 } 1039 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1040 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1041 return -EINVAL; 1042 } 1043 ntoh_seq(&topt->in_seq, &opt->in_seq); 1044 ntoh_seq(&topt->out_seq, &opt->out_seq); 1045 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1046 return 0; 1047 } 1048 1049 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1050 __u8 **data, unsigned int maxlen, 1051 __u32 *opt_flags, __u32 flag) 1052 { 1053 if (plen > maxlen) { 1054 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1055 return -EINVAL; 1056 } 1057 if (*opt_flags & flag) { 1058 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1059 return -EINVAL; 1060 } 1061 *data_len = plen; 1062 *data = p; 1063 *opt_flags |= flag; 1064 return 0; 1065 } 1066 /* 1067 * Process a Version 1 sync. 
connection 1068 */ 1069 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1070 { 1071 struct ip_vs_sync_conn_options opt; 1072 union ip_vs_sync_conn *s; 1073 struct ip_vs_protocol *pp; 1074 struct ip_vs_conn_param param; 1075 __u32 flags; 1076 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1077 __u8 *pe_data=NULL, *pe_name=NULL; 1078 __u32 opt_flags=0; 1079 int retc=0; 1080 1081 s = (union ip_vs_sync_conn *) p; 1082 1083 if (s->v6.type & STYPE_F_INET6) { 1084 #ifdef CONFIG_IP_VS_IPV6 1085 af = AF_INET6; 1086 p += sizeof(struct ip_vs_sync_v6); 1087 #else 1088 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1089 retc = 10; 1090 goto out; 1091 #endif 1092 } else if (!s->v4.type) { 1093 af = AF_INET; 1094 p += sizeof(struct ip_vs_sync_v4); 1095 } else { 1096 return -10; 1097 } 1098 if (p > msg_end) 1099 return -20; 1100 1101 /* Process optional params check Type & Len. */ 1102 while (p < msg_end) { 1103 int ptype; 1104 int plen; 1105 1106 if (p+2 > msg_end) 1107 return -30; 1108 ptype = *(p++); 1109 plen = *(p++); 1110 1111 if (!plen || ((p + plen) > msg_end)) 1112 return -40; 1113 /* Handle seq option p = param data */ 1114 switch (ptype & ~IPVS_OPT_F_PARAM) { 1115 case IPVS_OPT_SEQ_DATA: 1116 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1117 return -50; 1118 break; 1119 1120 case IPVS_OPT_PE_DATA: 1121 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1122 IP_VS_PEDATA_MAXLEN, &opt_flags, 1123 IPVS_OPT_F_PE_DATA)) 1124 return -60; 1125 break; 1126 1127 case IPVS_OPT_PE_NAME: 1128 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1129 IP_VS_PENAME_MAXLEN, &opt_flags, 1130 IPVS_OPT_F_PE_NAME)) 1131 return -70; 1132 break; 1133 1134 default: 1135 /* Param data mandatory ? 
*/ 1136 if (!(ptype & IPVS_OPT_F_PARAM)) { 1137 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1138 ptype & ~IPVS_OPT_F_PARAM); 1139 retc = 20; 1140 goto out; 1141 } 1142 } 1143 p += plen; /* Next option */ 1144 } 1145 1146 /* Get flags and Mask off unsupported */ 1147 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1148 flags |= IP_VS_CONN_F_SYNC; 1149 state = ntohs(s->v4.state); 1150 1151 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1152 pp = ip_vs_proto_get(s->v4.protocol); 1153 if (!pp) { 1154 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1155 s->v4.protocol); 1156 retc = 30; 1157 goto out; 1158 } 1159 if (state >= pp->num_states) { 1160 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1161 pp->name, state); 1162 retc = 40; 1163 goto out; 1164 } 1165 } else { 1166 if (state >= IP_VS_CTPL_S_LAST) 1167 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", 1168 state); 1169 } 1170 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1171 pe_data_len, pe_name, pe_name_len)) { 1172 retc = 50; 1173 goto out; 1174 } 1175 /* If only IPv4, just silent skip IPv6 */ 1176 if (af == AF_INET) 1177 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1178 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1179 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1180 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1181 ); 1182 #ifdef CONFIG_IP_VS_IPV6 1183 else 1184 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1185 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1186 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1187 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1188 ); 1189 #endif 1190 ip_vs_pe_put(param.pe); 1191 return 0; 1192 /* Error exit */ 1193 out: 1194 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1195 return retc; 1196 1197 } 1198 /* 1199 * Process received multicast message and create the corresponding 1200 * ip_vs_conn entries. 
1201 * Handles Version 0 & 1 1202 */ 1203 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1204 const size_t buflen) 1205 { 1206 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1207 __u8 *p, *msg_end; 1208 int i, nr_conns; 1209 1210 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1211 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1212 return; 1213 } 1214 1215 if (buflen != ntohs(m2->size)) { 1216 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1217 return; 1218 } 1219 /* SyncID sanity check */ 1220 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1221 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1222 return; 1223 } 1224 /* Handle version 1 message */ 1225 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1226 && (m2->spare == 0)) { 1227 1228 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1229 nr_conns = m2->nr_conns; 1230 1231 for (i=0; i<nr_conns; i++) { 1232 union ip_vs_sync_conn *s; 1233 unsigned int size; 1234 int retc; 1235 1236 p = msg_end; 1237 if (p + sizeof(s->v4) > buffer+buflen) { 1238 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); 1239 return; 1240 } 1241 s = (union ip_vs_sync_conn *)p; 1242 size = ntohs(s->v4.ver_size) & SVER_MASK; 1243 msg_end = p + size; 1244 /* Basic sanity checks */ 1245 if (msg_end > buffer+buflen) { 1246 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1247 return; 1248 } 1249 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1250 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1251 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1252 return; 1253 } 1254 /* Process a single sync_conn */ 1255 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1256 if (retc < 0) { 1257 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1258 retc); 1259 return; 1260 } 1261 /* Make sure we have 32 bit alignment */ 1262 msg_end = p + ((size + 3) & ~3); 1263 } 1264 } else { 1265 /* Old type of message */ 1266 ip_vs_process_message_v0(ipvs, 
buffer, buflen); 1267 return; 1268 } 1269 } 1270 1271 1272 /* 1273 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1274 */ 1275 static void set_sock_size(struct sock *sk, int mode, int val) 1276 { 1277 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1278 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1279 lock_sock(sk); 1280 if (mode) { 1281 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1282 sysctl_wmem_max); 1283 sk->sk_sndbuf = val * 2; 1284 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1285 } else { 1286 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1287 sysctl_rmem_max); 1288 sk->sk_rcvbuf = val * 2; 1289 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1290 } 1291 release_sock(sk); 1292 } 1293 1294 /* 1295 * Setup loopback of outgoing multicasts on a sending socket 1296 */ 1297 static void set_mcast_loop(struct sock *sk, u_char loop) 1298 { 1299 struct inet_sock *inet = inet_sk(sk); 1300 1301 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1302 lock_sock(sk); 1303 inet->mc_loop = loop ? 1 : 0; 1304 #ifdef CONFIG_IP_VS_IPV6 1305 if (sk->sk_family == AF_INET6) { 1306 struct ipv6_pinfo *np = inet6_sk(sk); 1307 1308 /* IPV6_MULTICAST_LOOP */ 1309 np->mc_loop = loop ? 
1 : 0; 1310 } 1311 #endif 1312 release_sock(sk); 1313 } 1314 1315 /* 1316 * Specify TTL for outgoing multicasts on a sending socket 1317 */ 1318 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1319 { 1320 struct inet_sock *inet = inet_sk(sk); 1321 1322 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1323 lock_sock(sk); 1324 inet->mc_ttl = ttl; 1325 #ifdef CONFIG_IP_VS_IPV6 1326 if (sk->sk_family == AF_INET6) { 1327 struct ipv6_pinfo *np = inet6_sk(sk); 1328 1329 /* IPV6_MULTICAST_HOPS */ 1330 np->mcast_hops = ttl; 1331 } 1332 #endif 1333 release_sock(sk); 1334 } 1335 1336 /* Control fragmentation of messages */ 1337 static void set_mcast_pmtudisc(struct sock *sk, int val) 1338 { 1339 struct inet_sock *inet = inet_sk(sk); 1340 1341 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1342 lock_sock(sk); 1343 inet->pmtudisc = val; 1344 #ifdef CONFIG_IP_VS_IPV6 1345 if (sk->sk_family == AF_INET6) { 1346 struct ipv6_pinfo *np = inet6_sk(sk); 1347 1348 /* IPV6_MTU_DISCOVER */ 1349 np->pmtudisc = val; 1350 } 1351 #endif 1352 release_sock(sk); 1353 } 1354 1355 /* 1356 * Specifiy default interface for outgoing multicasts 1357 */ 1358 static int set_mcast_if(struct sock *sk, struct net_device *dev) 1359 { 1360 struct inet_sock *inet = inet_sk(sk); 1361 1362 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1363 return -EINVAL; 1364 1365 lock_sock(sk); 1366 inet->mc_index = dev->ifindex; 1367 /* inet->mc_addr = 0; */ 1368 #ifdef CONFIG_IP_VS_IPV6 1369 if (sk->sk_family == AF_INET6) { 1370 struct ipv6_pinfo *np = inet6_sk(sk); 1371 1372 /* IPV6_MULTICAST_IF */ 1373 np->mcast_oif = dev->ifindex; 1374 } 1375 #endif 1376 release_sock(sk); 1377 1378 return 0; 1379 } 1380 1381 1382 /* 1383 * Join a multicast group. 1384 * the group is specified by a class D multicast address 224.0.0.0/8 1385 * in the in_addr structure passed in as a parameter. 
1386 */ 1387 static int 1388 join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) 1389 { 1390 struct ip_mreqn mreq; 1391 int ret; 1392 1393 memset(&mreq, 0, sizeof(mreq)); 1394 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1395 1396 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1397 return -EINVAL; 1398 1399 mreq.imr_ifindex = dev->ifindex; 1400 1401 lock_sock(sk); 1402 ret = ip_mc_join_group(sk, &mreq); 1403 release_sock(sk); 1404 1405 return ret; 1406 } 1407 1408 #ifdef CONFIG_IP_VS_IPV6 1409 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1410 struct net_device *dev) 1411 { 1412 int ret; 1413 1414 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1415 return -EINVAL; 1416 1417 lock_sock(sk); 1418 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1419 release_sock(sk); 1420 1421 return ret; 1422 } 1423 #endif 1424 1425 static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) 1426 { 1427 __be32 addr; 1428 struct sockaddr_in sin; 1429 1430 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1431 if (!addr) 1432 pr_err("You probably need to specify IP address on " 1433 "multicast interface.\n"); 1434 1435 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1436 dev->name, &addr); 1437 1438 /* Now bind the socket with the address of multicast interface */ 1439 sin.sin_family = AF_INET; 1440 sin.sin_addr.s_addr = addr; 1441 sin.sin_port = 0; 1442 1443 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1444 } 1445 1446 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1447 struct ipvs_sync_daemon_cfg *c, int id) 1448 { 1449 if (AF_INET6 == c->mcast_af) { 1450 sa->in6 = (struct sockaddr_in6) { 1451 .sin6_family = AF_INET6, 1452 .sin6_port = htons(c->mcast_port + id), 1453 }; 1454 sa->in6.sin6_addr = c->mcast_group.in6; 1455 *salen = sizeof(sa->in6); 1456 } else { 1457 sa->in = (struct sockaddr_in) { 1458 .sin_family = AF_INET, 1459 
.sin_port = htons(c->mcast_port + id), 1460 }; 1461 sa->in.sin_addr = c->mcast_group.in; 1462 *salen = sizeof(sa->in); 1463 } 1464 } 1465 1466 /* 1467 * Set up sending multicast socket over UDP 1468 */ 1469 static int make_send_sock(struct netns_ipvs *ipvs, int id, 1470 struct net_device *dev, struct socket **sock_ret) 1471 { 1472 /* multicast addr */ 1473 union ipvs_sockaddr mcast_addr; 1474 struct socket *sock; 1475 int result, salen; 1476 1477 /* First create a socket */ 1478 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1479 IPPROTO_UDP, &sock); 1480 if (result < 0) { 1481 pr_err("Error during creation of socket; terminating\n"); 1482 goto error; 1483 } 1484 *sock_ret = sock; 1485 result = set_mcast_if(sock->sk, dev); 1486 if (result < 0) { 1487 pr_err("Error setting outbound mcast interface\n"); 1488 goto error; 1489 } 1490 1491 set_mcast_loop(sock->sk, 0); 1492 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1493 /* Allow fragmentation if MTU changes */ 1494 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1495 result = sysctl_sync_sock_size(ipvs); 1496 if (result > 0) 1497 set_sock_size(sock->sk, 1, result); 1498 1499 if (AF_INET == ipvs->mcfg.mcast_af) 1500 result = bind_mcastif_addr(sock, dev); 1501 else 1502 result = 0; 1503 if (result < 0) { 1504 pr_err("Error binding address of the mcast interface\n"); 1505 goto error; 1506 } 1507 1508 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1509 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1510 salen, 0); 1511 if (result < 0) { 1512 pr_err("Error connecting to the multicast addr\n"); 1513 goto error; 1514 } 1515 1516 return 0; 1517 1518 error: 1519 return result; 1520 } 1521 1522 1523 /* 1524 * Set up receiving multicast socket over UDP 1525 */ 1526 static int make_receive_sock(struct netns_ipvs *ipvs, int id, 1527 struct net_device *dev, struct socket **sock_ret) 1528 { 1529 /* multicast addr */ 1530 union ipvs_sockaddr mcast_addr; 1531 struct socket *sock; 
1532 int result, salen; 1533 1534 /* First create a socket */ 1535 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1536 IPPROTO_UDP, &sock); 1537 if (result < 0) { 1538 pr_err("Error during creation of socket; terminating\n"); 1539 goto error; 1540 } 1541 *sock_ret = sock; 1542 /* it is equivalent to the REUSEADDR option in user-space */ 1543 sock->sk->sk_reuse = SK_CAN_REUSE; 1544 result = sysctl_sync_sock_size(ipvs); 1545 if (result > 0) 1546 set_sock_size(sock->sk, 0, result); 1547 1548 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1549 sock->sk->sk_bound_dev_if = dev->ifindex; 1550 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1551 if (result < 0) { 1552 pr_err("Error binding to the multicast addr\n"); 1553 goto error; 1554 } 1555 1556 /* join the multicast group */ 1557 #ifdef CONFIG_IP_VS_IPV6 1558 if (ipvs->bcfg.mcast_af == AF_INET6) 1559 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1560 dev); 1561 else 1562 #endif 1563 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1564 dev); 1565 if (result < 0) { 1566 pr_err("Error joining to the multicast group\n"); 1567 goto error; 1568 } 1569 1570 return 0; 1571 1572 error: 1573 return result; 1574 } 1575 1576 1577 static int 1578 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1579 { 1580 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1581 struct kvec iov; 1582 int len; 1583 1584 EnterFunction(7); 1585 iov.iov_base = (void *)buffer; 1586 iov.iov_len = length; 1587 1588 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1589 1590 LeaveFunction(7); 1591 return len; 1592 } 1593 1594 static int 1595 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1596 { 1597 int msize; 1598 int ret; 1599 1600 msize = ntohs(msg->size); 1601 1602 ret = ip_vs_send_async(sock, (char *)msg, msize); 1603 if (ret >= 0 || ret == -EAGAIN) 1604 return ret; 1605 
pr_err("ip_vs_send_async error %d\n", ret); 1606 return 0; 1607 } 1608 1609 static int 1610 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1611 { 1612 struct msghdr msg = {NULL,}; 1613 struct kvec iov = {buffer, buflen}; 1614 int len; 1615 1616 EnterFunction(7); 1617 1618 /* Receive a packet */ 1619 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); 1620 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); 1621 if (len < 0) 1622 return len; 1623 1624 LeaveFunction(7); 1625 return len; 1626 } 1627 1628 /* Wakeup the master thread for sending */ 1629 static void master_wakeup_work_handler(struct work_struct *work) 1630 { 1631 struct ipvs_master_sync_state *ms = 1632 container_of(work, struct ipvs_master_sync_state, 1633 master_wakeup_work.work); 1634 struct netns_ipvs *ipvs = ms->ipvs; 1635 1636 spin_lock_bh(&ipvs->sync_lock); 1637 if (ms->sync_queue_len && 1638 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1639 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1640 wake_up_process(ms->master_thread); 1641 } 1642 spin_unlock_bh(&ipvs->sync_lock); 1643 } 1644 1645 /* Get next buffer to send */ 1646 static inline struct ip_vs_sync_buff * 1647 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1648 { 1649 struct ip_vs_sync_buff *sb; 1650 1651 sb = sb_dequeue(ipvs, ms); 1652 if (sb) 1653 return sb; 1654 /* Do not delay entries in buffer for more than 2 seconds */ 1655 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1656 } 1657 1658 static int sync_thread_master(void *data) 1659 { 1660 struct ip_vs_sync_thread_data *tinfo = data; 1661 struct netns_ipvs *ipvs = tinfo->ipvs; 1662 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1663 struct sock *sk = tinfo->sock->sk; 1664 struct ip_vs_sync_buff *sb; 1665 1666 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1667 "syncid = %d, id = %d\n", 1668 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1669 1670 for (;;) { 1671 sb = next_sync_buff(ipvs, ms); 
1672 if (unlikely(kthread_should_stop())) 1673 break; 1674 if (!sb) { 1675 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1676 continue; 1677 } 1678 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1679 /* (Ab)use interruptible sleep to avoid increasing 1680 * the load avg. 1681 */ 1682 __wait_event_interruptible(*sk_sleep(sk), 1683 sock_writeable(sk) || 1684 kthread_should_stop()); 1685 if (unlikely(kthread_should_stop())) 1686 goto done; 1687 } 1688 ip_vs_sync_buff_release(sb); 1689 } 1690 1691 done: 1692 __set_current_state(TASK_RUNNING); 1693 if (sb) 1694 ip_vs_sync_buff_release(sb); 1695 1696 /* clean up the sync_buff queue */ 1697 while ((sb = sb_dequeue(ipvs, ms))) 1698 ip_vs_sync_buff_release(sb); 1699 __set_current_state(TASK_RUNNING); 1700 1701 /* clean up the current sync_buff */ 1702 sb = get_curr_sync_buff(ipvs, ms, 0); 1703 if (sb) 1704 ip_vs_sync_buff_release(sb); 1705 1706 /* release the sending multicast socket */ 1707 sock_release(tinfo->sock); 1708 kfree(tinfo); 1709 1710 return 0; 1711 } 1712 1713 1714 static int sync_thread_backup(void *data) 1715 { 1716 struct ip_vs_sync_thread_data *tinfo = data; 1717 struct netns_ipvs *ipvs = tinfo->ipvs; 1718 int len; 1719 1720 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1721 "syncid = %d, id = %d\n", 1722 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1723 1724 while (!kthread_should_stop()) { 1725 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 1726 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) 1727 || kthread_should_stop()); 1728 1729 /* do we have data now? 
*/ 1730 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1731 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1732 ipvs->bcfg.sync_maxlen); 1733 if (len <= 0) { 1734 if (len != -EAGAIN) 1735 pr_err("receiving message error\n"); 1736 break; 1737 } 1738 1739 ip_vs_process_message(ipvs, tinfo->buf, len); 1740 } 1741 } 1742 1743 /* release the sending multicast socket */ 1744 sock_release(tinfo->sock); 1745 kfree(tinfo->buf); 1746 kfree(tinfo); 1747 1748 return 0; 1749 } 1750 1751 1752 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1753 int state) 1754 { 1755 struct ip_vs_sync_thread_data *tinfo = NULL; 1756 struct task_struct **array = NULL, *task; 1757 struct net_device *dev; 1758 char *name; 1759 int (*threadfn)(void *data); 1760 int id = 0, count, hlen; 1761 int result = -ENOMEM; 1762 u16 mtu, min_mtu; 1763 1764 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1765 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1766 sizeof(struct ip_vs_sync_conn_v0)); 1767 1768 /* Do not hold one mutex and then to block on another */ 1769 for (;;) { 1770 rtnl_lock(); 1771 if (mutex_trylock(&ipvs->sync_mutex)) 1772 break; 1773 rtnl_unlock(); 1774 mutex_lock(&ipvs->sync_mutex); 1775 if (rtnl_trylock()) 1776 break; 1777 mutex_unlock(&ipvs->sync_mutex); 1778 } 1779 1780 if (!ipvs->sync_state) { 1781 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1782 ipvs->threads_mask = count - 1; 1783 } else 1784 count = ipvs->threads_mask + 1; 1785 1786 if (c->mcast_af == AF_UNSPEC) { 1787 c->mcast_af = AF_INET; 1788 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1789 } 1790 if (!c->mcast_port) 1791 c->mcast_port = IP_VS_SYNC_PORT; 1792 if (!c->mcast_ttl) 1793 c->mcast_ttl = 1; 1794 1795 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1796 if (!dev) { 1797 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1798 result = -ENODEV; 1799 goto out_early; 1800 } 1801 hlen = (AF_INET6 == c->mcast_af) ? 
1802 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1803 sizeof(struct iphdr) + sizeof(struct udphdr); 1804 mtu = (state == IP_VS_STATE_BACKUP) ? 1805 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1806 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1807 1808 if (c->sync_maxlen) 1809 c->sync_maxlen = clamp_t(unsigned int, 1810 c->sync_maxlen, min_mtu, 1811 65535 - hlen); 1812 else 1813 c->sync_maxlen = mtu - hlen; 1814 1815 if (state == IP_VS_STATE_MASTER) { 1816 result = -EEXIST; 1817 if (ipvs->ms) 1818 goto out_early; 1819 1820 ipvs->mcfg = *c; 1821 name = "ipvs-m:%d:%d"; 1822 threadfn = sync_thread_master; 1823 } else if (state == IP_VS_STATE_BACKUP) { 1824 result = -EEXIST; 1825 if (ipvs->backup_threads) 1826 goto out_early; 1827 1828 ipvs->bcfg = *c; 1829 name = "ipvs-b:%d:%d"; 1830 threadfn = sync_thread_backup; 1831 } else { 1832 result = -EINVAL; 1833 goto out_early; 1834 } 1835 1836 if (state == IP_VS_STATE_MASTER) { 1837 struct ipvs_master_sync_state *ms; 1838 1839 result = -ENOMEM; 1840 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1841 if (!ipvs->ms) 1842 goto out; 1843 ms = ipvs->ms; 1844 for (id = 0; id < count; id++, ms++) { 1845 INIT_LIST_HEAD(&ms->sync_queue); 1846 ms->sync_queue_len = 0; 1847 ms->sync_queue_delay = 0; 1848 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1849 master_wakeup_work_handler); 1850 ms->ipvs = ipvs; 1851 } 1852 } else { 1853 array = kcalloc(count, sizeof(struct task_struct *), 1854 GFP_KERNEL); 1855 result = -ENOMEM; 1856 if (!array) 1857 goto out; 1858 } 1859 1860 for (id = 0; id < count; id++) { 1861 result = -ENOMEM; 1862 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 1863 if (!tinfo) 1864 goto out; 1865 tinfo->ipvs = ipvs; 1866 tinfo->sock = NULL; 1867 if (state == IP_VS_STATE_BACKUP) { 1868 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1869 GFP_KERNEL); 1870 if (!tinfo->buf) 1871 goto out; 1872 } else { 1873 tinfo->buf = NULL; 1874 } 1875 tinfo->id = id; 1876 if (state == IP_VS_STATE_MASTER) 1877 result = 
make_send_sock(ipvs, id, dev, &tinfo->sock); 1878 else 1879 result = make_receive_sock(ipvs, id, dev, &tinfo->sock); 1880 if (result < 0) 1881 goto out; 1882 1883 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1884 if (IS_ERR(task)) { 1885 result = PTR_ERR(task); 1886 goto out; 1887 } 1888 tinfo = NULL; 1889 if (state == IP_VS_STATE_MASTER) 1890 ipvs->ms[id].master_thread = task; 1891 else 1892 array[id] = task; 1893 } 1894 1895 /* mark as active */ 1896 1897 if (state == IP_VS_STATE_BACKUP) 1898 ipvs->backup_threads = array; 1899 spin_lock_bh(&ipvs->sync_buff_lock); 1900 ipvs->sync_state |= state; 1901 spin_unlock_bh(&ipvs->sync_buff_lock); 1902 1903 mutex_unlock(&ipvs->sync_mutex); 1904 rtnl_unlock(); 1905 1906 /* increase the module use count */ 1907 ip_vs_use_count_inc(); 1908 1909 return 0; 1910 1911 out: 1912 /* We do not need RTNL lock anymore, release it here so that 1913 * sock_release below and in the kthreads can use rtnl_lock 1914 * to leave the mcast group. 1915 */ 1916 rtnl_unlock(); 1917 count = id; 1918 while (count-- > 0) { 1919 if (state == IP_VS_STATE_MASTER) 1920 kthread_stop(ipvs->ms[count].master_thread); 1921 else 1922 kthread_stop(array[count]); 1923 } 1924 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1925 kfree(ipvs->ms); 1926 ipvs->ms = NULL; 1927 } 1928 mutex_unlock(&ipvs->sync_mutex); 1929 if (tinfo) { 1930 if (tinfo->sock) 1931 sock_release(tinfo->sock); 1932 kfree(tinfo->buf); 1933 kfree(tinfo); 1934 } 1935 kfree(array); 1936 return result; 1937 1938 out_early: 1939 mutex_unlock(&ipvs->sync_mutex); 1940 rtnl_unlock(); 1941 return result; 1942 } 1943 1944 1945 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1946 { 1947 struct task_struct **array; 1948 int id; 1949 int retc = -EINVAL; 1950 1951 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1952 1953 if (state == IP_VS_STATE_MASTER) { 1954 if (!ipvs->ms) 1955 return -ESRCH; 1956 1957 /* 1958 * The lock synchronizes with sb_queue_tail(), so that we 
don't 1959 * add sync buffers to the queue, when we are already in 1960 * progress of stopping the master sync daemon. 1961 */ 1962 1963 spin_lock_bh(&ipvs->sync_buff_lock); 1964 spin_lock(&ipvs->sync_lock); 1965 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1966 spin_unlock(&ipvs->sync_lock); 1967 spin_unlock_bh(&ipvs->sync_buff_lock); 1968 1969 retc = 0; 1970 for (id = ipvs->threads_mask; id >= 0; id--) { 1971 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1972 int ret; 1973 1974 pr_info("stopping master sync thread %d ...\n", 1975 task_pid_nr(ms->master_thread)); 1976 cancel_delayed_work_sync(&ms->master_wakeup_work); 1977 ret = kthread_stop(ms->master_thread); 1978 if (retc >= 0) 1979 retc = ret; 1980 } 1981 kfree(ipvs->ms); 1982 ipvs->ms = NULL; 1983 } else if (state == IP_VS_STATE_BACKUP) { 1984 if (!ipvs->backup_threads) 1985 return -ESRCH; 1986 1987 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1988 array = ipvs->backup_threads; 1989 retc = 0; 1990 for (id = ipvs->threads_mask; id >= 0; id--) { 1991 int ret; 1992 1993 pr_info("stopping backup sync thread %d ...\n", 1994 task_pid_nr(array[id])); 1995 ret = kthread_stop(array[id]); 1996 if (retc >= 0) 1997 retc = ret; 1998 } 1999 kfree(array); 2000 ipvs->backup_threads = NULL; 2001 } 2002 2003 /* decrease the module use count */ 2004 ip_vs_use_count_dec(); 2005 2006 return retc; 2007 } 2008 2009 /* 2010 * Initialize data struct for each netns 2011 */ 2012 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2013 { 2014 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 2015 spin_lock_init(&ipvs->sync_lock); 2016 spin_lock_init(&ipvs->sync_buff_lock); 2017 return 0; 2018 } 2019 2020 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2021 { 2022 int retc; 2023 2024 mutex_lock(&ipvs->sync_mutex); 2025 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2026 if (retc && retc != -ESRCH) 2027 pr_err("Failed to stop Master Daemon\n"); 2028 2029 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 
2030 if (retc && retc != -ESRCH) 2031 pr_err("Failed to stop Backup Daemon\n"); 2032 mutex_unlock(&ipvs->sync_mutex); 2033 } 2034