// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
33 */ 34 35 #define KMSG_COMPONENT "IPVS" 36 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 37 38 #include <linux/module.h> 39 #include <linux/slab.h> 40 #include <linux/inetdevice.h> 41 #include <linux/net.h> 42 #include <linux/completion.h> 43 #include <linux/delay.h> 44 #include <linux/skbuff.h> 45 #include <linux/in.h> 46 #include <linux/igmp.h> /* for ip_mc_join_group */ 47 #include <linux/udp.h> 48 #include <linux/err.h> 49 #include <linux/kthread.h> 50 #include <linux/wait.h> 51 #include <linux/kernel.h> 52 #include <linux/sched/signal.h> 53 54 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ 55 56 #include <net/ip.h> 57 #include <net/sock.h> 58 59 #include <net/ip_vs.h> 60 61 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 62 #define IP_VS_SYNC_PORT 8848 /* multicast port */ 63 64 #define SYNC_PROTO_VER 1 /* Protocol version in header */ 65 66 static struct lock_class_key __ipvs_sync_key; 67 /* 68 * IPVS sync connection entry 69 * Version 0, i.e. original version. 70 */ 71 struct ip_vs_sync_conn_v0 { 72 __u8 reserved; 73 74 /* Protocol, addresses and port numbers */ 75 __u8 protocol; /* Which protocol (TCP/UDP) */ 76 __be16 cport; 77 __be16 vport; 78 __be16 dport; 79 __be32 caddr; /* client address */ 80 __be32 vaddr; /* virtual address */ 81 __be32 daddr; /* destination address */ 82 83 /* Flags and state transition */ 84 __be16 flags; /* status flags */ 85 __be16 state; /* state info */ 86 87 /* The sequence options start here */ 88 }; 89 90 struct ip_vs_sync_conn_options { 91 struct ip_vs_seq in_seq; /* incoming seq. struct */ 92 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 93 }; 94 95 /* 96 Sync Connection format (sync_conn) 97 98 0 1 2 3 99 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 100 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101 | Type | Protocol | Ver. 
| Size | 102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 103 | Flags | 104 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 105 | State | cport | 106 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 107 | vport | dport | 108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 109 | fwmark | 110 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 111 | timeout (in sec.) | 112 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 113 | ... | 114 | IP-Addresses (v4 or v6) | 115 | ... | 116 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 117 Optional Parameters. 118 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 119 | Param. Type | Param. Length | Param. data | 120 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 121 | ... | 122 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 123 | | Param Type | Param. Length | 124 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 125 | Param data | 126 | Last Param data should be padded for 32 bit alignment | 127 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 128 */ 129 130 /* 131 * Type 0, IPv4 sync connection format 132 */ 133 struct ip_vs_sync_v4 { 134 __u8 type; 135 __u8 protocol; /* Which protocol (TCP/UDP) */ 136 __be16 ver_size; /* Version msb 4 bits */ 137 /* Flags and state transition */ 138 __be32 flags; /* status flags */ 139 __be16 state; /* state info */ 140 /* Protocol, addresses and port numbers */ 141 __be16 cport; 142 __be16 vport; 143 __be16 dport; 144 __be32 fwmark; /* Firewall mark from skb */ 145 __be32 timeout; /* cp timeout */ 146 __be32 caddr; /* client address */ 147 __be32 vaddr; /* virtual address */ 148 __be32 daddr; /* destination address */ 149 /* The sequence options start here */ 150 /* PE data padded to 32bit alignment after seq. 
options */ 151 }; 152 /* 153 * Type 2 messages IPv6 154 */ 155 struct ip_vs_sync_v6 { 156 __u8 type; 157 __u8 protocol; /* Which protocol (TCP/UDP) */ 158 __be16 ver_size; /* Version msb 4 bits */ 159 /* Flags and state transition */ 160 __be32 flags; /* status flags */ 161 __be16 state; /* state info */ 162 /* Protocol, addresses and port numbers */ 163 __be16 cport; 164 __be16 vport; 165 __be16 dport; 166 __be32 fwmark; /* Firewall mark from skb */ 167 __be32 timeout; /* cp timeout */ 168 struct in6_addr caddr; /* client address */ 169 struct in6_addr vaddr; /* virtual address */ 170 struct in6_addr daddr; /* destination address */ 171 /* The sequence options start here */ 172 /* PE data padded to 32bit alignment after seq. options */ 173 }; 174 175 union ip_vs_sync_conn { 176 struct ip_vs_sync_v4 v4; 177 struct ip_vs_sync_v6 v6; 178 }; 179 180 /* Bits in Type field in above */ 181 #define STYPE_INET6 0 182 #define STYPE_F_INET6 (1 << STYPE_INET6) 183 184 #define SVER_SHIFT 12 /* Shift to get version */ 185 #define SVER_MASK 0x0fff /* Mask to strip version */ 186 187 #define IPVS_OPT_SEQ_DATA 1 188 #define IPVS_OPT_PE_DATA 2 189 #define IPVS_OPT_PE_NAME 3 190 #define IPVS_OPT_PARAM 7 191 192 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) 193 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) 194 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) 195 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) 196 197 struct ip_vs_sync_thread_data { 198 struct task_struct *task; 199 struct netns_ipvs *ipvs; 200 struct socket *sock; 201 char *buf; 202 int id; 203 }; 204 205 /* Version 0 definition of packet sizes */ 206 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) 207 #define FULL_CONN_SIZE \ 208 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) 209 210 211 /* 212 The master mulitcasts messages (Datagrams) to the backup load balancers 213 in the following format. 
214 215 Version 1: 216 Note, first byte should be Zero, so ver 0 receivers will drop the packet. 217 218 0 1 2 3 219 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 220 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 221 | 0 | SyncID | Size | 222 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 223 | Count Conns | Version | Reserved, set to Zero | 224 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 225 | | 226 | IPVS Sync Connection (1) | 227 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 228 | . | 229 ~ . ~ 230 | . | 231 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 232 | | 233 | IPVS Sync Connection (n) | 234 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 235 236 Version 0 Header 237 0 1 2 3 238 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 239 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 240 | Count Conns | SyncID | Size | 241 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 242 | IPVS Sync Connection (1) | 243 */ 244 245 /* Version 0 header */ 246 struct ip_vs_sync_mesg_v0 { 247 __u8 nr_conns; 248 __u8 syncid; 249 __be16 size; 250 251 /* ip_vs_sync_conn entries start here */ 252 }; 253 254 /* Version 1 header */ 255 struct ip_vs_sync_mesg { 256 __u8 reserved; /* must be zero */ 257 __u8 syncid; 258 __be16 size; 259 __u8 nr_conns; 260 __s8 version; /* SYNC_PROTO_VER */ 261 __u16 spare; 262 /* ip_vs_sync_conn entries start here */ 263 }; 264 265 union ipvs_sockaddr { 266 struct sockaddr_in in; 267 struct sockaddr_in6 in6; 268 }; 269 270 struct ip_vs_sync_buff { 271 struct list_head list; 272 unsigned long firstuse; 273 274 /* pointers for the message data */ 275 struct ip_vs_sync_mesg *mesg; 276 unsigned char *head; 277 unsigned char *end; 278 }; 279 280 /* 281 * Copy of struct ip_vs_seq 282 * From unaligned network order to aligned host order 283 */ 284 static void ntoh_seq(struct 
ip_vs_seq *no, struct ip_vs_seq *ho) 285 { 286 memset(ho, 0, sizeof(*ho)); 287 ho->init_seq = get_unaligned_be32(&no->init_seq); 288 ho->delta = get_unaligned_be32(&no->delta); 289 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 290 } 291 292 /* 293 * Copy of struct ip_vs_seq 294 * From Aligned host order to unaligned network order 295 */ 296 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 297 { 298 put_unaligned_be32(ho->init_seq, &no->init_seq); 299 put_unaligned_be32(ho->delta, &no->delta); 300 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 301 } 302 303 static inline struct ip_vs_sync_buff * 304 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 305 { 306 struct ip_vs_sync_buff *sb; 307 308 spin_lock_bh(&ipvs->sync_lock); 309 if (list_empty(&ms->sync_queue)) { 310 sb = NULL; 311 __set_current_state(TASK_INTERRUPTIBLE); 312 } else { 313 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 314 list); 315 list_del(&sb->list); 316 ms->sync_queue_len--; 317 if (!ms->sync_queue_len) 318 ms->sync_queue_delay = 0; 319 } 320 spin_unlock_bh(&ipvs->sync_lock); 321 322 return sb; 323 } 324 325 /* 326 * Create a new sync buffer for Version 1 proto. 327 */ 328 static inline struct ip_vs_sync_buff * 329 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 330 { 331 struct ip_vs_sync_buff *sb; 332 333 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 334 return NULL; 335 336 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 337 ipvs->mcfg.sync_maxlen); 338 sb->mesg = kmalloc(len, GFP_ATOMIC); 339 if (!sb->mesg) { 340 kfree(sb); 341 return NULL; 342 } 343 sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 344 sb->mesg->version = SYNC_PROTO_VER; 345 sb->mesg->syncid = ipvs->mcfg.syncid; 346 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 347 sb->mesg->nr_conns = 0; 348 sb->mesg->spare = 0; 349 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 350 sb->end = (unsigned char *)sb->mesg + len; 351 352 sb->firstuse = jiffies; 353 return sb; 354 } 355 356 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 357 { 358 kfree(sb->mesg); 359 kfree(sb); 360 } 361 362 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 363 struct ipvs_master_sync_state *ms) 364 { 365 struct ip_vs_sync_buff *sb = ms->sync_buff; 366 367 spin_lock(&ipvs->sync_lock); 368 if (ipvs->sync_state & IP_VS_STATE_MASTER && 369 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 370 if (!ms->sync_queue_len) 371 schedule_delayed_work(&ms->master_wakeup_work, 372 max(IPVS_SYNC_SEND_DELAY, 1)); 373 ms->sync_queue_len++; 374 list_add_tail(&sb->list, &ms->sync_queue); 375 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { 376 int id = (int)(ms - ipvs->ms); 377 378 wake_up_process(ipvs->master_tinfo[id].task); 379 } 380 } else 381 ip_vs_sync_buff_release(sb); 382 spin_unlock(&ipvs->sync_lock); 383 } 384 385 /* 386 * Get the current sync buffer if it has been created for more 387 * than the specified time or the specified time is zero. 
388 */ 389 static inline struct ip_vs_sync_buff * 390 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 391 unsigned long time) 392 { 393 struct ip_vs_sync_buff *sb; 394 395 spin_lock_bh(&ipvs->sync_buff_lock); 396 sb = ms->sync_buff; 397 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 398 ms->sync_buff = NULL; 399 __set_current_state(TASK_RUNNING); 400 } else 401 sb = NULL; 402 spin_unlock_bh(&ipvs->sync_buff_lock); 403 return sb; 404 } 405 406 static inline int 407 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 408 { 409 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 410 } 411 412 /* 413 * Create a new sync buffer for Version 0 proto. 414 */ 415 static inline struct ip_vs_sync_buff * 416 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 417 { 418 struct ip_vs_sync_buff *sb; 419 struct ip_vs_sync_mesg_v0 *mesg; 420 421 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 422 return NULL; 423 424 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 425 ipvs->mcfg.sync_maxlen); 426 sb->mesg = kmalloc(len, GFP_ATOMIC); 427 if (!sb->mesg) { 428 kfree(sb); 429 return NULL; 430 } 431 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 432 mesg->nr_conns = 0; 433 mesg->syncid = ipvs->mcfg.syncid; 434 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 435 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 436 sb->end = (unsigned char *)mesg + len; 437 sb->firstuse = jiffies; 438 return sb; 439 } 440 441 /* Check if connection is controlled by persistence */ 442 static inline bool in_persistence(struct ip_vs_conn *cp) 443 { 444 for (cp = cp->control; cp; cp = cp->control) { 445 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 446 return true; 447 } 448 return false; 449 } 450 451 /* Check if conn should be synced. 452 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 453 * - (1) sync_refresh_period: reduce sync rate. 
Additionally, retry 454 * sync_retries times with period of sync_refresh_period/8 455 * - (2) if both sync_refresh_period and sync_period are 0 send sync only 456 * for state changes or only once when pkts matches sync_threshold 457 * - (3) templates: rate can be reduced only with sync_refresh_period or 458 * with (2) 459 */ 460 static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, 461 struct ip_vs_conn *cp, int pkts) 462 { 463 unsigned long orig = READ_ONCE(cp->sync_endtime); 464 unsigned long now = jiffies; 465 unsigned long n = (now + cp->timeout) & ~3UL; 466 unsigned int sync_refresh_period; 467 int sync_period; 468 int force; 469 470 /* Check if we sync in current state */ 471 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) 472 force = 0; 473 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) 474 return 0; 475 else if (likely(cp->protocol == IPPROTO_TCP)) { 476 if (!((1 << cp->state) & 477 ((1 << IP_VS_TCP_S_ESTABLISHED) | 478 (1 << IP_VS_TCP_S_FIN_WAIT) | 479 (1 << IP_VS_TCP_S_CLOSE) | 480 (1 << IP_VS_TCP_S_CLOSE_WAIT) | 481 (1 << IP_VS_TCP_S_TIME_WAIT)))) 482 return 0; 483 force = cp->state != cp->old_state; 484 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) 485 goto set; 486 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { 487 if (!((1 << cp->state) & 488 ((1 << IP_VS_SCTP_S_ESTABLISHED) | 489 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | 490 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | 491 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | 492 (1 << IP_VS_SCTP_S_CLOSED)))) 493 return 0; 494 force = cp->state != cp->old_state; 495 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) 496 goto set; 497 } else { 498 /* UDP or another protocol with single state */ 499 force = 0; 500 } 501 502 sync_refresh_period = sysctl_sync_refresh_period(ipvs); 503 if (sync_refresh_period > 0) { 504 long diff = n - orig; 505 long min_diff = max(cp->timeout >> 1, 10UL * HZ); 506 507 /* Avoid sync if difference is below sync_refresh_period 508 * and below the half 
timeout. 509 */ 510 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { 511 int retries = orig & 3; 512 513 if (retries >= sysctl_sync_retries(ipvs)) 514 return 0; 515 if (time_before(now, orig - cp->timeout + 516 (sync_refresh_period >> 3))) 517 return 0; 518 n |= retries + 1; 519 } 520 } 521 sync_period = sysctl_sync_period(ipvs); 522 if (sync_period > 0) { 523 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && 524 pkts % sync_period != sysctl_sync_threshold(ipvs)) 525 return 0; 526 } else if (!sync_refresh_period && 527 pkts != sysctl_sync_threshold(ipvs)) 528 return 0; 529 530 set: 531 cp->old_state = cp->state; 532 n = cmpxchg(&cp->sync_endtime, orig, n); 533 return n == orig || force; 534 } 535 536 /* 537 * Version 0 , could be switched in by sys_ctl. 538 * Add an ip_vs_conn information into the current sync_buff. 539 */ 540 static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, 541 int pkts) 542 { 543 struct ip_vs_sync_mesg_v0 *m; 544 struct ip_vs_sync_conn_v0 *s; 545 struct ip_vs_sync_buff *buff; 546 struct ipvs_master_sync_state *ms; 547 int id; 548 unsigned int len; 549 550 if (unlikely(cp->af != AF_INET)) 551 return; 552 /* Do not sync ONE PACKET */ 553 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 554 return; 555 556 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 557 return; 558 559 spin_lock_bh(&ipvs->sync_buff_lock); 560 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 561 spin_unlock_bh(&ipvs->sync_buff_lock); 562 return; 563 } 564 565 id = select_master_thread_id(ipvs, cp); 566 ms = &ipvs->ms[id]; 567 buff = ms->sync_buff; 568 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : 569 SIMPLE_CONN_SIZE; 570 if (buff) { 571 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 572 /* Send buffer if it is for v1 */ 573 if (buff->head + len > buff->end || !m->nr_conns) { 574 sb_queue_tail(ipvs, ms); 575 ms->sync_buff = NULL; 576 buff = NULL; 577 } 578 } 579 if (!buff) { 580 buff = ip_vs_sync_buff_create_v0(ipvs, len); 581 if (!buff) { 582 spin_unlock_bh(&ipvs->sync_buff_lock); 583 pr_err("ip_vs_sync_buff_create failed.\n"); 584 return; 585 } 586 ms->sync_buff = buff; 587 } 588 589 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 590 s = (struct ip_vs_sync_conn_v0 *) buff->head; 591 592 /* copy members */ 593 s->reserved = 0; 594 s->protocol = cp->protocol; 595 s->cport = cp->cport; 596 s->vport = cp->vport; 597 s->dport = cp->dport; 598 s->caddr = cp->caddr.ip; 599 s->vaddr = cp->vaddr.ip; 600 s->daddr = cp->daddr.ip; 601 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 602 s->state = htons(cp->state); 603 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 604 struct ip_vs_sync_conn_options *opt = 605 (struct ip_vs_sync_conn_options *)&s[1]; 606 memcpy(opt, &cp->in_seq, sizeof(*opt)); 607 } 608 609 m->nr_conns++; 610 m->size = htons(ntohs(m->size) + len); 611 buff->head += len; 612 spin_unlock_bh(&ipvs->sync_buff_lock); 613 614 /* synchronize its controller if it has */ 615 cp = cp->control; 616 if (cp) { 617 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 618 pkts = atomic_inc_return(&cp->in_pkts); 619 else 620 pkts = sysctl_sync_threshold(ipvs); 621 ip_vs_sync_conn(ipvs, cp, pkts); 622 } 623 } 624 625 /* 626 * Add an ip_vs_conn information into the current sync_buff. 627 * Called by ip_vs_in. 
628 * Sending Version 1 messages 629 */ 630 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 631 { 632 struct ip_vs_sync_mesg *m; 633 union ip_vs_sync_conn *s; 634 struct ip_vs_sync_buff *buff; 635 struct ipvs_master_sync_state *ms; 636 int id; 637 __u8 *p; 638 unsigned int len, pe_name_len, pad; 639 640 /* Handle old version of the protocol */ 641 if (sysctl_sync_ver(ipvs) == 0) { 642 ip_vs_sync_conn_v0(ipvs, cp, pkts); 643 return; 644 } 645 /* Do not sync ONE PACKET */ 646 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 647 goto control; 648 sloop: 649 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 650 goto control; 651 652 /* Sanity checks */ 653 pe_name_len = 0; 654 if (cp->pe_data_len) { 655 if (!cp->pe_data || !cp->dest) { 656 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 657 return; 658 } 659 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 660 } 661 662 spin_lock_bh(&ipvs->sync_buff_lock); 663 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 664 spin_unlock_bh(&ipvs->sync_buff_lock); 665 return; 666 } 667 668 id = select_master_thread_id(ipvs, cp); 669 ms = &ipvs->ms[id]; 670 671 #ifdef CONFIG_IP_VS_IPV6 672 if (cp->af == AF_INET6) 673 len = sizeof(struct ip_vs_sync_v6); 674 else 675 #endif 676 len = sizeof(struct ip_vs_sync_v4); 677 678 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 679 len += sizeof(struct ip_vs_sync_conn_options) + 2; 680 681 if (cp->pe_data_len) 682 len += cp->pe_data_len + 2; /* + Param hdr field */ 683 if (pe_name_len) 684 len += pe_name_len + 2; 685 686 /* check if there is a space for this one */ 687 pad = 0; 688 buff = ms->sync_buff; 689 if (buff) { 690 m = buff->mesg; 691 pad = (4 - (size_t) buff->head) & 3; 692 /* Send buffer if it is for v0 */ 693 if (buff->head + len + pad > buff->end || m->reserved) { 694 sb_queue_tail(ipvs, ms); 695 ms->sync_buff = NULL; 696 buff = NULL; 697 pad = 0; 698 } 699 } 700 701 if (!buff) { 702 buff = ip_vs_sync_buff_create(ipvs, len); 703 if (!buff) { 704 
spin_unlock_bh(&ipvs->sync_buff_lock); 705 pr_err("ip_vs_sync_buff_create failed.\n"); 706 return; 707 } 708 ms->sync_buff = buff; 709 m = buff->mesg; 710 } 711 712 p = buff->head; 713 buff->head += pad + len; 714 m->size = htons(ntohs(m->size) + pad + len); 715 /* Add ev. padding from prev. sync_conn */ 716 while (pad--) 717 *(p++) = 0; 718 719 s = (union ip_vs_sync_conn *)p; 720 721 /* Set message type & copy members */ 722 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 723 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 724 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 725 s->v4.state = htons(cp->state); 726 s->v4.protocol = cp->protocol; 727 s->v4.cport = cp->cport; 728 s->v4.vport = cp->vport; 729 s->v4.dport = cp->dport; 730 s->v4.fwmark = htonl(cp->fwmark); 731 s->v4.timeout = htonl(cp->timeout / HZ); 732 m->nr_conns++; 733 734 #ifdef CONFIG_IP_VS_IPV6 735 if (cp->af == AF_INET6) { 736 p += sizeof(struct ip_vs_sync_v6); 737 s->v6.caddr = cp->caddr.in6; 738 s->v6.vaddr = cp->vaddr.in6; 739 s->v6.daddr = cp->daddr.in6; 740 } else 741 #endif 742 { 743 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 744 s->v4.caddr = cp->caddr.ip; 745 s->v4.vaddr = cp->vaddr.ip; 746 s->v4.daddr = cp->daddr.ip; 747 } 748 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 749 *(p++) = IPVS_OPT_SEQ_DATA; 750 *(p++) = sizeof(struct ip_vs_sync_conn_options); 751 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 752 p += sizeof(struct ip_vs_seq); 753 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 754 p += sizeof(struct ip_vs_seq); 755 } 756 /* Handle pe data */ 757 if (cp->pe_data_len && cp->pe_data) { 758 *(p++) = IPVS_OPT_PE_DATA; 759 *(p++) = cp->pe_data_len; 760 memcpy(p, cp->pe_data, cp->pe_data_len); 761 p += cp->pe_data_len; 762 if (pe_name_len) { 763 /* Add PE_NAME */ 764 *(p++) = IPVS_OPT_PE_NAME; 765 *(p++) = pe_name_len; 766 memcpy(p, cp->pe->name, pe_name_len); 767 p += pe_name_len; 768 } 769 } 770 771 spin_unlock_bh(&ipvs->sync_buff_lock); 772 773 
control: 774 /* synchronize its controller if it has */ 775 cp = cp->control; 776 if (!cp) 777 return; 778 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 779 pkts = atomic_inc_return(&cp->in_pkts); 780 else 781 pkts = sysctl_sync_threshold(ipvs); 782 goto sloop; 783 } 784 785 /* 786 * fill_param used by version 1 787 */ 788 static inline int 789 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 790 struct ip_vs_conn_param *p, 791 __u8 *pe_data, unsigned int pe_data_len, 792 __u8 *pe_name, unsigned int pe_name_len) 793 { 794 #ifdef CONFIG_IP_VS_IPV6 795 if (af == AF_INET6) 796 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 797 (const union nf_inet_addr *)&sc->v6.caddr, 798 sc->v6.cport, 799 (const union nf_inet_addr *)&sc->v6.vaddr, 800 sc->v6.vport, p); 801 else 802 #endif 803 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 804 (const union nf_inet_addr *)&sc->v4.caddr, 805 sc->v4.cport, 806 (const union nf_inet_addr *)&sc->v4.vaddr, 807 sc->v4.vport, p); 808 /* Handle pe data */ 809 if (pe_data_len) { 810 if (pe_name_len) { 811 char buff[IP_VS_PENAME_MAXLEN+1]; 812 813 memcpy(buff, pe_name, pe_name_len); 814 buff[pe_name_len]=0; 815 p->pe = __ip_vs_pe_getbyname(buff); 816 if (!p->pe) { 817 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 818 buff); 819 return 1; 820 } 821 } else { 822 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 823 return 1; 824 } 825 826 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 827 if (!p->pe_data) { 828 module_put(p->pe->module); 829 return -ENOMEM; 830 } 831 p->pe_data_len = pe_data_len; 832 } 833 return 0; 834 } 835 836 /* 837 * Connection Add / Update. 838 * Common for version 0 and 1 reception of backup sync_conns. 839 * Param: ... 840 * timeout is in sec. 
841 */ 842 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 843 unsigned int flags, unsigned int state, 844 unsigned int protocol, unsigned int type, 845 const union nf_inet_addr *daddr, __be16 dport, 846 unsigned long timeout, __u32 fwmark, 847 struct ip_vs_sync_conn_options *opt) 848 { 849 struct ip_vs_dest *dest; 850 struct ip_vs_conn *cp; 851 852 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 853 cp = ip_vs_conn_in_get(param); 854 if (cp && ((cp->dport != dport) || 855 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 856 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 857 ip_vs_conn_expire_now(cp); 858 __ip_vs_conn_put(cp); 859 cp = NULL; 860 } else { 861 /* This is the expiration message for the 862 * connection that was already replaced, so we 863 * just ignore it. 864 */ 865 __ip_vs_conn_put(cp); 866 kfree(param->pe_data); 867 return; 868 } 869 } 870 } else { 871 cp = ip_vs_ct_in_get(param); 872 } 873 874 if (cp) { 875 /* Free pe_data */ 876 kfree(param->pe_data); 877 878 dest = cp->dest; 879 spin_lock_bh(&cp->lock); 880 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 881 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 882 if (flags & IP_VS_CONN_F_INACTIVE) { 883 atomic_dec(&dest->activeconns); 884 atomic_inc(&dest->inactconns); 885 } else { 886 atomic_inc(&dest->activeconns); 887 atomic_dec(&dest->inactconns); 888 } 889 } 890 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 891 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 892 cp->flags = flags; 893 spin_unlock_bh(&cp->lock); 894 if (!dest) 895 ip_vs_try_bind_dest(cp); 896 } else { 897 /* 898 * Find the appropriate destination for the connection. 899 * If it is not found the connection will remain unbound 900 * but still handled. 901 */ 902 rcu_read_lock(); 903 /* This function is only invoked by the synchronization 904 * code. 
We do not currently support heterogeneous pools 905 * with synchronization, so we can make the assumption that 906 * the svc_af is the same as the dest_af 907 */ 908 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 909 param->vaddr, param->vport, protocol, 910 fwmark, flags); 911 912 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 913 fwmark); 914 rcu_read_unlock(); 915 if (!cp) { 916 kfree(param->pe_data); 917 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 918 return; 919 } 920 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 921 kfree(param->pe_data); 922 } 923 924 if (opt) { 925 cp->in_seq = opt->in_seq; 926 cp->out_seq = opt->out_seq; 927 } 928 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 929 cp->state = state; 930 cp->old_state = cp->state; 931 /* 932 * For Ver 0 messages style 933 * - Not possible to recover the right timeout for templates 934 * - can not find the right fwmark 935 * virtual service. If needed, we can do it for 936 * non-fwmark persistent services. 937 * Ver 1 messages style. 938 * - No problem. 
939 */ 940 if (timeout) { 941 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 942 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 943 cp->timeout = timeout*HZ; 944 } else { 945 struct ip_vs_proto_data *pd; 946 947 pd = ip_vs_proto_data_get(ipvs, protocol); 948 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 949 cp->timeout = pd->timeout_table[state]; 950 else 951 cp->timeout = (3*60*HZ); 952 } 953 ip_vs_conn_put(cp); 954 } 955 956 /* 957 * Process received multicast message for Version 0 958 */ 959 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 960 const size_t buflen) 961 { 962 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 963 struct ip_vs_sync_conn_v0 *s; 964 struct ip_vs_sync_conn_options *opt; 965 struct ip_vs_protocol *pp; 966 struct ip_vs_conn_param param; 967 char *p; 968 int i; 969 970 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 971 for (i=0; i<m->nr_conns; i++) { 972 unsigned int flags, state; 973 974 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 975 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 976 return; 977 } 978 s = (struct ip_vs_sync_conn_v0 *) p; 979 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 980 flags &= ~IP_VS_CONN_F_HASHED; 981 if (flags & IP_VS_CONN_F_SEQ_MASK) { 982 opt = (struct ip_vs_sync_conn_options *)&s[1]; 983 p += FULL_CONN_SIZE; 984 if (p > buffer+buflen) { 985 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 986 return; 987 } 988 } else { 989 opt = NULL; 990 p += SIMPLE_CONN_SIZE; 991 } 992 993 state = ntohs(s->state); 994 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 995 pp = ip_vs_proto_get(s->protocol); 996 if (!pp) { 997 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 998 s->protocol); 999 continue; 1000 } 1001 if (state >= pp->num_states) { 1002 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1003 pp->name, state); 1004 continue; 1005 } 1006 } else { 1007 if (state >= IP_VS_CTPL_S_LAST) 1008 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", 1009 state); 
1010 } 1011 1012 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1013 (const union nf_inet_addr *)&s->caddr, 1014 s->cport, 1015 (const union nf_inet_addr *)&s->vaddr, 1016 s->vport, ¶m); 1017 1018 /* Send timeout as Zero */ 1019 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1020 (union nf_inet_addr *)&s->daddr, s->dport, 1021 0, 0, opt); 1022 } 1023 } 1024 1025 /* 1026 * Handle options 1027 */ 1028 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1029 __u32 *opt_flags, 1030 struct ip_vs_sync_conn_options *opt) 1031 { 1032 struct ip_vs_sync_conn_options *topt; 1033 1034 topt = (struct ip_vs_sync_conn_options *)p; 1035 1036 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1037 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1038 return -EINVAL; 1039 } 1040 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1041 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1042 return -EINVAL; 1043 } 1044 ntoh_seq(&topt->in_seq, &opt->in_seq); 1045 ntoh_seq(&topt->out_seq, &opt->out_seq); 1046 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1047 return 0; 1048 } 1049 1050 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1051 __u8 **data, unsigned int maxlen, 1052 __u32 *opt_flags, __u32 flag) 1053 { 1054 if (plen > maxlen) { 1055 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1056 return -EINVAL; 1057 } 1058 if (*opt_flags & flag) { 1059 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1060 return -EINVAL; 1061 } 1062 *data_len = plen; 1063 *data = p; 1064 *opt_flags |= flag; 1065 return 0; 1066 } 1067 /* 1068 * Process a Version 1 sync. 
connection 1069 */ 1070 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1071 { 1072 struct ip_vs_sync_conn_options opt; 1073 union ip_vs_sync_conn *s; 1074 struct ip_vs_protocol *pp; 1075 struct ip_vs_conn_param param; 1076 __u32 flags; 1077 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1078 __u8 *pe_data=NULL, *pe_name=NULL; 1079 __u32 opt_flags=0; 1080 int retc=0; 1081 1082 s = (union ip_vs_sync_conn *) p; 1083 1084 if (s->v6.type & STYPE_F_INET6) { 1085 #ifdef CONFIG_IP_VS_IPV6 1086 af = AF_INET6; 1087 p += sizeof(struct ip_vs_sync_v6); 1088 #else 1089 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1090 retc = 10; 1091 goto out; 1092 #endif 1093 } else if (!s->v4.type) { 1094 af = AF_INET; 1095 p += sizeof(struct ip_vs_sync_v4); 1096 } else { 1097 return -10; 1098 } 1099 if (p > msg_end) 1100 return -20; 1101 1102 /* Process optional params check Type & Len. */ 1103 while (p < msg_end) { 1104 int ptype; 1105 int plen; 1106 1107 if (p+2 > msg_end) 1108 return -30; 1109 ptype = *(p++); 1110 plen = *(p++); 1111 1112 if (!plen || ((p + plen) > msg_end)) 1113 return -40; 1114 /* Handle seq option p = param data */ 1115 switch (ptype & ~IPVS_OPT_F_PARAM) { 1116 case IPVS_OPT_SEQ_DATA: 1117 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1118 return -50; 1119 break; 1120 1121 case IPVS_OPT_PE_DATA: 1122 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1123 IP_VS_PEDATA_MAXLEN, &opt_flags, 1124 IPVS_OPT_F_PE_DATA)) 1125 return -60; 1126 break; 1127 1128 case IPVS_OPT_PE_NAME: 1129 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1130 IP_VS_PENAME_MAXLEN, &opt_flags, 1131 IPVS_OPT_F_PE_NAME)) 1132 return -70; 1133 break; 1134 1135 default: 1136 /* Param data mandatory ? 
*/ 1137 if (!(ptype & IPVS_OPT_F_PARAM)) { 1138 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1139 ptype & ~IPVS_OPT_F_PARAM); 1140 retc = 20; 1141 goto out; 1142 } 1143 } 1144 p += plen; /* Next option */ 1145 } 1146 1147 /* Get flags and Mask off unsupported */ 1148 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1149 flags |= IP_VS_CONN_F_SYNC; 1150 state = ntohs(s->v4.state); 1151 1152 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1153 pp = ip_vs_proto_get(s->v4.protocol); 1154 if (!pp) { 1155 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1156 s->v4.protocol); 1157 retc = 30; 1158 goto out; 1159 } 1160 if (state >= pp->num_states) { 1161 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1162 pp->name, state); 1163 retc = 40; 1164 goto out; 1165 } 1166 } else { 1167 if (state >= IP_VS_CTPL_S_LAST) 1168 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", 1169 state); 1170 } 1171 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1172 pe_data_len, pe_name, pe_name_len)) { 1173 retc = 50; 1174 goto out; 1175 } 1176 /* If only IPv4, just silent skip IPv6 */ 1177 if (af == AF_INET) 1178 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1179 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1180 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1181 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1182 ); 1183 #ifdef CONFIG_IP_VS_IPV6 1184 else 1185 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1186 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1187 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1188 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1189 ); 1190 #endif 1191 ip_vs_pe_put(param.pe); 1192 return 0; 1193 /* Error exit */ 1194 out: 1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1196 return retc; 1197 1198 } 1199 /* 1200 * Process received multicast message and create the corresponding 1201 * ip_vs_conn entries. 
1202 * Handles Version 0 & 1 1203 */ 1204 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1205 const size_t buflen) 1206 { 1207 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1208 __u8 *p, *msg_end; 1209 int i, nr_conns; 1210 1211 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1212 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1213 return; 1214 } 1215 1216 if (buflen != ntohs(m2->size)) { 1217 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1218 return; 1219 } 1220 /* SyncID sanity check */ 1221 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1222 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1223 return; 1224 } 1225 /* Handle version 1 message */ 1226 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1227 && (m2->spare == 0)) { 1228 1229 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1230 nr_conns = m2->nr_conns; 1231 1232 for (i=0; i<nr_conns; i++) { 1233 union ip_vs_sync_conn *s; 1234 unsigned int size; 1235 int retc; 1236 1237 p = msg_end; 1238 if (p + sizeof(s->v4) > buffer+buflen) { 1239 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); 1240 return; 1241 } 1242 s = (union ip_vs_sync_conn *)p; 1243 size = ntohs(s->v4.ver_size) & SVER_MASK; 1244 msg_end = p + size; 1245 /* Basic sanity checks */ 1246 if (msg_end > buffer+buflen) { 1247 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1248 return; 1249 } 1250 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1252 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1253 return; 1254 } 1255 /* Process a single sync_conn */ 1256 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1257 if (retc < 0) { 1258 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1259 retc); 1260 return; 1261 } 1262 /* Make sure we have 32 bit alignment */ 1263 msg_end = p + ((size + 3) & ~3); 1264 } 1265 } else { 1266 /* Old type of message */ 1267 ip_vs_process_message_v0(ipvs, 
buffer, buflen); 1268 return; 1269 } 1270 } 1271 1272 1273 /* 1274 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1275 */ 1276 static void set_sock_size(struct sock *sk, int mode, int val) 1277 { 1278 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1279 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1280 lock_sock(sk); 1281 if (mode) { 1282 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1283 sysctl_wmem_max); 1284 sk->sk_sndbuf = val * 2; 1285 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1286 } else { 1287 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1288 sysctl_rmem_max); 1289 sk->sk_rcvbuf = val * 2; 1290 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1291 } 1292 release_sock(sk); 1293 } 1294 1295 /* 1296 * Setup loopback of outgoing multicasts on a sending socket 1297 */ 1298 static void set_mcast_loop(struct sock *sk, u_char loop) 1299 { 1300 struct inet_sock *inet = inet_sk(sk); 1301 1302 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1303 lock_sock(sk); 1304 inet->mc_loop = loop ? 1 : 0; 1305 #ifdef CONFIG_IP_VS_IPV6 1306 if (sk->sk_family == AF_INET6) { 1307 struct ipv6_pinfo *np = inet6_sk(sk); 1308 1309 /* IPV6_MULTICAST_LOOP */ 1310 np->mc_loop = loop ? 
1 : 0; 1311 } 1312 #endif 1313 release_sock(sk); 1314 } 1315 1316 /* 1317 * Specify TTL for outgoing multicasts on a sending socket 1318 */ 1319 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1320 { 1321 struct inet_sock *inet = inet_sk(sk); 1322 1323 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1324 lock_sock(sk); 1325 inet->mc_ttl = ttl; 1326 #ifdef CONFIG_IP_VS_IPV6 1327 if (sk->sk_family == AF_INET6) { 1328 struct ipv6_pinfo *np = inet6_sk(sk); 1329 1330 /* IPV6_MULTICAST_HOPS */ 1331 np->mcast_hops = ttl; 1332 } 1333 #endif 1334 release_sock(sk); 1335 } 1336 1337 /* Control fragmentation of messages */ 1338 static void set_mcast_pmtudisc(struct sock *sk, int val) 1339 { 1340 struct inet_sock *inet = inet_sk(sk); 1341 1342 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1343 lock_sock(sk); 1344 inet->pmtudisc = val; 1345 #ifdef CONFIG_IP_VS_IPV6 1346 if (sk->sk_family == AF_INET6) { 1347 struct ipv6_pinfo *np = inet6_sk(sk); 1348 1349 /* IPV6_MTU_DISCOVER */ 1350 np->pmtudisc = val; 1351 } 1352 #endif 1353 release_sock(sk); 1354 } 1355 1356 /* 1357 * Specifiy default interface for outgoing multicasts 1358 */ 1359 static int set_mcast_if(struct sock *sk, struct net_device *dev) 1360 { 1361 struct inet_sock *inet = inet_sk(sk); 1362 1363 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1364 return -EINVAL; 1365 1366 lock_sock(sk); 1367 inet->mc_index = dev->ifindex; 1368 /* inet->mc_addr = 0; */ 1369 #ifdef CONFIG_IP_VS_IPV6 1370 if (sk->sk_family == AF_INET6) { 1371 struct ipv6_pinfo *np = inet6_sk(sk); 1372 1373 /* IPV6_MULTICAST_IF */ 1374 np->mcast_oif = dev->ifindex; 1375 } 1376 #endif 1377 release_sock(sk); 1378 1379 return 0; 1380 } 1381 1382 1383 /* 1384 * Join a multicast group. 1385 * the group is specified by a class D multicast address 224.0.0.0/8 1386 * in the in_addr structure passed in as a parameter. 
1387 */ 1388 static int 1389 join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) 1390 { 1391 struct ip_mreqn mreq; 1392 int ret; 1393 1394 memset(&mreq, 0, sizeof(mreq)); 1395 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1396 1397 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1398 return -EINVAL; 1399 1400 mreq.imr_ifindex = dev->ifindex; 1401 1402 lock_sock(sk); 1403 ret = ip_mc_join_group(sk, &mreq); 1404 release_sock(sk); 1405 1406 return ret; 1407 } 1408 1409 #ifdef CONFIG_IP_VS_IPV6 1410 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1411 struct net_device *dev) 1412 { 1413 int ret; 1414 1415 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1416 return -EINVAL; 1417 1418 lock_sock(sk); 1419 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1420 release_sock(sk); 1421 1422 return ret; 1423 } 1424 #endif 1425 1426 static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) 1427 { 1428 __be32 addr; 1429 struct sockaddr_in sin; 1430 1431 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1432 if (!addr) 1433 pr_err("You probably need to specify IP address on " 1434 "multicast interface.\n"); 1435 1436 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1437 dev->name, &addr); 1438 1439 /* Now bind the socket with the address of multicast interface */ 1440 sin.sin_family = AF_INET; 1441 sin.sin_addr.s_addr = addr; 1442 sin.sin_port = 0; 1443 1444 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1445 } 1446 1447 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1448 struct ipvs_sync_daemon_cfg *c, int id) 1449 { 1450 if (AF_INET6 == c->mcast_af) { 1451 sa->in6 = (struct sockaddr_in6) { 1452 .sin6_family = AF_INET6, 1453 .sin6_port = htons(c->mcast_port + id), 1454 }; 1455 sa->in6.sin6_addr = c->mcast_group.in6; 1456 *salen = sizeof(sa->in6); 1457 } else { 1458 sa->in = (struct sockaddr_in) { 1459 .sin_family = AF_INET, 1460 
.sin_port = htons(c->mcast_port + id), 1461 }; 1462 sa->in.sin_addr = c->mcast_group.in; 1463 *salen = sizeof(sa->in); 1464 } 1465 } 1466 1467 /* 1468 * Set up sending multicast socket over UDP 1469 */ 1470 static int make_send_sock(struct netns_ipvs *ipvs, int id, 1471 struct net_device *dev, struct socket **sock_ret) 1472 { 1473 /* multicast addr */ 1474 union ipvs_sockaddr mcast_addr; 1475 struct socket *sock; 1476 int result, salen; 1477 1478 /* First create a socket */ 1479 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1480 IPPROTO_UDP, &sock); 1481 if (result < 0) { 1482 pr_err("Error during creation of socket; terminating\n"); 1483 goto error; 1484 } 1485 *sock_ret = sock; 1486 result = set_mcast_if(sock->sk, dev); 1487 if (result < 0) { 1488 pr_err("Error setting outbound mcast interface\n"); 1489 goto error; 1490 } 1491 1492 set_mcast_loop(sock->sk, 0); 1493 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1494 /* Allow fragmentation if MTU changes */ 1495 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1496 result = sysctl_sync_sock_size(ipvs); 1497 if (result > 0) 1498 set_sock_size(sock->sk, 1, result); 1499 1500 if (AF_INET == ipvs->mcfg.mcast_af) 1501 result = bind_mcastif_addr(sock, dev); 1502 else 1503 result = 0; 1504 if (result < 0) { 1505 pr_err("Error binding address of the mcast interface\n"); 1506 goto error; 1507 } 1508 1509 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1510 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1511 salen, 0); 1512 if (result < 0) { 1513 pr_err("Error connecting to the multicast addr\n"); 1514 goto error; 1515 } 1516 1517 return 0; 1518 1519 error: 1520 return result; 1521 } 1522 1523 1524 /* 1525 * Set up receiving multicast socket over UDP 1526 */ 1527 static int make_receive_sock(struct netns_ipvs *ipvs, int id, 1528 struct net_device *dev, struct socket **sock_ret) 1529 { 1530 /* multicast addr */ 1531 union ipvs_sockaddr mcast_addr; 1532 struct socket *sock; 
1533 int result, salen; 1534 1535 /* First create a socket */ 1536 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1537 IPPROTO_UDP, &sock); 1538 if (result < 0) { 1539 pr_err("Error during creation of socket; terminating\n"); 1540 goto error; 1541 } 1542 *sock_ret = sock; 1543 /* it is equivalent to the REUSEADDR option in user-space */ 1544 sock->sk->sk_reuse = SK_CAN_REUSE; 1545 result = sysctl_sync_sock_size(ipvs); 1546 if (result > 0) 1547 set_sock_size(sock->sk, 0, result); 1548 1549 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1550 sock->sk->sk_bound_dev_if = dev->ifindex; 1551 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1552 if (result < 0) { 1553 pr_err("Error binding to the multicast addr\n"); 1554 goto error; 1555 } 1556 1557 /* join the multicast group */ 1558 #ifdef CONFIG_IP_VS_IPV6 1559 if (ipvs->bcfg.mcast_af == AF_INET6) 1560 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1561 dev); 1562 else 1563 #endif 1564 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1565 dev); 1566 if (result < 0) { 1567 pr_err("Error joining to the multicast group\n"); 1568 goto error; 1569 } 1570 1571 return 0; 1572 1573 error: 1574 return result; 1575 } 1576 1577 1578 static int 1579 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1580 { 1581 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1582 struct kvec iov; 1583 int len; 1584 1585 EnterFunction(7); 1586 iov.iov_base = (void *)buffer; 1587 iov.iov_len = length; 1588 1589 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1590 1591 LeaveFunction(7); 1592 return len; 1593 } 1594 1595 static int 1596 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1597 { 1598 int msize; 1599 int ret; 1600 1601 msize = ntohs(msg->size); 1602 1603 ret = ip_vs_send_async(sock, (char *)msg, msize); 1604 if (ret >= 0 || ret == -EAGAIN) 1605 return ret; 1606 
pr_err("ip_vs_send_async error %d\n", ret); 1607 return 0; 1608 } 1609 1610 static int 1611 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1612 { 1613 struct msghdr msg = {NULL,}; 1614 struct kvec iov = {buffer, buflen}; 1615 int len; 1616 1617 EnterFunction(7); 1618 1619 /* Receive a packet */ 1620 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); 1621 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); 1622 if (len < 0) 1623 return len; 1624 1625 LeaveFunction(7); 1626 return len; 1627 } 1628 1629 /* Wakeup the master thread for sending */ 1630 static void master_wakeup_work_handler(struct work_struct *work) 1631 { 1632 struct ipvs_master_sync_state *ms = 1633 container_of(work, struct ipvs_master_sync_state, 1634 master_wakeup_work.work); 1635 struct netns_ipvs *ipvs = ms->ipvs; 1636 1637 spin_lock_bh(&ipvs->sync_lock); 1638 if (ms->sync_queue_len && 1639 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1640 int id = (int)(ms - ipvs->ms); 1641 1642 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1643 wake_up_process(ipvs->master_tinfo[id].task); 1644 } 1645 spin_unlock_bh(&ipvs->sync_lock); 1646 } 1647 1648 /* Get next buffer to send */ 1649 static inline struct ip_vs_sync_buff * 1650 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1651 { 1652 struct ip_vs_sync_buff *sb; 1653 1654 sb = sb_dequeue(ipvs, ms); 1655 if (sb) 1656 return sb; 1657 /* Do not delay entries in buffer for more than 2 seconds */ 1658 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1659 } 1660 1661 static int sync_thread_master(void *data) 1662 { 1663 struct ip_vs_sync_thread_data *tinfo = data; 1664 struct netns_ipvs *ipvs = tinfo->ipvs; 1665 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1666 struct sock *sk = tinfo->sock->sk; 1667 struct ip_vs_sync_buff *sb; 1668 1669 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1670 "syncid = %d, id = %d\n", 1671 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1672 
1673 for (;;) { 1674 sb = next_sync_buff(ipvs, ms); 1675 if (unlikely(kthread_should_stop())) 1676 break; 1677 if (!sb) { 1678 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1679 continue; 1680 } 1681 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1682 /* (Ab)use interruptible sleep to avoid increasing 1683 * the load avg. 1684 */ 1685 __wait_event_interruptible(*sk_sleep(sk), 1686 sock_writeable(sk) || 1687 kthread_should_stop()); 1688 if (unlikely(kthread_should_stop())) 1689 goto done; 1690 } 1691 ip_vs_sync_buff_release(sb); 1692 } 1693 1694 done: 1695 __set_current_state(TASK_RUNNING); 1696 if (sb) 1697 ip_vs_sync_buff_release(sb); 1698 1699 /* clean up the sync_buff queue */ 1700 while ((sb = sb_dequeue(ipvs, ms))) 1701 ip_vs_sync_buff_release(sb); 1702 __set_current_state(TASK_RUNNING); 1703 1704 /* clean up the current sync_buff */ 1705 sb = get_curr_sync_buff(ipvs, ms, 0); 1706 if (sb) 1707 ip_vs_sync_buff_release(sb); 1708 1709 return 0; 1710 } 1711 1712 1713 static int sync_thread_backup(void *data) 1714 { 1715 struct ip_vs_sync_thread_data *tinfo = data; 1716 struct netns_ipvs *ipvs = tinfo->ipvs; 1717 struct sock *sk = tinfo->sock->sk; 1718 struct udp_sock *up = udp_sk(sk); 1719 int len; 1720 1721 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1722 "syncid = %d, id = %d\n", 1723 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1724 1725 while (!kthread_should_stop()) { 1726 wait_event_interruptible(*sk_sleep(sk), 1727 !skb_queue_empty_lockless(&sk->sk_receive_queue) || 1728 !skb_queue_empty_lockless(&up->reader_queue) || 1729 kthread_should_stop()); 1730 1731 /* do we have data now? 
*/ 1732 while (!skb_queue_empty_lockless(&sk->sk_receive_queue) || 1733 !skb_queue_empty_lockless(&up->reader_queue)) { 1734 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1735 ipvs->bcfg.sync_maxlen); 1736 if (len <= 0) { 1737 if (len != -EAGAIN) 1738 pr_err("receiving message error\n"); 1739 break; 1740 } 1741 1742 ip_vs_process_message(ipvs, tinfo->buf, len); 1743 } 1744 } 1745 1746 return 0; 1747 } 1748 1749 1750 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1751 int state) 1752 { 1753 struct ip_vs_sync_thread_data *ti = NULL, *tinfo; 1754 struct task_struct *task; 1755 struct net_device *dev; 1756 char *name; 1757 int (*threadfn)(void *data); 1758 int id = 0, count, hlen; 1759 int result = -ENOMEM; 1760 u16 mtu, min_mtu; 1761 1762 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1763 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1764 sizeof(struct ip_vs_sync_conn_v0)); 1765 1766 /* increase the module use count */ 1767 if (!ip_vs_use_count_inc()) 1768 return -ENOPROTOOPT; 1769 1770 /* Do not hold one mutex and then to block on another */ 1771 for (;;) { 1772 rtnl_lock(); 1773 if (mutex_trylock(&ipvs->sync_mutex)) 1774 break; 1775 rtnl_unlock(); 1776 mutex_lock(&ipvs->sync_mutex); 1777 if (rtnl_trylock()) 1778 break; 1779 mutex_unlock(&ipvs->sync_mutex); 1780 } 1781 1782 if (!ipvs->sync_state) { 1783 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1784 ipvs->threads_mask = count - 1; 1785 } else 1786 count = ipvs->threads_mask + 1; 1787 1788 if (c->mcast_af == AF_UNSPEC) { 1789 c->mcast_af = AF_INET; 1790 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1791 } 1792 if (!c->mcast_port) 1793 c->mcast_port = IP_VS_SYNC_PORT; 1794 if (!c->mcast_ttl) 1795 c->mcast_ttl = 1; 1796 1797 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1798 if (!dev) { 1799 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1800 result = -ENODEV; 1801 goto out_early; 1802 } 1803 hlen = (AF_INET6 == 
c->mcast_af) ? 1804 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1805 sizeof(struct iphdr) + sizeof(struct udphdr); 1806 mtu = (state == IP_VS_STATE_BACKUP) ? 1807 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1808 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1809 1810 if (c->sync_maxlen) 1811 c->sync_maxlen = clamp_t(unsigned int, 1812 c->sync_maxlen, min_mtu, 1813 65535 - hlen); 1814 else 1815 c->sync_maxlen = mtu - hlen; 1816 1817 if (state == IP_VS_STATE_MASTER) { 1818 result = -EEXIST; 1819 if (ipvs->ms) 1820 goto out_early; 1821 1822 ipvs->mcfg = *c; 1823 name = "ipvs-m:%d:%d"; 1824 threadfn = sync_thread_master; 1825 } else if (state == IP_VS_STATE_BACKUP) { 1826 result = -EEXIST; 1827 if (ipvs->backup_tinfo) 1828 goto out_early; 1829 1830 ipvs->bcfg = *c; 1831 name = "ipvs-b:%d:%d"; 1832 threadfn = sync_thread_backup; 1833 } else { 1834 result = -EINVAL; 1835 goto out_early; 1836 } 1837 1838 if (state == IP_VS_STATE_MASTER) { 1839 struct ipvs_master_sync_state *ms; 1840 1841 result = -ENOMEM; 1842 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1843 if (!ipvs->ms) 1844 goto out; 1845 ms = ipvs->ms; 1846 for (id = 0; id < count; id++, ms++) { 1847 INIT_LIST_HEAD(&ms->sync_queue); 1848 ms->sync_queue_len = 0; 1849 ms->sync_queue_delay = 0; 1850 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1851 master_wakeup_work_handler); 1852 ms->ipvs = ipvs; 1853 } 1854 } 1855 result = -ENOMEM; 1856 ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), 1857 GFP_KERNEL); 1858 if (!ti) 1859 goto out; 1860 1861 for (id = 0; id < count; id++) { 1862 tinfo = &ti[id]; 1863 tinfo->ipvs = ipvs; 1864 if (state == IP_VS_STATE_BACKUP) { 1865 result = -ENOMEM; 1866 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1867 GFP_KERNEL); 1868 if (!tinfo->buf) 1869 goto out; 1870 } 1871 tinfo->id = id; 1872 if (state == IP_VS_STATE_MASTER) 1873 result = make_send_sock(ipvs, id, dev, &tinfo->sock); 1874 else 1875 result = make_receive_sock(ipvs, id, dev, &tinfo->sock); 1876 if 
(result < 0) 1877 goto out; 1878 1879 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1880 if (IS_ERR(task)) { 1881 result = PTR_ERR(task); 1882 goto out; 1883 } 1884 tinfo->task = task; 1885 } 1886 1887 /* mark as active */ 1888 1889 if (state == IP_VS_STATE_MASTER) 1890 ipvs->master_tinfo = ti; 1891 else 1892 ipvs->backup_tinfo = ti; 1893 spin_lock_bh(&ipvs->sync_buff_lock); 1894 ipvs->sync_state |= state; 1895 spin_unlock_bh(&ipvs->sync_buff_lock); 1896 1897 mutex_unlock(&ipvs->sync_mutex); 1898 rtnl_unlock(); 1899 1900 return 0; 1901 1902 out: 1903 /* We do not need RTNL lock anymore, release it here so that 1904 * sock_release below can use rtnl_lock to leave the mcast group. 1905 */ 1906 rtnl_unlock(); 1907 id = min(id, count - 1); 1908 if (ti) { 1909 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1910 if (tinfo->task) 1911 kthread_stop(tinfo->task); 1912 } 1913 } 1914 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1915 kfree(ipvs->ms); 1916 ipvs->ms = NULL; 1917 } 1918 mutex_unlock(&ipvs->sync_mutex); 1919 1920 /* No more mutexes, release socks */ 1921 if (ti) { 1922 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1923 if (tinfo->sock) 1924 sock_release(tinfo->sock); 1925 kfree(tinfo->buf); 1926 } 1927 kfree(ti); 1928 } 1929 1930 /* decrease the module use count */ 1931 ip_vs_use_count_dec(); 1932 return result; 1933 1934 out_early: 1935 mutex_unlock(&ipvs->sync_mutex); 1936 rtnl_unlock(); 1937 1938 /* decrease the module use count */ 1939 ip_vs_use_count_dec(); 1940 return result; 1941 } 1942 1943 1944 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1945 { 1946 struct ip_vs_sync_thread_data *ti, *tinfo; 1947 int id; 1948 int retc = -EINVAL; 1949 1950 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1951 1952 mutex_lock(&ipvs->sync_mutex); 1953 if (state == IP_VS_STATE_MASTER) { 1954 retc = -ESRCH; 1955 if (!ipvs->ms) 1956 goto err; 1957 ti = ipvs->master_tinfo; 1958 1959 /* 1960 * The lock synchronizes with sb_queue_tail(), 
so that we don't 1961 * add sync buffers to the queue, when we are already in 1962 * progress of stopping the master sync daemon. 1963 */ 1964 1965 spin_lock_bh(&ipvs->sync_buff_lock); 1966 spin_lock(&ipvs->sync_lock); 1967 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1968 spin_unlock(&ipvs->sync_lock); 1969 spin_unlock_bh(&ipvs->sync_buff_lock); 1970 1971 retc = 0; 1972 for (id = ipvs->threads_mask; id >= 0; id--) { 1973 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1974 int ret; 1975 1976 tinfo = &ti[id]; 1977 pr_info("stopping master sync thread %d ...\n", 1978 task_pid_nr(tinfo->task)); 1979 cancel_delayed_work_sync(&ms->master_wakeup_work); 1980 ret = kthread_stop(tinfo->task); 1981 if (retc >= 0) 1982 retc = ret; 1983 } 1984 kfree(ipvs->ms); 1985 ipvs->ms = NULL; 1986 ipvs->master_tinfo = NULL; 1987 } else if (state == IP_VS_STATE_BACKUP) { 1988 retc = -ESRCH; 1989 if (!ipvs->backup_tinfo) 1990 goto err; 1991 ti = ipvs->backup_tinfo; 1992 1993 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1994 retc = 0; 1995 for (id = ipvs->threads_mask; id >= 0; id--) { 1996 int ret; 1997 1998 tinfo = &ti[id]; 1999 pr_info("stopping backup sync thread %d ...\n", 2000 task_pid_nr(tinfo->task)); 2001 ret = kthread_stop(tinfo->task); 2002 if (retc >= 0) 2003 retc = ret; 2004 } 2005 ipvs->backup_tinfo = NULL; 2006 } else { 2007 goto err; 2008 } 2009 id = ipvs->threads_mask; 2010 mutex_unlock(&ipvs->sync_mutex); 2011 2012 /* No more mutexes, release socks */ 2013 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 2014 if (tinfo->sock) 2015 sock_release(tinfo->sock); 2016 kfree(tinfo->buf); 2017 } 2018 kfree(ti); 2019 2020 /* decrease the module use count */ 2021 ip_vs_use_count_dec(); 2022 return retc; 2023 2024 err: 2025 mutex_unlock(&ipvs->sync_mutex); 2026 return retc; 2027 } 2028 2029 /* 2030 * Initialize data struct for each netns 2031 */ 2032 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2033 { 2034 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 
2035 spin_lock_init(&ipvs->sync_lock); 2036 spin_lock_init(&ipvs->sync_buff_lock); 2037 return 0; 2038 } 2039 2040 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2041 { 2042 int retc; 2043 2044 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2045 if (retc && retc != -ESRCH) 2046 pr_err("Failed to stop Master Daemon\n"); 2047 2048 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2049 if (retc && retc != -ESRCH) 2050 pr_err("Failed to stop Backup Daemon\n"); 2051 } 2052