170041088SAndy Grover /*
2eee2fa6aSKa-Cheong Poon * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
370041088SAndy Grover *
470041088SAndy Grover * This software is available to you under a choice of one of two
570041088SAndy Grover * licenses. You may choose to be licensed under the terms of the GNU
670041088SAndy Grover * General Public License (GPL) Version 2, available from the file
770041088SAndy Grover * COPYING in the main directory of this source tree, or the
870041088SAndy Grover * OpenIB.org BSD license below:
970041088SAndy Grover *
1070041088SAndy Grover * Redistribution and use in source and binary forms, with or
1170041088SAndy Grover * without modification, are permitted provided that the following
1270041088SAndy Grover * conditions are met:
1370041088SAndy Grover *
1470041088SAndy Grover * - Redistributions of source code must retain the above
1570041088SAndy Grover * copyright notice, this list of conditions and the following
1670041088SAndy Grover * disclaimer.
1770041088SAndy Grover *
1870041088SAndy Grover * - Redistributions in binary form must reproduce the above
1970041088SAndy Grover * copyright notice, this list of conditions and the following
2070041088SAndy Grover * disclaimer in the documentation and/or other materials
2170041088SAndy Grover * provided with the distribution.
2270041088SAndy Grover *
2370041088SAndy Grover * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2470041088SAndy Grover * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2570041088SAndy Grover * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2670041088SAndy Grover * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2770041088SAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2870041088SAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2970041088SAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3070041088SAndy Grover * SOFTWARE.
3170041088SAndy Grover *
3270041088SAndy Grover */
3370041088SAndy Grover #include <linux/kernel.h>
3470041088SAndy Grover #include <linux/in.h>
3570041088SAndy Grover #include <net/tcp.h>
3670041088SAndy Grover
3770041088SAndy Grover #include "rds.h"
3870041088SAndy Grover #include "tcp.h"
3970041088SAndy Grover
rds_tcp_state_change(struct sock * sk)4070041088SAndy Grover void rds_tcp_state_change(struct sock *sk)
4170041088SAndy Grover {
4270041088SAndy Grover void (*state_change)(struct sock *sk);
43ea3b1ea5SSowmini Varadhan struct rds_conn_path *cp;
4470041088SAndy Grover struct rds_tcp_connection *tc;
4570041088SAndy Grover
4638036629SEric Dumazet read_lock_bh(&sk->sk_callback_lock);
47ea3b1ea5SSowmini Varadhan cp = sk->sk_user_data;
48ea3b1ea5SSowmini Varadhan if (!cp) {
4970041088SAndy Grover state_change = sk->sk_state_change;
5070041088SAndy Grover goto out;
5170041088SAndy Grover }
52ea3b1ea5SSowmini Varadhan tc = cp->cp_transport_data;
5370041088SAndy Grover state_change = tc->t_orig_state_change;
5470041088SAndy Grover
5570041088SAndy Grover rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
5670041088SAndy Grover
5770041088SAndy Grover switch (sk->sk_state) {
5870041088SAndy Grover /* ignore connecting sockets as they make progress */
5970041088SAndy Grover case TCP_SYN_SENT:
6070041088SAndy Grover case TCP_SYN_RECV:
6170041088SAndy Grover break;
6270041088SAndy Grover case TCP_ESTABLISHED:
631a0e100fSSowmini Varadhan /* Force the peer to reconnect so that we have the
641a0e100fSSowmini Varadhan * TCP ports going from <smaller-ip>.<transient> to
651a0e100fSSowmini Varadhan * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
661a0e100fSSowmini Varadhan * RDS connection as RDS_CONN_UP until the reconnect,
671a0e100fSSowmini Varadhan * to avoid RDS datagram loss.
681a0e100fSSowmini Varadhan */
69eee2fa6aSKa-Cheong Poon if (rds_addr_cmp(&cp->cp_conn->c_laddr,
70eee2fa6aSKa-Cheong Poon &cp->cp_conn->c_faddr) >= 0 &&
711a0e100fSSowmini Varadhan rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
721a0e100fSSowmini Varadhan RDS_CONN_ERROR)) {
73aed20a53SSowmini Varadhan rds_conn_path_drop(cp, false);
741a0e100fSSowmini Varadhan } else {
75ea3b1ea5SSowmini Varadhan rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
761a0e100fSSowmini Varadhan }
7770041088SAndy Grover break;
78f711a6aeSSowmini Varadhan case TCP_CLOSE_WAIT:
7970041088SAndy Grover case TCP_CLOSE:
80aed20a53SSowmini Varadhan rds_conn_path_drop(cp, false);
813754fa74SGustavo A. R. Silva break;
8270041088SAndy Grover default:
8370041088SAndy Grover break;
8470041088SAndy Grover }
8570041088SAndy Grover out:
8638036629SEric Dumazet read_unlock_bh(&sk->sk_callback_lock);
8770041088SAndy Grover state_change(sk);
8870041088SAndy Grover }
8970041088SAndy Grover
rds_tcp_conn_path_connect(struct rds_conn_path * cp)90b04e8554SSowmini Varadhan int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
9170041088SAndy Grover {
9270041088SAndy Grover struct socket *sock = NULL;
931e2b44e7SKa-Cheong Poon struct sockaddr_in6 sin6;
94eee2fa6aSKa-Cheong Poon struct sockaddr_in sin;
95eee2fa6aSKa-Cheong Poon struct sockaddr *addr;
96eee2fa6aSKa-Cheong Poon int addrlen;
971e2b44e7SKa-Cheong Poon bool isv6;
9870041088SAndy Grover int ret;
99b04e8554SSowmini Varadhan struct rds_connection *conn = cp->cp_conn;
100b04e8554SSowmini Varadhan struct rds_tcp_connection *tc = cp->cp_transport_data;
10170041088SAndy Grover
1025916e2c1SSowmini Varadhan /* for multipath rds,we only trigger the connection after
1035916e2c1SSowmini Varadhan * the handshake probe has determined the number of paths.
1045916e2c1SSowmini Varadhan */
1055916e2c1SSowmini Varadhan if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
1065916e2c1SSowmini Varadhan return -EAGAIN;
1075916e2c1SSowmini Varadhan
10802105b2cSSowmini Varadhan mutex_lock(&tc->t_conn_path_lock);
109bd7c5f98SSowmini Varadhan
110b04e8554SSowmini Varadhan if (rds_conn_path_up(cp)) {
11102105b2cSSowmini Varadhan mutex_unlock(&tc->t_conn_path_lock);
112bd7c5f98SSowmini Varadhan return 0;
113bd7c5f98SSowmini Varadhan }
1141e2b44e7SKa-Cheong Poon if (ipv6_addr_v4mapped(&conn->c_laddr)) {
115d5a8ac28SSowmini Varadhan ret = sock_create_kern(rds_conn_net(conn), PF_INET,
116d5a8ac28SSowmini Varadhan SOCK_STREAM, IPPROTO_TCP, &sock);
1171e2b44e7SKa-Cheong Poon isv6 = false;
1181e2b44e7SKa-Cheong Poon } else {
1191e2b44e7SKa-Cheong Poon ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
1201e2b44e7SKa-Cheong Poon SOCK_STREAM, IPPROTO_TCP, &sock);
1211e2b44e7SKa-Cheong Poon isv6 = true;
1221e2b44e7SKa-Cheong Poon }
1231e2b44e7SKa-Cheong Poon
12470041088SAndy Grover if (ret < 0)
12570041088SAndy Grover goto out;
12670041088SAndy Grover
1276997fbd7STetsuo Handa if (!rds_tcp_tune(sock)) {
1286997fbd7STetsuo Handa ret = -EINVAL;
1296997fbd7STetsuo Handa goto out;
1306997fbd7STetsuo Handa }
13170041088SAndy Grover
1321e2b44e7SKa-Cheong Poon if (isv6) {
1331e2b44e7SKa-Cheong Poon sin6.sin6_family = AF_INET6;
1341e2b44e7SKa-Cheong Poon sin6.sin6_addr = conn->c_laddr;
1351e2b44e7SKa-Cheong Poon sin6.sin6_port = 0;
1361e2b44e7SKa-Cheong Poon sin6.sin6_flowinfo = 0;
1371e2b44e7SKa-Cheong Poon sin6.sin6_scope_id = conn->c_dev_if;
1381e2b44e7SKa-Cheong Poon addr = (struct sockaddr *)&sin6;
1391e2b44e7SKa-Cheong Poon addrlen = sizeof(sin6);
1401e2b44e7SKa-Cheong Poon } else {
141eee2fa6aSKa-Cheong Poon sin.sin_family = AF_INET;
142eee2fa6aSKa-Cheong Poon sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
143eee2fa6aSKa-Cheong Poon sin.sin_port = 0;
144eee2fa6aSKa-Cheong Poon addr = (struct sockaddr *)&sin;
145eee2fa6aSKa-Cheong Poon addrlen = sizeof(sin);
1461e2b44e7SKa-Cheong Poon }
14770041088SAndy Grover
148*c889a99aSJordan Rife ret = kernel_bind(sock, addr, addrlen);
14970041088SAndy Grover if (ret) {
150eee2fa6aSKa-Cheong Poon rdsdebug("bind failed with %d at address %pI6c\n",
1516884b348SJoe Perches ret, &conn->c_laddr);
15270041088SAndy Grover goto out;
15370041088SAndy Grover }
15470041088SAndy Grover
1551e2b44e7SKa-Cheong Poon if (isv6) {
1561e2b44e7SKa-Cheong Poon sin6.sin6_family = AF_INET6;
1571e2b44e7SKa-Cheong Poon sin6.sin6_addr = conn->c_faddr;
1581e2b44e7SKa-Cheong Poon sin6.sin6_port = htons(RDS_TCP_PORT);
1591e2b44e7SKa-Cheong Poon sin6.sin6_flowinfo = 0;
1601e2b44e7SKa-Cheong Poon sin6.sin6_scope_id = conn->c_dev_if;
1611e2b44e7SKa-Cheong Poon addr = (struct sockaddr *)&sin6;
1621e2b44e7SKa-Cheong Poon addrlen = sizeof(sin6);
1631e2b44e7SKa-Cheong Poon } else {
164eee2fa6aSKa-Cheong Poon sin.sin_family = AF_INET;
165eee2fa6aSKa-Cheong Poon sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
166eee2fa6aSKa-Cheong Poon sin.sin_port = htons(RDS_TCP_PORT);
167eee2fa6aSKa-Cheong Poon addr = (struct sockaddr *)&sin;
168eee2fa6aSKa-Cheong Poon addrlen = sizeof(sin);
1691e2b44e7SKa-Cheong Poon }
17070041088SAndy Grover
17170041088SAndy Grover /*
17270041088SAndy Grover * once we call connect() we can start getting callbacks and they
17370041088SAndy Grover * own the socket
17470041088SAndy Grover */
175ea3b1ea5SSowmini Varadhan rds_tcp_set_callbacks(sock, cp);
17626297b4cSJordan Rife ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK);
17770041088SAndy Grover
178eee2fa6aSKa-Cheong Poon rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
17970041088SAndy Grover if (ret == -EINPROGRESS)
18070041088SAndy Grover ret = 0;
181467fa153SSowmini Varadhan if (ret == 0) {
182467fa153SSowmini Varadhan rds_tcp_keepalive(sock);
183eb74cc97SHerton R. Krzesinski sock = NULL;
184467fa153SSowmini Varadhan } else {
185b04e8554SSowmini Varadhan rds_tcp_restore_callbacks(sock, cp->cp_transport_data);
186467fa153SSowmini Varadhan }
18770041088SAndy Grover
18870041088SAndy Grover out:
18902105b2cSSowmini Varadhan mutex_unlock(&tc->t_conn_path_lock);
19070041088SAndy Grover if (sock)
19170041088SAndy Grover sock_release(sock);
19270041088SAndy Grover return ret;
19370041088SAndy Grover }
19470041088SAndy Grover
19570041088SAndy Grover /*
19670041088SAndy Grover * Before killing the tcp socket this needs to serialize with callbacks. The
19770041088SAndy Grover * caller has already grabbed the sending sem so we're serialized with other
19870041088SAndy Grover * senders.
19970041088SAndy Grover *
20070041088SAndy Grover * TCP calls the callbacks with the sock lock so we hold it while we reset the
20170041088SAndy Grover * callbacks to those set by TCP. Our callbacks won't execute again once we
20270041088SAndy Grover * hold the sock lock.
20370041088SAndy Grover */
rds_tcp_conn_path_shutdown(struct rds_conn_path * cp)204226f7a7dSSowmini Varadhan void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
20570041088SAndy Grover {
206226f7a7dSSowmini Varadhan struct rds_tcp_connection *tc = cp->cp_transport_data;
20770041088SAndy Grover struct socket *sock = tc->t_sock;
20870041088SAndy Grover
209226f7a7dSSowmini Varadhan rdsdebug("shutting down conn %p tc %p sock %p\n",
210226f7a7dSSowmini Varadhan cp->cp_conn, tc, sock);
21170041088SAndy Grover
21270041088SAndy Grover if (sock) {
213ebeeb1adSSowmini Varadhan if (rds_destroy_pending(cp->cp_conn))
214c433594cSChristoph Hellwig sock_no_linger(sock->sk);
21570041088SAndy Grover sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
21670041088SAndy Grover lock_sock(sock->sk);
21770041088SAndy Grover rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
21870041088SAndy Grover
21970041088SAndy Grover release_sock(sock->sk);
22070041088SAndy Grover sock_release(sock);
221ccbd6a5aSJoe Perches }
22270041088SAndy Grover
22370041088SAndy Grover if (tc->t_tinc) {
22470041088SAndy Grover rds_inc_put(&tc->t_tinc->ti_inc);
22570041088SAndy Grover tc->t_tinc = NULL;
22670041088SAndy Grover }
22770041088SAndy Grover tc->t_tinc_hdr_rem = sizeof(struct rds_header);
22870041088SAndy Grover tc->t_tinc_data_rem = 0;
22970041088SAndy Grover }
230