xref: /openbmc/linux/net/rds/send.c (revision 4a4dffdf)
15c115590SAndy Grover /*
2a43cced9SKa-Cheong Poon  * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
35c115590SAndy Grover  *
45c115590SAndy Grover  * This software is available to you under a choice of one of two
55c115590SAndy Grover  * licenses.  You may choose to be licensed under the terms of the GNU
65c115590SAndy Grover  * General Public License (GPL) Version 2, available from the file
75c115590SAndy Grover  * COPYING in the main directory of this source tree, or the
85c115590SAndy Grover  * OpenIB.org BSD license below:
95c115590SAndy Grover  *
105c115590SAndy Grover  *     Redistribution and use in source and binary forms, with or
115c115590SAndy Grover  *     without modification, are permitted provided that the following
125c115590SAndy Grover  *     conditions are met:
135c115590SAndy Grover  *
145c115590SAndy Grover  *      - Redistributions of source code must retain the above
155c115590SAndy Grover  *        copyright notice, this list of conditions and the following
165c115590SAndy Grover  *        disclaimer.
175c115590SAndy Grover  *
185c115590SAndy Grover  *      - Redistributions in binary form must reproduce the above
195c115590SAndy Grover  *        copyright notice, this list of conditions and the following
205c115590SAndy Grover  *        disclaimer in the documentation and/or other materials
215c115590SAndy Grover  *        provided with the distribution.
225c115590SAndy Grover  *
235c115590SAndy Grover  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
245c115590SAndy Grover  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
255c115590SAndy Grover  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
265c115590SAndy Grover  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
275c115590SAndy Grover  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
285c115590SAndy Grover  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
295c115590SAndy Grover  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
305c115590SAndy Grover  * SOFTWARE.
315c115590SAndy Grover  *
325c115590SAndy Grover  */
335c115590SAndy Grover #include <linux/kernel.h>
34d9b93842SPaul Gortmaker #include <linux/moduleparam.h>
355a0e3ad6STejun Heo #include <linux/gfp.h>
365c115590SAndy Grover #include <net/sock.h>
375c115590SAndy Grover #include <linux/in.h>
385c115590SAndy Grover #include <linux/list.h>
39cb0a6056SManuel Zerpies #include <linux/ratelimit.h>
40bc3b2d7fSPaul Gortmaker #include <linux/export.h>
414bebdd7aSSantosh Shilimkar #include <linux/sizes.h>
425c115590SAndy Grover 
435c115590SAndy Grover #include "rds.h"
445c115590SAndy Grover 
455c115590SAndy Grover /* When transmitting messages in rds_send_xmit, we need to emerge from
465c115590SAndy Grover  * time to time and briefly release the CPU. Otherwise the softlockup watchdog
475c115590SAndy Grover  * will kick our shin.
485c115590SAndy Grover  * Also, it seems fairer to not let one busy connection stall all the
495c115590SAndy Grover  * others.
505c115590SAndy Grover  *
515c115590SAndy Grover  * send_batch_count is the number of times we'll loop in send_xmit. Setting
525c115590SAndy Grover  * it to 0 will restore the old behavior (where we looped until we had
535c115590SAndy Grover  * drained the queue).
545c115590SAndy Grover  */
554bebdd7aSSantosh Shilimkar static int send_batch_count = SZ_1K;
565c115590SAndy Grover module_param(send_batch_count, int, 0444);
575c115590SAndy Grover MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
585c115590SAndy Grover 
59ff51bf84Sstephen hemminger static void rds_send_remove_from_sock(struct list_head *messages, int status);
60ff51bf84Sstephen hemminger 
615c115590SAndy Grover /*
620f4b1c7eSZach Brown  * Reset the send state.  Callers must ensure that this doesn't race with
630f4b1c7eSZach Brown  * rds_send_xmit().
645c115590SAndy Grover  */
65d769ef81SSowmini Varadhan void rds_send_path_reset(struct rds_conn_path *cp)
665c115590SAndy Grover {
675c115590SAndy Grover 	struct rds_message *rm, *tmp;
685c115590SAndy Grover 	unsigned long flags;
695c115590SAndy Grover 
704e9b551cSSowmini Varadhan 	if (cp->cp_xmit_rm) {
714e9b551cSSowmini Varadhan 		rm = cp->cp_xmit_rm;
724e9b551cSSowmini Varadhan 		cp->cp_xmit_rm = NULL;
735c115590SAndy Grover 		/* Tell the user the RDMA op is no longer mapped by the
745c115590SAndy Grover 		 * transport. This isn't entirely true (it's flushed out
755c115590SAndy Grover 		 * independently) but as the connection is down, there's
765c115590SAndy Grover 		 * no ongoing RDMA to/from that memory */
777e3f2952SChris Mason 		rds_message_unmapped(rm);
787e3f2952SChris Mason 		rds_message_put(rm);
795c115590SAndy Grover 	}
807e3f2952SChris Mason 
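	/* Reset the partial-transmit cursor so the next rds_send_xmit()
	 * starts the current/next message from its beginning.
	 */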
814e9b551cSSowmini Varadhan 	cp->cp_xmit_sg = 0;
824e9b551cSSowmini Varadhan 	cp->cp_xmit_hdr_off = 0;
834e9b551cSSowmini Varadhan 	cp->cp_xmit_data_off = 0;
844e9b551cSSowmini Varadhan 	cp->cp_xmit_atomic_sent = 0;
854e9b551cSSowmini Varadhan 	cp->cp_xmit_rdma_sent = 0;
864e9b551cSSowmini Varadhan 	cp->cp_xmit_data_sent = 0;
875c115590SAndy Grover 
884e9b551cSSowmini Varadhan 	cp->cp_conn->c_map_queued = 0;
895c115590SAndy Grover 
904e9b551cSSowmini Varadhan 	cp->cp_unacked_packets = rds_sysctl_max_unacked_packets;
914e9b551cSSowmini Varadhan 	cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes;
925c115590SAndy Grover 
935c115590SAndy Grover 	/* Mark messages as retransmissions, and move them to the send queue */
944e9b551cSSowmini Varadhan 	spin_lock_irqsave(&cp->cp_lock, flags);
954e9b551cSSowmini Varadhan 	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
965c115590SAndy Grover 		set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
975c115590SAndy Grover 		set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
985c115590SAndy Grover 	}
994e9b551cSSowmini Varadhan 	list_splice_init(&cp->cp_retrans, &cp->cp_send_queue);
1004e9b551cSSowmini Varadhan 	spin_unlock_irqrestore(&cp->cp_lock, flags);
1014e9b551cSSowmini Varadhan }
102d769ef81SSowmini Varadhan EXPORT_SYMBOL_GPL(rds_send_path_reset);
1035c115590SAndy Grover 
1041f9ecd7eSSowmini Varadhan static int acquire_in_xmit(struct rds_conn_path *cp)
1050f4b1c7eSZach Brown {
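	/* test_and_set_bit_lock() returns the old bit value, so a zero result
	 * means this caller now owns RDS_IN_XMIT.  The _lock/_unlock bit ops
	 * provide the acquire/release ordering that pairs with
	 * clear_bit_unlock() in release_in_xmit().
	 */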
106*4a4dffdfSYewon Choi 	return test_and_set_bit_lock(RDS_IN_XMIT, &cp->cp_flags) == 0;
1070f4b1c7eSZach Brown }
1080f4b1c7eSZach Brown 
1091f9ecd7eSSowmini Varadhan static void release_in_xmit(struct rds_conn_path *cp)
1100f4b1c7eSZach Brown {
111*4a4dffdfSYewon Choi 	clear_bit_unlock(RDS_IN_XMIT, &cp->cp_flags);
1120f4b1c7eSZach Brown 	/*
1130f4b1c7eSZach Brown 	 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
1140f4b1c7eSZach Brown 	 * hot path and finding waiters is very rare.  We don't want to walk
1150f4b1c7eSZach Brown 	 * the system-wide hashed waitqueue buckets in the fast path only to
1160f4b1c7eSZach Brown 	 * almost never find waiters.
1170f4b1c7eSZach Brown 	 */
1181f9ecd7eSSowmini Varadhan 	if (waitqueue_active(&cp->cp_waitq))
1191f9ecd7eSSowmini Varadhan 		wake_up_all(&cp->cp_waitq);
1200f4b1c7eSZach Brown }
1210f4b1c7eSZach Brown 
1225c115590SAndy Grover /*
12325985edcSLucas De Marchi  * We're making the conscious trade-off here to only send one message
1245c115590SAndy Grover  * down the connection at a time.
1255c115590SAndy Grover  *   Pro:
1265c115590SAndy Grover  *      - tx queueing is a simple fifo list
1275c115590SAndy Grover  *   	- reassembly is optional and easily done by transports per conn
1285c115590SAndy Grover  *      - no per flow rx lookup at all, straight to the socket
1295c115590SAndy Grover  *   	- less per-frag memory and wire overhead
1305c115590SAndy Grover  *   Con:
1315c115590SAndy Grover  *      - queued acks can be delayed behind large messages
1325c115590SAndy Grover  *   Depends:
1335c115590SAndy Grover  *      - small message latency is higher behind queued large messages
1345c115590SAndy Grover  *      - large message latency isn't starved by intervening small sends
1355c115590SAndy Grover  */
1361f9ecd7eSSowmini Varadhan int rds_send_xmit(struct rds_conn_path *cp)
1375c115590SAndy Grover {
1381f9ecd7eSSowmini Varadhan 	struct rds_connection *conn = cp->cp_conn;
1395c115590SAndy Grover 	struct rds_message *rm;
1405c115590SAndy Grover 	unsigned long flags;
1415c115590SAndy Grover 	unsigned int tmp;
1425c115590SAndy Grover 	struct scatterlist *sg;
1435c115590SAndy Grover 	int ret = 0;
1445c115590SAndy Grover 	LIST_HEAD(to_be_dropped);
145443be0e5SSowmini Varadhan 	int batch_count;
146443be0e5SSowmini Varadhan 	unsigned long send_gen = 0;
14711740ef4SAndy Grover 	int same_rm = 0;
1485c115590SAndy Grover 
149fcc5450cSAndy Grover restart:
150443be0e5SSowmini Varadhan 	batch_count = 0;
151049ee3f5SAndy Grover 
1525c115590SAndy Grover 	/*
1535c115590SAndy Grover 	 * sendmsg calls here after having queued its message on the send
1545c115590SAndy Grover 	 * queue.  We only have one task feeding the connection at a time.  If
1555c115590SAndy Grover 	 * another thread is already feeding the queue then we back off.  This
1565c115590SAndy Grover 	 * avoids blocking the caller and trading per-connection data between
1575c115590SAndy Grover 	 * caches per message.
1585c115590SAndy Grover 	 */
1591f9ecd7eSSowmini Varadhan 	if (!acquire_in_xmit(cp)) {
160049ee3f5SAndy Grover 		rds_stats_inc(s_send_lock_contention);
1615c115590SAndy Grover 		ret = -ENOMEM;
1625c115590SAndy Grover 		goto out;
1635c115590SAndy Grover 	}
1640f4b1c7eSZach Brown 
165ebeeb1adSSowmini Varadhan 	if (rds_destroy_pending(cp->cp_conn)) {
1663db6e0d1SSowmini Varadhan 		release_in_xmit(cp);
1673db6e0d1SSowmini Varadhan 		ret = -ENETUNREACH; /* don't requeue send work */
1683db6e0d1SSowmini Varadhan 		goto out;
1693db6e0d1SSowmini Varadhan 	}
1703db6e0d1SSowmini Varadhan 
1710f4b1c7eSZach Brown 	/*
172443be0e5SSowmini Varadhan 	 * we record the send generation after doing the xmit acquire.
173443be0e5SSowmini Varadhan 	 * if someone else manages to jump in and do some work, we'll use
174443be0e5SSowmini Varadhan 	 * this to avoid a goto restart farther down.
175443be0e5SSowmini Varadhan 	 *
176443be0e5SSowmini Varadhan 	 * The acquire_in_xmit() check above ensures that only one
177443be0e5SSowmini Varadhan 	 * caller can increment c_send_gen at any time.
178443be0e5SSowmini Varadhan 	 */
179e623a48eSHåkon Bugge 	send_gen = READ_ONCE(cp->cp_send_gen) + 1;
180e623a48eSHåkon Bugge 	WRITE_ONCE(cp->cp_send_gen, send_gen);
181443be0e5SSowmini Varadhan 
182443be0e5SSowmini Varadhan 	/*
1830f4b1c7eSZach Brown 	 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
1840f4b1c7eSZach Brown 	 * we do the opposite to avoid races.
1850f4b1c7eSZach Brown 	 */
1861f9ecd7eSSowmini Varadhan 	if (!rds_conn_path_up(cp)) {
1871f9ecd7eSSowmini Varadhan 		release_in_xmit(cp);
1880f4b1c7eSZach Brown 		ret = 0;
1890f4b1c7eSZach Brown 		goto out;
1900f4b1c7eSZach Brown 	}
1915c115590SAndy Grover 
1921f9ecd7eSSowmini Varadhan 	if (conn->c_trans->xmit_path_prepare)
1931f9ecd7eSSowmini Varadhan 		conn->c_trans->xmit_path_prepare(cp);
1945c115590SAndy Grover 
1955c115590SAndy Grover 	/*
1965c115590SAndy Grover 	 * spin trying to push headers and data down the connection until
1975b2366bdSAndy Grover 	 * the connection doesn't make forward progress.
1985c115590SAndy Grover 	 */
199fcc5450cSAndy Grover 	while (1) {
2005c115590SAndy Grover 
2011f9ecd7eSSowmini Varadhan 		rm = cp->cp_xmit_rm;
2025c115590SAndy Grover 
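		/* If we keep seeing the same rm at the top of this loop, the
		 * transport is not finishing it; after 4096 passes treat the
		 * message as stuck (s_send_stuck_rm) and back off with
		 * -EAGAIN.
		 */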
20311740ef4SAndy Grover 		if (!rm) {
20411740ef4SAndy Grover 			same_rm = 0;
20511740ef4SAndy Grover 		} else {
20611740ef4SAndy Grover 			same_rm++;
20711740ef4SAndy Grover 			if (same_rm >= 4096) {
20811740ef4SAndy Grover 				rds_stats_inc(s_send_stuck_rm);
20911740ef4SAndy Grover 				ret = -EAGAIN;
21011740ef4SAndy Grover 				break;
21111740ef4SAndy Grover 			}
21211740ef4SAndy Grover 		}
21311740ef4SAndy Grover 
2145b2366bdSAndy Grover 		/*
2155b2366bdSAndy Grover 		 * If between sending messages, we can send a pending congestion
2165b2366bdSAndy Grover 		 * map update.
2175c115590SAndy Grover 		 */
2188690bfa1SAndy Grover 		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
2195c115590SAndy Grover 			rm = rds_cong_update_alloc(conn);
2205c115590SAndy Grover 			if (IS_ERR(rm)) {
2215c115590SAndy Grover 				ret = PTR_ERR(rm);
2225c115590SAndy Grover 				break;
2235c115590SAndy Grover 			}
2245b2366bdSAndy Grover 			rm->data.op_active = 1;
2251f9ecd7eSSowmini Varadhan 			rm->m_inc.i_conn_path = cp;
2261f9ecd7eSSowmini Varadhan 			rm->m_inc.i_conn = cp->cp_conn;
2275c115590SAndy Grover 
2281f9ecd7eSSowmini Varadhan 			cp->cp_xmit_rm = rm;
2295c115590SAndy Grover 		}
2305c115590SAndy Grover 
2315c115590SAndy Grover 		/*
2325b2366bdSAndy Grover 		 * If not already working on one, grab the next message.
2335c115590SAndy Grover 		 *
2341f9ecd7eSSowmini Varadhan 		 * cp_xmit_rm holds a ref while we're sending this message down
2355c115590SAndy Grover 		 * the connection.  We can use this ref while holding the
2365c115590SAndy Grover 		 * send_sem.. rds_send_reset() is serialized with it.
2375c115590SAndy Grover 		 */
2388690bfa1SAndy Grover 		if (!rm) {
2395c115590SAndy Grover 			unsigned int len;
2405c115590SAndy Grover 
241443be0e5SSowmini Varadhan 			batch_count++;
242443be0e5SSowmini Varadhan 
243443be0e5SSowmini Varadhan 			/* we want to process as big a batch as we can, but
244443be0e5SSowmini Varadhan 			 * we also want to avoid softlockups.  If we've been
245443be0e5SSowmini Varadhan 			 * through a lot of messages, lets back off and see
246443be0e5SSowmini Varadhan 			 * if anyone else jumps in
247443be0e5SSowmini Varadhan 			 */
2484bebdd7aSSantosh Shilimkar 			if (batch_count >= send_batch_count)
249443be0e5SSowmini Varadhan 				goto over_batch;
250443be0e5SSowmini Varadhan 
2511f9ecd7eSSowmini Varadhan 			spin_lock_irqsave(&cp->cp_lock, flags);
2525c115590SAndy Grover 
2531f9ecd7eSSowmini Varadhan 			if (!list_empty(&cp->cp_send_queue)) {
2541f9ecd7eSSowmini Varadhan 				rm = list_entry(cp->cp_send_queue.next,
2555c115590SAndy Grover 						struct rds_message,
2565c115590SAndy Grover 						m_conn_item);
2575c115590SAndy Grover 				rds_message_addref(rm);
2585c115590SAndy Grover 
2595c115590SAndy Grover 				/*
2605c115590SAndy Grover 				 * Move the message from the send queue to the retransmit
2615c115590SAndy Grover 				 * list right away.
2625c115590SAndy Grover 				 */
2631f9ecd7eSSowmini Varadhan 				list_move_tail(&rm->m_conn_item,
2641f9ecd7eSSowmini Varadhan 					       &cp->cp_retrans);
2655c115590SAndy Grover 			}
2665c115590SAndy Grover 
2671f9ecd7eSSowmini Varadhan 			spin_unlock_irqrestore(&cp->cp_lock, flags);
2685c115590SAndy Grover 
269fcc5450cSAndy Grover 			if (!rm)
2705c115590SAndy Grover 				break;
2715c115590SAndy Grover 
2725c115590SAndy Grover 			/* Unfortunately, the way Infiniband deals with
2735c115590SAndy Grover 			 * RDMA to a bad MR key is by moving the entire
274db473c07SChristophe JAILLET 			 * queue pair to error state. We could possibly
2755c115590SAndy Grover 			 * recover from that, but right now we drop the
2765c115590SAndy Grover 			 * connection.
2775c115590SAndy Grover 			 * Therefore, we never retransmit messages with RDMA ops.
2785c115590SAndy Grover 			 */
279905dd418SSowmini Varadhan 			if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
280905dd418SSowmini Varadhan 			    (rm->rdma.op_active &&
281905dd418SSowmini Varadhan 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
2821f9ecd7eSSowmini Varadhan 				spin_lock_irqsave(&cp->cp_lock, flags);
2835c115590SAndy Grover 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
2845c115590SAndy Grover 					list_move(&rm->m_conn_item, &to_be_dropped);
2851f9ecd7eSSowmini Varadhan 				spin_unlock_irqrestore(&cp->cp_lock, flags);
2865c115590SAndy Grover 				continue;
2875c115590SAndy Grover 			}
2885c115590SAndy Grover 
2895c115590SAndy Grover 			/* Require an ACK every once in a while */
2905c115590SAndy Grover 			len = ntohl(rm->m_inc.i_hdr.h_len);
2911f9ecd7eSSowmini Varadhan 			if (cp->cp_unacked_packets == 0 ||
2921f9ecd7eSSowmini Varadhan 			    cp->cp_unacked_bytes < len) {
293f530f39fSHåkon Bugge 				set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
2945c115590SAndy Grover 
2951f9ecd7eSSowmini Varadhan 				cp->cp_unacked_packets =
2961f9ecd7eSSowmini Varadhan 					rds_sysctl_max_unacked_packets;
2971f9ecd7eSSowmini Varadhan 				cp->cp_unacked_bytes =
2981f9ecd7eSSowmini Varadhan 					rds_sysctl_max_unacked_bytes;
2995c115590SAndy Grover 				rds_stats_inc(s_send_ack_required);
3005c115590SAndy Grover 			} else {
3011f9ecd7eSSowmini Varadhan 				cp->cp_unacked_bytes -= len;
3021f9ecd7eSSowmini Varadhan 				cp->cp_unacked_packets--;
3035c115590SAndy Grover 			}
3045c115590SAndy Grover 
3051f9ecd7eSSowmini Varadhan 			cp->cp_xmit_rm = rm;
3065c115590SAndy Grover 		}
3075c115590SAndy Grover 
3082c3a5f9aSAndy Grover 		/* The transport either sends the whole rdma or none of it */
3091f9ecd7eSSowmini Varadhan 		if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) {
310ff3d7d36SAndy Grover 			rm->m_final_op = &rm->rdma;
3114f73113cSsantosh.shilimkar@oracle.com 			/* The transport owns the mapped memory for now.
3124f73113cSsantosh.shilimkar@oracle.com 			 * You can't unmap it while it's on the send queue
3134f73113cSsantosh.shilimkar@oracle.com 			 */
3144f73113cSsantosh.shilimkar@oracle.com 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
3152c3a5f9aSAndy Grover 			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
3164f73113cSsantosh.shilimkar@oracle.com 			if (ret) {
3174f73113cSsantosh.shilimkar@oracle.com 				clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
3184f73113cSsantosh.shilimkar@oracle.com 				wake_up_interruptible(&rm->m_flush_wait);
3192c3a5f9aSAndy Grover 				break;
3204f73113cSsantosh.shilimkar@oracle.com 			}
3211f9ecd7eSSowmini Varadhan 			cp->cp_xmit_rdma_sent = 1;
3222c3a5f9aSAndy Grover 
3232c3a5f9aSAndy Grover 		}
3242c3a5f9aSAndy Grover 
3251f9ecd7eSSowmini Varadhan 		if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) {
326ff3d7d36SAndy Grover 			rm->m_final_op = &rm->atomic;
3274f73113cSsantosh.shilimkar@oracle.com 			/* The transport owns the mapped memory for now.
3284f73113cSsantosh.shilimkar@oracle.com 			 * You can't unmap it while it's on the send queue
3294f73113cSsantosh.shilimkar@oracle.com 			 */
3304f73113cSsantosh.shilimkar@oracle.com 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
331ff3d7d36SAndy Grover 			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
3324f73113cSsantosh.shilimkar@oracle.com 			if (ret) {
3334f73113cSsantosh.shilimkar@oracle.com 				clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
3344f73113cSsantosh.shilimkar@oracle.com 				wake_up_interruptible(&rm->m_flush_wait);
33515133f6eSAndy Grover 				break;
3364f73113cSsantosh.shilimkar@oracle.com 			}
3371f9ecd7eSSowmini Varadhan 			cp->cp_xmit_atomic_sent = 1;
338ff3d7d36SAndy Grover 
3395b2366bdSAndy Grover 		}
3405b2366bdSAndy Grover 
3412c3a5f9aSAndy Grover 		/*
3422c3a5f9aSAndy Grover 		 * A number of cases require an RDS header to be sent
3432c3a5f9aSAndy Grover 		 * even if there is no data.
3442c3a5f9aSAndy Grover 		 * We permit 0-byte sends; rds-ping depends on this.
3452c3a5f9aSAndy Grover 		 * However, if there are exclusively attached silent ops,
3462c3a5f9aSAndy Grover 		 * we skip the hdr/data send, to enable silent operation.
3472c3a5f9aSAndy Grover 		 */
3482c3a5f9aSAndy Grover 		if (rm->data.op_nents == 0) {
3492c3a5f9aSAndy Grover 			int ops_present;
3502c3a5f9aSAndy Grover 			int all_ops_are_silent = 1;
351241eef3eSAndy Grover 
3522c3a5f9aSAndy Grover 			ops_present = (rm->atomic.op_active || rm->rdma.op_active);
3532c3a5f9aSAndy Grover 			if (rm->atomic.op_active && !rm->atomic.op_silent)
3542c3a5f9aSAndy Grover 				all_ops_are_silent = 0;
3552c3a5f9aSAndy Grover 			if (rm->rdma.op_active && !rm->rdma.op_silent)
3562c3a5f9aSAndy Grover 				all_ops_are_silent = 0;
357241eef3eSAndy Grover 
3582c3a5f9aSAndy Grover 			if (ops_present && all_ops_are_silent
3592c3a5f9aSAndy Grover 			    && !rm->m_rdma_cookie)
3602c3a5f9aSAndy Grover 				rm->data.op_active = 0;
3615c115590SAndy Grover 		}
3625c115590SAndy Grover 
3631f9ecd7eSSowmini Varadhan 		if (rm->data.op_active && !cp->cp_xmit_data_sent) {
364ff3d7d36SAndy Grover 			rm->m_final_op = &rm->data;
3651f9ecd7eSSowmini Varadhan 
3665c115590SAndy Grover 			ret = conn->c_trans->xmit(conn, rm,
3671f9ecd7eSSowmini Varadhan 						  cp->cp_xmit_hdr_off,
3681f9ecd7eSSowmini Varadhan 						  cp->cp_xmit_sg,
3691f9ecd7eSSowmini Varadhan 						  cp->cp_xmit_data_off);
3705c115590SAndy Grover 			if (ret <= 0)
3715c115590SAndy Grover 				break;
3725c115590SAndy Grover 
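			/* ret is the number of bytes the transport accepted;
			 * account for header bytes first, then walk the data
			 * sg list advancing cp_xmit_sg/cp_xmit_data_off.
			 */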
3731f9ecd7eSSowmini Varadhan 			if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) {
3745c115590SAndy Grover 				tmp = min_t(int, ret,
3755c115590SAndy Grover 					    sizeof(struct rds_header) -
3761f9ecd7eSSowmini Varadhan 					    cp->cp_xmit_hdr_off);
3771f9ecd7eSSowmini Varadhan 				cp->cp_xmit_hdr_off += tmp;
3785c115590SAndy Grover 				ret -= tmp;
3795c115590SAndy Grover 			}
3805c115590SAndy Grover 
3811f9ecd7eSSowmini Varadhan 			sg = &rm->data.op_sg[cp->cp_xmit_sg];
3825c115590SAndy Grover 			while (ret) {
3835c115590SAndy Grover 				tmp = min_t(int, ret, sg->length -
3841f9ecd7eSSowmini Varadhan 						      cp->cp_xmit_data_off);
3851f9ecd7eSSowmini Varadhan 				cp->cp_xmit_data_off += tmp;
3865c115590SAndy Grover 				ret -= tmp;
3871f9ecd7eSSowmini Varadhan 				if (cp->cp_xmit_data_off == sg->length) {
3881f9ecd7eSSowmini Varadhan 					cp->cp_xmit_data_off = 0;
3895c115590SAndy Grover 					sg++;
3901f9ecd7eSSowmini Varadhan 					cp->cp_xmit_sg++;
3911f9ecd7eSSowmini Varadhan 					BUG_ON(ret != 0 && cp->cp_xmit_sg ==
3921f9ecd7eSSowmini Varadhan 					       rm->data.op_nents);
3935c115590SAndy Grover 				}
3945c115590SAndy Grover 			}
3955b2366bdSAndy Grover 
3961f9ecd7eSSowmini Varadhan 			if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) &&
3971f9ecd7eSSowmini Varadhan 			    (cp->cp_xmit_sg == rm->data.op_nents))
3981f9ecd7eSSowmini Varadhan 				cp->cp_xmit_data_sent = 1;
3995b2366bdSAndy Grover 		}
4005b2366bdSAndy Grover 
4015b2366bdSAndy Grover 		/*
4025b2366bdSAndy Grover 		 * A rm will only take multiple times through this loop
4035b2366bdSAndy Grover 		 * if there is a data op. Thus, if the data is sent (or there was
4045b2366bdSAndy Grover 		 * none), then we're done with the rm.
4055b2366bdSAndy Grover 		 */
4061f9ecd7eSSowmini Varadhan 		if (!rm->data.op_active || cp->cp_xmit_data_sent) {
4071f9ecd7eSSowmini Varadhan 			cp->cp_xmit_rm = NULL;
4081f9ecd7eSSowmini Varadhan 			cp->cp_xmit_sg = 0;
4091f9ecd7eSSowmini Varadhan 			cp->cp_xmit_hdr_off = 0;
4101f9ecd7eSSowmini Varadhan 			cp->cp_xmit_data_off = 0;
4111f9ecd7eSSowmini Varadhan 			cp->cp_xmit_rdma_sent = 0;
4121f9ecd7eSSowmini Varadhan 			cp->cp_xmit_atomic_sent = 0;
4131f9ecd7eSSowmini Varadhan 			cp->cp_xmit_data_sent = 0;
4145b2366bdSAndy Grover 
4155b2366bdSAndy Grover 			rds_message_put(rm);
4165c115590SAndy Grover 		}
4175c115590SAndy Grover 	}
4185c115590SAndy Grover 
419443be0e5SSowmini Varadhan over_batch:
4201f9ecd7eSSowmini Varadhan 	if (conn->c_trans->xmit_path_complete)
4211f9ecd7eSSowmini Varadhan 		conn->c_trans->xmit_path_complete(cp);
4221f9ecd7eSSowmini Varadhan 	release_in_xmit(cp);
4235c115590SAndy Grover 
4242ad8099bSAndy Grover 	/* Nuke any messages we decided not to retransmit. */
4252ad8099bSAndy Grover 	if (!list_empty(&to_be_dropped)) {
4262ad8099bSAndy Grover 		/* irqs on here, so we can put(), unlike above */
4272ad8099bSAndy Grover 		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
4282ad8099bSAndy Grover 			rds_message_put(rm);
4292ad8099bSAndy Grover 		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
4302ad8099bSAndy Grover 	}
4312ad8099bSAndy Grover 
432fcc5450cSAndy Grover 	/*
4330f4b1c7eSZach Brown 	 * Other senders can queue a message after we last test the send queue
4340f4b1c7eSZach Brown 	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
4350f4b1c7eSZach Brown 	 * not try and send their newly queued message.  We need to check the
4360f4b1c7eSZach Brown 	 * send queue after having cleared RDS_IN_XMIT so that their message
4370f4b1c7eSZach Brown 	 * doesn't get stuck on the send queue.
438fcc5450cSAndy Grover 	 *
439fcc5450cSAndy Grover 	 * If the transport cannot continue (i.e., ret != 0), then it must
440fcc5450cSAndy Grover 	 * call us when more room is available, such as from the tx
441fcc5450cSAndy Grover 	 * completion handler.
442443be0e5SSowmini Varadhan 	 *
443443be0e5SSowmini Varadhan 	 * We have an extra generation check here so that if someone manages
444443be0e5SSowmini Varadhan 	 * to jump in after our release_in_xmit, we'll see that they have done
445443be0e5SSowmini Varadhan 	 * some work and we will skip our goto.
4465c115590SAndy Grover 	 */
447fcc5450cSAndy Grover 	if (ret == 0) {
448126f760cSHåkon Bugge 		bool raced;
449126f760cSHåkon Bugge 
4509e29db0eSChris Mason 		smp_mb();
451126f760cSHåkon Bugge 		raced = send_gen != READ_ONCE(cp->cp_send_gen);
452126f760cSHåkon Bugge 
4530c484240Ssantosh.shilimkar@oracle.com 		if ((test_bit(0, &conn->c_map_queued) ||
454126f760cSHåkon Bugge 		    !list_empty(&cp->cp_send_queue)) && !raced) {
4554bebdd7aSSantosh Shilimkar 			if (batch_count < send_batch_count)
456fcc5450cSAndy Grover 				goto restart;
4573db6e0d1SSowmini Varadhan 			rcu_read_lock();
458ebeeb1adSSowmini Varadhan 			if (rds_destroy_pending(cp->cp_conn))
4593db6e0d1SSowmini Varadhan 				ret = -ENETUNREACH;
4603db6e0d1SSowmini Varadhan 			else
4611f9ecd7eSSowmini Varadhan 				queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
4623db6e0d1SSowmini Varadhan 			rcu_read_unlock();
463126f760cSHåkon Bugge 		} else if (raced) {
464126f760cSHåkon Bugge 			rds_stats_inc(s_send_lock_queue_raced);
4655c115590SAndy Grover 		}
4669e29db0eSChris Mason 	}
4675c115590SAndy Grover out:
4685c115590SAndy Grover 	return ret;
4695c115590SAndy Grover }
4700c28c045SSantosh Shilimkar EXPORT_SYMBOL_GPL(rds_send_xmit);
4715c115590SAndy Grover 
4725c115590SAndy Grover static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
4735c115590SAndy Grover {
4745c115590SAndy Grover 	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
4755c115590SAndy Grover 
4765c115590SAndy Grover 	assert_spin_locked(&rs->rs_lock);
4775c115590SAndy Grover 
4785c115590SAndy Grover 	BUG_ON(rs->rs_snd_bytes < len);
4795c115590SAndy Grover 	rs->rs_snd_bytes -= len;
4805c115590SAndy Grover 
4815c115590SAndy Grover 	if (rs->rs_snd_bytes == 0)
4825c115590SAndy Grover 		rds_stats_inc(s_send_queue_empty);
4835c115590SAndy Grover }
4845c115590SAndy Grover 
4855c115590SAndy Grover static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
4865c115590SAndy Grover 				    is_acked_func is_acked)
4875c115590SAndy Grover {
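	/* A transport may supply its own is_acked callback; otherwise a
	 * message counts as acked once its header sequence number is at or
	 * below the acked sequence number.
	 */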
4885c115590SAndy Grover 	if (is_acked)
4895c115590SAndy Grover 		return is_acked(rm, ack);
4905c115590SAndy Grover 	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
4915c115590SAndy Grover }
4925c115590SAndy Grover 
4935c115590SAndy Grover /*
4945c115590SAndy Grover  * This is pretty similar to what happens below in the ACK
4955c115590SAndy Grover  * handling code - except that we call here as soon as we get
4965c115590SAndy Grover  * the IB send completion on the RDMA op and the accompanying
4975c115590SAndy Grover  * message.
4985c115590SAndy Grover  */
4995c115590SAndy Grover void rds_rdma_send_complete(struct rds_message *rm, int status)
5005c115590SAndy Grover {
5015c115590SAndy Grover 	struct rds_sock *rs = NULL;
502f8b3aaf2SAndy Grover 	struct rm_rdma_op *ro;
5035c115590SAndy Grover 	struct rds_notifier *notifier;
5049de0864cSAndy Grover 	unsigned long flags;
5055c115590SAndy Grover 
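	/* m_rs_lock keeps rm->m_rs stable while we decide whether to queue a
	 * notification; we only do so if the message is still on a socket and
	 * the RDMA op asked to be notified.
	 */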
5069de0864cSAndy Grover 	spin_lock_irqsave(&rm->m_rs_lock, flags);
5075c115590SAndy Grover 
508f8b3aaf2SAndy Grover 	ro = &rm->rdma;
509f64f9e71SJoe Perches 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
510616d37a0SSantosh Shilimkar 	    ro->op_active && ro->op_notify && ro->op_notifier) {
511f8b3aaf2SAndy Grover 		notifier = ro->op_notifier;
5125c115590SAndy Grover 		rs = rm->m_rs;
5135c115590SAndy Grover 		sock_hold(rds_rs_to_sk(rs));
5145c115590SAndy Grover 
5155c115590SAndy Grover 		notifier->n_status = status;
5165c115590SAndy Grover 		spin_lock(&rs->rs_lock);
5175c115590SAndy Grover 		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
5185c115590SAndy Grover 		spin_unlock(&rs->rs_lock);
5195c115590SAndy Grover 
520f8b3aaf2SAndy Grover 		ro->op_notifier = NULL;
5215c115590SAndy Grover 	}
5225c115590SAndy Grover 
5239de0864cSAndy Grover 	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
5245c115590SAndy Grover 
5255c115590SAndy Grover 	if (rs) {
5265c115590SAndy Grover 		rds_wake_sk_sleep(rs);
5275c115590SAndy Grover 		sock_put(rds_rs_to_sk(rs));
5285c115590SAndy Grover 	}
5295c115590SAndy Grover }
530616b757aSAndy Grover EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
5315c115590SAndy Grover 
5325c115590SAndy Grover /*
53315133f6eSAndy Grover  * Just like above, except looks at atomic op
53415133f6eSAndy Grover  */
53515133f6eSAndy Grover void rds_atomic_send_complete(struct rds_message *rm, int status)
53615133f6eSAndy Grover {
53715133f6eSAndy Grover 	struct rds_sock *rs = NULL;
53815133f6eSAndy Grover 	struct rm_atomic_op *ao;
53915133f6eSAndy Grover 	struct rds_notifier *notifier;
540cf4b7389SAndy Grover 	unsigned long flags;
54115133f6eSAndy Grover 
542cf4b7389SAndy Grover 	spin_lock_irqsave(&rm->m_rs_lock, flags);
54315133f6eSAndy Grover 
54415133f6eSAndy Grover 	ao = &rm->atomic;
54515133f6eSAndy Grover 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
54615133f6eSAndy Grover 	    && ao->op_active && ao->op_notify && ao->op_notifier) {
54715133f6eSAndy Grover 		notifier = ao->op_notifier;
54815133f6eSAndy Grover 		rs = rm->m_rs;
54915133f6eSAndy Grover 		sock_hold(rds_rs_to_sk(rs));
55015133f6eSAndy Grover 
55115133f6eSAndy Grover 		notifier->n_status = status;
55215133f6eSAndy Grover 		spin_lock(&rs->rs_lock);
55315133f6eSAndy Grover 		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
55415133f6eSAndy Grover 		spin_unlock(&rs->rs_lock);
55515133f6eSAndy Grover 
55615133f6eSAndy Grover 		ao->op_notifier = NULL;
55715133f6eSAndy Grover 	}
55815133f6eSAndy Grover 
559cf4b7389SAndy Grover 	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
56015133f6eSAndy Grover 
56115133f6eSAndy Grover 	if (rs) {
56215133f6eSAndy Grover 		rds_wake_sk_sleep(rs);
56315133f6eSAndy Grover 		sock_put(rds_rs_to_sk(rs));
56415133f6eSAndy Grover 	}
56515133f6eSAndy Grover }
56615133f6eSAndy Grover EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
56715133f6eSAndy Grover 
56815133f6eSAndy Grover /*
5695c115590SAndy Grover  * This is the same as rds_rdma_send_complete except we
5705c115590SAndy Grover  * don't do any locking - we have all the ingredients (message,
5715c115590SAndy Grover  * socket, socket lock) and can just move the notifier.
5725c115590SAndy Grover  */
5735c115590SAndy Grover static inline void
574940786ebSAndy Grover __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
5755c115590SAndy Grover {
576f8b3aaf2SAndy Grover 	struct rm_rdma_op *ro;
577940786ebSAndy Grover 	struct rm_atomic_op *ao;
5785c115590SAndy Grover 
579f8b3aaf2SAndy Grover 	ro = &rm->rdma;
580f8b3aaf2SAndy Grover 	if (ro->op_active && ro->op_notify && ro->op_notifier) {
581f8b3aaf2SAndy Grover 		ro->op_notifier->n_status = status;
582f8b3aaf2SAndy Grover 		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
583f8b3aaf2SAndy Grover 		ro->op_notifier = NULL;
5845c115590SAndy Grover 	}
5855c115590SAndy Grover 
586940786ebSAndy Grover 	ao = &rm->atomic;
587940786ebSAndy Grover 	if (ao->op_active && ao->op_notify && ao->op_notifier) {
588940786ebSAndy Grover 		ao->op_notifier->n_status = status;
589940786ebSAndy Grover 		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
590940786ebSAndy Grover 		ao->op_notifier = NULL;
591940786ebSAndy Grover 	}
592940786ebSAndy Grover 
5935c115590SAndy Grover 	/* No need to wake the app - caller does this */
5945c115590SAndy Grover }
5955c115590SAndy Grover 
5965c115590SAndy Grover /*
5975c115590SAndy Grover  * This removes messages from the socket's list if they're on it.  The list
5985c115590SAndy Grover  * argument must be private to the caller, we must be able to modify it
5995c115590SAndy Grover  * without locks.  The messages must have a reference held for their
6005c115590SAndy Grover  * position on the list.  This function will drop that reference after
6015c115590SAndy Grover  * removing the messages from the 'messages' list regardless of if it found
6025c115590SAndy Grover  * the messages on the socket list or not.
6035c115590SAndy Grover  */
604ff51bf84Sstephen hemminger static void rds_send_remove_from_sock(struct list_head *messages, int status)
6055c115590SAndy Grover {
606561c7df6SAndy Grover 	unsigned long flags;
6075c115590SAndy Grover 	struct rds_sock *rs = NULL;
6085c115590SAndy Grover 	struct rds_message *rm;
6095c115590SAndy Grover 
6105c115590SAndy Grover 	while (!list_empty(messages)) {
611561c7df6SAndy Grover 		int was_on_sock = 0;
612561c7df6SAndy Grover 
6135c115590SAndy Grover 		rm = list_entry(messages->next, struct rds_message,
6145c115590SAndy Grover 				m_conn_item);
6155c115590SAndy Grover 		list_del_init(&rm->m_conn_item);
6165c115590SAndy Grover 
6175c115590SAndy Grover 		/*
6185c115590SAndy Grover 		 * If we see this flag cleared then we're *sure* that someone
6195c115590SAndy Grover 		 * else beat us to removing it from the sock.  If we race
6205c115590SAndy Grover 		 * with their flag update we'll get the lock and then really
6215c115590SAndy Grover 		 * see that the flag has been cleared.
6225c115590SAndy Grover 		 *
6235c115590SAndy Grover 		 * The message spinlock makes sure nobody clears rm->m_rs
6245c115590SAndy Grover 		 * while we're messing with it. It does not prevent the
6255c115590SAndy Grover 		 * message from being removed from the socket, though.
6265c115590SAndy Grover 		 */
627561c7df6SAndy Grover 		spin_lock_irqsave(&rm->m_rs_lock, flags);
6285c115590SAndy Grover 		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
6295c115590SAndy Grover 			goto unlock_and_drop;
6305c115590SAndy Grover 
6315c115590SAndy Grover 		if (rs != rm->m_rs) {
6325c115590SAndy Grover 			if (rs) {
6335c115590SAndy Grover 				rds_wake_sk_sleep(rs);
6345c115590SAndy Grover 				sock_put(rds_rs_to_sk(rs));
6355c115590SAndy Grover 			}
6365c115590SAndy Grover 			rs = rm->m_rs;
637593cbb3eSHerton R. Krzesinski 			if (rs)
6385c115590SAndy Grover 				sock_hold(rds_rs_to_sk(rs));
6395c115590SAndy Grover 		}
640593cbb3eSHerton R. Krzesinski 		if (!rs)
641593cbb3eSHerton R. Krzesinski 			goto unlock_and_drop;
642048c15e6STina Yang 		spin_lock(&rs->rs_lock);
6435c115590SAndy Grover 
6445c115590SAndy Grover 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
645f8b3aaf2SAndy Grover 			struct rm_rdma_op *ro = &rm->rdma;
6465c115590SAndy Grover 			struct rds_notifier *notifier;
6475c115590SAndy Grover 
6485c115590SAndy Grover 			list_del_init(&rm->m_sock_item);
6495c115590SAndy Grover 			rds_send_sndbuf_remove(rs, rm);
6505c115590SAndy Grover 
651f8b3aaf2SAndy Grover 			if (ro->op_active && ro->op_notifier &&
652f8b3aaf2SAndy Grover 			       (ro->op_notify || (ro->op_recverr && status))) {
653f8b3aaf2SAndy Grover 				notifier = ro->op_notifier;
6545c115590SAndy Grover 				list_add_tail(&notifier->n_list,
6555c115590SAndy Grover 						&rs->rs_notify_queue);
6565c115590SAndy Grover 				if (!notifier->n_status)
6575c115590SAndy Grover 					notifier->n_status = status;
658f8b3aaf2SAndy Grover 				rm->rdma.op_notifier = NULL;
6595c115590SAndy Grover 			}
660561c7df6SAndy Grover 			was_on_sock = 1;
6615c115590SAndy Grover 		}
662048c15e6STina Yang 		spin_unlock(&rs->rs_lock);
6635c115590SAndy Grover 
6645c115590SAndy Grover unlock_and_drop:
665561c7df6SAndy Grover 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
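		/* Drop the reference held for the caller's 'messages' list;
		 * if the message was still on the socket, drop the socket
		 * list's reference as well.
		 */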
666561c7df6SAndy Grover 		rds_message_put(rm);
667799bac55SLinus Torvalds 		if (was_on_sock)
6685c115590SAndy Grover 			rds_message_put(rm);
6695c115590SAndy Grover 	}
6705c115590SAndy Grover 
6715c115590SAndy Grover 	if (rs) {
6725c115590SAndy Grover 		rds_wake_sk_sleep(rs);
6735c115590SAndy Grover 		sock_put(rds_rs_to_sk(rs));
6745c115590SAndy Grover 	}
6755c115590SAndy Grover }
6765c115590SAndy Grover 
6775c115590SAndy Grover /*
6785c115590SAndy Grover  * Transports call here when they've determined that the receiver queued
6795c115590SAndy Grover  * messages up to, and including, the given sequence number.  Messages are
6805c115590SAndy Grover  * moved to the retrans queue when rds_send_xmit picks them off the send
6815c115590SAndy Grover  * queue. This means that in the TCP case, the message may not have been
6825c115590SAndy Grover  * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
6835c115590SAndy Grover  * checks the RDS_MSG_HAS_ACK_SEQ bit.
6845c115590SAndy Grover  */
6855c3d274cSSowmini Varadhan void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
6865c115590SAndy Grover 			      is_acked_func is_acked)
6875c115590SAndy Grover {
6885c115590SAndy Grover 	struct rds_message *rm, *tmp;
6895c115590SAndy Grover 	unsigned long flags;
6905c115590SAndy Grover 	LIST_HEAD(list);
6915c115590SAndy Grover 
6925c3d274cSSowmini Varadhan 	spin_lock_irqsave(&cp->cp_lock, flags);
6935c115590SAndy Grover 
6945c3d274cSSowmini Varadhan 	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
6955c115590SAndy Grover 		if (!rds_send_is_acked(rm, ack, is_acked))
6965c115590SAndy Grover 			break;
6975c115590SAndy Grover 
6985c115590SAndy Grover 		list_move(&rm->m_conn_item, &list);
6995c115590SAndy Grover 		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
7005c115590SAndy Grover 	}
7015c115590SAndy Grover 
7025c115590SAndy Grover 	/* order flag updates with spin locks */
7035c115590SAndy Grover 	if (!list_empty(&list))
7044e857c58SPeter Zijlstra 		smp_mb__after_atomic();
7055c115590SAndy Grover 
7065c3d274cSSowmini Varadhan 	spin_unlock_irqrestore(&cp->cp_lock, flags);
7075c115590SAndy Grover 
7085c115590SAndy Grover 	/* now remove the messages from the sock list as needed */
7095c115590SAndy Grover 	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
7105c115590SAndy Grover }
7115c3d274cSSowmini Varadhan EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);
7125c3d274cSSowmini Varadhan 
7135c3d274cSSowmini Varadhan void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
7145c3d274cSSowmini Varadhan 			 is_acked_func is_acked)
7155c3d274cSSowmini Varadhan {
7165c3d274cSSowmini Varadhan 	WARN_ON(conn->c_trans->t_mp_capable);
7175c3d274cSSowmini Varadhan 	rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
7185c3d274cSSowmini Varadhan }
719616b757aSAndy Grover EXPORT_SYMBOL_GPL(rds_send_drop_acked);
7205c115590SAndy Grover 
721eee2fa6aSKa-Cheong Poon void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
7225c115590SAndy Grover {
7235c115590SAndy Grover 	struct rds_message *rm, *tmp;
7245c115590SAndy Grover 	struct rds_connection *conn;
72501ff34edSSowmini Varadhan 	struct rds_conn_path *cp;
7267c82eaf0SAndy Grover 	unsigned long flags;
7275c115590SAndy Grover 	LIST_HEAD(list);
7285c115590SAndy Grover 
7295c115590SAndy Grover 	/* get all the messages we're dropping under the rs lock */
7305c115590SAndy Grover 	spin_lock_irqsave(&rs->rs_lock, flags);
7315c115590SAndy Grover 
7325c115590SAndy Grover 	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
733eee2fa6aSKa-Cheong Poon 		if (dest &&
734eee2fa6aSKa-Cheong Poon 		    (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
735eee2fa6aSKa-Cheong Poon 		     dest->sin6_port != rm->m_inc.i_hdr.h_dport))
7365c115590SAndy Grover 			continue;
7375c115590SAndy Grover 
7385c115590SAndy Grover 		list_move(&rm->m_sock_item, &list);
7395c115590SAndy Grover 		rds_send_sndbuf_remove(rs, rm);
7405c115590SAndy Grover 		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
7415c115590SAndy Grover 	}
7425c115590SAndy Grover 
7435c115590SAndy Grover 	/* order flag updates with the rs lock */
7444e857c58SPeter Zijlstra 	smp_mb__after_atomic();
7455c115590SAndy Grover 
7465c115590SAndy Grover 	spin_unlock_irqrestore(&rs->rs_lock, flags);
7475c115590SAndy Grover 
7487c82eaf0SAndy Grover 	if (list_empty(&list))
7497c82eaf0SAndy Grover 		return;
7505c115590SAndy Grover 
7517c82eaf0SAndy Grover 	/* Remove the messages from the conn */
7525c115590SAndy Grover 	list_for_each_entry(rm, &list, m_sock_item) {
7537c82eaf0SAndy Grover 
7547c82eaf0SAndy Grover 		conn = rm->m_inc.i_conn;
75501ff34edSSowmini Varadhan 		if (conn->c_trans->t_mp_capable)
75601ff34edSSowmini Varadhan 			cp = rm->m_inc.i_conn_path;
75701ff34edSSowmini Varadhan 		else
75801ff34edSSowmini Varadhan 			cp = &conn->c_path[0];
7597c82eaf0SAndy Grover 
76001ff34edSSowmini Varadhan 		spin_lock_irqsave(&cp->cp_lock, flags);
7617c82eaf0SAndy Grover 		/*
7627c82eaf0SAndy Grover 		 * Maybe someone else beat us to removing rm from the conn.
7637c82eaf0SAndy Grover 		 * If we race with their flag update we'll get the lock and
7647c82eaf0SAndy Grover 		 * then really see that the flag has been cleared.
7657c82eaf0SAndy Grover 		 */
7667c82eaf0SAndy Grover 		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
76701ff34edSSowmini Varadhan 			spin_unlock_irqrestore(&cp->cp_lock, flags);
7687c82eaf0SAndy Grover 			continue;
7697c82eaf0SAndy Grover 		}
7709de0864cSAndy Grover 		list_del_init(&rm->m_conn_item);
77101ff34edSSowmini Varadhan 		spin_unlock_irqrestore(&cp->cp_lock, flags);
7727c82eaf0SAndy Grover 
7737c82eaf0SAndy Grover 		/*
7747c82eaf0SAndy Grover 		 * Couldn't grab m_rs_lock in top loop (lock ordering),
7757c82eaf0SAndy Grover 		 * but we can now.
7767c82eaf0SAndy Grover 		 */
7779de0864cSAndy Grover 		spin_lock_irqsave(&rm->m_rs_lock, flags);
7787c82eaf0SAndy Grover 
779550a8002STina Yang 		spin_lock(&rs->rs_lock);
780940786ebSAndy Grover 		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
781550a8002STina Yang 		spin_unlock(&rs->rs_lock);
7827c82eaf0SAndy Grover 
7839de0864cSAndy Grover 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
7845c115590SAndy Grover 
7855c115590SAndy Grover 		rds_message_put(rm);
7867c82eaf0SAndy Grover 	}
7875c115590SAndy Grover 
788550a8002STina Yang 	rds_wake_sk_sleep(rs);
789550a8002STina Yang 
7905c115590SAndy Grover 	while (!list_empty(&list)) {
7915c115590SAndy Grover 		rm = list_entry(list.next, struct rds_message, m_sock_item);
7925c115590SAndy Grover 		list_del_init(&rm->m_sock_item);
7935c115590SAndy Grover 		rds_message_wait(rm);
794dfcec251Ssantosh.shilimkar@oracle.com 
795dfcec251Ssantosh.shilimkar@oracle.com 		/* just in case the code above skipped this message
796dfcec251Ssantosh.shilimkar@oracle.com 		 * because RDS_MSG_ON_CONN wasn't set, run it again here
797dfcec251Ssantosh.shilimkar@oracle.com 		 * taking m_rs_lock is the only thing that keeps us
798dfcec251Ssantosh.shilimkar@oracle.com 		 * from racing with ack processing.
799dfcec251Ssantosh.shilimkar@oracle.com 		 */
800dfcec251Ssantosh.shilimkar@oracle.com 		spin_lock_irqsave(&rm->m_rs_lock, flags);
801dfcec251Ssantosh.shilimkar@oracle.com 
802dfcec251Ssantosh.shilimkar@oracle.com 		spin_lock(&rs->rs_lock);
803dfcec251Ssantosh.shilimkar@oracle.com 		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
804dfcec251Ssantosh.shilimkar@oracle.com 		spin_unlock(&rs->rs_lock);
805dfcec251Ssantosh.shilimkar@oracle.com 
806dfcec251Ssantosh.shilimkar@oracle.com 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
807dfcec251Ssantosh.shilimkar@oracle.com 
8085c115590SAndy Grover 		rds_message_put(rm);
8095c115590SAndy Grover 	}
8105c115590SAndy Grover }
8115c115590SAndy Grover 
8125c115590SAndy Grover /*
8135c115590SAndy Grover  * we only want this to fire once so we use the caller's 'queued'.  It's
8145c115590SAndy Grover  * possible that another thread can race with us and remove the
8155c115590SAndy Grover  * message from the flow with RDS_CANCEL_SENT_TO.
8165c115590SAndy Grover  */
8175c115590SAndy Grover static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
818780a6d9eSSowmini Varadhan 			     struct rds_conn_path *cp,
8195c115590SAndy Grover 			     struct rds_message *rm, __be16 sport,
8205c115590SAndy Grover 			     __be16 dport, int *queued)
8215c115590SAndy Grover {
8225c115590SAndy Grover 	unsigned long flags;
8235c115590SAndy Grover 	u32 len;
8245c115590SAndy Grover 
8255c115590SAndy Grover 	if (*queued)
8265c115590SAndy Grover 		goto out;
8275c115590SAndy Grover 
8285c115590SAndy Grover 	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
8295c115590SAndy Grover 
8305c115590SAndy Grover 	/* this is the only place which holds both the socket's rs_lock
8315c115590SAndy Grover 	 * and the connection's c_lock */
8325c115590SAndy Grover 	spin_lock_irqsave(&rs->rs_lock, flags);
8335c115590SAndy Grover 
8345c115590SAndy Grover 	/*
8355c115590SAndy Grover 	 * If there is a little space in sndbuf, we don't queue anything,
8365c115590SAndy Grover 	 * and userspace gets -EAGAIN. But poll() indicates there's send
8375c115590SAndy Grover 	 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
8385c115590SAndy Grover 	 * freed up by incoming acks. So we check the *old* value of
8395c115590SAndy Grover 	 * rs_snd_bytes here to allow the last msg to exceed the buffer,
8405c115590SAndy Grover 	 * and poll() now knows no more data can be sent.
8415c115590SAndy Grover 	 */
8425c115590SAndy Grover 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
8435c115590SAndy Grover 		rs->rs_snd_bytes += len;
8445c115590SAndy Grover 
8455c115590SAndy Grover 		/* let recv side know we are close to send space exhaustion.
8465c115590SAndy Grover 		 * This is probably not the optimal way to do it, as this
8475c115590SAndy Grover 		 * means we set the flag on *all* messages as soon as our
8485c115590SAndy Grover 		 * throughput hits a certain threshold.
8495c115590SAndy Grover 		 */
8505c115590SAndy Grover 		if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
851f530f39fSHåkon Bugge 			set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
8525c115590SAndy Grover 
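		/* Take one reference for the socket's send queue here and a
		 * second one below for the connection path's send queue; each
		 * reference is dropped once the message comes off the
		 * corresponding list.
		 */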
8535c115590SAndy Grover 		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
8545c115590SAndy Grover 		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
8555c115590SAndy Grover 		rds_message_addref(rm);
856ea8994cbSSowmini Varadhan 		sock_hold(rds_rs_to_sk(rs));
8575c115590SAndy Grover 		rm->m_rs = rs;
8585c115590SAndy Grover 
8595c115590SAndy Grover 		/* The code ordering is a little weird, but we're
8605c115590SAndy Grover 		   trying to minimize the time we hold c_lock */
8615c115590SAndy Grover 		rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
8625c115590SAndy Grover 		rm->m_inc.i_conn = conn;
863780a6d9eSSowmini Varadhan 		rm->m_inc.i_conn_path = cp;
8645c115590SAndy Grover 		rds_message_addref(rm);
8655c115590SAndy Grover 
866780a6d9eSSowmini Varadhan 		spin_lock(&cp->cp_lock);
867780a6d9eSSowmini Varadhan 		rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
868780a6d9eSSowmini Varadhan 		list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
8695c115590SAndy Grover 		set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
870780a6d9eSSowmini Varadhan 		spin_unlock(&cp->cp_lock);
8715c115590SAndy Grover 
8725c115590SAndy Grover 		rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
8735c115590SAndy Grover 			 rm, len, rs, rs->rs_snd_bytes,
8745c115590SAndy Grover 			 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
8755c115590SAndy Grover 
8765c115590SAndy Grover 		*queued = 1;
8775c115590SAndy Grover 	}
8785c115590SAndy Grover 
8795c115590SAndy Grover 	spin_unlock_irqrestore(&rs->rs_lock, flags);
8805c115590SAndy Grover out:
8815c115590SAndy Grover 	return *queued;
8825c115590SAndy Grover }
8835c115590SAndy Grover 
884fc445084SAndy Grover /*
885fc445084SAndy Grover  * rds_message is getting to be quite complicated, and we'd like to allocate
886fc445084SAndy Grover  * it all in one go. This figures out how big it needs to be up front.
887fc445084SAndy Grover  */
888ea010070Sshamir rabinovitch static int rds_rm_size(struct msghdr *msg, int num_sgs,
889ea010070Sshamir rabinovitch 		       struct rds_iov_vector_arr *vct)
890fc445084SAndy Grover {
891ff87e97aSAndy Grover 	struct cmsghdr *cmsg;
892fc445084SAndy Grover 	int size = 0;
893aa0a4ef4SAndy Grover 	int cmsg_groups = 0;
894ff87e97aSAndy Grover 	int retval;
8950cebacceSSowmini Varadhan 	bool zcopy_cookie = false;
896ea010070Sshamir rabinovitch 	struct rds_iov_vector *iov, *tmp_iov;
897ff87e97aSAndy Grover 
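	/* cmsg_groups is a two-bit mask: bit 0 marks RDMA_ARGS/atomic cmsgs,
	 * bit 1 marks RDMA_DEST/RDMA_MAP/ZCOPY_COOKIE; a value of 3 (both
	 * groups present) is rejected below.
	 */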
898c75ab8a5Sshamir rabinovitch 	if (num_sgs < 0)
899c75ab8a5Sshamir rabinovitch 		return -EINVAL;
900c75ab8a5Sshamir rabinovitch 
901f95b414eSGu Zheng 	for_each_cmsghdr(cmsg, msg) {
902ff87e97aSAndy Grover 		if (!CMSG_OK(msg, cmsg))
903ff87e97aSAndy Grover 			return -EINVAL;
904ff87e97aSAndy Grover 
905ff87e97aSAndy Grover 		if (cmsg->cmsg_level != SOL_RDS)
906ff87e97aSAndy Grover 			continue;
907ff87e97aSAndy Grover 
908ff87e97aSAndy Grover 		switch (cmsg->cmsg_type) {
909ff87e97aSAndy Grover 		case RDS_CMSG_RDMA_ARGS:
910ea010070Sshamir rabinovitch 			if (vct->indx >= vct->len) {
911ea010070Sshamir rabinovitch 				vct->len += vct->incr;
912ea010070Sshamir rabinovitch 				tmp_iov =
913ea010070Sshamir rabinovitch 					krealloc(vct->vec,
914ea010070Sshamir rabinovitch 						 vct->len *
915ea010070Sshamir rabinovitch 						 sizeof(struct rds_iov_vector),
916ea010070Sshamir rabinovitch 						 GFP_KERNEL);
917ea010070Sshamir rabinovitch 				if (!tmp_iov) {
918ea010070Sshamir rabinovitch 					vct->len -= vct->incr;
919ea010070Sshamir rabinovitch 					return -ENOMEM;
920ea010070Sshamir rabinovitch 				}
921ea010070Sshamir rabinovitch 				vct->vec = tmp_iov;
922ea010070Sshamir rabinovitch 			}
923ea010070Sshamir rabinovitch 			iov = &vct->vec[vct->indx];
924ea010070Sshamir rabinovitch 			memset(iov, 0, sizeof(struct rds_iov_vector));
925ea010070Sshamir rabinovitch 			vct->indx++;
926aa0a4ef4SAndy Grover 			cmsg_groups |= 1;
927ea010070Sshamir rabinovitch 			retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov);
928ff87e97aSAndy Grover 			if (retval < 0)
929ff87e97aSAndy Grover 				return retval;
930ff87e97aSAndy Grover 			size += retval;
931aa0a4ef4SAndy Grover 
932ff87e97aSAndy Grover 			break;
933ff87e97aSAndy Grover 
9340cebacceSSowmini Varadhan 		case RDS_CMSG_ZCOPY_COOKIE:
9350cebacceSSowmini Varadhan 			zcopy_cookie = true;
936df561f66SGustavo A. R. Silva 			fallthrough;
937f9053113SGustavo A. R. Silva 
938ff87e97aSAndy Grover 		case RDS_CMSG_RDMA_DEST:
939ff87e97aSAndy Grover 		case RDS_CMSG_RDMA_MAP:
940aa0a4ef4SAndy Grover 			cmsg_groups |= 2;
941ff87e97aSAndy Grover 			/* these are valid but do not add any size */
942ff87e97aSAndy Grover 			break;
943ff87e97aSAndy Grover 
94415133f6eSAndy Grover 		case RDS_CMSG_ATOMIC_CSWP:
94515133f6eSAndy Grover 		case RDS_CMSG_ATOMIC_FADD:
94620c72bd5SAndy Grover 		case RDS_CMSG_MASKED_ATOMIC_CSWP:
94720c72bd5SAndy Grover 		case RDS_CMSG_MASKED_ATOMIC_FADD:
948aa0a4ef4SAndy Grover 			cmsg_groups |= 1;
94915133f6eSAndy Grover 			size += sizeof(struct scatterlist);
95015133f6eSAndy Grover 			break;
95115133f6eSAndy Grover 
952ff87e97aSAndy Grover 		default:
953ff87e97aSAndy Grover 			return -EINVAL;
954ff87e97aSAndy Grover 		}
955ff87e97aSAndy Grover 
956ff87e97aSAndy Grover 	}
957fc445084SAndy Grover 
9580cebacceSSowmini Varadhan 	if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
9590cebacceSSowmini Varadhan 		return -EINVAL;
9600cebacceSSowmini Varadhan 
9610cebacceSSowmini Varadhan 	size += num_sgs * sizeof(struct scatterlist);
962fc445084SAndy Grover 
963aa0a4ef4SAndy Grover 	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
964aa0a4ef4SAndy Grover 	if (cmsg_groups == 3)
965aa0a4ef4SAndy Grover 		return -EINVAL;
966aa0a4ef4SAndy Grover 
967fc445084SAndy Grover 	return size;
968fc445084SAndy Grover }
969fc445084SAndy Grover 
9700cebacceSSowmini Varadhan static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
9710cebacceSSowmini Varadhan 			  struct cmsghdr *cmsg)
9720cebacceSSowmini Varadhan {
9730cebacceSSowmini Varadhan 	u32 *cookie;
9740cebacceSSowmini Varadhan 
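	/* Reject a truncated cookie, or a message that was not set up for
	 * zerocopy (no znotifier); otherwise stash the user's cookie so it
	 * can be reported back when the zerocopy send completes.
	 */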
97579a5b972SSowmini Varadhan 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) ||
97679a5b972SSowmini Varadhan 	    !rm->data.op_mmp_znotifier)
9770cebacceSSowmini Varadhan 		return -EINVAL;
9780cebacceSSowmini Varadhan 	cookie = CMSG_DATA(cmsg);
9790cebacceSSowmini Varadhan 	rm->data.op_mmp_znotifier->z_cookie = *cookie;
9800cebacceSSowmini Varadhan 	return 0;
9810cebacceSSowmini Varadhan }
9820cebacceSSowmini Varadhan 
9835c115590SAndy Grover static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
984ea010070Sshamir rabinovitch 			 struct msghdr *msg, int *allocated_mr,
985ea010070Sshamir rabinovitch 			 struct rds_iov_vector_arr *vct)
9865c115590SAndy Grover {
9875c115590SAndy Grover 	struct cmsghdr *cmsg;
988ea010070Sshamir rabinovitch 	int ret = 0, ind = 0;
9895c115590SAndy Grover 
990f95b414eSGu Zheng 	for_each_cmsghdr(cmsg, msg) {
9915c115590SAndy Grover 		if (!CMSG_OK(msg, cmsg))
9925c115590SAndy Grover 			return -EINVAL;
9935c115590SAndy Grover 
9945c115590SAndy Grover 		if (cmsg->cmsg_level != SOL_RDS)
9955c115590SAndy Grover 			continue;
9965c115590SAndy Grover 
9975c115590SAndy Grover 		/* As a side effect, RDMA_DEST and RDMA_MAP will set
99815133f6eSAndy Grover 		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
9995c115590SAndy Grover 		 */
10005c115590SAndy Grover 		switch (cmsg->cmsg_type) {
10015c115590SAndy Grover 		case RDS_CMSG_RDMA_ARGS:
1002ea010070Sshamir rabinovitch 			if (ind >= vct->indx)
1003ea010070Sshamir rabinovitch 				return -ENOMEM;
1004ea010070Sshamir rabinovitch 			ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]);
1005ea010070Sshamir rabinovitch 			ind++;
10065c115590SAndy Grover 			break;
10075c115590SAndy Grover 
10085c115590SAndy Grover 		case RDS_CMSG_RDMA_DEST:
10095c115590SAndy Grover 			ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
10105c115590SAndy Grover 			break;
10115c115590SAndy Grover 
10125c115590SAndy Grover 		case RDS_CMSG_RDMA_MAP:
10135c115590SAndy Grover 			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
10145c115590SAndy Grover 			if (!ret)
10155c115590SAndy Grover 				*allocated_mr = 1;
1016584a8279SSantosh Shilimkar 			else if (ret == -ENODEV)
1017584a8279SSantosh Shilimkar 				/* Accommodate the get_mr() case which can fail
1018584a8279SSantosh Shilimkar 				 * if the connection isn't established yet.
1019584a8279SSantosh Shilimkar 				 */
1020584a8279SSantosh Shilimkar 				ret = -EAGAIN;
10215c115590SAndy Grover 			break;
102215133f6eSAndy Grover 		case RDS_CMSG_ATOMIC_CSWP:
102315133f6eSAndy Grover 		case RDS_CMSG_ATOMIC_FADD:
102420c72bd5SAndy Grover 		case RDS_CMSG_MASKED_ATOMIC_CSWP:
102520c72bd5SAndy Grover 		case RDS_CMSG_MASKED_ATOMIC_FADD:
102615133f6eSAndy Grover 			ret = rds_cmsg_atomic(rs, rm, cmsg);
102715133f6eSAndy Grover 			break;
10285c115590SAndy Grover 
10290cebacceSSowmini Varadhan 		case RDS_CMSG_ZCOPY_COOKIE:
10300cebacceSSowmini Varadhan 			ret = rds_cmsg_zcopy(rs, rm, cmsg);
10310cebacceSSowmini Varadhan 			break;
10320cebacceSSowmini Varadhan 
10335c115590SAndy Grover 		default:
10345c115590SAndy Grover 			return -EINVAL;
10355c115590SAndy Grover 		}
10365c115590SAndy Grover 
10375c115590SAndy Grover 		if (ret)
10385c115590SAndy Grover 			break;
10395c115590SAndy Grover 	}
10405c115590SAndy Grover 
10415c115590SAndy Grover 	return ret;
10425c115590SAndy Grover }
10435c115590SAndy Grover 
rds_send_mprds_hash(struct rds_sock * rs,struct rds_connection * conn,int nonblock)10449a4890bdSKa-Cheong Poon static int rds_send_mprds_hash(struct rds_sock *rs,
10459a4890bdSKa-Cheong Poon 			       struct rds_connection *conn, int nonblock)
10465916e2c1SSowmini Varadhan {
10475916e2c1SSowmini Varadhan 	int hash;
10485916e2c1SSowmini Varadhan 
10495916e2c1SSowmini Varadhan 	if (conn->c_npaths == 0)
10505916e2c1SSowmini Varadhan 		hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
10515916e2c1SSowmini Varadhan 	else
10525916e2c1SSowmini Varadhan 		hash = RDS_MPATH_HASH(rs, conn->c_npaths);
10535916e2c1SSowmini Varadhan 	if (conn->c_npaths == 0 && hash != 0) {
105469b92b5bSSowmini Varadhan 		rds_send_ping(conn, 0);
10555916e2c1SSowmini Varadhan 
1056a43cced9SKa-Cheong Poon 		/* The underlying connection is not up yet.  Need to wait
1057a43cced9SKa-Cheong Poon 		 * until it is up to be sure that the non-zero c_path can be
1058a43cced9SKa-Cheong Poon 		 * used.  But if we are interrupted, we have to use the zero
1059a43cced9SKa-Cheong Poon 		 * c_path in case the connection ends up being non-MP capable.
1060a43cced9SKa-Cheong Poon 		 */
10619a4890bdSKa-Cheong Poon 		if (conn->c_npaths == 0) {
10629a4890bdSKa-Cheong Poon 			/* Cannot wait for the connection to be made, so just use
10639a4890bdSKa-Cheong Poon 			 * the base c_path.
10649a4890bdSKa-Cheong Poon 			 */
10659a4890bdSKa-Cheong Poon 			if (nonblock)
10669a4890bdSKa-Cheong Poon 				return 0;
1067a43cced9SKa-Cheong Poon 			if (wait_event_interruptible(conn->c_hs_waitq,
1068a43cced9SKa-Cheong Poon 						     conn->c_npaths != 0))
1069a43cced9SKa-Cheong Poon 				hash = 0;
10709a4890bdSKa-Cheong Poon 		}
10715916e2c1SSowmini Varadhan 		if (conn->c_npaths == 1)
10725916e2c1SSowmini Varadhan 			hash = 0;
10735916e2c1SSowmini Varadhan 	}
10745916e2c1SSowmini Varadhan 	return hash;
10755916e2c1SSowmini Varadhan }
10765916e2c1SSowmini Varadhan 
1077f9fb69adSAvinash Repaka static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
1078f9fb69adSAvinash Repaka {
1079f9fb69adSAvinash Repaka 	struct rds_rdma_args *args;
1080f9fb69adSAvinash Repaka 	struct cmsghdr *cmsg;
1081f9fb69adSAvinash Repaka 
1082f9fb69adSAvinash Repaka 	for_each_cmsghdr(cmsg, msg) {
1083f9fb69adSAvinash Repaka 		if (!CMSG_OK(msg, cmsg))
1084f9fb69adSAvinash Repaka 			return -EINVAL;
1085f9fb69adSAvinash Repaka 
1086f9fb69adSAvinash Repaka 		if (cmsg->cmsg_level != SOL_RDS)
1087f9fb69adSAvinash Repaka 			continue;
1088f9fb69adSAvinash Repaka 
1089f9fb69adSAvinash Repaka 		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
109014e138a8SAvinash Repaka 			if (cmsg->cmsg_len <
109114e138a8SAvinash Repaka 			    CMSG_LEN(sizeof(struct rds_rdma_args)))
109214e138a8SAvinash Repaka 				return -EINVAL;
1093f9fb69adSAvinash Repaka 			args = CMSG_DATA(cmsg);
1094f9fb69adSAvinash Repaka 			*rdma_bytes += args->remote_vec.bytes;
1095f9fb69adSAvinash Repaka 		}
1096f9fb69adSAvinash Repaka 	}
1097f9fb69adSAvinash Repaka 	return 0;
1098f9fb69adSAvinash Repaka }
1099f9fb69adSAvinash Repaka 
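What the helper above sums is the remote_vec.bytes that a sender supplies with RDS_CMSG_RDMA_ARGS; rds_sendmsg() then caps max(payload, RDMA bytes) at RDS_MAX_MSG_SIZE. A hedged sketch of filling that argument block (struct rds_rdma_args is quoted from memory of the uapi header and all values are illustrative); it would be attached as an SOL_RDS control message exactly like the examples above:

#include <stdint.h>
#include <linux/rds.h>

/* Hedged sketch: describe one RDMA WRITE of 'len' bytes from 'buf' into a
 * peer MR identified by 'peer_cookie'.  'local' is caller-owned and is
 * read by the kernel during sendmsg(); remote_vec.bytes is the figure
 * rds_rdma_bytes() above accounts for.
 */
static struct rds_rdma_args make_rdma_write(struct rds_iovec *local,
					    void *buf, uint64_t len,
					    rds_rdma_cookie_t peer_cookie)
{
	local->addr  = (uint64_t)(unsigned long)buf;
	local->bytes = len;

	return (struct rds_rdma_args){
		.cookie		= peer_cookie,	/* MR handle obtained from the peer */
		.remote_vec	= { .addr = 0, .bytes = len },	/* offset/len within that MR */
		.local_vec_addr	= (uint64_t)(unsigned long)local,
		.nr_local	= 1,
		.flags		= RDS_RDMA_READWRITE,	/* RDMA WRITE toward the peer */
		.user_token	= 0,
	};
}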
11001b784140SYing Xue int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
11015c115590SAndy Grover {
11025c115590SAndy Grover 	struct sock *sk = sock->sk;
11035c115590SAndy Grover 	struct rds_sock *rs = rds_sk_to_rs(sk);
1104eee2fa6aSKa-Cheong Poon 	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
1105342dfc30SSteffen Hurrle 	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
11065c115590SAndy Grover 	__be16 dport;
11075c115590SAndy Grover 	struct rds_message *rm = NULL;
11085c115590SAndy Grover 	struct rds_connection *conn;
11095c115590SAndy Grover 	int ret = 0;
11105c115590SAndy Grover 	int queued = 0, allocated_mr = 0;
11115c115590SAndy Grover 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
11121123fd73SAndy Grover 	long timeo = sock_sndtimeo(sk, nonblock);
1113780a6d9eSSowmini Varadhan 	struct rds_conn_path *cpath;
1114eee2fa6aSKa-Cheong Poon 	struct in6_addr daddr;
1115eee2fa6aSKa-Cheong Poon 	__u32 scope_id = 0;
1116d28c0e73SColin Ian King 	size_t rdma_payload_len = 0;
11170cebacceSSowmini Varadhan 	bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
11180cebacceSSowmini Varadhan 		      sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
1119eeb2c4fbSJacob Wen 	int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE);
1120eee2fa6aSKa-Cheong Poon 	int namelen;
1121d84e7bc0SDavid S. Miller 	struct rds_iov_vector_arr vct;
1122ea010070Sshamir rabinovitch 	int ind;
1123ea010070Sshamir rabinovitch 
1124d84e7bc0SDavid S. Miller 	memset(&vct, 0, sizeof(vct));
1125d84e7bc0SDavid S. Miller 
1126ea010070Sshamir rabinovitch 	/* Expect 1 RDMA CMSG per rds_sendmsg. Can still grow if more are needed. */
1127ea010070Sshamir rabinovitch 	vct.incr = 1;
11285c115590SAndy Grover 
11295c115590SAndy Grover 	/* Mirror Linux UDP's handling of BSD error message compatibility */
11305c115590SAndy Grover 	/* XXX: Perhaps MSG_MORE someday */
11310cebacceSSowmini Varadhan 	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
11325c115590SAndy Grover 		ret = -EOPNOTSUPP;
11335c115590SAndy Grover 		goto out;
11345c115590SAndy Grover 	}
11355c115590SAndy Grover 
1136eee2fa6aSKa-Cheong Poon 	namelen = msg->msg_namelen;
1137eee2fa6aSKa-Cheong Poon 	if (namelen != 0) {
1138eee2fa6aSKa-Cheong Poon 		if (namelen < sizeof(*usin)) {
11395c115590SAndy Grover 			ret = -EINVAL;
11405c115590SAndy Grover 			goto out;
11415c115590SAndy Grover 		}
11421e2b44e7SKa-Cheong Poon 		switch (usin->sin_family) {
11431e2b44e7SKa-Cheong Poon 		case AF_INET:
11441e2b44e7SKa-Cheong Poon 			if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
1145eee2fa6aSKa-Cheong Poon 			    usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
1146842841ecSDave Taht 			    ipv4_is_multicast(usin->sin_addr.s_addr)) {
1147eee2fa6aSKa-Cheong Poon 				ret = -EINVAL;
1148eee2fa6aSKa-Cheong Poon 				goto out;
1149eee2fa6aSKa-Cheong Poon 			}
1150eee2fa6aSKa-Cheong Poon 			ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
11515c115590SAndy Grover 			dport = usin->sin_port;
1152eee2fa6aSKa-Cheong Poon 			break;
1153eee2fa6aSKa-Cheong Poon 
1154e65d4d96SKa-Cheong Poon #if IS_ENABLED(CONFIG_IPV6)
11551e2b44e7SKa-Cheong Poon 		case AF_INET6: {
11561e2b44e7SKa-Cheong Poon 			int addr_type;
11571e2b44e7SKa-Cheong Poon 
11581e2b44e7SKa-Cheong Poon 			if (namelen < sizeof(*sin6)) {
11591e2b44e7SKa-Cheong Poon 				ret = -EINVAL;
1160eee2fa6aSKa-Cheong Poon 				goto out;
1161eee2fa6aSKa-Cheong Poon 			}
11621e2b44e7SKa-Cheong Poon 			addr_type = ipv6_addr_type(&sin6->sin6_addr);
11631e2b44e7SKa-Cheong Poon 			if (!(addr_type & IPV6_ADDR_UNICAST)) {
11641e2b44e7SKa-Cheong Poon 				__be32 addr4;
11651e2b44e7SKa-Cheong Poon 
11661e2b44e7SKa-Cheong Poon 				if (!(addr_type & IPV6_ADDR_MAPPED)) {
11671e2b44e7SKa-Cheong Poon 					ret = -EINVAL;
11681e2b44e7SKa-Cheong Poon 					goto out;
11691e2b44e7SKa-Cheong Poon 				}
11701e2b44e7SKa-Cheong Poon 
11711e2b44e7SKa-Cheong Poon 				/* It is a mapped address.  Need to do some
11721e2b44e7SKa-Cheong Poon 				 * sanity checks.
11731e2b44e7SKa-Cheong Poon 				 */
11741e2b44e7SKa-Cheong Poon 				addr4 = sin6->sin6_addr.s6_addr32[3];
11751e2b44e7SKa-Cheong Poon 				if (addr4 == htonl(INADDR_ANY) ||
11761e2b44e7SKa-Cheong Poon 				    addr4 == htonl(INADDR_BROADCAST) ||
1177842841ecSDave Taht 				    ipv4_is_multicast(addr4)) {
1178dc66fe43SGustavo A. R. Silva 					ret = -EINVAL;
11791e2b44e7SKa-Cheong Poon 					goto out;
11801e2b44e7SKa-Cheong Poon 				}
11811e2b44e7SKa-Cheong Poon 			}
11821e2b44e7SKa-Cheong Poon 			if (addr_type & IPV6_ADDR_LINKLOCAL) {
11831e2b44e7SKa-Cheong Poon 				if (sin6->sin6_scope_id == 0) {
11841e2b44e7SKa-Cheong Poon 					ret = -EINVAL;
11851e2b44e7SKa-Cheong Poon 					goto out;
11861e2b44e7SKa-Cheong Poon 				}
11871e2b44e7SKa-Cheong Poon 				scope_id = sin6->sin6_scope_id;
11881e2b44e7SKa-Cheong Poon 			}
11891e2b44e7SKa-Cheong Poon 
11901e2b44e7SKa-Cheong Poon 			daddr = sin6->sin6_addr;
11911e2b44e7SKa-Cheong Poon 			dport = sin6->sin6_port;
11921e2b44e7SKa-Cheong Poon 			break;
11931e2b44e7SKa-Cheong Poon 		}
1194e65d4d96SKa-Cheong Poon #endif
1195eee2fa6aSKa-Cheong Poon 
1196eee2fa6aSKa-Cheong Poon 		default:
1197eee2fa6aSKa-Cheong Poon 			ret = -EINVAL;
1198eee2fa6aSKa-Cheong Poon 			goto out;
1199eee2fa6aSKa-Cheong Poon 		}
12005c115590SAndy Grover 	} else {
12015c115590SAndy Grover 		/* We only care about consistency with ->connect() */
12025c115590SAndy Grover 		lock_sock(sk);
12035c115590SAndy Grover 		daddr = rs->rs_conn_addr;
12045c115590SAndy Grover 		dport = rs->rs_conn_port;
1205eee2fa6aSKa-Cheong Poon 		scope_id = rs->rs_bound_scope_id;
12065c115590SAndy Grover 		release_sock(sk);
12075c115590SAndy Grover 	}
12085c115590SAndy Grover 
12098c7188b2SQuentin Casasnovas 	lock_sock(sk);
1210eee2fa6aSKa-Cheong Poon 	if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
12118c7188b2SQuentin Casasnovas 		release_sock(sk);
1212eee2fa6aSKa-Cheong Poon 		ret = -ENOTCONN;
12135c115590SAndy Grover 		goto out;
1214eee2fa6aSKa-Cheong Poon 	} else if (namelen != 0) {
1215eee2fa6aSKa-Cheong Poon 		/* Cannot send to an IPv4 address using an IPv6 source
1216eee2fa6aSKa-Cheong Poon 		 * address and cannot send to an IPv6 address using an
1217eee2fa6aSKa-Cheong Poon 		 * IPv4 source address.
1218eee2fa6aSKa-Cheong Poon 		 */
1219eee2fa6aSKa-Cheong Poon 		if (ipv6_addr_v4mapped(&daddr) ^
1220eee2fa6aSKa-Cheong Poon 		    ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
1221eee2fa6aSKa-Cheong Poon 			release_sock(sk);
1222eee2fa6aSKa-Cheong Poon 			ret = -EOPNOTSUPP;
1223eee2fa6aSKa-Cheong Poon 			goto out;
1224eee2fa6aSKa-Cheong Poon 		}
12251e2b44e7SKa-Cheong Poon 		/* If the socket is already bound to a link-local address,
12261e2b44e7SKa-Cheong Poon 		 * it can only send to peers on the same link.  But allow
1227ebf89395SLu Wei 		 * communicating between a link-local and a non-link-local address.
12281e2b44e7SKa-Cheong Poon 		 */
12291e2b44e7SKa-Cheong Poon 		if (scope_id != rs->rs_bound_scope_id) {
12301e2b44e7SKa-Cheong Poon 			if (!scope_id) {
12311e2b44e7SKa-Cheong Poon 				scope_id = rs->rs_bound_scope_id;
12321e2b44e7SKa-Cheong Poon 			} else if (rs->rs_bound_scope_id) {
12331e2b44e7SKa-Cheong Poon 				release_sock(sk);
12341e2b44e7SKa-Cheong Poon 				ret = -EINVAL;
12351e2b44e7SKa-Cheong Poon 				goto out;
12361e2b44e7SKa-Cheong Poon 			}
12371e2b44e7SKa-Cheong Poon 		}
12385c115590SAndy Grover 	}
12398c7188b2SQuentin Casasnovas 	release_sock(sk);
12405c115590SAndy Grover 
1241f9fb69adSAvinash Repaka 	ret = rds_rdma_bytes(msg, &rdma_payload_len);
1242f9fb69adSAvinash Repaka 	if (ret)
1243f9fb69adSAvinash Repaka 		goto out;
1244f9fb69adSAvinash Repaka 
1245f9fb69adSAvinash Repaka 	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
1246f9fb69adSAvinash Repaka 		ret = -EMSGSIZE;
1247f9fb69adSAvinash Repaka 		goto out;
1248f9fb69adSAvinash Repaka 	}
1249f9fb69adSAvinash Repaka 
125006e8941eSMukesh Kacker 	if (payload_len > rds_sk_sndbuf(rs)) {
125106e8941eSMukesh Kacker 		ret = -EMSGSIZE;
125206e8941eSMukesh Kacker 		goto out;
125306e8941eSMukesh Kacker 	}
125406e8941eSMukesh Kacker 
12550cebacceSSowmini Varadhan 	if (zcopy) {
12560cebacceSSowmini Varadhan 		if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
12570cebacceSSowmini Varadhan 			ret = -EOPNOTSUPP;
12580cebacceSSowmini Varadhan 			goto out;
12590cebacceSSowmini Varadhan 		}
12600cebacceSSowmini Varadhan 		num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
12610cebacceSSowmini Varadhan 	}
1262fc445084SAndy Grover 	/* size of rm including all sgs */
1263ea010070Sshamir rabinovitch 	ret = rds_rm_size(msg, num_sgs, &vct);
1264fc445084SAndy Grover 	if (ret < 0)
1265fc445084SAndy Grover 		goto out;
1266fc445084SAndy Grover 
1267fc445084SAndy Grover 	rm = rds_message_alloc(ret, GFP_KERNEL);
1268fc445084SAndy Grover 	if (!rm) {
1269fc445084SAndy Grover 		ret = -ENOMEM;
12705c115590SAndy Grover 		goto out;
12715c115590SAndy Grover 	}
12725c115590SAndy Grover 
1273372cd7deSAndy Grover 	/* Attach data to the rm */
1274372cd7deSAndy Grover 	if (payload_len) {
12757dba9203SJason Gunthorpe 		rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
12767dba9203SJason Gunthorpe 		if (IS_ERR(rm->data.op_sg)) {
12777dba9203SJason Gunthorpe 			ret = PTR_ERR(rm->data.op_sg);
1278d139ff09SAndy Grover 			goto out;
12797dba9203SJason Gunthorpe 		}
12800cebacceSSowmini Varadhan 		ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
1281fc445084SAndy Grover 		if (ret)
1282fc445084SAndy Grover 			goto out;
1283372cd7deSAndy Grover 	}
1284372cd7deSAndy Grover 	rm->data.op_active = 1;
1285fc445084SAndy Grover 
12865c115590SAndy Grover 	rm->m_daddr = daddr;
12875c115590SAndy Grover 
12885c115590SAndy Grover 	/* rds_conn_create has a spinlock that runs with IRQ off.
12895c115590SAndy Grover 	 * Caching the conn in the socket helps a lot. */
1290fd261ce6SSantosh Shilimkar 	if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
1291fd261ce6SSantosh Shilimkar 	    rs->rs_tos == rs->rs_conn->c_tos) {
12925c115590SAndy Grover 		conn = rs->rs_conn;
12933eb45036SSantosh Shilimkar 	} else {
1294d5a8ac28SSowmini Varadhan 		conn = rds_conn_create_outgoing(sock_net(sock->sk),
1295eee2fa6aSKa-Cheong Poon 						&rs->rs_bound_addr, &daddr,
1296fd261ce6SSantosh Shilimkar 						rs->rs_transport, rs->rs_tos,
1297eee2fa6aSKa-Cheong Poon 						sock->sk->sk_allocation,
1298eee2fa6aSKa-Cheong Poon 						scope_id);
12995c115590SAndy Grover 		if (IS_ERR(conn)) {
13005c115590SAndy Grover 			ret = PTR_ERR(conn);
13015c115590SAndy Grover 			goto out;
13025c115590SAndy Grover 		}
13035c115590SAndy Grover 		rs->rs_conn = conn;
13045c115590SAndy Grover 	}
13055c115590SAndy Grover 
13069e630bcbSAvinash Repaka 	if (conn->c_trans->t_mp_capable)
13079a4890bdSKa-Cheong Poon 		cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
13089e630bcbSAvinash Repaka 	else
13099e630bcbSAvinash Repaka 		cpath = &conn->c_path[0];
13109e630bcbSAvinash Repaka 
13119e630bcbSAvinash Repaka 	rm->m_conn_path = cpath;
13129e630bcbSAvinash Repaka 
131349f69691SAndy Grover 	/* Parse any control messages the user may have included. */
1314ea010070Sshamir rabinovitch 	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct);
13152b505d05SEdward Adam Davis 	if (ret)
131649f69691SAndy Grover 		goto out;
131749f69691SAndy Grover 
13182c3a5f9aSAndy Grover 	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
1319cb0a6056SManuel Zerpies 		printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
1320f8b3aaf2SAndy Grover 			       &rm->rdma, conn->c_trans->xmit_rdma);
13215c115590SAndy Grover 		ret = -EOPNOTSUPP;
13225c115590SAndy Grover 		goto out;
13235c115590SAndy Grover 	}
13245c115590SAndy Grover 
132515133f6eSAndy Grover 	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
1326cb0a6056SManuel Zerpies 		printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
132715133f6eSAndy Grover 			       &rm->atomic, conn->c_trans->xmit_atomic);
132815133f6eSAndy Grover 		ret = -EOPNOTSUPP;
132915133f6eSAndy Grover 		goto out;
133015133f6eSAndy Grover 	}
133115133f6eSAndy Grover 
1332ebeeb1adSSowmini Varadhan 	if (rds_destroy_pending(conn)) {
13333db6e0d1SSowmini Varadhan 		ret = -EAGAIN;
13343db6e0d1SSowmini Varadhan 		goto out;
13353db6e0d1SSowmini Varadhan 	}
13363db6e0d1SSowmini Varadhan 
13379ef845f8SRao Shoaib 	if (rds_conn_path_down(cpath))
13389ef845f8SRao Shoaib 		rds_check_all_paths(conn);
13395c115590SAndy Grover 
13405c115590SAndy Grover 	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
1341b98ba52fSAndy Grover 	if (ret) {
1342b98ba52fSAndy Grover 		rs->rs_seen_congestion = 1;
13435c115590SAndy Grover 		goto out;
1344b98ba52fSAndy Grover 	}
1345780a6d9eSSowmini Varadhan 	while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
13465c115590SAndy Grover 				  dport, &queued)) {
13475c115590SAndy Grover 		rds_stats_inc(s_send_queue_full);
134806e8941eSMukesh Kacker 
13495c115590SAndy Grover 		if (nonblock) {
13505c115590SAndy Grover 			ret = -EAGAIN;
13515c115590SAndy Grover 			goto out;
13525c115590SAndy Grover 		}
13535c115590SAndy Grover 
1354aa395145SEric Dumazet 		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
1355780a6d9eSSowmini Varadhan 					rds_send_queue_rm(rs, conn, cpath, rm,
13565c115590SAndy Grover 							  rs->rs_bound_port,
13575c115590SAndy Grover 							  dport,
13585c115590SAndy Grover 							  &queued),
13595c115590SAndy Grover 					timeo);
13605c115590SAndy Grover 		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
13615c115590SAndy Grover 		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
13625c115590SAndy Grover 			continue;
13635c115590SAndy Grover 
13645c115590SAndy Grover 		ret = timeo;
13655c115590SAndy Grover 		if (ret == 0)
13665c115590SAndy Grover 			ret = -ETIMEDOUT;
13675c115590SAndy Grover 		goto out;
13685c115590SAndy Grover 	}
13695c115590SAndy Grover 
13705c115590SAndy Grover 	/*
13715c115590SAndy Grover 	 * By now we've committed to the send.  We reuse rds_send_worker()
13725c115590SAndy Grover 	 * to retry sends in the rds thread if the transport asks us to.
13735c115590SAndy Grover 	 */
13745c115590SAndy Grover 	rds_stats_inc(s_send_queued);
13755c115590SAndy Grover 
13761f9ecd7eSSowmini Varadhan 	ret = rds_send_xmit(cpath);
13773db6e0d1SSowmini Varadhan 	if (ret == -ENOMEM || ret == -EAGAIN) {
13783db6e0d1SSowmini Varadhan 		ret = 0;
13793db6e0d1SSowmini Varadhan 		rcu_read_lock();
1380ebeeb1adSSowmini Varadhan 		if (rds_destroy_pending(cpath->cp_conn))
13813db6e0d1SSowmini Varadhan 			ret = -ENETUNREACH;
13823db6e0d1SSowmini Varadhan 		else
13831f9ecd7eSSowmini Varadhan 			queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
13843db6e0d1SSowmini Varadhan 		rcu_read_unlock();
13853db6e0d1SSowmini Varadhan 	}
13863db6e0d1SSowmini Varadhan 	if (ret)
13873db6e0d1SSowmini Varadhan 		goto out;
13885c115590SAndy Grover 	rds_message_put(rm);
1389ea010070Sshamir rabinovitch 
1390ea010070Sshamir rabinovitch 	for (ind = 0; ind < vct.indx; ind++)
1391ea010070Sshamir rabinovitch 		kfree(vct.vec[ind].iov);
1392ea010070Sshamir rabinovitch 	kfree(vct.vec);
1393ea010070Sshamir rabinovitch 
13945c115590SAndy Grover 	return payload_len;
13955c115590SAndy Grover 
13965c115590SAndy Grover out:
1397ea010070Sshamir rabinovitch 	for (ind = 0; ind < vct.indx; ind++)
1398ea010070Sshamir rabinovitch 		kfree(vct.vec[ind].iov);
1399ea010070Sshamir rabinovitch 	kfree(vct.vec);
1400ea010070Sshamir rabinovitch 
14015c115590SAndy Grover 	/* If the user included an RDMA_MAP cmsg, we allocated an MR on the fly.
14025c115590SAndy Grover 	 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
14035c115590SAndy Grover 	 * or in any other way, we need to destroy the MR again. */
14045c115590SAndy Grover 	if (allocated_mr)
14055c115590SAndy Grover 		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
14065c115590SAndy Grover 
14075c115590SAndy Grover 	if (rm)
14085c115590SAndy Grover 		rds_message_put(rm);
14095c115590SAndy Grover 	return ret;
14105c115590SAndy Grover }
14115c115590SAndy Grover 
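From userspace, the whole path above is reached through an ordinary sendmsg()/sendto() on a bound PF_RDS SOCK_SEQPACKET socket; with MSG_DONTWAIT the "send queue full" and congestion branches surface as -EAGAIN. A hedged sketch (addresses, port, and the poll-and-retry policy are illustrative, and PF_RDS is assumed to be provided by the socket headers):

#include <stdint.h>
#include <errno.h>
#include <unistd.h>
#include <poll.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Hedged sketch: bind an RDS socket to a local IP/port, then send one
 * datagram, retrying when the nonblocking send reports backpressure.
 */
static ssize_t rds_send_example(const char *src_ip, const char *dst_ip,
				uint16_t port, const void *buf, size_t len)
{
	struct sockaddr_in laddr = { .sin_family = AF_INET, .sin_port = htons(port) };
	struct sockaddr_in daddr = { .sin_family = AF_INET, .sin_port = htons(port) };
	ssize_t ret;
	int fd;

	inet_pton(AF_INET, src_ip, &laddr.sin_addr);
	inet_pton(AF_INET, dst_ip, &daddr.sin_addr);

	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0 || bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0)
		return -1;

	for (;;) {
		ret = sendto(fd, buf, len, MSG_DONTWAIT,
			     (struct sockaddr *)&daddr, sizeof(daddr));
		if (ret >= 0 || errno != EAGAIN)
			break;
		/* Send queue full or destination congested: wait, then retry. */
		poll(&(struct pollfd){ .fd = fd, .events = POLLOUT }, 1, -1);
	}

	close(fd);
	return ret;
}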
14125c115590SAndy Grover /*
14135916e2c1SSowmini Varadhan  * send out a probe. Can be shared by rds_send_ping,
14145916e2c1SSowmini Varadhan  * rds_send_pong, rds_send_hb.
14155916e2c1SSowmini Varadhan  * rds_send_hb should use h_flags
14165916e2c1SSowmini Varadhan  *   RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED
14175916e2c1SSowmini Varadhan  * or
14185916e2c1SSowmini Varadhan  *   RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
14195c115590SAndy Grover  */
1420bb789763SSantosh Shilimkar static int
14215916e2c1SSowmini Varadhan rds_send_probe(struct rds_conn_path *cp, __be16 sport,
14225916e2c1SSowmini Varadhan 	       __be16 dport, u8 h_flags)
14235c115590SAndy Grover {
14245c115590SAndy Grover 	struct rds_message *rm;
14255c115590SAndy Grover 	unsigned long flags;
14265c115590SAndy Grover 	int ret = 0;
14275c115590SAndy Grover 
14285c115590SAndy Grover 	rm = rds_message_alloc(0, GFP_ATOMIC);
14298690bfa1SAndy Grover 	if (!rm) {
14305c115590SAndy Grover 		ret = -ENOMEM;
14315c115590SAndy Grover 		goto out;
14325c115590SAndy Grover 	}
14335c115590SAndy Grover 
143445997e9eSSowmini Varadhan 	rm->m_daddr = cp->cp_conn->c_faddr;
1435acfcd4d4SAndy Grover 	rm->data.op_active = 1;
14365c115590SAndy Grover 
14373c0a5900SSowmini Varadhan 	rds_conn_path_connect_if_down(cp);
14385c115590SAndy Grover 
143945997e9eSSowmini Varadhan 	ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
14405c115590SAndy Grover 	if (ret)
14415c115590SAndy Grover 		goto out;
14425c115590SAndy Grover 
144345997e9eSSowmini Varadhan 	spin_lock_irqsave(&cp->cp_lock, flags);
144445997e9eSSowmini Varadhan 	list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
14455c115590SAndy Grover 	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
14465c115590SAndy Grover 	rds_message_addref(rm);
144745997e9eSSowmini Varadhan 	rm->m_inc.i_conn = cp->cp_conn;
144845997e9eSSowmini Varadhan 	rm->m_inc.i_conn_path = cp;
14495c115590SAndy Grover 
14505916e2c1SSowmini Varadhan 	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport,
145145997e9eSSowmini Varadhan 				    cp->cp_next_tx_seq);
14525916e2c1SSowmini Varadhan 	rm->m_inc.i_hdr.h_flags |= h_flags;
145345997e9eSSowmini Varadhan 	cp->cp_next_tx_seq++;
14545916e2c1SSowmini Varadhan 
145500354de5SSowmini Varadhan 	if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
145600354de5SSowmini Varadhan 	    cp->cp_conn->c_trans->t_mp_capable) {
145700354de5SSowmini Varadhan 		u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
145800354de5SSowmini Varadhan 		__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
145900354de5SSowmini Varadhan 		__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
14605916e2c1SSowmini Varadhan 		rds_message_add_extension(&rm->m_inc.i_hdr,
14615916e2c1SSowmini Varadhan 					  RDS_EXTHDR_NPATHS, &npaths,
14625916e2c1SSowmini Varadhan 					  sizeof(npaths));
1463905dd418SSowmini Varadhan 		rds_message_add_extension(&rm->m_inc.i_hdr,
1464905dd418SSowmini Varadhan 					  RDS_EXTHDR_GEN_NUM,
146500354de5SSowmini Varadhan 					  &my_gen_num,
1466905dd418SSowmini Varadhan 					  sizeof(u32));
14675916e2c1SSowmini Varadhan 	}
146845997e9eSSowmini Varadhan 	spin_unlock_irqrestore(&cp->cp_lock, flags);
14695c115590SAndy Grover 
14705c115590SAndy Grover 	rds_stats_inc(s_send_queued);
14715c115590SAndy Grover 	rds_stats_inc(s_send_pong);
14725c115590SAndy Grover 
14737b4b0009Ssantosh.shilimkar@oracle.com 	/* schedule the send work on rds_wq */
14743db6e0d1SSowmini Varadhan 	rcu_read_lock();
1475ebeeb1adSSowmini Varadhan 	if (!rds_destroy_pending(cp->cp_conn))
147645997e9eSSowmini Varadhan 		queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
14773db6e0d1SSowmini Varadhan 	rcu_read_unlock();
1478acfcd4d4SAndy Grover 
14795c115590SAndy Grover 	rds_message_put(rm);
14805c115590SAndy Grover 	return 0;
14815c115590SAndy Grover 
14825c115590SAndy Grover out:
14835c115590SAndy Grover 	if (rm)
14845c115590SAndy Grover 		rds_message_put(rm);
14855c115590SAndy Grover 	return ret;
14865c115590SAndy Grover }
14875916e2c1SSowmini Varadhan 
14885916e2c1SSowmini Varadhan int
14895916e2c1SSowmini Varadhan rds_send_pong(struct rds_conn_path *cp, __be16 dport)
14905916e2c1SSowmini Varadhan {
14915916e2c1SSowmini Varadhan 	return rds_send_probe(cp, 0, dport, 0);
14925916e2c1SSowmini Varadhan }
14935916e2c1SSowmini Varadhan 
149469b92b5bSSowmini Varadhan void
149569b92b5bSSowmini Varadhan rds_send_ping(struct rds_connection *conn, int cp_index)
14965916e2c1SSowmini Varadhan {
14975916e2c1SSowmini Varadhan 	unsigned long flags;
149869b92b5bSSowmini Varadhan 	struct rds_conn_path *cp = &conn->c_path[cp_index];
14995916e2c1SSowmini Varadhan 
15005916e2c1SSowmini Varadhan 	spin_lock_irqsave(&cp->cp_lock, flags);
15015916e2c1SSowmini Varadhan 	if (conn->c_ping_triggered) {
15025916e2c1SSowmini Varadhan 		spin_unlock_irqrestore(&cp->cp_lock, flags);
15035916e2c1SSowmini Varadhan 		return;
15045916e2c1SSowmini Varadhan 	}
15055916e2c1SSowmini Varadhan 	conn->c_ping_triggered = 1;
15065916e2c1SSowmini Varadhan 	spin_unlock_irqrestore(&cp->cp_lock, flags);
150769b92b5bSSowmini Varadhan 	rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0);
15085916e2c1SSowmini Varadhan }
150969b92b5bSSowmini Varadhan EXPORT_SYMBOL_GPL(rds_send_ping);
1510