xref: /openbmc/linux/fs/dlm/lowcomms.c (revision feae43f8)
12522fe45SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
26ed7257bSPatrick Caulfield /******************************************************************************
36ed7257bSPatrick Caulfield *******************************************************************************
46ed7257bSPatrick Caulfield **
56ed7257bSPatrick Caulfield **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
65e9ccc37SChristine Caulfield **  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
76ed7257bSPatrick Caulfield **
86ed7257bSPatrick Caulfield **
96ed7257bSPatrick Caulfield *******************************************************************************
106ed7257bSPatrick Caulfield ******************************************************************************/
116ed7257bSPatrick Caulfield 
126ed7257bSPatrick Caulfield /*
136ed7257bSPatrick Caulfield  * lowcomms.c
146ed7257bSPatrick Caulfield  *
156ed7257bSPatrick Caulfield  * This is the "low-level" comms layer.
166ed7257bSPatrick Caulfield  *
176ed7257bSPatrick Caulfield  * It is responsible for sending/receiving messages
186ed7257bSPatrick Caulfield  * from other nodes in the cluster.
196ed7257bSPatrick Caulfield  *
206ed7257bSPatrick Caulfield  * Cluster nodes are referred to by their nodeids. nodeids are
216ed7257bSPatrick Caulfield  * simply 32 bit numbers to the locking module - if they need to
222cf12c0bSJoe Perches  * be expanded for the cluster infrastructure then that is its
236ed7257bSPatrick Caulfield  * responsibility. It is this layer's
246ed7257bSPatrick Caulfield  * responsibility to resolve these into IP address or
256ed7257bSPatrick Caulfield  * whatever it needs for inter-node communication.
266ed7257bSPatrick Caulfield  *
276ed7257bSPatrick Caulfield  * The comms level is two kernel threads that deal mainly with
286ed7257bSPatrick Caulfield  * the receiving of messages from other nodes and passing them
296ed7257bSPatrick Caulfield  * up to the mid-level comms layer (which understands the
306ed7257bSPatrick Caulfield  * message format) for execution by the locking core, and
316ed7257bSPatrick Caulfield  * a send thread which does all the setting up of connections
326ed7257bSPatrick Caulfield  * to remote nodes and the sending of data. Threads are not allowed
336ed7257bSPatrick Caulfield  * to send their own data because it may cause them to wait in times
346ed7257bSPatrick Caulfield  * of high load. Also, this way, the sending thread can collect together
356ed7257bSPatrick Caulfield  * messages bound for one node and send them in one block.
366ed7257bSPatrick Caulfield  *
372cf12c0bSJoe Perches  * lowcomms will choose to use either TCP or SCTP as its transport layer
386ed7257bSPatrick Caulfield  * depending on the configuration variable 'protocol'. This should be set
396ed7257bSPatrick Caulfield  * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
406ed7257bSPatrick Caulfield  * cluster-wide mechanism as it must be the same on all nodes of the cluster
416ed7257bSPatrick Caulfield  * for the DLM to function.
426ed7257bSPatrick Caulfield  *
436ed7257bSPatrick Caulfield  */
446ed7257bSPatrick Caulfield 
456ed7257bSPatrick Caulfield #include <asm/ioctls.h>
466ed7257bSPatrick Caulfield #include <net/sock.h>
476ed7257bSPatrick Caulfield #include <net/tcp.h>
486ed7257bSPatrick Caulfield #include <linux/pagemap.h>
496ed7257bSPatrick Caulfield #include <linux/file.h>
507a936ce7SMatthias Kaehlcke #include <linux/mutex.h>
516ed7257bSPatrick Caulfield #include <linux/sctp.h>
525a0e3ad6STejun Heo #include <linux/slab.h>
532f2d76ccSBenjamin Poirier #include <net/sctp/sctp.h>
5444ad532bSJoe Perches #include <net/ipv6.h>
556ed7257bSPatrick Caulfield 
5692732376SAlexander Aring #include <trace/events/dlm.h>
5792732376SAlexander Aring 
586ed7257bSPatrick Caulfield #include "dlm_internal.h"
596ed7257bSPatrick Caulfield #include "lowcomms.h"
606ed7257bSPatrick Caulfield #include "midcomms.h"
613af2326cSAlexander Aring #include "memory.h"
626ed7257bSPatrick Caulfield #include "config.h"
636ed7257bSPatrick Caulfield 
646ed7257bSPatrick Caulfield #define NEEDED_RMEM (4*1024*1024)
656ed7257bSPatrick Caulfield 
66f92c8dd7SBob Peterson /* Number of messages to send before rescheduling */
67f92c8dd7SBob Peterson #define MAX_SEND_MSG_COUNT 25
68055923bfSAlexander Aring #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
69f92c8dd7SBob Peterson 
706ed7257bSPatrick Caulfield struct connection {
716ed7257bSPatrick Caulfield 	struct socket *sock;	/* NULL if not connected */
726ed7257bSPatrick Caulfield 	uint32_t nodeid;	/* So we know who we are in the list */
736ed7257bSPatrick Caulfield 	struct mutex sock_mutex;
746ed7257bSPatrick Caulfield 	unsigned long flags;
756ed7257bSPatrick Caulfield #define CF_READ_PENDING 1
768a4abb08Stsutomu.owa@toshiba.co.jp #define CF_WRITE_PENDING 2
776ed7257bSPatrick Caulfield #define CF_INIT_PENDING 4
786ed7257bSPatrick Caulfield #define CF_IS_OTHERCON 5
79063c4c99SLars Marowsky-Bree #define CF_CLOSE 6
80b36930ddSDavid Miller #define CF_APP_LIMITED 7
81b2a66629Stsutomu.owa@toshiba.co.jp #define CF_CLOSING 8
82055923bfSAlexander Aring #define CF_SHUTDOWN 9
8319633c7eSAlexander Aring #define CF_CONNECTED 10
84ba868d9dSAlexander Aring #define CF_RECONNECT 11
85ba868d9dSAlexander Aring #define CF_DELAY_CONNECT 12
868aa31cbfSAlexander Aring #define CF_EOF 13
876ed7257bSPatrick Caulfield 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
886ed7257bSPatrick Caulfield 	spinlock_t writequeue_lock;
898aa31cbfSAlexander Aring 	atomic_t writequeue_cnt;
906ed7257bSPatrick Caulfield 	int retries;
916ed7257bSPatrick Caulfield #define MAX_CONNECT_RETRIES 3
925e9ccc37SChristine Caulfield 	struct hlist_node list;
936ed7257bSPatrick Caulfield 	struct connection *othercon;
94ba868d9dSAlexander Aring 	struct connection *sendcon;
956ed7257bSPatrick Caulfield 	struct work_struct rwork; /* Receive workqueue */
966ed7257bSPatrick Caulfield 	struct work_struct swork; /* Send workqueue */
97055923bfSAlexander Aring 	wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
984798cbbfSAlexander Aring 	unsigned char *rx_buf;
994798cbbfSAlexander Aring 	int rx_buflen;
1004798cbbfSAlexander Aring 	int rx_leftover;
101a47666ebSAlexander Aring 	struct rcu_head rcu;
1026ed7257bSPatrick Caulfield };
1036ed7257bSPatrick Caulfield #define sock2con(x) ((struct connection *)(x)->sk_user_data)
1046ed7257bSPatrick Caulfield 
105d11ccd45SAlexander Aring struct listen_connection {
106d11ccd45SAlexander Aring 	struct socket *sock;
107d11ccd45SAlexander Aring 	struct work_struct rwork;
108d11ccd45SAlexander Aring };
109d11ccd45SAlexander Aring 
/*
 * Byte accounting for a writequeue entry 'e' (pointer to an object with
 * 'offset' and 'end' members).
 *
 * DLM_WQ_REMAIN_BYTES: room left between 'end' and the end of the page.
 * DLM_WQ_LENGTH_BYTES: bytes between 'offset' and 'end'.
 *
 * The argument is parenthesised so that expressions such as '&entry' or
 * 'p + 1' expand correctly.
 */
#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - (e)->end)
#define DLM_WQ_LENGTH_BYTES(e) ((e)->end - (e)->offset)
112f0747ebfSAlexander Aring 
1136ed7257bSPatrick Caulfield /* An entry waiting to be sent */
1146ed7257bSPatrick Caulfield struct writequeue_entry {
1156ed7257bSPatrick Caulfield 	struct list_head list;
1166ed7257bSPatrick Caulfield 	struct page *page;
1176ed7257bSPatrick Caulfield 	int offset;
1186ed7257bSPatrick Caulfield 	int len;
1196ed7257bSPatrick Caulfield 	int end;
1206ed7257bSPatrick Caulfield 	int users;
121706474fbSAlexander Aring 	bool dirty;
1226ed7257bSPatrick Caulfield 	struct connection *con;
1238f2dc78dSAlexander Aring 	struct list_head msgs;
1248f2dc78dSAlexander Aring 	struct kref ref;
1258f2dc78dSAlexander Aring };
1268f2dc78dSAlexander Aring 
1278f2dc78dSAlexander Aring struct dlm_msg {
1288f2dc78dSAlexander Aring 	struct writequeue_entry *entry;
1292874d1a6SAlexander Aring 	struct dlm_msg *orig_msg;
1302874d1a6SAlexander Aring 	bool retransmit;
1318f2dc78dSAlexander Aring 	void *ppc;
1328f2dc78dSAlexander Aring 	int len;
1338f2dc78dSAlexander Aring 	int idx; /* new()/commit() idx exchange */
1348f2dc78dSAlexander Aring 
1358f2dc78dSAlexander Aring 	struct list_head list;
1368f2dc78dSAlexander Aring 	struct kref ref;
1376ed7257bSPatrick Caulfield };
1386ed7257bSPatrick Caulfield 
13936b71a8bSDavid Teigland struct dlm_node_addr {
14036b71a8bSDavid Teigland 	struct list_head list;
14136b71a8bSDavid Teigland 	int nodeid;
142e125fbebSAlexander Aring 	int mark;
14336b71a8bSDavid Teigland 	int addr_count;
14498e1b60eSMike Christie 	int curr_addr_index;
14536b71a8bSDavid Teigland 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
14636b71a8bSDavid Teigland };
14736b71a8bSDavid Teigland 
148a66c008cSAlexander Aring struct dlm_proto_ops {
1498728a455SAlexander Aring 	bool try_new_addr;
1502dc6b115SAlexander Aring 	const char *name;
1512dc6b115SAlexander Aring 	int proto;
1522dc6b115SAlexander Aring 
1538728a455SAlexander Aring 	int (*connect)(struct connection *con, struct socket *sock,
1548728a455SAlexander Aring 		       struct sockaddr *addr, int addr_len);
1558728a455SAlexander Aring 	void (*sockopts)(struct socket *sock);
1568728a455SAlexander Aring 	int (*bind)(struct socket *sock);
1572dc6b115SAlexander Aring 	int (*listen_validate)(void);
1582dc6b115SAlexander Aring 	void (*listen_sockopts)(struct socket *sock);
1592dc6b115SAlexander Aring 	int (*listen_bind)(struct socket *sock);
160a66c008cSAlexander Aring 	/* What to do to shutdown */
161a66c008cSAlexander Aring 	void (*shutdown_action)(struct connection *con);
162a66c008cSAlexander Aring 	/* What to do to eof check */
163a66c008cSAlexander Aring 	bool (*eof_condition)(struct connection *con);
164a66c008cSAlexander Aring };
165a66c008cSAlexander Aring 
/*
 * Holder for a set of struct sock callback pointers; the single instance
 * is 'listen_sock'.
 */
static struct listen_sock_callbacks {
	void (*sk_error_report)(struct sock *sk);
	void (*sk_data_ready)(struct sock *sk);
	void (*sk_state_change)(struct sock *sk);
	void (*sk_write_space)(struct sock *sk);
} listen_sock;
172cc661fc9SBob Peterson 
17336b71a8bSDavid Teigland static LIST_HEAD(dlm_node_addrs);
17436b71a8bSDavid Teigland static DEFINE_SPINLOCK(dlm_node_addrs_spin);
17536b71a8bSDavid Teigland 
176d11ccd45SAlexander Aring static struct listen_connection listen_con;
1776ed7257bSPatrick Caulfield static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
1786ed7257bSPatrick Caulfield static int dlm_local_count;
17951746163SAlexander Aring int dlm_allow_conn;
1806ed7257bSPatrick Caulfield 
1816ed7257bSPatrick Caulfield /* Work queues */
1826ed7257bSPatrick Caulfield static struct workqueue_struct *recv_workqueue;
1836ed7257bSPatrick Caulfield static struct workqueue_struct *send_workqueue;
1846ed7257bSPatrick Caulfield 
1855e9ccc37SChristine Caulfield static struct hlist_head connection_hash[CONN_HASH_SIZE];
186a47666ebSAlexander Aring static DEFINE_SPINLOCK(connections_lock);
187a47666ebSAlexander Aring DEFINE_STATIC_SRCU(connections_srcu);
1886ed7257bSPatrick Caulfield 
189a66c008cSAlexander Aring static const struct dlm_proto_ops *dlm_proto_ops;
190a66c008cSAlexander Aring 
1916ed7257bSPatrick Caulfield static void process_recv_sockets(struct work_struct *work);
1926ed7257bSPatrick Caulfield static void process_send_sockets(struct work_struct *work);
1936ed7257bSPatrick Caulfield 
1943af2326cSAlexander Aring static void writequeue_entry_ctor(void *data)
1953af2326cSAlexander Aring {
1963af2326cSAlexander Aring 	struct writequeue_entry *entry = data;
1973af2326cSAlexander Aring 
1983af2326cSAlexander Aring 	INIT_LIST_HEAD(&entry->msgs);
1993af2326cSAlexander Aring }
2003af2326cSAlexander Aring 
2013af2326cSAlexander Aring struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void)
2023af2326cSAlexander Aring {
2033af2326cSAlexander Aring 	return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry),
2043af2326cSAlexander Aring 				 0, 0, writequeue_entry_ctor);
2053af2326cSAlexander Aring }
2063af2326cSAlexander Aring 
207e4dc81edSAlexander Aring struct kmem_cache *dlm_lowcomms_msg_cache_create(void)
208e4dc81edSAlexander Aring {
209e4dc81edSAlexander Aring 	return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL);
210e4dc81edSAlexander Aring }
211e4dc81edSAlexander Aring 
21266d5955aSAlexander Aring /* need to held writequeue_lock */
21366d5955aSAlexander Aring static struct writequeue_entry *con_next_wq(struct connection *con)
21466d5955aSAlexander Aring {
21566d5955aSAlexander Aring 	struct writequeue_entry *e;
21666d5955aSAlexander Aring 
21766d5955aSAlexander Aring 	if (list_empty(&con->writequeue))
21866d5955aSAlexander Aring 		return NULL;
21966d5955aSAlexander Aring 
22066d5955aSAlexander Aring 	e = list_first_entry(&con->writequeue, struct writequeue_entry,
22166d5955aSAlexander Aring 			     list);
222bcbfea41SAlexander Aring 	/* if len is zero nothing is to send, if there are users filling
223bcbfea41SAlexander Aring 	 * buffers we wait until the users are done so we can send more.
224bcbfea41SAlexander Aring 	 */
225bcbfea41SAlexander Aring 	if (e->users || e->len == 0)
22666d5955aSAlexander Aring 		return NULL;
22766d5955aSAlexander Aring 
22866d5955aSAlexander Aring 	return e;
22966d5955aSAlexander Aring }
23066d5955aSAlexander Aring 
231b38bc9c2SAlexander Aring static struct connection *__find_con(int nodeid, int r)
2325e9ccc37SChristine Caulfield {
2335e9ccc37SChristine Caulfield 	struct connection *con;
2345e9ccc37SChristine Caulfield 
235a47666ebSAlexander Aring 	hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
236b38bc9c2SAlexander Aring 		if (con->nodeid == nodeid)
2375e9ccc37SChristine Caulfield 			return con;
2385e9ccc37SChristine Caulfield 	}
239a47666ebSAlexander Aring 
2405e9ccc37SChristine Caulfield 	return NULL;
2415e9ccc37SChristine Caulfield }
2425e9ccc37SChristine Caulfield 
2438aa31cbfSAlexander Aring static bool tcp_eof_condition(struct connection *con)
2448aa31cbfSAlexander Aring {
2458aa31cbfSAlexander Aring 	return atomic_read(&con->writequeue_cnt);
2468aa31cbfSAlexander Aring }
2478aa31cbfSAlexander Aring 
2486cde210aSAlexander Aring static int dlm_con_init(struct connection *con, int nodeid)
2496ed7257bSPatrick Caulfield {
2504798cbbfSAlexander Aring 	con->rx_buflen = dlm_config.ci_buffer_size;
2514798cbbfSAlexander Aring 	con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
2526cde210aSAlexander Aring 	if (!con->rx_buf)
2536cde210aSAlexander Aring 		return -ENOMEM;
2544798cbbfSAlexander Aring 
2556ed7257bSPatrick Caulfield 	con->nodeid = nodeid;
2566ed7257bSPatrick Caulfield 	mutex_init(&con->sock_mutex);
2576ed7257bSPatrick Caulfield 	INIT_LIST_HEAD(&con->writequeue);
2586ed7257bSPatrick Caulfield 	spin_lock_init(&con->writequeue_lock);
2598aa31cbfSAlexander Aring 	atomic_set(&con->writequeue_cnt, 0);
2606ed7257bSPatrick Caulfield 	INIT_WORK(&con->swork, process_send_sockets);
2616ed7257bSPatrick Caulfield 	INIT_WORK(&con->rwork, process_recv_sockets);
262055923bfSAlexander Aring 	init_waitqueue_head(&con->shutdown_wait);
2636ed7257bSPatrick Caulfield 
2646cde210aSAlexander Aring 	return 0;
2656cde210aSAlexander Aring }
2666cde210aSAlexander Aring 
2676cde210aSAlexander Aring /*
2686cde210aSAlexander Aring  * If 'allocation' is zero then we don't attempt to create a new
2696cde210aSAlexander Aring  * connection structure for this node.
2706cde210aSAlexander Aring  */
2716cde210aSAlexander Aring static struct connection *nodeid2con(int nodeid, gfp_t alloc)
2726cde210aSAlexander Aring {
2736cde210aSAlexander Aring 	struct connection *con, *tmp;
2746cde210aSAlexander Aring 	int r, ret;
2756cde210aSAlexander Aring 
276b38bc9c2SAlexander Aring 	r = nodeid_hash(nodeid);
277b38bc9c2SAlexander Aring 	con = __find_con(nodeid, r);
2786cde210aSAlexander Aring 	if (con || !alloc)
2796cde210aSAlexander Aring 		return con;
2806cde210aSAlexander Aring 
2816cde210aSAlexander Aring 	con = kzalloc(sizeof(*con), alloc);
2826cde210aSAlexander Aring 	if (!con)
2836cde210aSAlexander Aring 		return NULL;
2846cde210aSAlexander Aring 
2856cde210aSAlexander Aring 	ret = dlm_con_init(con, nodeid);
2866cde210aSAlexander Aring 	if (ret) {
2876cde210aSAlexander Aring 		kfree(con);
2886cde210aSAlexander Aring 		return NULL;
2896cde210aSAlexander Aring 	}
2906cde210aSAlexander Aring 
291a47666ebSAlexander Aring 	spin_lock(&connections_lock);
2924f2b30fdSAlexander Aring 	/* Because multiple workqueues/threads calls this function it can
2934f2b30fdSAlexander Aring 	 * race on multiple cpu's. Instead of locking hot path __find_con()
2944f2b30fdSAlexander Aring 	 * we just check in rare cases of recently added nodes again
2954f2b30fdSAlexander Aring 	 * under protection of connections_lock. If this is the case we
2964f2b30fdSAlexander Aring 	 * abort our connection creation and return the existing connection.
2974f2b30fdSAlexander Aring 	 */
298b38bc9c2SAlexander Aring 	tmp = __find_con(nodeid, r);
2994f2b30fdSAlexander Aring 	if (tmp) {
3004f2b30fdSAlexander Aring 		spin_unlock(&connections_lock);
3014f2b30fdSAlexander Aring 		kfree(con->rx_buf);
3024f2b30fdSAlexander Aring 		kfree(con);
3034f2b30fdSAlexander Aring 		return tmp;
3044f2b30fdSAlexander Aring 	}
3054f2b30fdSAlexander Aring 
306a47666ebSAlexander Aring 	hlist_add_head_rcu(&con->list, &connection_hash[r]);
307a47666ebSAlexander Aring 	spin_unlock(&connections_lock);
308a47666ebSAlexander Aring 
3096ed7257bSPatrick Caulfield 	return con;
3106ed7257bSPatrick Caulfield }
3116ed7257bSPatrick Caulfield 
3125e9ccc37SChristine Caulfield /* Loop round all connections */
3135e9ccc37SChristine Caulfield static void foreach_conn(void (*conn_func)(struct connection *c))
3145e9ccc37SChristine Caulfield {
315b38bc9c2SAlexander Aring 	int i;
3165e9ccc37SChristine Caulfield 	struct connection *con;
3175e9ccc37SChristine Caulfield 
3185e9ccc37SChristine Caulfield 	for (i = 0; i < CONN_HASH_SIZE; i++) {
319a47666ebSAlexander Aring 		hlist_for_each_entry_rcu(con, &connection_hash[i], list)
3205e9ccc37SChristine Caulfield 			conn_func(con);
3215e9ccc37SChristine Caulfield 	}
3226ed7257bSPatrick Caulfield }
3236ed7257bSPatrick Caulfield 
32436b71a8bSDavid Teigland static struct dlm_node_addr *find_node_addr(int nodeid)
3256ed7257bSPatrick Caulfield {
32636b71a8bSDavid Teigland 	struct dlm_node_addr *na;
32736b71a8bSDavid Teigland 
32836b71a8bSDavid Teigland 	list_for_each_entry(na, &dlm_node_addrs, list) {
32936b71a8bSDavid Teigland 		if (na->nodeid == nodeid)
33036b71a8bSDavid Teigland 			return na;
33136b71a8bSDavid Teigland 	}
33236b71a8bSDavid Teigland 	return NULL;
33336b71a8bSDavid Teigland }
33436b71a8bSDavid Teigland 
33540c6b83eSAlexander Aring static int addr_compare(const struct sockaddr_storage *x,
33640c6b83eSAlexander Aring 			const struct sockaddr_storage *y)
33736b71a8bSDavid Teigland {
33836b71a8bSDavid Teigland 	switch (x->ss_family) {
33936b71a8bSDavid Teigland 	case AF_INET: {
34036b71a8bSDavid Teigland 		struct sockaddr_in *sinx = (struct sockaddr_in *)x;
34136b71a8bSDavid Teigland 		struct sockaddr_in *siny = (struct sockaddr_in *)y;
34236b71a8bSDavid Teigland 		if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
34336b71a8bSDavid Teigland 			return 0;
34436b71a8bSDavid Teigland 		if (sinx->sin_port != siny->sin_port)
34536b71a8bSDavid Teigland 			return 0;
34636b71a8bSDavid Teigland 		break;
34736b71a8bSDavid Teigland 	}
34836b71a8bSDavid Teigland 	case AF_INET6: {
34936b71a8bSDavid Teigland 		struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
35036b71a8bSDavid Teigland 		struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
35136b71a8bSDavid Teigland 		if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
35236b71a8bSDavid Teigland 			return 0;
35336b71a8bSDavid Teigland 		if (sinx->sin6_port != siny->sin6_port)
35436b71a8bSDavid Teigland 			return 0;
35536b71a8bSDavid Teigland 		break;
35636b71a8bSDavid Teigland 	}
35736b71a8bSDavid Teigland 	default:
35836b71a8bSDavid Teigland 		return 0;
35936b71a8bSDavid Teigland 	}
36036b71a8bSDavid Teigland 	return 1;
36136b71a8bSDavid Teigland }
36236b71a8bSDavid Teigland 
36336b71a8bSDavid Teigland static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
364e125fbebSAlexander Aring 			  struct sockaddr *sa_out, bool try_new_addr,
365e125fbebSAlexander Aring 			  unsigned int *mark)
36636b71a8bSDavid Teigland {
36736b71a8bSDavid Teigland 	struct sockaddr_storage sas;
36836b71a8bSDavid Teigland 	struct dlm_node_addr *na;
3696ed7257bSPatrick Caulfield 
3706ed7257bSPatrick Caulfield 	if (!dlm_local_count)
3716ed7257bSPatrick Caulfield 		return -1;
3726ed7257bSPatrick Caulfield 
37336b71a8bSDavid Teigland 	spin_lock(&dlm_node_addrs_spin);
37436b71a8bSDavid Teigland 	na = find_node_addr(nodeid);
37598e1b60eSMike Christie 	if (na && na->addr_count) {
376ee44b4bcSMarcelo Ricardo Leitner 		memcpy(&sas, na->addr[na->curr_addr_index],
377ee44b4bcSMarcelo Ricardo Leitner 		       sizeof(struct sockaddr_storage));
378ee44b4bcSMarcelo Ricardo Leitner 
37998e1b60eSMike Christie 		if (try_new_addr) {
38098e1b60eSMike Christie 			na->curr_addr_index++;
38198e1b60eSMike Christie 			if (na->curr_addr_index == na->addr_count)
38298e1b60eSMike Christie 				na->curr_addr_index = 0;
38398e1b60eSMike Christie 		}
38498e1b60eSMike Christie 	}
38536b71a8bSDavid Teigland 	spin_unlock(&dlm_node_addrs_spin);
38636b71a8bSDavid Teigland 
38736b71a8bSDavid Teigland 	if (!na)
38836b71a8bSDavid Teigland 		return -EEXIST;
38936b71a8bSDavid Teigland 
39036b71a8bSDavid Teigland 	if (!na->addr_count)
39136b71a8bSDavid Teigland 		return -ENOENT;
39236b71a8bSDavid Teigland 
393e125fbebSAlexander Aring 	*mark = na->mark;
394e125fbebSAlexander Aring 
39536b71a8bSDavid Teigland 	if (sas_out)
39636b71a8bSDavid Teigland 		memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
39736b71a8bSDavid Teigland 
39836b71a8bSDavid Teigland 	if (!sa_out)
39936b71a8bSDavid Teigland 		return 0;
4006ed7257bSPatrick Caulfield 
4016ed7257bSPatrick Caulfield 	if (dlm_local_addr[0]->ss_family == AF_INET) {
40236b71a8bSDavid Teigland 		struct sockaddr_in *in4  = (struct sockaddr_in *) &sas;
40336b71a8bSDavid Teigland 		struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
4046ed7257bSPatrick Caulfield 		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
4056ed7257bSPatrick Caulfield 	} else {
40636b71a8bSDavid Teigland 		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &sas;
40736b71a8bSDavid Teigland 		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out;
4084e3fd7a0SAlexey Dobriyan 		ret6->sin6_addr = in6->sin6_addr;
4096ed7257bSPatrick Caulfield 	}
4106ed7257bSPatrick Caulfield 
4116ed7257bSPatrick Caulfield 	return 0;
4126ed7257bSPatrick Caulfield }
4136ed7257bSPatrick Caulfield 
414e125fbebSAlexander Aring static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid,
415e125fbebSAlexander Aring 			  unsigned int *mark)
41636b71a8bSDavid Teigland {
41736b71a8bSDavid Teigland 	struct dlm_node_addr *na;
41836b71a8bSDavid Teigland 	int rv = -EEXIST;
41998e1b60eSMike Christie 	int addr_i;
42036b71a8bSDavid Teigland 
42136b71a8bSDavid Teigland 	spin_lock(&dlm_node_addrs_spin);
42236b71a8bSDavid Teigland 	list_for_each_entry(na, &dlm_node_addrs, list) {
42336b71a8bSDavid Teigland 		if (!na->addr_count)
42436b71a8bSDavid Teigland 			continue;
42536b71a8bSDavid Teigland 
42698e1b60eSMike Christie 		for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
42798e1b60eSMike Christie 			if (addr_compare(na->addr[addr_i], addr)) {
42836b71a8bSDavid Teigland 				*nodeid = na->nodeid;
429e125fbebSAlexander Aring 				*mark = na->mark;
43036b71a8bSDavid Teigland 				rv = 0;
43198e1b60eSMike Christie 				goto unlock;
43236b71a8bSDavid Teigland 			}
43398e1b60eSMike Christie 		}
43498e1b60eSMike Christie 	}
43598e1b60eSMike Christie unlock:
43636b71a8bSDavid Teigland 	spin_unlock(&dlm_node_addrs_spin);
43736b71a8bSDavid Teigland 	return rv;
43836b71a8bSDavid Teigland }
43936b71a8bSDavid Teigland 
4404f19d071SAlexander Aring /* caller need to held dlm_node_addrs_spin lock */
4414f19d071SAlexander Aring static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
4424f19d071SAlexander Aring 				     const struct sockaddr_storage *addr)
4434f19d071SAlexander Aring {
4444f19d071SAlexander Aring 	int i;
4454f19d071SAlexander Aring 
4464f19d071SAlexander Aring 	for (i = 0; i < na->addr_count; i++) {
4474f19d071SAlexander Aring 		if (addr_compare(na->addr[i], addr))
4484f19d071SAlexander Aring 			return true;
4494f19d071SAlexander Aring 	}
4504f19d071SAlexander Aring 
4514f19d071SAlexander Aring 	return false;
4524f19d071SAlexander Aring }
4534f19d071SAlexander Aring 
45436b71a8bSDavid Teigland int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
45536b71a8bSDavid Teigland {
45636b71a8bSDavid Teigland 	struct sockaddr_storage *new_addr;
45736b71a8bSDavid Teigland 	struct dlm_node_addr *new_node, *na;
4584f19d071SAlexander Aring 	bool ret;
45936b71a8bSDavid Teigland 
46036b71a8bSDavid Teigland 	new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
46136b71a8bSDavid Teigland 	if (!new_node)
46236b71a8bSDavid Teigland 		return -ENOMEM;
46336b71a8bSDavid Teigland 
46436b71a8bSDavid Teigland 	new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
46536b71a8bSDavid Teigland 	if (!new_addr) {
46636b71a8bSDavid Teigland 		kfree(new_node);
46736b71a8bSDavid Teigland 		return -ENOMEM;
46836b71a8bSDavid Teigland 	}
46936b71a8bSDavid Teigland 
47036b71a8bSDavid Teigland 	memcpy(new_addr, addr, len);
47136b71a8bSDavid Teigland 
47236b71a8bSDavid Teigland 	spin_lock(&dlm_node_addrs_spin);
47336b71a8bSDavid Teigland 	na = find_node_addr(nodeid);
47436b71a8bSDavid Teigland 	if (!na) {
47536b71a8bSDavid Teigland 		new_node->nodeid = nodeid;
47636b71a8bSDavid Teigland 		new_node->addr[0] = new_addr;
47736b71a8bSDavid Teigland 		new_node->addr_count = 1;
478e125fbebSAlexander Aring 		new_node->mark = dlm_config.ci_mark;
47936b71a8bSDavid Teigland 		list_add(&new_node->list, &dlm_node_addrs);
48036b71a8bSDavid Teigland 		spin_unlock(&dlm_node_addrs_spin);
48136b71a8bSDavid Teigland 		return 0;
48236b71a8bSDavid Teigland 	}
48336b71a8bSDavid Teigland 
4844f19d071SAlexander Aring 	ret = dlm_lowcomms_na_has_addr(na, addr);
4854f19d071SAlexander Aring 	if (ret) {
4864f19d071SAlexander Aring 		spin_unlock(&dlm_node_addrs_spin);
4874f19d071SAlexander Aring 		kfree(new_addr);
4884f19d071SAlexander Aring 		kfree(new_node);
4894f19d071SAlexander Aring 		return -EEXIST;
4904f19d071SAlexander Aring 	}
4914f19d071SAlexander Aring 
49236b71a8bSDavid Teigland 	if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
49336b71a8bSDavid Teigland 		spin_unlock(&dlm_node_addrs_spin);
49436b71a8bSDavid Teigland 		kfree(new_addr);
49536b71a8bSDavid Teigland 		kfree(new_node);
49636b71a8bSDavid Teigland 		return -ENOSPC;
49736b71a8bSDavid Teigland 	}
49836b71a8bSDavid Teigland 
49936b71a8bSDavid Teigland 	na->addr[na->addr_count++] = new_addr;
50036b71a8bSDavid Teigland 	spin_unlock(&dlm_node_addrs_spin);
50136b71a8bSDavid Teigland 	kfree(new_node);
50236b71a8bSDavid Teigland 	return 0;
50336b71a8bSDavid Teigland }
50436b71a8bSDavid Teigland 
5056ed7257bSPatrick Caulfield /* Data available on socket or listen socket received a connect */
506676d2369SDavid S. Miller static void lowcomms_data_ready(struct sock *sk)
5076ed7257bSPatrick Caulfield {
50893eaadebStsutomu.owa@toshiba.co.jp 	struct connection *con;
50993eaadebStsutomu.owa@toshiba.co.jp 
51093eaadebStsutomu.owa@toshiba.co.jp 	con = sock2con(sk);
511afb853fbSPatrick Caulfield 	if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
5126ed7257bSPatrick Caulfield 		queue_work(recv_workqueue, &con->rwork);
5136ed7257bSPatrick Caulfield }
5146ed7257bSPatrick Caulfield 
515d11ccd45SAlexander Aring static void lowcomms_listen_data_ready(struct sock *sk)
516d11ccd45SAlexander Aring {
5179a4139a7SAlexander Aring 	if (!dlm_allow_conn)
5189a4139a7SAlexander Aring 		return;
5199a4139a7SAlexander Aring 
520d11ccd45SAlexander Aring 	queue_work(recv_workqueue, &listen_con.rwork);
521d11ccd45SAlexander Aring }
522d11ccd45SAlexander Aring 
5236ed7257bSPatrick Caulfield static void lowcomms_write_space(struct sock *sk)
5246ed7257bSPatrick Caulfield {
52593eaadebStsutomu.owa@toshiba.co.jp 	struct connection *con;
5266ed7257bSPatrick Caulfield 
52793eaadebStsutomu.owa@toshiba.co.jp 	con = sock2con(sk);
528b36930ddSDavid Miller 	if (!con)
52992c44605SAlexander Aring 		return;
530b36930ddSDavid Miller 
53119633c7eSAlexander Aring 	if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
53219633c7eSAlexander Aring 		log_print("successful connected to node %d", con->nodeid);
53319633c7eSAlexander Aring 		queue_work(send_workqueue, &con->swork);
53492c44605SAlexander Aring 		return;
53519633c7eSAlexander Aring 	}
53619633c7eSAlexander Aring 
537b36930ddSDavid Miller 	clear_bit(SOCK_NOSPACE, &con->sock->flags);
538b36930ddSDavid Miller 
539b36930ddSDavid Miller 	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
540b36930ddSDavid Miller 		con->sock->sk->sk_write_pending--;
5419cd3e072SEric Dumazet 		clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
542b36930ddSDavid Miller 	}
543b36930ddSDavid Miller 
5446ed7257bSPatrick Caulfield 	queue_work(send_workqueue, &con->swork);
5456ed7257bSPatrick Caulfield }
5466ed7257bSPatrick Caulfield 
5476ed7257bSPatrick Caulfield static inline void lowcomms_connect_sock(struct connection *con)
5486ed7257bSPatrick Caulfield {
549063c4c99SLars Marowsky-Bree 	if (test_bit(CF_CLOSE, &con->flags))
550063c4c99SLars Marowsky-Bree 		return;
5516ed7257bSPatrick Caulfield 	queue_work(send_workqueue, &con->swork);
55261d9102bSBob Peterson 	cond_resched();
5536ed7257bSPatrick Caulfield }
5546ed7257bSPatrick Caulfield 
5556ed7257bSPatrick Caulfield static void lowcomms_state_change(struct sock *sk)
5566ed7257bSPatrick Caulfield {
557ee44b4bcSMarcelo Ricardo Leitner 	/* SCTP layer is not calling sk_data_ready when the connection
558ee44b4bcSMarcelo Ricardo Leitner 	 * is done, so we catch the signal through here. Also, it
559ee44b4bcSMarcelo Ricardo Leitner 	 * doesn't switch socket state when entering shutdown, so we
560ee44b4bcSMarcelo Ricardo Leitner 	 * skip the write in that case.
561ee44b4bcSMarcelo Ricardo Leitner 	 */
562ee44b4bcSMarcelo Ricardo Leitner 	if (sk->sk_shutdown) {
563ee44b4bcSMarcelo Ricardo Leitner 		if (sk->sk_shutdown == RCV_SHUTDOWN)
564ee44b4bcSMarcelo Ricardo Leitner 			lowcomms_data_ready(sk);
565ee44b4bcSMarcelo Ricardo Leitner 	} else if (sk->sk_state == TCP_ESTABLISHED) {
5666ed7257bSPatrick Caulfield 		lowcomms_write_space(sk);
5676ed7257bSPatrick Caulfield 	}
568ee44b4bcSMarcelo Ricardo Leitner }
5696ed7257bSPatrick Caulfield 
570391fbdc5SChristine Caulfield int dlm_lowcomms_connect_node(int nodeid)
571391fbdc5SChristine Caulfield {
572391fbdc5SChristine Caulfield 	struct connection *con;
573b38bc9c2SAlexander Aring 	int idx;
574391fbdc5SChristine Caulfield 
575391fbdc5SChristine Caulfield 	if (nodeid == dlm_our_nodeid())
576391fbdc5SChristine Caulfield 		return 0;
577391fbdc5SChristine Caulfield 
578b38bc9c2SAlexander Aring 	idx = srcu_read_lock(&connections_srcu);
579391fbdc5SChristine Caulfield 	con = nodeid2con(nodeid, GFP_NOFS);
580b38bc9c2SAlexander Aring 	if (!con) {
581b38bc9c2SAlexander Aring 		srcu_read_unlock(&connections_srcu, idx);
582391fbdc5SChristine Caulfield 		return -ENOMEM;
583b38bc9c2SAlexander Aring 	}
584b38bc9c2SAlexander Aring 
585391fbdc5SChristine Caulfield 	lowcomms_connect_sock(con);
586b38bc9c2SAlexander Aring 	srcu_read_unlock(&connections_srcu, idx);
587b38bc9c2SAlexander Aring 
588391fbdc5SChristine Caulfield 	return 0;
589391fbdc5SChristine Caulfield }
590391fbdc5SChristine Caulfield 
591e125fbebSAlexander Aring int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
592e125fbebSAlexander Aring {
593e125fbebSAlexander Aring 	struct dlm_node_addr *na;
594e125fbebSAlexander Aring 
595e125fbebSAlexander Aring 	spin_lock(&dlm_node_addrs_spin);
596e125fbebSAlexander Aring 	na = find_node_addr(nodeid);
597e125fbebSAlexander Aring 	if (!na) {
598e125fbebSAlexander Aring 		spin_unlock(&dlm_node_addrs_spin);
599e125fbebSAlexander Aring 		return -ENOENT;
600e125fbebSAlexander Aring 	}
601e125fbebSAlexander Aring 
602e125fbebSAlexander Aring 	na->mark = mark;
603e125fbebSAlexander Aring 	spin_unlock(&dlm_node_addrs_spin);
604e125fbebSAlexander Aring 
605e125fbebSAlexander Aring 	return 0;
606e125fbebSAlexander Aring }
607e125fbebSAlexander Aring 
/*
 * sk_error_report callback for connection sockets.
 *
 * Logs the socket error (rate limited) together with the peer address,
 * asks the send worker to reconnect, and finally chains to the error
 * handler that was saved in listen_sock. A socket that has no connection
 * attached is ignored entirely.
 */
static void lowcomms_error_report(struct sock *sk)
{
	void (*saved_report)(struct sock *);
	struct connection *con = sock2con(sk);
	struct inet_sock *inet;

	/* no connection attached: nothing to log and nothing to chain to */
	if (!con)
		return;

	saved_report = listen_sock.sk_error_report;
	inet = inet_sk(sk);

	switch (sk->sk_family) {
	case AF_INET:
		printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
				   "sending to node %d at %pI4, dport %d, "
				   "sk_err=%d/%d\n", dlm_our_nodeid(),
				   con->nodeid, &inet->inet_daddr,
				   ntohs(inet->inet_dport), sk->sk_err,
				   sk->sk_err_soft);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
				   "sending to node %d at %pI6c, "
				   "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(),
				   con->nodeid, &sk->sk_v6_daddr,
				   ntohs(inet->inet_dport), sk->sk_err,
				   sk->sk_err_soft);
		break;
#endif
	default:
		printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
				   "invalid socket family %d set, "
				   "sk_err=%d/%d\n", dlm_our_nodeid(),
				   sk->sk_family, sk->sk_err, sk->sk_err_soft);
		/* unknown family: skip the reconnect handling */
		goto chain;
	}

	/* everything below operates on the sending connection only */
	if (test_bit(CF_IS_OTHERCON, &con->flags))
		con = con->sendcon;

	/* a refused connect sets the delay flag for the next attempt */
	if (sk->sk_err == ECONNREFUSED)
		set_bit(CF_DELAY_CONNECT, &con->flags);

	/* queue the send work only once per pending reconnect */
	if (!test_and_set_bit(CF_RECONNECT, &con->flags))
		queue_work(send_workqueue, &con->swork);

chain:
	if (saved_report)
		saved_report(sk);
}
667b81171cbSBob Peterson 
668b81171cbSBob Peterson /* Note: sk_callback_lock must be locked before calling this function. */
669cc661fc9SBob Peterson static void save_listen_callbacks(struct socket *sock)
670b81171cbSBob Peterson {
671cc661fc9SBob Peterson 	struct sock *sk = sock->sk;
672cc661fc9SBob Peterson 
673cc661fc9SBob Peterson 	listen_sock.sk_data_ready = sk->sk_data_ready;
674cc661fc9SBob Peterson 	listen_sock.sk_state_change = sk->sk_state_change;
675cc661fc9SBob Peterson 	listen_sock.sk_write_space = sk->sk_write_space;
676cc661fc9SBob Peterson 	listen_sock.sk_error_report = sk->sk_error_report;
677b81171cbSBob Peterson }
678b81171cbSBob Peterson 
679cc661fc9SBob Peterson static void restore_callbacks(struct socket *sock)
680b81171cbSBob Peterson {
681cc661fc9SBob Peterson 	struct sock *sk = sock->sk;
682cc661fc9SBob Peterson 
68392c44605SAlexander Aring 	lock_sock(sk);
684b81171cbSBob Peterson 	sk->sk_user_data = NULL;
685cc661fc9SBob Peterson 	sk->sk_data_ready = listen_sock.sk_data_ready;
686cc661fc9SBob Peterson 	sk->sk_state_change = listen_sock.sk_state_change;
687cc661fc9SBob Peterson 	sk->sk_write_space = listen_sock.sk_write_space;
688cc661fc9SBob Peterson 	sk->sk_error_report = listen_sock.sk_error_report;
68992c44605SAlexander Aring 	release_sock(sk);
690b3a5bbfdSBob Peterson }
691b3a5bbfdSBob Peterson 
692d11ccd45SAlexander Aring static void add_listen_sock(struct socket *sock, struct listen_connection *con)
693d11ccd45SAlexander Aring {
694d11ccd45SAlexander Aring 	struct sock *sk = sock->sk;
695d11ccd45SAlexander Aring 
69692c44605SAlexander Aring 	lock_sock(sk);
697d11ccd45SAlexander Aring 	save_listen_callbacks(sock);
698d11ccd45SAlexander Aring 	con->sock = sock;
699d11ccd45SAlexander Aring 
700d11ccd45SAlexander Aring 	sk->sk_user_data = con;
701d11ccd45SAlexander Aring 	sk->sk_allocation = GFP_NOFS;
702d11ccd45SAlexander Aring 	/* Install a data_ready callback */
703d11ccd45SAlexander Aring 	sk->sk_data_ready = lowcomms_listen_data_ready;
70492c44605SAlexander Aring 	release_sock(sk);
705d11ccd45SAlexander Aring }
706d11ccd45SAlexander Aring 
7076ed7257bSPatrick Caulfield /* Make a socket active */
708988419a9Stsutomu.owa@toshiba.co.jp static void add_sock(struct socket *sock, struct connection *con)
7096ed7257bSPatrick Caulfield {
710b81171cbSBob Peterson 	struct sock *sk = sock->sk;
711b81171cbSBob Peterson 
71292c44605SAlexander Aring 	lock_sock(sk);
7136ed7257bSPatrick Caulfield 	con->sock = sock;
7146ed7257bSPatrick Caulfield 
715b81171cbSBob Peterson 	sk->sk_user_data = con;
7166ed7257bSPatrick Caulfield 	/* Install a data_ready callback */
717b81171cbSBob Peterson 	sk->sk_data_ready = lowcomms_data_ready;
718b81171cbSBob Peterson 	sk->sk_write_space = lowcomms_write_space;
719b81171cbSBob Peterson 	sk->sk_state_change = lowcomms_state_change;
720b81171cbSBob Peterson 	sk->sk_allocation = GFP_NOFS;
721b81171cbSBob Peterson 	sk->sk_error_report = lowcomms_error_report;
72292c44605SAlexander Aring 	release_sock(sk);
7236ed7257bSPatrick Caulfield }
7246ed7257bSPatrick Caulfield 
7256ed7257bSPatrick Caulfield /* Add the port number to an IPv6 or 4 sockaddr and return the address
7266ed7257bSPatrick Caulfield    length */
7276ed7257bSPatrick Caulfield static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
7286ed7257bSPatrick Caulfield 			  int *addr_len)
7296ed7257bSPatrick Caulfield {
7306ed7257bSPatrick Caulfield 	saddr->ss_family =  dlm_local_addr[0]->ss_family;
7316ed7257bSPatrick Caulfield 	if (saddr->ss_family == AF_INET) {
7326ed7257bSPatrick Caulfield 		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
7336ed7257bSPatrick Caulfield 		in4_addr->sin_port = cpu_to_be16(port);
7346ed7257bSPatrick Caulfield 		*addr_len = sizeof(struct sockaddr_in);
7356ed7257bSPatrick Caulfield 		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
7366ed7257bSPatrick Caulfield 	} else {
7376ed7257bSPatrick Caulfield 		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
7386ed7257bSPatrick Caulfield 		in6_addr->sin6_port = cpu_to_be16(port);
7396ed7257bSPatrick Caulfield 		*addr_len = sizeof(struct sockaddr_in6);
7406ed7257bSPatrick Caulfield 	}
74101c8cab2SPatrick Caulfield 	memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
7426ed7257bSPatrick Caulfield }
7436ed7257bSPatrick Caulfield 
744706474fbSAlexander Aring static void dlm_page_release(struct kref *kref)
745706474fbSAlexander Aring {
746706474fbSAlexander Aring 	struct writequeue_entry *e = container_of(kref, struct writequeue_entry,
747706474fbSAlexander Aring 						  ref);
748706474fbSAlexander Aring 
749706474fbSAlexander Aring 	__free_page(e->page);
7503af2326cSAlexander Aring 	dlm_free_writequeue(e);
751706474fbSAlexander Aring }
752706474fbSAlexander Aring 
753706474fbSAlexander Aring static void dlm_msg_release(struct kref *kref)
754706474fbSAlexander Aring {
755706474fbSAlexander Aring 	struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref);
756706474fbSAlexander Aring 
757706474fbSAlexander Aring 	kref_put(&msg->entry->ref, dlm_page_release);
758e4dc81edSAlexander Aring 	dlm_free_msg(msg);
759706474fbSAlexander Aring }
760706474fbSAlexander Aring 
761706474fbSAlexander Aring static void free_entry(struct writequeue_entry *e)
762706474fbSAlexander Aring {
763706474fbSAlexander Aring 	struct dlm_msg *msg, *tmp;
764706474fbSAlexander Aring 
765706474fbSAlexander Aring 	list_for_each_entry_safe(msg, tmp, &e->msgs, list) {
766706474fbSAlexander Aring 		if (msg->orig_msg) {
767706474fbSAlexander Aring 			msg->orig_msg->retransmit = false;
768706474fbSAlexander Aring 			kref_put(&msg->orig_msg->ref, dlm_msg_release);
769706474fbSAlexander Aring 		}
770706474fbSAlexander Aring 
771706474fbSAlexander Aring 		list_del(&msg->list);
772706474fbSAlexander Aring 		kref_put(&msg->ref, dlm_msg_release);
773706474fbSAlexander Aring 	}
774706474fbSAlexander Aring 
775706474fbSAlexander Aring 	list_del(&e->list);
776706474fbSAlexander Aring 	atomic_dec(&e->con->writequeue_cnt);
777706474fbSAlexander Aring 	kref_put(&e->ref, dlm_page_release);
778706474fbSAlexander Aring }
779706474fbSAlexander Aring 
780d11ccd45SAlexander Aring static void dlm_close_sock(struct socket **sock)
781d11ccd45SAlexander Aring {
782d11ccd45SAlexander Aring 	if (*sock) {
783d11ccd45SAlexander Aring 		restore_callbacks(*sock);
784d11ccd45SAlexander Aring 		sock_release(*sock);
785d11ccd45SAlexander Aring 		*sock = NULL;
786d11ccd45SAlexander Aring 	}
787d11ccd45SAlexander Aring }
788d11ccd45SAlexander Aring 
7896ed7257bSPatrick Caulfield /* Close a remote connection and tidy up */
7900d737a8cSMarcelo Ricardo Leitner static void close_connection(struct connection *con, bool and_other,
7910d737a8cSMarcelo Ricardo Leitner 			     bool tx, bool rx)
7926ed7257bSPatrick Caulfield {
793b2a66629Stsutomu.owa@toshiba.co.jp 	bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
794706474fbSAlexander Aring 	struct writequeue_entry *e;
795b2a66629Stsutomu.owa@toshiba.co.jp 
7960aa18464Stsutomu.owa@toshiba.co.jp 	if (tx && !closing && cancel_work_sync(&con->swork)) {
7970d737a8cSMarcelo Ricardo Leitner 		log_print("canceled swork for node %d", con->nodeid);
7980aa18464Stsutomu.owa@toshiba.co.jp 		clear_bit(CF_WRITE_PENDING, &con->flags);
7990aa18464Stsutomu.owa@toshiba.co.jp 	}
8000aa18464Stsutomu.owa@toshiba.co.jp 	if (rx && !closing && cancel_work_sync(&con->rwork)) {
8010d737a8cSMarcelo Ricardo Leitner 		log_print("canceled rwork for node %d", con->nodeid);
8020aa18464Stsutomu.owa@toshiba.co.jp 		clear_bit(CF_READ_PENDING, &con->flags);
8030aa18464Stsutomu.owa@toshiba.co.jp 	}
8046ed7257bSPatrick Caulfield 
8050d737a8cSMarcelo Ricardo Leitner 	mutex_lock(&con->sock_mutex);
806d11ccd45SAlexander Aring 	dlm_close_sock(&con->sock);
807d11ccd45SAlexander Aring 
8086ed7257bSPatrick Caulfield 	if (con->othercon && and_other) {
8096ed7257bSPatrick Caulfield 		/* Will only re-enter once. */
810c6aa00e3SAlexander Aring 		close_connection(con->othercon, false, tx, rx);
8116ed7257bSPatrick Caulfield 	}
8129e5f2825SPatrick Caulfield 
813706474fbSAlexander Aring 	/* if we send a writequeue entry only a half way, we drop the
814706474fbSAlexander Aring 	 * whole entry because reconnection and that we not start of the
815706474fbSAlexander Aring 	 * middle of a msg which will confuse the other end.
816706474fbSAlexander Aring 	 *
817706474fbSAlexander Aring 	 * we can always drop messages because retransmits, but what we
818706474fbSAlexander Aring 	 * cannot allow is to transmit half messages which may be processed
819706474fbSAlexander Aring 	 * at the other side.
820706474fbSAlexander Aring 	 *
821706474fbSAlexander Aring 	 * our policy is to start on a clean state when disconnects, we don't
822706474fbSAlexander Aring 	 * know what's send/received on transport layer in this case.
823706474fbSAlexander Aring 	 */
824706474fbSAlexander Aring 	spin_lock(&con->writequeue_lock);
825706474fbSAlexander Aring 	if (!list_empty(&con->writequeue)) {
826706474fbSAlexander Aring 		e = list_first_entry(&con->writequeue, struct writequeue_entry,
827706474fbSAlexander Aring 				     list);
828706474fbSAlexander Aring 		if (e->dirty)
829706474fbSAlexander Aring 			free_entry(e);
830706474fbSAlexander Aring 	}
831706474fbSAlexander Aring 	spin_unlock(&con->writequeue_lock);
832706474fbSAlexander Aring 
8334798cbbfSAlexander Aring 	con->rx_leftover = 0;
8346ed7257bSPatrick Caulfield 	con->retries = 0;
835052849beSAlexander Aring 	clear_bit(CF_APP_LIMITED, &con->flags);
83619633c7eSAlexander Aring 	clear_bit(CF_CONNECTED, &con->flags);
837ba868d9dSAlexander Aring 	clear_bit(CF_DELAY_CONNECT, &con->flags);
838ba868d9dSAlexander Aring 	clear_bit(CF_RECONNECT, &con->flags);
8398aa31cbfSAlexander Aring 	clear_bit(CF_EOF, &con->flags);
8406ed7257bSPatrick Caulfield 	mutex_unlock(&con->sock_mutex);
841b2a66629Stsutomu.owa@toshiba.co.jp 	clear_bit(CF_CLOSING, &con->flags);
8426ed7257bSPatrick Caulfield }
8436ed7257bSPatrick Caulfield 
844055923bfSAlexander Aring static void shutdown_connection(struct connection *con)
845055923bfSAlexander Aring {
846055923bfSAlexander Aring 	int ret;
847055923bfSAlexander Aring 
848eec054b5SAlexander Aring 	flush_work(&con->swork);
849055923bfSAlexander Aring 
850055923bfSAlexander Aring 	mutex_lock(&con->sock_mutex);
851055923bfSAlexander Aring 	/* nothing to shutdown */
852055923bfSAlexander Aring 	if (!con->sock) {
853055923bfSAlexander Aring 		mutex_unlock(&con->sock_mutex);
854055923bfSAlexander Aring 		return;
855055923bfSAlexander Aring 	}
856055923bfSAlexander Aring 
857055923bfSAlexander Aring 	set_bit(CF_SHUTDOWN, &con->flags);
858055923bfSAlexander Aring 	ret = kernel_sock_shutdown(con->sock, SHUT_WR);
859055923bfSAlexander Aring 	mutex_unlock(&con->sock_mutex);
860055923bfSAlexander Aring 	if (ret) {
861055923bfSAlexander Aring 		log_print("Connection %p failed to shutdown: %d will force close",
862055923bfSAlexander Aring 			  con, ret);
863055923bfSAlexander Aring 		goto force_close;
864055923bfSAlexander Aring 	} else {
865055923bfSAlexander Aring 		ret = wait_event_timeout(con->shutdown_wait,
866055923bfSAlexander Aring 					 !test_bit(CF_SHUTDOWN, &con->flags),
867055923bfSAlexander Aring 					 DLM_SHUTDOWN_WAIT_TIMEOUT);
868055923bfSAlexander Aring 		if (ret == 0) {
869055923bfSAlexander Aring 			log_print("Connection %p shutdown timed out, will force close",
870055923bfSAlexander Aring 				  con);
871055923bfSAlexander Aring 			goto force_close;
872055923bfSAlexander Aring 		}
873055923bfSAlexander Aring 	}
874055923bfSAlexander Aring 
875055923bfSAlexander Aring 	return;
876055923bfSAlexander Aring 
877055923bfSAlexander Aring force_close:
878055923bfSAlexander Aring 	clear_bit(CF_SHUTDOWN, &con->flags);
879055923bfSAlexander Aring 	close_connection(con, false, true, true);
880055923bfSAlexander Aring }
881055923bfSAlexander Aring 
882055923bfSAlexander Aring static void dlm_tcp_shutdown(struct connection *con)
883055923bfSAlexander Aring {
884055923bfSAlexander Aring 	if (con->othercon)
885055923bfSAlexander Aring 		shutdown_connection(con->othercon);
886055923bfSAlexander Aring 	shutdown_connection(con);
887055923bfSAlexander Aring }
888055923bfSAlexander Aring 
8894798cbbfSAlexander Aring static int con_realloc_receive_buf(struct connection *con, int newlen)
8904798cbbfSAlexander Aring {
8914798cbbfSAlexander Aring 	unsigned char *newbuf;
8924798cbbfSAlexander Aring 
8934798cbbfSAlexander Aring 	newbuf = kmalloc(newlen, GFP_NOFS);
8944798cbbfSAlexander Aring 	if (!newbuf)
8954798cbbfSAlexander Aring 		return -ENOMEM;
8964798cbbfSAlexander Aring 
8974798cbbfSAlexander Aring 	/* copy any leftover from last receive */
8984798cbbfSAlexander Aring 	if (con->rx_leftover)
8994798cbbfSAlexander Aring 		memmove(newbuf, con->rx_buf, con->rx_leftover);
9004798cbbfSAlexander Aring 
9014798cbbfSAlexander Aring 	/* swap to new buffer space */
9024798cbbfSAlexander Aring 	kfree(con->rx_buf);
9034798cbbfSAlexander Aring 	con->rx_buflen = newlen;
9044798cbbfSAlexander Aring 	con->rx_buf = newbuf;
9054798cbbfSAlexander Aring 
9064798cbbfSAlexander Aring 	return 0;
9074798cbbfSAlexander Aring }
9084798cbbfSAlexander Aring 
9096ed7257bSPatrick Caulfield /* Data received from remote end */
9106ed7257bSPatrick Caulfield static int receive_from_sock(struct connection *con)
9116ed7257bSPatrick Caulfield {
9124798cbbfSAlexander Aring 	struct msghdr msg;
9134798cbbfSAlexander Aring 	struct kvec iov;
9144798cbbfSAlexander Aring 	int ret, buflen;
9156ed7257bSPatrick Caulfield 
9166ed7257bSPatrick Caulfield 	mutex_lock(&con->sock_mutex);
9176ed7257bSPatrick Caulfield 
9186ed7257bSPatrick Caulfield 	if (con->sock == NULL) {
9196ed7257bSPatrick Caulfield 		ret = -EAGAIN;
9206ed7257bSPatrick Caulfield 		goto out_close;
9216ed7257bSPatrick Caulfield 	}
9224798cbbfSAlexander Aring 
9234798cbbfSAlexander Aring 	/* realloc if we get new buffer size to read out */
9244798cbbfSAlexander Aring 	buflen = dlm_config.ci_buffer_size;
9254798cbbfSAlexander Aring 	if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
9264798cbbfSAlexander Aring 		ret = con_realloc_receive_buf(con, buflen);
9274798cbbfSAlexander Aring 		if (ret < 0)
9286ed7257bSPatrick Caulfield 			goto out_resched;
9296ed7257bSPatrick Caulfield 	}
9306ed7257bSPatrick Caulfield 
93162699b3fSAlexander Aring 	for (;;) {
9324798cbbfSAlexander Aring 		/* calculate new buffer parameter regarding last receive and
9334798cbbfSAlexander Aring 		 * possible leftover bytes
9346ed7257bSPatrick Caulfield 		 */
9354798cbbfSAlexander Aring 		iov.iov_base = con->rx_buf + con->rx_leftover;
9364798cbbfSAlexander Aring 		iov.iov_len = con->rx_buflen - con->rx_leftover;
9376ed7257bSPatrick Caulfield 
9384798cbbfSAlexander Aring 		memset(&msg, 0, sizeof(msg));
9394798cbbfSAlexander Aring 		msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
9404798cbbfSAlexander Aring 		ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
9414798cbbfSAlexander Aring 				     msg.msg_flags);
94292732376SAlexander Aring 		trace_dlm_recv(con->nodeid, ret);
94362699b3fSAlexander Aring 		if (ret == -EAGAIN)
94462699b3fSAlexander Aring 			break;
94562699b3fSAlexander Aring 		else if (ret <= 0)
9466ed7257bSPatrick Caulfield 			goto out_close;
9476ed7257bSPatrick Caulfield 
9484798cbbfSAlexander Aring 		/* new buflen according readed bytes and leftover from last receive */
9494798cbbfSAlexander Aring 		buflen = ret + con->rx_leftover;
9504798cbbfSAlexander Aring 		ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
9514798cbbfSAlexander Aring 		if (ret < 0)
9524798cbbfSAlexander Aring 			goto out_close;
9536ed7257bSPatrick Caulfield 
9544798cbbfSAlexander Aring 		/* calculate leftover bytes from process and put it into begin of
9554798cbbfSAlexander Aring 		 * the receive buffer, so next receive we have the full message
9564798cbbfSAlexander Aring 		 * at the start address of the receive buffer.
9574798cbbfSAlexander Aring 		 */
9584798cbbfSAlexander Aring 		con->rx_leftover = buflen - ret;
9594798cbbfSAlexander Aring 		if (con->rx_leftover) {
9604798cbbfSAlexander Aring 			memmove(con->rx_buf, con->rx_buf + ret,
9614798cbbfSAlexander Aring 				con->rx_leftover);
9626ed7257bSPatrick Caulfield 		}
96362699b3fSAlexander Aring 	}
9644798cbbfSAlexander Aring 
965b97f8525SAlexander Aring 	dlm_midcomms_receive_done(con->nodeid);
9666ed7257bSPatrick Caulfield 	mutex_unlock(&con->sock_mutex);
9676ed7257bSPatrick Caulfield 	return 0;
9686ed7257bSPatrick Caulfield 
9696ed7257bSPatrick Caulfield out_resched:
9706ed7257bSPatrick Caulfield 	if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
9716ed7257bSPatrick Caulfield 		queue_work(recv_workqueue, &con->rwork);
9726ed7257bSPatrick Caulfield 	mutex_unlock(&con->sock_mutex);
9736ed7257bSPatrick Caulfield 	return -EAGAIN;
9746ed7257bSPatrick Caulfield 
9756ed7257bSPatrick Caulfield out_close:
976055923bfSAlexander Aring 	if (ret == 0) {
977055923bfSAlexander Aring 		log_print("connection %p got EOF from %d",
978055923bfSAlexander Aring 			  con, con->nodeid);
9798aa31cbfSAlexander Aring 
980a66c008cSAlexander Aring 		if (dlm_proto_ops->eof_condition &&
981a66c008cSAlexander Aring 		    dlm_proto_ops->eof_condition(con)) {
9828aa31cbfSAlexander Aring 			set_bit(CF_EOF, &con->flags);
9838aa31cbfSAlexander Aring 			mutex_unlock(&con->sock_mutex);
9848aa31cbfSAlexander Aring 		} else {
9858aa31cbfSAlexander Aring 			mutex_unlock(&con->sock_mutex);
9868aa31cbfSAlexander Aring 			close_connection(con, false, true, false);
9878aa31cbfSAlexander Aring 
988055923bfSAlexander Aring 			/* handling for tcp shutdown */
989055923bfSAlexander Aring 			clear_bit(CF_SHUTDOWN, &con->flags);
990055923bfSAlexander Aring 			wake_up(&con->shutdown_wait);
9918aa31cbfSAlexander Aring 		}
9928aa31cbfSAlexander Aring 
993055923bfSAlexander Aring 		/* signal to breaking receive worker */
994055923bfSAlexander Aring 		ret = -1;
9958aa31cbfSAlexander Aring 	} else {
9968aa31cbfSAlexander Aring 		mutex_unlock(&con->sock_mutex);
9976ed7257bSPatrick Caulfield 	}
9986ed7257bSPatrick Caulfield 	return ret;
9996ed7257bSPatrick Caulfield }
10006ed7257bSPatrick Caulfield 
10016ed7257bSPatrick Caulfield /* Listening socket is busy, accept a connection */
1002d11ccd45SAlexander Aring static int accept_from_sock(struct listen_connection *con)
10036ed7257bSPatrick Caulfield {
10046ed7257bSPatrick Caulfield 	int result;
10056ed7257bSPatrick Caulfield 	struct sockaddr_storage peeraddr;
10066ed7257bSPatrick Caulfield 	struct socket *newsock;
1007b38bc9c2SAlexander Aring 	int len, idx;
10086ed7257bSPatrick Caulfield 	int nodeid;
10096ed7257bSPatrick Caulfield 	struct connection *newcon;
10106ed7257bSPatrick Caulfield 	struct connection *addcon;
10113f78cd7dSAlexander Aring 	unsigned int mark;
10126ed7257bSPatrick Caulfield 
1013d11ccd45SAlexander Aring 	if (!con->sock)
10143421fb15Stsutomu.owa@toshiba.co.jp 		return -ENOTCONN;
10156ed7257bSPatrick Caulfield 
10163421fb15Stsutomu.owa@toshiba.co.jp 	result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
10176ed7257bSPatrick Caulfield 	if (result < 0)
10186ed7257bSPatrick Caulfield 		goto accept_err;
10196ed7257bSPatrick Caulfield 
10206ed7257bSPatrick Caulfield 	/* Get the connected socket's peer */
10216ed7257bSPatrick Caulfield 	memset(&peeraddr, 0, sizeof(peeraddr));
10229b2c45d4SDenys Vlasenko 	len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2);
10239b2c45d4SDenys Vlasenko 	if (len < 0) {
10246ed7257bSPatrick Caulfield 		result = -ECONNABORTED;
10256ed7257bSPatrick Caulfield 		goto accept_err;
10266ed7257bSPatrick Caulfield 	}
10276ed7257bSPatrick Caulfield 
10286ed7257bSPatrick Caulfield 	/* Get the new node's NODEID */
10296ed7257bSPatrick Caulfield 	make_sockaddr(&peeraddr, 0, &len);
1030e125fbebSAlexander Aring 	if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) {
1031*feae43f8SAlexander Aring 		switch (peeraddr.ss_family) {
1032*feae43f8SAlexander Aring 		case AF_INET: {
1033*feae43f8SAlexander Aring 			struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr;
1034*feae43f8SAlexander Aring 
1035*feae43f8SAlexander Aring 			log_print("connect from non cluster IPv4 node %pI4",
1036*feae43f8SAlexander Aring 				  &sin->sin_addr);
1037*feae43f8SAlexander Aring 			break;
1038*feae43f8SAlexander Aring 		}
1039*feae43f8SAlexander Aring #if IS_ENABLED(CONFIG_IPV6)
1040*feae43f8SAlexander Aring 		case AF_INET6: {
1041*feae43f8SAlexander Aring 			struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr;
1042*feae43f8SAlexander Aring 
1043*feae43f8SAlexander Aring 			log_print("connect from non cluster IPv6 node %pI6c",
1044*feae43f8SAlexander Aring 				  &sin6->sin6_addr);
1045*feae43f8SAlexander Aring 			break;
1046*feae43f8SAlexander Aring 		}
1047*feae43f8SAlexander Aring #endif
1048*feae43f8SAlexander Aring 		default:
1049*feae43f8SAlexander Aring 			log_print("invalid family from non cluster node");
1050*feae43f8SAlexander Aring 			break;
1051*feae43f8SAlexander Aring 		}
1052*feae43f8SAlexander Aring 
10536ed7257bSPatrick Caulfield 		sock_release(newsock);
10546ed7257bSPatrick Caulfield 		return -1;
10556ed7257bSPatrick Caulfield 	}
10566ed7257bSPatrick Caulfield 
10576ed7257bSPatrick Caulfield 	log_print("got connection from %d", nodeid);
10586ed7257bSPatrick Caulfield 
10596ed7257bSPatrick Caulfield 	/*  Check to see if we already have a connection to this node. This
10606ed7257bSPatrick Caulfield 	 *  could happen if the two nodes initiate a connection at roughly
10616ed7257bSPatrick Caulfield 	 *  the same time and the connections cross on the wire.
10626ed7257bSPatrick Caulfield 	 *  In this case we store the incoming one in "othercon"
10636ed7257bSPatrick Caulfield 	 */
1064b38bc9c2SAlexander Aring 	idx = srcu_read_lock(&connections_srcu);
1065748285ccSDavid Teigland 	newcon = nodeid2con(nodeid, GFP_NOFS);
10666ed7257bSPatrick Caulfield 	if (!newcon) {
1067b38bc9c2SAlexander Aring 		srcu_read_unlock(&connections_srcu, idx);
10686ed7257bSPatrick Caulfield 		result = -ENOMEM;
10696ed7257bSPatrick Caulfield 		goto accept_err;
10706ed7257bSPatrick Caulfield 	}
1071d11ccd45SAlexander Aring 
1072e125fbebSAlexander Aring 	sock_set_mark(newsock->sk, mark);
1073e125fbebSAlexander Aring 
1074d11ccd45SAlexander Aring 	mutex_lock(&newcon->sock_mutex);
10756ed7257bSPatrick Caulfield 	if (newcon->sock) {
10766ed7257bSPatrick Caulfield 		struct connection *othercon = newcon->othercon;
10776ed7257bSPatrick Caulfield 
10786ed7257bSPatrick Caulfield 		if (!othercon) {
1079a47666ebSAlexander Aring 			othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
10806ed7257bSPatrick Caulfield 			if (!othercon) {
1081617e82e1SDavid Teigland 				log_print("failed to allocate incoming socket");
10826ed7257bSPatrick Caulfield 				mutex_unlock(&newcon->sock_mutex);
1083b38bc9c2SAlexander Aring 				srcu_read_unlock(&connections_srcu, idx);
10846ed7257bSPatrick Caulfield 				result = -ENOMEM;
10856ed7257bSPatrick Caulfield 				goto accept_err;
10866ed7257bSPatrick Caulfield 			}
10874798cbbfSAlexander Aring 
10886cde210aSAlexander Aring 			result = dlm_con_init(othercon, nodeid);
10896cde210aSAlexander Aring 			if (result < 0) {
10904798cbbfSAlexander Aring 				kfree(othercon);
10912fd8db2dSYang Yingliang 				mutex_unlock(&newcon->sock_mutex);
1092b38bc9c2SAlexander Aring 				srcu_read_unlock(&connections_srcu, idx);
10934798cbbfSAlexander Aring 				goto accept_err;
10944798cbbfSAlexander Aring 			}
10954798cbbfSAlexander Aring 
1096e9a470acSAlexander Aring 			lockdep_set_subclass(&othercon->sock_mutex, 1);
10977443bc96SAlexander Aring 			set_bit(CF_IS_OTHERCON, &othercon->flags);
10986cde210aSAlexander Aring 			newcon->othercon = othercon;
1099ba868d9dSAlexander Aring 			othercon->sendcon = newcon;
1100ba3ab3caSAlexander Aring 		} else {
1101ba3ab3caSAlexander Aring 			/* close other sock con if we have something new */
1102ba3ab3caSAlexander Aring 			close_connection(othercon, false, true, false);
110361d96be0SPatrick Caulfield 		}
1104ba3ab3caSAlexander Aring 
1105e9a470acSAlexander Aring 		mutex_lock(&othercon->sock_mutex);
1106988419a9Stsutomu.owa@toshiba.co.jp 		add_sock(newsock, othercon);
11076ed7257bSPatrick Caulfield 		addcon = othercon;
1108c7355827Stsutomu.owa@toshiba.co.jp 		mutex_unlock(&othercon->sock_mutex);
11096ed7257bSPatrick Caulfield 	}
11106ed7257bSPatrick Caulfield 	else {
11113735b4b9SBob Peterson 		/* accept copies the sk after we've saved the callbacks, so we
11123735b4b9SBob Peterson 		   don't want to save them a second time or comm errors will
11133735b4b9SBob Peterson 		   result in calling sk_error_report recursively. */
1114988419a9Stsutomu.owa@toshiba.co.jp 		add_sock(newsock, newcon);
11156ed7257bSPatrick Caulfield 		addcon = newcon;
11166ed7257bSPatrick Caulfield 	}
11176ed7257bSPatrick Caulfield 
1118b30a624fSAlexander Aring 	set_bit(CF_CONNECTED, &addcon->flags);
11196ed7257bSPatrick Caulfield 	mutex_unlock(&newcon->sock_mutex);
11206ed7257bSPatrick Caulfield 
11216ed7257bSPatrick Caulfield 	/*
11226ed7257bSPatrick Caulfield 	 * Add it to the active queue in case we got data
112325985edcSLucas De Marchi 	 * between processing the accept adding the socket
11246ed7257bSPatrick Caulfield 	 * to the read_sockets list
11256ed7257bSPatrick Caulfield 	 */
11266ed7257bSPatrick Caulfield 	if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
11276ed7257bSPatrick Caulfield 		queue_work(recv_workqueue, &addcon->rwork);
11286ed7257bSPatrick Caulfield 
1129b38bc9c2SAlexander Aring 	srcu_read_unlock(&connections_srcu, idx);
1130b38bc9c2SAlexander Aring 
11316ed7257bSPatrick Caulfield 	return 0;
11326ed7257bSPatrick Caulfield 
11336ed7257bSPatrick Caulfield accept_err:
11343421fb15Stsutomu.owa@toshiba.co.jp 	if (newsock)
11356ed7257bSPatrick Caulfield 		sock_release(newsock);
11366ed7257bSPatrick Caulfield 
11376ed7257bSPatrick Caulfield 	if (result != -EAGAIN)
1138617e82e1SDavid Teigland 		log_print("error accepting connection from node: %d", result);
11396ed7257bSPatrick Caulfield 	return result;
11406ed7257bSPatrick Caulfield }
11416ed7257bSPatrick Caulfield 
11425d689871SMike Christie /*
11435d689871SMike Christie  * writequeue_entry_complete - try to delete and free write queue entry
11445d689871SMike Christie  * @e: write queue entry to try to delete
11455d689871SMike Christie  * @completed: bytes completed
11465d689871SMike Christie  *
11475d689871SMike Christie  * writequeue_lock must be held.
11485d689871SMike Christie  */
11495d689871SMike Christie static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
11505d689871SMike Christie {
11515d689871SMike Christie 	e->offset += completed;
11525d689871SMike Christie 	e->len -= completed;
1153706474fbSAlexander Aring 	/* signal that page was half way transmitted */
1154706474fbSAlexander Aring 	e->dirty = true;
11555d689871SMike Christie 
11568f2dc78dSAlexander Aring 	if (e->len == 0 && e->users == 0)
11575d689871SMike Christie 		free_entry(e);
11585d689871SMike Christie }
11595d689871SMike Christie 
1160ee44b4bcSMarcelo Ricardo Leitner /*
1161ee44b4bcSMarcelo Ricardo Leitner  * sctp_bind_addrs - bind a SCTP socket to all our addresses
1162ee44b4bcSMarcelo Ricardo Leitner  */
116313004e8aSAlexander Aring static int sctp_bind_addrs(struct socket *sock, uint16_t port)
1164ee44b4bcSMarcelo Ricardo Leitner {
1165ee44b4bcSMarcelo Ricardo Leitner 	struct sockaddr_storage localaddr;
1166c0425a42SChristoph Hellwig 	struct sockaddr *addr = (struct sockaddr *)&localaddr;
1167ee44b4bcSMarcelo Ricardo Leitner 	int i, addr_len, result = 0;
1168ee44b4bcSMarcelo Ricardo Leitner 
1169ee44b4bcSMarcelo Ricardo Leitner 	for (i = 0; i < dlm_local_count; i++) {
1170ee44b4bcSMarcelo Ricardo Leitner 		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
1171ee44b4bcSMarcelo Ricardo Leitner 		make_sockaddr(&localaddr, port, &addr_len);
1172ee44b4bcSMarcelo Ricardo Leitner 
1173ee44b4bcSMarcelo Ricardo Leitner 		if (!i)
117413004e8aSAlexander Aring 			result = kernel_bind(sock, addr, addr_len);
1175ee44b4bcSMarcelo Ricardo Leitner 		else
117613004e8aSAlexander Aring 			result = sock_bind_add(sock->sk, addr, addr_len);
1177ee44b4bcSMarcelo Ricardo Leitner 
1178ee44b4bcSMarcelo Ricardo Leitner 		if (result < 0) {
1179ee44b4bcSMarcelo Ricardo Leitner 			log_print("Can't bind to %d addr number %d, %d.\n",
1180ee44b4bcSMarcelo Ricardo Leitner 				  port, i + 1, result);
1181ee44b4bcSMarcelo Ricardo Leitner 			break;
1182ee44b4bcSMarcelo Ricardo Leitner 		}
1183ee44b4bcSMarcelo Ricardo Leitner 	}
1184ee44b4bcSMarcelo Ricardo Leitner 	return result;
1185ee44b4bcSMarcelo Ricardo Leitner }
1186ee44b4bcSMarcelo Ricardo Leitner 
11876ed7257bSPatrick Caulfield /* Get local addresses */
11886ed7257bSPatrick Caulfield static void init_local(void)
11896ed7257bSPatrick Caulfield {
11906ed7257bSPatrick Caulfield 	struct sockaddr_storage sas, *addr;
11916ed7257bSPatrick Caulfield 	int i;
11926ed7257bSPatrick Caulfield 
119330d3a237SPatrick Caulfield 	dlm_local_count = 0;
11941b189b88SDavid Teigland 	for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) {
11956ed7257bSPatrick Caulfield 		if (dlm_our_addr(&sas, i))
11966ed7257bSPatrick Caulfield 			break;
11976ed7257bSPatrick Caulfield 
11985c93f56fSAmitoj Kaur Chawla 		addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
11996ed7257bSPatrick Caulfield 		if (!addr)
12006ed7257bSPatrick Caulfield 			break;
12016ed7257bSPatrick Caulfield 		dlm_local_addr[dlm_local_count++] = addr;
12026ed7257bSPatrick Caulfield 	}
12036ed7257bSPatrick Caulfield }
12046ed7257bSPatrick Caulfield 
1205043697f0SAlexander Aring static void deinit_local(void)
1206043697f0SAlexander Aring {
1207043697f0SAlexander Aring 	int i;
1208043697f0SAlexander Aring 
1209043697f0SAlexander Aring 	for (i = 0; i < dlm_local_count; i++)
1210043697f0SAlexander Aring 		kfree(dlm_local_addr[i]);
1211043697f0SAlexander Aring }
1212043697f0SAlexander Aring 
1213be3b0400SAlexander Aring static struct writequeue_entry *new_writequeue_entry(struct connection *con)
12146ed7257bSPatrick Caulfield {
12156ed7257bSPatrick Caulfield 	struct writequeue_entry *entry;
12166ed7257bSPatrick Caulfield 
12173af2326cSAlexander Aring 	entry = dlm_allocate_writequeue();
12186ed7257bSPatrick Caulfield 	if (!entry)
12196ed7257bSPatrick Caulfield 		return NULL;
12206ed7257bSPatrick Caulfield 
1221be3b0400SAlexander Aring 	entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
12226ed7257bSPatrick Caulfield 	if (!entry->page) {
12233af2326cSAlexander Aring 		dlm_free_writequeue(entry);
12246ed7257bSPatrick Caulfield 		return NULL;
12256ed7257bSPatrick Caulfield 	}
12266ed7257bSPatrick Caulfield 
12273af2326cSAlexander Aring 	entry->offset = 0;
12283af2326cSAlexander Aring 	entry->len = 0;
12293af2326cSAlexander Aring 	entry->end = 0;
12303af2326cSAlexander Aring 	entry->dirty = false;
12316ed7257bSPatrick Caulfield 	entry->con = con;
1232f0747ebfSAlexander Aring 	entry->users = 1;
12338f2dc78dSAlexander Aring 	kref_init(&entry->ref);
12346ed7257bSPatrick Caulfield 	return entry;
12356ed7257bSPatrick Caulfield }
12366ed7257bSPatrick Caulfield 
1237f0747ebfSAlexander Aring static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
1238be3b0400SAlexander Aring 					     char **ppc, void (*cb)(void *data),
1239be3b0400SAlexander Aring 					     void *data)
1240f0747ebfSAlexander Aring {
1241f0747ebfSAlexander Aring 	struct writequeue_entry *e;
1242f0747ebfSAlexander Aring 
1243f0747ebfSAlexander Aring 	spin_lock(&con->writequeue_lock);
1244f0747ebfSAlexander Aring 	if (!list_empty(&con->writequeue)) {
1245f0747ebfSAlexander Aring 		e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
1246f0747ebfSAlexander Aring 		if (DLM_WQ_REMAIN_BYTES(e) >= len) {
12478f2dc78dSAlexander Aring 			kref_get(&e->ref);
12488f2dc78dSAlexander Aring 
1249f0747ebfSAlexander Aring 			*ppc = page_address(e->page) + e->end;
12508f2dc78dSAlexander Aring 			if (cb)
12515c16febbSAlexander Aring 				cb(data);
12528f2dc78dSAlexander Aring 
1253f0747ebfSAlexander Aring 			e->end += len;
1254f0747ebfSAlexander Aring 			e->users++;
1255be3b0400SAlexander Aring 			goto out;
1256f0747ebfSAlexander Aring 		}
1257f0747ebfSAlexander Aring 	}
1258f0747ebfSAlexander Aring 
1259be3b0400SAlexander Aring 	e = new_writequeue_entry(con);
1260f0747ebfSAlexander Aring 	if (!e)
1261be3b0400SAlexander Aring 		goto out;
1262f0747ebfSAlexander Aring 
12638f2dc78dSAlexander Aring 	kref_get(&e->ref);
1264f0747ebfSAlexander Aring 	*ppc = page_address(e->page);
1265f0747ebfSAlexander Aring 	e->end += len;
12668aa31cbfSAlexander Aring 	atomic_inc(&con->writequeue_cnt);
12678f2dc78dSAlexander Aring 	if (cb)
12685c16febbSAlexander Aring 		cb(data);
12698f2dc78dSAlexander Aring 
1270f0747ebfSAlexander Aring 	list_add_tail(&e->list, &con->writequeue);
1271f0747ebfSAlexander Aring 
1272be3b0400SAlexander Aring out:
1273be3b0400SAlexander Aring 	spin_unlock(&con->writequeue_lock);
1274f0747ebfSAlexander Aring 	return e;
1275f0747ebfSAlexander Aring };
1276f0747ebfSAlexander Aring 
12772874d1a6SAlexander Aring static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
12782874d1a6SAlexander Aring 						gfp_t allocation, char **ppc,
12795c16febbSAlexander Aring 						void (*cb)(void *data),
12805c16febbSAlexander Aring 						void *data)
12812874d1a6SAlexander Aring {
12822874d1a6SAlexander Aring 	struct writequeue_entry *e;
12832874d1a6SAlexander Aring 	struct dlm_msg *msg;
12842874d1a6SAlexander Aring 
1285e4dc81edSAlexander Aring 	msg = dlm_allocate_msg(allocation);
12862874d1a6SAlexander Aring 	if (!msg)
12872874d1a6SAlexander Aring 		return NULL;
12882874d1a6SAlexander Aring 
12892874d1a6SAlexander Aring 	kref_init(&msg->ref);
12902874d1a6SAlexander Aring 
1291be3b0400SAlexander Aring 	e = new_wq_entry(con, len, ppc, cb, data);
12922874d1a6SAlexander Aring 	if (!e) {
1293e4dc81edSAlexander Aring 		dlm_free_msg(msg);
12942874d1a6SAlexander Aring 		return NULL;
12952874d1a6SAlexander Aring 	}
12962874d1a6SAlexander Aring 
1297e4dc81edSAlexander Aring 	msg->retransmit = false;
1298e4dc81edSAlexander Aring 	msg->orig_msg = NULL;
12992874d1a6SAlexander Aring 	msg->ppc = *ppc;
13002874d1a6SAlexander Aring 	msg->len = len;
13012874d1a6SAlexander Aring 	msg->entry = e;
13022874d1a6SAlexander Aring 
13032874d1a6SAlexander Aring 	return msg;
13042874d1a6SAlexander Aring }
13052874d1a6SAlexander Aring 
13068f2dc78dSAlexander Aring struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
13075c16febbSAlexander Aring 				     char **ppc, void (*cb)(void *data),
13085c16febbSAlexander Aring 				     void *data)
13096ed7257bSPatrick Caulfield {
13106ed7257bSPatrick Caulfield 	struct connection *con;
13118f2dc78dSAlexander Aring 	struct dlm_msg *msg;
1312b38bc9c2SAlexander Aring 	int idx;
13136ed7257bSPatrick Caulfield 
1314d10a0b88SAlexander Aring 	if (len > DLM_MAX_SOCKET_BUFSIZE ||
1315c45674fbSAlexander Aring 	    len < sizeof(struct dlm_header)) {
1316d10a0b88SAlexander Aring 		BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
1317692f51c8SAlexander Aring 		log_print("failed to allocate a buffer of size %d", len);
1318c45674fbSAlexander Aring 		WARN_ON(1);
1319692f51c8SAlexander Aring 		return NULL;
1320692f51c8SAlexander Aring 	}
1321692f51c8SAlexander Aring 
1322b38bc9c2SAlexander Aring 	idx = srcu_read_lock(&connections_srcu);
13236ed7257bSPatrick Caulfield 	con = nodeid2con(nodeid, allocation);
1324b38bc9c2SAlexander Aring 	if (!con) {
1325b38bc9c2SAlexander Aring 		srcu_read_unlock(&connections_srcu, idx);
13266ed7257bSPatrick Caulfield 		return NULL;
1327b38bc9c2SAlexander Aring 	}
13286ed7257bSPatrick Caulfield 
13295c16febbSAlexander Aring 	msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data);
13308f2dc78dSAlexander Aring 	if (!msg) {
1331b38bc9c2SAlexander Aring 		srcu_read_unlock(&connections_srcu, idx);
1332b38bc9c2SAlexander Aring 		return NULL;
1333b38bc9c2SAlexander Aring 	}
1334b38bc9c2SAlexander Aring 
13358f2dc78dSAlexander Aring 	/* we assume if successful commit must called */
13368f2dc78dSAlexander Aring 	msg->idx = idx;
13378f2dc78dSAlexander Aring 	return msg;
13388f2dc78dSAlexander Aring }
13398f2dc78dSAlexander Aring 
13402874d1a6SAlexander Aring static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
13416ed7257bSPatrick Caulfield {
13428f2dc78dSAlexander Aring 	struct writequeue_entry *e = msg->entry;
13436ed7257bSPatrick Caulfield 	struct connection *con = e->con;
13446ed7257bSPatrick Caulfield 	int users;
13456ed7257bSPatrick Caulfield 
13466ed7257bSPatrick Caulfield 	spin_lock(&con->writequeue_lock);
13478f2dc78dSAlexander Aring 	kref_get(&msg->ref);
13488f2dc78dSAlexander Aring 	list_add(&msg->list, &e->msgs);
13498f2dc78dSAlexander Aring 
13506ed7257bSPatrick Caulfield 	users = --e->users;
13516ed7257bSPatrick Caulfield 	if (users)
13526ed7257bSPatrick Caulfield 		goto out;
1353f0747ebfSAlexander Aring 
1354f0747ebfSAlexander Aring 	e->len = DLM_WQ_LENGTH_BYTES(e);
13556ed7257bSPatrick Caulfield 	spin_unlock(&con->writequeue_lock);
13566ed7257bSPatrick Caulfield 
13576ed7257bSPatrick Caulfield 	queue_work(send_workqueue, &con->swork);
13586ed7257bSPatrick Caulfield 	return;
13596ed7257bSPatrick Caulfield 
13606ed7257bSPatrick Caulfield out:
13616ed7257bSPatrick Caulfield 	spin_unlock(&con->writequeue_lock);
13626ed7257bSPatrick Caulfield 	return;
13636ed7257bSPatrick Caulfield }
13646ed7257bSPatrick Caulfield 
13652874d1a6SAlexander Aring void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
13662874d1a6SAlexander Aring {
13672874d1a6SAlexander Aring 	_dlm_lowcomms_commit_msg(msg);
13682874d1a6SAlexander Aring 	srcu_read_unlock(&connections_srcu, msg->idx);
13692874d1a6SAlexander Aring }
13702874d1a6SAlexander Aring 
13718f2dc78dSAlexander Aring void dlm_lowcomms_put_msg(struct dlm_msg *msg)
13728f2dc78dSAlexander Aring {
13738f2dc78dSAlexander Aring 	kref_put(&msg->ref, dlm_msg_release);
13748f2dc78dSAlexander Aring }
13758f2dc78dSAlexander Aring 
13762874d1a6SAlexander Aring /* does not held connections_srcu, usage workqueue only */
13772874d1a6SAlexander Aring int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
13782874d1a6SAlexander Aring {
13792874d1a6SAlexander Aring 	struct dlm_msg *msg_resend;
13802874d1a6SAlexander Aring 	char *ppc;
13812874d1a6SAlexander Aring 
13822874d1a6SAlexander Aring 	if (msg->retransmit)
13832874d1a6SAlexander Aring 		return 1;
13842874d1a6SAlexander Aring 
13852874d1a6SAlexander Aring 	msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len,
13862874d1a6SAlexander Aring 					      GFP_ATOMIC, &ppc, NULL, NULL);
13872874d1a6SAlexander Aring 	if (!msg_resend)
13882874d1a6SAlexander Aring 		return -ENOMEM;
13892874d1a6SAlexander Aring 
13902874d1a6SAlexander Aring 	msg->retransmit = true;
13912874d1a6SAlexander Aring 	kref_get(&msg->ref);
13922874d1a6SAlexander Aring 	msg_resend->orig_msg = msg;
13932874d1a6SAlexander Aring 
13942874d1a6SAlexander Aring 	memcpy(ppc, msg->ppc, msg->len);
13952874d1a6SAlexander Aring 	_dlm_lowcomms_commit_msg(msg_resend);
13962874d1a6SAlexander Aring 	dlm_lowcomms_put_msg(msg_resend);
13972874d1a6SAlexander Aring 
13982874d1a6SAlexander Aring 	return 0;
13992874d1a6SAlexander Aring }
14002874d1a6SAlexander Aring 
14016ed7257bSPatrick Caulfield /* Send a message */
14026ed7257bSPatrick Caulfield static void send_to_sock(struct connection *con)
14036ed7257bSPatrick Caulfield {
14046ed7257bSPatrick Caulfield 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
14056ed7257bSPatrick Caulfield 	struct writequeue_entry *e;
140666d5955aSAlexander Aring 	int len, offset, ret;
1407f92c8dd7SBob Peterson 	int count = 0;
14086ed7257bSPatrick Caulfield 
14096ed7257bSPatrick Caulfield 	mutex_lock(&con->sock_mutex);
14106ed7257bSPatrick Caulfield 	if (con->sock == NULL)
14116ed7257bSPatrick Caulfield 		goto out_connect;
14126ed7257bSPatrick Caulfield 
14136ed7257bSPatrick Caulfield 	spin_lock(&con->writequeue_lock);
14146ed7257bSPatrick Caulfield 	for (;;) {
141566d5955aSAlexander Aring 		e = con_next_wq(con);
141666d5955aSAlexander Aring 		if (!e)
14176ed7257bSPatrick Caulfield 			break;
14186ed7257bSPatrick Caulfield 
14196ed7257bSPatrick Caulfield 		len = e->len;
14206ed7257bSPatrick Caulfield 		offset = e->offset;
14216ed7257bSPatrick Caulfield 		BUG_ON(len == 0 && e->users == 0);
14226ed7257bSPatrick Caulfield 		spin_unlock(&con->writequeue_lock);
14236ed7257bSPatrick Caulfield 
14241329e3f2SPaolo Bonzini 		ret = kernel_sendpage(con->sock, e->page, offset, len,
14256ed7257bSPatrick Caulfield 				      msg_flags);
142692732376SAlexander Aring 		trace_dlm_send(con->nodeid, ret);
1427d66f8277SPatrick Caulfield 		if (ret == -EAGAIN || ret == 0) {
1428b36930ddSDavid Miller 			if (ret == -EAGAIN &&
14299cd3e072SEric Dumazet 			    test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
1430b36930ddSDavid Miller 			    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
1431b36930ddSDavid Miller 				/* Notify TCP that we're limited by the
1432b36930ddSDavid Miller 				 * application window size.
1433b36930ddSDavid Miller 				 */
1434b36930ddSDavid Miller 				set_bit(SOCK_NOSPACE, &con->sock->flags);
1435b36930ddSDavid Miller 				con->sock->sk->sk_write_pending++;
1436b36930ddSDavid Miller 			}
1437d66f8277SPatrick Caulfield 			cond_resched();
14386ed7257bSPatrick Caulfield 			goto out;
14399c5bef58SYing Xue 		} else if (ret < 0)
1440ba868d9dSAlexander Aring 			goto out;
1441f92c8dd7SBob Peterson 
14426ed7257bSPatrick Caulfield 		/* Don't starve people filling buffers */
1443f92c8dd7SBob Peterson 		if (++count >= MAX_SEND_MSG_COUNT) {
14446ed7257bSPatrick Caulfield 			cond_resched();
1445f92c8dd7SBob Peterson 			count = 0;
1446f92c8dd7SBob Peterson 		}
14476ed7257bSPatrick Caulfield 
14486ed7257bSPatrick Caulfield 		spin_lock(&con->writequeue_lock);
14495d689871SMike Christie 		writequeue_entry_complete(e, ret);
14506ed7257bSPatrick Caulfield 	}
14516ed7257bSPatrick Caulfield 	spin_unlock(&con->writequeue_lock);
14528aa31cbfSAlexander Aring 
14538aa31cbfSAlexander Aring 	/* close if we got EOF */
14548aa31cbfSAlexander Aring 	if (test_and_clear_bit(CF_EOF, &con->flags)) {
14558aa31cbfSAlexander Aring 		mutex_unlock(&con->sock_mutex);
14568aa31cbfSAlexander Aring 		close_connection(con, false, false, true);
14578aa31cbfSAlexander Aring 
14588aa31cbfSAlexander Aring 		/* handling for tcp shutdown */
14598aa31cbfSAlexander Aring 		clear_bit(CF_SHUTDOWN, &con->flags);
14608aa31cbfSAlexander Aring 		wake_up(&con->shutdown_wait);
14618aa31cbfSAlexander Aring 	} else {
14628aa31cbfSAlexander Aring 		mutex_unlock(&con->sock_mutex);
14638aa31cbfSAlexander Aring 	}
14648aa31cbfSAlexander Aring 
14658aa31cbfSAlexander Aring 	return;
14668aa31cbfSAlexander Aring 
14676ed7257bSPatrick Caulfield out:
14686ed7257bSPatrick Caulfield 	mutex_unlock(&con->sock_mutex);
14696ed7257bSPatrick Caulfield 	return;
14706ed7257bSPatrick Caulfield 
14716ed7257bSPatrick Caulfield out_connect:
14726ed7257bSPatrick Caulfield 	mutex_unlock(&con->sock_mutex);
147301da24d3SBob Peterson 	queue_work(send_workqueue, &con->swork);
147401da24d3SBob Peterson 	cond_resched();
14756ed7257bSPatrick Caulfield }
14766ed7257bSPatrick Caulfield 
14776ed7257bSPatrick Caulfield static void clean_one_writequeue(struct connection *con)
14786ed7257bSPatrick Caulfield {
14795e9ccc37SChristine Caulfield 	struct writequeue_entry *e, *safe;
14806ed7257bSPatrick Caulfield 
14816ed7257bSPatrick Caulfield 	spin_lock(&con->writequeue_lock);
14825e9ccc37SChristine Caulfield 	list_for_each_entry_safe(e, safe, &con->writequeue, list) {
14836ed7257bSPatrick Caulfield 		free_entry(e);
14846ed7257bSPatrick Caulfield 	}
14856ed7257bSPatrick Caulfield 	spin_unlock(&con->writequeue_lock);
14866ed7257bSPatrick Caulfield }
14876ed7257bSPatrick Caulfield 
14886ed7257bSPatrick Caulfield /* Called from recovery when it knows that a node has
14896ed7257bSPatrick Caulfield    left the cluster */
14906ed7257bSPatrick Caulfield int dlm_lowcomms_close(int nodeid)
14916ed7257bSPatrick Caulfield {
14926ed7257bSPatrick Caulfield 	struct connection *con;
149336b71a8bSDavid Teigland 	struct dlm_node_addr *na;
1494b38bc9c2SAlexander Aring 	int idx;
14956ed7257bSPatrick Caulfield 
14966ed7257bSPatrick Caulfield 	log_print("closing connection to node %d", nodeid);
1497b38bc9c2SAlexander Aring 	idx = srcu_read_lock(&connections_srcu);
14986ed7257bSPatrick Caulfield 	con = nodeid2con(nodeid, 0);
14996ed7257bSPatrick Caulfield 	if (con) {
1500063c4c99SLars Marowsky-Bree 		set_bit(CF_CLOSE, &con->flags);
15010d737a8cSMarcelo Ricardo Leitner 		close_connection(con, true, true, true);
15026ed7257bSPatrick Caulfield 		clean_one_writequeue(con);
150353a5edaaSAlexander Aring 		if (con->othercon)
150453a5edaaSAlexander Aring 			clean_one_writequeue(con->othercon);
15056ed7257bSPatrick Caulfield 	}
1506b38bc9c2SAlexander Aring 	srcu_read_unlock(&connections_srcu, idx);
150736b71a8bSDavid Teigland 
150836b71a8bSDavid Teigland 	spin_lock(&dlm_node_addrs_spin);
150936b71a8bSDavid Teigland 	na = find_node_addr(nodeid);
151036b71a8bSDavid Teigland 	if (na) {
151136b71a8bSDavid Teigland 		list_del(&na->list);
151236b71a8bSDavid Teigland 		while (na->addr_count--)
151336b71a8bSDavid Teigland 			kfree(na->addr[na->addr_count]);
151436b71a8bSDavid Teigland 		kfree(na);
151536b71a8bSDavid Teigland 	}
151636b71a8bSDavid Teigland 	spin_unlock(&dlm_node_addrs_spin);
151736b71a8bSDavid Teigland 
15186ed7257bSPatrick Caulfield 	return 0;
15196ed7257bSPatrick Caulfield }
15206ed7257bSPatrick Caulfield 
15216ed7257bSPatrick Caulfield /* Receive workqueue function */
15226ed7257bSPatrick Caulfield static void process_recv_sockets(struct work_struct *work)
15236ed7257bSPatrick Caulfield {
15246ed7257bSPatrick Caulfield 	struct connection *con = container_of(work, struct connection, rwork);
15256ed7257bSPatrick Caulfield 
15266ed7257bSPatrick Caulfield 	clear_bit(CF_READ_PENDING, &con->flags);
152762699b3fSAlexander Aring 	receive_from_sock(con);
15286ed7257bSPatrick Caulfield }
15296ed7257bSPatrick Caulfield 
/*
 * Work function installed on listen_con.rwork by dlm_listen_for_all().
 * @work is not used; the function always operates on listen_con.
 */
static void process_listen_recv_socket(struct work_struct *work)
{
	accept_from_sock(&listen_con);
}
1534d11ccd45SAlexander Aring 
15358728a455SAlexander Aring static void dlm_connect(struct connection *con)
15368728a455SAlexander Aring {
15378728a455SAlexander Aring 	struct sockaddr_storage addr;
15388728a455SAlexander Aring 	int result, addr_len;
15398728a455SAlexander Aring 	struct socket *sock;
15408728a455SAlexander Aring 	unsigned int mark;
15418728a455SAlexander Aring 
15428728a455SAlexander Aring 	/* Some odd races can cause double-connects, ignore them */
15438728a455SAlexander Aring 	if (con->retries++ > MAX_CONNECT_RETRIES)
15448728a455SAlexander Aring 		return;
15458728a455SAlexander Aring 
15468728a455SAlexander Aring 	if (con->sock) {
15478728a455SAlexander Aring 		log_print("node %d already connected.", con->nodeid);
15488728a455SAlexander Aring 		return;
15498728a455SAlexander Aring 	}
15508728a455SAlexander Aring 
15518728a455SAlexander Aring 	memset(&addr, 0, sizeof(addr));
15528728a455SAlexander Aring 	result = nodeid_to_addr(con->nodeid, &addr, NULL,
15538728a455SAlexander Aring 				dlm_proto_ops->try_new_addr, &mark);
15548728a455SAlexander Aring 	if (result < 0) {
15558728a455SAlexander Aring 		log_print("no address for nodeid %d", con->nodeid);
15568728a455SAlexander Aring 		return;
15578728a455SAlexander Aring 	}
15588728a455SAlexander Aring 
15598728a455SAlexander Aring 	/* Create a socket to communicate with */
15608728a455SAlexander Aring 	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
15618728a455SAlexander Aring 				  SOCK_STREAM, dlm_proto_ops->proto, &sock);
15628728a455SAlexander Aring 	if (result < 0)
15638728a455SAlexander Aring 		goto socket_err;
15648728a455SAlexander Aring 
15658728a455SAlexander Aring 	sock_set_mark(sock->sk, mark);
15668728a455SAlexander Aring 	dlm_proto_ops->sockopts(sock);
15678728a455SAlexander Aring 
15688728a455SAlexander Aring 	add_sock(sock, con);
15698728a455SAlexander Aring 
15708728a455SAlexander Aring 	result = dlm_proto_ops->bind(sock);
15718728a455SAlexander Aring 	if (result < 0)
15728728a455SAlexander Aring 		goto add_sock_err;
15738728a455SAlexander Aring 
15748728a455SAlexander Aring 	log_print_ratelimited("connecting to %d", con->nodeid);
15758728a455SAlexander Aring 	make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
15768728a455SAlexander Aring 	result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
15778728a455SAlexander Aring 					addr_len);
15788728a455SAlexander Aring 	if (result < 0)
15798728a455SAlexander Aring 		goto add_sock_err;
15808728a455SAlexander Aring 
15818728a455SAlexander Aring 	return;
15828728a455SAlexander Aring 
15838728a455SAlexander Aring add_sock_err:
15848728a455SAlexander Aring 	dlm_close_sock(&con->sock);
15858728a455SAlexander Aring 
15868728a455SAlexander Aring socket_err:
15878728a455SAlexander Aring 	/*
15888728a455SAlexander Aring 	 * Some errors are fatal and this list might need adjusting. For other
15898728a455SAlexander Aring 	 * errors we try again until the max number of retries is reached.
15908728a455SAlexander Aring 	 */
15918728a455SAlexander Aring 	if (result != -EHOSTUNREACH &&
15928728a455SAlexander Aring 	    result != -ENETUNREACH &&
15938728a455SAlexander Aring 	    result != -ENETDOWN &&
15948728a455SAlexander Aring 	    result != -EINVAL &&
15958728a455SAlexander Aring 	    result != -EPROTONOSUPPORT) {
15968728a455SAlexander Aring 		log_print("connect %d try %d error %d", con->nodeid,
15978728a455SAlexander Aring 			  con->retries, result);
15988728a455SAlexander Aring 		msleep(1000);
15998728a455SAlexander Aring 		lowcomms_connect_sock(con);
16008728a455SAlexander Aring 	}
16018728a455SAlexander Aring }
16028728a455SAlexander Aring 
16036ed7257bSPatrick Caulfield /* Send workqueue function */
16046ed7257bSPatrick Caulfield static void process_send_sockets(struct work_struct *work)
16056ed7257bSPatrick Caulfield {
16066ed7257bSPatrick Caulfield 	struct connection *con = container_of(work, struct connection, swork);
16076ed7257bSPatrick Caulfield 
16087443bc96SAlexander Aring 	WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
16097443bc96SAlexander Aring 
16108a4abb08Stsutomu.owa@toshiba.co.jp 	clear_bit(CF_WRITE_PENDING, &con->flags);
1611ba868d9dSAlexander Aring 
1612489d8e55SAlexander Aring 	if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
1613ba868d9dSAlexander Aring 		close_connection(con, false, false, true);
1614489d8e55SAlexander Aring 		dlm_midcomms_unack_msg_resend(con->nodeid);
1615489d8e55SAlexander Aring 	}
1616ba868d9dSAlexander Aring 
16178728a455SAlexander Aring 	if (con->sock == NULL) {
1618ba868d9dSAlexander Aring 		if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
1619ba868d9dSAlexander Aring 			msleep(1000);
16208728a455SAlexander Aring 
16218728a455SAlexander Aring 		mutex_lock(&con->sock_mutex);
16228728a455SAlexander Aring 		dlm_connect(con);
16238728a455SAlexander Aring 		mutex_unlock(&con->sock_mutex);
1624ba868d9dSAlexander Aring 	}
16258728a455SAlexander Aring 
162601da24d3SBob Peterson 	if (!list_empty(&con->writequeue))
16276ed7257bSPatrick Caulfield 		send_to_sock(con);
16286ed7257bSPatrick Caulfield }
16296ed7257bSPatrick Caulfield 
16306ed7257bSPatrick Caulfield static void work_stop(void)
16316ed7257bSPatrick Caulfield {
1632fcef0e6cSAlexander Aring 	if (recv_workqueue) {
16336ed7257bSPatrick Caulfield 		destroy_workqueue(recv_workqueue);
1634fcef0e6cSAlexander Aring 		recv_workqueue = NULL;
1635fcef0e6cSAlexander Aring 	}
1636fcef0e6cSAlexander Aring 
1637fcef0e6cSAlexander Aring 	if (send_workqueue) {
16386ed7257bSPatrick Caulfield 		destroy_workqueue(send_workqueue);
1639fcef0e6cSAlexander Aring 		send_workqueue = NULL;
1640fcef0e6cSAlexander Aring 	}
16416ed7257bSPatrick Caulfield }
16426ed7257bSPatrick Caulfield 
16436ed7257bSPatrick Caulfield static int work_start(void)
16446ed7257bSPatrick Caulfield {
16456c6a1cc6SAlexander Aring 	recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
1646b9d41052SNamhyung Kim 	if (!recv_workqueue) {
1647b9d41052SNamhyung Kim 		log_print("can't start dlm_recv");
1648b9d41052SNamhyung Kim 		return -ENOMEM;
16496ed7257bSPatrick Caulfield 	}
16506ed7257bSPatrick Caulfield 
16516c6a1cc6SAlexander Aring 	send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
1652b9d41052SNamhyung Kim 	if (!send_workqueue) {
1653b9d41052SNamhyung Kim 		log_print("can't start dlm_send");
16546ed7257bSPatrick Caulfield 		destroy_workqueue(recv_workqueue);
1655fcef0e6cSAlexander Aring 		recv_workqueue = NULL;
1656b9d41052SNamhyung Kim 		return -ENOMEM;
16576ed7257bSPatrick Caulfield 	}
16586ed7257bSPatrick Caulfield 
16596ed7257bSPatrick Caulfield 	return 0;
16606ed7257bSPatrick Caulfield }
16616ed7257bSPatrick Caulfield 
16629d232469SAlexander Aring static void shutdown_conn(struct connection *con)
16639d232469SAlexander Aring {
1664a66c008cSAlexander Aring 	if (dlm_proto_ops->shutdown_action)
1665a66c008cSAlexander Aring 		dlm_proto_ops->shutdown_action(con);
16669d232469SAlexander Aring }
16679d232469SAlexander Aring 
16689d232469SAlexander Aring void dlm_lowcomms_shutdown(void)
16699d232469SAlexander Aring {
1670b38bc9c2SAlexander Aring 	int idx;
1671b38bc9c2SAlexander Aring 
16729d232469SAlexander Aring 	/* Set all the flags to prevent any
16739d232469SAlexander Aring 	 * socket activity.
16749d232469SAlexander Aring 	 */
16759d232469SAlexander Aring 	dlm_allow_conn = 0;
16769d232469SAlexander Aring 
16779d232469SAlexander Aring 	if (recv_workqueue)
16789d232469SAlexander Aring 		flush_workqueue(recv_workqueue);
16799d232469SAlexander Aring 	if (send_workqueue)
16809d232469SAlexander Aring 		flush_workqueue(send_workqueue);
16819d232469SAlexander Aring 
16829d232469SAlexander Aring 	dlm_close_sock(&listen_con.sock);
16839d232469SAlexander Aring 
1684b38bc9c2SAlexander Aring 	idx = srcu_read_lock(&connections_srcu);
16859d232469SAlexander Aring 	foreach_conn(shutdown_conn);
1686b38bc9c2SAlexander Aring 	srcu_read_unlock(&connections_srcu, idx);
16879d232469SAlexander Aring }
16889d232469SAlexander Aring 
1689f0fb83cbStsutomu.owa@toshiba.co.jp static void _stop_conn(struct connection *con, bool and_other)
16906ed7257bSPatrick Caulfield {
1691f0fb83cbStsutomu.owa@toshiba.co.jp 	mutex_lock(&con->sock_mutex);
1692173a31feStsutomu.owa@toshiba.co.jp 	set_bit(CF_CLOSE, &con->flags);
1693f0fb83cbStsutomu.owa@toshiba.co.jp 	set_bit(CF_READ_PENDING, &con->flags);
16948a4abb08Stsutomu.owa@toshiba.co.jp 	set_bit(CF_WRITE_PENDING, &con->flags);
169593eaadebStsutomu.owa@toshiba.co.jp 	if (con->sock && con->sock->sk) {
169692c44605SAlexander Aring 		lock_sock(con->sock->sk);
1697afb853fbSPatrick Caulfield 		con->sock->sk->sk_user_data = NULL;
169892c44605SAlexander Aring 		release_sock(con->sock->sk);
169993eaadebStsutomu.owa@toshiba.co.jp 	}
1700f0fb83cbStsutomu.owa@toshiba.co.jp 	if (con->othercon && and_other)
1701f0fb83cbStsutomu.owa@toshiba.co.jp 		_stop_conn(con->othercon, false);
1702f0fb83cbStsutomu.owa@toshiba.co.jp 	mutex_unlock(&con->sock_mutex);
1703f0fb83cbStsutomu.owa@toshiba.co.jp }
1704f0fb83cbStsutomu.owa@toshiba.co.jp 
1705f0fb83cbStsutomu.owa@toshiba.co.jp static void stop_conn(struct connection *con)
1706f0fb83cbStsutomu.owa@toshiba.co.jp {
1707f0fb83cbStsutomu.owa@toshiba.co.jp 	_stop_conn(con, true);
1708afb853fbSPatrick Caulfield }
17095e9ccc37SChristine Caulfield 
17104798cbbfSAlexander Aring static void connection_release(struct rcu_head *rcu)
17114798cbbfSAlexander Aring {
17124798cbbfSAlexander Aring 	struct connection *con = container_of(rcu, struct connection, rcu);
17134798cbbfSAlexander Aring 
17144798cbbfSAlexander Aring 	kfree(con->rx_buf);
17154798cbbfSAlexander Aring 	kfree(con);
17164798cbbfSAlexander Aring }
17174798cbbfSAlexander Aring 
17185e9ccc37SChristine Caulfield static void free_conn(struct connection *con)
17195e9ccc37SChristine Caulfield {
17200d737a8cSMarcelo Ricardo Leitner 	close_connection(con, true, true, true);
1721a47666ebSAlexander Aring 	spin_lock(&connections_lock);
1722a47666ebSAlexander Aring 	hlist_del_rcu(&con->list);
1723a47666ebSAlexander Aring 	spin_unlock(&connections_lock);
1724948c47e9SAlexander Aring 	if (con->othercon) {
1725948c47e9SAlexander Aring 		clean_one_writequeue(con->othercon);
17265cbec208SAlexander Aring 		call_srcu(&connections_srcu, &con->othercon->rcu,
17275cbec208SAlexander Aring 			  connection_release);
1728948c47e9SAlexander Aring 	}
17290de98432SAlexander Aring 	clean_one_writequeue(con);
17305cbec208SAlexander Aring 	call_srcu(&connections_srcu, &con->rcu, connection_release);
17316ed7257bSPatrick Caulfield }
17325e9ccc37SChristine Caulfield 
1733f0fb83cbStsutomu.owa@toshiba.co.jp static void work_flush(void)
1734f0fb83cbStsutomu.owa@toshiba.co.jp {
1735b38bc9c2SAlexander Aring 	int ok;
1736f0fb83cbStsutomu.owa@toshiba.co.jp 	int i;
1737f0fb83cbStsutomu.owa@toshiba.co.jp 	struct connection *con;
1738f0fb83cbStsutomu.owa@toshiba.co.jp 
1739f0fb83cbStsutomu.owa@toshiba.co.jp 	do {
1740f0fb83cbStsutomu.owa@toshiba.co.jp 		ok = 1;
1741f0fb83cbStsutomu.owa@toshiba.co.jp 		foreach_conn(stop_conn);
1742b355516fSDavid Windsor 		if (recv_workqueue)
1743f0fb83cbStsutomu.owa@toshiba.co.jp 			flush_workqueue(recv_workqueue);
1744b355516fSDavid Windsor 		if (send_workqueue)
1745f0fb83cbStsutomu.owa@toshiba.co.jp 			flush_workqueue(send_workqueue);
1746f0fb83cbStsutomu.owa@toshiba.co.jp 		for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
1747a47666ebSAlexander Aring 			hlist_for_each_entry_rcu(con, &connection_hash[i],
1748a47666ebSAlexander Aring 						 list) {
1749f0fb83cbStsutomu.owa@toshiba.co.jp 				ok &= test_bit(CF_READ_PENDING, &con->flags);
17508a4abb08Stsutomu.owa@toshiba.co.jp 				ok &= test_bit(CF_WRITE_PENDING, &con->flags);
17518a4abb08Stsutomu.owa@toshiba.co.jp 				if (con->othercon) {
1752f0fb83cbStsutomu.owa@toshiba.co.jp 					ok &= test_bit(CF_READ_PENDING,
1753f0fb83cbStsutomu.owa@toshiba.co.jp 						       &con->othercon->flags);
17548a4abb08Stsutomu.owa@toshiba.co.jp 					ok &= test_bit(CF_WRITE_PENDING,
17558a4abb08Stsutomu.owa@toshiba.co.jp 						       &con->othercon->flags);
17568a4abb08Stsutomu.owa@toshiba.co.jp 				}
1757f0fb83cbStsutomu.owa@toshiba.co.jp 			}
1758f0fb83cbStsutomu.owa@toshiba.co.jp 		}
1759f0fb83cbStsutomu.owa@toshiba.co.jp 	} while (!ok);
1760f0fb83cbStsutomu.owa@toshiba.co.jp }
1761f0fb83cbStsutomu.owa@toshiba.co.jp 
17625e9ccc37SChristine Caulfield void dlm_lowcomms_stop(void)
17635e9ccc37SChristine Caulfield {
1764b38bc9c2SAlexander Aring 	int idx;
1765b38bc9c2SAlexander Aring 
1766b38bc9c2SAlexander Aring 	idx = srcu_read_lock(&connections_srcu);
1767f0fb83cbStsutomu.owa@toshiba.co.jp 	work_flush();
17683a8db798SMarcelo Ricardo Leitner 	foreach_conn(free_conn);
1769b38bc9c2SAlexander Aring 	srcu_read_unlock(&connections_srcu, idx);
17706ed7257bSPatrick Caulfield 	work_stop();
1771043697f0SAlexander Aring 	deinit_local();
1772a66c008cSAlexander Aring 
1773a66c008cSAlexander Aring 	dlm_proto_ops = NULL;
17746ed7257bSPatrick Caulfield }
17756ed7257bSPatrick Caulfield 
17762dc6b115SAlexander Aring static int dlm_listen_for_all(void)
17772dc6b115SAlexander Aring {
17782dc6b115SAlexander Aring 	struct socket *sock;
17792dc6b115SAlexander Aring 	int result;
17802dc6b115SAlexander Aring 
17812dc6b115SAlexander Aring 	log_print("Using %s for communications",
17822dc6b115SAlexander Aring 		  dlm_proto_ops->name);
17832dc6b115SAlexander Aring 
17842dc6b115SAlexander Aring 	result = dlm_proto_ops->listen_validate();
17852dc6b115SAlexander Aring 	if (result < 0)
17862dc6b115SAlexander Aring 		return result;
17872dc6b115SAlexander Aring 
17882dc6b115SAlexander Aring 	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
17892dc6b115SAlexander Aring 				  SOCK_STREAM, dlm_proto_ops->proto, &sock);
17902dc6b115SAlexander Aring 	if (result < 0) {
1791fe933675SAlexander Aring 		log_print("Can't create comms socket: %d", result);
17922dc6b115SAlexander Aring 		goto out;
17932dc6b115SAlexander Aring 	}
17942dc6b115SAlexander Aring 
17952dc6b115SAlexander Aring 	sock_set_mark(sock->sk, dlm_config.ci_mark);
17962dc6b115SAlexander Aring 	dlm_proto_ops->listen_sockopts(sock);
17972dc6b115SAlexander Aring 
17982dc6b115SAlexander Aring 	result = dlm_proto_ops->listen_bind(sock);
17992dc6b115SAlexander Aring 	if (result < 0)
18002dc6b115SAlexander Aring 		goto out;
18012dc6b115SAlexander Aring 
18022dc6b115SAlexander Aring 	save_listen_callbacks(sock);
18032dc6b115SAlexander Aring 	add_listen_sock(sock, &listen_con);
18042dc6b115SAlexander Aring 
18052dc6b115SAlexander Aring 	INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
18062dc6b115SAlexander Aring 	result = sock->ops->listen(sock, 5);
18072dc6b115SAlexander Aring 	if (result < 0) {
18082dc6b115SAlexander Aring 		dlm_close_sock(&listen_con.sock);
18092dc6b115SAlexander Aring 		goto out;
18102dc6b115SAlexander Aring 	}
18112dc6b115SAlexander Aring 
18122dc6b115SAlexander Aring 	return 0;
18132dc6b115SAlexander Aring 
18142dc6b115SAlexander Aring out:
18152dc6b115SAlexander Aring 	sock_release(sock);
18162dc6b115SAlexander Aring 	return result;
18172dc6b115SAlexander Aring }
18182dc6b115SAlexander Aring 
18198728a455SAlexander Aring static int dlm_tcp_bind(struct socket *sock)
18208728a455SAlexander Aring {
18218728a455SAlexander Aring 	struct sockaddr_storage src_addr;
18228728a455SAlexander Aring 	int result, addr_len;
18238728a455SAlexander Aring 
18248728a455SAlexander Aring 	/* Bind to our cluster-known address connecting to avoid
18258728a455SAlexander Aring 	 * routing problems.
18268728a455SAlexander Aring 	 */
18278728a455SAlexander Aring 	memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
18288728a455SAlexander Aring 	make_sockaddr(&src_addr, 0, &addr_len);
18298728a455SAlexander Aring 
18308728a455SAlexander Aring 	result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
18318728a455SAlexander Aring 				 addr_len);
18328728a455SAlexander Aring 	if (result < 0) {
18338728a455SAlexander Aring 		/* This *may* not indicate a critical error */
18348728a455SAlexander Aring 		log_print("could not bind for connect: %d", result);
18358728a455SAlexander Aring 	}
18368728a455SAlexander Aring 
18378728a455SAlexander Aring 	return 0;
18388728a455SAlexander Aring }
18398728a455SAlexander Aring 
18408728a455SAlexander Aring static int dlm_tcp_connect(struct connection *con, struct socket *sock,
18418728a455SAlexander Aring 			   struct sockaddr *addr, int addr_len)
18428728a455SAlexander Aring {
18438728a455SAlexander Aring 	int ret;
18448728a455SAlexander Aring 
18458728a455SAlexander Aring 	ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
18468728a455SAlexander Aring 	switch (ret) {
18478728a455SAlexander Aring 	case -EINPROGRESS:
18488728a455SAlexander Aring 		fallthrough;
18498728a455SAlexander Aring 	case 0:
18508728a455SAlexander Aring 		return 0;
18518728a455SAlexander Aring 	}
18528728a455SAlexander Aring 
18538728a455SAlexander Aring 	return ret;
18548728a455SAlexander Aring }
18558728a455SAlexander Aring 
18562dc6b115SAlexander Aring static int dlm_tcp_listen_validate(void)
18572dc6b115SAlexander Aring {
18582dc6b115SAlexander Aring 	/* We don't support multi-homed hosts */
18592dc6b115SAlexander Aring 	if (dlm_local_count > 1) {
18602dc6b115SAlexander Aring 		log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
18612dc6b115SAlexander Aring 		return -EINVAL;
18622dc6b115SAlexander Aring 	}
18632dc6b115SAlexander Aring 
18642dc6b115SAlexander Aring 	return 0;
18652dc6b115SAlexander Aring }
18662dc6b115SAlexander Aring 
18672dc6b115SAlexander Aring static void dlm_tcp_sockopts(struct socket *sock)
18682dc6b115SAlexander Aring {
18692dc6b115SAlexander Aring 	/* Turn off Nagle's algorithm */
18702dc6b115SAlexander Aring 	tcp_sock_set_nodelay(sock->sk);
18712dc6b115SAlexander Aring }
18722dc6b115SAlexander Aring 
18732dc6b115SAlexander Aring static void dlm_tcp_listen_sockopts(struct socket *sock)
18742dc6b115SAlexander Aring {
18752dc6b115SAlexander Aring 	dlm_tcp_sockopts(sock);
18762dc6b115SAlexander Aring 	sock_set_reuseaddr(sock->sk);
18772dc6b115SAlexander Aring }
18782dc6b115SAlexander Aring 
18792dc6b115SAlexander Aring static int dlm_tcp_listen_bind(struct socket *sock)
18802dc6b115SAlexander Aring {
18812dc6b115SAlexander Aring 	int addr_len;
18822dc6b115SAlexander Aring 
18832dc6b115SAlexander Aring 	/* Bind to our port */
18842dc6b115SAlexander Aring 	make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
18852dc6b115SAlexander Aring 	return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0],
18862dc6b115SAlexander Aring 			       addr_len);
18872dc6b115SAlexander Aring }
18882dc6b115SAlexander Aring 
1889a66c008cSAlexander Aring static const struct dlm_proto_ops dlm_tcp_ops = {
18902dc6b115SAlexander Aring 	.name = "TCP",
18912dc6b115SAlexander Aring 	.proto = IPPROTO_TCP,
18928728a455SAlexander Aring 	.connect = dlm_tcp_connect,
18938728a455SAlexander Aring 	.sockopts = dlm_tcp_sockopts,
18948728a455SAlexander Aring 	.bind = dlm_tcp_bind,
18952dc6b115SAlexander Aring 	.listen_validate = dlm_tcp_listen_validate,
18962dc6b115SAlexander Aring 	.listen_sockopts = dlm_tcp_listen_sockopts,
18972dc6b115SAlexander Aring 	.listen_bind = dlm_tcp_listen_bind,
1898a66c008cSAlexander Aring 	.shutdown_action = dlm_tcp_shutdown,
1899a66c008cSAlexander Aring 	.eof_condition = tcp_eof_condition,
1900a66c008cSAlexander Aring };
1901a66c008cSAlexander Aring 
/*
 * Bind a connecting SCTP socket via sctp_bind_addrs(), passing 0 as the
 * port argument.  Returns whatever sctp_bind_addrs() returns.
 */
static int dlm_sctp_bind(struct socket *sock)
{
	int rc;

	rc = sctp_bind_addrs(sock, 0);

	return rc;
}
19068728a455SAlexander Aring 
19078728a455SAlexander Aring static int dlm_sctp_connect(struct connection *con, struct socket *sock,
19088728a455SAlexander Aring 			    struct sockaddr *addr, int addr_len)
19098728a455SAlexander Aring {
19108728a455SAlexander Aring 	int ret;
19118728a455SAlexander Aring 
19128728a455SAlexander Aring 	/*
19138728a455SAlexander Aring 	 * Make sock->ops->connect() function return in specified time,
19148728a455SAlexander Aring 	 * since O_NONBLOCK argument in connect() function does not work here,
19158728a455SAlexander Aring 	 * then, we should restore the default value of this attribute.
19168728a455SAlexander Aring 	 */
19178728a455SAlexander Aring 	sock_set_sndtimeo(sock->sk, 5);
19188728a455SAlexander Aring 	ret = sock->ops->connect(sock, addr, addr_len, 0);
19198728a455SAlexander Aring 	sock_set_sndtimeo(sock->sk, 0);
19208728a455SAlexander Aring 	if (ret < 0)
19218728a455SAlexander Aring 		return ret;
19228728a455SAlexander Aring 
19238728a455SAlexander Aring 	if (!test_and_set_bit(CF_CONNECTED, &con->flags))
19248728a455SAlexander Aring 		log_print("successful connected to node %d", con->nodeid);
19258728a455SAlexander Aring 
19268728a455SAlexander Aring 	return 0;
19278728a455SAlexander Aring }
19288728a455SAlexander Aring 
192990d21fc0SAlexander Aring static int dlm_sctp_listen_validate(void)
193090d21fc0SAlexander Aring {
193190d21fc0SAlexander Aring 	if (!IS_ENABLED(CONFIG_IP_SCTP)) {
193290d21fc0SAlexander Aring 		log_print("SCTP is not enabled by this kernel");
193390d21fc0SAlexander Aring 		return -EOPNOTSUPP;
193490d21fc0SAlexander Aring 	}
193590d21fc0SAlexander Aring 
193690d21fc0SAlexander Aring 	request_module("sctp");
193790d21fc0SAlexander Aring 	return 0;
193890d21fc0SAlexander Aring }
193990d21fc0SAlexander Aring 
19402dc6b115SAlexander Aring static int dlm_sctp_bind_listen(struct socket *sock)
19412dc6b115SAlexander Aring {
19422dc6b115SAlexander Aring 	return sctp_bind_addrs(sock, dlm_config.ci_tcp_port);
19432dc6b115SAlexander Aring }
19442dc6b115SAlexander Aring 
19452dc6b115SAlexander Aring static void dlm_sctp_sockopts(struct socket *sock)
19462dc6b115SAlexander Aring {
19472dc6b115SAlexander Aring 	/* Turn off Nagle's algorithm */
19482dc6b115SAlexander Aring 	sctp_sock_set_nodelay(sock->sk);
19492dc6b115SAlexander Aring 	sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
19502dc6b115SAlexander Aring }
19512dc6b115SAlexander Aring 
1952a66c008cSAlexander Aring static const struct dlm_proto_ops dlm_sctp_ops = {
19532dc6b115SAlexander Aring 	.name = "SCTP",
19542dc6b115SAlexander Aring 	.proto = IPPROTO_SCTP,
19558728a455SAlexander Aring 	.try_new_addr = true,
19568728a455SAlexander Aring 	.connect = dlm_sctp_connect,
19578728a455SAlexander Aring 	.sockopts = dlm_sctp_sockopts,
19588728a455SAlexander Aring 	.bind = dlm_sctp_bind,
195990d21fc0SAlexander Aring 	.listen_validate = dlm_sctp_listen_validate,
19602dc6b115SAlexander Aring 	.listen_sockopts = dlm_sctp_sockopts,
19612dc6b115SAlexander Aring 	.listen_bind = dlm_sctp_bind_listen,
1962a66c008cSAlexander Aring };
1963a66c008cSAlexander Aring 
19646ed7257bSPatrick Caulfield int dlm_lowcomms_start(void)
19656ed7257bSPatrick Caulfield {
19666ed7257bSPatrick Caulfield 	int error = -EINVAL;
19675e9ccc37SChristine Caulfield 	int i;
19685e9ccc37SChristine Caulfield 
19695e9ccc37SChristine Caulfield 	for (i = 0; i < CONN_HASH_SIZE; i++)
19705e9ccc37SChristine Caulfield 		INIT_HLIST_HEAD(&connection_hash[i]);
19716ed7257bSPatrick Caulfield 
19726ed7257bSPatrick Caulfield 	init_local();
19736ed7257bSPatrick Caulfield 	if (!dlm_local_count) {
1974617e82e1SDavid Teigland 		error = -ENOTCONN;
19756ed7257bSPatrick Caulfield 		log_print("no local IP address has been set");
1976513ef596SDavid Teigland 		goto fail;
19776ed7257bSPatrick Caulfield 	}
19786ed7257bSPatrick Caulfield 
1979d11ccd45SAlexander Aring 	INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
1980d11ccd45SAlexander Aring 
1981513ef596SDavid Teigland 	error = work_start();
1982513ef596SDavid Teigland 	if (error)
1983fcef0e6cSAlexander Aring 		goto fail_local;
1984513ef596SDavid Teigland 
1985513ef596SDavid Teigland 	dlm_allow_conn = 1;
19866ed7257bSPatrick Caulfield 
19876ed7257bSPatrick Caulfield 	/* Start listening */
1988ac7d5d03SAlexander Aring 	switch (dlm_config.ci_protocol) {
1989ac7d5d03SAlexander Aring 	case DLM_PROTO_TCP:
1990a66c008cSAlexander Aring 		dlm_proto_ops = &dlm_tcp_ops;
1991ac7d5d03SAlexander Aring 		break;
1992ac7d5d03SAlexander Aring 	case DLM_PROTO_SCTP:
1993a66c008cSAlexander Aring 		dlm_proto_ops = &dlm_sctp_ops;
1994ac7d5d03SAlexander Aring 		break;
1995ac7d5d03SAlexander Aring 	default:
1996ac7d5d03SAlexander Aring 		log_print("Invalid protocol identifier %d set",
1997ac7d5d03SAlexander Aring 			  dlm_config.ci_protocol);
1998ac7d5d03SAlexander Aring 		error = -EINVAL;
19992dc6b115SAlexander Aring 		goto fail_proto_ops;
2000ac7d5d03SAlexander Aring 	}
20012dc6b115SAlexander Aring 
20022dc6b115SAlexander Aring 	error = dlm_listen_for_all();
20036ed7257bSPatrick Caulfield 	if (error)
20042dc6b115SAlexander Aring 		goto fail_listen;
20056ed7257bSPatrick Caulfield 
20066ed7257bSPatrick Caulfield 	return 0;
20076ed7257bSPatrick Caulfield 
20082dc6b115SAlexander Aring fail_listen:
20092dc6b115SAlexander Aring 	dlm_proto_ops = NULL;
20102dc6b115SAlexander Aring fail_proto_ops:
2011513ef596SDavid Teigland 	dlm_allow_conn = 0;
2012d11ccd45SAlexander Aring 	dlm_close_sock(&listen_con.sock);
2013fcef0e6cSAlexander Aring 	work_stop();
2014fcef0e6cSAlexander Aring fail_local:
2015fcef0e6cSAlexander Aring 	deinit_local();
2016513ef596SDavid Teigland fail:
20176ed7257bSPatrick Caulfield 	return error;
20186ed7257bSPatrick Caulfield }
201936b71a8bSDavid Teigland 
202036b71a8bSDavid Teigland void dlm_lowcomms_exit(void)
202136b71a8bSDavid Teigland {
202236b71a8bSDavid Teigland 	struct dlm_node_addr *na, *safe;
202336b71a8bSDavid Teigland 
202436b71a8bSDavid Teigland 	spin_lock(&dlm_node_addrs_spin);
202536b71a8bSDavid Teigland 	list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
202636b71a8bSDavid Teigland 		list_del(&na->list);
202736b71a8bSDavid Teigland 		while (na->addr_count--)
202836b71a8bSDavid Teigland 			kfree(na->addr[na->addr_count]);
202936b71a8bSDavid Teigland 		kfree(na);
203036b71a8bSDavid Teigland 	}
203136b71a8bSDavid Teigland 	spin_unlock(&dlm_node_addrs_spin);
203236b71a8bSDavid Teigland }
2033