// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. ** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* ******************************************************************************/ /* * lowcomms.c * * This is the "low-level" comms layer. * * It is responsible for sending/receiving messages * from other nodes in the cluster. * * Cluster nodes are referred to by their nodeids. nodeids are * simply 32 bit numbers to the locking module - if they need to * be expanded for the cluster infrastructure then that is its * responsibility. It is this layer's * responsibility to resolve these into IP address or * whatever it needs for inter-node communication. * * The comms level is two kernel threads that deal mainly with * the receiving of messages from other nodes and passing them * up to the mid-level comms layer (which understands the * message format) for execution by the locking core, and * a send thread which does all the setting up of connections * to remote nodes and the sending of data. Threads are not allowed * to send their own data because it may cause them to wait in times * of high load. Also, this way, the sending thread can collect together * messages bound for one node and send them in one block. * * lowcomms will choose to use either TCP or SCTP as its transport layer * depending on the configuration variable 'protocol'. This should be set * to 0 (default) for TCP or 1 for SCTP. It should be configured using a * cluster-wide mechanism as it must be the same on all nodes of the cluster * for the DLM to function. 
* */ #include #include #include #include #include #include #include #include #include #include #include #include "dlm_internal.h" #include "lowcomms.h" #include "midcomms.h" #include "memory.h" #include "config.h" #define NEEDED_RMEM (4*1024*1024) /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ struct mutex sock_mutex; unsigned long flags; #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 #define CF_APP_LIMITED 7 #define CF_CLOSING 8 #define CF_CONNECTED 9 #define CF_RECONNECT 10 #define CF_DELAY_CONNECT 11 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; struct connection *othercon; struct connection *sendcon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ unsigned char *rx_buf; int rx_buflen; int rx_leftover; int mark; int addr_count; int curr_addr_index; struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT]; spinlock_t addrs_lock; struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) struct listen_connection { struct socket *sock; struct work_struct rwork; }; #define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - e->end) #define DLM_WQ_LENGTH_BYTES(e) (e->end - e->offset) /* An entry waiting to be sent */ struct writequeue_entry { struct list_head list; struct page *page; int offset; int len; int end; int users; bool dirty; struct connection *con; struct list_head msgs; struct kref ref; }; struct dlm_msg { struct writequeue_entry *entry; struct dlm_msg *orig_msg; bool retransmit; void *ppc; int len; int idx; /* new()/commit() idx exchange */ struct list_head list; struct kref ref; }; struct dlm_proto_ops { bool try_new_addr; const char *name; int proto; 
int (*connect)(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len); void (*sockopts)(struct socket *sock); int (*bind)(struct socket *sock); int (*listen_validate)(void); void (*listen_sockopts)(struct socket *sock); int (*listen_bind)(struct socket *sock); }; static struct listen_sock_callbacks { void (*sk_error_report)(struct sock *); void (*sk_data_ready)(struct sock *); void (*sk_state_change)(struct sock *); void (*sk_write_space)(struct sock *); } listen_sock; static struct listen_connection listen_con; static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; /* Work queues */ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); DEFINE_STATIC_SRCU(connections_srcu); static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); bool dlm_lowcomms_is_running(void) { return !!listen_con.sock; } static void writequeue_entry_ctor(void *data) { struct writequeue_entry *entry = data; INIT_LIST_HEAD(&entry->msgs); } struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) { return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry), 0, 0, writequeue_entry_ctor); } struct kmem_cache *dlm_lowcomms_msg_cache_create(void) { return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL); } /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { struct writequeue_entry *e; e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more. 
*/ if (!e || e->users || e->len == 0) return NULL; return e; } static struct connection *__find_con(int nodeid, int r) { struct connection *con; hlist_for_each_entry_rcu(con, &connection_hash[r], list) { if (con->nodeid == nodeid) return con; } return NULL; } static int dlm_con_init(struct connection *con, int nodeid) { con->rx_buflen = dlm_config.ci_buffer_size; con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); if (!con->rx_buf) return -ENOMEM; con->nodeid = nodeid; mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); return 0; } /* * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; int r, ret; r = nodeid_hash(nodeid); con = __find_con(nodeid, r); if (con || !alloc) return con; con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; ret = dlm_con_init(con, nodeid); if (ret) { kfree(con); return NULL; } spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() * we just check in rare cases of recently added nodes again * under protection of connections_lock. If this is the case we * abort our connection creation and return the existing connection. 
*/ tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); kfree(con->rx_buf); kfree(con); return tmp; } hlist_add_head_rcu(&con->list, &connection_hash[r]); spin_unlock(&connections_lock); return con; } /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { int i; struct connection *con; for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } } static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { switch (x->ss_family) { case AF_INET: { struct sockaddr_in *sinx = (struct sockaddr_in *)x; struct sockaddr_in *siny = (struct sockaddr_in *)y; if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) return 0; if (sinx->sin_port != siny->sin_port) return 0; break; } case AF_INET6: { struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) return 0; if (sinx->sin6_port != siny->sin6_port) return 0; break; } default: return 0; } return 1; } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, struct sockaddr *sa_out, bool try_new_addr, unsigned int *mark) { struct sockaddr_storage sas; struct connection *con; int idx; if (!dlm_local_count) return -1; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (!con) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } spin_lock(&con->addrs_lock); if (!con->addr_count) { spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } memcpy(&sas, &con->addr[con->curr_addr_index], sizeof(struct sockaddr_storage)); if (try_new_addr) { con->curr_addr_index++; if (con->curr_addr_index == con->addr_count) con->curr_addr_index = 0; } *mark = con->mark; spin_unlock(&con->addrs_lock); if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); if (!sa_out) { srcu_read_unlock(&connections_srcu, 
idx); return 0; } if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; } else { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &sas; struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) sa_out; ret6->sin6_addr = in6->sin6_addr; } srcu_read_unlock(&connections_srcu, idx); return 0; } static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, unsigned int *mark) { struct connection *con; int i, idx, addr_i; idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { WARN_ON_ONCE(!con->addr_count); spin_lock(&con->addrs_lock); for (addr_i = 0; addr_i < con->addr_count; addr_i++) { if (addr_compare(&con->addr[addr_i], addr)) { *nodeid = con->nodeid; *mark = con->mark; spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return 0; } } spin_unlock(&con->addrs_lock); } } srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } static bool dlm_lowcomms_con_has_addr(const struct connection *con, const struct sockaddr_storage *addr) { int i; for (i = 0; i < con->addr_count; i++) { if (addr_compare(&con->addr[i], addr)) return true; } return false; } int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) { struct connection *con; bool ret, idx; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, GFP_NOFS); if (!con) { srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; } spin_lock(&con->addrs_lock); if (!con->addr_count) { memcpy(&con->addr[0], addr, sizeof(*addr)); con->addr_count = 1; con->mark = dlm_config.ci_mark; spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return 0; } ret = dlm_lowcomms_con_has_addr(con, addr); if (ret) { spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return -EEXIST; } if (con->addr_count >= 
DLM_MAX_ADDR_COUNT) { spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return -ENOSPC; } memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr)); srcu_read_unlock(&connections_srcu, idx); spin_unlock(&con->addrs_lock); return 0; } /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { struct connection *con; con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); } static void lowcomms_listen_data_ready(struct sock *sk) { queue_work(recv_workqueue, &listen_con.rwork); } static void lowcomms_write_space(struct sock *sk) { struct connection *con; con = sock2con(sk); if (!con) return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { log_print("connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); return; } clear_bit(SOCK_NOSPACE, &con->sock->flags); if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } queue_work(send_workqueue, &con->swork); } static inline void lowcomms_connect_sock(struct connection *con) { if (test_bit(CF_CLOSE, &con->flags)) return; queue_work(send_workqueue, &con->swork); cond_resched(); } static void lowcomms_state_change(struct sock *sk) { /* SCTP layer is not calling sk_data_ready when the connection * is done, so we catch the signal through here. Also, it * doesn't switch socket state when entering shutdown, so we * skip the write in that case. 
*/ if (sk->sk_shutdown) { if (sk->sk_shutdown == RCV_SHUTDOWN) lowcomms_data_ready(sk); } else if (sk->sk_state == TCP_ESTABLISHED) { lowcomms_write_space(sk); } } int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; int idx; if (nodeid == dlm_our_nodeid()) return 0; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } lowcomms_connect_sock(con); srcu_read_unlock(&connections_srcu, idx); return 0; } int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) { struct connection *con; int idx; idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (!con) { srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } spin_lock(&con->addrs_lock); con->mark = mark; spin_unlock(&con->addrs_lock); srcu_read_unlock(&connections_srcu, idx); return 0; } static void lowcomms_error_report(struct sock *sk) { struct connection *con; void (*orig_report)(struct sock *) = NULL; struct inet_sock *inet; con = sock2con(sk); if (con == NULL) goto out; orig_report = listen_sock.sk_error_report; inet = inet_sk(sk); switch (sk->sk_family) { case AF_INET: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d at %pI4, dport %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), con->nodeid, &inet->inet_daddr, ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d at %pI6c, " "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(), con->nodeid, &sk->sk_v6_daddr, ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); break; #endif default: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "invalid socket family %d set, " "sk_err=%d/%d\n", dlm_our_nodeid(), sk->sk_family, sk->sk_err, sk->sk_err_soft); goto out; } /* below sendcon only handling */ if (test_bit(CF_IS_OTHERCON, &con->flags)) con = con->sendcon; switch 
(sk->sk_err) { case ECONNREFUSED: set_bit(CF_DELAY_CONNECT, &con->flags); break; default: break; } if (!test_and_set_bit(CF_RECONNECT, &con->flags)) queue_work(send_workqueue, &con->swork); out: if (orig_report) orig_report(sk); } static void restore_callbacks(struct socket *sock) { struct sock *sk = sock->sk; lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; release_sock(sk); } /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; lock_sock(sk); con->sock = sock; sk->sk_user_data = con; /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; release_sock(sk); } /* Add the port number to an IPv6 or 4 sockaddr and return the address length */ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, int *addr_len) { saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = cpu_to_be16(port); *addr_len = sizeof(struct sockaddr_in); memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); } else { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; in6_addr->sin6_port = cpu_to_be16(port); *addr_len = sizeof(struct sockaddr_in6); } memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); } static void dlm_page_release(struct kref *kref) { struct writequeue_entry *e = container_of(kref, struct writequeue_entry, ref); __free_page(e->page); dlm_free_writequeue(e); } static void dlm_msg_release(struct kref *kref) { struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); 
kref_put(&msg->entry->ref, dlm_page_release); dlm_free_msg(msg); } static void free_entry(struct writequeue_entry *e) { struct dlm_msg *msg, *tmp; list_for_each_entry_safe(msg, tmp, &e->msgs, list) { if (msg->orig_msg) { msg->orig_msg->retransmit = false; kref_put(&msg->orig_msg->ref, dlm_msg_release); } list_del(&msg->list); kref_put(&msg->ref, dlm_msg_release); } list_del(&e->list); kref_put(&e->ref, dlm_page_release); } static void dlm_close_sock(struct socket **sock) { if (*sock) { restore_callbacks(*sock); sock_release(*sock); *sock = NULL; } } /* Close a remote connection and tidy up */ static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { bool closing = test_and_set_bit(CF_CLOSING, &con->flags); struct writequeue_entry *e; if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); clear_bit(CF_WRITE_PENDING, &con->flags); } if (rx && !closing && cancel_work_sync(&con->rwork)) { log_print("canceled rwork for node %d", con->nodeid); clear_bit(CF_READ_PENDING, &con->flags); } mutex_lock(&con->sock_mutex); dlm_close_sock(&con->sock); if (con->othercon && and_other) { /* Will only re-enter once. */ close_connection(con->othercon, false, tx, rx); } /* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. * * we can always drop messages because retransmits, but what we * cannot allow is to transmit half messages which may be processed * at the other side. * * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case. 
*/ spin_lock(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_first_entry(&con->writequeue, struct writequeue_entry, list); if (e->dirty) free_entry(e); } spin_unlock(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_APP_LIMITED, &con->flags); clear_bit(CF_CONNECTED, &con->flags); clear_bit(CF_DELAY_CONNECT, &con->flags); clear_bit(CF_RECONNECT, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } static int con_realloc_receive_buf(struct connection *con, int newlen) { unsigned char *newbuf; newbuf = kmalloc(newlen, GFP_NOFS); if (!newbuf) return -ENOMEM; /* copy any leftover from last receive */ if (con->rx_leftover) memmove(newbuf, con->rx_buf, con->rx_leftover); /* swap to new buffer space */ kfree(con->rx_buf); con->rx_buflen = newlen; con->rx_buf = newbuf; return 0; } /* Data received from remote end */ static int receive_from_sock(struct connection *con) { struct msghdr msg; struct kvec iov; int ret, buflen; mutex_lock(&con->sock_mutex); if (con->sock == NULL) { ret = -EAGAIN; goto out_close; } /* realloc if we get new buffer size to read out */ buflen = dlm_config.ci_buffer_size; if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { ret = con_realloc_receive_buf(con, buflen); if (ret < 0) goto out_resched; } for (;;) { /* calculate new buffer parameter regarding last receive and * possible leftover bytes */ iov.iov_base = con->rx_buf + con->rx_leftover; iov.iov_len = con->rx_buflen - con->rx_leftover; memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); trace_dlm_recv(con->nodeid, ret); if (ret == -EAGAIN) break; else if (ret <= 0) goto out_close; /* new buflen according readed bytes and leftover from last receive */ buflen = ret + con->rx_leftover; ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); if (ret < 0) goto out_close; /* calculate leftover bytes 
from process and put it into begin of * the receive buffer, so next receive we have the full message * at the start address of the receive buffer. */ con->rx_leftover = buflen - ret; if (con->rx_leftover) { memmove(con->rx_buf, con->rx_buf + ret, con->rx_leftover); } } dlm_midcomms_receive_done(con->nodeid); mutex_unlock(&con->sock_mutex); return 0; out_resched: if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); mutex_unlock(&con->sock_mutex); return -EAGAIN; out_close: if (ret == 0) { log_print("connection %p got EOF from %d", con, con->nodeid); mutex_unlock(&con->sock_mutex); close_connection(con, false, true, false); /* signal to breaking receive worker */ ret = -1; } else { mutex_unlock(&con->sock_mutex); } return ret; } /* Listening socket is busy, accept a connection */ static int accept_from_sock(struct listen_connection *con) { int result; struct sockaddr_storage peeraddr; struct socket *newsock; int len, idx; int nodeid; struct connection *newcon; struct connection *addcon; unsigned int mark; if (!con->sock) return -ENOTCONN; result = kernel_accept(con->sock, &newsock, O_NONBLOCK); if (result < 0) goto accept_err; /* Get the connected socket's peer */ memset(&peeraddr, 0, sizeof(peeraddr)); len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); if (len < 0) { result = -ECONNABORTED; goto accept_err; } /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { switch (peeraddr.ss_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr; log_print("connect from non cluster IPv4 node %pI4", &sin->sin_addr); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr; log_print("connect from non cluster IPv6 node %pI6c", &sin6->sin6_addr); break; } #endif default: log_print("invalid family from non cluster node"); break; } sock_release(newsock); return -1; } 
log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This * could happen if the two nodes initiate a connection at roughly * the same time and the connections cross on the wire. * In this case we store the incoming one in "othercon" */ idx = srcu_read_lock(&connections_srcu); newcon = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!newcon)) { srcu_read_unlock(&connections_srcu, idx); result = -ENOENT; goto accept_err; } sock_set_mark(newsock->sk, mark); mutex_lock(&newcon->sock_mutex); if (newcon->sock) { struct connection *othercon = newcon->othercon; if (!othercon) { othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } result = dlm_con_init(othercon, nodeid); if (result < 0) { kfree(othercon); mutex_unlock(&newcon->sock_mutex); srcu_read_unlock(&connections_srcu, idx); goto accept_err; } lockdep_set_subclass(&othercon->sock_mutex, 1); set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; othercon->sendcon = newcon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); } mutex_lock(&othercon->sock_mutex); add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); } else { /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. 
*/ add_sock(newsock, newcon); addcon = newcon; } set_bit(CF_CONNECTED, &addcon->flags); mutex_unlock(&newcon->sock_mutex); /* * Add it to the active queue in case we got data * between processing the accept adding the socket * to the read_sockets list */ if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) queue_work(recv_workqueue, &addcon->rwork); srcu_read_unlock(&connections_srcu, idx); return 0; accept_err: if (newsock) sock_release(newsock); if (result != -EAGAIN) log_print("error accepting connection from node: %d", result); return result; } /* * writequeue_entry_complete - try to delete and free write queue entry * @e: write queue entry to try to delete * @completed: bytes completed * * writequeue_lock must be held. */ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) { e->offset += completed; e->len -= completed; /* signal that page was half way transmitted */ e->dirty = true; if (e->len == 0 && e->users == 0) free_entry(e); } /* * sctp_bind_addrs - bind a SCTP socket to all our addresses */ static int sctp_bind_addrs(struct socket *sock, uint16_t port) { struct sockaddr_storage localaddr; struct sockaddr *addr = (struct sockaddr *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr)); make_sockaddr(&localaddr, port, &addr_len); if (!i) result = kernel_bind(sock, addr, addr_len); else result = sock_bind_add(sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", port, i + 1, result); break; } } return result; } /* Get local addresses */ static void init_local(void) { struct sockaddr_storage sas; int i; dlm_local_count = 0; for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) { if (dlm_our_addr(&sas, i)) break; memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas)); } } static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; entry = 
dlm_allocate_writequeue(); if (!entry) return NULL; entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!entry->page) { dlm_free_writequeue(entry); return NULL; } entry->offset = 0; entry->len = 0; entry->end = 0; entry->dirty = false; entry->con = con; entry->users = 1; kref_init(&entry->ref); return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, char **ppc, void (*cb)(void *data), void *data) { struct writequeue_entry *e; spin_lock(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { kref_get(&e->ref); *ppc = page_address(e->page) + e->end; if (cb) cb(data); e->end += len; e->users++; goto out; } } e = new_writequeue_entry(con); if (!e) goto out; kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; if (cb) cb(data); list_add_tail(&e->list, &con->writequeue); out: spin_unlock(&con->writequeue_lock); return e; }; static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, gfp_t allocation, char **ppc, void (*cb)(void *data), void *data) { struct writequeue_entry *e; struct dlm_msg *msg; msg = dlm_allocate_msg(allocation); if (!msg) return NULL; kref_init(&msg->ref); e = new_wq_entry(con, len, ppc, cb, data); if (!e) { dlm_free_msg(msg); return NULL; } msg->retransmit = false; msg->orig_msg = NULL; msg->ppc = *ppc; msg->len = len; msg->entry = e; return msg; } /* avoid false positive for nodes_srcu, unlock happens in * dlm_lowcomms_commit_msg which is a must call if success */ #ifndef __CHECKER__ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, char **ppc, void (*cb)(void *data), void *data) { struct connection *con; struct dlm_msg *msg; int idx; if (len > DLM_MAX_SOCKET_BUFSIZE || len < sizeof(struct dlm_header)) { BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); WARN_ON(1); return NULL; } idx 
= srcu_read_lock(&connections_srcu);
	con = nodeid2con(nodeid, 0);
	if (WARN_ON_ONCE(!con)) {
		srcu_read_unlock(&connections_srcu, idx);
		return NULL;
	}

	msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data);
	if (!msg) {
		srcu_read_unlock(&connections_srcu, idx);
		return NULL;
	}

	/* extra reference, dropped again by dlm_lowcomms_commit_msg() */
	kref_get(&msg->ref);
	/* On success the SRCU read lock stays held: the caller must call
	 * dlm_lowcomms_commit_msg(), which unlocks using this saved index.
	 */
	msg->idx = idx;
	return msg;
}
#endif

/*
 * Attach @msg to its writequeue entry and drop one user from that entry.
 *
 * Under con->writequeue_lock a reference on @msg is taken and the message
 * is linked into the entry's msgs list. When the entry's user count reaches
 * zero, the entry's len is set to its current length (end - offset) and the
 * connection's send work is queued on send_workqueue. While other users are
 * still outstanding nothing is queued.
 *
 * NOTE(review): the reference taken here is presumably dropped when the
 * entry is released; that code is not visible in this part of the file.
 */
static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
	struct writequeue_entry *e = msg->entry;
	struct connection *con = e->con;
	int users;

	spin_lock(&con->writequeue_lock);
	kref_get(&msg->ref);
	list_add(&msg->list, &e->msgs);

	users = --e->users;
	if (users)
		goto out;

	/* last user is done with the entry: fix its length and send it */
	e->len = DLM_WQ_LENGTH_BYTES(e);
	spin_unlock(&con->writequeue_lock);

	queue_work(send_workqueue, &con->swork);
	return;

out:
	spin_unlock(&con->writequeue_lock);
	return;
}

/* Hidden from the static checker to avoid a false positive for
 * connections_srcu: the read lock released here was taken in
 * dlm_lowcomms_new_msg().
 *
 * Commits @msg, releases the SRCU read lock whose index was stored in
 * msg->idx and drops the reference that dlm_lowcomms_new_msg() took for
 * this call.
 */
#ifndef __CHECKER__
void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
	_dlm_lowcomms_commit_msg(msg);
	srcu_read_unlock(&connections_srcu, msg->idx);
	/* pairs with the kref_get() in dlm_lowcomms_new_msg() */
	kref_put(&msg->ref, dlm_msg_release);
}
#endif

/* Drop one reference on @msg; dlm_msg_release() runs on the last put. */
void dlm_lowcomms_put_msg(struct dlm_msg *msg)
{
	kref_put(&msg->ref, dlm_msg_release);
}

/*
 * Queue a copy of @msg for retransmission on the same connection.
 *
 * Does not take connections_srcu itself; per the original note it is meant
 * to be used from workqueue context only.
 *
 * Returns 1 if @msg was already marked as retransmitted (nothing is done),
 * -ENOMEM if no new message could be allocated, and 0 once the copy has
 * been committed.
 */
int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
{
	struct dlm_msg *msg_resend;
	char *ppc;

	if (msg->retransmit)
		return 1;

	msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len,
					      GFP_ATOMIC, &ppc, NULL, NULL);
	if (!msg_resend)
		return -ENOMEM;

	/* the copy keeps the original alive through orig_msg */
	msg->retransmit = true;
	kref_get(&msg->ref);
	msg_resend->orig_msg = msg;

	memcpy(ppc, msg->ppc, msg->len);
	_dlm_lowcomms_commit_msg(msg_resend);
	/* drop the local reference on the copy */
	dlm_lowcomms_put_msg(msg_resend);

	return 0;
}

/*
 * Push queued writequeue entries of @con out through its socket.
 *
 * Holds con->sock_mutex for the whole pass and con->writequeue_lock while
 * looking at the queue; the spinlock is dropped around kernel_sendpage().
 * After MAX_SEND_MSG_COUNT successful sends both locks are released, the
 * task yields and the pass restarts from the top.
 *
 * If the connection has no socket, the send work is requeued instead
 * (process_send_sockets() connects when con->sock is NULL).
 */
static void send_to_sock(struct connection *con)
{
	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
	struct writequeue_entry *e;
	int len, offset, ret;
	int count;

again:
	count = 0;
	mutex_lock(&con->sock_mutex);
	if (con->sock == NULL)
		goto out_connect;

	spin_lock(&con->writequeue_lock);
	for (;;) {
		/* NULL means there is nothing (more) to send right now */
		e = con_next_wq(con);
		if (!e)
			break;

		len = e->len;
		offset = e->offset;
		/* a zero-length entry with no outstanding users is treated
		 * as a fatal inconsistency
		 */
		BUG_ON(len == 0 && e->users == 0);
		spin_unlock(&con->writequeue_lock);

		ret = kernel_sendpage(con->sock, e->page, offset, len,
				      msg_flags);
		trace_dlm_send(con->nodeid, ret);
		if (ret == -EAGAIN || ret == 0) {
			/* nothing was sent: stop this pass */
			if (ret == -EAGAIN &&
			    test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
			    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
				/* Notify TCP that we're limited by the
				 * application window size.
				 */
				set_bit(SOCK_NOSPACE, &con->sock->flags);
				con->sock->sk->sk_write_pending++;
			}
			cond_resched();
			goto out;
		} else if (ret < 0)
			/* any other error: stop, the entry is not completed */
			goto out;

		spin_lock(&con->writequeue_lock);
		/* ret > 0: report the number of bytes that were sent */
		writequeue_entry_complete(e, ret);

		/* Don't starve people filling buffers */
		if (++count >= MAX_SEND_MSG_COUNT) {
			spin_unlock(&con->writequeue_lock);
			mutex_unlock(&con->sock_mutex);
			cond_resched();
			goto again;
		}
	}
	spin_unlock(&con->writequeue_lock);

out:
	mutex_unlock(&con->sock_mutex);
	return;

out_connect:
	/* no socket yet: let the send worker run again */
	mutex_unlock(&con->sock_mutex);
	queue_work(send_workqueue, &con->swork);
	cond_resched();
}

/* Call free_entry() on every entry of @con's writequeue, under
 * con->writequeue_lock.
 */
static void clean_one_writequeue(struct connection *con)
{
	struct writequeue_entry *e, *safe;

	spin_lock(&con->writequeue_lock);
	list_for_each_entry_safe(e, safe, &con->writequeue, list) {
		free_entry(e);
	}
	spin_unlock(&con->writequeue_lock);
}

/* SRCU callback (see the call_srcu() users below): frees the receive
 * buffer and the connection that embeds @rcu.
 */
static void connection_release(struct rcu_head *rcu)
{
	struct connection *con = container_of(rcu, struct connection, rcu);

	kfree(con->rx_buf);
	kfree(con);
}

/*
 * Called from recovery when it knows that a node has left the cluster.
 *
 * Unhashes the connection for @nodeid under connections_lock, closes it,
 * empties its writequeue (and that of con->othercon, if any) and schedules
 * both structures for freeing via call_srcu(). All of this runs inside a
 * connections_srcu read-side section.
 *
 * Returns 0 on success or -ENOENT (with a one-time warning) if no
 * connection exists for @nodeid.
 */
int dlm_lowcomms_close(int nodeid)
{
	struct connection *con;
	int idx;

	log_print("closing connection to node %d", nodeid);

	idx = srcu_read_lock(&connections_srcu);
	con = nodeid2con(nodeid, 0);
	if (WARN_ON_ONCE(!con)) {
		srcu_read_unlock(&connections_srcu, idx);
		return -ENOENT;
	}

	spin_lock(&connections_lock);
	hlist_del_rcu(&con->list);
	spin_unlock(&connections_lock);

	close_connection(con, true, true, true);

	clean_one_writequeue(con);
	call_srcu(&connections_srcu, &con->rcu, connection_release);
	if (con->othercon) {
		clean_one_writequeue(con->othercon);
		/* NOTE(review): this second test looks redundant; nothing
		 * visible here clears con->othercon in between - confirm.
		 */
		if (con->othercon)
			call_srcu(&connections_srcu, &con->othercon->rcu, connection_release);
	}
	srcu_read_unlock(&connections_srcu, idx);

	return 0;
}

/* Receive workqueue function: clears CF_READ_PENDING on the connection
 * that embeds @work, then calls receive_from_sock() for it.
 */
static void process_recv_sockets(struct work_struct *work)
{
	struct connection *con = container_of(work, struct connection, rwork);

	clear_bit(CF_READ_PENDING, &con->flags);
	receive_from_sock(con);
}

/* Work function of the listening socket: keeps calling accept_from_sock()
 * on listen_con for as long as it returns 0.
 */
static void process_listen_recv_socket(struct work_struct *work)
{
	int ret;

	do {
		ret = accept_from_sock(&listen_con);
	} while (!ret);
}

/*
 * Try to establish the outgoing socket for @con.
 *
 * The visible caller, process_send_sockets(), holds con->sock_mutex around
 * this call. con->retries is incremented on every call, including calls
 * that return early. Nothing is returned; on failure the socket is closed
 * again and, unless the error is one of those considered fatal below, a
 * retry is requested through lowcomms_connect_sock() after one second.
 */
static void dlm_connect(struct connection *con)
{
	struct sockaddr_storage addr;
	int result, addr_len;
	struct socket *sock;
	unsigned int mark;

	/* give up once the retry budget is used up */
	if (con->retries++ > MAX_CONNECT_RETRIES)
		return;

	/* Some odd races can cause double-connects, ignore them */
	if (con->sock) {
		log_print("node %d already connected.", con->nodeid);
		return;
	}

	memset(&addr, 0, sizeof(addr));
	result = nodeid_to_addr(con->nodeid, &addr, NULL,
				dlm_proto_ops->try_new_addr, &mark);
	if (result < 0) {
		log_print("no address for nodeid %d", con->nodeid);
		return;
	}

	/* Create a socket to communicate with */
	result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
				  SOCK_STREAM, dlm_proto_ops->proto, &sock);
	if (result < 0)
		goto socket_err;

	sock_set_mark(sock->sk, mark);
	dlm_proto_ops->sockopts(sock);

	/* from here on the socket is torn down through con->sock */
	add_sock(sock, con);

	result = dlm_proto_ops->bind(sock);
	if (result < 0)
		goto add_sock_err;

	log_print_ratelimited("connecting to %d", con->nodeid);
	make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
	result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
					addr_len);
	if (result < 0)
		goto add_sock_err;

	return;

add_sock_err:
	dlm_close_sock(&con->sock);

socket_err:
	/*
	 * Some errors are fatal and this list might need adjusting. For other
	 * errors we try again until the max number of retries is reached.
	 */
	if (result != -EHOSTUNREACH &&
	    result != -ENETUNREACH &&
	    result != -ENETDOWN &&
	    result != -EINVAL &&
	    result != -EPROTONOSUPPORT) {
		log_print("connect %d try %d error %d", con->nodeid,
			  con->retries, result);
		msleep(1000);
		lowcomms_connect_sock(con);
	}
}

/*
 * Send workqueue function.
 *
 * Clears CF_WRITE_PENDING, handles a pending CF_RECONNECT request (close
 * the connection and ask midcomms to resend unacknowledged messages),
 * connects under con->sock_mutex if there is no socket - after a one
 * second delay when CF_DELAY_CONNECT was set - and finally sends whatever
 * is on the writequeue. Warns if it is ever run for an "othercon".
 */
static void process_send_sockets(struct work_struct *work)
{
	struct connection *con = container_of(work, struct connection, swork);

	WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));

	clear_bit(CF_WRITE_PENDING, &con->flags);

	if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
		close_connection(con, false, false, true);
		dlm_midcomms_unack_msg_resend(con->nodeid);
	}

	if (con->sock == NULL) {
		if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
			msleep(1000);

		mutex_lock(&con->sock_mutex);
		dlm_connect(con);
		mutex_unlock(&con->sock_mutex);
	}

	if (!list_empty(&con->writequeue))
		send_to_sock(con);
}

/* Destroy the receive and send workqueues if they exist and reset the
 * pointers to NULL, so calling this more than once is harmless.
 */
static void work_stop(void)
{
	if (recv_workqueue) {
		destroy_workqueue(recv_workqueue);
		recv_workqueue = NULL;
	}

	if (send_workqueue) {
		destroy_workqueue(send_workqueue);
		send_workqueue = NULL;
	}
}

/* Allocate the ordered "dlm_recv" and "dlm_send" workqueues.
 *
 * Returns 0 on success or -ENOMEM; on failure neither workqueue is left
 * allocated by this function.
 */
static int work_start(void)
{
	recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
	if (!recv_workqueue) {
		log_print("can't start dlm_recv");
		return -ENOMEM;
	}

	send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
	if (!send_workqueue) {
		log_print("can't start dlm_send");
		destroy_workqueue(recv_workqueue);
		recv_workqueue = NULL;
		return -ENOMEM;
	}

	return 0;
}

/* Stop accepting connections: restore the listening socket's original
 * sk_data_ready callback (saved in listen_sock by dlm_listen_for_all()),
 * wait for pending listen work and close the listening socket.
 *
 * NOTE(review): listen_con.sock is dereferenced without a NULL check, so
 * this assumes a listening socket exists - confirm against the callers.
 */
void dlm_lowcomms_shutdown(void)
{
	/* stop lowcomms_listen_data_ready calls */
	lock_sock(listen_con.sock->sk);
	listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready;
	release_sock(listen_con.sock->sk);

	cancel_work_sync(&listen_con.rwork);
	dlm_close_sock(&listen_con.sock);
}

/*
 * Shut down the connection to @nodeid.
 *
 * Waits for the connection's send work to finish, discards whatever is
 * still on the writequeues of the connection and its othercon, and closes
 * the connection. With @force false, a non-empty writequeue after the
 * flush triggers a one-time warning (the queue is discarded either way).
 * A missing connection also triggers a one-time warning and nothing else
 * is done.
 */
void dlm_lowcomms_shutdown_node(int nodeid, bool force)
{
	struct connection *con;
	int idx;

	idx = srcu_read_lock(&connections_srcu);
	con = nodeid2con(nodeid, 0);
	if (WARN_ON_ONCE(!con)) {
		srcu_read_unlock(&connections_srcu, idx);
		return;
	}

	flush_work(&con->swork);
	WARN_ON_ONCE(!force && !list_empty(&con->writequeue));
	clean_one_writequeue(con);
	if (con->othercon)
		clean_one_writequeue(con->othercon);
	close_connection(con, true, true, true);
	srcu_read_unlock(&connections_srcu, idx);
}

/*
 * Mark @con as closing and detach it from its socket callbacks.
 *
 * Under con->sock_mutex: sets CF_CLOSE plus both *_PENDING bits and clears
 * sk_user_data of the socket (under lock_sock()) when a socket is present.
 * With @and_other true the same is done for con->othercon, without
 * recursing any further. work_flush() relies on the pending bits set here.
 *
 * NOTE(review): setting the pending bits presumably also keeps the socket
 * callbacks from queueing new work - those paths are not visible here.
 */
static void _stop_conn(struct connection *con, bool and_other)
{
	mutex_lock(&con->sock_mutex);
	set_bit(CF_CLOSE, &con->flags);
	set_bit(CF_READ_PENDING, &con->flags);
	set_bit(CF_WRITE_PENDING, &con->flags);
	if (con->sock && con->sock->sk) {
		lock_sock(con->sock->sk);
		con->sock->sk->sk_user_data = NULL;
		release_sock(con->sock->sk);
	}
	if (con->othercon && and_other)
		_stop_conn(con->othercon, false);
	mutex_unlock(&con->sock_mutex);
}

/* foreach_conn() callback: stop @con together with its othercon. */
static void stop_conn(struct connection *con)
{
	_stop_conn(con, true);
}

/* foreach_conn() callback: close @con via close_connection(). */
static void free_conn(struct connection *con)
{
	close_connection(con, true, true, true);
}

/*
 * Quiesce both workqueues.
 *
 * Each pass stops all connections (which sets their pending bits), flushes
 * the receive and send workqueues and then checks every connection in the
 * hash. The workers clear CF_READ_PENDING / CF_WRITE_PENDING when they run,
 * so a cleared bit after the flush means work ran during this pass and the
 * whole pass is repeated. The loop ends once every connection, and every
 * othercon, still has both bits set.
 */
static void work_flush(void)
{
	int ok;
	int i;
	struct connection *con;

	do {
		ok = 1;
		foreach_conn(stop_conn);
		if (recv_workqueue)
			flush_workqueue(recv_workqueue);
		if (send_workqueue)
			flush_workqueue(send_workqueue);
		/* "&& ok" stops scanning further buckets after a failure */
		for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
			hlist_for_each_entry_rcu(con, &connection_hash[i],
						 list) {
				ok &= test_bit(CF_READ_PENDING, &con->flags);
				ok &= test_bit(CF_WRITE_PENDING, &con->flags);
				if (con->othercon) {
					ok &= test_bit(CF_READ_PENDING,
						       &con->othercon->flags);
					ok &= test_bit(CF_WRITE_PENDING,
						       &con->othercon->flags);
				}
			}
		}
	} while (!ok);
}

/* Stop lowcomms: inside a connections_srcu read-side section quiesce the
 * workqueues and close every connection, then destroy the workqueues and
 * forget the selected protocol operations.
 */
void dlm_lowcomms_stop(void)
{
	int idx;

	idx = srcu_read_lock(&connections_srcu);
	work_flush();
	foreach_conn(free_conn);
	srcu_read_unlock(&connections_srcu, idx);
	work_stop();

	dlm_proto_ops = NULL;
}

/*
 * Create, bind and start the listening socket using dlm_proto_ops.
 *
 * The socket's original callbacks are saved in listen_sock before
 * sk_data_ready is pointed at lowcomms_listen_data_ready(), and the socket
 * is published in listen_con.sock.
 *
 * Returns 0 on success or the negative error of the step that failed. The
 * socket is released on bind failure and closed through listen_con.sock on
 * listen failure.
 */
static int dlm_listen_for_all(void)
{
	struct socket *sock;
	int result;

	log_print("Using %s for communications",
		  dlm_proto_ops->name);

	result = dlm_proto_ops->listen_validate();
	if (result < 0)
		return result;

	result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
				  SOCK_STREAM, dlm_proto_ops->proto, &sock);
	if (result < 0) {
		log_print("Can't create comms socket: %d", result);
		return result;
	}

	sock_set_mark(sock->sk, dlm_config.ci_mark);
	dlm_proto_ops->listen_sockopts(sock);

	result = dlm_proto_ops->listen_bind(sock);
	if (result < 0)
		goto out;

	lock_sock(sock->sk);
	/* remember the original callbacks, see dlm_lowcomms_shutdown() */
	listen_sock.sk_data_ready = sock->sk->sk_data_ready;
	listen_sock.sk_write_space = sock->sk->sk_write_space;
	listen_sock.sk_error_report = sock->sk->sk_error_report;
	listen_sock.sk_state_change = sock->sk->sk_state_change;

	listen_con.sock = sock;

	sock->sk->sk_allocation = GFP_NOFS;
	sock->sk->sk_data_ready = lowcomms_listen_data_ready;
	release_sock(sock->sk);

	/* listen with a backlog of 5 */
	result = sock->ops->listen(sock, 5);
	if (result < 0) {
		dlm_close_sock(&listen_con.sock);
		return result;
	}

	return 0;

out:
	sock_release(sock);
	return result;
}

/* TCP ->bind for outgoing sockets: bind to a copy of dlm_local_addr[0]
 * prepared by make_sockaddr() with port 0. A bind failure is only logged;
 * the function always returns 0.
 */
static int dlm_tcp_bind(struct socket *sock)
{
	struct sockaddr_storage src_addr;
	int result, addr_len;

	/* Bind to our cluster-known address when connecting, to avoid
	 * routing problems.
	 */
	memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
	make_sockaddr(&src_addr, 0, &addr_len);

	result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
				 addr_len);
	if (result < 0) {
		/* This *may* not indicate a critical error */
		log_print("could not bind for connect: %d", result);
	}

	return 0;
}

/* TCP ->connect: start a non-blocking connect. Both an immediate success
 * (0) and -EINPROGRESS are reported as 0; any other result is returned
 * unchanged. @con is not used here.
 */
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
			   struct sockaddr *addr, int addr_len)
{
	int ret;

	ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
	switch (ret) {
	case -EINPROGRESS:
		fallthrough;
	case 0:
		return 0;
	}

	return ret;
}

/* TCP ->listen_validate: returns -EINVAL when more than one local address
 * is configured, 0 otherwise.
 */
static int dlm_tcp_listen_validate(void)
{
	/* We don't support multi-homed hosts */
	if (dlm_local_count > 1) {
		log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
		return -EINVAL;
	}

	return 0;
}

/* TCP ->sockopts for outgoing sockets. */
static void dlm_tcp_sockopts(struct socket *sock)
{
	/* Turn off Nagle's algorithm */
	tcp_sock_set_nodelay(sock->sk);
}

/* TCP ->listen_sockopts: the outgoing socket options plus address reuse. */
static void dlm_tcp_listen_sockopts(struct socket *sock)
{
	dlm_tcp_sockopts(sock);
	sock_set_reuseaddr(sock->sk);
}

/* TCP ->listen_bind: bind the listening socket to dlm_local_addr[0] and
 * the configured port; returns the result of the bind call.
 *
 * NOTE(review): unlike dlm_tcp_bind(), make_sockaddr() is applied to the
 * global dlm_local_addr[0] itself rather than to a copy - presumably it
 * stores the port in place; confirm this is intended.
 */
static int dlm_tcp_listen_bind(struct socket *sock)
{
	int addr_len;

	/* Bind to our port */
	make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
	return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
			       addr_len);
}

/* Protocol operations selected for DLM_PROTO_TCP in dlm_lowcomms_start().
 * try_new_addr is not set, so it is false for TCP.
 */
static const struct dlm_proto_ops dlm_tcp_ops = {
	.name = "TCP",
	.proto = IPPROTO_TCP,
	.connect = dlm_tcp_connect,
	.sockopts = dlm_tcp_sockopts,
	.bind = dlm_tcp_bind,
	.listen_validate = dlm_tcp_listen_validate,
	.listen_sockopts = dlm_tcp_listen_sockopts,
	.listen_bind = dlm_tcp_listen_bind,
};

/* SCTP ->bind for outgoing sockets: sctp_bind_addrs() with port 0. */
static int dlm_sctp_bind(struct socket *sock)
{
	return sctp_bind_addrs(sock, 0);
}

/* SCTP ->connect: connect with a temporary send timeout and, on success,
 * set CF_CONNECTED (logging only when the bit was not set before).
 *
 * Returns 0 on success or the negative error from the connect call.
 */
static int dlm_sctp_connect(struct connection *con, struct socket *sock,
			    struct sockaddr *addr, int addr_len)
{
	int ret;

	/*
	 * Make the sock->ops->connect() call return within a bounded time,
	 * since the O_NONBLOCK argument to connect() does not work here.
	 * Afterwards the default value of the timeout is restored.
	 * NOTE(review): the value 5 is presumably in seconds - confirm.
	 */
	sock_set_sndtimeo(sock->sk, 5);
	ret = sock->ops->connect(sock, addr, addr_len, 0);
	sock_set_sndtimeo(sock->sk, 0);
	if (ret < 0)
		return ret;

	if (!test_and_set_bit(CF_CONNECTED, &con->flags))
		log_print("connected to node %d", con->nodeid);

	return 0;
}

/* SCTP ->listen_validate: returns -EOPNOTSUPP when the kernel was built
 * without CONFIG_IP_SCTP; otherwise requests the "sctp" module and
 * returns 0. The result of request_module() is not checked.
 */
static int dlm_sctp_listen_validate(void)
{
	if (!IS_ENABLED(CONFIG_IP_SCTP)) {
		log_print("SCTP is not enabled by this kernel");
		return -EOPNOTSUPP;
	}

	request_module("sctp");
	return 0;
}

/* SCTP ->listen_bind: sctp_bind_addrs() with the configured port. */
static int dlm_sctp_bind_listen(struct socket *sock)
{
	return sctp_bind_addrs(sock, dlm_config.ci_tcp_port);
}

/* SCTP socket options, used for both outgoing and listening sockets (see
 * dlm_sctp_ops): disable Nagle and set the receive buffer to NEEDED_RMEM.
 */
static void dlm_sctp_sockopts(struct socket *sock)
{
	/* Turn off Nagle's algorithm */
	sctp_sock_set_nodelay(sock->sk);
	sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
}

/* Protocol operations selected for DLM_PROTO_SCTP in dlm_lowcomms_start(). */
static const struct dlm_proto_ops dlm_sctp_ops = {
	.name = "SCTP",
	.proto = IPPROTO_SCTP,
	.try_new_addr = true,
	.connect = dlm_sctp_connect,
	.sockopts = dlm_sctp_sockopts,
	.bind = dlm_sctp_bind,
	.listen_validate = dlm_sctp_listen_validate,
	.listen_sockopts = dlm_sctp_sockopts,
	.listen_bind = dlm_sctp_bind_listen,
};

/*
 * Start lowcomms: set up local addresses, create the workqueues, select
 * the protocol operations from dlm_config.ci_protocol and start listening.
 *
 * Returns 0 on success. On failure returns -ENOTCONN when no local address
 * is configured, -EINVAL for an unknown protocol, or the error from
 * work_start() / dlm_listen_for_all(); workqueues created here are
 * destroyed again and dlm_proto_ops is reset when listening failed.
 */
int dlm_lowcomms_start(void)
{
	int error = -EINVAL;

	init_local();
	if (!dlm_local_count) {
		error = -ENOTCONN;
		log_print("no local IP address has been set");
		goto fail;
	}

	error = work_start();
	if (error)
		goto fail;

	/* Start listening */
	switch (dlm_config.ci_protocol) {
	case DLM_PROTO_TCP:
		dlm_proto_ops = &dlm_tcp_ops;
		break;
	case DLM_PROTO_SCTP:
		dlm_proto_ops = &dlm_sctp_ops;
		break;
	default:
		log_print("Invalid protocol identifier %d set",
			  dlm_config.ci_protocol);
		error = -EINVAL;
		goto fail_proto_ops;
	}

	error = dlm_listen_for_all();
	if (error)
		goto fail_listen;

	return 0;

fail_listen:
	dlm_proto_ops = NULL;
fail_proto_ops:
	work_stop();
fail:
	return error;
}

/* One-time initialisation: empty all connection hash buckets and set up
 * the work item of the listening connection.
 */
void dlm_lowcomms_init(void)
{
	int i;

	for (i = 0; i < CONN_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&connection_hash[i]);

	INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
}

/* Final teardown: inside a connections_srcu read-side section unhash every
 * connection (under connections_lock) and schedule it and its othercon for
 * freeing through call_srcu(). The callbacks can only run after the
 * read-side section has ended.
 */
void dlm_lowcomms_exit(void)
{
	struct connection *con;
	int i, idx;

	idx = srcu_read_lock(&connections_srcu);
	for (i = 0; i < CONN_HASH_SIZE; i++) {
		hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
			spin_lock(&connections_lock);
			hlist_del_rcu(&con->list);
			spin_unlock(&connections_lock);

			if (con->othercon)
				call_srcu(&connections_srcu, &con->othercon->rcu,
					  connection_release);
			call_srcu(&connections_srcu, &con->rcu, connection_release);
		}
	}
	srcu_read_unlock(&connections_srcu, idx);
}