xref: /openbmc/linux/fs/ocfs2/cluster/quorum.c (revision 28a45ef8)
1328970deSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
2fa60ce2cSMasahiro Yamada /*
398211489SZach Brown  *
498211489SZach Brown  * Copyright (C) 2005 Oracle.  All rights reserved.
598211489SZach Brown  */
698211489SZach Brown 
798211489SZach Brown /* This quorum hack is only here until we transition to some more rational
898211489SZach Brown  * approach that is driven from userspace.  Honest.  No foolin'.
998211489SZach Brown  *
1098211489SZach Brown  * Imagine two nodes lose network connectivity to each other but they're still
1198211489SZach Brown  * up and operating in every other way.  Presumably a network timeout indicates
1298211489SZach Brown  * that a node is broken and should be recovered.  They can't both recover each
1398211489SZach Brown  * other and both carry on without serialising their access to the file system.
1498211489SZach Brown  * They need to decide who is authoritative.  Now extend that problem to
1598211489SZach Brown  * arbitrary groups of nodes losing connectivity between each other.
1698211489SZach Brown  *
1798211489SZach Brown  * So we declare that a node which has given up on connecting to a majority
1898211489SZach Brown  * of nodes who are still heartbeating will fence itself.
1998211489SZach Brown  *
2098211489SZach Brown  * There are huge opportunities for races here.  After we give up on a node's
2198211489SZach Brown  * connection we need to wait long enough to give heartbeat an opportunity
2298211489SZach Brown  * to declare the node as truly dead.  We also need to be careful with the
2398211489SZach Brown  * race between when we see a node start heartbeating and when we connect
2498211489SZach Brown  * to it.
2598211489SZach Brown  *
2698211489SZach Brown  * So nodes that are in this transition put a hold on the quorum decision
2798211489SZach Brown  * with a counter.  As they fall out of this transition they drop the count
2898211489SZach Brown  * and if they're the last, they fire off the decision.
2998211489SZach Brown  */
3098211489SZach Brown #include <linux/kernel.h>
3198211489SZach Brown #include <linux/workqueue.h>
32bebe6f12SSunil Mushran #include <linux/reboot.h>
3398211489SZach Brown 
3498211489SZach Brown #include "heartbeat.h"
3598211489SZach Brown #include "nodemanager.h"
3698211489SZach Brown #define MLOG_MASK_PREFIX ML_QUORUM
3798211489SZach Brown #include "masklog.h"
3898211489SZach Brown #include "quorum.h"
3998211489SZach Brown 
/* Global quorum state, shared by all the o2quo_* entry points.  Every
 * field is protected by qs_lock; the three counters are cached counts
 * of the bits set in their matching bitmaps. */
static struct o2quo_state {
	spinlock_t		qs_lock;
	struct work_struct	qs_work;	/* runs o2quo_make_decision() */
	int			qs_pending;	/* decision needed once qs_holds drains */
	int			qs_heartbeating; /* nodes currently heartbeating */
	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_connected;	/* nodes we hold a connection to */
	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_holds;	/* nodes delaying the quorum decision */
	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
} o2quo_state;
5198211489SZach Brown 
/* this is horribly heavy-handed.  It should instead flip the file
 * system RO and call some userspace script. */
static void o2quo_fence_self(void)
{
	/* panic spins with interrupts enabled.  with preempt
	 * threads can still schedule, etc, etc */
	o2hb_stop_all_regions();

	/* Fence using the cluster's configured method.  Any value other
	 * than O2NM_FENCE_PANIC warns (if out of range) and falls
	 * through to the reset behaviour. */
	switch (o2nm_single_cluster->cl_fence_method) {
	case O2NM_FENCE_PANIC:
		panic("*** ocfs2 is very sorry to be fencing this system by "
		      "panicing ***\n");
		break;
	default:
		WARN_ON(o2nm_single_cluster->cl_fence_method >=
			O2NM_FENCE_METHODS);
		fallthrough;
	case O2NM_FENCE_RESET:
		printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
		       "system by restarting ***\n");
		emergency_restart();
		break;
	}
}
7698211489SZach Brown 
/*
 * Indicate that a timeout occurred on a heartbeat region write.
 *
 * Once our writes stop landing, the other nodes may decide we are dead
 * and begin recovering us, so we fence ourselves immediately rather
 * than risk scribbling on the disk afterwards.  This doesn't close
 * every writeout-after-recovery window, but it closes some; with real
 * external fencing this hook would disappear entirely.
 */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}
8998211489SZach Brown 
/* Workqueue handler that makes the actual quorum decision.  It is
 * scheduled from o2quo_clear_hold() once the last transition hold has
 * been dropped and qs_pending was set.  If this node can't reach the
 * required share of the heartbeating nodes it fences itself. */
static void o2quo_make_decision(struct work_struct *work)
{
	int quorum;
	int lowest_hb, lowest_reachable = 0, fence = 0;
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	/* the lowest-numbered heartbeating node is used to break the tie
	 * in the even-sized cluster case below */
	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
	if (lowest_hb != O2NM_MAX_NODES)
		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);

	mlog(0, "heartbeating: %d, connected: %d, "
	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");

	/* never fence if we aren't heartbeating ourselves or if we are
	 * the only heartbeating node */
	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
	    qs->qs_heartbeating == 1)
		goto out;

	if (qs->qs_heartbeating & 1) {
		/* the odd numbered cluster case is straight forward --
		 * if we can't talk to the majority we're hosed */
		quorum = (qs->qs_heartbeating + 1)/2;
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		}
	} else {
		/* the even numbered cluster adds the possibility of each half
		 * of the cluster being able to talk amongst themselves.. in
		 * that case we're hosed if we can't talk to the group that has
		 * the lowest numbered node */
		quorum = qs->qs_heartbeating / 2;
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		}
		else if ((qs->qs_connected == quorum) &&
			 !lowest_reachable) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "connected to a half-quorum of %u out of %u "
			     "nodes which doesn't include the lowest active "
			     "node %u\n", quorum, qs->qs_heartbeating,
			     lowest_hb);
			fence = 1;
		}
	}

out:
	if (fence) {
		/* drop the lock before fencing -- on the panic path
		 * o2quo_fence_self() never returns */
		spin_unlock_bh(&qs->qs_lock);
		o2quo_fence_self();
	} else {
		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
			"connected: %d, lowest: %d (%sreachable)\n",
			qs->qs_heartbeating, qs->qs_connected, lowest_hb,
			lowest_reachable ? "" : "un");
		spin_unlock_bh(&qs->qs_lock);

	}

}
16198211489SZach Brown 
/* Take a single hold on behalf of @node, delaying the quorum decision
 * until it is dropped again via o2quo_clear_hold().  A node that
 * already has a hold contributes nothing further.  Caller must hold
 * qs_lock. */
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (test_and_set_bit(node, qs->qs_hold_bm))
		return;		/* this node already holds the decision */

	qs->qs_holds++;
	mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
		        "node %u\n", node);
	mlog(0, "node %u, %d total\n", node, qs->qs_holds);
}
17398211489SZach Brown 
/* Drop @node's hold on the quorum decision, if it had one.  When the
 * last hold drains and a decision is pending, kick the decision work.
 * Caller must hold qs_lock. */
static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (!test_and_clear_bit(node, qs->qs_hold_bm))
		return;		/* no hold outstanding for this node */

	mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
	if (--qs->qs_holds == 0 && qs->qs_pending) {
		qs->qs_pending = 0;
		schedule_work(&qs->qs_work);
	}
	mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
			node, qs->qs_holds);
}
19098211489SZach Brown 
19198211489SZach Brown /* as a node comes up we delay the quorum decision until we know the fate of
19298211489SZach Brown  * the connection.  the hold will be dropped in conn_up or hb_down.  it might be
19398211489SZach Brown  * perpetuated by conn_err until hb_down.  if we already have a conn, we might
19498211489SZach Brown  * be dropping a hold that conn_up got. */
/* A node has started heartbeating.  Record it, and either hold the
 * quorum decision until its connection fate is known, or drop the hold
 * that conn_up may have taken earlier. */
void o2quo_hb_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_heartbeating++;
	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
		        "node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	set_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	if (test_bit(node, qs->qs_conn_bm))
		o2quo_clear_hold(qs, node);	/* conn_up's hold ends here */
	else
		o2quo_set_hold(qs, node);	/* wait to learn the conn fate */

	spin_unlock_bh(&qs->qs_lock);
}
21698211489SZach Brown 
/* hb going down releases any holds we might have had due to this node from
 * conn_up, conn_err, or hb_up */
void o2quo_hb_down(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_heartbeating--;
	mlog_bug_on_msg(qs->qs_heartbeating < 0,
			"node %u, %d heartbeating\n",
			node, qs->qs_heartbeating);
	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	clear_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	/* if this drops the last hold, o2quo_clear_hold() fires off any
	 * pending quorum decision */
	o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}
23898211489SZach Brown 
23998211489SZach Brown /* this tells us that we've decided that the node is still heartbeating
24098211489SZach Brown  * even though we've lost its conn.  it must only be called after conn_err
24198211489SZach Brown  * and indicates that we must now make a quorum decision in the future,
24298211489SZach Brown  * though we might be doing so after waiting for holds to drain.  Here
24398211489SZach Brown  * we'll be dropping the hold from conn_err. */
/* The caller has decided @node is still heartbeating despite its lost
 * connection.  Must follow conn_err.  Flags that a quorum decision is
 * now required and releases conn_err's hold; the decision work runs
 * once all remaining holds drain. */
void o2quo_hb_still_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	mlog(0, "node %u\n", node);

	/* arm the decision, then drop the hold conn_err took */
	qs->qs_pending = 1;
	o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}
25798211489SZach Brown 
25825985edcSLucas De Marchi /* This is analogous to hb_up.  as a node's connection comes up we delay the
25998211489SZach Brown  * quorum decision until we see it heartbeating.  the hold will be droped in
26098211489SZach Brown  * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
261b4d8ed4fSJie Liu  * it's already heartbeating we might be dropping a hold that conn_up got.
26298211489SZach Brown  * */
/* A connection to @node has come up.  Mirrors o2quo_hb_up(): record
 * the connection, then either hold the decision until we see the node
 * heartbeat or release the hold that hb_up may have taken. */
void o2quo_conn_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_connected++;
	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
		        "node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
	set_bit(node, qs->qs_conn_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	if (test_bit(node, qs->qs_hb_bm))
		o2quo_clear_hold(qs, node);	/* hb_up's hold ends here */
	else
		o2quo_set_hold(qs, node);	/* wait until it heartbeats */

	spin_unlock_bh(&qs->qs_lock);
}
28498211489SZach Brown 
/* we've decided that we won't ever be connecting to the node again.  if it's
 * still heartbeating we grab a hold that will delay decisions until either the
 * node stops heartbeating from hb_down or the caller decides that the node is
 * still up and calls still_up */
void o2quo_conn_err(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	/* only adjust state if we actually held a connection; conn_err
	 * may be reported for a node we never marked connected */
	if (test_bit(node, qs->qs_conn_bm)) {
		qs->qs_connected--;
		mlog_bug_on_msg(qs->qs_connected < 0,
				"node %u, connected %d\n",
				node, qs->qs_connected);

		clear_bit(node, qs->qs_conn_bm);

		/* still heartbeating: pin the decision until hb_down or
		 * hb_still_up resolves the node's fate */
		if (test_bit(node, qs->qs_hb_bm))
			o2quo_set_hold(qs, node);
	}

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);


	spin_unlock_bh(&qs->qs_lock);
}
31298211489SZach Brown 
o2quo_init(void)31398211489SZach Brown void o2quo_init(void)
31498211489SZach Brown {
31598211489SZach Brown 	struct o2quo_state *qs = &o2quo_state;
31698211489SZach Brown 
31798211489SZach Brown 	spin_lock_init(&qs->qs_lock);
318c4028958SDavid Howells 	INIT_WORK(&qs->qs_work, o2quo_make_decision);
31998211489SZach Brown }
32098211489SZach Brown 
o2quo_exit(void)32198211489SZach Brown void o2quo_exit(void)
32298211489SZach Brown {
3239b00a818STejun Heo 	struct o2quo_state *qs = &o2quo_state;
3249b00a818STejun Heo 
32543829731STejun Heo 	flush_work(&qs->qs_work);
32698211489SZach Brown }
327