History log of /openbmc/linux/drivers/md/md-cluster.c (Results 26 – 50 of 222)
Revision (<<< Hide revision tags) (Show revision tags >>>) Date Author Comments
Revision tags: v4.17.3, v4.17.2
# 6396bb22 12-Jun-2018 Kees Cook <keescook@chromium.org>

treewide: kzalloc() -> kcalloc()

The kzalloc() function has a 2-factor argument form, kcalloc(). This
patch replaces cases of:

kzalloc(a * b, gfp)

with:
kcalloc(a * b, gfp)

as wel

treewide: kzalloc() -> kcalloc()

The kzalloc() function has a 2-factor argument form, kcalloc(). This
patch replaces cases of:

kzalloc(a * b, gfp)

with:
kcalloc(a * b, gfp)

as well as handling cases of:

kzalloc(a * b * c, gfp)

with:

kzalloc(array3_size(a, b, c), gfp)

as it's slightly less ugly than:

kzalloc_array(array_size(a, b), c, gfp)

This does, however, attempt to ignore constant size factors like:

kzalloc(4 * 1024, gfp)

though any constants defined via macros get caught up in the conversion.

Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.

The Coccinelle script used for this was:

// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@

(
kzalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kzalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)

// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@

(
kzalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)

// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@

(
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)

// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@

- kzalloc
+ kcalloc
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)

// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@

(
kzalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)

// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@

(
kzalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)

// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@

(
kzalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)

// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@

(
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)

// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@

(
kzalloc(sizeof(THING) * C2, ...)
|
kzalloc(sizeof(TYPE) * C2, ...)
|
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(C1 * C2, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * E2
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- E1 * E2
+ E1, E2
, ...)
)

Signed-off-by: Kees Cook <keescook@chromium.org>

show more ...


Revision tags: v4.17.1, v4.17, v4.16, v4.15, v4.13.16, v4.14
# f0e230ad 24-Oct-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: update document for raid10

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>


# b03e0ccb 18-Oct-2017 NeilBrown <neilb@suse.com>

md: remove special meaning of ->quiesce(.., 2)

The '2' argument means "wake up anything that is waiting".
This is an inelegant part of the design and was added
to help support management of suspend_

md: remove special meaning of ->quiesce(.., 2)

The '2' argument means "wake up anything that is waiting".
This is an inelegant part of the design and was added
to help support management of suspend_lo/suspend_hi setting.
Now that suspend_lo/hi is managed in mddev_suspend/resume,
that need is gone.
These is still a couple of places where we call 'quiesce'
with an argument of '2', but they can safely be changed to
call ->quiesce(.., 1); ->quiesce(.., 0) which
achieve the same result at the small cost of pausing IO
briefly.

This removes a small "optimization" from suspend_{hi,lo}_store,
but it isn't clear that optimization served a useful purpose.
The code now is a lot clearer.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 935fe098 10-Oct-2017 Mike Snitzer <snitzer@redhat.com>

md: rename some drivers/md/ files to have an "md-" prefix

Motivated by the desire to illiminate the imprecise nature of
DM-specific patches being unnecessarily sent to both the MD maintainer
and mai

md: rename some drivers/md/ files to have an "md-" prefix

Motivated by the desire to illiminate the imprecise nature of
DM-specific patches being unnecessarily sent to both the MD maintainer
and mailing-list. Which is born out of the fact that DM files also
reside in drivers/md/

Now all MD-specific files in drivers/md/ start with either "raid" or
"md-" and the MAINTAINERS file has been updated accordingly.

Shaohua: don't change module name

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.13.5
# 7a57157a 03-Oct-2017 Colin Ian King <colin.king@canonical.com>

md-cluster: make function cluster_check_sync_size static

The function cluster_check_sync_size is local to the source and does
not need to be in global scope, so make it static.

Cleans up sparse war

md-cluster: make function cluster_check_sync_size static

The function cluster_check_sync_size is local to the source and does
not need to be in global scope, so make it static.

Cleans up sparse warning:
symbol 'cluster_check_sync_size' was not declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.13, v4.12, v4.10.17
# 2dffdc07 16-May-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: fix potential lock issue in add_new_disk

The add_new_disk returns with communication locked if
__sendmsg returns failure, fix it with call unlock_comm
before return.

Reported-by: Dan Ca

md-cluster: fix potential lock issue in add_new_disk

The add_new_disk returns with communication locked if
__sendmsg returns failure, fix it with call unlock_comm
before return.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
CC: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.10.16, v4.10.15, v4.10.14, v4.10.13, v4.10.12, v4.10.11
# 835d89e9 14-Apr-2017 Christophe JAILLET <christophe.jaillet@wanadoo.fr>

md-cluster: Fix a memleak in an error handling path

We know that 'bm_lockres' is NULL here, so 'lockres_free(bm_lockres)' is a
no-op. According to resource handling in case of error a few lines belo

md-cluster: Fix a memleak in an error handling path

We know that 'bm_lockres' is NULL here, so 'lockres_free(bm_lockres)' is a
no-op. According to resource handling in case of error a few lines below,
it is likely that 'bitmap_free(bitmap)' was expected instead.

Fixes: b98938d16a10 ("md-cluster: introduce cluster_check_sync_size")

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.10.10, v4.10.9, v4.10.8, v4.10.7, v4.10.6, v4.10.5, v4.10.4, v4.10.3, v4.10.2
# 818da59f 01-Mar-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: add the support for resize

To update size for cluster raid, we need to make
sure all nodes can perform the change successfully.
However, it is possible that some of them can't do
it due

md-cluster: add the support for resize

To update size for cluster raid, we need to make
sure all nodes can perform the change successfully.
However, it is possible that some of them can't do
it due to failure (bitmap_resize could fail). So
we need to consider the issue before we set the
capacity unconditionally, and we use below steps
to perform sanity check.

1. A change the size, then broadcast METADATA_UPDATED
msg.
2. B and C receive METADATA_UPDATED change the size
excepts call set_capacity, sync_size is not update
if the change failed. Also call bitmap_update_sb
to sync sb to disk.
3. A checks other node's sync_size, if sync_size has
been updated in all nodes, then send CHANGE_CAPACITY
msg otherwise send msg to revert previous change.
4. B and C call set_capacity if receive CHANGE_CAPACITY
msg, otherwise pers->resize will be called to restore
the old value.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# b98938d1 01-Mar-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: introduce cluster_check_sync_size

Support resize is a little complex for clustered
raid, since we need to ensure all the nodes share
the same knowledge about the size of raid.

We achiev

md-cluster: introduce cluster_check_sync_size

Support resize is a little complex for clustered
raid, since we need to ensure all the nodes share
the same knowledge about the size of raid.

We achieve the goal by check the sync_size which
is in each node's bitmap, we can only change the
capacity after cluster_check_sync_size returns 0.

Also, get_bitmap_from_slot is added to get a slot's
bitmap. And we exported some funcs since they are
used in cluster_check_sync_size().

We can also reuse get_bitmap_from_slot to remove
redundant code existed in bitmap_copy_from_slot.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 7da3d203 01-Mar-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: add CHANGE_CAPACITY message type

The msg type CHANGE_CAPACITY is introduced to support
resize clustered raid in later patch, and it is sent
after all the nodes have the same sync_size, r

md-cluster: add CHANGE_CAPACITY message type

The msg type CHANGE_CAPACITY is introduced to support
resize clustered raid in later patch, and it is sent
after all the nodes have the same sync_size, receiver
node just need to set new capacity once received this
msg.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 0ba95977 01-Mar-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: use sync way to handle METADATA_UPDATED msg

Previously, when node received METADATA_UPDATED msg, it just
need to wakeup mddev->thread, then md_reload_sb will be called
eventually.

We ta

md-cluster: use sync way to handle METADATA_UPDATED msg

Previously, when node received METADATA_UPDATED msg, it just
need to wakeup mddev->thread, then md_reload_sb will be called
eventually.

We taken the asynchronous way to avoid a deadlock issue, the
deadlock issue could happen when one node is receiving the
METADATA_UPDATED msg (wants reconfig_mutex) and trying to run
the path:

md_check_recovery -> mddev_trylock(hold reconfig_mutex)
-> md_update_sb-metadata_update_start
(want EX on token however token is
got by the sending node)

Since we will support resizing for clustered raid, and we
need the metadata update handling to be synchronous so that
the initiating node can detect failure, so we need to change
the way for handling METADATA_UPDATED msg.

But, we obviously need to avoid above deadlock with the
sync way. To make this happen, we considered to not hold
reconfig_mutex to call md_reload_sb, if some other thread
has already taken reconfig_mutex and waiting for the 'token',
then process_recvd_msg() can safely call md_reload_sb()
without taking the mutex. This is because we can be certain
that no other thread will take the mutex, and we also certain
that the actions performed by md_reload_sb() won't interfere
with anything that the other thread is in the middle of.

To make this more concrete, we added a new cinfo->state bit
MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD

Which is set in lock_token() just before dlm_lock_sync() is
called, and cleared just after. As lock_token() is always
called with reconfig_mutex() held (the specific case is the
resync_info_update which is distinguished well in previous
patch), if process_recvd_msg() finds that the new bit is set,
then the mutex must be held by some other thread, and it will
keep waiting.

So process_metadata_update() can call md_reload_sb() if either
mddev_trylock() succeeds, or if MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD
is set. The tricky bit is what to do if neither of these apply.
We need to wait. Fortunately mddev_unlock() always calls wake_up()
on mddev->thread->wqueue. So we can get lock_token() to call
wake_up() on that when it sets the bit.

There are also some related changes inside this commit:
1. remove RELOAD_SB related codes since there are not valid anymore.
2. mddev is added into md_cluster_info then we can get mddev inside
lock_token.
3. add new parameter for lock_token to distinguish reconfig_mutex
is held or not.

And, we need to set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD in below:
1. set it before unregister thread, otherwise a deadlock could
appear if stop a resyncing array.
This is because md_unregister_thread(&cinfo->recv_thread) is
blocked by recv_daemon -> process_recvd_msg
-> process_metadata_update.
To resolve the issue, MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
also need to be set before unregister thread.
2. set it in metadata_update_start to fix another deadlock.
a. Node A sends METADATA_UPDATED msg (held Token lock).
b. Node B wants to do resync, and is blocked since it can't
get Token lock, but MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is
not set since the callchain
(md_do_sync -> sync_request
-> resync_info_update
-> sendmsg
-> lock_comm -> lock_token)
doesn't hold reconfig_mutex.
c. Node B trys to update sb (held reconfig_mutex), but stopped
at wait_event() in metadata_update_start since we have set
MD_CLUSTER_SEND_LOCK flag in lock_comm (step 2).
d. Then Node B receives METADATA_UPDATED msg from A, of course
recv_daemon is blocked forever.
Since metadata_update_start always calls lock_token with reconfig_mutex,
we need to set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD here as well, and
lock_token don't need to set it twice unless lock_token is invoked from
lock_comm.

Finally, thanks to Neil for his great idea and help!

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.10.1
# 75df023f 23-Feb-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: remove useless memset from gather_all_resync_info

This memset is not needed. The lvb is already zeroed because
it was recently allocated by lockres_init, which uses kzalloc(),
and read_

md-cluster: remove useless memset from gather_all_resync_info

This memset is not needed. The lvb is already zeroed because
it was recently allocated by lockres_init, which uses kzalloc(),
and read_resync_info() doesn't need it to be zero anyway.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 9c8043f3 23-Feb-2017 Guoqing Jiang <gqjiang@suse.com>

md-cluster: free md_cluster_info if node leave cluster

To avoid memory leak, we need to free the cinfo which
is allocated when node join cluster.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-

md-cluster: free md_cluster_info if node leave cluster

To avoid memory leak, we need to free the cinfo which
is allocated when node join cluster.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.10, v4.9, openbmc-4.4-20161121-1, v4.4.33, v4.4.32, v4.4.31, v4.4.30, v4.4.29, v4.4.28, v4.4.27, v4.7.10, openbmc-4.4-20161021-1, v4.7.9, v4.4.26, v4.7.8, v4.4.25, v4.4.24, v4.7.7, v4.8, v4.4.23, v4.7.6, v4.7.5, v4.4.22, v4.4.21, v4.7.4, v4.7.3, v4.4.20, v4.7.2, v4.4.19, openbmc-4.4-20160819-1, v4.7.1, v4.4.18
# d6385db9 12-Aug-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: make resync lock also could be interruptted

When one node is perform resync or recovery, other nodes
can't get resync lock and could block for a while before
it holds the lock, so we can

md-cluster: make resync lock also could be interruptted

When one node is perform resync or recovery, other nodes
can't get resync lock and could block for a while before
it holds the lock, so we can't stop array immediately for
this scenario.

To make array could be stop quickly, we check MD_CLOSING
in dlm_lock_sync_interruptible to make us can interrupt
the lock request.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 7bcda714 12-Aug-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: introduce dlm_lock_sync_interruptible to fix tasks hang

When some node leaves cluster, then it's bitmap need to be
synced by another node, so "md*_recover" thread is triggered
for the pu

md-cluster: introduce dlm_lock_sync_interruptible to fix tasks hang

When some node leaves cluster, then it's bitmap need to be
synced by another node, so "md*_recover" thread is triggered
for the purpose. However, with below steps. we can find tasks
hang happened either in B or C.

1. Node A create a resyncing cluster raid1, assemble it in
other two nodes (B and C).
2. stop array in B and C.
3. stop array in A.

linux44:~ # ps aux|grep md|grep D
root 5938 0.0 0.1 19852 1964 pts/0 D+ 14:52 0:00 mdadm -S md0
root 5939 0.0 0.0 0 0 ? D 14:52 0:00 [md0_recover]

linux44:~ # cat /proc/5939/stack
[<ffffffffa04cf321>] dlm_lock_sync+0x71/0x90 [md_cluster]
[<ffffffffa04d0705>] recover_bitmaps+0x125/0x220 [md_cluster]
[<ffffffffa052105d>] md_thread+0x16d/0x180 [md_mod]
[<ffffffff8107ad94>] kthread+0xb4/0xc0
[<ffffffff8152a518>] ret_from_fork+0x58/0x90

linux44:~ # cat /proc/5938/stack
[<ffffffff8107afde>] kthread_stop+0x6e/0x120
[<ffffffffa0519da0>] md_unregister_thread+0x40/0x80 [md_mod]
[<ffffffffa04cfd20>] leave+0x70/0x120 [md_cluster]
[<ffffffffa0525e24>] md_cluster_stop+0x14/0x30 [md_mod]
[<ffffffffa05269ab>] bitmap_free+0x14b/0x150 [md_mod]
[<ffffffffa0523f3b>] do_md_stop+0x35b/0x5a0 [md_mod]
[<ffffffffa0524e83>] md_ioctl+0x873/0x1590 [md_mod]
[<ffffffff81288464>] blkdev_ioctl+0x214/0x7d0
[<ffffffff811dd3dd>] block_ioctl+0x3d/0x40
[<ffffffff811b92d4>] do_vfs_ioctl+0x2d4/0x4b0
[<ffffffff811b9538>] SyS_ioctl+0x88/0xa0
[<ffffffff8152a5c9>] system_call_fastpath+0x16/0x1b

The problem is caused by recover_bitmaps can't reliably abort
when the thread is unregistered. So dlm_lock_sync_interruptible
is introduced to detect the thread's situation to fix the problem.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# fccb60a4 12-Aug-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: convert the completion to wait queue

Previously, we used completion to sync between require dlm lock
and sync_ast, however we will have to expose completion.wait
and completion.done in d

md-cluster: convert the completion to wait queue

Previously, we used completion to sync between require dlm lock
and sync_ast, however we will have to expose completion.wait
and completion.done in dlm_lock_sync_interruptible (introduced
later), it is not a common usage for completion, so convert
related things to wait queue.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 5f0aa21d 12-Aug-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: protect md_find_rdev_nr_rcu with rcu lock

We need to use rcu_read_lock/unlock to avoid potential
race.

Reported-by: Shaohua Li <shli@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Sign

md-cluster: protect md_find_rdev_nr_rcu with rcu lock

We need to use rcu_read_lock/unlock to avoid potential
race.

Reported-by: Shaohua Li <shli@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# e3f924d3 12-Aug-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: remove some unnecessary dlm_unlock_sync

Since DLM_LKF_FORCEUNLOCK is used in lockres_free,
we don't need to call dlm_unlock_sync before free
lock resource.

Reviewed-by: NeilBrown <neilb

md-cluster: remove some unnecessary dlm_unlock_sync

Since DLM_LKF_FORCEUNLOCK is used in lockres_free,
we don't need to call dlm_unlock_sync before free
lock resource.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 400cb454 12-Aug-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: use FORCEUNLOCK in lockres_free

For dlm_unlock, we need to pass flag to dlm_unlock as the
third parameter instead of set res->flags.

Also, DLM_LKF_FORCEUNLOCK is more suitable for dlm_u

md-cluster: use FORCEUNLOCK in lockres_free

For dlm_unlock, we need to pass flag to dlm_unlock as the
third parameter instead of set res->flags.

Also, DLM_LKF_FORCEUNLOCK is more suitable for dlm_unlock
since it works even the lock is on waiting or convert queue.

Acked-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 0f6187db 21-Aug-2016 Wei Yongjun <weiyongjun1@huawei.com>

md-cluster: fix error return code in join()

Fix to return error code -ENOMEM from the lockres_init() error
handling case instead of 0, as done elsewhere in this function.

Signed-off-by: Wei Yongjun

md-cluster: fix error return code in join()

Fix to return error code -ENOMEM from the lockres_init() error
handling case instead of 0, as done elsewhere in this function.

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


Revision tags: v4.4.17, openbmc-4.4-20160804-1, v4.4.16, v4.7, openbmc-4.4-20160722-1, openbmc-20160722-1, openbmc-20160713-1, v4.4.15, v4.6.4, v4.6.3, v4.4.14, v4.6.2, v4.4.13, openbmc-20160606-1, v4.6.1, v4.4.12, openbmc-20160521-1, v4.4.11, openbmc-20160518-1, v4.6, v4.4.10, openbmc-20160511-1, openbmc-20160505-1, v4.4.9
# 1fa9a1ad 03-May-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: check the return value of process_recvd_msg

We don't need to run the full path of recv_daemon
if process_recvd_msg doesn't return 0.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-b

md-cluster: check the return value of process_recvd_msg

We don't need to run the full path of recv_daemon
if process_recvd_msg doesn't return 0.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 51e453ae 04-May-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: gather resync infos and enable recv_thread after bitmap is ready

The in-memory bitmap is not ready when node joins cluster,
so it doesn't make sense to make gather_all_resync_info()
call

md-cluster: gather resync infos and enable recv_thread after bitmap is ready

The in-memory bitmap is not ready when node joins cluster,
so it doesn't make sense to make gather_all_resync_info()
called so earlier, we need to call it after the node's
bitmap is setup. Also, recv_thread could be wake up after
node joins cluster, but it could cause problem if node
receives RESYNCING message without persionality since
mddev->pers->quiesce is called in process_suspend_info.

This commit introduces a new cluster interface load_bitmaps
to fix above problems, load_bitmaps is called in bitmap_load
where bitmap and persionality are ready, and load_bitmaps
does the following tasks:

1. call gather_all_resync_info to load all the node's
bitmap info.
2. set MD_CLUSTER_ALREADY_IN_CLUSTER bit to recv_thread
could be wake up, and wake up recv_thread if there is
pending recv event.

Then ack_bast only wakes up recv_thread after IN_CLUSTER
bit is ready otherwise MD_CLUSTER_PENDING_RESYNC_EVENT is
set.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 18c9ff7f 02-May-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: sync bitmap when node received RESYNCING msg

If the node received RESYNCING message which means
another node will perform resync with the area, then
we don't want to do it again in anoth

md-cluster: sync bitmap when node received RESYNCING msg

If the node received RESYNCING message which means
another node will perform resync with the area, then
we don't want to do it again in another node.

Let's set RESYNC_MASK and clear NEEDED_MASK for the
region from old-low to new-low which has finished
syncing, and the region from old-hi to new-hi is about
to syncing, bitmap_sync_with_cluste is introduced for
the purpose.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 1535212c 02-May-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: fix locking when node joins cluster during message broadcast

If a node joins the cluster while a message broadcast
is under way, a lock issue could happen as follows.

For a cluster whic

md-cluster: fix locking when node joins cluster during message broadcast

If a node joins the cluster while a message broadcast
is under way, a lock issue could happen as follows.

For a cluster which included two nodes, if node A is
calling __sendmsg before up-convert CR to EX on ack,
and node B released CR on ack. But if a new node C
joins the cluster and it doesn't receive the message
which A sent before, so it could hold CR on ack before
A up-convert CR to EX on ack.

So a node joining the cluster should get an EX lock on
the "token" first to ensure no broadcast is ongoing,
then release it after held CR on ack.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


# 5b0fb33e 02-May-2016 Guoqing Jiang <gqjiang@suse.com>

md-cluster: unregister thread if err happened

The two threads need to be unregistered if a node
can't join cluster successfully.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang

md-cluster: unregister thread if err happened

The two threads need to be unregistered if a node
can't join cluster successfully.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

show more ...


123456789