diff options
author | Zhao Heming | 2020-11-19 05:41:34 -0600 |
---|---|---|
committer | Greg Kroah-Hartman | 2020-12-29 06:47:10 -0600 |
commit | dcbda41ad7427cf377f86b1ecf039083e6d7e2f0 (patch) | |
tree | 54c336546332b0bc3fb153084c2374fe30618679 | |
parent | b22fdfd458244b96e200db2c3058121ff0d06cde (diff) | |
download | kernel-dcbda41ad7427cf377f86b1ecf039083e6d7e2f0.tar.gz kernel-dcbda41ad7427cf377f86b1ecf039083e6d7e2f0.tar.xz kernel-dcbda41ad7427cf377f86b1ecf039083e6d7e2f0.zip |
md/cluster: fix deadlock when node is doing resync job
commit bca5b0658020be90b6b504ca514fd80110204f71 upstream.
md-cluster uses MD_CLUSTER_SEND_LOCK to ensure that only one node can send a msg at a time.
During sending msg, node can concurrently receive msg from another node.
When node does resync job, grab token_lockres:EX may trigger a deadlock:
```
nodeA nodeB
-------------------- --------------------
a.
send METADATA_UPDATED
held token_lockres:EX
b.
md_do_sync
resync_info_update
send RESYNCING
+ set MD_CLUSTER_SEND_LOCK
+ wait for holding token_lockres:EX
c.
mdadm /dev/md0 --remove /dev/sdg
+ held reconfig_mutex
+ send REMOVE
+ wait_event(MD_CLUSTER_SEND_LOCK)
d.
recv_daemon //METADATA_UPDATED from A
process_metadata_update
+ (mddev_trylock(mddev) ||
MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD)
//this time, both return false forever
```
Explanation:
a. A send METADATA_UPDATED
This will block another node to send msg
b. B does sync jobs, which will send RESYNCING at intervals.
This will be blocked waiting to hold the token_lockres:EX lock.
c. B does "mdadm --remove", which will send REMOVE.
This will be blocked by step <b>: MD_CLUSTER_SEND_LOCK is 1.
d. B receives the METADATA_UPDATED msg, which was sent from A in step <a>.
This will be blocked by step <c>: <c> already holds the mddev lock, so the
wait_event in process_metadata_update can never take it. (btw,
MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD stays ZERO in this scenario.)
There is a similar deadlock in commit 0ba959774e93
("md-cluster: use sync way to handle METADATA_UPDATED msg")
In that commit, step c is "update sb". This patch step c is
"mdadm --remove".
To fix this issue, we can refer to the solution used in function
metadata_update_start, which performs the same grab-token action.
lock_comm can use the same steps to avoid the deadlock, by moving
MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD from lock_token to lock_comm.
This enlarges the MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD window a little bit,
but it is safe and breaks the deadlock.
Repro steps (I only triggered it 3 times across hundreds of test runs):
two nodes share 3 iSCSI luns: sdg/sdh/sdi. Each lun size is 1GB.
```
ssh root@node2 "mdadm -S --scan"
mdadm -S --scan
for i in {g,h,i};do dd if=/dev/zero of=/dev/sd$i oflag=direct bs=1M \
count=20; done
mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdg /dev/sdh \
--bitmap-chunk=1M
ssh root@node2 "mdadm -A /dev/md0 /dev/sdg /dev/sdh"
sleep 5
mkfs.xfs /dev/md0
mdadm --manage --add /dev/md0 /dev/sdi
mdadm --wait /dev/md0
mdadm --grow --raid-devices=3 /dev/md0
mdadm /dev/md0 --fail /dev/sdg
mdadm /dev/md0 --remove /dev/sdg
mdadm --grow --raid-devices=2 /dev/md0
```
The test script will hang when executing "mdadm --remove".
```
# dump stacks by "echo t > /proc/sysrq-trigger"
md0_cluster_rec D 0 5329 2 0x80004000
Call Trace:
__schedule+0x1f6/0x560
? _cond_resched+0x2d/0x40
? schedule+0x4a/0xb0
? process_metadata_update.isra.0+0xdb/0x140 [md_cluster]
? wait_woken+0x80/0x80
? process_recvd_msg+0x113/0x1d0 [md_cluster]
? recv_daemon+0x9e/0x120 [md_cluster]
? md_thread+0x94/0x160 [md_mod]
? wait_woken+0x80/0x80
? md_congested+0x30/0x30 [md_mod]
? kthread+0x115/0x140
? __kthread_bind_mask+0x60/0x60
? ret_from_fork+0x1f/0x40
mdadm D 0 5423 1 0x00004004
Call Trace:
__schedule+0x1f6/0x560
? __schedule+0x1fe/0x560
? schedule+0x4a/0xb0
? lock_comm.isra.0+0x7b/0xb0 [md_cluster]
? wait_woken+0x80/0x80
? remove_disk+0x4f/0x90 [md_cluster]
? hot_remove_disk+0xb1/0x1b0 [md_mod]
? md_ioctl+0x50c/0xba0 [md_mod]
? wait_woken+0x80/0x80
? blkdev_ioctl+0xa2/0x2a0
? block_ioctl+0x39/0x40
? ksys_ioctl+0x82/0xc0
? __x64_sys_ioctl+0x16/0x20
? do_syscall_64+0x5f/0x150
? entry_SYSCALL_64_after_hwframe+0x44/0xa9
md0_resync D 0 5425 2 0x80004000
Call Trace:
__schedule+0x1f6/0x560
? schedule+0x4a/0xb0
? dlm_lock_sync+0xa1/0xd0 [md_cluster]
? wait_woken+0x80/0x80
? lock_token+0x2d/0x90 [md_cluster]
? resync_info_update+0x95/0x100 [md_cluster]
? raid1_sync_request+0x7d3/0xa40 [raid1]
? md_do_sync.cold+0x737/0xc8f [md_mod]
? md_thread+0x94/0x160 [md_mod]
? md_congested+0x30/0x30 [md_mod]
? kthread+0x115/0x140
? __kthread_bind_mask+0x60/0x60
? ret_from_fork+0x1f/0x40
```
Finally, thanks to Xiao for the solution.
Cc: stable@vger.kernel.org
Signed-off-by: Zhao Heming <heming.zhao@suse.com>
Suggested-by: Xiao Ni <xni@redhat.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | drivers/md/md-cluster.c | 67 | ||||
-rw-r--r-- | drivers/md/md.c | 6 |
2 files changed, 42 insertions, 31 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 035d5ec8e677..71a196acdac1 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c | |||
@@ -659,9 +659,27 @@ out: | |||
659 | * Takes the lock on the TOKEN lock resource so no other | 659 | * Takes the lock on the TOKEN lock resource so no other |
660 | * node can communicate while the operation is underway. | 660 | * node can communicate while the operation is underway. |
661 | */ | 661 | */ |
662 | static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) | 662 | static int lock_token(struct md_cluster_info *cinfo) |
663 | { | 663 | { |
664 | int error, set_bit = 0; | 664 | int error; |
665 | |||
666 | error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); | ||
667 | if (error) { | ||
668 | pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", | ||
669 | __func__, __LINE__, error); | ||
670 | } else { | ||
671 | /* Lock the receive sequence */ | ||
672 | mutex_lock(&cinfo->recv_mutex); | ||
673 | } | ||
674 | return error; | ||
675 | } | ||
676 | |||
677 | /* lock_comm() | ||
678 | * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. | ||
679 | */ | ||
680 | static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) | ||
681 | { | ||
682 | int rv, set_bit = 0; | ||
665 | struct mddev *mddev = cinfo->mddev; | 683 | struct mddev *mddev = cinfo->mddev; |
666 | 684 | ||
667 | /* | 685 | /* |
@@ -672,34 +690,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) | |||
672 | */ | 690 | */ |
673 | if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, | 691 | if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, |
674 | &cinfo->state)) { | 692 | &cinfo->state)) { |
675 | error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, | 693 | rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, |
676 | &cinfo->state); | 694 | &cinfo->state); |
677 | WARN_ON_ONCE(error); | 695 | WARN_ON_ONCE(rv); |
678 | md_wakeup_thread(mddev->thread); | 696 | md_wakeup_thread(mddev->thread); |
679 | set_bit = 1; | 697 | set_bit = 1; |
680 | } | 698 | } |
681 | error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); | ||
682 | if (set_bit) | ||
683 | clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); | ||
684 | 699 | ||
685 | if (error) | ||
686 | pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", | ||
687 | __func__, __LINE__, error); | ||
688 | |||
689 | /* Lock the receive sequence */ | ||
690 | mutex_lock(&cinfo->recv_mutex); | ||
691 | return error; | ||
692 | } | ||
693 | |||
694 | /* lock_comm() | ||
695 | * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. | ||
696 | */ | ||
697 | static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) | ||
698 | { | ||
699 | wait_event(cinfo->wait, | 700 | wait_event(cinfo->wait, |
700 | !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); | 701 | !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); |
701 | 702 | rv = lock_token(cinfo); | |
702 | return lock_token(cinfo, mddev_locked); | 703 | if (set_bit) |
704 | clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); | ||
705 | return rv; | ||
703 | } | 706 | } |
704 | 707 | ||
705 | static void unlock_comm(struct md_cluster_info *cinfo) | 708 | static void unlock_comm(struct md_cluster_info *cinfo) |
@@ -779,9 +782,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, | |||
779 | { | 782 | { |
780 | int ret; | 783 | int ret; |
781 | 784 | ||
782 | lock_comm(cinfo, mddev_locked); | 785 | ret = lock_comm(cinfo, mddev_locked); |
783 | ret = __sendmsg(cinfo, cmsg); | 786 | if (!ret) { |
784 | unlock_comm(cinfo); | 787 | ret = __sendmsg(cinfo, cmsg); |
788 | unlock_comm(cinfo); | ||
789 | } | ||
785 | return ret; | 790 | return ret; |
786 | } | 791 | } |
787 | 792 | ||
@@ -1053,7 +1058,7 @@ static int metadata_update_start(struct mddev *mddev) | |||
1053 | return 0; | 1058 | return 0; |
1054 | } | 1059 | } |
1055 | 1060 | ||
1056 | ret = lock_token(cinfo, 1); | 1061 | ret = lock_token(cinfo); |
1057 | clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); | 1062 | clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); |
1058 | return ret; | 1063 | return ret; |
1059 | } | 1064 | } |
@@ -1171,7 +1176,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) | |||
1171 | int raid_slot = -1; | 1176 | int raid_slot = -1; |
1172 | 1177 | ||
1173 | md_update_sb(mddev, 1); | 1178 | md_update_sb(mddev, 1); |
1174 | lock_comm(cinfo, 1); | 1179 | if (lock_comm(cinfo, 1)) { |
1180 | pr_err("%s: lock_comm failed\n", __func__); | ||
1181 | return; | ||
1182 | } | ||
1175 | 1183 | ||
1176 | memset(&cmsg, 0, sizeof(cmsg)); | 1184 | memset(&cmsg, 0, sizeof(cmsg)); |
1177 | cmsg.type = cpu_to_le32(METADATA_UPDATED); | 1185 | cmsg.type = cpu_to_le32(METADATA_UPDATED); |
@@ -1310,7 +1318,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1310 | cmsg.type = cpu_to_le32(NEWDISK); | 1318 | cmsg.type = cpu_to_le32(NEWDISK); |
1311 | memcpy(cmsg.uuid, uuid, 16); | 1319 | memcpy(cmsg.uuid, uuid, 16); |
1312 | cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); | 1320 | cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); |
1313 | lock_comm(cinfo, 1); | 1321 | if (lock_comm(cinfo, 1)) |
1322 | return -EAGAIN; | ||
1314 | ret = __sendmsg(cinfo, &cmsg); | 1323 | ret = __sendmsg(cinfo, &cmsg); |
1315 | if (ret) { | 1324 | if (ret) { |
1316 | unlock_comm(cinfo); | 1325 | unlock_comm(cinfo); |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 10fcc68665d0..bda2dba8433a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -6537,8 +6537,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) | |||
6537 | goto busy; | 6537 | goto busy; |
6538 | 6538 | ||
6539 | kick_rdev: | 6539 | kick_rdev: |
6540 | if (mddev_is_clustered(mddev)) | 6540 | if (mddev_is_clustered(mddev)) { |
6541 | md_cluster_ops->remove_disk(mddev, rdev); | 6541 | if (md_cluster_ops->remove_disk(mddev, rdev)) |
6542 | goto busy; | ||
6543 | } | ||
6542 | 6544 | ||
6543 | md_kick_rdev_from_array(rdev); | 6545 | md_kick_rdev_from_array(rdev); |
6544 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); | 6546 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |