author     Zhao Heming           2020-11-19 05:41:34 -0600
committer  Greg Kroah-Hartman    2020-12-29 06:47:10 -0600
commit     dcbda41ad7427cf377f86b1ecf039083e6d7e2f0 (patch)
tree       54c336546332b0bc3fb153084c2374fe30618679
parent     b22fdfd458244b96e200db2c3058121ff0d06cde (diff)
md/cluster: fix deadlock when node is doing resync job
commit bca5b0658020be90b6b504ca514fd80110204f71 upstream.

md-cluster uses MD_CLUSTER_SEND_LOCK so that a node can exclusively send a
msg. While sending a msg, a node can concurrently receive a msg from another
node. When a node is doing a resync job, grabbing token_lockres:EX may
trigger a deadlock:

```
        nodeA                       nodeB
--------------------     --------------------
a.
send METADATA_UPDATED
held token_lockres:EX
                          b.
                          md_do_sync
                           resync_info_update
                            send RESYNCING
                             + set MD_CLUSTER_SEND_LOCK
                             + wait for holding token_lockres:EX

                          c.
                          mdadm /dev/md0 --remove /dev/sdg
                           + held reconfig_mutex
                           + send REMOVE
                              + wait_event(MD_CLUSTER_SEND_LOCK)

                          d.
                          recv_daemon //METADATA_UPDATED from A
                           process_metadata_update
                            + (mddev_trylock(mddev) ||
                               MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD)
                              //this time, both return false forever
```

Explanation:
a. A sends METADATA_UPDATED. This blocks every other node from sending a msg.
b. B is doing a resync job, which sends RESYNCING at intervals. This blocks
   waiting to hold the token_lockres:EX lock.
c. B runs "mdadm --remove", which sends REMOVE. This is blocked by step <b>:
   MD_CLUSTER_SEND_LOCK is 1.
d. B receives the METADATA_UPDATED msg which A sent in step <a>. This is
   blocked by step <c>: mdadm holds the mddev lock while sitting in
   wait_event(), so mddev_trylock() can never succeed. (Note that
   MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD stays ZERO in this scenario.)

There is a similar deadlock in commit 0ba959774e93 ("md-cluster: use sync way
to handle METADATA_UPDATED msg"). In that commit step c is "update sb"; in
this patch step c is "mdadm --remove".

To fix this issue, we can follow the solution used in metadata_update_start(),
which grabs the token lock in the same way. lock_comm() can use the same steps
to avoid the deadlock: move the MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD handling
from lock_token() to lock_comm(). This enlarges the
MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD window a little, but it is safe and breaks
the deadlock.

Repro steps (I only triggered it 3 times in hundreds of tests): two nodes
share 3 iSCSI LUNs, sdg/sdh/sdi, each 1GB in size.

```
ssh root@node2 "mdadm -S --scan"
mdadm -S --scan
for i in {g,h,i};do dd if=/dev/zero of=/dev/sd$i oflag=direct bs=1M \
count=20; done
mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdg /dev/sdh \
 --bitmap-chunk=1M
ssh root@node2 "mdadm -A /dev/md0 /dev/sdg /dev/sdh"
sleep 5
mkfs.xfs /dev/md0
mdadm --manage --add /dev/md0 /dev/sdi
mdadm --wait /dev/md0
mdadm --grow --raid-devices=3 /dev/md0
mdadm /dev/md0 --fail /dev/sdg
mdadm /dev/md0 --remove /dev/sdg
mdadm --grow --raid-devices=2 /dev/md0
```

The test script hangs when executing "mdadm --remove".

```
# dump stacks by "echo t > /proc/sysrq-trigger"
md0_cluster_rec D 0 5329 2 0x80004000
Call Trace:
 __schedule+0x1f6/0x560
 ? _cond_resched+0x2d/0x40
 ? schedule+0x4a/0xb0
 ? process_metadata_update.isra.0+0xdb/0x140 [md_cluster]
 ? wait_woken+0x80/0x80
 ? process_recvd_msg+0x113/0x1d0 [md_cluster]
 ? recv_daemon+0x9e/0x120 [md_cluster]
 ? md_thread+0x94/0x160 [md_mod]
 ? wait_woken+0x80/0x80
 ? md_congested+0x30/0x30 [md_mod]
 ? kthread+0x115/0x140
 ? __kthread_bind_mask+0x60/0x60
 ? ret_from_fork+0x1f/0x40

mdadm D 0 5423 1 0x00004004
Call Trace:
 __schedule+0x1f6/0x560
 ? __schedule+0x1fe/0x560
 ? schedule+0x4a/0xb0
 ? lock_comm.isra.0+0x7b/0xb0 [md_cluster]
 ? wait_woken+0x80/0x80
 ? remove_disk+0x4f/0x90 [md_cluster]
 ? hot_remove_disk+0xb1/0x1b0 [md_mod]
 ? md_ioctl+0x50c/0xba0 [md_mod]
 ? wait_woken+0x80/0x80
 ? blkdev_ioctl+0xa2/0x2a0
 ? block_ioctl+0x39/0x40
 ? ksys_ioctl+0x82/0xc0
 ? __x64_sys_ioctl+0x16/0x20
 ? do_syscall_64+0x5f/0x150
 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

md0_resync D 0 5425 2 0x80004000
Call Trace:
 __schedule+0x1f6/0x560
 ? schedule+0x4a/0xb0
 ? dlm_lock_sync+0xa1/0xd0 [md_cluster]
 ? wait_woken+0x80/0x80
 ? lock_token+0x2d/0x90 [md_cluster]
 ? resync_info_update+0x95/0x100 [md_cluster]
 ? raid1_sync_request+0x7d3/0xa40 [raid1]
 ? md_do_sync.cold+0x737/0xc8f [md_mod]
 ? md_thread+0x94/0x160 [md_mod]
 ? md_congested+0x30/0x30 [md_mod]
 ? kthread+0x115/0x140
 ? __kthread_bind_mask+0x60/0x60
 ? ret_from_fork+0x1f/0x40
```

Finally, thanks to Xiao for the solution.

Cc: stable@vger.kernel.org
Signed-off-by: Zhao Heming <heming.zhao@suse.com>
Suggested-by: Xiao Ni <xni@redhat.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
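[Editorial sketch] To make the ordering change easier to see outside the diff,
here is a minimal user-space sketch of the fixed lock_comm()/lock_token()
control flow. It is not kernel code: struct cluster_info, the stub helpers and
main() are invented for illustration, and plain bools stand in for the atomic
bit operations, wait_event() and DLM calls used by md-cluster; only the general
names and the ordering come from the patch.

```
/*
 * Sketch of the reordered lock_comm()/lock_token() logic after this
 * patch: the HOLDING_MUTEX_FOR_RECVD hint is published before the
 * sender can sleep on either the send channel or the token, and is
 * cleared only after both are held, so the receive path is never
 * stuck behind a sleeping sender that holds the mddev lock.
 */
#include <stdbool.h>
#include <stdio.h>

struct cluster_info {
	bool send_lock;          /* stands in for MD_CLUSTER_SEND_LOCK */
	bool holding_for_recvd;  /* MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD */
	bool recv_mutex_held;    /* stands in for recv_mutex */
};

/* Stubs: in the driver these may sleep waiting on other nodes/threads. */
static int dlm_token_lock_ex(struct cluster_info *ci)      { (void)ci; return 0; }
static void wait_for_send_channel(struct cluster_info *ci) { ci->send_lock = true; }
static void wake_recv_thread(struct cluster_info *ci)      { (void)ci; }

static int lock_token(struct cluster_info *ci)
{
	int error = dlm_token_lock_ex(ci);

	if (error)
		fprintf(stderr, "failed to get EX on TOKEN (%d)\n", error);
	else
		ci->recv_mutex_held = true;  /* lock the receive sequence */
	return error;
}

static int lock_comm(struct cluster_info *ci, bool mddev_locked)
{
	bool set_hint = false;
	int rv;

	/*
	 * Publish the hint before any blocking wait below, so the receive
	 * path can make progress without the mddev lock we may be holding.
	 */
	if (mddev_locked && !ci->holding_for_recvd) {
		ci->holding_for_recvd = true;
		wake_recv_thread(ci);
		set_hint = true;
	}

	wait_for_send_channel(ci);  /* may sleep: another sender is active   */
	rv = lock_token(ci);        /* may sleep: token held by another node */

	if (set_hint)
		ci->holding_for_recvd = false;
	return rv;
}

int main(void)
{
	struct cluster_info ci = { 0 };

	/* Caller holds the mddev lock, as the "mdadm --remove" path does. */
	return lock_comm(&ci, true);
}
```

The essential difference from the old code is only the placement: the hint used
to be set and cleared entirely inside lock_token(), after the sender had
already gone to sleep in wait_event(), which is exactly the window the deadlock
exploited.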
-rw-r--r--  drivers/md/md-cluster.c | 67
-rw-r--r--  drivers/md/md.c         |  6
2 files changed, 42 insertions, 31 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 035d5ec8e677..71a196acdac1 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -659,9 +659,27 @@ out:
  * Takes the lock on the TOKEN lock resource so no other
  * node can communicate while the operation is underway.
  */
-static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
+static int lock_token(struct md_cluster_info *cinfo)
 {
-	int error, set_bit = 0;
+	int error;
+
+	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+	if (error) {
+		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
+				__func__, __LINE__, error);
+	} else {
+		/* Lock the receive sequence */
+		mutex_lock(&cinfo->recv_mutex);
+	}
+	return error;
+}
+
+/* lock_comm()
+ * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
+ */
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
+{
+	int rv, set_bit = 0;
 	struct mddev *mddev = cinfo->mddev;
 
 	/*
@@ -672,34 +690,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
 	 */
 	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
 				      &cinfo->state)) {
-		error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+		rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
 					      &cinfo->state);
-		WARN_ON_ONCE(error);
+		WARN_ON_ONCE(rv);
 		md_wakeup_thread(mddev->thread);
 		set_bit = 1;
 	}
-	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
-	if (set_bit)
-		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
 
-	if (error)
-		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
-				__func__, __LINE__, error);
-
-	/* Lock the receive sequence */
-	mutex_lock(&cinfo->recv_mutex);
-	return error;
-}
-
-/* lock_comm()
- * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
- */
-static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
-{
 	wait_event(cinfo->wait,
 		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
-
-	return lock_token(cinfo, mddev_locked);
+	rv = lock_token(cinfo);
+	if (set_bit)
+		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+	return rv;
 }
 
 static void unlock_comm(struct md_cluster_info *cinfo)
@@ -779,9 +782,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
 {
 	int ret;
 
-	lock_comm(cinfo, mddev_locked);
-	ret = __sendmsg(cinfo, cmsg);
-	unlock_comm(cinfo);
+	ret = lock_comm(cinfo, mddev_locked);
+	if (!ret) {
+		ret = __sendmsg(cinfo, cmsg);
+		unlock_comm(cinfo);
+	}
 	return ret;
 }
 
@@ -1053,7 +1058,7 @@ static int metadata_update_start(struct mddev *mddev)
 		return 0;
 	}
 
-	ret = lock_token(cinfo, 1);
+	ret = lock_token(cinfo);
 	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
 	return ret;
 }
@@ -1171,7 +1176,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
 	int raid_slot = -1;
 
 	md_update_sb(mddev, 1);
-	lock_comm(cinfo, 1);
+	if (lock_comm(cinfo, 1)) {
+		pr_err("%s: lock_comm failed\n", __func__);
+		return;
+	}
 
 	memset(&cmsg, 0, sizeof(cmsg));
 	cmsg.type = cpu_to_le32(METADATA_UPDATED);
@@ -1310,7 +1318,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 	cmsg.type = cpu_to_le32(NEWDISK);
 	memcpy(cmsg.uuid, uuid, 16);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	lock_comm(cinfo, 1);
+	if (lock_comm(cinfo, 1))
+		return -EAGAIN;
 	ret = __sendmsg(cinfo, &cmsg);
 	if (ret) {
 		unlock_comm(cinfo);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 10fcc68665d0..bda2dba8433a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6537,8 +6537,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 		goto busy;
 
 kick_rdev:
-	if (mddev_is_clustered(mddev))
-		md_cluster_ops->remove_disk(mddev, rdev);
+	if (mddev_is_clustered(mddev)) {
+		if (md_cluster_ops->remove_disk(mddev, rdev))
+			goto busy;
+	}
 
 	md_kick_rdev_from_array(rdev);
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);