author     Linus Torvalds  2013-03-05 19:22:08 -0600
committer  Linus Torvalds  2013-03-05 19:22:08 -0600
commit     a5e0d73163a848060ac0c2c054274e84a654986e (patch)
tree       708ab200af3653090d3cad95228fae21aa6c52cd
parent     6dbe51c251a327e012439c4772097a13df43c5b8 (diff)
parent     f3378b48705154b9089affb2d2e939622aea68f1 (diff)
Merge tag 'md-3.9' of git://neil.brown.name/md
Pull md updates from NeilBrown:
 "Mostly little bugfixes.  Only "feature" is a new RAID10 layout which
  slightly improves the number of sets of devices that can concurrently
  fail, without data loss."

* tag 'md-3.9' of git://neil.brown.name/md:
  md: expedite metadata update when switching read-auto -> active
  md: remove CONFIG_MULTICORE_RAID456
  md/raid1,raid10: fix deadlock with freeze_array()
  md/raid0: improve error message when converting RAID4-with-spares to RAID0
  md: raid0: fix error return from create_stripe_zones.
  md: fix two bugs when attempting to resize RAID0 array.
  DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms
  MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 2)
  MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)
  MD RAID10: Minor non-functional code changes
  md: raid1,10: Handle REQ_WRITE_SAME flag in write bios
  md: protect against crash upon fsync on ro array
Diffstat:
 -rw-r--r--  Documentation/device-mapper/dm-raid.txt |  44
 -rw-r--r--  drivers/md/Kconfig                      |  11
 -rw-r--r--  drivers/md/dm-raid.c                    | 123
 -rw-r--r--  drivers/md/md.c                         |  19
 -rw-r--r--  drivers/md/raid0.c                      |  13
 -rw-r--r--  drivers/md/raid1.c                      |   8
 -rw-r--r--  drivers/md/raid10.c                     |  97
 -rw-r--r--  drivers/md/raid10.h                     |   5
 -rw-r--r--  drivers/md/raid5.c                      |  38
 9 files changed, 256 insertions(+), 102 deletions(-)
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 56fb62b09fc..b428556197c 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters:
   raid10        Various RAID10 inspired algorithms chosen by additional params
                 - RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
                 - RAID1E: Integrated Adjacent Stripe Mirroring
+                - RAID1E: Integrated Offset Stripe Mirroring
                 - and other similar RAID10 variants
 
  Reference: Chapter 4 of
@@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters:
 	synchronisation state for each region.
 
  [raid10_copies   <# copies>]
- [raid10_format   near]
+ [raid10_format   <near|far|offset>]
 	These two options are used to alter the default layout of
 	a RAID10 configuration.  The number of copies is can be
-	specified, but the default is 2.  There are other variations
-	to how the copies are laid down - the default and only current
-	option is "near".  Near copies are what most people think of
-	with respect to mirroring.  If these options are left
-	unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
-	are given, then the layouts for 2, 3 and 4 devices are:
+	specified, but the default is 2.  There are also three
+	variations to how the copies are laid down - the default
+	is "near".  Near copies are what most people think of with
+	respect to mirroring.  If these options are left unspecified,
+	or 'raid10_copies 2' and/or 'raid10_format near' are given,
+	then the layouts for 2, 3 and 4 devices are:
 	2 drives         3 drives          4 drives
 	--------         ----------        --------------
 	A1  A1           A1  A1  A2        A1  A1  A2  A2
@@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters:
 	3-device layout is what might be called a 'RAID1E - Integrated
 	Adjacent Stripe Mirroring'.
 
+	If 'raid10_copies 2' and 'raid10_format far', then the layouts
+	for 2, 3 and 4 devices are:
+	2 drives             3 drives             4 drives
+	--------             --------------       --------------------
+	A1  A2               A1   A2   A3         A1   A2   A3   A4
+	A3  A4               A4   A5   A6         A5   A6   A7   A8
+	A5  A6               A7   A8   A9         A9   A10  A11  A12
+	..  ..               ..   ..   ..         ..   ..   ..   ..
+	A2  A1               A3   A1   A2         A2   A1   A4   A3
+	A4  A3               A6   A4   A5         A6   A5   A8   A7
+	A6  A5               A9   A7   A8         A10  A9   A12  A11
+	..  ..               ..   ..   ..         ..   ..   ..   ..
+
+	If 'raid10_copies 2' and 'raid10_format offset', then the
+	layouts for 2, 3 and 4 devices are:
+	2 drives       3 drives           4 drives
+	--------       ------------       -----------------
+	A1  A2         A1  A2  A3         A1  A2  A3  A4
+	A2  A1         A3  A1  A2         A2  A1  A4  A3
+	A3  A4         A4  A5  A6         A5  A6  A7  A8
+	A4  A3         A6  A4  A5         A6  A5  A8  A7
+	A5  A6         A7  A8  A9         A9  A10 A11 A12
+	A6  A5         A9  A7  A8         A10 A9  A12 A11
+	..  ..         ..  ..  ..         ..  ..  ..  ..
+	Here we see layouts closely akin to 'RAID1E - Integrated
+	Offset Stripe Mirroring'.
+
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
@@ -142,3 +170,5 @@ Version History
 1.3.0	Added support for RAID 10
 1.3.1	Allow device replacement/rebuild for RAID 10
 1.3.2	Fix/improve redundancy checking for RAID10
+1.4.0	Non-functional change.  Removes arg from mapping function.
+1.4.1	Add RAID10 "far" and "offset" algorithm support.
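
To make the layout tables above concrete: under the default "near" format, the copies of a chunk simply occupy consecutive slots that wrap across the drives row by row. The following standalone sketch (our illustration, not kernel code; all names are ours) reproduces the 3-drive, 2-copy table from the documentation:

#include <stdio.h>

/*
 * Illustrative only: placement for the "near" RAID10 format.
 * Copies of a chunk occupy consecutive slots, and slots wrap
 * across drives row by row, matching the tables above.
 */
static void near_placement(unsigned chunk, unsigned copy,
			   unsigned near_copies, unsigned raid_disks)
{
	unsigned slot = chunk * near_copies + copy;

	printf("chunk A%u copy %u -> drive %u, row %u\n",
	       chunk + 1, copy, slot % raid_disks, slot / raid_disks);
}

int main(void)
{
	unsigned chunk, copy;

	/* Reproduce the 3-drive, 2-copy table: A1 A1 A2 / A2 A3 A3 */
	for (chunk = 0; chunk < 3; chunk++)
		for (copy = 0; copy < 2; copy++)
			near_placement(chunk, copy, 2, 3);
	return 0;
}

The "far" and "offset" formats add a second dimension (sections far apart on disk, or adjacent stripes) with the set-confined shifting shown later in the raid10.c changes.
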
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index e30b490055a..4d8d90b4fe7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MULTICORE_RAID456
-	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
-	depends on MD_RAID456
-	depends on SMP
-	depends on EXPERIMENTAL
-	---help---
-	  Enable the raid456 module to dispatch per-stripe raid operations to a
-	  thread pool.
-
-	  If unsure, say N.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9a01d1e4c78..311e3d35b27 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
+static char *raid10_md_layout_to_format(int layout)
+{
+	/*
+	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Refer to MD's raid10.c for details
+	 */
+	if ((layout & 0x10000) && (layout & 0x20000))
+		return "offset";
+
+	if ((layout & 0xFF) > 1)
+		return "near";
+
+	return "far";
+}
+
 static unsigned raid10_md_layout_to_copies(int layout)
 {
-	return layout & 0xFF;
+	if ((layout & 0xFF) > 1)
+		return layout & 0xFF;
+	return (layout >> 8) & 0xFF;
 }
 
 static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
-	/* 1 "far" copy, and 'copies' "near" copies */
-	return (1 << 8) | (copies & 0xFF);
+	unsigned n = 1, f = 1;
+
+	if (!strcmp("near", format))
+		n = copies;
+	else
+		f = copies;
+
+	if (!strcmp("offset", format))
+		return 0x30000 | (f << 8) | n;
+
+	if (!strcmp("far", format))
+		return 0x20000 | (f << 8) | n;
+
+	return (f << 8) | n;
 }
 
 static struct raid_type *get_raid_type(char *name)
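
Taken together, these helpers round-trip dm-raid's format strings through MD's packed layout word: near copies in bits 0-7, far copies in bits 8-15, bit 16 for 'offset', bit 17 for 'use_far_sets'. A standalone decoder (our illustration, not part of the patch) shows the three words raid10_format_to_md_layout() produces for two copies:

#include <stdio.h>

/* Unpack MD's RAID10 layout word using the bit assignments noted
 * in raid10_md_layout_to_format() above. */
static void decode_layout(int layout)
{
	printf("0x%05X: near=%d far=%d offset=%d far_sets=%d\n",
	       layout,
	       layout & 0xFF,		/* near copies, bits 0-7  */
	       (layout >> 8) & 0xFF,	/* far copies,  bits 8-15 */
	       !!(layout & 0x10000),	/* 'offset' flag, bit 16  */
	       !!(layout & 0x20000));	/* 'use_far_sets', bit 17 */
}

int main(void)
{
	decode_layout(0x102);	/* raid10_format near,   2 copies */
	decode_layout(0x20201);	/* raid10_format far,    2 copies */
	decode_layout(0x30201);	/* raid10_format offset, 2 copies */
	return 0;
}
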
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned rebuilds_per_group, copies, d;
+	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
 	 * as long as the failed devices occur in different mirror
 	 * groups (i.e. different stripes).
 	 *
-	 * Right now, we only allow for "near" copies.  When other
-	 * formats are added, we will have to check those too.
-	 *
 	 * When checking "near" format, make sure no adjacent devices
 	 * have failed beyond what can be handled.  In addition to the
 	 * simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 *          A    A    B    B    C
 		 *          C    D    D    E    E
 		 */
-		for (i = 0; i < rs->md.raid_disks * copies; i++) {
-			if (!(i % copies))
+		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+				if (!(i % copies))
+					rebuilds_per_group = 0;
+				d = i % rs->md.raid_disks;
+				if ((!rs->dev[d].rdev.sb_page ||
+				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				    (++rebuilds_per_group >= copies))
+					goto too_many;
+			}
+			break;
+		}
+
+		/*
+		 * When checking "far" and "offset" formats, we need to ensure
+		 * that the device that holds its copy is not also dead or
+		 * being rebuilt.  (Note that "far" and "offset" formats only
+		 * support two copies right now.  These formats also only ever
+		 * use the 'use_far_sets' variant.)
+		 *
+		 * This check is somewhat complicated by the need to account
+		 * for arrays that are not a multiple of (far) copies.  This
+		 * results in the need to treat the last (potentially larger)
+		 * set differently.
+		 */
+		group_size = (rs->md.raid_disks / copies);
+		last_group_start = (rs->md.raid_disks / group_size) - 1;
+		last_group_start *= group_size;
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (!(i % copies) && !(i > last_group_start))
 				rebuilds_per_group = 0;
-			d = i % rs->md.raid_disks;
-			if ((!rs->dev[d].rdev.sb_page ||
-			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+			if ((!rs->dev[i].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 			    (++rebuilds_per_group >= copies))
 				goto too_many;
 		}
 		break;
 	default:
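
The last-set arithmetic above is easiest to verify with concrete numbers; a small sketch (ours, not kernel code) for a 5-device, 2-copy array:

#include <stdio.h>

int main(void)
{
	unsigned raid_disks = 5, copies = 2;
	unsigned group_size = raid_disks / copies;	/* 5 / 2 = 2 */
	unsigned last_group_start =
		((raid_disks / group_size) - 1) * group_size; /* (2 - 1) * 2 = 2 */

	/* Devices 0-1 form one far set; devices 2-4 form the larger
	 * final set, which is why the check treats it separately. */
	printf("group_size=%u last_group_start=%u\n",
	       group_size, last_group_start);
	return 0;
}
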
@@ -433,7 +487,7 @@ too_many:
  *
  * RAID10-only options:
  *    [raid10_copies   <# copies>]        Number of copies.  (Default: 2)
- *    [raid10_format   <near>]            Layout algorithm.  (Default: near)
+ *    [raid10_format   <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 			}
-			if (strcmp("near", argv[i])) {
+			if (strcmp("near", argv[i]) &&
+			    strcmp("far", argv[i]) &&
+			    strcmp("offset", argv[i])) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				return -EINVAL;
 			}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
+		/*
+		 * If the format is not "near", we only support
+		 * two copies at the moment.
+		 */
+		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+			rs->ti->error = "Too many copies for given RAID10 format.";
+			return -EINVAL;
+		}
+
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
 		sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	/*
 	 * Reshaping is not currently allowed
 	 */
-	if ((le32_to_cpu(sb->level) != mddev->level) ||
-	    (le32_to_cpu(sb->layout) != mddev->layout) ||
-	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
-		DMERR("Reshaping arrays not yet supported.");
+	if (le32_to_cpu(sb->level) != mddev->level) {
+		DMERR("Reshaping arrays not yet supported. (RAID level change)");
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->layout) != mddev->layout) {
+		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+		DMERR("  Old layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+		DMERR("  New layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(mddev->layout),
+		      raid10_md_layout_to_copies(mddev->layout));
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
 		return -EINVAL;
 	}
 
 	/* We can only change the number of devices in RAID1 right now */
 	if ((rs->raid_type->level != 1) &&
 	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-		DMERR("Reshaping arrays not yet supported.");
+		DMERR("Reshaping arrays not yet supported. (device count change)");
 		return -EINVAL;
 	}
 
@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			       raid10_md_layout_to_copies(rs->md.layout));
 
 		if (rs->print_flags & DMPF_RAID10_FORMAT)
-			DMEMIT(" raid10_format near");
+			DMEMIT(" raid10_format %s",
+			       raid10_md_layout_to_format(rs->md.layout));
 
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1418,6 +1497,10 @@ static struct target_type raid_target = {
 
 static int __init dm_raid_init(void)
 {
+	DMINFO("Loading target version %u.%u.%u",
+	       raid_target.version[0],
+	       raid_target.version[1],
+	       raid_target.version[2]);
 	return dm_register_target(&raid_target);
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f..fcb878f8879 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 		bio_io_error(bio);
 		return;
 	}
+	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+		return;
+	}
 	smp_rmb(); /* Ensure implications of  'active' are visible */
 	rcu_read_lock();
 	if (mddev->suspended) {
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		} else if (!sectors)
 			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
+		if (!my_mddev->pers->resize)
+			/* Cannot change size for RAID0 or Linear etc */
+			return -EINVAL;
 	}
 	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			mddev->ro = 0;
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
+			/* mddev_unlock will wake thread */
+			/* If a device failed while we were read-only, we
+			 * need to make sure the metadata is updated now.
+			 */
+			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+				mddev_unlock(mddev);
+				wait_event(mddev->sb_wait,
+					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+				mddev_lock(mddev);
+			}
 		} else {
 			err = -EROFS;
 			goto abort_unlock;
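
The md_make_request() hunk earlier in this file is what makes fsync() on a read-only array safe: a zero-length flush is completed with success, while a data-carrying write is failed cleanly. A minimal userspace sketch of that policy (ours, not kernel code):

#include <errno.h>
#include <stdio.h>

/* Sketch of the read-only write policy: an empty flush bio
 * (bio_sectors == 0) succeeds so fsync() is harmless; a write
 * that carries data fails with EROFS. */
static int ro_array_write_status(unsigned int sectors)
{
	return sectors == 0 ? 0 : -EROFS;
}

int main(void)
{
	printf("flush: %d, 8-sector write: %d\n",
	       ro_array_write_status(0), ro_array_write_status(8));
	return 0;
}
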
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7..0505452de8d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			rdev1->new_raid_disk = j;
 		}
 
-		if (j < 0 || j >= mddev->raid_disks) {
+		if (j < 0) {
+			printk(KERN_ERR
+			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+			       mdname(mddev));
+			goto abort;
+		}
+		if (j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
 				"aborting!\n", mdname(mddev), j);
 			goto abort;
@@ -289,7 +295,7 @@ abort:
 	kfree(conf->strip_zone);
 	kfree(conf->devlist);
 	kfree(conf);
-	*private_conf = NULL;
+	*private_conf = ERR_PTR(err);
 	return err;
 }
 
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
 			 "%s does not support generic reshape\n", __func__);
 
 	rdev_for_each(rdev, mddev)
-		array_sectors += rdev->sectors;
+		array_sectors += (rdev->sectors &
+				  ~(sector_t)(mddev->chunk_sectors-1));
 
 	return array_sectors;
 }
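
The raid0_size() fix rounds each member down to a whole number of chunks before summing, so a partial trailing chunk can no longer inflate the reported array size. Since chunk_sectors is always a power of two, a simple mask does the rounding; an illustration with made-up numbers (ours, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Masking off the low bits rounds down to a whole number of
	 * chunks; valid because chunk_sectors is a power of two. */
	uint64_t sectors = 1000, chunk_sectors = 128;
	uint64_t usable = sectors & ~(chunk_sectors - 1);

	printf("%llu of %llu sectors usable (%llu discarded)\n",
	       (unsigned long long)usable,
	       (unsigned long long)sectors,
	       (unsigned long long)(sectors - usable)); /* 896 of 1000 */
	return 0;
}
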
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010..fd86b372692 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1303,8 @@ read_again:
 			       conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+		mbio->bi_rw =
+			WRITE | do_flush_fua | do_sync | do_discard | do_same;
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	if (mddev->queue)
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
 			continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03..77b562d18a9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
  *    near_copies (stored in low byte of layout)
  *    far_copies (stored in second byte of layout)
  *    far_offset (stored in bit 16 of layout )
+ *    use_far_sets (stored in bit 17 of layout )
  *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.  In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
  */
 
 /*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);
 
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
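
The set-confined wrap-around above is easiest to check by replaying it. This standalone sketch (ours, not kernel code) runs the far-copy step for the 5-device example from the header comment - near_copies = 1, far_copies = 2, far_set_size = 2 - and reproduces the [A B] [C D E] -> [B A] [E C D] pairing:

#include <stdio.h>

int main(void)
{
	int raid_disks = 5, near_copies = 1, far_set_size = 2;
	/* the last set absorbs the remainder: devices 2-4 */
	int last_far_set_start = (raid_disks / far_set_size - 1) * far_set_size;
	int last_far_set_size = far_set_size + raid_disks % far_set_size;
	int dev;

	for (dev = 0; dev < raid_disks; dev++) {
		int set = dev / far_set_size;
		int d = dev + near_copies;

		if ((raid_disks % far_set_size) && (d > last_far_set_start)) {
			/* wrap within the larger final set */
			d -= last_far_set_start;
			d %= last_far_set_size;
			d += last_far_set_start;
		} else {
			/* wrap within this device's own set */
			d %= far_set_size;
			d += far_set_size * set;
		}
		printf("chunk on dev %d -> far copy on dev %d\n", dev, d);
	}
	return 0;	/* prints 0->1, 1->0, 2->3, 3->4, 4->2 */
}
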
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
 							      r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf60234..157d69e83ff 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
 					       * far_offset, in which case it is
 					       * 1 stripe.
 					       */
+		int		far_set_size; /* The number of devices in a set,
+					       * where a 'set' are devices that
+					       * contain far/offset copies of
+					       * each other.
+					       */
 		int		chunk_shift;  /* shift from chunks to sectors */
 		sector_t	chunk_mask;
 	} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5af2d270908..3ee2912889e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 		&sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
-	struct stripe_head *sh = param;
-	unsigned long ops_request = sh->ops.request;
-
-	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
-	wake_up(&sh->ops.wait_for_ops);
-
-	__raid_run_ops(sh, ops_request);
-	release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
-	/* since handle_stripe can be called outside of raid5d context
-	 * we need to ensure sh->ops.request is de-staged before another
-	 * request arrives
-	 */
-	wait_event(sh->ops.wait_for_ops,
-		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
-	sh->ops.request = ops_request;
-
-	atomic_inc(&sh->count);
-	async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
 static int grow_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
@@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf)
 		return 0;
 
 	sh->raid_conf = conf;
-	#ifdef CONFIG_MULTICORE_RAID456
-	init_waitqueue_head(&sh->ops.wait_for_ops);
-	#endif
 
 	spin_lock_init(&sh->stripe_lock);
 
@@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			break;
 
 		nsh->raid_conf = conf;
-		#ifdef CONFIG_MULTICORE_RAID456
-		init_waitqueue_head(&nsh->ops.wait_for_ops);
-		#endif
 		spin_lock_init(&nsh->stripe_lock);
 
 		list_add(&nsh->lru, &newstripes);