about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Tejun Heo, 2008-09-03 02:03:02 -0500
committer: Jens Axboe, 2008-10-09 01:56:06 -0500
commit: e71bf0d0ee89e51b92776391c5634938236977d5 (patch)
tree: 9fc62352a40ad388deebdd8ed497cab926cf0470
parent: f331c0296f2a9fee0d396a70598b954062603015 (diff)
download: kernel-common-e71bf0d0ee89e51b92776391c5634938236977d5.tar.gz
kernel-common-e71bf0d0ee89e51b92776391c5634938236977d5.tar.xz
kernel-common-e71bf0d0ee89e51b92776391c5634938236977d5.zip
block: fix disk->part[] dereferencing race
disk->part[] is protected by its matching bdev's lock. However, non-critical accesses like collecting stats and printing out sysfs and proc information used to be performed without any locking. As partitions can come and go dynamically, partitions can go away underneath those non-critical accesses. As some of those accesses are writes, this theoretically can lead to silent corruption.

This patch fixes the race by using RCU for the partition array and dev reference counter to hold partitions.

* Rename disk->part[] to disk->__part[] to make sure no one outside genhd layer proper accesses it directly.
* Use RCU for disk->__part[] dereferencing.
* Implement disk_{get|put}_part() which can be used to get and put partitions from gendisk respectively.
* Iterators are implemented to help iterate through all partitions safely.
* Functions which require RCU readlock are marked with _rcu suffix.
* Use disk_put_part() in __blkdev_put() instead of directly putting the contained kobject.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
-rw-r--r-- block/blk-core.c 20
-rw-r--r-- block/blk-merge.c 9
-rw-r--r-- block/genhd.c 218
-rw-r--r-- block/ioctl.c 26
-rw-r--r-- drivers/block/aoe/aoecmd.c 6
-rw-r--r-- fs/block_dev.c 15
-rw-r--r-- fs/partitions/check.c 70
-rw-r--r-- include/linux/genhd.h 53
8 files changed, 323 insertions, 94 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index a0dc2e72fcb..d6128d9ad60 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -60,7 +60,9 @@ static void drive_stat_acct(struct request *rq, int new_io)
60 if (!blk_fs_request(rq) || !rq->rq_disk) 60 if (!blk_fs_request(rq) || !rq->rq_disk)
61 return; 61 return;
62 62
63 part = disk_map_sector(rq->rq_disk, rq->sector); 63 rcu_read_lock();
64
65 part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
64 if (!new_io) 66 if (!new_io)
65 __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector); 67 __all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
66 else { 68 else {
@@ -71,6 +73,8 @@ static void drive_stat_acct(struct request *rq, int new_io)
71 part->in_flight++; 73 part->in_flight++;
72 } 74 }
73 } 75 }
76
77 rcu_read_unlock();
74} 78}
75 79
76void blk_queue_congestion_threshold(struct request_queue *q) 80void blk_queue_congestion_threshold(struct request_queue *q)
@@ -1557,12 +1561,14 @@ static int __end_that_request_first(struct request *req, int error,
1557 } 1561 }
1558 1562
1559 if (blk_fs_request(req) && req->rq_disk) { 1563 if (blk_fs_request(req) && req->rq_disk) {
1560 struct hd_struct *part =
1561 disk_map_sector(req->rq_disk, req->sector);
1562 const int rw = rq_data_dir(req); 1564 const int rw = rq_data_dir(req);
1565 struct hd_struct *part;
1563 1566
1567 rcu_read_lock();
1568 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1564 all_stat_add(req->rq_disk, part, sectors[rw], 1569 all_stat_add(req->rq_disk, part, sectors[rw],
1565 nr_bytes >> 9, req->sector); 1570 nr_bytes >> 9, req->sector);
1571 rcu_read_unlock();
1566 } 1572 }
1567 1573
1568 total_bytes = bio_nbytes = 0; 1574 total_bytes = bio_nbytes = 0;
@@ -1746,7 +1752,11 @@ static void end_that_request_last(struct request *req, int error)
1746 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 1752 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
1747 unsigned long duration = jiffies - req->start_time; 1753 unsigned long duration = jiffies - req->start_time;
1748 const int rw = rq_data_dir(req); 1754 const int rw = rq_data_dir(req);
1749 struct hd_struct *part = disk_map_sector(disk, req->sector); 1755 struct hd_struct *part;
1756
1757 rcu_read_lock();
1758
1759 part = disk_map_sector_rcu(disk, req->sector);
1750 1760
1751 __all_stat_inc(disk, part, ios[rw], req->sector); 1761 __all_stat_inc(disk, part, ios[rw], req->sector);
1752 __all_stat_add(disk, part, ticks[rw], duration, req->sector); 1762 __all_stat_add(disk, part, ticks[rw], duration, req->sector);
@@ -1756,6 +1766,8 @@ static void end_that_request_last(struct request *req, int error)
1756 part_round_stats(part); 1766 part_round_stats(part);
1757 part->in_flight--; 1767 part->in_flight--;
1758 } 1768 }
1769
1770 rcu_read_unlock();
1759 } 1771 }
1760 1772
1761 if (req->end_io) 1773 if (req->end_io)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 9b17da698d7..eb2a3ca5830 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -387,14 +387,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
387 elv_merge_requests(q, req, next); 387 elv_merge_requests(q, req, next);
388 388
389 if (req->rq_disk) { 389 if (req->rq_disk) {
390 struct hd_struct *part = 390 struct hd_struct *part;
391 disk_map_sector(req->rq_disk, req->sector); 391
392 rcu_read_lock();
393
394 part = disk_map_sector_rcu(req->rq_disk, req->sector);
392 disk_round_stats(req->rq_disk); 395 disk_round_stats(req->rq_disk);
393 req->rq_disk->in_flight--; 396 req->rq_disk->in_flight--;
394 if (part) { 397 if (part) {
395 part_round_stats(part); 398 part_round_stats(part);
396 part->in_flight--; 399 part->in_flight--;
397 } 400 }
401
402 rcu_read_unlock();
398 } 403 }
399 404
400 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 405 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
diff --git a/block/genhd.c b/block/genhd.c
index fa32d09fda2..b431d654394 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -26,6 +26,158 @@ struct kobject *block_depr;
26 26
27static struct device_type disk_type; 27static struct device_type disk_type;
28 28
29/**
30 * disk_get_part - get partition
31 * @disk: disk to look partition from
32 * @partno: partition number
33 *
34 * Look for partition @partno from @disk. If found, increment
35 * reference count and return it.
36 *
37 * CONTEXT:
38 * Don't care.
39 *
40 * RETURNS:
41 * Pointer to the found partition on success, NULL if not found.
42 */
43struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
44{
45 struct hd_struct *part;
46
47 if (unlikely(partno < 1 || partno > disk_max_parts(disk)))
48 return NULL;
49 rcu_read_lock();
50 part = rcu_dereference(disk->__part[partno - 1]);
51 if (part)
52 get_device(&part->dev);
53 rcu_read_unlock();
54
55 return part;
56}
57EXPORT_SYMBOL_GPL(disk_get_part);
58
59/**
60 * disk_part_iter_init - initialize partition iterator
61 * @piter: iterator to initialize
62 * @disk: disk to iterate over
63 * @flags: DISK_PITER_* flags
64 *
65 * Initialize @piter so that it iterates over partitions of @disk.
66 *
67 * CONTEXT:
68 * Don't care.
69 */
70void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
71 unsigned int flags)
72{
73 piter->disk = disk;
74 piter->part = NULL;
75
76 if (flags & DISK_PITER_REVERSE)
77 piter->idx = disk_max_parts(piter->disk) - 1;
78 else
79 piter->idx = 0;
80
81 piter->flags = flags;
82}
83EXPORT_SYMBOL_GPL(disk_part_iter_init);
84
85/**
86 * disk_part_iter_next - proceed iterator to the next partition and return it
87 * @piter: iterator of interest
88 *
89 * Proceed @piter to the next partition and return it.
90 *
91 * CONTEXT:
92 * Don't care.
93 */
94struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
95{
96 int inc, end;
97
98 /* put the last partition */
99 disk_put_part(piter->part);
100 piter->part = NULL;
101
102 rcu_read_lock();
103
104 /* determine iteration parameters */
105 if (piter->flags & DISK_PITER_REVERSE) {
106 inc = -1;
107 end = -1;
108 } else {
109 inc = 1;
110 end = disk_max_parts(piter->disk);
111 }
112
113 /* iterate to the next partition */
114 for (; piter->idx != end; piter->idx += inc) {
115 struct hd_struct *part;
116
117 part = rcu_dereference(piter->disk->__part[piter->idx]);
118 if (!part)
119 continue;
120 if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
121 continue;
122
123 get_device(&part->dev);
124 piter->part = part;
125 piter->idx += inc;
126 break;
127 }
128
129 rcu_read_unlock();
130
131 return piter->part;
132}
133EXPORT_SYMBOL_GPL(disk_part_iter_next);
134
135/**
136 * disk_part_iter_exit - finish up partition iteration
137 * @piter: iter of interest
138 *
139 * Called when iteration is over. Cleans up @piter.
140 *
141 * CONTEXT:
142 * Don't care.
143 */
144void disk_part_iter_exit(struct disk_part_iter *piter)
145{
146 disk_put_part(piter->part);
147 piter->part = NULL;
148}
149EXPORT_SYMBOL_GPL(disk_part_iter_exit);
150
151/**
152 * disk_map_sector_rcu - map sector to partition
153 * @disk: gendisk of interest
154 * @sector: sector to map
155 *
156 * Find out which partition @sector maps to on @disk. This is
157 * primarily used for stats accounting.
158 *
159 * CONTEXT:
160 * RCU read locked. The returned partition pointer is valid only
161 * while preemption is disabled.
162 *
163 * RETURNS:
164 * Found partition on success, NULL if there's no matching partition.
165 */
166struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
167{
168 int i;
169
170 for (i = 0; i < disk_max_parts(disk); i++) {
171 struct hd_struct *part = rcu_dereference(disk->__part[i]);
172
173 if (part && part->start_sect <= sector &&
174 sector < part->start_sect + part->nr_sects)
175 return part;
176 }
177 return NULL;
178}
179EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
180
29/* 181/*
30 * Can be deleted altogether. Later. 182 * Can be deleted altogether. Later.
31 * 183 *
@@ -245,10 +397,12 @@ extern struct block_device *bdget_disk(struct gendisk *disk, int partno)
245 if (partno == 0) 397 if (partno == 0)
246 devt = disk_devt(disk); 398 devt = disk_devt(disk);
247 else { 399 else {
248 struct hd_struct *part = disk->part[partno - 1]; 400 struct hd_struct *part;
249 401
402 part = disk_get_part(disk, partno);
250 if (part && part->nr_sects) 403 if (part && part->nr_sects)
251 devt = part_devt(part); 404 devt = part_devt(part);
405 disk_put_part(part);
252 } 406 }
253 407
254 if (likely(devt != MKDEV(0, 0))) 408 if (likely(devt != MKDEV(0, 0)))
@@ -270,8 +424,9 @@ void __init printk_all_partitions(void)
270 class_dev_iter_init(&iter, &block_class, NULL, &disk_type); 424 class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
271 while ((dev = class_dev_iter_next(&iter))) { 425 while ((dev = class_dev_iter_next(&iter))) {
272 struct gendisk *disk = dev_to_disk(dev); 426 struct gendisk *disk = dev_to_disk(dev);
427 struct disk_part_iter piter;
428 struct hd_struct *part;
273 char buf[BDEVNAME_SIZE]; 429 char buf[BDEVNAME_SIZE];
274 int n;
275 430
276 /* 431 /*
277 * Don't show empty devices or things that have been 432 * Don't show empty devices or things that have been
@@ -298,16 +453,13 @@ void __init printk_all_partitions(void)
298 printk(" (driver?)\n"); 453 printk(" (driver?)\n");
299 454
300 /* now show the partitions */ 455 /* now show the partitions */
301 for (n = 0; n < disk_max_parts(disk); ++n) { 456 disk_part_iter_init(&piter, disk, 0);
302 struct hd_struct *part = disk->part[n]; 457 while ((part = disk_part_iter_next(&piter)))
303
304 if (!part || !part->nr_sects)
305 continue;
306 printk(" %02x%02x %10llu %s\n", 458 printk(" %02x%02x %10llu %s\n",
307 MAJOR(part_devt(part)), MINOR(part_devt(part)), 459 MAJOR(part_devt(part)), MINOR(part_devt(part)),
308 (unsigned long long)part->nr_sects >> 1, 460 (unsigned long long)part->nr_sects >> 1,
309 disk_name(disk, part->partno, buf)); 461 disk_name(disk, part->partno, buf));
310 } 462 disk_part_iter_exit(&piter);
311 } 463 }
312 class_dev_iter_exit(&iter); 464 class_dev_iter_exit(&iter);
313} 465}
@@ -371,7 +523,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
371static int show_partition(struct seq_file *seqf, void *v) 523static int show_partition(struct seq_file *seqf, void *v)
372{ 524{
373 struct gendisk *sgp = v; 525 struct gendisk *sgp = v;
374 int n; 526 struct disk_part_iter piter;
527 struct hd_struct *part;
375 char buf[BDEVNAME_SIZE]; 528 char buf[BDEVNAME_SIZE];
376 529
377 /* Don't show non-partitionable removeable devices or empty devices */ 530 /* Don't show non-partitionable removeable devices or empty devices */
@@ -386,17 +539,14 @@ static int show_partition(struct seq_file *seqf, void *v)
386 MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)), 539 MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)),
387 (unsigned long long)get_capacity(sgp) >> 1, 540 (unsigned long long)get_capacity(sgp) >> 1,
388 disk_name(sgp, 0, buf)); 541 disk_name(sgp, 0, buf));
389 for (n = 0; n < disk_max_parts(sgp); n++) { 542
390 struct hd_struct *part = sgp->part[n]; 543 disk_part_iter_init(&piter, sgp, 0);
391 if (!part) 544 while ((part = disk_part_iter_next(&piter)))
392 continue;
393 if (part->nr_sects == 0)
394 continue;
395 seq_printf(seqf, "%4d %4d %10llu %s\n", 545 seq_printf(seqf, "%4d %4d %10llu %s\n",
396 MAJOR(part_devt(part)), MINOR(part_devt(part)), 546 MAJOR(part_devt(part)), MINOR(part_devt(part)),
397 (unsigned long long)part->nr_sects >> 1, 547 (unsigned long long)part->nr_sects >> 1,
398 disk_name(sgp, part->partno, buf)); 548 disk_name(sgp, part->partno, buf));
399 } 549 disk_part_iter_exit(&piter);
400 550
401 return 0; 551 return 0;
402} 552}
@@ -571,7 +721,7 @@ static void disk_release(struct device *dev)
571 struct gendisk *disk = dev_to_disk(dev); 721 struct gendisk *disk = dev_to_disk(dev);
572 722
573 kfree(disk->random); 723 kfree(disk->random);
574 kfree(disk->part); 724 kfree(disk->__part);
575 free_disk_stats(disk); 725 free_disk_stats(disk);
576 kfree(disk); 726 kfree(disk);
577} 727}
@@ -596,8 +746,9 @@ static struct device_type disk_type = {
596static int diskstats_show(struct seq_file *seqf, void *v) 746static int diskstats_show(struct seq_file *seqf, void *v)
597{ 747{
598 struct gendisk *gp = v; 748 struct gendisk *gp = v;
749 struct disk_part_iter piter;
750 struct hd_struct *hd;
599 char buf[BDEVNAME_SIZE]; 751 char buf[BDEVNAME_SIZE];
600 int n;
601 752
602 /* 753 /*
603 if (&gp->dev.kobj.entry == block_class.devices.next) 754 if (&gp->dev.kobj.entry == block_class.devices.next)
@@ -624,12 +775,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
624 jiffies_to_msecs(disk_stat_read(gp, time_in_queue))); 775 jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
625 776
626 /* now show all non-0 size partitions of it */ 777 /* now show all non-0 size partitions of it */
627 for (n = 0; n < disk_max_parts(gp); n++) { 778 disk_part_iter_init(&piter, gp, 0);
628 struct hd_struct *hd = gp->part[n]; 779 while ((hd = disk_part_iter_next(&piter))) {
629
630 if (!hd || !hd->nr_sects)
631 continue;
632
633 preempt_disable(); 780 preempt_disable();
634 part_round_stats(hd); 781 part_round_stats(hd);
635 preempt_enable(); 782 preempt_enable();
@@ -650,6 +797,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
650 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 797 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
651 ); 798 );
652 } 799 }
800 disk_part_iter_exit(&piter);
653 801
654 return 0; 802 return 0;
655} 803}
@@ -703,12 +851,16 @@ dev_t blk_lookup_devt(const char *name, int partno)
703 if (partno == 0) 851 if (partno == 0)
704 devt = disk_devt(disk); 852 devt = disk_devt(disk);
705 else { 853 else {
706 struct hd_struct *part = disk->part[partno - 1]; 854 struct hd_struct *part;
707 855
708 if (!part || !part->nr_sects) 856 part = disk_get_part(disk, partno);
857 if (!part || !part->nr_sects) {
858 disk_put_part(part);
709 continue; 859 continue;
860 }
710 861
711 devt = part_devt(part); 862 devt = part_devt(part);
863 disk_put_part(part);
712 } 864 }
713 break; 865 break;
714 } 866 }
@@ -735,9 +887,9 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
735 } 887 }
736 if (minors > 1) { 888 if (minors > 1) {
737 int size = (minors - 1) * sizeof(struct hd_struct *); 889 int size = (minors - 1) * sizeof(struct hd_struct *);
738 disk->part = kmalloc_node(size, 890 disk->__part = kmalloc_node(size,
739 GFP_KERNEL | __GFP_ZERO, node_id); 891 GFP_KERNEL | __GFP_ZERO, node_id);
740 if (!disk->part) { 892 if (!disk->__part) {
741 free_disk_stats(disk); 893 free_disk_stats(disk);
742 kfree(disk); 894 kfree(disk);
743 return NULL; 895 return NULL;
@@ -798,10 +950,14 @@ EXPORT_SYMBOL(set_device_ro);
798 950
799void set_disk_ro(struct gendisk *disk, int flag) 951void set_disk_ro(struct gendisk *disk, int flag)
800{ 952{
801 int i; 953 struct disk_part_iter piter;
954 struct hd_struct *part;
955
802 disk->policy = flag; 956 disk->policy = flag;
803 for (i = 0; i < disk_max_parts(disk); i++) 957 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
804 if (disk->part[i]) disk->part[i]->policy = flag; 958 while ((part = disk_part_iter_next(&piter)))
959 part->policy = flag;
960 disk_part_iter_exit(&piter);
805} 961}
806 962
807EXPORT_SYMBOL(set_disk_ro); 963EXPORT_SYMBOL(set_disk_ro);
diff --git a/block/ioctl.c b/block/ioctl.c
index 403f7d7e0c2..a5f672ad55f 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -12,11 +12,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
12{ 12{
13 struct block_device *bdevp; 13 struct block_device *bdevp;
14 struct gendisk *disk; 14 struct gendisk *disk;
15 struct hd_struct *part;
15 struct blkpg_ioctl_arg a; 16 struct blkpg_ioctl_arg a;
16 struct blkpg_partition p; 17 struct blkpg_partition p;
18 struct disk_part_iter piter;
17 long long start, length; 19 long long start, length;
18 int partno; 20 int partno;
19 int i;
20 int err; 21 int err;
21 22
22 if (!capable(CAP_SYS_ADMIN)) 23 if (!capable(CAP_SYS_ADMIN))
@@ -47,28 +48,33 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
47 mutex_lock(&bdev->bd_mutex); 48 mutex_lock(&bdev->bd_mutex);
48 49
49 /* overlap? */ 50 /* overlap? */
50 for (i = 0; i < disk_max_parts(disk); i++) { 51 disk_part_iter_init(&piter, disk,
51 struct hd_struct *s = disk->part[i]; 52 DISK_PITER_INCL_EMPTY);
52 53 while ((part = disk_part_iter_next(&piter))) {
53 if (!s) 54 if (!(start + length <= part->start_sect ||
54 continue; 55 start >= part->start_sect + part->nr_sects)) {
55 if (!(start+length <= s->start_sect || 56 disk_part_iter_exit(&piter);
56 start >= s->start_sect + s->nr_sects)) {
57 mutex_unlock(&bdev->bd_mutex); 57 mutex_unlock(&bdev->bd_mutex);
58 return -EBUSY; 58 return -EBUSY;
59 } 59 }
60 } 60 }
61 disk_part_iter_exit(&piter);
62
61 /* all seems OK */ 63 /* all seems OK */
62 err = add_partition(disk, partno, start, length, 64 err = add_partition(disk, partno, start, length,
63 ADDPART_FLAG_NONE); 65 ADDPART_FLAG_NONE);
64 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
65 return err; 67 return err;
66 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
67 if (!disk->part[partno - 1]) 69 part = disk_get_part(disk, partno);
70 if (!part)
68 return -ENXIO; 71 return -ENXIO;
69 bdevp = bdget_disk(disk, partno); 72
73 bdevp = bdget(part_devt(part));
74 disk_put_part(part);
70 if (!bdevp) 75 if (!bdevp)
71 return -ENOMEM; 76 return -ENOMEM;
77
72 mutex_lock(&bdevp->bd_mutex); 78 mutex_lock(&bdevp->bd_mutex);
73 if (bdevp->bd_openers) { 79 if (bdevp->bd_openers) {
74 mutex_unlock(&bdevp->bd_mutex); 80 mutex_unlock(&bdevp->bd_mutex);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 885d1409521..84c03d65dcc 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -757,11 +757,15 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
757 const int rw = bio_data_dir(bio); 757 const int rw = bio_data_dir(bio);
758 struct hd_struct *part; 758 struct hd_struct *part;
759 759
760 part = disk_map_sector(disk, sector); 760 rcu_read_lock();
761
762 part = disk_map_sector_rcu(disk, sector);
761 all_stat_inc(disk, part, ios[rw], sector); 763 all_stat_inc(disk, part, ios[rw], sector);
762 all_stat_add(disk, part, ticks[rw], duration, sector); 764 all_stat_add(disk, part, ticks[rw], duration, sector);
763 all_stat_add(disk, part, sectors[rw], n_sect, sector); 765 all_stat_add(disk, part, sectors[rw], n_sect, sector);
764 all_stat_add(disk, part, io_ticks, duration, sector); 766 all_stat_add(disk, part, io_ticks, duration, sector);
767
768 rcu_read_unlock();
765} 769}
766 770
767void 771void
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 72e0a2887cb..2f2873b9a04 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -929,6 +929,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
929{ 929{
930 struct module *owner = NULL; 930 struct module *owner = NULL;
931 struct gendisk *disk; 931 struct gendisk *disk;
932 struct hd_struct *part = NULL;
932 int ret; 933 int ret;
933 int partno; 934 int partno;
934 int perm = 0; 935 int perm = 0;
@@ -978,7 +979,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
978 if (bdev->bd_invalidated) 979 if (bdev->bd_invalidated)
979 rescan_partitions(disk, bdev); 980 rescan_partitions(disk, bdev);
980 } else { 981 } else {
981 struct hd_struct *p;
982 struct block_device *whole; 982 struct block_device *whole;
983 whole = bdget_disk(disk, 0); 983 whole = bdget_disk(disk, 0);
984 ret = -ENOMEM; 984 ret = -ENOMEM;
@@ -989,16 +989,16 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
989 if (ret) 989 if (ret)
990 goto out_first; 990 goto out_first;
991 bdev->bd_contains = whole; 991 bdev->bd_contains = whole;
992 p = disk->part[partno - 1]; 992 part = disk_get_part(disk, partno);
993 bdev->bd_inode->i_data.backing_dev_info = 993 bdev->bd_inode->i_data.backing_dev_info =
994 whole->bd_inode->i_data.backing_dev_info; 994 whole->bd_inode->i_data.backing_dev_info;
995 if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { 995 if (!(disk->flags & GENHD_FL_UP) ||
996 !part || !part->nr_sects) {
996 ret = -ENXIO; 997 ret = -ENXIO;
997 goto out_first; 998 goto out_first;
998 } 999 }
999 kobject_get(&p->dev.kobj); 1000 bdev->bd_part = part;
1000 bdev->bd_part = p; 1001 bd_set_size(bdev, (loff_t)part->nr_sects << 9);
1001 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1002 } 1002 }
1003 } else { 1003 } else {
1004 put_disk(disk); 1004 put_disk(disk);
@@ -1027,6 +1027,7 @@ out_first:
1027 __blkdev_put(bdev->bd_contains, 1); 1027 __blkdev_put(bdev->bd_contains, 1);
1028 bdev->bd_contains = NULL; 1028 bdev->bd_contains = NULL;
1029 put_disk(disk); 1029 put_disk(disk);
1030 disk_put_part(part);
1030 module_put(owner); 1031 module_put(owner);
1031out: 1032out:
1032 mutex_unlock(&bdev->bd_mutex); 1033 mutex_unlock(&bdev->bd_mutex);
@@ -1119,7 +1120,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1119 module_put(owner); 1120 module_put(owner);
1120 1121
1121 if (bdev->bd_contains != bdev) { 1122 if (bdev->bd_contains != bdev) {
1122 kobject_put(&bdev->bd_part->dev.kobj); 1123 disk_put_part(bdev->bd_part);
1123 bdev->bd_part = NULL; 1124 bdev->bd_part = NULL;
1124 } 1125 }
1125 bdev->bd_disk = NULL; 1126 bdev->bd_disk = NULL;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e77fa144a07..96c8bf41e45 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -314,19 +314,29 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
314 kobject_put(k); 314 kobject_put(k);
315} 315}
316 316
317static void delete_partition_rcu_cb(struct rcu_head *head)
318{
319 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
320
321 part->start_sect = 0;
322 part->nr_sects = 0;
323 part_stat_set_all(part, 0);
324 put_device(&part->dev);
325}
326
317void delete_partition(struct gendisk *disk, int partno) 327void delete_partition(struct gendisk *disk, int partno)
318{ 328{
319 struct hd_struct *p = disk->part[partno - 1]; 329 struct hd_struct *part;
320 330
321 if (!p) 331 part = disk->__part[partno-1];
332 if (!part)
322 return; 333 return;
323 disk->part[partno - 1] = NULL; 334
324 p->start_sect = 0; 335 rcu_assign_pointer(disk->__part[partno-1], NULL);
325 p->nr_sects = 0; 336 kobject_put(part->holder_dir);
326 part_stat_set_all(p, 0); 337 device_del(&part->dev);
327 kobject_put(p->holder_dir); 338
328 device_del(&p->dev); 339 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
329 put_device(&p->dev);
330} 340}
331 341
332static ssize_t whole_disk_show(struct device *dev, 342static ssize_t whole_disk_show(struct device *dev,
@@ -343,7 +353,7 @@ int add_partition(struct gendisk *disk, int partno,
343 struct hd_struct *p; 353 struct hd_struct *p;
344 int err; 354 int err;
345 355
346 if (disk->part[partno - 1]) 356 if (disk->__part[partno - 1])
347 return -EBUSY; 357 return -EBUSY;
348 358
349 p = kzalloc(sizeof(*p), GFP_KERNEL); 359 p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -391,7 +401,8 @@ int add_partition(struct gendisk *disk, int partno,
391 } 401 }
392 402
393 /* everything is up and running, commence */ 403 /* everything is up and running, commence */
394 disk->part[partno - 1] = p; 404 INIT_RCU_HEAD(&p->rcu_head);
405 rcu_assign_pointer(disk->__part[partno - 1], p);
395 406
396 /* suppress uevent if the disk supresses it */ 407 /* suppress uevent if the disk supresses it */
397 if (!disk->dev.uevent_suppress) 408 if (!disk->dev.uevent_suppress)
@@ -414,9 +425,9 @@ out_put:
414void register_disk(struct gendisk *disk) 425void register_disk(struct gendisk *disk)
415{ 426{
416 struct block_device *bdev; 427 struct block_device *bdev;
428 struct disk_part_iter piter;
429 struct hd_struct *part;
417 char *s; 430 char *s;
418 int i;
419 struct hd_struct *p;
420 int err; 431 int err;
421 432
422 disk->dev.parent = disk->driverfs_dev; 433 disk->dev.parent = disk->driverfs_dev;
@@ -466,16 +477,16 @@ exit:
466 kobject_uevent(&disk->dev.kobj, KOBJ_ADD); 477 kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
467 478
468 /* announce possible partitions */ 479 /* announce possible partitions */
469 for (i = 0; i < disk_max_parts(disk); i++) { 480 disk_part_iter_init(&piter, disk, 0);
470 p = disk->part[i]; 481 while ((part = disk_part_iter_next(&piter)))
471 if (!p || !p->nr_sects) 482 kobject_uevent(&part->dev.kobj, KOBJ_ADD);
472 continue; 483 disk_part_iter_exit(&piter);
473 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
474 }
475} 484}
476 485
477int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 486int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
478{ 487{
488 struct disk_part_iter piter;
489 struct hd_struct *part;
479 struct parsed_partitions *state; 490 struct parsed_partitions *state;
480 int p, res; 491 int p, res;
481 492
@@ -485,8 +496,12 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
485 if (res) 496 if (res)
486 return res; 497 return res;
487 bdev->bd_invalidated = 0; 498 bdev->bd_invalidated = 0;
488 for (p = 1; p <= disk_max_parts(disk); p++) 499
489 delete_partition(disk, p); 500 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
501 while ((part = disk_part_iter_next(&piter)))
502 delete_partition(disk, part->partno);
503 disk_part_iter_exit(&piter);
504
490 if (disk->fops->revalidate_disk) 505 if (disk->fops->revalidate_disk)
491 disk->fops->revalidate_disk(disk); 506 disk->fops->revalidate_disk(disk);
492 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 507 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
@@ -545,13 +560,18 @@ EXPORT_SYMBOL(read_dev_sector);
545 560
546void del_gendisk(struct gendisk *disk) 561void del_gendisk(struct gendisk *disk)
547{ 562{
548 int p; 563 struct disk_part_iter piter;
564 struct hd_struct *part;
549 565
550 /* invalidate stuff */ 566 /* invalidate stuff */
551 for (p = disk_max_parts(disk); p > 0; p--) { 567 disk_part_iter_init(&piter, disk,
552 invalidate_partition(disk, p); 568 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
553 delete_partition(disk, p); 569 while ((part = disk_part_iter_next(&piter))) {
570 invalidate_partition(disk, part->partno);
571 delete_partition(disk, part->partno);
554 } 572 }
573 disk_part_iter_exit(&piter);
574
555 invalidate_partition(disk, 0); 575 invalidate_partition(disk, 0);
556 disk->capacity = 0; 576 disk->capacity = 0;
557 disk->flags &= ~GENHD_FL_UP; 577 disk->flags &= ~GENHD_FL_UP;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0ff75329199..7fbba19e076 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -11,6 +11,7 @@
11 11
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/kdev_t.h> 13#include <linux/kdev_t.h>
14#include <linux/rcupdate.h>
14 15
15#ifdef CONFIG_BLOCK 16#ifdef CONFIG_BLOCK
16 17
@@ -100,6 +101,7 @@ struct hd_struct {
100#else 101#else
101 struct disk_stats dkstats; 102 struct disk_stats dkstats;
102#endif 103#endif
104 struct rcu_head rcu_head;
103}; 105};
104 106
105#define GENHD_FL_REMOVABLE 1 107#define GENHD_FL_REMOVABLE 1
@@ -120,7 +122,14 @@ struct gendisk {
120 * disks that can't be partitioned. */ 122 * disks that can't be partitioned. */
121 123
122 char disk_name[32]; /* name of major driver */ 124 char disk_name[32]; /* name of major driver */
123 struct hd_struct **part; /* [indexed by minor - 1] */ 125
126 /* Array of pointers to partitions indexed by partno - 1.
127 * Protected with matching bdev lock but stat and other
128 * non-critical accesses use RCU. Always access through
129 * helpers.
130 */
131 struct hd_struct **__part;
132
124 struct block_device_operations *fops; 133 struct block_device_operations *fops;
125 struct request_queue *queue; 134 struct request_queue *queue;
126 void *private_data; 135 void *private_data;
@@ -171,25 +180,41 @@ static inline dev_t part_devt(struct hd_struct *part)
171 return part->dev.devt; 180 return part->dev.devt;
172} 181}
173 182
183extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
184
185static inline void disk_put_part(struct hd_struct *part)
186{
187 if (likely(part))
188 put_device(&part->dev);
189}
190
191/*
192 * Smarter partition iterator without context limits.
193 */
194#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */
195#define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */
196
197struct disk_part_iter {
198 struct gendisk *disk;
199 struct hd_struct *part;
200 int idx;
201 unsigned int flags;
202};
203
204extern void disk_part_iter_init(struct disk_part_iter *piter,
205 struct gendisk *disk, unsigned int flags);
206extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter);
207extern void disk_part_iter_exit(struct disk_part_iter *piter);
208
209extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
210 sector_t sector);
211
174/* 212/*
175 * Macros to operate on percpu disk statistics: 213 * Macros to operate on percpu disk statistics:
176 * 214 *
177 * The __ variants should only be called in critical sections. The full 215 * The __ variants should only be called in critical sections. The full
178 * variants disable/enable preemption. 216 * variants disable/enable preemption.
179 */ 217 */
180static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp,
181 sector_t sector)
182{
183 struct hd_struct *part;
184 int i;
185 for (i = 0; i < disk_max_parts(gendiskp); i++) {
186 part = gendiskp->part[i];
187 if (part && part->start_sect <= sector
188 && sector < part->start_sect + part->nr_sects)
189 return part;
190 }
191 return NULL;
192}
193 218
194#ifdef CONFIG_SMP 219#ifdef CONFIG_SMP
195#define __disk_stat_add(gendiskp, field, addnd) \ 220#define __disk_stat_add(gendiskp, field, addnd) \