aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJens Axboe2008-12-09 01:11:22 -0600
committerJens Axboe2008-12-29 01:29:50 -0600
commitabf137dd7712132ee56d5b3143c2ff61a72a5faa (patch)
tree8334f03c598343bb93340f081fcde5ba659b440b
parent392ddc32982a5c661dd90dd49a3cb37f1c68b782 (diff)
downloadkernel-common-abf137dd7712132ee56d5b3143c2ff61a72a5faa.tar.gz
kernel-common-abf137dd7712132ee56d5b3143c2ff61a72a5faa.tar.xz
kernel-common-abf137dd7712132ee56d5b3143c2ff61a72a5faa.zip
aio: make the lookup_ioctx() lockless
The mm->ioctx_list is currently protected by a reader-writer lock, so we always grab that lock on the read side for doing ioctx lookups. As the workload is extremely reader biased, turn this into an rcu hlist so we can make lookup_ioctx() lockless. Get rid of the rwlock and use a spinlock for providing update side exclusion. There's usually only 1 entry on this list, so it doesn't make sense to look into fancier data structures. Reviewed-by: Jeff Moyer <jmoyer@redhat.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
-rw-r--r--arch/s390/mm/pgtable.c4
-rw-r--r--fs/aio.c100
-rw-r--r--include/linux/aio.h5
-rw-r--r--include/linux/mm_types.h5
-rw-r--r--kernel/fork.c4
5 files changed, 67 insertions, 51 deletions
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ef3635b52fc..0767827540b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -263,7 +263,7 @@ int s390_enable_sie(void)
263 /* lets check if we are allowed to replace the mm */ 263 /* lets check if we are allowed to replace the mm */
264 task_lock(tsk); 264 task_lock(tsk);
265 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || 265 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
266 tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { 266 tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
267 task_unlock(tsk); 267 task_unlock(tsk);
268 return -EINVAL; 268 return -EINVAL;
269 } 269 }
@@ -279,7 +279,7 @@ int s390_enable_sie(void)
279 /* Now lets check again if something happened */ 279 /* Now lets check again if something happened */
280 task_lock(tsk); 280 task_lock(tsk);
281 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || 281 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
282 tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) { 282 tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
283 mmput(mm); 283 mmput(mm);
284 task_unlock(tsk); 284 task_unlock(tsk);
285 return -EINVAL; 285 return -EINVAL;
diff --git a/fs/aio.c b/fs/aio.c
index f658441d566..d6f89d3c15e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
191 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ 191 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
192} while(0) 192} while(0)
193 193
194static void ctx_rcu_free(struct rcu_head *head)
195{
196 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
197 unsigned nr_events = ctx->max_reqs;
198
199 kmem_cache_free(kioctx_cachep, ctx);
200
201 if (nr_events) {
202 spin_lock(&aio_nr_lock);
203 BUG_ON(aio_nr - nr_events > aio_nr);
204 aio_nr -= nr_events;
205 spin_unlock(&aio_nr_lock);
206 }
207}
194 208
195/* __put_ioctx 209/* __put_ioctx
196 * Called when the last user of an aio context has gone away, 210 * Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
198 */ 212 */
199static void __put_ioctx(struct kioctx *ctx) 213static void __put_ioctx(struct kioctx *ctx)
200{ 214{
201 unsigned nr_events = ctx->max_reqs;
202
203 BUG_ON(ctx->reqs_active); 215 BUG_ON(ctx->reqs_active);
204 216
205 cancel_delayed_work(&ctx->wq); 217 cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
208 mmdrop(ctx->mm); 220 mmdrop(ctx->mm);
209 ctx->mm = NULL; 221 ctx->mm = NULL;
210 pr_debug("__put_ioctx: freeing %p\n", ctx); 222 pr_debug("__put_ioctx: freeing %p\n", ctx);
211 kmem_cache_free(kioctx_cachep, ctx); 223 call_rcu(&ctx->rcu_head, ctx_rcu_free);
212
213 if (nr_events) {
214 spin_lock(&aio_nr_lock);
215 BUG_ON(aio_nr - nr_events > aio_nr);
216 aio_nr -= nr_events;
217 spin_unlock(&aio_nr_lock);
218 }
219} 224}
220 225
221#define get_ioctx(kioctx) do { \ 226#define get_ioctx(kioctx) do { \
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
235{ 240{
236 struct mm_struct *mm; 241 struct mm_struct *mm;
237 struct kioctx *ctx; 242 struct kioctx *ctx;
243 int did_sync = 0;
238 244
239 /* Prevent overflows */ 245 /* Prevent overflows */
240 if ((nr_events > (0x10000000U / sizeof(struct io_event))) || 246 if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
267 goto out_freectx; 273 goto out_freectx;
268 274
269 /* limit the number of system wide aios */ 275 /* limit the number of system wide aios */
270 spin_lock(&aio_nr_lock); 276 do {
271 if (aio_nr + ctx->max_reqs > aio_max_nr || 277 spin_lock_bh(&aio_nr_lock);
272 aio_nr + ctx->max_reqs < aio_nr) 278 if (aio_nr + nr_events > aio_max_nr ||
273 ctx->max_reqs = 0; 279 aio_nr + nr_events < aio_nr)
274 else 280 ctx->max_reqs = 0;
275 aio_nr += ctx->max_reqs; 281 else
276 spin_unlock(&aio_nr_lock); 282 aio_nr += ctx->max_reqs;
283 spin_unlock_bh(&aio_nr_lock);
284 if (ctx->max_reqs || did_sync)
285 break;
286
287 /* wait for rcu callbacks to have completed before giving up */
288 synchronize_rcu();
289 did_sync = 1;
290 ctx->max_reqs = nr_events;
291 } while (1);
292
277 if (ctx->max_reqs == 0) 293 if (ctx->max_reqs == 0)
278 goto out_cleanup; 294 goto out_cleanup;
279 295
280 /* now link into global list. */ 296 /* now link into global list. */
281 write_lock(&mm->ioctx_list_lock); 297 spin_lock(&mm->ioctx_lock);
282 ctx->next = mm->ioctx_list; 298 hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
283 mm->ioctx_list = ctx; 299 spin_unlock(&mm->ioctx_lock);
284 write_unlock(&mm->ioctx_list_lock);
285 300
286 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 301 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
287 ctx, ctx->user_id, current->mm, ctx->ring_info.nr); 302 ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
375 */ 390 */
376void exit_aio(struct mm_struct *mm) 391void exit_aio(struct mm_struct *mm)
377{ 392{
378 struct kioctx *ctx = mm->ioctx_list; 393 struct kioctx *ctx;
379 mm->ioctx_list = NULL; 394
380 while (ctx) { 395 while (!hlist_empty(&mm->ioctx_list)) {
381 struct kioctx *next = ctx->next; 396 ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
382 ctx->next = NULL; 397 hlist_del_rcu(&ctx->list);
398
383 aio_cancel_all(ctx); 399 aio_cancel_all(ctx);
384 400
385 wait_for_all_aios(ctx); 401 wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
394 atomic_read(&ctx->users), ctx->dead, 410 atomic_read(&ctx->users), ctx->dead,
395 ctx->reqs_active); 411 ctx->reqs_active);
396 put_ioctx(ctx); 412 put_ioctx(ctx);
397 ctx = next;
398 } 413 }
399} 414}
400 415
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
555 570
556static struct kioctx *lookup_ioctx(unsigned long ctx_id) 571static struct kioctx *lookup_ioctx(unsigned long ctx_id)
557{ 572{
558 struct kioctx *ioctx; 573 struct mm_struct *mm = current->mm;
559 struct mm_struct *mm; 574 struct kioctx *ctx = NULL;
575 struct hlist_node *n;
560 576
561 mm = current->mm; 577 rcu_read_lock();
562 read_lock(&mm->ioctx_list_lock); 578
563 for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) 579 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
564 if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { 580 if (ctx->user_id == ctx_id && !ctx->dead) {
565 get_ioctx(ioctx); 581 get_ioctx(ctx);
566 break; 582 break;
567 } 583 }
568 read_unlock(&mm->ioctx_list_lock); 584 }
569 585
570 return ioctx; 586 rcu_read_unlock();
587 return ctx;
571} 588}
572 589
573/* 590/*
@@ -1215,19 +1232,14 @@ out:
1215static void io_destroy(struct kioctx *ioctx) 1232static void io_destroy(struct kioctx *ioctx)
1216{ 1233{
1217 struct mm_struct *mm = current->mm; 1234 struct mm_struct *mm = current->mm;
1218 struct kioctx **tmp;
1219 int was_dead; 1235 int was_dead;
1220 1236
1221 /* delete the entry from the list is someone else hasn't already */ 1237 /* delete the entry from the list is someone else hasn't already */
1222 write_lock(&mm->ioctx_list_lock); 1238 spin_lock(&mm->ioctx_lock);
1223 was_dead = ioctx->dead; 1239 was_dead = ioctx->dead;
1224 ioctx->dead = 1; 1240 ioctx->dead = 1;
1225 for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; 1241 hlist_del_rcu(&ioctx->list);
1226 tmp = &(*tmp)->next) 1242 spin_unlock(&mm->ioctx_lock);
1227 ;
1228 if (*tmp)
1229 *tmp = ioctx->next;
1230 write_unlock(&mm->ioctx_list_lock);
1231 1243
1232 dprintk("aio_release(%p)\n", ioctx); 1244 dprintk("aio_release(%p)\n", ioctx);
1233 if (likely(!was_dead)) 1245 if (likely(!was_dead))
diff --git a/include/linux/aio.h b/include/linux/aio.h
index f6b8cf99b59..b16a957030f 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -5,6 +5,7 @@
5#include <linux/workqueue.h> 5#include <linux/workqueue.h>
6#include <linux/aio_abi.h> 6#include <linux/aio_abi.h>
7#include <linux/uio.h> 7#include <linux/uio.h>
8#include <linux/rcupdate.h>
8 9
9#include <asm/atomic.h> 10#include <asm/atomic.h>
10 11
@@ -183,7 +184,7 @@ struct kioctx {
183 184
184 /* This needs improving */ 185 /* This needs improving */
185 unsigned long user_id; 186 unsigned long user_id;
186 struct kioctx *next; 187 struct hlist_node list;
187 188
188 wait_queue_head_t wait; 189 wait_queue_head_t wait;
189 190
@@ -199,6 +200,8 @@ struct kioctx {
199 struct aio_ring_info ring_info; 200 struct aio_ring_info ring_info;
200 201
201 struct delayed_work wq; 202 struct delayed_work wq;
203
204 struct rcu_head rcu_head;
202}; 205};
203 206
204/* prototypes */ 207/* prototypes */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fe825471d5a..9cfc9b627fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -232,8 +232,9 @@ struct mm_struct {
232 struct core_state *core_state; /* coredumping support */ 232 struct core_state *core_state; /* coredumping support */
233 233
234 /* aio bits */ 234 /* aio bits */
235 rwlock_t ioctx_list_lock; /* aio lock */ 235 spinlock_t ioctx_lock;
236 struct kioctx *ioctx_list; 236 struct hlist_head ioctx_list;
237
237#ifdef CONFIG_MM_OWNER 238#ifdef CONFIG_MM_OWNER
238 /* 239 /*
239 * "owner" points to a task that is regarded as the canonical 240 * "owner" points to a task that is regarded as the canonical
diff --git a/kernel/fork.c b/kernel/fork.c
index 6144b36cd89..43cbf30669e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
415 set_mm_counter(mm, file_rss, 0); 415 set_mm_counter(mm, file_rss, 0);
416 set_mm_counter(mm, anon_rss, 0); 416 set_mm_counter(mm, anon_rss, 0);
417 spin_lock_init(&mm->page_table_lock); 417 spin_lock_init(&mm->page_table_lock);
418 rwlock_init(&mm->ioctx_list_lock); 418 spin_lock_init(&mm->ioctx_lock);
419 mm->ioctx_list = NULL; 419 INIT_HLIST_HEAD(&mm->ioctx_list);
420 mm->free_area_cache = TASK_UNMAPPED_BASE; 420 mm->free_area_cache = TASK_UNMAPPED_BASE;
421 mm->cached_hole_size = ~0UL; 421 mm->cached_hole_size = ~0UL;
422 mm_init_owner(mm, p); 422 mm_init_owner(mm, p);