aboutsummaryrefslogtreecommitdiffstats
path: root/ipc
diff options
context:
space:
mode:
authorManfred Spraul2010-05-26 16:43:40 -0500
committerLinus Torvalds2010-05-27 11:12:49 -0500
commitfd5db42254518fbf241dc454e918598fbe494fa2 (patch)
tree356c8098f7f706a0e0396476fb5ccb924568eea1 /ipc
parent2dcb22b346be7b7b7e630a8970d69cf3f1111ec1 (diff)
downloadkernel-common-fd5db42254518fbf241dc454e918598fbe494fa2.tar.gz
kernel-common-fd5db42254518fbf241dc454e918598fbe494fa2.tar.xz
kernel-common-fd5db42254518fbf241dc454e918598fbe494fa2.zip
ipc/sem.c: optimize update_queue() for bulk wakeup calls
The following series of patches tries to fix the spinlock contention reported by Chris Mason - his benchmark exposes problems of the current code: - In the worst case, the algorithm used by update_queue() is O(N^2). Bulk wake-up calls can enter this worst case. The patch series fixes that. Note that the benchmark app doesn't expose the problem, it just should be fixed: Real world apps might do the wake-ups in another order than perfect FIFO. - The part of the code that runs within the semaphore array spinlock is significantly larger than necessary. The patch series fixes that. This change is responsible for the main improvement. - The cacheline with the spinlock is also used for a variable that is read in the hot path (sem_base) and for a variable that is unnecessarily written to multiple times (sem_otime). The last step of the series cacheline-aligns the spinlock. This patch: The SysV semaphore code allows performing multiple operations on all semaphores in the array as atomic operations. After a modification, update_queue() checks which of the waiting tasks can complete. The algorithm that is used to identify the tasks is O(N^2) in the worst case. For some cases, it is simple to avoid the O(N^2). The patch adds a detection logic for some cases, especially for the case of an array where all sleeping tasks are single sembuf operations and a multi-sembuf operation is used to wake up multiple tasks. A big database application uses that approach. The patch fixes wakeup due to semctl(,,SETALL,) - the initial version of the patch breaks that. [akpm@linux-foundation.org: make do_smart_update() static] Signed-off-by: Manfred Spraul <manfred@colorfullife.com> Cc: Chris Mason <chris.mason@oracle.com> Cc: Zach Brown <zach.brown@oracle.com> Cc: Jens Axboe <jens.axboe@oracle.com> Cc: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
-rw-r--r--ipc/sem.c110
1 file changed, 97 insertions, 13 deletions
diff --git a/ipc/sem.c b/ipc/sem.c
index dbef95b1594..81a9c74ab64 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -434,6 +434,69 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
434 sma->complex_count--; 434 sma->complex_count--;
435} 435}
436 436
437/** check_restart(sma, q)
438 * @sma: semaphore array
439 * @q: the operation that just completed
440 *
441 * update_queue is O(N^2) when it restarts scanning the whole queue of
442 * waiting operations. Therefore this function checks if the restart is
443 * really necessary. It is called after a previously waiting operation
444 * was completed.
445 */
446static int check_restart(struct sem_array *sma, struct sem_queue *q)
447{
448 struct sem *curr;
449 struct sem_queue *h;
450
451 /* if the operation didn't modify the array, then no restart */
452 if (q->alter == 0)
453 return 0;
454
455 /* pending complex operations are too difficult to analyse */
456 if (sma->complex_count)
457 return 1;
458
459 /* we were a sleeping complex operation. Too difficult */
460 if (q->nsops > 1)
461 return 1;
462
463 curr = sma->sem_base + q->sops[0].sem_num;
464
465 /* No-one waits on this queue */
466 if (list_empty(&curr->sem_pending))
467 return 0;
468
469 /* the new semaphore value */
470 if (curr->semval) {
471 /* It is impossible that someone waits for the new value:
472 * - q is a previously sleeping simple operation that
473 * altered the array. It must be a decrement, because
474 * simple increments never sleep.
475 * - The value is not 0, thus wait-for-zero won't proceed.
476 * - If there are older (higher priority) decrements
477 * in the queue, then they have observed the original
478 * semval value and couldn't proceed. The operation
479 * decremented to value - thus they won't proceed either.
480 */
481 BUG_ON(q->sops[0].sem_op >= 0);
482 return 0;
483 }
484 /*
485 * semval is 0. Check if there are wait-for-zero semops.
486 * They must be the first entries in the per-semaphore simple queue
487 */
488 h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list);
489 BUG_ON(h->nsops != 1);
490 BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
491
492 /* Yes, there is a wait-for-zero semop. Restart */
493 if (h->sops[0].sem_op == 0)
494 return 1;
495
496 /* Again - no-one is waiting for the new value. */
497 return 0;
498}
499
437 500
438/** 501/**
439 * update_queue(sma, semnum): Look for tasks that can be completed. 502 * update_queue(sma, semnum): Look for tasks that can be completed.
@@ -469,7 +532,7 @@ static void update_queue(struct sem_array *sma, int semnum)
469again: 532again:
470 walk = pending_list->next; 533 walk = pending_list->next;
471 while (walk != pending_list) { 534 while (walk != pending_list) {
472 int error, alter; 535 int error, restart;
473 536
474 q = (struct sem_queue *)((char *)walk - offset); 537 q = (struct sem_queue *)((char *)walk - offset);
475 walk = walk->next; 538 walk = walk->next;
@@ -494,22 +557,43 @@ again:
494 557
495 unlink_queue(sma, q); 558 unlink_queue(sma, q);
496 559
497 /* 560 if (error)
498 * The next operation that must be checked depends on the type 561 restart = 0;
499 * of the completed operation: 562 else
500 * - if the operation modified the array, then restart from the 563 restart = check_restart(sma, q);
501 * head of the queue and check for threads that might be 564
502 * waiting for the new semaphore values.
503 * - if the operation didn't modify the array, then just
504 * continue.
505 */
506 alter = q->alter;
507 wake_up_sem_queue(q, error); 565 wake_up_sem_queue(q, error);
508 if (alter && !error) 566 if (restart)
509 goto again; 567 goto again;
510 } 568 }
511} 569}
512 570
571/** do_smart_update(sma, sops, nsops): Optimized update_queue
572 * @sma: semaphore array
573 * @sops: operations that were performed
574 * @nsops: number of operations
575 *
576 * do_smart_update() does the required called to update_queue, based on the
577 * actual changes that were performed on the semaphore array.
578 */
579static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops)
580{
581 int i;
582
583 if (sma->complex_count || sops == NULL) {
584 update_queue(sma, -1);
585 return;
586 }
587
588 for (i = 0; i < nsops; i++) {
589 if (sops[i].sem_op > 0 ||
590 (sops[i].sem_op < 0 &&
591 sma->sem_base[sops[i].sem_num].semval == 0))
592 update_queue(sma, sops[i].sem_num);
593 }
594}
595
596
513/* The following counts are associated to each semaphore: 597/* The following counts are associated to each semaphore:
514 * semncnt number of tasks waiting on semval being nonzero 598 * semncnt number of tasks waiting on semval being nonzero
515 * semzcnt number of tasks waiting on semval being zero 599 * semzcnt number of tasks waiting on semval being zero
@@ -1225,7 +1309,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1225 error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); 1309 error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
1226 if (error <= 0) { 1310 if (error <= 0) {
1227 if (alter && error == 0) 1311 if (alter && error == 0)
1228 update_queue(sma, (nsops == 1) ? sops[0].sem_num : -1); 1312 do_smart_update(sma, sops, nsops);
1229 1313
1230 goto out_unlock_free; 1314 goto out_unlock_free;
1231 } 1315 }