aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig14
-rw-r--r--block/Kconfig.iosched69
-rw-r--r--block/Makefile10
-rw-r--r--block/as-iosched.c1985
-rw-r--r--block/cfq-iosched.c2428
-rw-r--r--block/deadline-iosched.c878
-rw-r--r--block/elevator.c802
-rw-r--r--block/genhd.c726
-rw-r--r--block/ioctl.c275
-rw-r--r--block/ll_rw_blk.c3613
-rw-r--r--block/noop-iosched.c46
-rw-r--r--block/scsi_ioctl.c589
12 files changed, 11435 insertions, 0 deletions
diff --git a/block/Kconfig b/block/Kconfig
new file mode 100644
index 00000000000..eb48edb80c1
--- /dev/null
+++ b/block/Kconfig
@@ -0,0 +1,14 @@
1#
2# Block layer core configuration
3#
4#XXX - it makes sense to enable this only for 32-bit subarch's, not for x86_64
5#for instance.
6config LBD
7 bool "Support for Large Block Devices"
8 depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
9 help
10 Say Y here if you want to attach large (bigger than 2TB) discs to
11 your machine, or if you want to have a raid or loopback device
12 bigger than 2TB. Otherwise say N.
13
14source block/Kconfig.iosched
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
new file mode 100644
index 00000000000..5b90d2fa63b
--- /dev/null
+++ b/block/Kconfig.iosched
@@ -0,0 +1,69 @@
1
2menu "IO Schedulers"
3
4config IOSCHED_NOOP
5 bool
6 default y
7 ---help---
8 The no-op I/O scheduler is a minimal scheduler that does basic merging
9 and sorting. Its main uses include non-disk based block devices like
10 memory devices, and specialised software or hardware environments
11 that do their own scheduling and require only minimal assistance from
12 the kernel.
13
14config IOSCHED_AS
15 tristate "Anticipatory I/O scheduler"
16 default y
17 ---help---
18 The anticipatory I/O scheduler is the default disk scheduler. It is
19 generally a good choice for most environments, but is quite large and
20 complex when compared to the deadline I/O scheduler, it can also be
21 slower in some cases especially some database loads.
22
23config IOSCHED_DEADLINE
24 tristate "Deadline I/O scheduler"
25 default y
26 ---help---
27 The deadline I/O scheduler is simple and compact, and is often as
28 good as the anticipatory I/O scheduler, and in some database
29 workloads, better. In the case of a single process performing I/O to
30 a disk at any one time, its behaviour is almost identical to the
31 anticipatory I/O scheduler and so is a good choice.
32
33config IOSCHED_CFQ
34 tristate "CFQ I/O scheduler"
35 default y
36 ---help---
37 The CFQ I/O scheduler tries to distribute bandwidth equally
38 among all processes in the system. It should provide a fair
39 working environment, suitable for desktop systems.
40
41choice
42 prompt "Default I/O scheduler"
43 default DEFAULT_AS
44 help
45 Select the I/O scheduler which will be used by default for all
46 block devices.
47
48 config DEFAULT_AS
49 bool "Anticipatory" if IOSCHED_AS
50
51 config DEFAULT_DEADLINE
52 bool "Deadline" if IOSCHED_DEADLINE
53
54 config DEFAULT_CFQ
55 bool "CFQ" if IOSCHED_CFQ
56
57 config DEFAULT_NOOP
58 bool "No-op"
59
60endchoice
61
62config DEFAULT_IOSCHED
63 string
64 default "anticipatory" if DEFAULT_AS
65 default "deadline" if DEFAULT_DEADLINE
66 default "cfq" if DEFAULT_CFQ
67 default "noop" if DEFAULT_NOOP
68
69endmenu
diff --git a/block/Makefile b/block/Makefile
new file mode 100644
index 00000000000..7e4f93e2b44
--- /dev/null
+++ b/block/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for the kernel block layer
3#
4
5obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
6
7obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
8obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
9obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
10obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
new file mode 100644
index 00000000000..c6744ff3829
--- /dev/null
+++ b/block/as-iosched.c
@@ -0,0 +1,1985 @@
1/*
2 * linux/drivers/block/as-iosched.c
3 *
4 * Anticipatory & deadline i/o scheduler.
5 *
6 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
7 * Nick Piggin <piggin@cyberone.com.au>
8 *
9 */
10#include <linux/kernel.h>
11#include <linux/fs.h>
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
14#include <linux/bio.h>
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/compiler.h>
20#include <linux/hash.h>
21#include <linux/rbtree.h>
22#include <linux/interrupt.h>
23
24#define REQ_SYNC 1
25#define REQ_ASYNC 0
26
27/*
28 * See Documentation/block/as-iosched.txt
29 */
30
31/*
32 * max time before a read is submitted.
33 */
34#define default_read_expire (HZ / 8)
35
36/*
37 * ditto for writes, these limits are not hard, even
38 * if the disk is capable of satisfying them.
39 */
40#define default_write_expire (HZ / 4)
41
42/*
43 * read_batch_expire describes how long we will allow a stream of reads to
44 * persist before looking to see whether it is time to switch over to writes.
45 */
46#define default_read_batch_expire (HZ / 2)
47
48/*
49 * write_batch_expire describes how long we want a stream of writes to run for.
50 * This is not a hard limit, but a target we set for the auto-tuning thingy.
51 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
52 * a short amount of time...
53 */
54#define default_write_batch_expire (HZ / 8)
55
56/*
57 * max time we may wait to anticipate a read (default around 6ms)
58 */
59#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
60
61/*
62 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
63 * however huge values tend to interfere and not decay fast enough. A program
64 * might be in a non-io phase of operation. Waiting on user input for example,
65 * or doing a lengthy computation. A small penalty can be justified there, and
66 * will still catch out those processes that constantly have large thinktimes.
67 */
68#define MAX_THINKTIME (HZ/50UL)
69
70/* Bits in as_io_context.state */
71enum as_io_states {
72 AS_TASK_RUNNING=0, /* Process has not exitted */
73 AS_TASK_IOSTARTED, /* Process has started some IO */
74 AS_TASK_IORUNNING, /* Process has completed some IO */
75};
76
77enum anticipation_status {
78 ANTIC_OFF=0, /* Not anticipating (normal operation) */
79 ANTIC_WAIT_REQ, /* The last read has not yet completed */
80 ANTIC_WAIT_NEXT, /* Currently anticipating a request vs
81 last read (which has completed) */
82 ANTIC_FINISHED, /* Anticipating but have found a candidate
83 * or timed out */
84};
85
86struct as_data {
87 /*
88 * run time data
89 */
90
91 struct request_queue *q; /* the "owner" queue */
92
93 /*
94 * requests (as_rq s) are present on both sort_list and fifo_list
95 */
96 struct rb_root sort_list[2];
97 struct list_head fifo_list[2];
98
99 struct as_rq *next_arq[2]; /* next in sort order */
100 sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */
101 struct list_head *hash; /* request hash */
102
103 unsigned long exit_prob; /* probability a task will exit while
104 being waited on */
105 unsigned long new_ttime_total; /* mean thinktime on new proc */
106 unsigned long new_ttime_mean;
107 u64 new_seek_total; /* mean seek on new proc */
108 sector_t new_seek_mean;
109
110 unsigned long current_batch_expires;
111 unsigned long last_check_fifo[2];
112 int changed_batch; /* 1: waiting for old batch to end */
113 int new_batch; /* 1: waiting on first read complete */
114 int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */
115 int write_batch_count; /* max # of reqs in a write batch */
116 int current_write_count; /* how many requests left this batch */
117 int write_batch_idled; /* has the write batch gone idle? */
118 mempool_t *arq_pool;
119
120 enum anticipation_status antic_status;
121 unsigned long antic_start; /* jiffies: when it started */
122 struct timer_list antic_timer; /* anticipatory scheduling timer */
123 struct work_struct antic_work; /* Deferred unplugging */
124 struct io_context *io_context; /* Identify the expected process */
125 int ioc_finished; /* IO associated with io_context is finished */
126 int nr_dispatched;
127
128 /*
129 * settings that change how the i/o scheduler behaves
130 */
131 unsigned long fifo_expire[2];
132 unsigned long batch_expire[2];
133 unsigned long antic_expire;
134};
135
136#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo)
137
138/*
139 * per-request data.
140 */
141enum arq_state {
142 AS_RQ_NEW=0, /* New - not referenced and not on any lists */
143 AS_RQ_QUEUED, /* In the request queue. It belongs to the
144 scheduler */
145 AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the
146 driver now */
147 AS_RQ_PRESCHED, /* Debug poisoning for requests being used */
148 AS_RQ_REMOVED,
149 AS_RQ_MERGED,
150 AS_RQ_POSTSCHED, /* when they shouldn't be */
151};
152
153struct as_rq {
154 /*
155 * rbtree index, key is the starting offset
156 */
157 struct rb_node rb_node;
158 sector_t rb_key;
159
160 struct request *request;
161
162 struct io_context *io_context; /* The submitting task */
163
164 /*
165 * request hash, key is the ending offset (for back merge lookup)
166 */
167 struct list_head hash;
168 unsigned int on_hash;
169
170 /*
171 * expire fifo
172 */
173 struct list_head fifo;
174 unsigned long expires;
175
176 unsigned int is_sync;
177 enum arq_state state;
178};
179
180#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private)
181
182static kmem_cache_t *arq_pool;
183
184/*
185 * IO Context helper functions
186 */
187
188/* Called to deallocate the as_io_context */
189static void free_as_io_context(struct as_io_context *aic)
190{
191 kfree(aic);
192}
193
194/* Called when the task exits */
195static void exit_as_io_context(struct as_io_context *aic)
196{
197 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
198 clear_bit(AS_TASK_RUNNING, &aic->state);
199}
200
201static struct as_io_context *alloc_as_io_context(void)
202{
203 struct as_io_context *ret;
204
205 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
206 if (ret) {
207 ret->dtor = free_as_io_context;
208 ret->exit = exit_as_io_context;
209 ret->state = 1 << AS_TASK_RUNNING;
210 atomic_set(&ret->nr_queued, 0);
211 atomic_set(&ret->nr_dispatched, 0);
212 spin_lock_init(&ret->lock);
213 ret->ttime_total = 0;
214 ret->ttime_samples = 0;
215 ret->ttime_mean = 0;
216 ret->seek_total = 0;
217 ret->seek_samples = 0;
218 ret->seek_mean = 0;
219 }
220
221 return ret;
222}
223
224/*
225 * If the current task has no AS IO context then create one and initialise it.
226 * Then take a ref on the task's io context and return it.
227 */
228static struct io_context *as_get_io_context(void)
229{
230 struct io_context *ioc = get_io_context(GFP_ATOMIC);
231 if (ioc && !ioc->aic) {
232 ioc->aic = alloc_as_io_context();
233 if (!ioc->aic) {
234 put_io_context(ioc);
235 ioc = NULL;
236 }
237 }
238 return ioc;
239}
240
241static void as_put_io_context(struct as_rq *arq)
242{
243 struct as_io_context *aic;
244
245 if (unlikely(!arq->io_context))
246 return;
247
248 aic = arq->io_context->aic;
249
250 if (arq->is_sync == REQ_SYNC && aic) {
251 spin_lock(&aic->lock);
252 set_bit(AS_TASK_IORUNNING, &aic->state);
253 aic->last_end_request = jiffies;
254 spin_unlock(&aic->lock);
255 }
256
257 put_io_context(arq->io_context);
258}
259
260/*
261 * the back merge hash support functions
262 */
263static const int as_hash_shift = 6;
264#define AS_HASH_BLOCK(sec) ((sec) >> 3)
265#define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift))
266#define AS_HASH_ENTRIES (1 << as_hash_shift)
267#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
268#define list_entry_hash(ptr) list_entry((ptr), struct as_rq, hash)
269
270static inline void __as_del_arq_hash(struct as_rq *arq)
271{
272 arq->on_hash = 0;
273 list_del_init(&arq->hash);
274}
275
276static inline void as_del_arq_hash(struct as_rq *arq)
277{
278 if (arq->on_hash)
279 __as_del_arq_hash(arq);
280}
281
282static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq)
283{
284 struct request *rq = arq->request;
285
286 BUG_ON(arq->on_hash);
287
288 arq->on_hash = 1;
289 list_add(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]);
290}
291
292/*
293 * move hot entry to front of chain
294 */
295static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq)
296{
297 struct request *rq = arq->request;
298 struct list_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))];
299
300 if (!arq->on_hash) {
301 WARN_ON(1);
302 return;
303 }
304
305 if (arq->hash.prev != head) {
306 list_del(&arq->hash);
307 list_add(&arq->hash, head);
308 }
309}
310
311static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset)
312{
313 struct list_head *hash_list = &ad->hash[AS_HASH_FN(offset)];
314 struct list_head *entry, *next = hash_list->next;
315
316 while ((entry = next) != hash_list) {
317 struct as_rq *arq = list_entry_hash(entry);
318 struct request *__rq = arq->request;
319
320 next = entry->next;
321
322 BUG_ON(!arq->on_hash);
323
324 if (!rq_mergeable(__rq)) {
325 as_del_arq_hash(arq);
326 continue;
327 }
328
329 if (rq_hash_key(__rq) == offset)
330 return __rq;
331 }
332
333 return NULL;
334}
335
336/*
337 * rb tree support functions
338 */
339#define RB_NONE (2)
340#define RB_EMPTY(root) ((root)->rb_node == NULL)
341#define ON_RB(node) ((node)->rb_color != RB_NONE)
342#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
343#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node)
344#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync])
345#define rq_rb_key(rq) (rq)->sector
346
347/*
348 * as_find_first_arq finds the first (lowest sector numbered) request
349 * for the specified data_dir. Used to sweep back to the start of the disk
350 * (1-way elevator) after we process the last (highest sector) request.
351 */
352static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
353{
354 struct rb_node *n = ad->sort_list[data_dir].rb_node;
355
356 if (n == NULL)
357 return NULL;
358
359 for (;;) {
360 if (n->rb_left == NULL)
361 return rb_entry_arq(n);
362
363 n = n->rb_left;
364 }
365}
366
367/*
368 * Add the request to the rb tree if it is unique. If there is an alias (an
369 * existing request against the same sector), which can happen when using
370 * direct IO, then return the alias.
371 */
372static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
373{
374 struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
375 struct rb_node *parent = NULL;
376 struct as_rq *__arq;
377 struct request *rq = arq->request;
378
379 arq->rb_key = rq_rb_key(rq);
380
381 while (*p) {
382 parent = *p;
383 __arq = rb_entry_arq(parent);
384
385 if (arq->rb_key < __arq->rb_key)
386 p = &(*p)->rb_left;
387 else if (arq->rb_key > __arq->rb_key)
388 p = &(*p)->rb_right;
389 else
390 return __arq;
391 }
392
393 rb_link_node(&arq->rb_node, parent, p);
394 rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
395
396 return NULL;
397}
398
399static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
400{
401 if (!ON_RB(&arq->rb_node)) {
402 WARN_ON(1);
403 return;
404 }
405
406 rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
407 RB_CLEAR(&arq->rb_node);
408}
409
410static struct request *
411as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir)
412{
413 struct rb_node *n = ad->sort_list[data_dir].rb_node;
414 struct as_rq *arq;
415
416 while (n) {
417 arq = rb_entry_arq(n);
418
419 if (sector < arq->rb_key)
420 n = n->rb_left;
421 else if (sector > arq->rb_key)
422 n = n->rb_right;
423 else
424 return arq->request;
425 }
426
427 return NULL;
428}
429
430/*
431 * IO Scheduler proper
432 */
433
434#define MAXBACK (1024 * 1024) /*
435 * Maximum distance the disk will go backward
436 * for a request.
437 */
438
439#define BACK_PENALTY 2
440
441/*
442 * as_choose_req selects the preferred one of two requests of the same data_dir
443 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
444 */
445static struct as_rq *
446as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
447{
448 int data_dir;
449 sector_t last, s1, s2, d1, d2;
450 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
451 const sector_t maxback = MAXBACK;
452
453 if (arq1 == NULL || arq1 == arq2)
454 return arq2;
455 if (arq2 == NULL)
456 return arq1;
457
458 data_dir = arq1->is_sync;
459
460 last = ad->last_sector[data_dir];
461 s1 = arq1->request->sector;
462 s2 = arq2->request->sector;
463
464 BUG_ON(data_dir != arq2->is_sync);
465
466 /*
467 * Strict one way elevator _except_ in the case where we allow
468 * short backward seeks which are biased as twice the cost of a
469 * similar forward seek.
470 */
471 if (s1 >= last)
472 d1 = s1 - last;
473 else if (s1+maxback >= last)
474 d1 = (last - s1)*BACK_PENALTY;
475 else {
476 r1_wrap = 1;
477 d1 = 0; /* shut up, gcc */
478 }
479
480 if (s2 >= last)
481 d2 = s2 - last;
482 else if (s2+maxback >= last)
483 d2 = (last - s2)*BACK_PENALTY;
484 else {
485 r2_wrap = 1;
486 d2 = 0;
487 }
488
489 /* Found required data */
490 if (!r1_wrap && r2_wrap)
491 return arq1;
492 else if (!r2_wrap && r1_wrap)
493 return arq2;
494 else if (r1_wrap && r2_wrap) {
495 /* both behind the head */
496 if (s1 <= s2)
497 return arq1;
498 else
499 return arq2;
500 }
501
502 /* Both requests in front of the head */
503 if (d1 < d2)
504 return arq1;
505 else if (d2 < d1)
506 return arq2;
507 else {
508 if (s1 >= s2)
509 return arq1;
510 else
511 return arq2;
512 }
513}
514
515/*
516 * as_find_next_arq finds the next request after @prev in elevator order.
517 * this with as_choose_req form the basis for how the scheduler chooses
518 * what request to process next. Anticipation works on top of this.
519 */
520static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
521{
522 const int data_dir = last->is_sync;
523 struct as_rq *ret;
524 struct rb_node *rbnext = rb_next(&last->rb_node);
525 struct rb_node *rbprev = rb_prev(&last->rb_node);
526 struct as_rq *arq_next, *arq_prev;
527
528 BUG_ON(!ON_RB(&last->rb_node));
529
530 if (rbprev)
531 arq_prev = rb_entry_arq(rbprev);
532 else
533 arq_prev = NULL;
534
535 if (rbnext)
536 arq_next = rb_entry_arq(rbnext);
537 else {
538 arq_next = as_find_first_arq(ad, data_dir);
539 if (arq_next == last)
540 arq_next = NULL;
541 }
542
543 ret = as_choose_req(ad, arq_next, arq_prev);
544
545 return ret;
546}
547
548/*
549 * anticipatory scheduling functions follow
550 */
551
552/*
553 * as_antic_expired tells us when we have anticipated too long.
554 * The funny "absolute difference" math on the elapsed time is to handle
555 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
556 */
557static int as_antic_expired(struct as_data *ad)
558{
559 long delta_jif;
560
561 delta_jif = jiffies - ad->antic_start;
562 if (unlikely(delta_jif < 0))
563 delta_jif = -delta_jif;
564 if (delta_jif < ad->antic_expire)
565 return 0;
566
567 return 1;
568}
569
570/*
571 * as_antic_waitnext starts anticipating that a nice request will soon be
572 * submitted. See also as_antic_waitreq
573 */
574static void as_antic_waitnext(struct as_data *ad)
575{
576 unsigned long timeout;
577
578 BUG_ON(ad->antic_status != ANTIC_OFF
579 && ad->antic_status != ANTIC_WAIT_REQ);
580
581 timeout = ad->antic_start + ad->antic_expire;
582
583 mod_timer(&ad->antic_timer, timeout);
584
585 ad->antic_status = ANTIC_WAIT_NEXT;
586}
587
588/*
589 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
590 * until the request that we're anticipating on has finished. This means we
591 * are timing from when the candidate process wakes up hopefully.
592 */
593static void as_antic_waitreq(struct as_data *ad)
594{
595 BUG_ON(ad->antic_status == ANTIC_FINISHED);
596 if (ad->antic_status == ANTIC_OFF) {
597 if (!ad->io_context || ad->ioc_finished)
598 as_antic_waitnext(ad);
599 else
600 ad->antic_status = ANTIC_WAIT_REQ;
601 }
602}
603
604/*
605 * This is called directly by the functions in this file to stop anticipation.
606 * We kill the timer and schedule a call to the request_fn asap.
607 */
608static void as_antic_stop(struct as_data *ad)
609{
610 int status = ad->antic_status;
611
612 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
613 if (status == ANTIC_WAIT_NEXT)
614 del_timer(&ad->antic_timer);
615 ad->antic_status = ANTIC_FINISHED;
616 /* see as_work_handler */
617 kblockd_schedule_work(&ad->antic_work);
618 }
619}
620
621/*
622 * as_antic_timeout is the timer function set by as_antic_waitnext.
623 */
624static void as_antic_timeout(unsigned long data)
625{
626 struct request_queue *q = (struct request_queue *)data;
627 struct as_data *ad = q->elevator->elevator_data;
628 unsigned long flags;
629
630 spin_lock_irqsave(q->queue_lock, flags);
631 if (ad->antic_status == ANTIC_WAIT_REQ
632 || ad->antic_status == ANTIC_WAIT_NEXT) {
633 struct as_io_context *aic = ad->io_context->aic;
634
635 ad->antic_status = ANTIC_FINISHED;
636 kblockd_schedule_work(&ad->antic_work);
637
638 if (aic->ttime_samples == 0) {
639 /* process anticipated on has exitted or timed out*/
640 ad->exit_prob = (7*ad->exit_prob + 256)/8;
641 }
642 }
643 spin_unlock_irqrestore(q->queue_lock, flags);
644}
645
646/*
647 * as_close_req decides if one request is considered "close" to the
648 * previous one issued.
649 */
650static int as_close_req(struct as_data *ad, struct as_rq *arq)
651{
652 unsigned long delay; /* milliseconds */
653 sector_t last = ad->last_sector[ad->batch_data_dir];
654 sector_t next = arq->request->sector;
655 sector_t delta; /* acceptable close offset (in sectors) */
656
657 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
658 delay = 0;
659 else
660 delay = ((jiffies - ad->antic_start) * 1000) / HZ;
661
662 if (delay <= 1)
663 delta = 64;
664 else if (delay <= 20 && delay <= ad->antic_expire)
665 delta = 64 << (delay-1);
666 else
667 return 1;
668
669 return (last - (delta>>1) <= next) && (next <= last + delta);
670}
671
672/*
673 * as_can_break_anticipation returns true if we have been anticipating this
674 * request.
675 *
676 * It also returns true if the process against which we are anticipating
677 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
678 * dispatch it ASAP, because we know that application will not be submitting
679 * any new reads.
680 *
681 * If the task which has submitted the request has exitted, break anticipation.
682 *
683 * If this task has queued some other IO, do not enter enticipation.
684 */
685static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
686{
687 struct io_context *ioc;
688 struct as_io_context *aic;
689 sector_t s;
690
691 ioc = ad->io_context;
692 BUG_ON(!ioc);
693
694 if (arq && ioc == arq->io_context) {
695 /* request from same process */
696 return 1;
697 }
698
699 if (ad->ioc_finished && as_antic_expired(ad)) {
700 /*
701 * In this situation status should really be FINISHED,
702 * however the timer hasn't had the chance to run yet.
703 */
704 return 1;
705 }
706
707 aic = ioc->aic;
708 if (!aic)
709 return 0;
710
711 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
712 /* process anticipated on has exitted */
713 if (aic->ttime_samples == 0)
714 ad->exit_prob = (7*ad->exit_prob + 256)/8;
715 return 1;
716 }
717
718 if (atomic_read(&aic->nr_queued) > 0) {
719 /* process has more requests queued */
720 return 1;
721 }
722
723 if (atomic_read(&aic->nr_dispatched) > 0) {
724 /* process has more requests dispatched */
725 return 1;
726 }
727
728 if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
729 /*
730 * Found a close request that is not one of ours.
731 *
732 * This makes close requests from another process reset
733 * our thinktime delay. Is generally useful when there are
734 * two or more cooperating processes working in the same
735 * area.
736 */
737 spin_lock(&aic->lock);
738 aic->last_end_request = jiffies;
739 spin_unlock(&aic->lock);
740 return 1;
741 }
742
743
744 if (aic->ttime_samples == 0) {
745 if (ad->new_ttime_mean > ad->antic_expire)
746 return 1;
747 if (ad->exit_prob > 128)
748 return 1;
749 } else if (aic->ttime_mean > ad->antic_expire) {
750 /* the process thinks too much between requests */
751 return 1;
752 }
753
754 if (!arq)
755 return 0;
756
757 if (ad->last_sector[REQ_SYNC] < arq->request->sector)
758 s = arq->request->sector - ad->last_sector[REQ_SYNC];
759 else
760 s = ad->last_sector[REQ_SYNC] - arq->request->sector;
761
762 if (aic->seek_samples == 0) {
763 /*
764 * Process has just started IO. Use past statistics to
765 * guage success possibility
766 */
767 if (ad->new_seek_mean > s) {
768 /* this request is better than what we're expecting */
769 return 1;
770 }
771
772 } else {
773 if (aic->seek_mean > s) {
774 /* this request is better than what we're expecting */
775 return 1;
776 }
777 }
778
779 return 0;
780}
781
782/*
783 * as_can_anticipate indicates weather we should either run arq
784 * or keep anticipating a better request.
785 */
786static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
787{
788 if (!ad->io_context)
789 /*
790 * Last request submitted was a write
791 */
792 return 0;
793
794 if (ad->antic_status == ANTIC_FINISHED)
795 /*
796 * Don't restart if we have just finished. Run the next request
797 */
798 return 0;
799
800 if (as_can_break_anticipation(ad, arq))
801 /*
802 * This request is a good candidate. Don't keep anticipating,
803 * run it.
804 */
805 return 0;
806
807 /*
808 * OK from here, we haven't finished, and don't have a decent request!
809 * Status is either ANTIC_OFF so start waiting,
810 * ANTIC_WAIT_REQ so continue waiting for request to finish
811 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
812 *
813 */
814
815 return 1;
816}
817
818static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime)
819{
820 /* fixed point: 1.0 == 1<<8 */
821 if (aic->ttime_samples == 0) {
822 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
823 ad->new_ttime_mean = ad->new_ttime_total / 256;
824
825 ad->exit_prob = (7*ad->exit_prob)/8;
826 }
827 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
828 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
829 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
830}
831
832static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist)
833{
834 u64 total;
835
836 if (aic->seek_samples == 0) {
837 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
838 ad->new_seek_mean = ad->new_seek_total / 256;
839 }
840
841 /*
842 * Don't allow the seek distance to get too large from the
843 * odd fragment, pagein, etc
844 */
845 if (aic->seek_samples <= 60) /* second&third seek */
846 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
847 else
848 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
849
850 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
851 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
852 total = aic->seek_total + (aic->seek_samples/2);
853 do_div(total, aic->seek_samples);
854 aic->seek_mean = (sector_t)total;
855}
856
857/*
858 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
859 * updates @aic->ttime_mean based on that. It is called when a new
860 * request is queued.
861 */
862static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
863{
864 struct as_rq *arq = RQ_DATA(rq);
865 int data_dir = arq->is_sync;
866 unsigned long thinktime;
867 sector_t seek_dist;
868
869 if (aic == NULL)
870 return;
871
872 if (data_dir == REQ_SYNC) {
873 unsigned long in_flight = atomic_read(&aic->nr_queued)
874 + atomic_read(&aic->nr_dispatched);
875 spin_lock(&aic->lock);
876 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
877 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
878 /* Calculate read -> read thinktime */
879 if (test_bit(AS_TASK_IORUNNING, &aic->state)
880 && in_flight == 0) {
881 thinktime = jiffies - aic->last_end_request;
882 thinktime = min(thinktime, MAX_THINKTIME-1);
883 } else
884 thinktime = 0;
885 as_update_thinktime(ad, aic, thinktime);
886
887 /* Calculate read -> read seek distance */
888 if (aic->last_request_pos < rq->sector)
889 seek_dist = rq->sector - aic->last_request_pos;
890 else
891 seek_dist = aic->last_request_pos - rq->sector;
892 as_update_seekdist(ad, aic, seek_dist);
893 }
894 aic->last_request_pos = rq->sector + rq->nr_sectors;
895 set_bit(AS_TASK_IOSTARTED, &aic->state);
896 spin_unlock(&aic->lock);
897 }
898}
899
900/*
901 * as_update_arq must be called whenever a request (arq) is added to
902 * the sort_list. This function keeps caches up to date, and checks if the
903 * request might be one we are "anticipating"
904 */
905static void as_update_arq(struct as_data *ad, struct as_rq *arq)
906{
907 const int data_dir = arq->is_sync;
908
909 /* keep the next_arq cache up to date */
910 ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);
911
912 /*
913 * have we been anticipating this request?
914 * or does it come from the same process as the one we are anticipating
915 * for?
916 */
917 if (ad->antic_status == ANTIC_WAIT_REQ
918 || ad->antic_status == ANTIC_WAIT_NEXT) {
919 if (as_can_break_anticipation(ad, arq))
920 as_antic_stop(ad);
921 }
922}
923
924/*
925 * Gathers timings and resizes the write batch automatically
926 */
927static void update_write_batch(struct as_data *ad)
928{
929 unsigned long batch = ad->batch_expire[REQ_ASYNC];
930 long write_time;
931
932 write_time = (jiffies - ad->current_batch_expires) + batch;
933 if (write_time < 0)
934 write_time = 0;
935
936 if (write_time > batch && !ad->write_batch_idled) {
937 if (write_time > batch * 3)
938 ad->write_batch_count /= 2;
939 else
940 ad->write_batch_count--;
941 } else if (write_time < batch && ad->current_write_count == 0) {
942 if (batch > write_time * 3)
943 ad->write_batch_count *= 2;
944 else
945 ad->write_batch_count++;
946 }
947
948 if (ad->write_batch_count < 1)
949 ad->write_batch_count = 1;
950}
951
952/*
953 * as_completed_request is to be called when a request has completed and
954 * returned something to the requesting process, be it an error or data.
955 */
956static void as_completed_request(request_queue_t *q, struct request *rq)
957{
958 struct as_data *ad = q->elevator->elevator_data;
959 struct as_rq *arq = RQ_DATA(rq);
960
961 WARN_ON(!list_empty(&rq->queuelist));
962
963 if (arq->state != AS_RQ_REMOVED) {
964 printk("arq->state %d\n", arq->state);
965 WARN_ON(1);
966 goto out;
967 }
968
969 if (ad->changed_batch && ad->nr_dispatched == 1) {
970 kblockd_schedule_work(&ad->antic_work);
971 ad->changed_batch = 0;
972
973 if (ad->batch_data_dir == REQ_SYNC)
974 ad->new_batch = 1;
975 }
976 WARN_ON(ad->nr_dispatched == 0);
977 ad->nr_dispatched--;
978
979 /*
980 * Start counting the batch from when a request of that direction is
981 * actually serviced. This should help devices with big TCQ windows
982 * and writeback caches
983 */
984 if (ad->new_batch && ad->batch_data_dir == arq->is_sync) {
985 update_write_batch(ad);
986 ad->current_batch_expires = jiffies +
987 ad->batch_expire[REQ_SYNC];
988 ad->new_batch = 0;
989 }
990
991 if (ad->io_context == arq->io_context && ad->io_context) {
992 ad->antic_start = jiffies;
993 ad->ioc_finished = 1;
994 if (ad->antic_status == ANTIC_WAIT_REQ) {
995 /*
996 * We were waiting on this request, now anticipate
997 * the next one
998 */
999 as_antic_waitnext(ad);
1000 }
1001 }
1002
1003 as_put_io_context(arq);
1004out:
1005 arq->state = AS_RQ_POSTSCHED;
1006}
1007
1008/*
1009 * as_remove_queued_request removes a request from the pre dispatch queue
1010 * without updating refcounts. It is expected the caller will drop the
1011 * reference unless it replaces the request at somepart of the elevator
1012 * (ie. the dispatch queue)
1013 */
1014static void as_remove_queued_request(request_queue_t *q, struct request *rq)
1015{
1016 struct as_rq *arq = RQ_DATA(rq);
1017 const int data_dir = arq->is_sync;
1018 struct as_data *ad = q->elevator->elevator_data;
1019
1020 WARN_ON(arq->state != AS_RQ_QUEUED);
1021
1022 if (arq->io_context && arq->io_context->aic) {
1023 BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
1024 atomic_dec(&arq->io_context->aic->nr_queued);
1025 }
1026
1027 /*
1028 * Update the "next_arq" cache if we are about to remove its
1029 * entry
1030 */
1031 if (ad->next_arq[data_dir] == arq)
1032 ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
1033
1034 list_del_init(&arq->fifo);
1035 as_del_arq_hash(arq);
1036 as_del_arq_rb(ad, arq);
1037}
1038
1039/*
1040 * as_fifo_expired returns 0 if there are no expired reads on the fifo,
1041 * 1 otherwise. It is ratelimited so that we only perform the check once per
1042 * `fifo_expire' interval. Otherwise a large number of expired requests
1043 * would create a hopeless seekstorm.
1044 *
1045 * See as_antic_expired comment.
1046 */
1047static int as_fifo_expired(struct as_data *ad, int adir)
1048{
1049 struct as_rq *arq;
1050 long delta_jif;
1051
1052 delta_jif = jiffies - ad->last_check_fifo[adir];
1053 if (unlikely(delta_jif < 0))
1054 delta_jif = -delta_jif;
1055 if (delta_jif < ad->fifo_expire[adir])
1056 return 0;
1057
1058 ad->last_check_fifo[adir] = jiffies;
1059
1060 if (list_empty(&ad->fifo_list[adir]))
1061 return 0;
1062
1063 arq = list_entry_fifo(ad->fifo_list[adir].next);
1064
1065 return time_after(jiffies, arq->expires);
1066}
1067
1068/*
1069 * as_batch_expired returns true if the current batch has expired. A batch
1070 * is a set of reads or a set of writes.
1071 */
1072static inline int as_batch_expired(struct as_data *ad)
1073{
1074 if (ad->changed_batch || ad->new_batch)
1075 return 0;
1076
1077 if (ad->batch_data_dir == REQ_SYNC)
1078 /* TODO! add a check so a complete fifo gets written? */
1079 return time_after(jiffies, ad->current_batch_expires);
1080
1081 return time_after(jiffies, ad->current_batch_expires)
1082 || ad->current_write_count == 0;
1083}
1084
1085/*
1086 * move an entry to dispatch queue
1087 */
1088static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
1089{
1090 struct request *rq = arq->request;
1091 const int data_dir = arq->is_sync;
1092
1093 BUG_ON(!ON_RB(&arq->rb_node));
1094
1095 as_antic_stop(ad);
1096 ad->antic_status = ANTIC_OFF;
1097
1098 /*
1099 * This has to be set in order to be correctly updated by
1100 * as_find_next_arq
1101 */
1102 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
1103
1104 if (data_dir == REQ_SYNC) {
1105 /* In case we have to anticipate after this */
1106 copy_io_context(&ad->io_context, &arq->io_context);
1107 } else {
1108 if (ad->io_context) {
1109 put_io_context(ad->io_context);
1110 ad->io_context = NULL;
1111 }
1112
1113 if (ad->current_write_count != 0)
1114 ad->current_write_count--;
1115 }
1116 ad->ioc_finished = 0;
1117
1118 ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
1119
1120 /*
1121 * take it off the sort and fifo list, add to dispatch queue
1122 */
1123 while (!list_empty(&rq->queuelist)) {
1124 struct request *__rq = list_entry_rq(rq->queuelist.next);
1125 struct as_rq *__arq = RQ_DATA(__rq);
1126
1127 list_del(&__rq->queuelist);
1128
1129 elv_dispatch_add_tail(ad->q, __rq);
1130
1131 if (__arq->io_context && __arq->io_context->aic)
1132 atomic_inc(&__arq->io_context->aic->nr_dispatched);
1133
1134 WARN_ON(__arq->state != AS_RQ_QUEUED);
1135 __arq->state = AS_RQ_DISPATCHED;
1136
1137 ad->nr_dispatched++;
1138 }
1139
1140 as_remove_queued_request(ad->q, rq);
1141 WARN_ON(arq->state != AS_RQ_QUEUED);
1142
1143 elv_dispatch_sort(ad->q, rq);
1144
1145 arq->state = AS_RQ_DISPATCHED;
1146 if (arq->io_context && arq->io_context->aic)
1147 atomic_inc(&arq->io_context->aic->nr_dispatched);
1148 ad->nr_dispatched++;
1149}
1150
1151/*
1152 * as_dispatch_request selects the best request according to
1153 * read/write expire, batch expire, etc, and moves it to the dispatch
1154 * queue. Returns 1 if a request was found, 0 otherwise.
1155 */
1156static int as_dispatch_request(request_queue_t *q, int force)
1157{
1158 struct as_data *ad = q->elevator->elevator_data;
1159 struct as_rq *arq;
1160 const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
1161 const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
1162
1163 if (unlikely(force)) {
1164 /*
1165 * Forced dispatch, accounting is useless. Reset
1166 * accounting states and dump fifo_lists. Note that
1167 * batch_data_dir is reset to REQ_SYNC to avoid
1168 * screwing write batch accounting as write batch
1169 * accounting occurs on W->R transition.
1170 */
1171 int dispatched = 0;
1172
1173 ad->batch_data_dir = REQ_SYNC;
1174 ad->changed_batch = 0;
1175 ad->new_batch = 0;
1176
1177 while (ad->next_arq[REQ_SYNC]) {
1178 as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
1179 dispatched++;
1180 }
1181 ad->last_check_fifo[REQ_SYNC] = jiffies;
1182
1183 while (ad->next_arq[REQ_ASYNC]) {
1184 as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
1185 dispatched++;
1186 }
1187 ad->last_check_fifo[REQ_ASYNC] = jiffies;
1188
1189 return dispatched;
1190 }
1191
1192 /* Signal that the write batch was uncontended, so we can't time it */
1193 if (ad->batch_data_dir == REQ_ASYNC && !reads) {
1194 if (ad->current_write_count == 0 || !writes)
1195 ad->write_batch_idled = 1;
1196 }
1197
1198 if (!(reads || writes)
1199 || ad->antic_status == ANTIC_WAIT_REQ
1200 || ad->antic_status == ANTIC_WAIT_NEXT
1201 || ad->changed_batch)
1202 return 0;
1203
1204 if (!(reads && writes && as_batch_expired(ad)) ) {
1205 /*
1206 * batch is still running or no reads or no writes
1207 */
1208 arq = ad->next_arq[ad->batch_data_dir];
1209
1210 if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
1211 if (as_fifo_expired(ad, REQ_SYNC))
1212 goto fifo_expired;
1213
1214 if (as_can_anticipate(ad, arq)) {
1215 as_antic_waitreq(ad);
1216 return 0;
1217 }
1218 }
1219
1220 if (arq) {
1221 /* we have a "next request" */
1222 if (reads && !writes)
1223 ad->current_batch_expires =
1224 jiffies + ad->batch_expire[REQ_SYNC];
1225 goto dispatch_request;
1226 }
1227 }
1228
1229 /*
1230 * at this point we are not running a batch. select the appropriate
1231 * data direction (read / write)
1232 */
1233
1234 if (reads) {
1235 BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));
1236
1237 if (writes && ad->batch_data_dir == REQ_SYNC)
1238 /*
1239 * Last batch was a read, switch to writes
1240 */
1241 goto dispatch_writes;
1242
1243 if (ad->batch_data_dir == REQ_ASYNC) {
1244 WARN_ON(ad->new_batch);
1245 ad->changed_batch = 1;
1246 }
1247 ad->batch_data_dir = REQ_SYNC;
1248 arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1249 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1250 goto dispatch_request;
1251 }
1252
1253 /*
1254 * the last batch was a read
1255 */
1256
1257 if (writes) {
1258dispatch_writes:
1259 BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
1260
1261 if (ad->batch_data_dir == REQ_SYNC) {
1262 ad->changed_batch = 1;
1263
1264 /*
1265 * new_batch might be 1 when the queue runs out of
1266 * reads. A subsequent submission of a write might
1267 * cause a change of batch before the read is finished.
1268 */
1269 ad->new_batch = 0;
1270 }
1271 ad->batch_data_dir = REQ_ASYNC;
1272 ad->current_write_count = ad->write_batch_count;
1273 ad->write_batch_idled = 0;
1274 arq = ad->next_arq[ad->batch_data_dir];
1275 goto dispatch_request;
1276 }
1277
1278 BUG();
1279 return 0;
1280
1281dispatch_request:
1282 /*
1283 * If a request has expired, service it.
1284 */
1285
1286 if (as_fifo_expired(ad, ad->batch_data_dir)) {
1287fifo_expired:
1288 arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1289 BUG_ON(arq == NULL);
1290 }
1291
1292 if (ad->changed_batch) {
1293 WARN_ON(ad->new_batch);
1294
1295 if (ad->nr_dispatched)
1296 return 0;
1297
1298 if (ad->batch_data_dir == REQ_ASYNC)
1299 ad->current_batch_expires = jiffies +
1300 ad->batch_expire[REQ_ASYNC];
1301 else
1302 ad->new_batch = 1;
1303
1304 ad->changed_batch = 0;
1305 }
1306
1307 /*
1308 * arq is the selected appropriate request.
1309 */
1310 as_move_to_dispatch(ad, arq);
1311
1312 return 1;
1313}
1314
1315/*
1316 * Add arq to a list behind alias
1317 */
1318static inline void
1319as_add_aliased_request(struct as_data *ad, struct as_rq *arq, struct as_rq *alias)
1320{
1321 struct request *req = arq->request;
1322 struct list_head *insert = alias->request->queuelist.prev;
1323
1324 /*
1325 * Transfer list of aliases
1326 */
1327 while (!list_empty(&req->queuelist)) {
1328 struct request *__rq = list_entry_rq(req->queuelist.next);
1329 struct as_rq *__arq = RQ_DATA(__rq);
1330
1331 list_move_tail(&__rq->queuelist, &alias->request->queuelist);
1332
1333 WARN_ON(__arq->state != AS_RQ_QUEUED);
1334 }
1335
1336 /*
1337 * Another request with the same start sector on the rbtree.
1338 * Link this request to that sector. They are untangled in
1339 * as_move_to_dispatch
1340 */
1341 list_add(&arq->request->queuelist, insert);
1342
1343 /*
1344 * Don't want to have to handle merges.
1345 */
1346 as_del_arq_hash(arq);
1347 arq->request->flags |= REQ_NOMERGE;
1348}
1349
1350/*
1351 * add arq to rbtree and fifo
1352 */
1353static void as_add_request(request_queue_t *q, struct request *rq)
1354{
1355 struct as_data *ad = q->elevator->elevator_data;
1356 struct as_rq *arq = RQ_DATA(rq);
1357 struct as_rq *alias;
1358 int data_dir;
1359
1360 if (arq->state != AS_RQ_PRESCHED) {
1361 printk("arq->state: %d\n", arq->state);
1362 WARN_ON(1);
1363 }
1364 arq->state = AS_RQ_NEW;
1365
1366 if (rq_data_dir(arq->request) == READ
1367 || current->flags&PF_SYNCWRITE)
1368 arq->is_sync = 1;
1369 else
1370 arq->is_sync = 0;
1371 data_dir = arq->is_sync;
1372
1373 arq->io_context = as_get_io_context();
1374
1375 if (arq->io_context) {
1376 as_update_iohist(ad, arq->io_context->aic, arq->request);
1377 atomic_inc(&arq->io_context->aic->nr_queued);
1378 }
1379
1380 alias = as_add_arq_rb(ad, arq);
1381 if (!alias) {
1382 /*
1383 * set expire time (only used for reads) and add to fifo list
1384 */
1385 arq->expires = jiffies + ad->fifo_expire[data_dir];
1386 list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
1387
1388 if (rq_mergeable(arq->request))
1389 as_add_arq_hash(ad, arq);
1390 as_update_arq(ad, arq); /* keep state machine up to date */
1391
1392 } else {
1393 as_add_aliased_request(ad, arq, alias);
1394
1395 /*
1396 * have we been anticipating this request?
1397 * or does it come from the same process as the one we are
1398 * anticipating for?
1399 */
1400 if (ad->antic_status == ANTIC_WAIT_REQ
1401 || ad->antic_status == ANTIC_WAIT_NEXT) {
1402 if (as_can_break_anticipation(ad, arq))
1403 as_antic_stop(ad);
1404 }
1405 }
1406
1407 arq->state = AS_RQ_QUEUED;
1408}
1409
1410static void as_activate_request(request_queue_t *q, struct request *rq)
1411{
1412 struct as_rq *arq = RQ_DATA(rq);
1413
1414 WARN_ON(arq->state != AS_RQ_DISPATCHED);
1415 arq->state = AS_RQ_REMOVED;
1416 if (arq->io_context && arq->io_context->aic)
1417 atomic_dec(&arq->io_context->aic->nr_dispatched);
1418}
1419
1420static void as_deactivate_request(request_queue_t *q, struct request *rq)
1421{
1422 struct as_rq *arq = RQ_DATA(rq);
1423
1424 WARN_ON(arq->state != AS_RQ_REMOVED);
1425 arq->state = AS_RQ_DISPATCHED;
1426 if (arq->io_context && arq->io_context->aic)
1427 atomic_inc(&arq->io_context->aic->nr_dispatched);
1428}
1429
1430/*
1431 * as_queue_empty tells us if there are requests left in the device. It may
1432 * not be the case that a driver can get the next request even if the queue
1433 * is not empty - it is used in the block layer to check for plugging and
1434 * merging opportunities
1435 */
1436static int as_queue_empty(request_queue_t *q)
1437{
1438 struct as_data *ad = q->elevator->elevator_data;
1439
1440 return list_empty(&ad->fifo_list[REQ_ASYNC])
1441 && list_empty(&ad->fifo_list[REQ_SYNC]);
1442}
1443
1444static struct request *
1445as_former_request(request_queue_t *q, struct request *rq)
1446{
1447 struct as_rq *arq = RQ_DATA(rq);
1448 struct rb_node *rbprev = rb_prev(&arq->rb_node);
1449 struct request *ret = NULL;
1450
1451 if (rbprev)
1452 ret = rb_entry_arq(rbprev)->request;
1453
1454 return ret;
1455}
1456
1457static struct request *
1458as_latter_request(request_queue_t *q, struct request *rq)
1459{
1460 struct as_rq *arq = RQ_DATA(rq);
1461 struct rb_node *rbnext = rb_next(&arq->rb_node);
1462 struct request *ret = NULL;
1463
1464 if (rbnext)
1465 ret = rb_entry_arq(rbnext)->request;
1466
1467 return ret;
1468}
1469
1470static int
1471as_merge(request_queue_t *q, struct request **req, struct bio *bio)
1472{
1473 struct as_data *ad = q->elevator->elevator_data;
1474 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1475 struct request *__rq;
1476 int ret;
1477
1478 /*
1479 * see if the merge hash can satisfy a back merge
1480 */
1481 __rq = as_find_arq_hash(ad, bio->bi_sector);
1482 if (__rq) {
1483 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
1484
1485 if (elv_rq_merge_ok(__rq, bio)) {
1486 ret = ELEVATOR_BACK_MERGE;
1487 goto out;
1488 }
1489 }
1490
1491 /*
1492 * check for front merge
1493 */
1494 __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
1495 if (__rq) {
1496 BUG_ON(rb_key != rq_rb_key(__rq));
1497
1498 if (elv_rq_merge_ok(__rq, bio)) {
1499 ret = ELEVATOR_FRONT_MERGE;
1500 goto out;
1501 }
1502 }
1503
1504 return ELEVATOR_NO_MERGE;
1505out:
1506 if (ret) {
1507 if (rq_mergeable(__rq))
1508 as_hot_arq_hash(ad, RQ_DATA(__rq));
1509 }
1510 *req = __rq;
1511 return ret;
1512}
1513
1514static void as_merged_request(request_queue_t *q, struct request *req)
1515{
1516 struct as_data *ad = q->elevator->elevator_data;
1517 struct as_rq *arq = RQ_DATA(req);
1518
1519 /*
1520 * hash always needs to be repositioned, key is end sector
1521 */
1522 as_del_arq_hash(arq);
1523 as_add_arq_hash(ad, arq);
1524
1525 /*
1526 * if the merge was a front merge, we need to reposition request
1527 */
1528 if (rq_rb_key(req) != arq->rb_key) {
1529 struct as_rq *alias, *next_arq = NULL;
1530
1531 if (ad->next_arq[arq->is_sync] == arq)
1532 next_arq = as_find_next_arq(ad, arq);
1533
1534 /*
1535 * Note! We should really be moving any old aliased requests
1536 * off this request and try to insert them into the rbtree. We
1537 * currently don't bother. Ditto the next function.
1538 */
1539 as_del_arq_rb(ad, arq);
1540 if ((alias = as_add_arq_rb(ad, arq)) ) {
1541 list_del_init(&arq->fifo);
1542 as_add_aliased_request(ad, arq, alias);
1543 if (next_arq)
1544 ad->next_arq[arq->is_sync] = next_arq;
1545 }
1546 /*
1547 * Note! At this stage of this and the next function, our next
1548 * request may not be optimal - eg the request may have "grown"
1549 * behind the disk head. We currently don't bother adjusting.
1550 */
1551 }
1552}
1553
1554static void
1555as_merged_requests(request_queue_t *q, struct request *req,
1556 struct request *next)
1557{
1558 struct as_data *ad = q->elevator->elevator_data;
1559 struct as_rq *arq = RQ_DATA(req);
1560 struct as_rq *anext = RQ_DATA(next);
1561
1562 BUG_ON(!arq);
1563 BUG_ON(!anext);
1564
1565 /*
1566 * reposition arq (this is the merged request) in hash, and in rbtree
1567 * in case of a front merge
1568 */
1569 as_del_arq_hash(arq);
1570 as_add_arq_hash(ad, arq);
1571
1572 if (rq_rb_key(req) != arq->rb_key) {
1573 struct as_rq *alias, *next_arq = NULL;
1574
1575 if (ad->next_arq[arq->is_sync] == arq)
1576 next_arq = as_find_next_arq(ad, arq);
1577
1578 as_del_arq_rb(ad, arq);
1579 if ((alias = as_add_arq_rb(ad, arq)) ) {
1580 list_del_init(&arq->fifo);
1581 as_add_aliased_request(ad, arq, alias);
1582 if (next_arq)
1583 ad->next_arq[arq->is_sync] = next_arq;
1584 }
1585 }
1586
1587 /*
1588 * if anext expires before arq, assign its expire time to arq
1589 * and move into anext position (anext will be deleted) in fifo
1590 */
1591 if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
1592 if (time_before(anext->expires, arq->expires)) {
1593 list_move(&arq->fifo, &anext->fifo);
1594 arq->expires = anext->expires;
1595 /*
1596 * Don't copy here but swap, because when anext is
1597 * removed below, it must contain the unused context
1598 */
1599 swap_io_context(&arq->io_context, &anext->io_context);
1600 }
1601 }
1602
1603 /*
1604 * Transfer list of aliases
1605 */
1606 while (!list_empty(&next->queuelist)) {
1607 struct request *__rq = list_entry_rq(next->queuelist.next);
1608 struct as_rq *__arq = RQ_DATA(__rq);
1609
1610 list_move_tail(&__rq->queuelist, &req->queuelist);
1611
1612 WARN_ON(__arq->state != AS_RQ_QUEUED);
1613 }
1614
1615 /*
1616 * kill knowledge of next, this one is a goner
1617 */
1618 as_remove_queued_request(q, next);
1619 as_put_io_context(anext);
1620
1621 anext->state = AS_RQ_MERGED;
1622}
1623
1624/*
1625 * This is executed in a "deferred" process context, by kblockd. It calls the
1626 * driver's request_fn so the driver can submit that request.
1627 *
1628 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1629 * state before calling, and don't rely on any state over calls.
1630 *
1631 * FIXME! dispatch queue is not a queue at all!
1632 */
1633static void as_work_handler(void *data)
1634{
1635 struct request_queue *q = data;
1636 unsigned long flags;
1637
1638 spin_lock_irqsave(q->queue_lock, flags);
1639 if (!as_queue_empty(q))
1640 q->request_fn(q);
1641 spin_unlock_irqrestore(q->queue_lock, flags);
1642}
1643
1644static void as_put_request(request_queue_t *q, struct request *rq)
1645{
1646 struct as_data *ad = q->elevator->elevator_data;
1647 struct as_rq *arq = RQ_DATA(rq);
1648
1649 if (!arq) {
1650 WARN_ON(1);
1651 return;
1652 }
1653
1654 if (unlikely(arq->state != AS_RQ_POSTSCHED &&
1655 arq->state != AS_RQ_PRESCHED &&
1656 arq->state != AS_RQ_MERGED)) {
1657 printk("arq->state %d\n", arq->state);
1658 WARN_ON(1);
1659 }
1660
1661 mempool_free(arq, ad->arq_pool);
1662 rq->elevator_private = NULL;
1663}
1664
1665static int as_set_request(request_queue_t *q, struct request *rq,
1666 struct bio *bio, gfp_t gfp_mask)
1667{
1668 struct as_data *ad = q->elevator->elevator_data;
1669 struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
1670
1671 if (arq) {
1672 memset(arq, 0, sizeof(*arq));
1673 RB_CLEAR(&arq->rb_node);
1674 arq->request = rq;
1675 arq->state = AS_RQ_PRESCHED;
1676 arq->io_context = NULL;
1677 INIT_LIST_HEAD(&arq->hash);
1678 arq->on_hash = 0;
1679 INIT_LIST_HEAD(&arq->fifo);
1680 rq->elevator_private = arq;
1681 return 0;
1682 }
1683
1684 return 1;
1685}
1686
1687static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
1688{
1689 int ret = ELV_MQUEUE_MAY;
1690 struct as_data *ad = q->elevator->elevator_data;
1691 struct io_context *ioc;
1692 if (ad->antic_status == ANTIC_WAIT_REQ ||
1693 ad->antic_status == ANTIC_WAIT_NEXT) {
1694 ioc = as_get_io_context();
1695 if (ad->io_context == ioc)
1696 ret = ELV_MQUEUE_MUST;
1697 put_io_context(ioc);
1698 }
1699
1700 return ret;
1701}
1702
1703static void as_exit_queue(elevator_t *e)
1704{
1705 struct as_data *ad = e->elevator_data;
1706
1707 del_timer_sync(&ad->antic_timer);
1708 kblockd_flush();
1709
1710 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
1711 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
1712
1713 mempool_destroy(ad->arq_pool);
1714 put_io_context(ad->io_context);
1715 kfree(ad->hash);
1716 kfree(ad);
1717}
1718
1719/*
1720 * initialize elevator private data (as_data), and alloc a arq for
1721 * each request on the free lists
1722 */
1723static int as_init_queue(request_queue_t *q, elevator_t *e)
1724{
1725 struct as_data *ad;
1726 int i;
1727
1728 if (!arq_pool)
1729 return -ENOMEM;
1730
1731 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
1732 if (!ad)
1733 return -ENOMEM;
1734 memset(ad, 0, sizeof(*ad));
1735
1736 ad->q = q; /* Identify what queue the data belongs to */
1737
1738 ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES,
1739 GFP_KERNEL, q->node);
1740 if (!ad->hash) {
1741 kfree(ad);
1742 return -ENOMEM;
1743 }
1744
1745 ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1746 mempool_free_slab, arq_pool, q->node);
1747 if (!ad->arq_pool) {
1748 kfree(ad->hash);
1749 kfree(ad);
1750 return -ENOMEM;
1751 }
1752
1753 /* anticipatory scheduling helpers */
1754 ad->antic_timer.function = as_antic_timeout;
1755 ad->antic_timer.data = (unsigned long)q;
1756 init_timer(&ad->antic_timer);
1757 INIT_WORK(&ad->antic_work, as_work_handler, q);
1758
1759 for (i = 0; i < AS_HASH_ENTRIES; i++)
1760 INIT_LIST_HEAD(&ad->hash[i]);
1761
1762 INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
1763 INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
1764 ad->sort_list[REQ_SYNC] = RB_ROOT;
1765 ad->sort_list[REQ_ASYNC] = RB_ROOT;
1766 ad->fifo_expire[REQ_SYNC] = default_read_expire;
1767 ad->fifo_expire[REQ_ASYNC] = default_write_expire;
1768 ad->antic_expire = default_antic_expire;
1769 ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
1770 ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
1771 e->elevator_data = ad;
1772
1773 ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
1774 ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
1775 if (ad->write_batch_count < 2)
1776 ad->write_batch_count = 2;
1777
1778 return 0;
1779}
1780
1781/*
1782 * sysfs parts below
1783 */
1784struct as_fs_entry {
1785 struct attribute attr;
1786 ssize_t (*show)(struct as_data *, char *);
1787 ssize_t (*store)(struct as_data *, const char *, size_t);
1788};
1789
1790static ssize_t
1791as_var_show(unsigned int var, char *page)
1792{
1793 return sprintf(page, "%d\n", var);
1794}
1795
1796static ssize_t
1797as_var_store(unsigned long *var, const char *page, size_t count)
1798{
1799 char *p = (char *) page;
1800
1801 *var = simple_strtoul(p, &p, 10);
1802 return count;
1803}
1804
1805static ssize_t as_est_show(struct as_data *ad, char *page)
1806{
1807 int pos = 0;
1808
1809 pos += sprintf(page+pos, "%lu %% exit probability\n", 100*ad->exit_prob/256);
1810 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1811 pos += sprintf(page+pos, "%llu sectors new seek distance\n", (unsigned long long)ad->new_seek_mean);
1812
1813 return pos;
1814}
1815
1816#define SHOW_FUNCTION(__FUNC, __VAR) \
1817static ssize_t __FUNC(struct as_data *ad, char *page) \
1818{ \
1819 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1820}
1821SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
1822SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
1823SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
1824SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
1825SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
1826#undef SHOW_FUNCTION
1827
1828#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1829static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \
1830{ \
1831 int ret = as_var_store(__PTR, (page), count); \
1832 if (*(__PTR) < (MIN)) \
1833 *(__PTR) = (MIN); \
1834 else if (*(__PTR) > (MAX)) \
1835 *(__PTR) = (MAX); \
1836 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1837 return ret; \
1838}
1839STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
1840STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
1841STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
1842STORE_FUNCTION(as_read_batchexpire_store,
1843 &ad->batch_expire[REQ_SYNC], 0, INT_MAX);
1844STORE_FUNCTION(as_write_batchexpire_store,
1845 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
1846#undef STORE_FUNCTION
1847
1848static struct as_fs_entry as_est_entry = {
1849 .attr = {.name = "est_time", .mode = S_IRUGO },
1850 .show = as_est_show,
1851};
1852static struct as_fs_entry as_readexpire_entry = {
1853 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
1854 .show = as_readexpire_show,
1855 .store = as_readexpire_store,
1856};
1857static struct as_fs_entry as_writeexpire_entry = {
1858 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
1859 .show = as_writeexpire_show,
1860 .store = as_writeexpire_store,
1861};
1862static struct as_fs_entry as_anticexpire_entry = {
1863 .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
1864 .show = as_anticexpire_show,
1865 .store = as_anticexpire_store,
1866};
1867static struct as_fs_entry as_read_batchexpire_entry = {
1868 .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
1869 .show = as_read_batchexpire_show,
1870 .store = as_read_batchexpire_store,
1871};
1872static struct as_fs_entry as_write_batchexpire_entry = {
1873 .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
1874 .show = as_write_batchexpire_show,
1875 .store = as_write_batchexpire_store,
1876};
1877
1878static struct attribute *default_attrs[] = {
1879 &as_est_entry.attr,
1880 &as_readexpire_entry.attr,
1881 &as_writeexpire_entry.attr,
1882 &as_anticexpire_entry.attr,
1883 &as_read_batchexpire_entry.attr,
1884 &as_write_batchexpire_entry.attr,
1885 NULL,
1886};
1887
1888#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
1889
1890static ssize_t
1891as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1892{
1893 elevator_t *e = container_of(kobj, elevator_t, kobj);
1894 struct as_fs_entry *entry = to_as(attr);
1895
1896 if (!entry->show)
1897 return -EIO;
1898
1899 return entry->show(e->elevator_data, page);
1900}
1901
1902static ssize_t
1903as_attr_store(struct kobject *kobj, struct attribute *attr,
1904 const char *page, size_t length)
1905{
1906 elevator_t *e = container_of(kobj, elevator_t, kobj);
1907 struct as_fs_entry *entry = to_as(attr);
1908
1909 if (!entry->store)
1910 return -EIO;
1911
1912 return entry->store(e->elevator_data, page, length);
1913}
1914
1915static struct sysfs_ops as_sysfs_ops = {
1916 .show = as_attr_show,
1917 .store = as_attr_store,
1918};
1919
1920static struct kobj_type as_ktype = {
1921 .sysfs_ops = &as_sysfs_ops,
1922 .default_attrs = default_attrs,
1923};
1924
1925static struct elevator_type iosched_as = {
1926 .ops = {
1927 .elevator_merge_fn = as_merge,
1928 .elevator_merged_fn = as_merged_request,
1929 .elevator_merge_req_fn = as_merged_requests,
1930 .elevator_dispatch_fn = as_dispatch_request,
1931 .elevator_add_req_fn = as_add_request,
1932 .elevator_activate_req_fn = as_activate_request,
1933 .elevator_deactivate_req_fn = as_deactivate_request,
1934 .elevator_queue_empty_fn = as_queue_empty,
1935 .elevator_completed_req_fn = as_completed_request,
1936 .elevator_former_req_fn = as_former_request,
1937 .elevator_latter_req_fn = as_latter_request,
1938 .elevator_set_req_fn = as_set_request,
1939 .elevator_put_req_fn = as_put_request,
1940 .elevator_may_queue_fn = as_may_queue,
1941 .elevator_init_fn = as_init_queue,
1942 .elevator_exit_fn = as_exit_queue,
1943 },
1944
1945 .elevator_ktype = &as_ktype,
1946 .elevator_name = "anticipatory",
1947 .elevator_owner = THIS_MODULE,
1948};
1949
1950static int __init as_init(void)
1951{
1952 int ret;
1953
1954 arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq),
1955 0, 0, NULL, NULL);
1956 if (!arq_pool)
1957 return -ENOMEM;
1958
1959 ret = elv_register(&iosched_as);
1960 if (!ret) {
1961 /*
1962 * don't allow AS to get unregistered, since we would have
1963 * to browse all tasks in the system and release their
1964 * as_io_context first
1965 */
1966 __module_get(THIS_MODULE);
1967 return 0;
1968 }
1969
1970 kmem_cache_destroy(arq_pool);
1971 return ret;
1972}
1973
1974static void __exit as_exit(void)
1975{
1976 elv_unregister(&iosched_as);
1977 kmem_cache_destroy(arq_pool);
1978}
1979
1980module_init(as_init);
1981module_exit(as_exit);
1982
1983MODULE_AUTHOR("Nick Piggin");
1984MODULE_LICENSE("GPL");
1985MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
new file mode 100644
index 00000000000..ecacca9c877
--- /dev/null
+++ b/block/cfq-iosched.c
@@ -0,0 +1,2428 @@
1/*
2 * linux/drivers/block/cfq-iosched.c
3 *
4 * CFQ, or complete fairness queueing, disk scheduler.
5 *
6 * Based on ideas from a previously unfinished io
7 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8 *
9 * Copyright (C) 2003 Jens Axboe <axboe@suse.de>
10 */
11#include <linux/kernel.h>
12#include <linux/fs.h>
13#include <linux/blkdev.h>
14#include <linux/elevator.h>
15#include <linux/bio.h>
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/slab.h>
19#include <linux/init.h>
20#include <linux/compiler.h>
21#include <linux/hash.h>
22#include <linux/rbtree.h>
23#include <linux/mempool.h>
24#include <linux/ioprio.h>
25#include <linux/writeback.h>
26
27/*
28 * tunables
29 */
30static int cfq_quantum = 4; /* max queue in one round of service */
31static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/
32static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
33static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
34static int cfq_back_penalty = 2; /* penalty of a backwards seek */
35
36static int cfq_slice_sync = HZ / 10;
37static int cfq_slice_async = HZ / 25;
38static int cfq_slice_async_rq = 2;
39static int cfq_slice_idle = HZ / 100;
40
41#define CFQ_IDLE_GRACE (HZ / 10)
42#define CFQ_SLICE_SCALE (5)
43
44#define CFQ_KEY_ASYNC (0)
45#define CFQ_KEY_ANY (0xffff)
46
47/*
48 * disable queueing at the driver/hardware level
49 */
50static int cfq_max_depth = 2;
51
52/*
53 * for the hash of cfqq inside the cfqd
54 */
55#define CFQ_QHASH_SHIFT 6
56#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT)
57#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
58
59/*
60 * for the hash of crq inside the cfqq
61 */
62#define CFQ_MHASH_SHIFT 6
63#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3)
64#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT)
65#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
66#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
67#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
68
69#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
70#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
71
72#define RQ_DATA(rq) (rq)->elevator_private
73
74/*
75 * rb-tree defines
76 */
77#define RB_NONE (2)
78#define RB_EMPTY(node) ((node)->rb_node == NULL)
79#define RB_CLEAR_COLOR(node) (node)->rb_color = RB_NONE
80#define RB_CLEAR(node) do { \
81 (node)->rb_parent = NULL; \
82 RB_CLEAR_COLOR((node)); \
83 (node)->rb_right = NULL; \
84 (node)->rb_left = NULL; \
85} while (0)
86#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL)
87#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node)
88#define rq_rb_key(rq) (rq)->sector
89
90static kmem_cache_t *crq_pool;
91static kmem_cache_t *cfq_pool;
92static kmem_cache_t *cfq_ioc_pool;
93
94#define CFQ_PRIO_LISTS IOPRIO_BE_NR
95#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
96#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
97#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
98
99#define ASYNC (0)
100#define SYNC (1)
101
102#define cfq_cfqq_dispatched(cfqq) \
103 ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
104
105#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC)
106
107#define cfq_cfqq_sync(cfqq) \
108 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
109
110/*
111 * Per block device queue structure
112 */
113struct cfq_data {
114 atomic_t ref;
115 request_queue_t *queue;
116
117 /*
118 * rr list of queues with requests and the count of them
119 */
120 struct list_head rr_list[CFQ_PRIO_LISTS];
121 struct list_head busy_rr;
122 struct list_head cur_rr;
123 struct list_head idle_rr;
124 unsigned int busy_queues;
125
126 /*
127 * non-ordered list of empty cfqq's
128 */
129 struct list_head empty_list;
130
131 /*
132 * cfqq lookup hash
133 */
134 struct hlist_head *cfq_hash;
135
136 /*
137 * global crq hash for all queues
138 */
139 struct hlist_head *crq_hash;
140
141 unsigned int max_queued;
142
143 mempool_t *crq_pool;
144
145 int rq_in_driver;
146
147 /*
148 * schedule slice state info
149 */
150 /*
151 * idle window management
152 */
153 struct timer_list idle_slice_timer;
154 struct work_struct unplug_work;
155
156 struct cfq_queue *active_queue;
157 struct cfq_io_context *active_cic;
158 int cur_prio, cur_end_prio;
159 unsigned int dispatch_slice;
160
161 struct timer_list idle_class_timer;
162
163 sector_t last_sector;
164 unsigned long last_end_request;
165
166 unsigned int rq_starved;
167
168 /*
169 * tunables, see top of file
170 */
171 unsigned int cfq_quantum;
172 unsigned int cfq_queued;
173 unsigned int cfq_fifo_expire[2];
174 unsigned int cfq_back_penalty;
175 unsigned int cfq_back_max;
176 unsigned int cfq_slice[2];
177 unsigned int cfq_slice_async_rq;
178 unsigned int cfq_slice_idle;
179 unsigned int cfq_max_depth;
180};
181
182/*
183 * Per process-grouping structure
184 */
185struct cfq_queue {
186 /* reference count */
187 atomic_t ref;
188 /* parent cfq_data */
189 struct cfq_data *cfqd;
190 /* cfqq lookup hash */
191 struct hlist_node cfq_hash;
192 /* hash key */
193 unsigned int key;
194 /* on either rr or empty list of cfqd */
195 struct list_head cfq_list;
196 /* sorted list of pending requests */
197 struct rb_root sort_list;
198 /* if fifo isn't expired, next request to serve */
199 struct cfq_rq *next_crq;
200 /* requests queued in sort_list */
201 int queued[2];
202 /* currently allocated requests */
203 int allocated[2];
204 /* fifo list of requests in sort_list */
205 struct list_head fifo;
206
207 unsigned long slice_start;
208 unsigned long slice_end;
209 unsigned long slice_left;
210 unsigned long service_last;
211
212 /* number of requests that are on the dispatch list */
213 int on_dispatch[2];
214
215 /* io prio of this group */
216 unsigned short ioprio, org_ioprio;
217 unsigned short ioprio_class, org_ioprio_class;
218
219 /* various state flags, see below */
220 unsigned int flags;
221};
222
223struct cfq_rq {
224 struct rb_node rb_node;
225 sector_t rb_key;
226 struct request *request;
227 struct hlist_node hash;
228
229 struct cfq_queue *cfq_queue;
230 struct cfq_io_context *io_context;
231
232 unsigned int crq_flags;
233};
234
235enum cfqq_state_flags {
236 CFQ_CFQQ_FLAG_on_rr = 0,
237 CFQ_CFQQ_FLAG_wait_request,
238 CFQ_CFQQ_FLAG_must_alloc,
239 CFQ_CFQQ_FLAG_must_alloc_slice,
240 CFQ_CFQQ_FLAG_must_dispatch,
241 CFQ_CFQQ_FLAG_fifo_expire,
242 CFQ_CFQQ_FLAG_idle_window,
243 CFQ_CFQQ_FLAG_prio_changed,
244 CFQ_CFQQ_FLAG_expired,
245};
246
247#define CFQ_CFQQ_FNS(name) \
248static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
249{ \
250 cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
251} \
252static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
253{ \
254 cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
255} \
256static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
257{ \
258 return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
259}
260
261CFQ_CFQQ_FNS(on_rr);
262CFQ_CFQQ_FNS(wait_request);
263CFQ_CFQQ_FNS(must_alloc);
264CFQ_CFQQ_FNS(must_alloc_slice);
265CFQ_CFQQ_FNS(must_dispatch);
266CFQ_CFQQ_FNS(fifo_expire);
267CFQ_CFQQ_FNS(idle_window);
268CFQ_CFQQ_FNS(prio_changed);
269CFQ_CFQQ_FNS(expired);
270#undef CFQ_CFQQ_FNS
271
272enum cfq_rq_state_flags {
273 CFQ_CRQ_FLAG_is_sync = 0,
274};
275
276#define CFQ_CRQ_FNS(name) \
277static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \
278{ \
279 crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \
280} \
281static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \
282{ \
283 crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \
284} \
285static inline int cfq_crq_##name(const struct cfq_rq *crq) \
286{ \
287 return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \
288}
289
290CFQ_CRQ_FNS(is_sync);
291#undef CFQ_CRQ_FNS
292
293static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
294static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
295static void cfq_put_cfqd(struct cfq_data *cfqd);
296
297#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
298
299/*
300 * lots of deadline iosched dupes, can be abstracted later...
301 */
302static inline void cfq_del_crq_hash(struct cfq_rq *crq)
303{
304 hlist_del_init(&crq->hash);
305}
306
307static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
308{
309 const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
310
311 hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
312}
313
314static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
315{
316 struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
317 struct hlist_node *entry, *next;
318
319 hlist_for_each_safe(entry, next, hash_list) {
320 struct cfq_rq *crq = list_entry_hash(entry);
321 struct request *__rq = crq->request;
322
323 if (!rq_mergeable(__rq)) {
324 cfq_del_crq_hash(crq);
325 continue;
326 }
327
328 if (rq_hash_key(__rq) == offset)
329 return __rq;
330 }
331
332 return NULL;
333}
334
335/*
336 * scheduler run of queue, if there are requests pending and no one in the
337 * driver that will restart queueing
338 */
339static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
340{
341 if (!cfqd->rq_in_driver && cfqd->busy_queues)
342 kblockd_schedule_work(&cfqd->unplug_work);
343}
344
345static int cfq_queue_empty(request_queue_t *q)
346{
347 struct cfq_data *cfqd = q->elevator->elevator_data;
348
349 return !cfqd->busy_queues;
350}
351
352/*
353 * Lifted from AS - choose which of crq1 and crq2 that is best served now.
354 * We choose the request that is closest to the head right now. Distance
355 * behind the head are penalized and only allowed to a certain extent.
356 */
357static struct cfq_rq *
358cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
359{
360 sector_t last, s1, s2, d1 = 0, d2 = 0;
361 int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */
362 unsigned long back_max;
363
364 if (crq1 == NULL || crq1 == crq2)
365 return crq2;
366 if (crq2 == NULL)
367 return crq1;
368
369 if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
370 return crq1;
371 else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
372 return crq2;
373
374 s1 = crq1->request->sector;
375 s2 = crq2->request->sector;
376
377 last = cfqd->last_sector;
378
379 /*
380 * by definition, 1KiB is 2 sectors
381 */
382 back_max = cfqd->cfq_back_max * 2;
383
384 /*
385 * Strict one way elevator _except_ in the case where we allow
386 * short backward seeks which are biased as twice the cost of a
387 * similar forward seek.
388 */
389 if (s1 >= last)
390 d1 = s1 - last;
391 else if (s1 + back_max >= last)
392 d1 = (last - s1) * cfqd->cfq_back_penalty;
393 else
394 r1_wrap = 1;
395
396 if (s2 >= last)
397 d2 = s2 - last;
398 else if (s2 + back_max >= last)
399 d2 = (last - s2) * cfqd->cfq_back_penalty;
400 else
401 r2_wrap = 1;
402
403 /* Found required data */
404 if (!r1_wrap && r2_wrap)
405 return crq1;
406 else if (!r2_wrap && r1_wrap)
407 return crq2;
408 else if (r1_wrap && r2_wrap) {
409 /* both behind the head */
410 if (s1 <= s2)
411 return crq1;
412 else
413 return crq2;
414 }
415
416 /* Both requests in front of the head */
417 if (d1 < d2)
418 return crq1;
419 else if (d2 < d1)
420 return crq2;
421 else {
422 if (s1 >= s2)
423 return crq1;
424 else
425 return crq2;
426 }
427}
428
429/*
430 * would be nice to take fifo expire time into account as well
431 */
432static struct cfq_rq *
433cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
434 struct cfq_rq *last)
435{
436 struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
437 struct rb_node *rbnext, *rbprev;
438
439 if (!(rbnext = rb_next(&last->rb_node))) {
440 rbnext = rb_first(&cfqq->sort_list);
441 if (rbnext == &last->rb_node)
442 rbnext = NULL;
443 }
444
445 rbprev = rb_prev(&last->rb_node);
446
447 if (rbprev)
448 crq_prev = rb_entry_crq(rbprev);
449 if (rbnext)
450 crq_next = rb_entry_crq(rbnext);
451
452 return cfq_choose_req(cfqd, crq_next, crq_prev);
453}
454
455static void cfq_update_next_crq(struct cfq_rq *crq)
456{
457 struct cfq_queue *cfqq = crq->cfq_queue;
458
459 if (cfqq->next_crq == crq)
460 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
461}
462
463static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
464{
465 struct cfq_data *cfqd = cfqq->cfqd;
466 struct list_head *list, *entry;
467
468 BUG_ON(!cfq_cfqq_on_rr(cfqq));
469
470 list_del(&cfqq->cfq_list);
471
472 if (cfq_class_rt(cfqq))
473 list = &cfqd->cur_rr;
474 else if (cfq_class_idle(cfqq))
475 list = &cfqd->idle_rr;
476 else {
477 /*
478 * if cfqq has requests in flight, don't allow it to be
479 * found in cfq_set_active_queue before it has finished them.
480 * this is done to increase fairness between a process that
481 * has lots of io pending vs one that only generates one
482 * sporadically or synchronously
483 */
484 if (cfq_cfqq_dispatched(cfqq))
485 list = &cfqd->busy_rr;
486 else
487 list = &cfqd->rr_list[cfqq->ioprio];
488 }
489
490 /*
491 * if queue was preempted, just add to front to be fair. busy_rr
492 * isn't sorted.
493 */
494 if (preempted || list == &cfqd->busy_rr) {
495 list_add(&cfqq->cfq_list, list);
496 return;
497 }
498
499 /*
500 * sort by when queue was last serviced
501 */
502 entry = list;
503 while ((entry = entry->prev) != list) {
504 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
505
506 if (!__cfqq->service_last)
507 break;
508 if (time_before(__cfqq->service_last, cfqq->service_last))
509 break;
510 }
511
512 list_add(&cfqq->cfq_list, entry);
513}
514
515/*
516 * add to busy list of queues for service, trying to be fair in ordering
517 * the pending list according to last request service
518 */
519static inline void
520cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
521{
522 BUG_ON(cfq_cfqq_on_rr(cfqq));
523 cfq_mark_cfqq_on_rr(cfqq);
524 cfqd->busy_queues++;
525
526 cfq_resort_rr_list(cfqq, 0);
527}
528
529static inline void
530cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
531{
532 BUG_ON(!cfq_cfqq_on_rr(cfqq));
533 cfq_clear_cfqq_on_rr(cfqq);
534 list_move(&cfqq->cfq_list, &cfqd->empty_list);
535
536 BUG_ON(!cfqd->busy_queues);
537 cfqd->busy_queues--;
538}
539
540/*
541 * rb tree support functions
542 */
543static inline void cfq_del_crq_rb(struct cfq_rq *crq)
544{
545 struct cfq_queue *cfqq = crq->cfq_queue;
546 struct cfq_data *cfqd = cfqq->cfqd;
547 const int sync = cfq_crq_is_sync(crq);
548
549 BUG_ON(!cfqq->queued[sync]);
550 cfqq->queued[sync]--;
551
552 cfq_update_next_crq(crq);
553
554 rb_erase(&crq->rb_node, &cfqq->sort_list);
555 RB_CLEAR_COLOR(&crq->rb_node);
556
557 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
558 cfq_del_cfqq_rr(cfqd, cfqq);
559}
560
561static struct cfq_rq *
562__cfq_add_crq_rb(struct cfq_rq *crq)
563{
564 struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
565 struct rb_node *parent = NULL;
566 struct cfq_rq *__crq;
567
568 while (*p) {
569 parent = *p;
570 __crq = rb_entry_crq(parent);
571
572 if (crq->rb_key < __crq->rb_key)
573 p = &(*p)->rb_left;
574 else if (crq->rb_key > __crq->rb_key)
575 p = &(*p)->rb_right;
576 else
577 return __crq;
578 }
579
580 rb_link_node(&crq->rb_node, parent, p);
581 return NULL;
582}
583
584static void cfq_add_crq_rb(struct cfq_rq *crq)
585{
586 struct cfq_queue *cfqq = crq->cfq_queue;
587 struct cfq_data *cfqd = cfqq->cfqd;
588 struct request *rq = crq->request;
589 struct cfq_rq *__alias;
590
591 crq->rb_key = rq_rb_key(rq);
592 cfqq->queued[cfq_crq_is_sync(crq)]++;
593
594 /*
595 * looks a little odd, but the first insert might return an alias.
596 * if that happens, put the alias on the dispatch list
597 */
598 while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
599 cfq_dispatch_insert(cfqd->queue, __alias);
600
601 rb_insert_color(&crq->rb_node, &cfqq->sort_list);
602
603 if (!cfq_cfqq_on_rr(cfqq))
604 cfq_add_cfqq_rr(cfqd, cfqq);
605
606 /*
607 * check if this request is a better next-serve candidate
608 */
609 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
610}
611
612static inline void
613cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
614{
615 rb_erase(&crq->rb_node, &cfqq->sort_list);
616 cfqq->queued[cfq_crq_is_sync(crq)]--;
617
618 cfq_add_crq_rb(crq);
619}
620
621static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
622
623{
624 struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
625 struct rb_node *n;
626
627 if (!cfqq)
628 goto out;
629
630 n = cfqq->sort_list.rb_node;
631 while (n) {
632 struct cfq_rq *crq = rb_entry_crq(n);
633
634 if (sector < crq->rb_key)
635 n = n->rb_left;
636 else if (sector > crq->rb_key)
637 n = n->rb_right;
638 else
639 return crq->request;
640 }
641
642out:
643 return NULL;
644}
645
646static void cfq_activate_request(request_queue_t *q, struct request *rq)
647{
648 struct cfq_data *cfqd = q->elevator->elevator_data;
649
650 cfqd->rq_in_driver++;
651}
652
653static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
654{
655 struct cfq_data *cfqd = q->elevator->elevator_data;
656
657 WARN_ON(!cfqd->rq_in_driver);
658 cfqd->rq_in_driver--;
659}
660
661static void cfq_remove_request(struct request *rq)
662{
663 struct cfq_rq *crq = RQ_DATA(rq);
664
665 list_del_init(&rq->queuelist);
666 cfq_del_crq_rb(crq);
667 cfq_del_crq_hash(crq);
668}
669
670static int
671cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
672{
673 struct cfq_data *cfqd = q->elevator->elevator_data;
674 struct request *__rq;
675 int ret;
676
677 __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
678 if (__rq && elv_rq_merge_ok(__rq, bio)) {
679 ret = ELEVATOR_BACK_MERGE;
680 goto out;
681 }
682
683 __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
684 if (__rq && elv_rq_merge_ok(__rq, bio)) {
685 ret = ELEVATOR_FRONT_MERGE;
686 goto out;
687 }
688
689 return ELEVATOR_NO_MERGE;
690out:
691 *req = __rq;
692 return ret;
693}
694
695static void cfq_merged_request(request_queue_t *q, struct request *req)
696{
697 struct cfq_data *cfqd = q->elevator->elevator_data;
698 struct cfq_rq *crq = RQ_DATA(req);
699
700 cfq_del_crq_hash(crq);
701 cfq_add_crq_hash(cfqd, crq);
702
703 if (rq_rb_key(req) != crq->rb_key) {
704 struct cfq_queue *cfqq = crq->cfq_queue;
705
706 cfq_update_next_crq(crq);
707 cfq_reposition_crq_rb(cfqq, crq);
708 }
709}
710
711static void
712cfq_merged_requests(request_queue_t *q, struct request *rq,
713 struct request *next)
714{
715 cfq_merged_request(q, rq);
716
717 /*
718 * reposition in fifo if next is older than rq
719 */
720 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
721 time_before(next->start_time, rq->start_time))
722 list_move(&rq->queuelist, &next->queuelist);
723
724 cfq_remove_request(next);
725}
726
727static inline void
728__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
729{
730 if (cfqq) {
731 /*
732 * stop potential idle class queues waiting service
733 */
734 del_timer(&cfqd->idle_class_timer);
735
736 cfqq->slice_start = jiffies;
737 cfqq->slice_end = 0;
738 cfqq->slice_left = 0;
739 cfq_clear_cfqq_must_alloc_slice(cfqq);
740 cfq_clear_cfqq_fifo_expire(cfqq);
741 cfq_clear_cfqq_expired(cfqq);
742 }
743
744 cfqd->active_queue = cfqq;
745}
746
747/*
748 * 0
749 * 0,1
750 * 0,1,2
751 * 0,1,2,3
752 * 0,1,2,3,4
753 * 0,1,2,3,4,5
754 * 0,1,2,3,4,5,6
755 * 0,1,2,3,4,5,6,7
756 */
757static int cfq_get_next_prio_level(struct cfq_data *cfqd)
758{
759 int prio, wrap;
760
761 prio = -1;
762 wrap = 0;
763 do {
764 int p;
765
766 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
767 if (!list_empty(&cfqd->rr_list[p])) {
768 prio = p;
769 break;
770 }
771 }
772
773 if (prio != -1)
774 break;
775 cfqd->cur_prio = 0;
776 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
777 cfqd->cur_end_prio = 0;
778 if (wrap)
779 break;
780 wrap = 1;
781 }
782 } while (1);
783
784 if (unlikely(prio == -1))
785 return -1;
786
787 BUG_ON(prio >= CFQ_PRIO_LISTS);
788
789 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
790
791 cfqd->cur_prio = prio + 1;
792 if (cfqd->cur_prio > cfqd->cur_end_prio) {
793 cfqd->cur_end_prio = cfqd->cur_prio;
794 cfqd->cur_prio = 0;
795 }
796 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
797 cfqd->cur_prio = 0;
798 cfqd->cur_end_prio = 0;
799 }
800
801 return prio;
802}
803
804static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
805{
806 struct cfq_queue *cfqq;
807
808 /*
809 * if current queue is expired but not done with its requests yet,
810 * wait for that to happen
811 */
812 if ((cfqq = cfqd->active_queue) != NULL) {
813 if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq))
814 return NULL;
815 }
816
817 /*
818 * if current list is non-empty, grab first entry. if it is empty,
819 * get next prio level and grab first entry then if any are spliced
820 */
821 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
822 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
823
824 /*
825 * if we have idle queues and no rt or be queues had pending
826 * requests, either allow immediate service if the grace period
827 * has passed or arm the idle grace timer
828 */
829 if (!cfqq && !list_empty(&cfqd->idle_rr)) {
830 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
831
832 if (time_after_eq(jiffies, end))
833 cfqq = list_entry_cfqq(cfqd->idle_rr.next);
834 else
835 mod_timer(&cfqd->idle_class_timer, end);
836 }
837
838 __cfq_set_active_queue(cfqd, cfqq);
839 return cfqq;
840}
841
842/*
843 * current cfqq expired its slice (or was too idle), select new one
844 */
845static void
846__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
847 int preempted)
848{
849 unsigned long now = jiffies;
850
851 if (cfq_cfqq_wait_request(cfqq))
852 del_timer(&cfqd->idle_slice_timer);
853
854 if (!preempted && !cfq_cfqq_dispatched(cfqq))
855 cfqq->service_last = now;
856
857 cfq_clear_cfqq_must_dispatch(cfqq);
858 cfq_clear_cfqq_wait_request(cfqq);
859
860 /*
861 * store what was left of this slice, if the queue idled out
862 * or was preempted
863 */
864 if (time_after(now, cfqq->slice_end))
865 cfqq->slice_left = now - cfqq->slice_end;
866 else
867 cfqq->slice_left = 0;
868
869 if (cfq_cfqq_on_rr(cfqq))
870 cfq_resort_rr_list(cfqq, preempted);
871
872 if (cfqq == cfqd->active_queue)
873 cfqd->active_queue = NULL;
874
875 if (cfqd->active_cic) {
876 put_io_context(cfqd->active_cic->ioc);
877 cfqd->active_cic = NULL;
878 }
879
880 cfqd->dispatch_slice = 0;
881}
882
883static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
884{
885 struct cfq_queue *cfqq = cfqd->active_queue;
886
887 if (cfqq) {
888 /*
889 * use deferred expiry, if there are requests in progress as
890 * not to disturb the slice of the next queue
891 */
892 if (cfq_cfqq_dispatched(cfqq))
893 cfq_mark_cfqq_expired(cfqq);
894 else
895 __cfq_slice_expired(cfqd, cfqq, preempted);
896 }
897}
898
899static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
900
901{
902 WARN_ON(!RB_EMPTY(&cfqq->sort_list));
903 WARN_ON(cfqq != cfqd->active_queue);
904
905 /*
906 * idle is disabled, either manually or by past process history
907 */
908 if (!cfqd->cfq_slice_idle)
909 return 0;
910 if (!cfq_cfqq_idle_window(cfqq))
911 return 0;
912 /*
913 * task has exited, don't wait
914 */
915 if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
916 return 0;
917
918 cfq_mark_cfqq_must_dispatch(cfqq);
919 cfq_mark_cfqq_wait_request(cfqq);
920
921 if (!timer_pending(&cfqd->idle_slice_timer)) {
922 unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
923
924 cfqd->idle_slice_timer.expires = jiffies + slice_left;
925 add_timer(&cfqd->idle_slice_timer);
926 }
927
928 return 1;
929}
930
931static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
932{
933 struct cfq_data *cfqd = q->elevator->elevator_data;
934 struct cfq_queue *cfqq = crq->cfq_queue;
935
936 cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
937 cfq_remove_request(crq->request);
938 cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
939 elv_dispatch_sort(q, crq->request);
940}
941
942/*
943 * return expired entry, or NULL to just start from scratch in rbtree
944 */
945static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
946{
947 struct cfq_data *cfqd = cfqq->cfqd;
948 struct request *rq;
949 struct cfq_rq *crq;
950
951 if (cfq_cfqq_fifo_expire(cfqq))
952 return NULL;
953
954 if (!list_empty(&cfqq->fifo)) {
955 int fifo = cfq_cfqq_class_sync(cfqq);
956
957 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
958 rq = crq->request;
959 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
960 cfq_mark_cfqq_fifo_expire(cfqq);
961 return crq;
962 }
963 }
964
965 return NULL;
966}
967
968/*
969 * Scale schedule slice based on io priority. Use the sync time slice only
970 * if a queue is marked sync and has sync io queued. A sync queue with async
971 * io only, should not get full sync slice length.
972 */
973static inline int
974cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
975{
976 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
977
978 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
979
980 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
981}
982
983static inline void
984cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
985{
986 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
987}
988
989static inline int
990cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
991{
992 const int base_rq = cfqd->cfq_slice_async_rq;
993
994 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
995
996 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
997}
998
999/*
1000 * get next queue for service
1001 */
1002static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
1003{
1004 unsigned long now = jiffies;
1005 struct cfq_queue *cfqq;
1006
1007 cfqq = cfqd->active_queue;
1008 if (!cfqq)
1009 goto new_queue;
1010
1011 if (cfq_cfqq_expired(cfqq))
1012 goto new_queue;
1013
1014 /*
1015 * slice has expired
1016 */
1017 if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1018 goto expire;
1019
1020 /*
1021 * if queue has requests, dispatch one. if not, check if
1022 * enough slice is left to wait for one
1023 */
1024 if (!RB_EMPTY(&cfqq->sort_list))
1025 goto keep_queue;
1026 else if (!force && cfq_cfqq_class_sync(cfqq) &&
1027 time_before(now, cfqq->slice_end)) {
1028 if (cfq_arm_slice_timer(cfqd, cfqq))
1029 return NULL;
1030 }
1031
1032expire:
1033 cfq_slice_expired(cfqd, 0);
1034new_queue:
1035 cfqq = cfq_set_active_queue(cfqd);
1036keep_queue:
1037 return cfqq;
1038}
1039
1040static int
1041__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1042 int max_dispatch)
1043{
1044 int dispatched = 0;
1045
1046 BUG_ON(RB_EMPTY(&cfqq->sort_list));
1047
1048 do {
1049 struct cfq_rq *crq;
1050
1051 /*
1052 * follow expired path, else get first next available
1053 */
1054 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1055 crq = cfqq->next_crq;
1056
1057 /*
1058 * finally, insert request into driver dispatch list
1059 */
1060 cfq_dispatch_insert(cfqd->queue, crq);
1061
1062 cfqd->dispatch_slice++;
1063 dispatched++;
1064
1065 if (!cfqd->active_cic) {
1066 atomic_inc(&crq->io_context->ioc->refcount);
1067 cfqd->active_cic = crq->io_context;
1068 }
1069
1070 if (RB_EMPTY(&cfqq->sort_list))
1071 break;
1072
1073 } while (dispatched < max_dispatch);
1074
1075 /*
1076 * if slice end isn't set yet, set it. if at least one request was
1077 * sync, use the sync time slice value
1078 */
1079 if (!cfqq->slice_end)
1080 cfq_set_prio_slice(cfqd, cfqq);
1081
1082 /*
1083 * expire an async queue immediately if it has used up its slice. idle
1084 * queue always expire after 1 dispatch round.
1085 */
1086 if ((!cfq_cfqq_sync(cfqq) &&
1087 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1088 cfq_class_idle(cfqq))
1089 cfq_slice_expired(cfqd, 0);
1090
1091 return dispatched;
1092}
1093
1094static int
1095cfq_dispatch_requests(request_queue_t *q, int force)
1096{
1097 struct cfq_data *cfqd = q->elevator->elevator_data;
1098 struct cfq_queue *cfqq;
1099
1100 if (!cfqd->busy_queues)
1101 return 0;
1102
1103 cfqq = cfq_select_queue(cfqd, force);
1104 if (cfqq) {
1105 int max_dispatch;
1106
1107 /*
1108 * if idle window is disabled, allow queue buildup
1109 */
1110 if (!cfq_cfqq_idle_window(cfqq) &&
1111 cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1112 return 0;
1113
1114 cfq_clear_cfqq_must_dispatch(cfqq);
1115 cfq_clear_cfqq_wait_request(cfqq);
1116 del_timer(&cfqd->idle_slice_timer);
1117
1118 if (!force) {
1119 max_dispatch = cfqd->cfq_quantum;
1120 if (cfq_class_idle(cfqq))
1121 max_dispatch = 1;
1122 } else
1123 max_dispatch = INT_MAX;
1124
1125 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1126 }
1127
1128 return 0;
1129}
1130
1131/*
1132 * task holds one reference to the queue, dropped when task exits. each crq
1133 * in-flight on this queue also holds a reference, dropped when crq is freed.
1134 *
1135 * queue lock must be held here.
1136 */
1137static void cfq_put_queue(struct cfq_queue *cfqq)
1138{
1139 struct cfq_data *cfqd = cfqq->cfqd;
1140
1141 BUG_ON(atomic_read(&cfqq->ref) <= 0);
1142
1143 if (!atomic_dec_and_test(&cfqq->ref))
1144 return;
1145
1146 BUG_ON(rb_first(&cfqq->sort_list));
1147 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1148 BUG_ON(cfq_cfqq_on_rr(cfqq));
1149
1150 if (unlikely(cfqd->active_queue == cfqq)) {
1151 __cfq_slice_expired(cfqd, cfqq, 0);
1152 cfq_schedule_dispatch(cfqd);
1153 }
1154
1155 cfq_put_cfqd(cfqq->cfqd);
1156
1157 /*
1158 * it's on the empty list and still hashed
1159 */
1160 list_del(&cfqq->cfq_list);
1161 hlist_del(&cfqq->cfq_hash);
1162 kmem_cache_free(cfq_pool, cfqq);
1163}
1164
1165static inline struct cfq_queue *
1166__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1167 const int hashval)
1168{
1169 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1170 struct hlist_node *entry, *next;
1171
1172 hlist_for_each_safe(entry, next, hash_list) {
1173 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1174 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
1175
1176 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
1177 return __cfqq;
1178 }
1179
1180 return NULL;
1181}
1182
1183static struct cfq_queue *
1184cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1185{
1186 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1187}
1188
1189static void cfq_free_io_context(struct cfq_io_context *cic)
1190{
1191 struct cfq_io_context *__cic;
1192 struct list_head *entry, *next;
1193
1194 list_for_each_safe(entry, next, &cic->list) {
1195 __cic = list_entry(entry, struct cfq_io_context, list);
1196 kmem_cache_free(cfq_ioc_pool, __cic);
1197 }
1198
1199 kmem_cache_free(cfq_ioc_pool, cic);
1200}
1201
1202/*
1203 * Called with interrupts disabled
1204 */
1205static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1206{
1207 struct cfq_data *cfqd = cic->cfqq->cfqd;
1208 request_queue_t *q = cfqd->queue;
1209
1210 WARN_ON(!irqs_disabled());
1211
1212 spin_lock(q->queue_lock);
1213
1214 if (unlikely(cic->cfqq == cfqd->active_queue)) {
1215 __cfq_slice_expired(cfqd, cic->cfqq, 0);
1216 cfq_schedule_dispatch(cfqd);
1217 }
1218
1219 cfq_put_queue(cic->cfqq);
1220 cic->cfqq = NULL;
1221 spin_unlock(q->queue_lock);
1222}
1223
1224/*
1225 * Another task may update the task cic list, if it is doing a queue lookup
1226 * on its behalf. cfq_cic_lock excludes such concurrent updates
1227 */
1228static void cfq_exit_io_context(struct cfq_io_context *cic)
1229{
1230 struct cfq_io_context *__cic;
1231 struct list_head *entry;
1232 unsigned long flags;
1233
1234 local_irq_save(flags);
1235
1236 /*
1237 * put the reference this task is holding to the various queues
1238 */
1239 list_for_each(entry, &cic->list) {
1240 __cic = list_entry(entry, struct cfq_io_context, list);
1241 cfq_exit_single_io_context(__cic);
1242 }
1243
1244 cfq_exit_single_io_context(cic);
1245 local_irq_restore(flags);
1246}
1247
1248static struct cfq_io_context *
1249cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1250{
1251 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1252
1253 if (cic) {
1254 INIT_LIST_HEAD(&cic->list);
1255 cic->cfqq = NULL;
1256 cic->key = NULL;
1257 cic->last_end_request = jiffies;
1258 cic->ttime_total = 0;
1259 cic->ttime_samples = 0;
1260 cic->ttime_mean = 0;
1261 cic->dtor = cfq_free_io_context;
1262 cic->exit = cfq_exit_io_context;
1263 }
1264
1265 return cic;
1266}
1267
1268static void cfq_init_prio_data(struct cfq_queue *cfqq)
1269{
1270 struct task_struct *tsk = current;
1271 int ioprio_class;
1272
1273 if (!cfq_cfqq_prio_changed(cfqq))
1274 return;
1275
1276 ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1277 switch (ioprio_class) {
1278 default:
1279 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1280 case IOPRIO_CLASS_NONE:
1281 /*
1282 * no prio set, place us in the middle of the BE classes
1283 */
1284 cfqq->ioprio = task_nice_ioprio(tsk);
1285 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1286 break;
1287 case IOPRIO_CLASS_RT:
1288 cfqq->ioprio = task_ioprio(tsk);
1289 cfqq->ioprio_class = IOPRIO_CLASS_RT;
1290 break;
1291 case IOPRIO_CLASS_BE:
1292 cfqq->ioprio = task_ioprio(tsk);
1293 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1294 break;
1295 case IOPRIO_CLASS_IDLE:
1296 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1297 cfqq->ioprio = 7;
1298 cfq_clear_cfqq_idle_window(cfqq);
1299 break;
1300 }
1301
1302 /*
1303 * keep track of original prio settings in case we have to temporarily
1304 * elevate the priority of this queue
1305 */
1306 cfqq->org_ioprio = cfqq->ioprio;
1307 cfqq->org_ioprio_class = cfqq->ioprio_class;
1308
1309 if (cfq_cfqq_on_rr(cfqq))
1310 cfq_resort_rr_list(cfqq, 0);
1311
1312 cfq_clear_cfqq_prio_changed(cfqq);
1313}
1314
1315static inline void changed_ioprio(struct cfq_queue *cfqq)
1316{
1317 if (cfqq) {
1318 struct cfq_data *cfqd = cfqq->cfqd;
1319
1320 spin_lock(cfqd->queue->queue_lock);
1321 cfq_mark_cfqq_prio_changed(cfqq);
1322 cfq_init_prio_data(cfqq);
1323 spin_unlock(cfqd->queue->queue_lock);
1324 }
1325}
1326
1327/*
1328 * callback from sys_ioprio_set, irqs are disabled
1329 */
1330static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1331{
1332 struct cfq_io_context *cic = ioc->cic;
1333
1334 changed_ioprio(cic->cfqq);
1335
1336 list_for_each_entry(cic, &cic->list, list)
1337 changed_ioprio(cic->cfqq);
1338
1339 return 0;
1340}
1341
1342static struct cfq_queue *
1343cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
1344 gfp_t gfp_mask)
1345{
1346 const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1347 struct cfq_queue *cfqq, *new_cfqq = NULL;
1348
1349retry:
1350 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1351
1352 if (!cfqq) {
1353 if (new_cfqq) {
1354 cfqq = new_cfqq;
1355 new_cfqq = NULL;
1356 } else if (gfp_mask & __GFP_WAIT) {
1357 spin_unlock_irq(cfqd->queue->queue_lock);
1358 new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1359 spin_lock_irq(cfqd->queue->queue_lock);
1360 goto retry;
1361 } else {
1362 cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1363 if (!cfqq)
1364 goto out;
1365 }
1366
1367 memset(cfqq, 0, sizeof(*cfqq));
1368
1369 INIT_HLIST_NODE(&cfqq->cfq_hash);
1370 INIT_LIST_HEAD(&cfqq->cfq_list);
1371 RB_CLEAR_ROOT(&cfqq->sort_list);
1372 INIT_LIST_HEAD(&cfqq->fifo);
1373
1374 cfqq->key = key;
1375 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1376 atomic_set(&cfqq->ref, 0);
1377 cfqq->cfqd = cfqd;
1378 atomic_inc(&cfqd->ref);
1379 cfqq->service_last = 0;
1380 /*
1381 * set ->slice_left to allow preemption for a new process
1382 */
1383 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1384 cfq_mark_cfqq_idle_window(cfqq);
1385 cfq_mark_cfqq_prio_changed(cfqq);
1386 cfq_init_prio_data(cfqq);
1387 }
1388
1389 if (new_cfqq)
1390 kmem_cache_free(cfq_pool, new_cfqq);
1391
1392 atomic_inc(&cfqq->ref);
1393out:
1394 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1395 return cfqq;
1396}
1397
1398/*
1399 * Setup general io context and cfq io context. There can be several cfq
1400 * io contexts per general io context, if this process is doing io to more
1401 * than one device managed by cfq. Note that caller is holding a reference to
1402 * cfqq, so we don't need to worry about it disappearing
1403 */
1404static struct cfq_io_context *
1405cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
1406{
1407 struct io_context *ioc = NULL;
1408 struct cfq_io_context *cic;
1409
1410 might_sleep_if(gfp_mask & __GFP_WAIT);
1411
1412 ioc = get_io_context(gfp_mask);
1413 if (!ioc)
1414 return NULL;
1415
1416 if ((cic = ioc->cic) == NULL) {
1417 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1418
1419 if (cic == NULL)
1420 goto err;
1421
1422 /*
1423 * manually increment generic io_context usage count, it
1424 * cannot go away since we are already holding one ref to it
1425 */
1426 ioc->cic = cic;
1427 ioc->set_ioprio = cfq_ioc_set_ioprio;
1428 cic->ioc = ioc;
1429 cic->key = cfqd;
1430 atomic_inc(&cfqd->ref);
1431 } else {
1432 struct cfq_io_context *__cic;
1433
1434 /*
1435 * the first cic on the list is actually the head itself
1436 */
1437 if (cic->key == cfqd)
1438 goto out;
1439
1440 /*
1441 * cic exists, check if we already are there. linear search
1442 * should be ok here, the list will usually not be more than
1443 * 1 or a few entries long
1444 */
1445 list_for_each_entry(__cic, &cic->list, list) {
1446 /*
1447 * this process is already holding a reference to
1448 * this queue, so no need to get one more
1449 */
1450 if (__cic->key == cfqd) {
1451 cic = __cic;
1452 goto out;
1453 }
1454 }
1455
1456 /*
1457 * nope, process doesn't have a cic assoicated with this
1458 * cfqq yet. get a new one and add to list
1459 */
1460 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1461 if (__cic == NULL)
1462 goto err;
1463
1464 __cic->ioc = ioc;
1465 __cic->key = cfqd;
1466 atomic_inc(&cfqd->ref);
1467 list_add(&__cic->list, &cic->list);
1468 cic = __cic;
1469 }
1470
1471out:
1472 return cic;
1473err:
1474 put_io_context(ioc);
1475 return NULL;
1476}
1477
1478static void
1479cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1480{
1481 unsigned long elapsed, ttime;
1482
1483 /*
1484 * if this context already has stuff queued, thinktime is from
1485 * last queue not last end
1486 */
1487#if 0
1488 if (time_after(cic->last_end_request, cic->last_queue))
1489 elapsed = jiffies - cic->last_end_request;
1490 else
1491 elapsed = jiffies - cic->last_queue;
1492#else
1493 elapsed = jiffies - cic->last_end_request;
1494#endif
1495
1496 ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1497
1498 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1499 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1500 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1501}
1502
1503#define sample_valid(samples) ((samples) > 80)
1504
1505/*
1506 * Disable idle window if the process thinks too long or seeks so much that
1507 * it doesn't matter
1508 */
1509static void
1510cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1511 struct cfq_io_context *cic)
1512{
1513 int enable_idle = cfq_cfqq_idle_window(cfqq);
1514
1515 if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1516 enable_idle = 0;
1517 else if (sample_valid(cic->ttime_samples)) {
1518 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1519 enable_idle = 0;
1520 else
1521 enable_idle = 1;
1522 }
1523
1524 if (enable_idle)
1525 cfq_mark_cfqq_idle_window(cfqq);
1526 else
1527 cfq_clear_cfqq_idle_window(cfqq);
1528}
1529
1530
1531/*
1532 * Check if new_cfqq should preempt the currently active queue. Return 0 for
1533 * no or if we aren't sure, a 1 will cause a preempt.
1534 */
1535static int
1536cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1537 struct cfq_rq *crq)
1538{
1539 struct cfq_queue *cfqq = cfqd->active_queue;
1540
1541 if (cfq_class_idle(new_cfqq))
1542 return 0;
1543
1544 if (!cfqq)
1545 return 1;
1546
1547 if (cfq_class_idle(cfqq))
1548 return 1;
1549 if (!cfq_cfqq_wait_request(new_cfqq))
1550 return 0;
1551 /*
1552 * if it doesn't have slice left, forget it
1553 */
1554 if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1555 return 0;
1556 if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1557 return 1;
1558
1559 return 0;
1560}
1561
1562/*
1563 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1564 * let it have half of its nominal slice.
1565 */
1566static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1567{
1568 struct cfq_queue *__cfqq, *next;
1569
1570 list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1571 cfq_resort_rr_list(__cfqq, 1);
1572
1573 if (!cfqq->slice_left)
1574 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1575
1576 cfqq->slice_end = cfqq->slice_left + jiffies;
1577 __cfq_slice_expired(cfqd, cfqq, 1);
1578 __cfq_set_active_queue(cfqd, cfqq);
1579}
1580
1581/*
1582 * should really be a ll_rw_blk.c helper
1583 */
1584static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1585{
1586 request_queue_t *q = cfqd->queue;
1587
1588 if (!blk_queue_plugged(q))
1589 q->request_fn(q);
1590 else
1591 __generic_unplug_device(q);
1592}
1593
1594/*
1595 * Called when a new fs request (crq) is added (to cfqq). Check if there's
1596 * something we should do about it
1597 */
1598static void
1599cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1600 struct cfq_rq *crq)
1601{
1602 struct cfq_io_context *cic;
1603
1604 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1605
1606 /*
1607 * we never wait for an async request and we don't allow preemption
1608 * of an async request. so just return early
1609 */
1610 if (!cfq_crq_is_sync(crq))
1611 return;
1612
1613 cic = crq->io_context;
1614
1615 cfq_update_io_thinktime(cfqd, cic);
1616 cfq_update_idle_window(cfqd, cfqq, cic);
1617
1618 cic->last_queue = jiffies;
1619
1620 if (cfqq == cfqd->active_queue) {
1621 /*
1622 * if we are waiting for a request for this queue, let it rip
1623 * immediately and flag that we must not expire this queue
1624 * just now
1625 */
1626 if (cfq_cfqq_wait_request(cfqq)) {
1627 cfq_mark_cfqq_must_dispatch(cfqq);
1628 del_timer(&cfqd->idle_slice_timer);
1629 cfq_start_queueing(cfqd, cfqq);
1630 }
1631 } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1632 /*
1633 * not the active queue - expire current slice if it is
1634 * idle and has expired it's mean thinktime or this new queue
1635 * has some old slice time left and is of higher priority
1636 */
1637 cfq_preempt_queue(cfqd, cfqq);
1638 cfq_mark_cfqq_must_dispatch(cfqq);
1639 cfq_start_queueing(cfqd, cfqq);
1640 }
1641}
1642
1643static void cfq_insert_request(request_queue_t *q, struct request *rq)
1644{
1645 struct cfq_data *cfqd = q->elevator->elevator_data;
1646 struct cfq_rq *crq = RQ_DATA(rq);
1647 struct cfq_queue *cfqq = crq->cfq_queue;
1648
1649 cfq_init_prio_data(cfqq);
1650
1651 cfq_add_crq_rb(crq);
1652
1653 list_add_tail(&rq->queuelist, &cfqq->fifo);
1654
1655 if (rq_mergeable(rq))
1656 cfq_add_crq_hash(cfqd, crq);
1657
1658 cfq_crq_enqueued(cfqd, cfqq, crq);
1659}
1660
1661static void cfq_completed_request(request_queue_t *q, struct request *rq)
1662{
1663 struct cfq_rq *crq = RQ_DATA(rq);
1664 struct cfq_queue *cfqq = crq->cfq_queue;
1665 struct cfq_data *cfqd = cfqq->cfqd;
1666 const int sync = cfq_crq_is_sync(crq);
1667 unsigned long now;
1668
1669 now = jiffies;
1670
1671 WARN_ON(!cfqd->rq_in_driver);
1672 WARN_ON(!cfqq->on_dispatch[sync]);
1673 cfqd->rq_in_driver--;
1674 cfqq->on_dispatch[sync]--;
1675
1676 if (!cfq_class_idle(cfqq))
1677 cfqd->last_end_request = now;
1678
1679 if (!cfq_cfqq_dispatched(cfqq)) {
1680 if (cfq_cfqq_on_rr(cfqq)) {
1681 cfqq->service_last = now;
1682 cfq_resort_rr_list(cfqq, 0);
1683 }
1684 if (cfq_cfqq_expired(cfqq)) {
1685 __cfq_slice_expired(cfqd, cfqq, 0);
1686 cfq_schedule_dispatch(cfqd);
1687 }
1688 }
1689
1690 if (cfq_crq_is_sync(crq))
1691 crq->io_context->last_end_request = now;
1692}
1693
1694static struct request *
1695cfq_former_request(request_queue_t *q, struct request *rq)
1696{
1697 struct cfq_rq *crq = RQ_DATA(rq);
1698 struct rb_node *rbprev = rb_prev(&crq->rb_node);
1699
1700 if (rbprev)
1701 return rb_entry_crq(rbprev)->request;
1702
1703 return NULL;
1704}
1705
1706static struct request *
1707cfq_latter_request(request_queue_t *q, struct request *rq)
1708{
1709 struct cfq_rq *crq = RQ_DATA(rq);
1710 struct rb_node *rbnext = rb_next(&crq->rb_node);
1711
1712 if (rbnext)
1713 return rb_entry_crq(rbnext)->request;
1714
1715 return NULL;
1716}
1717
1718/*
1719 * we temporarily boost lower priority queues if they are holding fs exclusive
1720 * resources. they are boosted to normal prio (CLASS_BE/4)
1721 */
1722static void cfq_prio_boost(struct cfq_queue *cfqq)
1723{
1724 const int ioprio_class = cfqq->ioprio_class;
1725 const int ioprio = cfqq->ioprio;
1726
1727 if (has_fs_excl()) {
1728 /*
1729 * boost idle prio on transactions that would lock out other
1730 * users of the filesystem
1731 */
1732 if (cfq_class_idle(cfqq))
1733 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1734 if (cfqq->ioprio > IOPRIO_NORM)
1735 cfqq->ioprio = IOPRIO_NORM;
1736 } else {
1737 /*
1738 * check if we need to unboost the queue
1739 */
1740 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1741 cfqq->ioprio_class = cfqq->org_ioprio_class;
1742 if (cfqq->ioprio != cfqq->org_ioprio)
1743 cfqq->ioprio = cfqq->org_ioprio;
1744 }
1745
1746 /*
1747 * refile between round-robin lists if we moved the priority class
1748 */
1749 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1750 cfq_cfqq_on_rr(cfqq))
1751 cfq_resort_rr_list(cfqq, 0);
1752}
1753
1754static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1755{
1756 if (rw == READ || process_sync(task))
1757 return task->pid;
1758
1759 return CFQ_KEY_ASYNC;
1760}
1761
1762static inline int
1763__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1764 struct task_struct *task, int rw)
1765{
1766#if 1
1767 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1768 !cfq_cfqq_must_alloc_slice(cfqq)) {
1769 cfq_mark_cfqq_must_alloc_slice(cfqq);
1770 return ELV_MQUEUE_MUST;
1771 }
1772
1773 return ELV_MQUEUE_MAY;
1774#else
1775 if (!cfqq || task->flags & PF_MEMALLOC)
1776 return ELV_MQUEUE_MAY;
1777 if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1778 if (cfq_cfqq_wait_request(cfqq))
1779 return ELV_MQUEUE_MUST;
1780
1781 /*
1782 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1783 * can quickly flood the queue with writes from a single task
1784 */
1785 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1786 cfq_mark_cfqq_must_alloc_slice(cfqq);
1787 return ELV_MQUEUE_MUST;
1788 }
1789
1790 return ELV_MQUEUE_MAY;
1791 }
1792 if (cfq_class_idle(cfqq))
1793 return ELV_MQUEUE_NO;
1794 if (cfqq->allocated[rw] >= cfqd->max_queued) {
1795 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1796 int ret = ELV_MQUEUE_NO;
1797
1798 if (ioc && ioc->nr_batch_requests)
1799 ret = ELV_MQUEUE_MAY;
1800
1801 put_io_context(ioc);
1802 return ret;
1803 }
1804
1805 return ELV_MQUEUE_MAY;
1806#endif
1807}
1808
1809static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1810{
1811 struct cfq_data *cfqd = q->elevator->elevator_data;
1812 struct task_struct *tsk = current;
1813 struct cfq_queue *cfqq;
1814
1815 /*
1816 * don't force setup of a queue from here, as a call to may_queue
1817 * does not necessarily imply that a request actually will be queued.
1818 * so just lookup a possibly existing queue, or return 'may queue'
1819 * if that fails
1820 */
1821 cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1822 if (cfqq) {
1823 cfq_init_prio_data(cfqq);
1824 cfq_prio_boost(cfqq);
1825
1826 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1827 }
1828
1829 return ELV_MQUEUE_MAY;
1830}
1831
1832static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1833{
1834 struct cfq_data *cfqd = q->elevator->elevator_data;
1835 struct request_list *rl = &q->rq;
1836
1837 if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1838 smp_mb();
1839 if (waitqueue_active(&rl->wait[READ]))
1840 wake_up(&rl->wait[READ]);
1841 }
1842
1843 if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1844 smp_mb();
1845 if (waitqueue_active(&rl->wait[WRITE]))
1846 wake_up(&rl->wait[WRITE]);
1847 }
1848}
1849
1850/*
1851 * queue lock held here
1852 */
1853static void cfq_put_request(request_queue_t *q, struct request *rq)
1854{
1855 struct cfq_data *cfqd = q->elevator->elevator_data;
1856 struct cfq_rq *crq = RQ_DATA(rq);
1857
1858 if (crq) {
1859 struct cfq_queue *cfqq = crq->cfq_queue;
1860 const int rw = rq_data_dir(rq);
1861
1862 BUG_ON(!cfqq->allocated[rw]);
1863 cfqq->allocated[rw]--;
1864
1865 put_io_context(crq->io_context->ioc);
1866
1867 mempool_free(crq, cfqd->crq_pool);
1868 rq->elevator_private = NULL;
1869
1870 cfq_check_waiters(q, cfqq);
1871 cfq_put_queue(cfqq);
1872 }
1873}
1874
1875/*
1876 * Allocate cfq data structures associated with this request.
1877 */
1878static int
1879cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1880 gfp_t gfp_mask)
1881{
1882 struct cfq_data *cfqd = q->elevator->elevator_data;
1883 struct task_struct *tsk = current;
1884 struct cfq_io_context *cic;
1885 const int rw = rq_data_dir(rq);
1886 pid_t key = cfq_queue_pid(tsk, rw);
1887 struct cfq_queue *cfqq;
1888 struct cfq_rq *crq;
1889 unsigned long flags;