1 /*
2 * linux/drivers/block/ll_rw_blk.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
9 */
10
11 /*
12 * This handles all read/write requests to block devices
13 */
14 #include <linux/sched.h>
15 #include <linux/kernel.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/errno.h>
18 #include <linux/string.h>
19 #include <linux/config.h>
20 #include <linux/locks.h>
21 #include <linux/mm.h>
22 #include <linux/swap.h>
23 #include <linux/init.h>
24 #include <linux/smp_lock.h>
25
26 #include <asm/system.h>
27 #include <asm/io.h>
28 #include <linux/blk.h>
29 #include <linux/highmem.h>
30 #include <linux/raid/md.h>
31
32 #include <linux/module.h>
33
34 /*
35 * MAC Floppy IWM hooks
36 */
37
38 #ifdef CONFIG_MAC_FLOPPY_IWM
39 extern int mac_floppy_init(void);
40 #endif
41
42 /*
43 * For the allocated request tables
44 */
45 static kmem_cache_t *request_cachep;
46
47 /*
48 * The "disk" task queue is used to start the actual requests
49 * after a plug
50 */
51 DECLARE_TASK_QUEUE(tq_disk);
52
53 /*
54 * Protect the request list against multiple users..
55 *
56 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
57 * from the IRQ event side, and almost 100% SMP threaded from the syscall
58 * side (we still have to protect against block device array operations, and
59 * the do_request() side is casually still unsafe. The kernel lock protects
60 * this part currently.).
61 *
62 * there is a fair chance that things will work just OK if these functions
63 * are called with no global kernel lock held ...
64 */
65 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
66
67 /* This specifies how many sectors to read ahead on the disk. */
68
69 int read_ahead[MAX_BLKDEV];
70
71 /* blk_dev_struct is:
72 * *request_fn
73 * *current_request
74 */
75 struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
76
77 /*
78 * blk_size contains the size of all block-devices in units of 1024 byte
79 * sectors:
80 *
81 * blk_size[MAJOR][MINOR]
82 *
83 * if (!blk_size[MAJOR]) then no minor size checking is done.
84 */
85 int * blk_size[MAX_BLKDEV];
86
87 /*
88 * blksize_size contains the size of all block-devices:
89 *
90 * blksize_size[MAJOR][MINOR]
91 *
92 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
93 */
94 int * blksize_size[MAX_BLKDEV];
95
96 /*
97 * hardsect_size contains the size of the hardware sector of a device.
98 *
99 * hardsect_size[MAJOR][MINOR]
100 *
101 * if (!hardsect_size[MAJOR])
102 * then 512 bytes is assumed.
103 * else
104 * sector_size is hardsect_size[MAJOR][MINOR]
105 * This is currently set by some scsi devices and read by the msdos fs driver.
106 * Other uses may appear later.
107 */
108 int * hardsect_size[MAX_BLKDEV];
109
110 /*
111 * The following tunes the read-ahead algorithm in mm/filemap.c
112 */
113 int * max_readahead[MAX_BLKDEV];
114
115 /*
116 * Max number of sectors per request
117 */
118 int * max_sectors[MAX_BLKDEV];
119
120 /*
121 * queued sectors for all devices, used to make sure we don't fill all
122 * of memory with locked buffers
123 */
124 atomic_t queued_sectors;
125
126 /*
127 * high and low watermark for above
128 */
129 static int high_queued_sectors, low_queued_sectors;
/*
 * batch_requests: number of released requests a queue collects on its
 * pending free list before they are spliced back onto the free list and
 * waiters are woken (see blkdev_release_request()).
 * queue_nr_requests: number of requests allocated for each queue by
 * blk_init_free_list().
 */
130 static int batch_requests, queue_nr_requests;
/* woken once queued_sectors has dropped below low_queued_sectors */
131 static DECLARE_WAIT_QUEUE_HEAD(blk_buffers_wait);
132
133 static inline int get_max_sectors(kdev_t dev)
134 {
135 if (!max_sectors[MAJOR(dev)])
136 return MAX_SECTORS;
137 return max_sectors[MAJOR(dev)][MINOR(dev)];
138 }
139
140 inline request_queue_t *__blk_get_queue(kdev_t dev)
141 {
142 struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
143
144 if (bdev->queue)
145 return bdev->queue(dev);
146 else
147 return &blk_dev[MAJOR(dev)].request_queue;
148 }
149
150 /*
151 * NOTE: the device-specific queue() functions
152 * have to be atomic!
153 */
154 request_queue_t *blk_get_queue(kdev_t dev)
155 {
156 request_queue_t *ret;
157 unsigned long flags;
158
159 spin_lock_irqsave(&io_request_lock,flags);
160 ret = __blk_get_queue(dev);
161 spin_unlock_irqrestore(&io_request_lock,flags);
162
163 return ret;
164 }
165
166 static int __blk_cleanup_queue(struct list_head *head)
167 {
168 struct request *rq;
169 int i = 0;
170
171 if (list_empty(head))
172 return 0;
173
174 do {
175 rq = list_entry(head->next, struct request, table);
176 list_del(&rq->table);
177 kmem_cache_free(request_cachep, rq);
178 i++;
179 } while (!list_empty(head));
180
181 return i;
182 }
183
184 /**
185 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
186 * @q: the request queue to be released
187 *
188 * Description:
189 * blk_cleanup_queue is the pair to blk_init_queue(). It should
190 * be called when a request queue is being released; typically
191 * when a block device is being de-registered. Currently, its
192 * primary task is to free all the &struct request structures that
193 * were allocated to the queue.
194 * Caveat:
195 * Hopefully the low level driver will have finished any
196 * outstanding requests first...
197 **/
198 void blk_cleanup_queue(request_queue_t * q)
199 {
200 int count = queue_nr_requests;
201
202 count -= __blk_cleanup_queue(&q->request_freelist[READ]);
203 count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);
204 count -= __blk_cleanup_queue(&q->pending_freelist[READ]);
205 count -= __blk_cleanup_queue(&q->pending_freelist[WRITE]);
206
207 if (count)
208 printk("blk_cleanup_queue: leaked requests (%d)\n", count);
209
210 memset(q, 0, sizeof(*q));
211 }
212
213 /**
214 * blk_queue_headactive - indicate whether head of request queue may be active
215 * @q: The queue which this applies to.
216 * @active: A flag indicating whether the head of the queue is active.
217 *
218 * Description:
219 * The driver for a block device may choose to leave the currently active
220 * request on the request queue, removing it only when it has completed.
221 * The queue handling routines assume this by default for safety reasons
222 * and will not involve the head of the request queue in any merging or
223 * reordering of requests when the queue is unplugged (and thus may be
224 * working on this particular request).
225 *
226 * If a driver removes requests from the queue before processing them, then
227 * it may indicate that it does so, thereby allowing the head of the queue
228 * to be involved in merging and reordering. This is done by calling
229 * blk_queue_headactive() with an @active flag of %0.
230 *
231 * If a driver processes several requests at once, it must remove them (or
232 * at least all but one of them) from the request queue.
233 *
234 * When a queue is plugged (see blk_queue_pluggable()) the head will be
235 * assumed to be inactive.
236 **/
237
238 void blk_queue_headactive(request_queue_t * q, int active)
239 {
240 q->head_active = active;
241 }
242
243 /**
244 * blk_queue_pluggable - define a plugging function for a request queue
245 * @q: the request queue to which the function will apply
246 * @plug: the function to be called to plug a queue
247 *
248 * Description:
249 * A request queue will be "plugged" if a request is added to it
250 * while it is empty. This allows a number of requests to be added
251 * before any are processed, thus providing an opportunity for these
252 * requests to be merged or re-ordered.
253 * The default plugging function (generic_plug_device()) sets the
254 * "plugged" flag for the queue and adds a task to the $tq_disk task
255 * queue to unplug the queue and call the request function at a
256 * later time.
257 *
258 * A device driver may provide an alternate plugging function by
259 * passing it to blk_queue_pluggable(). This function should set
260 * the "plugged" flag if it wants calls to the request_function to be
261 * blocked, and should place a task on $tq_disk which will unplug
262 * the queue. Alternately it can simply do nothing and there-by
263 * disable plugging of the device.
264 **/
265
266 void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug)
267 {
268 q->plug_device_fn = plug;
269 }
270
271
272 /**
273 * blk_queue_make_request - define an alternate make_request function for a device
274 * @q: the request queue for the device to be affected
275 * @mfn: the alternate make_request function
276 *
277 * Description:
278 * The normal way for &struct buffer_heads to be passed to a device
279 * driver is for them to be collected into requests on a request
280 * queue, and then to allow the device driver to select requests
281 * off that queue when it is ready. This works well for many block
282 * devices. However some block devices (typically virtual devices
283 * such as md or lvm) do not benefit from the processing on the
284 * request queue, and are served best by having the requests passed
285 * directly to them. This can be achieved by providing a function
286 * to blk_queue_make_request().
287 *
288 * Caveat:
289 * The driver that does this *must* be able to deal appropriately
290 * with buffers in "highmemory", either by calling bh_kmap() to get
291 * a kernel mapping, or by calling create_bounce() to create a
292 * buffer in normal memory.
293 **/
294
295 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
296 {
297 q->make_request_fn = mfn;
298 }
299
300 static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
301 {
302 if (req->nr_segments < max_segments) {
303 req->nr_segments++;
304 return 1;
305 }
306 return 0;
307 }
308
309 static int ll_back_merge_fn(request_queue_t *q, struct request *req,
310 struct buffer_head *bh, int max_segments)
311 {
312 if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data)
313 return 1;
314 return ll_new_segment(q, req, max_segments);
315 }
316
317 static int ll_front_merge_fn(request_queue_t *q, struct request *req,
318 struct buffer_head *bh, int max_segments)
319 {
320 if (bh->b_data + bh->b_size == req->bh->b_data)
321 return 1;
322 return ll_new_segment(q, req, max_segments);
323 }
324
325 static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
326 struct request *next, int max_segments)
327 {
328 int total_segments = req->nr_segments + next->nr_segments;
329
330 if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data)
331 total_segments--;
332
333 if (total_segments > max_segments)
334 return 0;
335
336 req->nr_segments = total_segments;
337 return 1;
338 }
339
340 /*
341 * "plug" the device if there are no outstanding requests: this will
342 * force the transfer to start only after we have put all the requests
343 * on the list.
344 *
345 * This is called with interrupts off and no requests on the queue.
346 * (and with the request spinlock acquired)
347 */
348 static void generic_plug_device(request_queue_t *q, kdev_t dev)
349 {
350 /*
351 * no need to replug device
352 */
353 if (!list_empty(&q->queue_head) || q->plugged)
354 return;
355
356 q->plugged = 1;
357 queue_task(&q->plug_tq, &tq_disk);
358 }
359
360 /*
361 * remove the plug and let it rip..
362 */
363 static inline void __generic_unplug_device(request_queue_t *q)
364 {
365 if (q->plugged) {
366 q->plugged = 0;
367 if (!list_empty(&q->queue_head))
368 q->request_fn(q);
369 }
370 }
371
372 void generic_unplug_device(void *data)
373 {
374 request_queue_t *q = (request_queue_t *) data;
375 unsigned long flags;
376
377 spin_lock_irqsave(&io_request_lock, flags);
378 __generic_unplug_device(q);
379 spin_unlock_irqrestore(&io_request_lock, flags);
380 }
381
382 static void blk_init_free_list(request_queue_t *q)
383 {
384 struct request *rq;
385 int i;
386
387 INIT_LIST_HEAD(&q->request_freelist[READ]);
388 INIT_LIST_HEAD(&q->request_freelist[WRITE]);
389 INIT_LIST_HEAD(&q->pending_freelist[READ]);
390 INIT_LIST_HEAD(&q->pending_freelist[WRITE]);
391 q->pending_free[READ] = q->pending_free[WRITE] = 0;
392
393 /*
394 * Divide requests in half between read and write
395 */
396 for (i = 0; i < queue_nr_requests; i++) {
397 rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
398 memset(rq, 0, sizeof(struct request));
399 rq->rq_status = RQ_INACTIVE;
400 list_add(&rq->table, &q->request_freelist[i & 1]);
401 }
402
403 init_waitqueue_head(&q->wait_for_request);
404 spin_lock_init(&q->queue_lock);
405 }
406
407 static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
408
409 /**
410 * blk_init_queue - prepare a request queue for use with a block device
411 * @q: The &request_queue_t to be initialised
412 * @rfn: The function to be called to process requests that have been
413 * placed on the queue.
414 *
415 * Description:
416 * If a block device wishes to use the standard request handling procedures,
417 * which sorts requests and coalesces adjacent requests, then it must
418 * call blk_init_queue(). The function @rfn will be called when there
419 * are requests on the queue that need to be processed. If the device
420 * supports plugging, then @rfn may not be called immediately when requests
421 * are available on the queue, but may be called at some time later instead.
422 * Plugged queues are generally unplugged when a buffer belonging to one
423 * of the requests on the queue is needed, or due to memory pressure.
424 *
425 * @rfn is not required, or even expected, to remove all requests off the
426 * queue, but only as many as it can handle at a time. If it does leave
427 * requests on the queue, it is responsible for arranging that the requests
428 * get dealt with eventually.
429 *
430 * A global spin lock $io_request_lock must be held while manipulating the
431 * requests on the request queue.
432 *
433 * The request on the head of the queue is by default assumed to be
434 * potentially active, and it is not considered for re-ordering or merging
435 * whenever the given queue is unplugged. This behaviour can be changed with
436 * blk_queue_headactive().
437 *
438 * Note:
439 * blk_init_queue() must be paired with a blk_cleanup_queue() call
440 * when the block device is deactivated (such as at module unload).
441 **/
442 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
443 {
444 INIT_LIST_HEAD(&q->queue_head);
445 elevator_init(&q->elevator, ELEVATOR_LINUS);
446 blk_init_free_list(q);
447 q->request_fn = rfn;
448 q->back_merge_fn = ll_back_merge_fn;
449 q->front_merge_fn = ll_front_merge_fn;
450 q->merge_requests_fn = ll_merge_requests_fn;
451 q->make_request_fn = __make_request;
452 q->plug_tq.sync = 0;
453 q->plug_tq.routine = &generic_unplug_device;
454 q->plug_tq.data = q;
455 q->plugged = 0;
456 /*
457 * These booleans describe the queue properties. We set the
458 * default (and most common) values here. Other drivers can
459 * use the appropriate functions to alter the queue properties.
460 * as appropriate.
461 */
462 q->plug_device_fn = generic_plug_device;
463 q->head_active = 1;
464 }
465
/*
 * First request linked on a free list.  No trailing ';' in the expansion,
 * so the macro can be used anywhere an expression is allowed.
 */
#define blkdev_free_rq(list) list_entry((list)->next, struct request, table)
467 /*
468 * Get a free request. io_request_lock must be held and interrupts
469 * disabled on the way in.
470 */
471 static inline struct request *get_request(request_queue_t *q, int rw)
472 {
473 struct request *rq = NULL;
474
475 if (!list_empty(&q->request_freelist[rw])) {
476 rq = blkdev_free_rq(&q->request_freelist[rw]);
477 list_del(&rq->table);
478 rq->rq_status = RQ_ACTIVE;
479 rq->special = NULL;
480 rq->q = q;
481 }
482
483 return rq;
484 }
485
486 /*
487 * No available requests for this queue, unplug the device.
488 */
489 static struct request *__get_request_wait(request_queue_t *q, int rw)
490 {
491 register struct request *rq;
492 DECLARE_WAITQUEUE(wait, current);
493
494 add_wait_queue_exclusive(&q->wait_for_request, &wait);
495 for (;;) {
496 __set_current_state(TASK_UNINTERRUPTIBLE);
497 spin_lock_irq(&io_request_lock);
498 rq = get_request(q, rw);
499 spin_unlock_irq(&io_request_lock);
500 if (rq)
501 break;
502 generic_unplug_device(q);
503 schedule();
504 }
505 remove_wait_queue(&q->wait_for_request, &wait);
506 current->state = TASK_RUNNING;
507 return rq;
508 }
509
510 static inline struct request *get_request_wait(request_queue_t *q, int rw)
511 {
512 register struct request *rq;
513
514 spin_lock_irq(&io_request_lock);
515 rq = get_request(q, rw);
516 spin_unlock_irq(&io_request_lock);
517 if (rq)
518 return rq;
519 return __get_request_wait(q, rw);
520 }
521
522 /* RO fail safe mechanism */
523
524 static long ro_bits[MAX_BLKDEV][8];
525
526 int is_read_only(kdev_t dev)
527 {
528 int minor,major;
529
530 major = MAJOR(dev);
531 minor = MINOR(dev);
532 if (major < 0 || major >= MAX_BLKDEV) return 0;
533 return ro_bits[major][minor >> 5] & (1 << (minor & 31));
534 }
535
536 void set_device_ro(kdev_t dev,int flag)
537 {
538 int minor,major;
539
540 major = MAJOR(dev);
541 minor = MINOR(dev);
542 if (major < 0 || major >= MAX_BLKDEV) return;
543 if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
544 else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
545 }
546
547 inline void drive_stat_acct (kdev_t dev, int rw,
548 unsigned long nr_sectors, int new_io)
549 {
550 unsigned int major = MAJOR(dev);
551 unsigned int index;
552
553 index = disk_index(dev);
554 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
555 return;
556
557 kstat.dk_drive[major][index] += new_io;
558 if (rw == READ) {
559 kstat.dk_drive_rio[major][index] += new_io;
560 kstat.dk_drive_rblk[major][index] += nr_sectors;
561 } else if (rw == WRITE) {
562 kstat.dk_drive_wio[major][index] += new_io;
563 kstat.dk_drive_wblk[major][index] += nr_sectors;
564 } else
565 printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
566 }
567
568 /*
569 * add-request adds a request to the linked list.
570 * io_request_lock is held and interrupts disabled, as we muck with the
571 * request queue list.
572 *
573 * By this point, req->cmd is always either READ/WRITE, never READA,
574 * which is important for drive_stat_acct() above.
575 */
576 static inline void add_request(request_queue_t * q, struct request * req,
577 struct list_head *insert_here)
578 {
579 int major;
580
581 drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
582
583 if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
584 spin_unlock_irq(&io_request_lock);
585 BUG();
586 }
587
588 /*
589 * elevator indicated where it wants this request to be
590 * inserted at elevator_merge time
591 */
592 list_add(&req->queue, insert_here);
593
594 major = MAJOR(req->rq_dev);
595 if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
596 q->request_fn(q);
597 }
598
599 void inline blk_refill_freelist(request_queue_t *q, int rw)
600 {
601 if (q->pending_free[rw]) {
602 list_splice(&q->pending_freelist[rw], &q->request_freelist[rw]);
603 INIT_LIST_HEAD(&q->pending_freelist[rw]);
604 q->pending_free[rw] = 0;
605 }
606 }
607
608 /*
609 * Must be called with io_request_lock held and interrupts disabled
610 */
611 void inline blkdev_release_request(struct request *req)
612 {
613 request_queue_t *q = req->q;
614 int rw = req->cmd;
615
616 req->rq_status = RQ_INACTIVE;
617 req->q = NULL;
618
619 /*
620 * Request may not have originated from ll_rw_blk. if not,
621 * asumme it has free buffers and check waiters
622 */
623 if (q) {
624 /*
625 * we've released enough buffers to start I/O again
626 */
627 if (waitqueue_active(&blk_buffers_wait)
628 && atomic_read(&queued_sectors) < low_queued_sectors)
629 wake_up(&blk_buffers_wait);
630
631 /*
632 * Add to pending free list and batch wakeups
633 */
634 list_add(&req->table, &q->pending_freelist[rw]);
635
636 if (++q->pending_free[rw] >= batch_requests) {
637 int wake_up = q->pending_free[rw];
638 blk_refill_freelist(q, rw);
639 wake_up_nr(&q->wait_for_request, wake_up);
640 }
641 }
642 }
643
644 /*
645 * Has to be called with the request spinlock acquired
646 */
647 static void attempt_merge(request_queue_t * q,
648 struct request *req,
649 int max_sectors,
650 int max_segments)
651 {
652 struct request *next;
653
654 next = blkdev_next_request(req);
655 if (req->sector + req->nr_sectors != next->sector)
656 return;
657 if (req->cmd != next->cmd
658 || req->rq_dev != next->rq_dev
659 || req->nr_sectors + next->nr_sectors > max_sectors
660 || next->sem)
661 return;
662 /*
663 * If we are not allowed to merge these requests, then
664 * return. If we are allowed to merge, then the count
665 * will have been updated to the appropriate number,
666 * and we shouldn't do it here too.
667 */
668 if (!q->merge_requests_fn(q, req, next, max_segments))
669 return;
670
671 q->elevator.elevator_merge_req_fn(req, next);
672 req->bhtail->b_reqnext = next->bh;
673 req->bhtail = next->bhtail;
674 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
675 list_del(&next->queue);
676 blkdev_release_request(next);
677 }
678
679 static inline void attempt_back_merge(request_queue_t * q,
680 struct request *req,
681 int max_sectors,
682 int max_segments)
683 {
684 if (&req->queue == q->queue_head.prev)
685 return;
686 attempt_merge(q, req, max_sectors, max_segments);
687 }
688
689 static inline void attempt_front_merge(request_queue_t * q,
690 struct list_head * head,
691 struct request *req,
692 int max_sectors,
693 int max_segments)
694 {
695 struct list_head * prev;
696
697 prev = req->queue.prev;
698 if (head == prev)
699 return;
700 attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
701 }
702
/*
 * Default make_request function, installed by blk_init_queue().
 *
 * Queues buffer @bh for I/O in direction @rw (READ, WRITE or READA) on
 * @q: the elevator is asked whether @bh can be merged at the back or
 * front of an existing request; if not, a free request is obtained,
 * filled in from @bh and added to the queue.  Always returns 0.
 *
 * io_request_lock is taken and released inside this function.
 */
703 static int __make_request(request_queue_t * q, int rw,
704 struct buffer_head * bh)
705 {
706 unsigned int sector, count;
707 int max_segments = MAX_SEGMENTS;
708 struct request * req = NULL, *freereq = NULL;
709 int rw_ahead, max_sectors, el_ret;
710 struct list_head *head, *insert_here;
711 int latency;
712 elevator_t *elevator = &q->elevator;
713
/* size of the buffer in 512-byte sectors, and its starting sector */
714 count = bh->b_size >> 9;
715 sector = bh->b_rsector;
716
717 rw_ahead = 0; /* normal case; gets changed below for READA */
718 switch (rw) {
719 case READA:
720 rw_ahead = 1;
721 rw = READ; /* drop into READ */
/* intentional fall through */
722 case READ:
723 case WRITE:
724 latency = elevator_request_latency(elevator, rw);
725 break;
726 default:
727 BUG();
728 goto end_io;
729 }
730
731 /* We'd better have a real physical mapping!
732 Check this bit only if the buffer was dirty and just locked
733 down by us so at this point flushpage will block and
734 won't clear the mapped bit under us. */
735 if (!buffer_mapped(bh))
736 BUG();
737
738 /*
739 * Temporary solution - in 2.5 this will be done by the lowlevel
740 * driver. Create a bounce buffer if the buffer data points into
741 * high memory - keep the original buffer otherwise.
742 */
743 #if CONFIG_HIGHMEM
744 bh = create_bounce(rw, bh);
745 #endif
746
747 /* look for a free request. */
748 /*
749 * Try to coalesce the new request with old requests
750 */
751 max_sectors = get_max_sectors(bh->b_rdev);
752
/*
 * Restart point: we come back here after sleeping for a free request
 * (carried in freereq), and redo the whole merge scan under the lock.
 */
753 again:
754 head = &q->queue_head;
755 /*
756 * Now we acquire the request spinlock, we have to be mega careful
757 * not to schedule or do something nonatomic
758 */
759 spin_lock_irq(&io_request_lock);
760
/* default insertion point: the tail of the queue */
761 insert_here = head->prev;
762 if (list_empty(head)) {
763 q->plug_device_fn(q, bh->b_rdev); /* is atomic */
764 goto get_rq;
765 } else if (q->head_active && !q->plugged)
/* head may be in use by the driver: start the merge scan after it */
766 head = head->next;
767
768 el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,
769 max_sectors, max_segments);
770 switch (el_ret) {
771
772 case ELEVATOR_BACK_MERGE:
/* a refused merge breaks out of the switch and falls into get_rq */
773 if (!q->back_merge_fn(q, req, bh, max_segments))
774 break;
775 elevator->elevator_merge_cleanup_fn(q, req, count);
776 req->bhtail->b_reqnext = bh;
777 req->bhtail = bh;
778 req->nr_sectors = req->hard_nr_sectors += count;
779 blk_started_io(count);
780 drive_stat_acct(req->rq_dev, req->cmd, count, 0);
781 attempt_back_merge(q, req, max_sectors, max_segments);
782 goto out;
783
784 case ELEVATOR_FRONT_MERGE:
785 if (!q->front_merge_fn(q, req, bh, max_segments))
786 break;
787 elevator->elevator_merge_cleanup_fn(q, req, count);
/* bh becomes the new first buffer, so the request now starts at its sector */
788 bh->b_reqnext = req->bh;
789 req->bh = bh;
790 req->buffer = bh->b_data;
791 req->current_nr_sectors = count;
792 req->sector = req->hard_sector = sector;
793 req->nr_sectors = req->hard_nr_sectors += count;
794 blk_started_io(count);
795 drive_stat_acct(req->rq_dev, req->cmd, count, 0);
796 attempt_front_merge(q, head, req, max_sectors, max_segments);
797 goto out;
798
799 /*
800 * elevator says don't/can't merge. get new request
801 */
802 case ELEVATOR_NO_MERGE:
803 /*
804 * use elevator hints as to where to insert the
805 * request. if no hints, just add it to the back
806 * of the queue
807 */
808 if (req)
809 insert_here = &req->queue;
810 break;
811
812 default:
813 printk("elevator returned crap (%d)\n", el_ret);
814 BUG();
815 }
816
817 /*
818 * Grab a free request from the freelist - if that is empty, check
819 * if we are doing read ahead and abort instead of blocking for
820 * a free slot.
821 */
822 get_rq:
823 if (freereq) {
/* use the request obtained while sleeping on the previous pass */
824 req = freereq;
825 freereq = NULL;
826 } else if ((req = get_request(q, rw)) == NULL) {
/* the lock must be dropped before we may sleep for a request */
827 spin_unlock_irq(&io_request_lock);
828 if (rw_ahead)
829 goto end_io;
830
831 freereq = __get_request_wait(q, rw);
832 goto again;
833 }
834
835 /* fill up the request-info, and add it to the queue */
836 req->elevator_sequence = latency;
837 req->cmd = rw;
838 req->errors = 0;
839 req->hard_sector = req->sector = sector;
840 req->hard_nr_sectors = req->nr_sectors = count;
841 req->current_nr_sectors = count;
842 req->nr_segments = 1; /* Always 1 for a new request. */
843 req->nr_hw_segments = 1; /* Always 1 for a new request. */
844 req->buffer = bh->b_data;
845 req->sem = NULL;
846 req->bh = bh;
847 req->bhtail = bh;
848 req->rq_dev = bh->b_rdev;
849 blk_started_io(count);
850 add_request(q, req, insert_here);
/*
 * Common exit with io_request_lock still held: give back a request that
 * was obtained while sleeping but not needed because a merge succeeded,
 * and run the request function unless the queue is plugged.
 */
851 out:
852 if (freereq)
853 blkdev_release_request(freereq);
854 if (!q->plugged)
855 q->request_fn(q);
856 spin_unlock_irq(&io_request_lock);
857 return 0;
/*
 * Reached without io_request_lock held (unknown rw value, or a read-ahead
 * that found no free request): complete bh without queueing it, passing
 * its current BH_Uptodate state to the completion handler.
 */
858 end_io:
859 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
860 return 0;
861 }
862
863 /**
864 * generic_make_request: hand a buffer head to its device driver for I/O
865 * @rw: READ, WRITE, or READA - what sort of I/O is desired.
866 * @bh: The buffer head describing the location in memory and on the device.
867 *
868 * generic_make_request() is used to make I/O requests of block
869 * devices. It is passed a &struct buffer_head and a &rw value. The
870 * %READ and %WRITE options are (hopefully) obvious in meaning. The
871 * %READA value means that a read is required, but that the driver is
872 * free to fail the request if, for example, it cannot get needed
873 * resources immediately.
874 *
875 * generic_make_request() does not return any status. The
876 * success/failure status of the request, along with notification of
877 * completion, is delivered asynchronously through the bh->b_end_io
878 * function described (one day) else where.
879 *
880 * The caller of generic_make_request must make sure that b_page,
881 * b_addr, b_size are set to describe the memory buffer, that b_rdev
882 * and b_rsector are set to describe the device address, and the
883 * b_end_io and optionally b_private are set to describe how
884 * completion notification should be signaled. BH_Mapped should also
885 * be set (to confirm that b_dev and b_blocknr are valid).
886 *
887 * generic_make_request and the drivers it calls may use b_reqnext,
888 * and may change b_rdev and b_rsector. So the values of these fields
889 * should NOT be depended on after the call to generic_make_request.
890 * Because of this, the caller should record the device address
891 * information in b_dev and b_blocknr.
892 *
893 * Apart from those fields mentioned above, no other fields, and in
894 * particular, no other flags, are changed by generic_make_request or
895 * any lower level drivers.
896 * */
897 void generic_make_request (int rw, struct buffer_head * bh)
898 {
899 int major = MAJOR(bh->b_rdev);
900 request_queue_t *q;
901
902 if (!bh->b_end_io)
903 BUG();
904
905 if (blk_size[major]) {
906 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
907 unsigned long sector = bh->b_rsector;
908 unsigned int count = bh->b_size >> 9;
909
910 if (maxsector < count || maxsector - count < sector) {
911 bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
912 if (blk_size[major][MINOR(bh->b_rdev)]) {
913
914 /* This may well happen - the kernel calls bread()
915 without checking the size of the device, e.g.,
916 when mounting a device. */
917 printk(KERN_INFO
918 "attempt to access beyond end of device\n");
919 printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
920 kdevname(bh->b_rdev), rw,
921 (sector + count)>>1,
922 blk_size[major][MINOR(bh->b_rdev)]);
923 }
924 bh->b_end_io(bh, 0);
925 return;
926 }
927 }
928
929 /*
930 * Resolve the mapping until finished. (drivers are
931 * still free to implement/resolve their own stacking
932 * by explicitly returning 0)
933 */
934 /* NOTE: we don't repeat the blk_size check for each new device.
935 * Stacking drivers are expected to know what they are doing.
936 */
937 do {
938 q = blk_get_queue(bh->b_rdev);
939 if (!q) {
940 printk(KERN_ERR
941 "generic_make_request: Trying to access nonexistent block-device %s (%ld)\n",
942 kdevname(bh->b_rdev), bh->b_rsector);
943 buffer_IO_error(bh);
944 break;
945 }
946 } while (q->make_request_fn(q, rw, bh));
947 }
948
949
950 /**
951 * submit_bh: submit a buffer_head to the block device later for I/O
952 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
953 * @bh: The &struct buffer_head which describes the I/O
954 *
955 * submit_bh() is very similar in purpose to generic_make_request(), and
956 * uses that function to do most of the work.
957 *
958 * The extra functionality provided by submit_bh is to determine
959 * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
960 * This is appropriate for IO requests that come from the buffer
961 * cache and page cache which (currently) always use aligned blocks.
962 */
963 void submit_bh(int rw, struct buffer_head * bh)
964 {
965 if (!test_bit(BH_Lock, &bh->b_state))
966 BUG();
967
968 set_bit(BH_Req, &bh->b_state);
969
970 /*
971 * First step, 'identity mapping' - RAID or LVM might
972 * further remap this.
973 */
974 bh->b_rdev = bh->b_dev;
975 bh->b_rsector = bh->b_blocknr * (bh->b_size >> 9);
976
977 generic_make_request(rw, bh);
978
979 switch (rw) {
980 case WRITE:
981 kstat.pgpgout++;
982 break;
983 default:
984 kstat.pgpgin++;
985 break;
986 }
987 }
988
989 /*
990 * Default IO end handler, used by "ll_rw_block()".
991 */
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	/* record the outcome first, then release the buffer lock */
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}
997
998 /**
999 * ll_rw_block: low-level access to block devices
1000 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
1001 * @nr: number of &struct buffer_heads in the array
1002 * @bhs: array of pointers to &struct buffer_head
1003 *
1004 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
1005 * and requests an I/O operation on them, either a %READ or a %WRITE.
1006 * The third %READA option is described in the documentation for
1007 * generic_make_request() which ll_rw_block() calls.
1008 *
1009 * This function provides extra functionality that is not in
1010 * generic_make_request() that is relevant to buffers in the buffer
1011 * cache or page cache. In particular it drops any buffer that it
1012 * cannot get a lock on (with the BH_Lock state bit), any buffer that
1013 * appears to be clean when doing a write request, and any buffer that
1014 * appears to be up-to-date when doing read request. Further it marks
1015 * as clean buffers that are processed for writing (the buffer cache
1016 * wont assume that they are actually clean until the buffer gets
1017 * unlocked).
1018 *
1019 * ll_rw_block sets b_end_io to simple completion handler that marks
1020 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
1021 * any waiters. As client that needs a more interesting completion
1022 * routine should call submit_bh() (or generic_make_request())
1023 * directly.
1024 *
1025 * Caveat:
1026 * All of the buffers must be for the same device, and must also be
1027 * of the current approved size for the device. */
1028
1029 void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
1030 {
1031 unsigned int major;
1032 int correct_size;
1033 int i;
1034
1035 if (!nr)
1036 return;
1037
1038 major = MAJOR(bhs[0]->b_dev);
1039
1040 /* Determine correct block size for this device. */
1041 correct_size = BLOCK_SIZE;
1042 if (blksize_size[major]) {
1043 i = blksize_size[major][MINOR(bhs[0]->b_dev)];
1044 if (i)
1045 correct_size = i;
1046 }
1047
1048 /* Verify requested block sizes. */
1049 for (i = 0; i < nr; i++) {
1050 struct buffer_head *bh = bhs[i];
1051 if (bh->b_size % correct_size) {
1052 printk(KERN_NOTICE "ll_rw_block: device %s: "
1053 "only %d-char blocks implemented (%u)\n",
1054 kdevname(bhs[0]->b_dev),
1055 correct_size, bh->b_size);
1056 goto sorry;
1057 }
1058 }
1059
1060 if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
1061 printk(KERN_NOTICE "Can't write to read-only device %s\n",
1062 kdevname(bhs[0]->b_dev));
1063 goto sorry;
1064 }
1065
1066 for (i = 0; i < nr; i++) {
1067 struct buffer_head *bh = bhs[i];
1068
1069 /*
1070 * don't lock any more buffers if we are above the high
1071 * water mark. instead start I/O on the queued stuff.
1072 */
1073 if (atomic_read(&queued_sectors) >= high_queued_sectors) {
1074 run_task_queue(&tq_disk);
1075 wait_event(blk_buffers_wait,
1076 atomic_read(&queued_sectors) < low_queued_sectors);
1077 }
1078
1079 /* Only one thread can actually submit the I/O. */
1080 if (test_and_set_bit(BH_Lock, &bh->b_state))
1081 continue;
1082
1083 /* We have the buffer lock */
1084 bh->b_end_io = end_buffer_io_sync;
1085
1086 switch(rw) {
1087 case WRITE:
1088 if (!atomic_set_buffer_clean(bh))
1089 /* Hmmph! Nothing to write */
1090 goto end_io;
1091 __mark_buffer_clean(bh);
1092 break;
1093
1094 case READA:
1095 case READ:
1096 if (buffer_uptodate(bh))
1097 /* Hmmph! Already have it */
1098 goto end_io;
1099 break;
1100 default:
1101 BUG();
1102 end_io:
1103 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1104 continue;
1105 }
1106
1107 submit_bh(rw, bh);
1108 }
1109 return;
1110
1111 sorry:
1112 /* Make sure we don't get infinite dirty retries.. */
1113 for (i = 0; i < nr; i++)
1114 mark_buffer_clean(bhs[i]);
1115 }
1116
1117 #ifdef CONFIG_STRAM_SWAP
1118 extern int stram_device_init (void);
1119 #endif
1120
1121
1122 /**
1123 * end_that_request_first - end I/O on one buffer.
1124 * @req: the request being processed
1125 * @uptodate: 0 for I/O error
1126 * @name: the name printed for an I/O error
1127 *
1128 * Description:
1129 * Ends I/O on the first buffer attached to @req, and sets it up
1130 * for the next buffer_head (if any) in the cluster.
1131 *
1132 * Return:
1133 * 0 - we are done with this request, call end_that_request_last()
1134 * 1 - still buffers pending for this request
1135 *
1136 * Caveat:
1137 * Drivers implementing their own end_request handling must call
1138 * blk_finished_io() appropriately.
1139 **/
1140
1141 int end_that_request_first (struct request *req, int uptodate, char *name)
1142 {
1143 struct buffer_head * bh;
1144 int nsect;
1145
1146 req->errors = 0;
1147 if (!uptodate)
1148 printk("end_request: I/O error, dev %s (%s), sector %lu\n",
1149 kdevname(req->rq_dev), name, req->sector);
1150
1151 if ((bh = req->bh) != NULL) {
1152 nsect = bh->b_size >> 9;
1153 blk_finished_io(nsect);
1154 req->bh = bh->b_reqnext;
1155 bh->b_reqnext = NULL;
1156 bh->b_end_io(bh, uptodate);
1157 if ((bh = req->bh) != NULL) {
1158 req->hard_sector += nsect;
1159 req->hard_nr_sectors -= nsect;
1160 req->sector = req->hard_sector;
1161 req->nr_sectors = req->hard_nr_sectors;
1162
1163 req->current_nr_sectors = bh->b_size >> 9;
1164 if (req->nr_sectors < req->current_nr_sectors) {
1165 req->nr_sectors = req->current_nr_sectors;
1166 printk("end_request: buffer-list destroyed\n");
1167 }
1168 req->buffer = bh->b_data;
1169 return 1;
1170 }
1171 }
1172 return 0;
1173 }
1174
1175 void end_that_request_last(struct request *req)
1176 {
1177 if (req->sem != NULL)
1178 up(req->sem);
1179
1180 blkdev_release_request(req);
1181 }
1182
1183 #define MB(kb) ((kb) << 10)
1184
1185 int __init blk_dev_init(void)
1186 {
1187 struct blk_dev_struct *dev;
1188 int total_ram;
1189
1190 request_cachep = kmem_cache_create("blkdev_requests",
1191 sizeof(struct request),
1192 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1193
1194 if (!request_cachep)
1195 panic("Can't create request pool slab cache\n");
1196
1197 for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
1198 dev->queue = NULL;
1199
1200 memset(ro_bits,0,sizeof(ro_bits));
1201 memset(max_readahead, 0, sizeof(max_readahead));
1202 memset(max_sectors, 0, sizeof(max_sectors));
1203
1204 atomic_set(&queued_sectors, 0);
1205 total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
1206
1207 /*
1208 * Try to keep 128MB max hysteris. If not possible,
1209 * use half of RAM
1210 */
1211 high_queued_sectors = (total_ram * 2) / 3;
1212 low_queued_sectors = high_queued_sectors / 3;
1213 if (high_queued_sectors - low_queued_sectors > MB(128))
1214 low_queued_sectors = high_queued_sectors - MB(128);
1215
1216
1217 /*
1218 * make it sectors (512b)
1219 */
1220 high_queued_sectors <<= 1;
1221 low_queued_sectors <<= 1;
1222
1223 /*
1224 * Scale free request slots per queue too
1225 */
1226 total_ram = (total_ram + MB(32) - 1) & ~(MB(32) - 1);
1227 if ((queue_nr_requests = total_ram >> 9) > QUEUE_NR_REQUESTS)
1228 queue_nr_requests = QUEUE_NR_REQUESTS;
1229
1230 /*
1231 * adjust batch frees according to queue length, with upper limit
1232 */
1233 if ((batch_requests = queue_nr_requests >> 3) > 32)
1234 batch_requests = 32;
1235
1236 printk("block: queued sectors max/low %dkB/%dkB, %d slots per queue\n",
1237 high_queued_sectors / 2,
1238 low_queued_sectors / 2,
1239 queue_nr_requests);
1240
1241 #ifdef CONFIG_AMIGA_Z2RAM
1242 z2_init();
1243 #endif
1244 #ifdef CONFIG_STRAM_SWAP
1245 stram_device_init();
1246 #endif
1247 #ifdef CONFIG_BLK_DEV_RAM
1248 rd_init();
1249 #endif
1250 #ifdef CONFIG_BLK_DEV_LOOP
1251 loop_init();
1252 #endif
1253 #ifdef CONFIG_ISP16_CDI
1254 isp16_init();
1255 #endif
1256 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
1257 ide_init(); /* this MUST precede hd_init */
1258 #endif
1259 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
1260 hd_init();
1261 #endif
1262 #ifdef CONFIG_BLK_DEV_PS2
1263 ps2esdi_init();
1264 #endif
1265 #ifdef CONFIG_BLK_DEV_XD
1266 xd_init();
1267 #endif
1268 #ifdef CONFIG_BLK_DEV_MFM
1269 mfm_init();
1270 #endif
1271 #ifdef CONFIG_PARIDE
1272 { extern void paride_init(void); paride_init(); };
1273 #endif
1274 #ifdef CONFIG_MAC_FLOPPY
1275 swim3_init();
1276 #endif
1277 #ifdef CONFIG_BLK_DEV_SWIM_IOP
1278 swimiop_init();
1279 #endif
1280 #ifdef CONFIG_AMIGA_FLOPPY
1281 amiga_floppy_init();
1282 #endif
1283 #ifdef CONFIG_ATARI_FLOPPY
1284 atari_floppy_init();
1285 #endif
1286 #ifdef CONFIG_BLK_DEV_FD
1287 floppy_init();
1288 #else
1289 #if defined(__i386__) /* Do we even need this? */
1290 outb_p(0xc, 0x3f2);
1291 #endif
1292 #endif
1293 #ifdef CONFIG_CDU31A
1294 cdu31a_init();
1295 #endif
1296 #ifdef CONFIG_ATARI_ACSI
1297 acsi_init();
1298 #endif
1299 #ifdef CONFIG_MCD
1300 mcd_init();
1301 #endif
1302 #ifdef CONFIG_MCDX
1303 mcdx_init();
1304 #endif
1305 #ifdef CONFIG_SBPCD
1306 sbpcd_init();
1307 #endif
1308 #ifdef CONFIG_AZTCD
1309 aztcd_init();
1310 #endif
1311 #ifdef CONFIG_CDU535
1312 sony535_init();
1313 #endif
1314 #ifdef CONFIG_GSCD
1315 gscd_init();
1316 #endif
1317 #ifdef CONFIG_CM206
1318 cm206_init();
1319 #endif
1320 #ifdef CONFIG_OPTCD
1321 optcd_init();
1322 #endif
1323 #ifdef CONFIG_SJCD
1324 sjcd_init();
1325 #endif
1326 #ifdef CONFIG_APBLOCK
1327 ap_init();
1328 #endif
1329 #ifdef CONFIG_DDV
1330 ddv_init();
1331 #endif
1332 #ifdef CONFIG_BLK_DEV_NBD
1333 nbd_init();
1334 #endif
1335 #ifdef CONFIG_MDISK
1336 mdisk_init();
1337 #endif
1338 #ifdef CONFIG_DASD
1339 dasd_init();
1340 #endif
1341 #ifdef CONFIG_SUN_JSFLASH
1342 jsfd_init();
1343 #endif
1344 return 0;
1345 };
1346
1347 EXPORT_SYMBOL(io_request_lock);
1348 EXPORT_SYMBOL(end_that_request_first);
1349 EXPORT_SYMBOL(end_that_request_last);
1350 EXPORT_SYMBOL(blk_init_queue);
1351 EXPORT_SYMBOL(blk_get_queue);
1352 EXPORT_SYMBOL(__blk_get_queue);
1353 EXPORT_SYMBOL(blk_cleanup_queue);
1354 EXPORT_SYMBOL(blk_queue_headactive);
1355 EXPORT_SYMBOL(blk_queue_pluggable);
1356 EXPORT_SYMBOL(blk_queue_make_request);
1357 EXPORT_SYMBOL(generic_make_request);
1358 EXPORT_SYMBOL(blkdev_release_request);
1359 EXPORT_SYMBOL(generic_unplug_device);
1360 EXPORT_SYMBOL(queued_sectors);
1361
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.