~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/drivers/block/ll_rw_blk.c

Version: ~ [ 2.2.5 ] ~ [ 2.4.1 ] ~ [ 2.4.9 ] ~ [ 2.6.17.10 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *  linux/drivers/block/ll_rw_blk.c
  3  *
  4  * Copyright (C) 1991, 1992 Linus Torvalds
  5  * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
  6  * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
  7  * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
  8  * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> -  July2000
  9  */
 10 
 11 /*
 12  * This handles all read/write requests to block devices
 13  */
 14 #include <linux/sched.h>
 15 #include <linux/kernel.h>
 16 #include <linux/kernel_stat.h>
 17 #include <linux/errno.h>
 18 #include <linux/string.h>
 19 #include <linux/config.h>
 20 #include <linux/locks.h>
 21 #include <linux/mm.h>
 22 #include <linux/swap.h>
 23 #include <linux/init.h>
 24 #include <linux/smp_lock.h>
 25 
 26 #include <asm/system.h>
 27 #include <asm/io.h>
 28 #include <linux/blk.h>
 29 #include <linux/highmem.h>
 30 #include <linux/raid/md.h>
 31 
 32 #include <linux/module.h>
 33 
 34 /*
 35  * MAC Floppy IWM hooks
 36  */
 37 
 38 #ifdef CONFIG_MAC_FLOPPY_IWM
 39 extern int mac_floppy_init(void);
 40 #endif
 41 
 42 /*
 43  * For the allocated request tables
 44  */
 45 static kmem_cache_t *request_cachep;
 46 
 47 /*
 48  * The "disk" task queue is used to start the actual requests
 49  * after a plug
 50  */
 51 DECLARE_TASK_QUEUE(tq_disk);
 52 
 53 /*
 54  * Protect the request list against multiple users..
 55  *
 56  * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 57  * from the IRQ event side, and almost 100% SMP threaded from the syscall
 58  * side (we still have to protect against block device array operations, and
 59  * the do_request() side is casually still unsafe. The kernel lock protects
 60  * this part currently.).
 61  *
 62  * there is a fair chance that things will work just OK if these functions
 63  * are called with no global kernel lock held ...
 64  */
 65 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
 66 
 67 /* This specifies how many sectors to read ahead on the disk. */
 68 
 69 int read_ahead[MAX_BLKDEV];
 70 
 71 /* blk_dev_struct is:
 72  *      *request_fn
 73  *      *current_request
 74  */
 75 struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
 76 
 77 /*
 78  * blk_size contains the size of all block-devices in units of 1024 byte
 79  * sectors:
 80  *
 81  * blk_size[MAJOR][MINOR]
 82  *
 83  * if (!blk_size[MAJOR]) then no minor size checking is done.
 84  */
 85 int * blk_size[MAX_BLKDEV];
 86 
 87 /*
 88  * blksize_size contains the size of all block-devices:
 89  *
 90  * blksize_size[MAJOR][MINOR]
 91  *
 92  * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
 93  */
 94 int * blksize_size[MAX_BLKDEV];
 95 
 96 /*
 97  * hardsect_size contains the size of the hardware sector of a device.
 98  *
 99  * hardsect_size[MAJOR][MINOR]
100  *
101  * if (!hardsect_size[MAJOR])
102  *              then 512 bytes is assumed.
103  * else
104  *              sector_size is hardsect_size[MAJOR][MINOR]
105  * This is currently set by some scsi devices and read by the msdos fs driver.
106  * Other uses may appear later.
107  */
108 int * hardsect_size[MAX_BLKDEV];
109 
110 /*
111  * The following tunes the read-ahead algorithm in mm/filemap.c
112  */
113 int * max_readahead[MAX_BLKDEV];
114 
115 /*
116  * Max number of sectors per request
117  */
118 int * max_sectors[MAX_BLKDEV];
119 
120 /*
121  * queued sectors for all devices, used to make sure we don't fill all
122  * of memory with locked buffers
123  */
124 atomic_t queued_sectors;
125 
126 /*
127  * high and low watermark for above
128  */
129 static int high_queued_sectors, low_queued_sectors;
130 static int batch_requests, queue_nr_requests;
131 static DECLARE_WAIT_QUEUE_HEAD(blk_buffers_wait);
132 
133 static inline int get_max_sectors(kdev_t dev)
134 {
135         if (!max_sectors[MAJOR(dev)])
136                 return MAX_SECTORS;
137         return max_sectors[MAJOR(dev)][MINOR(dev)];
138 }
139 
140 inline request_queue_t *__blk_get_queue(kdev_t dev)
141 {
142         struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
143 
144         if (bdev->queue)
145                 return bdev->queue(dev);
146         else
147                 return &blk_dev[MAJOR(dev)].request_queue;
148 }
149 
150 /*
151  * NOTE: the device-specific queue() functions
152  * have to be atomic!
153  */
154 request_queue_t *blk_get_queue(kdev_t dev)
155 {
156         request_queue_t *ret;
157         unsigned long flags;
158 
159         spin_lock_irqsave(&io_request_lock,flags);
160         ret = __blk_get_queue(dev);
161         spin_unlock_irqrestore(&io_request_lock,flags);
162 
163         return ret;
164 }
165 
166 static int __blk_cleanup_queue(struct list_head *head)
167 {
168         struct request *rq;
169         int i = 0;
170 
171         if (list_empty(head))
172                 return 0;
173 
174         do {
175                 rq = list_entry(head->next, struct request, table);
176                 list_del(&rq->table);
177                 kmem_cache_free(request_cachep, rq);
178                 i++;
179         } while (!list_empty(head));
180 
181         return i;
182 }
183 
184 /**
185  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
186  * @q:    the request queue to be released
187  *
188  * Description:
189  *     blk_cleanup_queue is the pair to blk_init_queue().  It should
190  *     be called when a request queue is being released; typically
191  *     when a block device is being de-registered.  Currently, its
192  *     primary task it to free all the &struct request structures that
193  *     were allocated to the queue.
194  * Caveat: 
195  *     Hopefully the low level driver will have finished any
196  *     outstanding requests first...
197  **/
198 void blk_cleanup_queue(request_queue_t * q)
199 {
200         int count = queue_nr_requests;
201 
202         count -= __blk_cleanup_queue(&q->request_freelist[READ]);
203         count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);
204         count -= __blk_cleanup_queue(&q->pending_freelist[READ]);
205         count -= __blk_cleanup_queue(&q->pending_freelist[WRITE]);
206 
207         if (count)
208                 printk("blk_cleanup_queue: leaked requests (%d)\n", count);
209 
210         memset(q, 0, sizeof(*q));
211 }
212 
213 /**
214  * blk_queue_headactive - indicate whether head of request queue may be active
215  * @q:       The queue which this applies to.
216  * @active:  A flag indication where the head of the queue is active.
217  *
218  * Description:
219  *    The driver for a block device may choose to leave the currently active
220  *    request on the request queue, removing it only when it has completed.
221  *    The queue handling routines assume this by default for safety reasons
222  *    and will not involve the head of the request queue in any merging or
223  *    reordering of requests when the queue is unplugged (and thus may be
224  *    working on this particular request).
225  *
226  *    If a driver removes requests from the queue before processing them, then
227  *    it may indicate that it does so, there by allowing the head of the queue
228  *    to be involved in merging and reordering.  This is done be calling
229  *    blk_queue_headactive() with an @active flag of %0.
230  *
231  *    If a driver processes several requests at once, it must remove them (or
232  *    at least all but one of them) from the request queue.
233  *
234  *    When a queue is plugged (see blk_queue_pluggable()) the head will be
235  *    assumed to be inactive.
236  **/
237  
238 void blk_queue_headactive(request_queue_t * q, int active)
239 {
240         q->head_active = active;
241 }
242 
243 /**
244  * blk_queue_pluggable - define a plugging function for a request queue
245  * @q:   the request queue to which the function will apply
246  * @plug: the function to be called to plug a queue
247  *
248  * Description:
249  *   A request queue will be "plugged" if a request is added to it
250  *   while it is empty.  This allows a number of requests to be added
251  *   before any are processed, thus providing an opportunity for these
252  *   requests to be merged or re-ordered.
253  *   The default plugging function (generic_plug_device()) sets the
254  *   "plugged" flag for the queue and adds a task to the $tq_disk task
255  *   queue to unplug the queue and call the request function at a
256  *   later time.
257  *
258  *   A device driver may provide an alternate plugging function by
259  *   passing it to blk_queue_pluggable().  This function should set
260  *   the "plugged" flag if it want calls to the request_function to be
261  *   blocked, and should place a task on $tq_disk which will unplug
262  *   the queue.  Alternately it can simply do nothing and there-by
263  *   disable plugging of the device.
264  **/
265 
266 void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug)
267 {
268         q->plug_device_fn = plug;
269 }
270 
271 
272 /**
273  * blk_queue_make_request - define an alternate make_request function for a device
274  * @q:  the request queue for the device to be affected
275  * @mfn: the alternate make_request function
276  *
277  * Description:
278  *    The normal way for &struct buffer_heads to be passed to a device
279  *    driver is for them to be collected into requests on a request
280  *    queue, and then to allow the device driver to select requests
281  *    off that queue when it is ready.  This works well for many block
282  *    devices. However some block devices (typically virtual devices
283  *    such as md or lvm) do not benefit from the processing on the
284  *    request queue, and are served best by having the requests passed
285  *    directly to them.  This can be achieved by providing a function
286  *    to blk_queue_make_request().
287  *
288  * Caveat:
289  *    The driver that does this *must* be able to deal appropriately
290  *    with buffers in "highmemory", either by calling bh_kmap() to get
291  *    a kernel mapping, to by calling create_bounce() to create a
292  *    buffer in normal memory.
293  **/
294 
295 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
296 {
297         q->make_request_fn = mfn;
298 }
299 
300 static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
301 {
302         if (req->nr_segments < max_segments) {
303                 req->nr_segments++;
304                 return 1;
305         }
306         return 0;
307 }
308 
309 static int ll_back_merge_fn(request_queue_t *q, struct request *req, 
310                             struct buffer_head *bh, int max_segments)
311 {
312         if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data)
313                 return 1;
314         return ll_new_segment(q, req, max_segments);
315 }
316 
317 static int ll_front_merge_fn(request_queue_t *q, struct request *req, 
318                              struct buffer_head *bh, int max_segments)
319 {
320         if (bh->b_data + bh->b_size == req->bh->b_data)
321                 return 1;
322         return ll_new_segment(q, req, max_segments);
323 }
324 
325 static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
326                                 struct request *next, int max_segments)
327 {
328         int total_segments = req->nr_segments + next->nr_segments;
329 
330         if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data)
331                 total_segments--;
332     
333         if (total_segments > max_segments)
334                 return 0;
335 
336         req->nr_segments = total_segments;
337         return 1;
338 }
339 
340 /*
341  * "plug" the device if there are no outstanding requests: this will
342  * force the transfer to start only after we have put all the requests
343  * on the list.
344  *
345  * This is called with interrupts off and no requests on the queue.
346  * (and with the request spinlock acquired)
347  */
348 static void generic_plug_device(request_queue_t *q, kdev_t dev)
349 {
350         /*
351          * no need to replug device
352          */
353         if (!list_empty(&q->queue_head) || q->plugged)
354                 return;
355 
356         q->plugged = 1;
357         queue_task(&q->plug_tq, &tq_disk);
358 }
359 
360 /*
361  * remove the plug and let it rip..
362  */
363 static inline void __generic_unplug_device(request_queue_t *q)
364 {
365         if (q->plugged) {
366                 q->plugged = 0;
367                 if (!list_empty(&q->queue_head))
368                         q->request_fn(q);
369         }
370 }
371 
372 void generic_unplug_device(void *data)
373 {
374         request_queue_t *q = (request_queue_t *) data;
375         unsigned long flags;
376 
377         spin_lock_irqsave(&io_request_lock, flags);
378         __generic_unplug_device(q);
379         spin_unlock_irqrestore(&io_request_lock, flags);
380 }
381 
382 static void blk_init_free_list(request_queue_t *q)
383 {
384         struct request *rq;
385         int i;
386 
387         INIT_LIST_HEAD(&q->request_freelist[READ]);
388         INIT_LIST_HEAD(&q->request_freelist[WRITE]);
389         INIT_LIST_HEAD(&q->pending_freelist[READ]);
390         INIT_LIST_HEAD(&q->pending_freelist[WRITE]);
391         q->pending_free[READ] = q->pending_free[WRITE] = 0;
392 
393         /*
394          * Divide requests in half between read and write
395          */
396         for (i = 0; i < queue_nr_requests; i++) {
397                 rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
398                 memset(rq, 0, sizeof(struct request));
399                 rq->rq_status = RQ_INACTIVE;
400                 list_add(&rq->table, &q->request_freelist[i & 1]);
401         }
402 
403         init_waitqueue_head(&q->wait_for_request);
404         spin_lock_init(&q->queue_lock);
405 }
406 
407 static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
408 
409 /**
410  * blk_init_queue  - prepare a request queue for use with a block device
411  * @q:    The &request_queue_t to be initialised
412  * @rfn:  The function to be called to process requests that have been
413  *        placed on the queue.
414  *
415  * Description:
416  *    If a block device wishes to use the standard request handling procedures,
417  *    which sorts requests and coalesces adjacent requests, then it must
418  *    call blk_init_queue().  The function @rfn will be called when there
419  *    are requests on the queue that need to be processed.  If the device
420  *    supports plugging, then @rfn may not be called immediately when requests
421  *    are available on the queue, but may be called at some time later instead.
422  *    Plugged queues are generally unplugged when a buffer belonging to one
423  *    of the requests on the queue is needed, or due to memory pressure.
424  *
425  *    @rfn is not required, or even expected, to remove all requests off the
426  *    queue, but only as many as it can handle at a time.  If it does leave
427  *    requests on the queue, it is responsible for arranging that the requests
428  *    get dealt with eventually.
429  *
430  *    A global spin lock $io_request_lock must be held while manipulating the
431  *    requests on the request queue.
432  *
433  *    The request on the head of the queue is by default assumed to be
434  *    potentially active, and it is not considered for re-ordering or merging
435  *    whenever the given queue is unplugged. This behaviour can be changed with
436  *    blk_queue_headactive().
437  *
438  * Note:
439  *    blk_init_queue() must be paired with a blk_cleanup_queue() call
440  *    when the block device is deactivated (such as at module unload).
441  **/
442 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
443 {
444         INIT_LIST_HEAD(&q->queue_head);
445         elevator_init(&q->elevator, ELEVATOR_LINUS);
446         blk_init_free_list(q);
447         q->request_fn           = rfn;
448         q->back_merge_fn        = ll_back_merge_fn;
449         q->front_merge_fn       = ll_front_merge_fn;
450         q->merge_requests_fn    = ll_merge_requests_fn;
451         q->make_request_fn      = __make_request;
452         q->plug_tq.sync         = 0;
453         q->plug_tq.routine      = &generic_unplug_device;
454         q->plug_tq.data         = q;
455         q->plugged              = 0;
456         /*
457          * These booleans describe the queue properties.  We set the
458          * default (and most common) values here.  Other drivers can
459          * use the appropriate functions to alter the queue properties.
460          * as appropriate.
461          */
462         q->plug_device_fn       = generic_plug_device;
463         q->head_active          = 1;
464 }
465 
466 #define blkdev_free_rq(list) list_entry((list)->next, struct request, table);
467 /*
468  * Get a free request. io_request_lock must be held and interrupts
469  * disabled on the way in.
470  */
471 static inline struct request *get_request(request_queue_t *q, int rw)
472 {
473         struct request *rq = NULL;
474 
475         if (!list_empty(&q->request_freelist[rw])) {
476                 rq = blkdev_free_rq(&q->request_freelist[rw]);
477                 list_del(&rq->table);
478                 rq->rq_status = RQ_ACTIVE;
479                 rq->special = NULL;
480                 rq->q = q;
481         }
482 
483         return rq;
484 }
485 
486 /*
487  * No available requests for this queue, unplug the device.
488  */
489 static struct request *__get_request_wait(request_queue_t *q, int rw)
490 {
491         register struct request *rq;
492         DECLARE_WAITQUEUE(wait, current);
493 
494         add_wait_queue_exclusive(&q->wait_for_request, &wait);
495         for (;;) {
496                 __set_current_state(TASK_UNINTERRUPTIBLE);
497                 spin_lock_irq(&io_request_lock);
498                 rq = get_request(q, rw);
499                 spin_unlock_irq(&io_request_lock);
500                 if (rq)
501                         break;
502                 generic_unplug_device(q);
503                 schedule();
504         }
505         remove_wait_queue(&q->wait_for_request, &wait);
506         current->state = TASK_RUNNING;
507         return rq;
508 }
509 
510 static inline struct request *get_request_wait(request_queue_t *q, int rw)
511 {
512         register struct request *rq;
513 
514         spin_lock_irq(&io_request_lock);
515         rq = get_request(q, rw);
516         spin_unlock_irq(&io_request_lock);
517         if (rq)
518                 return rq;
519         return __get_request_wait(q, rw);
520 }
521 
522 /* RO fail safe mechanism */
523 
524 static long ro_bits[MAX_BLKDEV][8];
525 
526 int is_read_only(kdev_t dev)
527 {
528         int minor,major;
529 
530         major = MAJOR(dev);
531         minor = MINOR(dev);
532         if (major < 0 || major >= MAX_BLKDEV) return 0;
533         return ro_bits[major][minor >> 5] & (1 << (minor & 31));
534 }
535 
536 void set_device_ro(kdev_t dev,int flag)
537 {
538         int minor,major;
539 
540         major = MAJOR(dev);
541         minor = MINOR(dev);
542         if (major < 0 || major >= MAX_BLKDEV) return;
543         if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
544         else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
545 }
546 
547 inline void drive_stat_acct (kdev_t dev, int rw,
548                                 unsigned long nr_sectors, int new_io)
549 {
550         unsigned int major = MAJOR(dev);
551         unsigned int index;
552 
553         index = disk_index(dev);
554         if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
555                 return;
556 
557         kstat.dk_drive[major][index] += new_io;
558         if (rw == READ) {
559                 kstat.dk_drive_rio[major][index] += new_io;
560                 kstat.dk_drive_rblk[major][index] += nr_sectors;
561         } else if (rw == WRITE) {
562                 kstat.dk_drive_wio[major][index] += new_io;
563                 kstat.dk_drive_wblk[major][index] += nr_sectors;
564         } else
565                 printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
566 }
567 
568 /*
569  * add-request adds a request to the linked list.
570  * io_request_lock is held and interrupts disabled, as we muck with the
571  * request queue list.
572  *
573  * By this point, req->cmd is always either READ/WRITE, never READA,
574  * which is important for drive_stat_acct() above.
575  */
576 static inline void add_request(request_queue_t * q, struct request * req,
577                                struct list_head *insert_here)
578 {
579         int major;
580 
581         drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
582 
583         if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
584                 spin_unlock_irq(&io_request_lock);
585                 BUG();
586         }
587 
588         /*
589          * elevator indicated where it wants this request to be
590          * inserted at elevator_merge time
591          */
592         list_add(&req->queue, insert_here);
593 
594         major = MAJOR(req->rq_dev);
595         if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
596                 q->request_fn(q);
597 }
598 
599 void inline blk_refill_freelist(request_queue_t *q, int rw)
600 {
601         if (q->pending_free[rw]) {
602                 list_splice(&q->pending_freelist[rw], &q->request_freelist[rw]);
603                 INIT_LIST_HEAD(&q->pending_freelist[rw]);
604                 q->pending_free[rw] = 0;
605         }
606 }
607 
608 /*
609  * Must be called with io_request_lock held and interrupts disabled
610  */
611 void inline blkdev_release_request(struct request *req)
612 {
613         request_queue_t *q = req->q;
614         int rw = req->cmd;
615 
616         req->rq_status = RQ_INACTIVE;
617         req->q = NULL;
618 
619         /*
620          * Request may not have originated from ll_rw_blk. if not,
621          * asumme it has free buffers and check waiters
622          */
623         if (q) {
624                 /*
625                  * we've released enough buffers to start I/O again
626                  */
627                 if (waitqueue_active(&blk_buffers_wait)
628                     && atomic_read(&queued_sectors) < low_queued_sectors)
629                         wake_up(&blk_buffers_wait);
630 
631                 /*
632                  * Add to pending free list and batch wakeups
633                  */
634                 list_add(&req->table, &q->pending_freelist[rw]);
635 
636                 if (++q->pending_free[rw] >= batch_requests) {
637                         int wake_up = q->pending_free[rw];
638                         blk_refill_freelist(q, rw);
639                         wake_up_nr(&q->wait_for_request, wake_up);
640                 }
641         }
642 }
643 
644 /*
645  * Has to be called with the request spinlock acquired
646  */
647 static void attempt_merge(request_queue_t * q,
648                           struct request *req,
649                           int max_sectors,
650                           int max_segments)
651 {
652         struct request *next;
653   
654         next = blkdev_next_request(req);
655         if (req->sector + req->nr_sectors != next->sector)
656                 return;
657         if (req->cmd != next->cmd
658             || req->rq_dev != next->rq_dev
659             || req->nr_sectors + next->nr_sectors > max_sectors
660             || next->sem)
661                 return;
662         /*
663          * If we are not allowed to merge these requests, then
664          * return.  If we are allowed to merge, then the count
665          * will have been updated to the appropriate number,
666          * and we shouldn't do it here too.
667          */
668         if (!q->merge_requests_fn(q, req, next, max_segments))
669                 return;
670 
671         q->elevator.elevator_merge_req_fn(req, next);
672         req->bhtail->b_reqnext = next->bh;
673         req->bhtail = next->bhtail;
674         req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
675         list_del(&next->queue);
676         blkdev_release_request(next);
677 }
678 
679 static inline void attempt_back_merge(request_queue_t * q,
680                                       struct request *req,
681                                       int max_sectors,
682                                       int max_segments)
683 {
684         if (&req->queue == q->queue_head.prev)
685                 return;
686         attempt_merge(q, req, max_sectors, max_segments);
687 }
688 
689 static inline void attempt_front_merge(request_queue_t * q,
690                                        struct list_head * head,
691                                        struct request *req,
692                                        int max_sectors,
693                                        int max_segments)
694 {
695         struct list_head * prev;
696 
697         prev = req->queue.prev;
698         if (head == prev)
699                 return;
700         attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
701 }
702 
703 static int __make_request(request_queue_t * q, int rw,
704                                   struct buffer_head * bh)
705 {
706         unsigned int sector, count;
707         int max_segments = MAX_SEGMENTS;
708         struct request * req = NULL, *freereq = NULL;
709         int rw_ahead, max_sectors, el_ret;
710         struct list_head *head, *insert_here;
711         int latency;
712         elevator_t *elevator = &q->elevator;
713 
714         count = bh->b_size >> 9;
715         sector = bh->b_rsector;
716 
717         rw_ahead = 0;   /* normal case; gets changed below for READA */
718         switch (rw) {
719                 case READA:
720                         rw_ahead = 1;
721                         rw = READ;      /* drop into READ */
722                 case READ:
723                 case WRITE:
724                         latency = elevator_request_latency(elevator, rw);
725                         break;
726                 default:
727                         BUG();
728                         goto end_io;
729         }
730 
731         /* We'd better have a real physical mapping!
732            Check this bit only if the buffer was dirty and just locked
733            down by us so at this point flushpage will block and
734            won't clear the mapped bit under us. */
735         if (!buffer_mapped(bh))
736                 BUG();
737 
738         /*
739          * Temporary solution - in 2.5 this will be done by the lowlevel
740          * driver. Create a bounce buffer if the buffer data points into
741          * high memory - keep the original buffer otherwise.
742          */
743 #if CONFIG_HIGHMEM
744         bh = create_bounce(rw, bh);
745 #endif
746 
747 /* look for a free request. */
748         /*
749          * Try to coalesce the new request with old requests
750          */
751         max_sectors = get_max_sectors(bh->b_rdev);
752 
753 again:
754         head = &q->queue_head;
755         /*
756          * Now we acquire the request spinlock, we have to be mega careful
757          * not to schedule or do something nonatomic
758          */
759         spin_lock_irq(&io_request_lock);
760 
761         insert_here = head->prev;
762         if (list_empty(head)) {
763                 q->plug_device_fn(q, bh->b_rdev); /* is atomic */
764                 goto get_rq;
765         } else if (q->head_active && !q->plugged)
766                 head = head->next;
767 
768         el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,
769                                              max_sectors, max_segments);
770         switch (el_ret) {
771 
772                 case ELEVATOR_BACK_MERGE:
773                         if (!q->back_merge_fn(q, req, bh, max_segments))
774                                 break;
775                         elevator->elevator_merge_cleanup_fn(q, req, count);
776                         req->bhtail->b_reqnext = bh;
777                         req->bhtail = bh;
778                         req->nr_sectors = req->hard_nr_sectors += count;
779                         blk_started_io(count);
780                         drive_stat_acct(req->rq_dev, req->cmd, count, 0);
781                         attempt_back_merge(q, req, max_sectors, max_segments);
782                         goto out;
783 
784                 case ELEVATOR_FRONT_MERGE:
785                         if (!q->front_merge_fn(q, req, bh, max_segments))
786                                 break;
787                         elevator->elevator_merge_cleanup_fn(q, req, count);
788                         bh->b_reqnext = req->bh;
789                         req->bh = bh;
790                         req->buffer = bh->b_data;
791                         req->current_nr_sectors = count;
792                         req->sector = req->hard_sector = sector;
793                         req->nr_sectors = req->hard_nr_sectors += count;
794                         blk_started_io(count);
795                         drive_stat_acct(req->rq_dev, req->cmd, count, 0);
796                         attempt_front_merge(q, head, req, max_sectors, max_segments);
797                         goto out;
798 
799                 /*
800                  * elevator says don't/can't merge. get new request
801                  */
802                 case ELEVATOR_NO_MERGE:
803                         /*
804                          * use elevator hints as to where to insert the
805                          * request. if no hints, just add it to the back
806                          * of the queue
807                          */
808                         if (req)
809                                 insert_here = &req->queue;
810                         break;
811 
812                 default:
813                         printk("elevator returned crap (%d)\n", el_ret);
814                         BUG();
815         }
816                 
817         /*
818          * Grab a free request from the freelist - if that is empty, check
819          * if we are doing read ahead and abort instead of blocking for
820          * a free slot.
821          */
822 get_rq:
823         if (freereq) {
824                 req = freereq;
825                 freereq = NULL;
826         } else if ((req = get_request(q, rw)) == NULL) {
827                 spin_unlock_irq(&io_request_lock);
828                 if (rw_ahead)
829                         goto end_io;
830 
831                 freereq = __get_request_wait(q, rw);
832                 goto again;
833         }
834 
835 /* fill up the request-info, and add it to the queue */
836         req->elevator_sequence = latency;
837         req->cmd = rw;
838         req->errors = 0;
839         req->hard_sector = req->sector = sector;
840         req->hard_nr_sectors = req->nr_sectors = count;
841         req->current_nr_sectors = count;
842         req->nr_segments = 1; /* Always 1 for a new request. */
843         req->nr_hw_segments = 1; /* Always 1 for a new request. */
844         req->buffer = bh->b_data;
845         req->sem = NULL;
846         req->bh = bh;
847         req->bhtail = bh;
848         req->rq_dev = bh->b_rdev;
849         blk_started_io(count);
850         add_request(q, req, insert_here);
851 out:
852         if (freereq)
853                 blkdev_release_request(freereq);
854         if (!q->plugged)
855                 q->request_fn(q);
856         spin_unlock_irq(&io_request_lock);
857         return 0;
858 end_io:
859         bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
860         return 0;
861 }
862 
863 /**
864  * generic_make_request: hand a buffer head to it's device driver for I/O
865  * @rw:  READ, WRITE, or READA - what sort of I/O is desired.
866  * @bh:  The buffer head describing the location in memory and on the device.
867  *
868  * generic_make_request() is used to make I/O requests of block
869  * devices. It is passed a &struct buffer_head and a &rw value.  The
870  * %READ and %WRITE options are (hopefully) obvious in meaning.  The
871  * %READA value means that a read is required, but that the driver is
872  * free to fail the request if, for example, it cannot get needed
873  * resources immediately.
874  *
875  * generic_make_request() does not return any status.  The
876  * success/failure status of the request, along with notification of
877  * completion, is delivered asynchronously through the bh->b_end_io
878  * function described (one day) else where.
879  *
880  * The caller of generic_make_request must make sure that b_page,
881  * b_addr, b_size are set to describe the memory buffer, that b_rdev
882  * and b_rsector are set to describe the device address, and the
883  * b_end_io and optionally b_private are set to describe how
884  * completion notification should be signaled.  BH_Mapped should also
885  * be set (to confirm that b_dev and b_blocknr are valid).
886  *
887  * generic_make_request and the drivers it calls may use b_reqnext,
888  * and may change b_rdev and b_rsector.  So the values of these fields
889  * should NOT be depended on after the call to generic_make_request.
890  * Because of this, the caller should record the device address
891  * information in b_dev and b_blocknr.
892  *
893  * Apart from those fields mentioned above, no other fields, and in
894  * particular, no other flags, are changed by generic_make_request or
895  * any lower level drivers.
896  * */
897 void generic_make_request (int rw, struct buffer_head * bh)
898 {
899         int major = MAJOR(bh->b_rdev);
900         request_queue_t *q;
901 
902         if (!bh->b_end_io)
903                 BUG();
904 
905         if (blk_size[major]) {
906                 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
907                 unsigned long sector = bh->b_rsector;
908                 unsigned int count = bh->b_size >> 9;
909 
910                 if (maxsector < count || maxsector - count < sector) {
911                         bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
912                         if (blk_size[major][MINOR(bh->b_rdev)]) {
913                                 
914                                 /* This may well happen - the kernel calls bread()
915                                    without checking the size of the device, e.g.,
916                                    when mounting a device. */
917                                 printk(KERN_INFO
918                                        "attempt to access beyond end of device\n");
919                                 printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
920                                        kdevname(bh->b_rdev), rw,
921                                        (sector + count)>>1,
922                                        blk_size[major][MINOR(bh->b_rdev)]);
923                         }
924                         bh->b_end_io(bh, 0);
925                         return;
926                 }
927         }
928 
929         /*
930          * Resolve the mapping until finished. (drivers are
931          * still free to implement/resolve their own stacking
932          * by explicitly returning 0)
933          */
934         /* NOTE: we don't repeat the blk_size check for each new device.
935          * Stacking drivers are expected to know what they are doing.
936          */
937         do {
938                 q = blk_get_queue(bh->b_rdev);
939                 if (!q) {
940                         printk(KERN_ERR
941                                "generic_make_request: Trying to access nonexistent block-device %s (%ld)\n",
942                                kdevname(bh->b_rdev), bh->b_rsector);
943                         buffer_IO_error(bh);
944                         break;
945                 }
946         } while (q->make_request_fn(q, rw, bh));
947 }
948 
949 
950 /**
951  * submit_bh: submit a buffer_head to the block device later for I/O
952  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
953  * @bh: The &struct buffer_head which describes the I/O
954  *
955  * submit_bh() is very similar in purpose to generic_make_request(), and
956  * uses that function to do most of the work.
957  *
958  * The extra functionality provided by submit_bh is to determine
959  * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
960  * This is is appropriate for IO requests that come from the buffer
961  * cache and page cache which (currently) always use aligned blocks.
962  */
963 void submit_bh(int rw, struct buffer_head * bh)
964 {
965         if (!test_bit(BH_Lock, &bh->b_state))
966                 BUG();
967 
968         set_bit(BH_Req, &bh->b_state);
969 
970         /*
971          * First step, 'identity mapping' - RAID or LVM might
972          * further remap this.
973          */
974         bh->b_rdev = bh->b_dev;
975         bh->b_rsector = bh->b_blocknr * (bh->b_size >> 9);
976 
977         generic_make_request(rw, bh);
978 
979         switch (rw) {
980                 case WRITE:
981                         kstat.pgpgout++;
982                         break;
983                 default:
984                         kstat.pgpgin++;
985                         break;
986         }
987 }
988 
/*
 * Default I/O completion handler, installed as b_end_io by ll_rw_block():
 * pass the transfer result on to mark_buffer_uptodate(), then unlock the
 * buffer.
 */
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}
997 
998 /**
999  * ll_rw_block: low-level access to block devices
1000  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
1001  * @nr: number of &struct buffer_heads in the array
1002  * @bhs: array of pointers to &struct buffer_head
1003  *
1004  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
1005  * and requests an I/O operation on them, either a %READ or a %WRITE.
1006  * The third %READA option is described in the documentation for
1007  * generic_make_request() which ll_rw_block() calls.
1008  *
1009  * This function provides extra functionality that is not in
1010  * generic_make_request() that is relevant to buffers in the buffer
1011  * cache or page cache.  In particular it drops any buffer that it
1012  * cannot get a lock on (with the BH_Lock state bit), any buffer that
1013  * appears to be clean when doing a write request, and any buffer that
1014  * appears to be up-to-date when doing read request.  Further it marks
1015  * as clean buffers that are processed for writing (the buffer cache
1016  * wont assume that they are actually clean until the buffer gets
1017  * unlocked).
1018  *
1019  * ll_rw_block sets b_end_io to simple completion handler that marks
1020  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
1021  * any waiters.  As client that needs a more interesting completion
1022  * routine should call submit_bh() (or generic_make_request())
1023  * directly.
1024  *
1025  * Caveat:
1026  *  All of the buffers must be for the same device, and must also be
1027  *  of the current approved size for the device.  */
1028 
1029 void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
1030 {
1031         unsigned int major;
1032         int correct_size;
1033         int i;
1034 
1035         if (!nr)
1036                 return;
1037 
1038         major = MAJOR(bhs[0]->b_dev);
1039 
1040         /* Determine correct block size for this device. */
1041         correct_size = BLOCK_SIZE;
1042         if (blksize_size[major]) {
1043                 i = blksize_size[major][MINOR(bhs[0]->b_dev)];
1044                 if (i)
1045                         correct_size = i;
1046         }
1047 
1048         /* Verify requested block sizes. */
1049         for (i = 0; i < nr; i++) {
1050                 struct buffer_head *bh = bhs[i];
1051                 if (bh->b_size % correct_size) {
1052                         printk(KERN_NOTICE "ll_rw_block: device %s: "
1053                                "only %d-char blocks implemented (%u)\n",
1054                                kdevname(bhs[0]->b_dev),
1055                                correct_size, bh->b_size);
1056                         goto sorry;
1057                 }
1058         }
1059 
1060         if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
1061                 printk(KERN_NOTICE "Can't write to read-only device %s\n",
1062                        kdevname(bhs[0]->b_dev));
1063                 goto sorry;
1064         }
1065 
1066         for (i = 0; i < nr; i++) {
1067                 struct buffer_head *bh = bhs[i];
1068 
1069                 /*
1070                  * don't lock any more buffers if we are above the high
1071                  * water mark. instead start I/O on the queued stuff.
1072                  */
1073                 if (atomic_read(&queued_sectors) >= high_queued_sectors) {
1074                         run_task_queue(&tq_disk);
1075                         wait_event(blk_buffers_wait,
1076                          atomic_read(&queued_sectors) < low_queued_sectors);
1077                 }
1078 
1079                 /* Only one thread can actually submit the I/O. */
1080                 if (test_and_set_bit(BH_Lock, &bh->b_state))
1081                         continue;
1082 
1083                 /* We have the buffer lock */
1084                 bh->b_end_io = end_buffer_io_sync;
1085 
1086                 switch(rw) {
1087                 case WRITE:
1088                         if (!atomic_set_buffer_clean(bh))
1089                                 /* Hmmph! Nothing to write */
1090                                 goto end_io;
1091                         __mark_buffer_clean(bh);
1092                         break;
1093 
1094                 case READA:
1095                 case READ:
1096                         if (buffer_uptodate(bh))
1097                                 /* Hmmph! Already have it */
1098                                 goto end_io;
1099                         break;
1100                 default:
1101                         BUG();
1102         end_io:
1103                         bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1104                         continue;
1105                 }
1106 
1107                 submit_bh(rw, bh);
1108         }
1109         return;
1110 
1111 sorry:
1112         /* Make sure we don't get infinite dirty retries.. */
1113         for (i = 0; i < nr; i++)
1114                 mark_buffer_clean(bhs[i]);
1115 }
1116 
/* Called from blk_dev_init() when CONFIG_STRAM_SWAP is enabled. */
#ifdef CONFIG_STRAM_SWAP
extern int stram_device_init(void);
#endif
1120 
1121 
1122 /**
1123  * end_that_request_first - end I/O on one buffer.
1124  * @req:      the request being processed
1125  * @uptodate: 0 for I/O error
1126  * @name:     the name printed for an I/O error
1127  *
1128  * Description:
1129  *     Ends I/O on the first buffer attached to @req, and sets it up
1130  *     for the next buffer_head (if any) in the cluster.
1131  *     
1132  * Return:
1133  *     0 - we are done with this request, call end_that_request_last()
1134  *     1 - still buffers pending for this request
1135  *
1136  * Caveat: 
1137  *     Drivers implementing their own end_request handling must call
1138  *     blk_finished_io() appropriately.
1139  **/
1140 
1141 int end_that_request_first (struct request *req, int uptodate, char *name)
1142 {
1143         struct buffer_head * bh;
1144         int nsect;
1145 
1146         req->errors = 0;
1147         if (!uptodate)
1148                 printk("end_request: I/O error, dev %s (%s), sector %lu\n",
1149                         kdevname(req->rq_dev), name, req->sector);
1150 
1151         if ((bh = req->bh) != NULL) {
1152                 nsect = bh->b_size >> 9;
1153                 blk_finished_io(nsect);
1154                 req->bh = bh->b_reqnext;
1155                 bh->b_reqnext = NULL;
1156                 bh->b_end_io(bh, uptodate);
1157                 if ((bh = req->bh) != NULL) {
1158                         req->hard_sector += nsect;
1159                         req->hard_nr_sectors -= nsect;
1160                         req->sector = req->hard_sector;
1161                         req->nr_sectors = req->hard_nr_sectors;
1162 
1163                         req->current_nr_sectors = bh->b_size >> 9;
1164                         if (req->nr_sectors < req->current_nr_sectors) {
1165                                 req->nr_sectors = req->current_nr_sectors;
1166                                 printk("end_request: buffer-list destroyed\n");
1167                         }
1168                         req->buffer = bh->b_data;
1169                         return 1;
1170                 }
1171         }
1172         return 0;
1173 }
1174 
1175 void end_that_request_last(struct request *req)
1176 {
1177         if (req->sem != NULL)
1178                 up(req->sem);
1179 
1180         blkdev_release_request(req);
1181 }
1182 
1183 #define MB(kb)  ((kb) << 10)
1184 
1185 int __init blk_dev_init(void)
1186 {
1187         struct blk_dev_struct *dev;
1188         int total_ram;
1189 
1190         request_cachep = kmem_cache_create("blkdev_requests",
1191                                            sizeof(struct request),
1192                                            0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1193 
1194         if (!request_cachep)
1195                 panic("Can't create request pool slab cache\n");
1196 
1197         for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
1198                 dev->queue = NULL;
1199 
1200         memset(ro_bits,0,sizeof(ro_bits));
1201         memset(max_readahead, 0, sizeof(max_readahead));
1202         memset(max_sectors, 0, sizeof(max_sectors));
1203 
1204         atomic_set(&queued_sectors, 0);
1205         total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
1206 
1207         /*
1208          * Try to keep 128MB max hysteris. If not possible,
1209          * use half of RAM
1210          */
1211         high_queued_sectors = (total_ram * 2) / 3;
1212         low_queued_sectors = high_queued_sectors / 3;
1213         if (high_queued_sectors - low_queued_sectors > MB(128))
1214                 low_queued_sectors = high_queued_sectors - MB(128);
1215 
1216 
1217         /*
1218          * make it sectors (512b)
1219          */
1220         high_queued_sectors <<= 1;
1221         low_queued_sectors <<= 1;
1222 
1223         /*
1224          * Scale free request slots per queue too
1225          */
1226         total_ram = (total_ram + MB(32) - 1) & ~(MB(32) - 1);
1227         if ((queue_nr_requests = total_ram >> 9) > QUEUE_NR_REQUESTS)
1228                 queue_nr_requests = QUEUE_NR_REQUESTS;
1229 
1230         /*
1231          * adjust batch frees according to queue length, with upper limit
1232          */
1233         if ((batch_requests = queue_nr_requests >> 3) > 32)
1234                 batch_requests = 32;
1235 
1236         printk("block: queued sectors max/low %dkB/%dkB, %d slots per queue\n",
1237                                                 high_queued_sectors / 2,
1238                                                 low_queued_sectors / 2,
1239                                                 queue_nr_requests);
1240 
1241 #ifdef CONFIG_AMIGA_Z2RAM
1242         z2_init();
1243 #endif
1244 #ifdef CONFIG_STRAM_SWAP
1245         stram_device_init();
1246 #endif
1247 #ifdef CONFIG_BLK_DEV_RAM
1248         rd_init();
1249 #endif
1250 #ifdef CONFIG_BLK_DEV_LOOP
1251         loop_init();
1252 #endif
1253 #ifdef CONFIG_ISP16_CDI
1254         isp16_init();
1255 #endif
1256 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
1257         ide_init();             /* this MUST precede hd_init */
1258 #endif
1259 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
1260         hd_init();
1261 #endif
1262 #ifdef CONFIG_BLK_DEV_PS2
1263         ps2esdi_init();
1264 #endif
1265 #ifdef CONFIG_BLK_DEV_XD
1266         xd_init();
1267 #endif
1268 #ifdef CONFIG_BLK_DEV_MFM
1269         mfm_init();
1270 #endif
1271 #ifdef CONFIG_PARIDE
1272         { extern void paride_init(void); paride_init(); };
1273 #endif
1274 #ifdef CONFIG_MAC_FLOPPY
1275         swim3_init();
1276 #endif
1277 #ifdef CONFIG_BLK_DEV_SWIM_IOP
1278         swimiop_init();
1279 #endif
1280 #ifdef CONFIG_AMIGA_FLOPPY
1281         amiga_floppy_init();
1282 #endif
1283 #ifdef CONFIG_ATARI_FLOPPY
1284         atari_floppy_init();
1285 #endif
1286 #ifdef CONFIG_BLK_DEV_FD
1287         floppy_init();
1288 #else
1289 #if defined(__i386__)   /* Do we even need this? */
1290         outb_p(0xc, 0x3f2);
1291 #endif
1292 #endif
1293 #ifdef CONFIG_CDU31A
1294         cdu31a_init();
1295 #endif
1296 #ifdef CONFIG_ATARI_ACSI
1297         acsi_init();
1298 #endif
1299 #ifdef CONFIG_MCD
1300         mcd_init();
1301 #endif
1302 #ifdef CONFIG_MCDX
1303         mcdx_init();
1304 #endif
1305 #ifdef CONFIG_SBPCD
1306         sbpcd_init();
1307 #endif
1308 #ifdef CONFIG_AZTCD
1309         aztcd_init();
1310 #endif
1311 #ifdef CONFIG_CDU535
1312         sony535_init();
1313 #endif
1314 #ifdef CONFIG_GSCD
1315         gscd_init();
1316 #endif
1317 #ifdef CONFIG_CM206
1318         cm206_init();
1319 #endif
1320 #ifdef CONFIG_OPTCD
1321         optcd_init();
1322 #endif
1323 #ifdef CONFIG_SJCD
1324         sjcd_init();
1325 #endif
1326 #ifdef CONFIG_APBLOCK
1327         ap_init();
1328 #endif
1329 #ifdef CONFIG_DDV
1330         ddv_init();
1331 #endif
1332 #ifdef CONFIG_BLK_DEV_NBD
1333         nbd_init();
1334 #endif
1335 #ifdef CONFIG_MDISK
1336         mdisk_init();
1337 #endif
1338 #ifdef CONFIG_DASD
1339         dasd_init();
1340 #endif
1341 #ifdef CONFIG_SUN_JSFLASH
1342         jsfd_init();
1343 #endif
1344         return 0;
1345 };
1346 
/* Symbols exported to modules. */

/* Global request lock and the queued-sector counter */
EXPORT_SYMBOL(io_request_lock);
EXPORT_SYMBOL(queued_sectors);

/* Request completion and release */
EXPORT_SYMBOL(end_that_request_first);
EXPORT_SYMBOL(end_that_request_last);
EXPORT_SYMBOL(blkdev_release_request);

/* Queue setup, lookup and teardown */
EXPORT_SYMBOL(blk_init_queue);
EXPORT_SYMBOL(blk_get_queue);
EXPORT_SYMBOL(__blk_get_queue);
EXPORT_SYMBOL(blk_cleanup_queue);
EXPORT_SYMBOL(blk_queue_headactive);
EXPORT_SYMBOL(blk_queue_pluggable);
EXPORT_SYMBOL(blk_queue_make_request);

/* Submitting I/O and unplugging a device */
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(generic_unplug_device);
1361 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.