~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/fs/buffer.c

Version: ~ [ 2.2.5 ] ~ [ 2.4.1 ] ~ [ 2.4.9 ] ~ [ 2.6.17.10 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *  linux/fs/buffer.c
  3  *
  4  *  Copyright (C) 1991, 1992  Linus Torvalds
  5  */
  6 
  7 /*
  8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
  9  * been avoided by NEVER letting an interrupt change a buffer (except for the
 10  * data, of course), but instead letting the caller do it.
 11  */
 12 
 13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
 14 
 15 /* Removed a lot of unnecessary code and simplified things now that
 16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 17  */
 18 
 19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 20  * hash table, use SLAB cache for buffer heads. -DaveM
 21  */
 22 
 23 /* Added 32k buffer block sizes - these are required by older ARM systems.
 24  * - RMK
 25  */
 26 
 27 /* Thread it... -DaveM */
 28 
 29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
 30 
 31 #include <linux/config.h>
 32 #include <linux/sched.h>
 33 #include <linux/fs.h>
 34 #include <linux/malloc.h>
 35 #include <linux/locks.h>
 36 #include <linux/errno.h>
 37 #include <linux/swap.h>
 38 #include <linux/swapctl.h>
 39 #include <linux/smp_lock.h>
 40 #include <linux/vmalloc.h>
 41 #include <linux/blkdev.h>
 42 #include <linux/sysrq.h>
 43 #include <linux/file.h>
 44 #include <linux/init.h>
 45 #include <linux/quotaops.h>
 46 #include <linux/iobuf.h>
 47 #include <linux/highmem.h>
 48 
 49 #include <asm/uaccess.h>
 50 #include <asm/io.h>
 51 #include <asm/bitops.h>
 52 #include <asm/mmu_context.h>
 53 
 54 #define NR_SIZES 7
 55 static char buffersize_index[65] =
 56 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
 57   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 58   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 59  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 60   6};
 61 
 62 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
 63 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
 64 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
 65 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
 66                                              number of unused buffer heads */
 67 
 68 /* Anti-deadlock ordering:
 69  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 70  */
 71 
 72 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
 73 
 74 /*
 75  * Hash table gook..
 76  */
 77 static unsigned int bh_hash_mask;
 78 static unsigned int bh_hash_shift;
 79 static struct buffer_head **hash_table;
 80 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
 81 
 82 static struct buffer_head *lru_list[NR_LIST];
 83 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
 84 static int nr_buffers_type[NR_LIST];
 85 static unsigned long size_buffers_type[NR_LIST];
 86 
 87 static struct buffer_head * unused_list;
 88 static int nr_unused_buffer_heads;
 89 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
 90 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
 91 
 92 struct bh_free_head {
 93         struct buffer_head *list;
 94         spinlock_t lock;
 95 };
 96 static struct bh_free_head free_list[NR_SIZES];
 97 
 98 static int grow_buffers(int size);
 99 static void __refile_buffer(struct buffer_head *);
100 
101 /* This is used by some architectures to estimate available memory. */
102 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 
/*
 * Parameter block for the bdflush process.  Anyone adding or removing
 * a parameter must update kernel/sysctl.c as well.
 */

#define N_PARAM 9

/*
 * The dummy slots are kept for compatibility with old programs that
 * play with the /proc entries.
 */
union bdflush_param {
        struct {
                /* Percentage of buffer cache dirty to activate bdflush */
                int nfract;
                /* Maximum number of dirty blocks to write out per wake-cycle */
                int ndirty;
                /* Number of clean buffers to try to obtain each time we call refill */
                int nrefill;
                /* unused */
                int dummy1;
                /* jiffies delay between kupdate flushes */
                int interval;
                /* Time for normal buffer to age before we flush it */
                int age_buffer;
                /* Percentage of buffer cache dirty to activate bdflush synchronously */
                int nfract_sync;
                /* unused */
                int dummy2;
                /* unused */
                int dummy3;
        } b_un;
        unsigned int data[N_PARAM];
} bdf_prm = {{
        30,             /* nfract */
        64,             /* ndirty */
        64,             /* nrefill */
        256,            /* dummy1 */
        5*HZ,           /* interval */
        30*HZ,          /* age_buffer */
        60,             /* nfract_sync */
        0,              /* dummy2 */
        0               /* dummy3 */
}};

/* Smallest and largest value accepted for each parameter, in slot order. */
int bdflush_min[N_PARAM] = {
        0, 10, 5, 25, 0, 1*HZ, 0, 0, 0
};
int bdflush_max[N_PARAM] = {
        100, 50000, 20000, 20000, 600*HZ, 6000*HZ, 100, 0, 0
};
135 
136 /*
137  * Rewrote the wait-routines to use the "new" wait-queue functionality,
138  * and getting rid of the cli-sti pairs. The wait-queue routines still
139  * need cli-sti, but now it's just a couple of 386 instructions or so.
140  *
141  * Note that the real wait_on_buffer() is an inline function that checks
142  * if 'b_wait' is set before calling this, so that the queues aren't set
143  * up unnecessarily.
144  */
145 void __wait_on_buffer(struct buffer_head * bh)
146 {
147         struct task_struct *tsk = current;
148         DECLARE_WAITQUEUE(wait, tsk);
149 
150         atomic_inc(&bh->b_count);
151         add_wait_queue(&bh->b_wait, &wait);
152         do {
153                 run_task_queue(&tq_disk);
154                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155                 if (!buffer_locked(bh))
156                         break;
157                 schedule();
158         } while (buffer_locked(bh));
159         tsk->state = TASK_RUNNING;
160         remove_wait_queue(&bh->b_wait, &wait);
161         atomic_dec(&bh->b_count);
162 }
163 
164 /* Call sync_buffers with wait!=0 to ensure that the call does not
165  * return until all buffer writes have completed.  Sync() may return
166  * before the writes have finished; fsync() may not.
167  */
168 
169 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
170  * spontaneously dirty themselves without ever brelse being called.
171  * We will ultimately want to put these in a separate list, but for
172  * now we search all of the lists for dirty buffers.
173  */
174 static int sync_buffers(kdev_t dev, int wait)
175 {
176         int i, retry, pass = 0, err = 0;
177         struct buffer_head * bh, *next;
178 
179         /* One pass for no-wait, three for wait:
180          * 0) write out all dirty, unlocked buffers;
181          * 1) write out all dirty buffers, waiting if locked;
182          * 2) wait for completion by waiting for all buffers to unlock.
183          */
184         do {
185                 retry = 0;
186 
187                 /* We search all lists as a failsafe mechanism, not because we expect
188                  * there to be dirty buffers on any of the other lists.
189                  */
190 repeat:
191                 spin_lock(&lru_list_lock);
192                 bh = lru_list[BUF_DIRTY];
193                 if (!bh)
194                         goto repeat2;
195 
196                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
197                         next = bh->b_next_free;
198 
199                         if (!lru_list[BUF_DIRTY])
200                                 break;
201                         if (dev && bh->b_dev != dev)
202                                 continue;
203                         if (buffer_locked(bh)) {
204                                 /* Buffer is locked; skip it unless wait is
205                                  * requested AND pass > 0.
206                                  */
207                                 if (!wait || !pass) {
208                                         retry = 1;
209                                         continue;
210                                 }
211                                 atomic_inc(&bh->b_count);
212                                 spin_unlock(&lru_list_lock);
213                                 wait_on_buffer (bh);
214                                 atomic_dec(&bh->b_count);
215                                 goto repeat;
216                         }
217 
218                         /* If an unlocked buffer is not uptodate, there has
219                          * been an IO error. Skip it.
220                          */
221                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
222                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
223                                 err = -EIO;
224                                 continue;
225                         }
226 
227                         /* Don't write clean buffers.  Don't write ANY buffers
228                          * on the third pass.
229                          */
230                         if (!buffer_dirty(bh) || pass >= 2)
231                                 continue;
232 
233                         atomic_inc(&bh->b_count);
234                         spin_unlock(&lru_list_lock);
235                         ll_rw_block(WRITE, 1, &bh);
236                         atomic_dec(&bh->b_count);
237                         retry = 1;
238                         goto repeat;
239                 }
240 
241     repeat2:
242                 bh = lru_list[BUF_LOCKED];
243                 if (!bh) {
244                         spin_unlock(&lru_list_lock);
245                         break;
246                 }
247                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
248                         next = bh->b_next_free;
249 
250                         if (!lru_list[BUF_LOCKED])
251                                 break;
252                         if (dev && bh->b_dev != dev)
253                                 continue;
254                         if (buffer_locked(bh)) {
255                                 /* Buffer is locked; skip it unless wait is
256                                  * requested AND pass > 0.
257                                  */
258                                 if (!wait || !pass) {
259                                         retry = 1;
260                                         continue;
261                                 }
262                                 atomic_inc(&bh->b_count);
263                                 spin_unlock(&lru_list_lock);
264                                 wait_on_buffer (bh);
265                                 spin_lock(&lru_list_lock);
266                                 atomic_dec(&bh->b_count);
267                                 goto repeat2;
268                         }
269                 }
270                 spin_unlock(&lru_list_lock);
271 
272                 /* If we are waiting for the sync to succeed, and if any dirty
273                  * blocks were written, then repeat; on the second pass, only
274                  * wait for buffers being written (do not pass to write any
275                  * more buffers on the second pass).
276                  */
277         } while (wait && retry && ++pass<=2);
278         return err;
279 }
280 
281 void sync_dev(kdev_t dev)
282 {
283         sync_supers(dev);
284         sync_inodes(dev);
285         DQUOT_SYNC(dev);
286         /* sync all the dirty buffers out to disk only _after_ all the
287            high level layers finished generated buffer dirty data
288            (or we'll return with some buffer still dirty on the blockdevice
289            so breaking the semantics of this call) */
290         sync_buffers(dev, 0);
291         /*
292          * FIXME(eric) we need to sync the physical devices here.
293          * This is because some (scsi) controllers have huge amounts of
294          * cache onboard (hundreds of Mb), and we need to instruct
295          * them to commit all of the dirty memory to disk, and we should
296          * not return until this has happened.
297          *
298          * This would need to get implemented by going through the assorted
299          * layers so that each block major number can be synced, and this
300          * would call down into the upper and mid-layer scsi.
301          */
302 }
303 
304 int fsync_dev(kdev_t dev)
305 {
306         sync_buffers(dev, 0);
307 
308         lock_kernel();
309         sync_supers(dev);
310         sync_inodes(dev);
311         DQUOT_SYNC(dev);
312         unlock_kernel();
313 
314         return sync_buffers(dev, 1);
315 }
316 
317 asmlinkage long sys_sync(void)
318 {
319         fsync_dev(0);
320         return 0;
321 }
322 
323 /*
324  *      filp may be NULL if called via the msync of a vma.
325  */
326  
327 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
328 {
329         struct inode * inode = dentry->d_inode;
330         struct super_block * sb;
331         kdev_t dev;
332         int ret;
333 
334         lock_kernel();
335         /* sync the inode to buffers */
336         write_inode_now(inode, 0);
337 
338         /* sync the superblock to buffers */
339         sb = inode->i_sb;
340         lock_super(sb);
341         if (sb->s_op && sb->s_op->write_super)
342                 sb->s_op->write_super(sb);
343         unlock_super(sb);
344 
345         /* .. finally sync the buffers to disk */
346         dev = inode->i_dev;
347         ret = sync_buffers(dev, 1);
348         unlock_kernel();
349         return ret;
350 }
351 
352 asmlinkage long sys_fsync(unsigned int fd)
353 {
354         struct file * file;
355         struct dentry * dentry;
356         struct inode * inode;
357         int err;
358 
359         err = -EBADF;
360         file = fget(fd);
361         if (!file)
362                 goto out;
363 
364         dentry = file->f_dentry;
365         inode = dentry->d_inode;
366 
367         err = -EINVAL;
368         if (!file->f_op || !file->f_op->fsync)
369                 goto out_putf;
370 
371         /* We need to protect against concurrent writers.. */
372         down(&inode->i_sem);
373         filemap_fdatasync(inode->i_mapping);
374         err = file->f_op->fsync(file, dentry, 0);
375         filemap_fdatawait(inode->i_mapping);
376         up(&inode->i_sem);
377 
378 out_putf:
379         fput(file);
380 out:
381         return err;
382 }
383 
384 asmlinkage long sys_fdatasync(unsigned int fd)
385 {
386         struct file * file;
387         struct dentry * dentry;
388         struct inode * inode;
389         int err;
390 
391         err = -EBADF;
392         file = fget(fd);
393         if (!file)
394                 goto out;
395 
396         dentry = file->f_dentry;
397         inode = dentry->d_inode;
398 
399         err = -EINVAL;
400         if (!file->f_op || !file->f_op->fsync)
401                 goto out_putf;
402 
403         down(&inode->i_sem);
404         filemap_fdatasync(inode->i_mapping);
405         err = file->f_op->fsync(file, dentry, 1);
406         filemap_fdatawait(inode->i_mapping);
407         up(&inode->i_sem);
408 
409 out_putf:
410         fput(file);
411 out:
412         return err;
413 }
414 
/*
 * After several hours of tedious analysis, the following hash
 * function won.  Do not mess with it... -DaveM
 *
 * _hashfn() mixes shifted copies of dev and block; hash() masks the
 * result with bh_hash_mask and selects the hash_table bucket.
 */
#define _hashfn(dev,block)                                      \
        ((((dev)<<(bh_hash_shift - 6)) ^                        \
          ((dev)<<(bh_hash_shift - 9))) ^                       \
         (((block)<<(bh_hash_shift - 6)) ^                      \
          ((block) >> 13) ^                                     \
          ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) \
        hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
423 
424 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
425 {
426         if ((bh->b_next = *head) != NULL)
427                 bh->b_next->b_pprev = &bh->b_next;
428         *head = bh;
429         bh->b_pprev = head;
430 }
431 
432 static __inline__ void __hash_unlink(struct buffer_head *bh)
433 {
434         if (bh->b_pprev) {
435                 if (bh->b_next)
436                         bh->b_next->b_pprev = bh->b_pprev;
437                 *(bh->b_pprev) = bh->b_next;
438                 bh->b_pprev = NULL;
439         }
440 }
441 
442 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
443 {
444         struct buffer_head **bhp = &lru_list[blist];
445 
446         if(!*bhp) {
447                 *bhp = bh;
448                 bh->b_prev_free = bh;
449         }
450         bh->b_next_free = *bhp;
451         bh->b_prev_free = (*bhp)->b_prev_free;
452         (*bhp)->b_prev_free->b_next_free = bh;
453         (*bhp)->b_prev_free = bh;
454         nr_buffers_type[blist]++;
455         size_buffers_type[blist] += bh->b_size;
456 }
457 
458 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
459 {
460         if (bh->b_prev_free || bh->b_next_free) {
461                 bh->b_prev_free->b_next_free = bh->b_next_free;
462                 bh->b_next_free->b_prev_free = bh->b_prev_free;
463                 if (lru_list[blist] == bh)
464                         lru_list[blist] = bh->b_next_free;
465                 if (lru_list[blist] == bh)
466                         lru_list[blist] = NULL;
467                 bh->b_next_free = bh->b_prev_free = NULL;
468                 nr_buffers_type[blist]--;
469                 size_buffers_type[blist] -= bh->b_size;
470         }
471 }
472 
473 static void __remove_from_free_list(struct buffer_head * bh, int index)
474 {
475         if(bh->b_next_free == bh)
476                  free_list[index].list = NULL;
477         else {
478                 bh->b_prev_free->b_next_free = bh->b_next_free;
479                 bh->b_next_free->b_prev_free = bh->b_prev_free;
480                 if (free_list[index].list == bh)
481                          free_list[index].list = bh->b_next_free;
482         }
483         bh->b_next_free = bh->b_prev_free = NULL;
484 }
485 
486 /* must be called with both the hash_table_lock and the lru_list_lock
487    held */
488 static void __remove_from_queues(struct buffer_head *bh)
489 {
490         __hash_unlink(bh);
491         __remove_from_lru_list(bh, bh->b_list);
492 }
493 
494 static void __insert_into_queues(struct buffer_head *bh)
495 {
496         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
497 
498         __hash_link(bh, head);
499         __insert_into_lru_list(bh, bh->b_list);
500 }
501 
502 /* This function must only run if there are no other
503  * references _anywhere_ to this buffer head.
504  */
505 static void put_last_free(struct buffer_head * bh)
506 {
507         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
508         struct buffer_head **bhp = &head->list;
509 
510         bh->b_state = 0;
511 
512         spin_lock(&head->lock);
513         bh->b_dev = B_FREE;
514         if(!*bhp) {
515                 *bhp = bh;
516                 bh->b_prev_free = bh;
517         }
518         bh->b_next_free = *bhp;
519         bh->b_prev_free = (*bhp)->b_prev_free;
520         (*bhp)->b_prev_free->b_next_free = bh;
521         (*bhp)->b_prev_free = bh;
522         spin_unlock(&head->lock);
523 }
524 
525 /*
526  * Why like this, I hear you say... The reason is race-conditions.
527  * As we don't lock buffers (unless we are reading them, that is),
528  * something might happen to it while we sleep (ie a read-error
529  * will force it bad). This shouldn't really happen currently, but
530  * the code is ready.
531  */
532 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
533 {
534         struct buffer_head *bh = hash(dev, block);
535 
536         for (; bh; bh = bh->b_next)
537                 if (bh->b_blocknr == block      &&
538                     bh->b_size    == size       &&
539                     bh->b_dev     == dev)
540                         break;
541         if (bh)
542                 atomic_inc(&bh->b_count);
543 
544         return bh;
545 }
546 
547 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
548 {
549         struct buffer_head *bh;
550 
551         read_lock(&hash_table_lock);
552         bh = __get_hash_table(dev, block, size);
553         read_unlock(&hash_table_lock);
554 
555         return bh;
556 }
557 
558 unsigned int get_hardblocksize(kdev_t dev)
559 {
560         /*
561          * Get the hard sector size for the given device.  If we don't know
562          * what it is, return 0.
563          */
564         if (hardsect_size[MAJOR(dev)] != NULL) {
565                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
566                 if (blksize != 0)
567                         return blksize;
568         }
569 
570         /*
571          * We don't know what the hardware sector size for this device is.
572          * Return 0 indicating that we don't know.
573          */
574         return 0;
575 }
576 
577 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
578 {
579         spin_lock(&lru_list_lock);
580         if (bh->b_inode)
581                 list_del(&bh->b_inode_buffers);
582         bh->b_inode = inode;
583         list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
584         spin_unlock(&lru_list_lock);
585 }
586 
587 /* The caller must have the lru_list lock before calling the 
588    remove_inode_queue functions.  */
589 static void __remove_inode_queue(struct buffer_head *bh)
590 {
591         bh->b_inode = NULL;
592         list_del(&bh->b_inode_buffers);
593 }
594 
595 static inline void remove_inode_queue(struct buffer_head *bh)
596 {
597         if (bh->b_inode)
598                 __remove_inode_queue(bh);
599 }
600 
601 int inode_has_buffers(struct inode *inode)
602 {
603         int ret;
604         
605         spin_lock(&lru_list_lock);
606         ret = !list_empty(&inode->i_dirty_buffers);
607         spin_unlock(&lru_list_lock);
608         
609         return ret;
610 }
611 
612 
613 /* If invalidate_buffers() will trash dirty buffers, it means some kind
614    of fs corruption is going on. Trashing dirty data always imply losing
615    information that was supposed to be just stored on the physical layer
616    by the user.
617 
618    Thus invalidate_buffers in general usage is not allwowed to trash dirty
619    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
620 
621    NOTE: In the case where the user removed a removable-media-disk even if
622    there's still dirty data not synced on disk (due a bug in the device driver
623    or due an error of the user), by not destroying the dirty buffers we could
624    generate corruption also on the next media inserted, thus a parameter is
625    necessary to handle this case in the most safe way possible (trying
626    to not corrupt also the new disk inserted with the data belonging to
627    the old now corrupted disk). Also for the ramdisk the natural thing
628    to do in order to release the ramdisk memory is to destroy dirty buffers.
629 
630    These are two special cases. Normal usage imply the device driver
631    to issue a sync on the device (without waiting I/O completation) and
632    then an invalidate_buffers call that doesn't trash dirty buffers. */
633 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
634 {
635         int i, nlist, slept;
636         struct buffer_head * bh, * bh_next;
637 
638  retry:
639         slept = 0;
640         spin_lock(&lru_list_lock);
641         for(nlist = 0; nlist < NR_LIST; nlist++) {
642                 bh = lru_list[nlist];
643                 if (!bh)
644                         continue;
645                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
646                         bh_next = bh->b_next_free;
647 
648                         /* Another device? */
649                         if (bh->b_dev != dev)
650                                 continue;
651                         /* Part of a mapping? */
652                         if (bh->b_page->mapping)
653                                 continue;
654                         if (buffer_locked(bh)) {
655                                 atomic_inc(&bh->b_count);
656                                 spin_unlock(&lru_list_lock);
657                                 wait_on_buffer(bh);
658                                 slept = 1;
659                                 spin_lock(&lru_list_lock);
660                                 atomic_dec(&bh->b_count);
661                         }
662 
663                         write_lock(&hash_table_lock);
664                         if (!atomic_read(&bh->b_count) &&
665                             (destroy_dirty_buffers || !buffer_dirty(bh))) {
666                                 remove_inode_queue(bh);
667                                 __remove_from_queues(bh);
668                                 put_last_free(bh);
669                         }
670                         /* else complain loudly? */
671 
672                         write_unlock(&hash_table_lock);
673                         if (slept)
674                                 goto out;
675                 }
676         }
677 out:
678         spin_unlock(&lru_list_lock);
679         if (slept)
680                 goto retry;
681 }
682 
683 void set_blocksize(kdev_t dev, int size)
684 {
685         extern int *blksize_size[];
686         int i, nlist, slept;
687         struct buffer_head * bh, * bh_next;
688 
689         if (!blksize_size[MAJOR(dev)])
690                 return;
691 
692         /* Size must be a power of two, and between 512 and PAGE_SIZE */
693         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
694                 panic("Invalid blocksize passed to set_blocksize");
695 
696         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
697                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
698                 return;
699         }
700         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
701                 return;
702         sync_buffers(dev, 2);
703         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
704 
705  retry:
706         slept = 0;
707         spin_lock(&lru_list_lock);
708         for(nlist = 0; nlist < NR_LIST; nlist++) {
709                 bh = lru_list[nlist];
710                 if (!bh)
711                         continue;
712                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
713                         bh_next = bh->b_next_free;
714                         if (bh->b_dev != dev || bh->b_size == size)
715                                 continue;
716                         if (buffer_locked(bh)) {
717                                 atomic_inc(&bh->b_count);
718                                 spin_unlock(&lru_list_lock);
719                                 wait_on_buffer(bh);
720                                 slept = 1;
721                                 spin_lock(&lru_list_lock);
722                                 atomic_dec(&bh->b_count);
723                         }
724 
725                         write_lock(&hash_table_lock);
726                         if (!atomic_read(&bh->b_count)) {
727                                 if (buffer_dirty(bh))
728                                         printk(KERN_WARNING
729                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
730                                                kdevname(dev), bh->b_blocknr, bh->b_size);
731                                 remove_inode_queue(bh);
732                                 __remove_from_queues(bh);
733                                 put_last_free(bh);
734                         } else {
735                                 if (atomic_set_buffer_clean(bh))
736                                         __refile_buffer(bh);
737                                 clear_bit(BH_Uptodate, &bh->b_state);
738                                 printk(KERN_WARNING
739                                        "set_blocksize: "
740                                        "b_count %d, dev %s, block %lu, from %p\n",
741                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
742                                        bh->b_blocknr, __builtin_return_address(0));
743                         }
744                         write_unlock(&hash_table_lock);
745                         if (slept)
746                                 goto out;
747                 }
748         }
749  out:
750         spin_unlock(&lru_list_lock);
751         if (slept)
752                 goto retry;
753 }
754 
755 /*
756  * We used to try various strange things. Let's not.
757  * We'll just try to balance dirty buffers, and possibly
758  * launder some pages.
759  */
760 static void refill_freelist(int size)
761 {
762         balance_dirty(NODEV);
763         if (free_shortage())
764                 page_launder(GFP_BUFFER, 0);
765         grow_buffers(size);
766 }
767 
768 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
769 {
770         bh->b_list = BUF_CLEAN;
771         bh->b_end_io = handler;
772         bh->b_private = private;
773 }
774 
775 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
776 {
777         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
778         unsigned long flags;
779         struct buffer_head *tmp;
780         struct page *page;
781 
782         mark_buffer_uptodate(bh, uptodate);
783 
784         /* This is a temporary buffer used for page I/O. */
785         page = bh->b_page;
786 
787         if (!uptodate)
788                 SetPageError(page);
789 
790         /*
791          * Be _very_ careful from here on. Bad things can happen if
792          * two buffer heads end IO at almost the same time and both
793          * decide that the page is now completely done.
794          *
795          * Async buffer_heads are here only as labels for IO, and get
796          * thrown away once the IO for this page is complete.  IO is
797          * deemed complete once all buffers have been visited
798          * (b_count==0) and are now unlocked. We must make sure that
799          * only the _last_ buffer that decrements its count is the one
800          * that unlock the page..
801          */
802         spin_lock_irqsave(&page_uptodate_lock, flags);
803         unlock_buffer(bh);
804         atomic_dec(&bh->b_count);
805         tmp = bh->b_this_page;
806         while (tmp != bh) {
807                 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
808                         goto still_busy;
809                 tmp = tmp->b_this_page;
810         }
811 
812         /* OK, the async IO on this page is complete. */
813         spin_unlock_irqrestore(&page_uptodate_lock, flags);
814 
815         /*
816          * if none of the buffers had errors then we can set the
817          * page uptodate:
818          */
819         if (!PageError(page))
820                 SetPageUptodate(page);
821 
822         /*
823          * Run the hooks that have to be done when a page I/O has completed.
824          */
825         if (PageTestandClearDecrAfter(page))
826                 atomic_dec(&nr_async_pages);
827 
828         UnlockPage(page);
829 
830         return;
831 
832 still_busy:
833         spin_unlock_irqrestore(&page_uptodate_lock, flags);
834         return;
835 }
836 
837 void set_buffer_async_io(struct buffer_head *bh) {
838     bh->b_end_io = end_buffer_io_async ;
839 }
840 
841 /*
842  * Synchronise all the inode's dirty buffers to the disk.
843  *
844  * We have conflicting pressures: we want to make sure that all
845  * initially dirty buffers get waited on, but that any subsequently
846  * dirtied buffers don't.  After all, we don't want fsync to last
847  * forever if somebody is actively writing to the file.
848  *
849  * Do this in two main stages: first we copy dirty buffers to a
850  * temporary inode list, queueing the writes as we go.  Then we clean
851  * up, waiting for those writes to complete.
852  * 
853  * During this second stage, any subsequent updates to the file may end
854  * up refiling the buffer on the original inode's dirty list again, so
855  * there is a chance we will end up with a buffer queued for write but
856  * not yet completed on that list.  So, as a final cleanup we go through
857  * the osync code to catch these locked, dirty buffers without requeuing
858  * any newly dirty buffers for write.
859  */
860 
861 int fsync_inode_buffers(struct inode *inode)
862 {
863         struct buffer_head *bh;
864         struct inode tmp;
865         int err = 0, err2;
866         
867         INIT_LIST_HEAD(&tmp.i_dirty_buffers);
868         
869         spin_lock(&lru_list_lock);
870 
871         while (!list_empty(&inode->i_dirty_buffers)) {
872                 bh = BH_ENTRY(inode->i_dirty_buffers.next);
873                 list_del(&bh->b_inode_buffers);
874                 if (!buffer_dirty(bh) && !buffer_locked(bh))
875                         bh->b_inode = NULL;
876                 else {
877                         bh->b_inode = &tmp;
878                         list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
879                         if (buffer_dirty(bh)) {
880                                 atomic_inc(&bh->b_count);
881                                 spin_unlock(&lru_list_lock);
882                                 ll_rw_block(WRITE, 1, &bh);
883                                 brelse(bh);
884                                 spin_lock(&lru_list_lock);
885                         }
886                 }
887         }
888 
889         while (!list_empty(&tmp.i_dirty_buffers)) {
890                 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
891                 remove_inode_queue(bh);
892                 atomic_inc(&bh->b_count);
893                 spin_unlock(&lru_list_lock);
894                 wait_on_buffer(bh);
895                 if (!buffer_uptodate(bh))
896                         err = -EIO;
897                 brelse(bh);
898                 spin_lock(&lru_list_lock);
899         }
900         
901         spin_unlock(&lru_list_lock);
902         err2 = osync_inode_buffers(inode);
903 
904         if (err)
905                 return err;
906         else
907                 return err2;
908 }
909 
910 
911 /*
912  * osync is designed to support O_SYNC io.  It waits synchronously for
913  * all already-submitted IO to complete, but does not queue any new
914  * writes to the disk.
915  *
916  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
917  * you dirty the buffers, and then use osync_inode_buffers to wait for
918  * completion.  Any other dirty buffers which are not yet queued for
919  * write will not be flushed to disk by the osync.
920  */
921 
922 int osync_inode_buffers(struct inode *inode)
923 {
924         struct buffer_head *bh;
925         struct list_head *list;
926         int err = 0;
927 
928         spin_lock(&lru_list_lock);
929         
930  repeat:
931         
932         for (list = inode->i_dirty_buffers.prev; 
933              bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
934              list = bh->b_inode_buffers.prev) {
935                 if (buffer_locked(bh)) {
936                         atomic_inc(&bh->b_count);
937                         spin_unlock(&lru_list_lock);
938                         wait_on_buffer(bh);
939                         if (!buffer_uptodate(bh))
940                                 err = -EIO;
941                         brelse(bh);
942                         spin_lock(&lru_list_lock);
943                         goto repeat;
944                 }
945         }
946 
947         spin_unlock(&lru_list_lock);
948         return err;
949 }
950 
951 
952 /*
953  * Invalidate any and all dirty buffers on a given inode.  We are
954  * probably unmounting the fs, but that doesn't mean we have already
955  * done a sync().  Just drop the buffers from the inode list.
956  */
957 void invalidate_inode_buffers(struct inode *inode)
958 {
959         struct list_head *list, *next;
960         
961         spin_lock(&lru_list_lock);
962         list = inode->i_dirty_buffers.next; 
963         while (list != &inode->i_dirty_buffers) {
964                 next = list->next;
965                 remove_inode_queue(BH_ENTRY(list));
966                 list = next;
967         }
968         spin_unlock(&lru_list_lock);
969 }
970 
971 
972 /*
973  * Ok, this is getblk, and it isn't very clear, again to hinder
974  * race-conditions. Most of the code is seldom used, (ie repeating),
975  * so it should be much more efficient than it looks.
976  *
977  * The algorithm is changed: hopefully better, and an elusive bug removed.
978  *
979  * 14.02.92: changed it to sync dirty buffers a bit: better performance
980  * when the filesystem starts to get full of dirty blocks (I hope).
981  */
982 struct buffer_head * getblk(kdev_t dev, int block, int size)
983 {
984         struct buffer_head * bh;
985         int isize;
986 
987 repeat:
988         spin_lock(&lru_list_lock);
989         write_lock(&hash_table_lock);
990         bh = __get_hash_table(dev, block, size);
991         if (bh)
992                 goto out;
993 
994         isize = BUFSIZE_INDEX(size);
995         spin_lock(&free_list[isize].lock);
996         bh = free_list[isize].list;
997         if (bh) {
998                 __remove_from_free_list(bh, isize);
999                 atomic_set(&bh->b_count, 1);
1000         }
1001         spin_unlock(&free_list[isize].lock);
1002 
1003         /*
1004          * OK, FINALLY we know that this buffer is the only one of
1005          * its kind, we hold a reference (b_count>0), it is unlocked,
1006          * and it is clean.
1007          */
1008         if (bh) {
1009                 init_buffer(bh, NULL, NULL);
1010                 bh->b_dev = dev;
1011                 bh->b_blocknr = block;
1012                 bh->b_state = 1 << BH_Mapped;
1013 
1014                 /* Insert the buffer into the regular lists */
1015                 __insert_into_queues(bh);
1016         out:
1017                 write_unlock(&hash_table_lock);
1018                 spin_unlock(&lru_list_lock);
1019                 touch_buffer(bh);
1020                 return bh;
1021         }
1022 
1023         /*
1024          * If we block while refilling the free list, somebody may
1025          * create the buffer first ... search the hashes again.
1026          */
1027         write_unlock(&hash_table_lock);
1028         spin_unlock(&lru_list_lock);
1029         refill_freelist(size);
1030         goto repeat;
1031 }
1032 
1033 /* -1 -> no need to flush
1034     0 -> async flush
1035     1 -> sync flush (wait for I/O completation) */
1036 int balance_dirty_state(kdev_t dev)
1037 {
1038         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1039         int shortage;
1040 
1041         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1042         tot = nr_free_buffer_pages();
1043 
1044         dirty *= 100;
1045         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1046         hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1047 
1048         /* First, check for the "real" dirty limit. */
1049         if (dirty > soft_dirty_limit) {
1050                 if (dirty > hard_dirty_limit)
1051                         return 1;
1052                 return 0;
1053         }
1054 
1055         /*
1056          * If we are about to get low on free pages and
1057          * cleaning the inactive_dirty pages would help
1058          * fix this, wake up bdflush.
1059          */
1060         shortage = free_shortage();
1061         if (shortage && nr_inactive_dirty_pages > shortage &&
1062                         nr_inactive_dirty_pages > freepages.high)
1063                 return 0;
1064 
1065         return -1;
1066 }
1067 
1068 /*
1069  * if a new dirty buffer is created we need to balance bdflush.
1070  *
1071  * in the future we might want to make bdflush aware of different
1072  * pressures on different devices - thus the (currently unused)
1073  * 'dev' parameter.
1074  */
1075 void balance_dirty(kdev_t dev)
1076 {
1077         int state = balance_dirty_state(dev);
1078 
1079         if (state < 0)
1080                 return;
1081         wakeup_bdflush(state);
1082 }
1083 
1084 static __inline__ void __mark_dirty(struct buffer_head *bh)
1085 {
1086         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1087         refile_buffer(bh);
1088 }
1089 
/*
 * Atomic version of mark_buffer_dirty(): it does not call
 * balance_dirty().  The user must call balance_dirty() by hand as soon
 * as it becomes possible to block.
 */
void __mark_buffer_dirty(struct buffer_head *bh)
{
	/* Non-zero return: presumably the buffer was dirty already. */
	if (atomic_set_buffer_dirty(bh))
		return;

	__mark_dirty(bh);
}
1097 
1098 void mark_buffer_dirty(struct buffer_head *bh)
1099 {
1100         if (!atomic_set_buffer_dirty(bh)) {
1101                 __mark_dirty(bh);
1102                 balance_dirty(bh->b_dev);
1103         }
1104 }
1105 
1106 /*
1107  * A buffer may need to be moved from one buffer list to another
1108  * (e.g. in case it is not shared any more). Handle this.
1109  */
1110 static void __refile_buffer(struct buffer_head *bh)
1111 {
1112         int dispose = BUF_CLEAN;
1113         if (buffer_locked(bh))
1114                 dispose = BUF_LOCKED;
1115         if (buffer_dirty(bh))
1116                 dispose = BUF_DIRTY;
1117         if (buffer_protected(bh))
1118                 dispose = BUF_PROTECTED;
1119         if (dispose != bh->b_list) {
1120                 __remove_from_lru_list(bh, bh->b_list);
1121                 bh->b_list = dispose;
1122                 if (dispose == BUF_CLEAN)
1123                         remove_inode_queue(bh);
1124                 __insert_into_lru_list(bh, dispose);
1125         }
1126 }
1127 
1128 void refile_buffer(struct buffer_head *bh)
1129 {
1130         spin_lock(&lru_list_lock);
1131         __refile_buffer(bh);
1132         spin_unlock(&lru_list_lock);
1133 }
1134 
1135 /*
1136  * Release a buffer head
1137  */
1138 void __brelse(struct buffer_head * buf)
1139 {
1140         if (atomic_read(&buf->b_count)) {
1141                 atomic_dec(&buf->b_count);
1142                 return;
1143         }
1144         printk("VFS: brelse: Trying to free free buffer\n");
1145 }
1146 
1147 /*
1148  * bforget() is like brelse(), except it puts the buffer on the
1149  * free list if it can.. We can NOT free the buffer if:
1150  *  - there are other users of it
1151  *  - it is locked and thus can have active IO
1152  */
1153 void __bforget(struct buffer_head * buf)
1154 {
1155         /* grab the lru lock here to block bdflush. */
1156         spin_lock(&lru_list_lock);
1157         write_lock(&hash_table_lock);
1158         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
1159                 goto in_use;
1160         __hash_unlink(buf);
1161         remove_inode_queue(buf);
1162         write_unlock(&hash_table_lock);
1163         __remove_from_lru_list(buf, buf->b_list);
1164         spin_unlock(&lru_list_lock);
1165         put_last_free(buf);
1166         return;
1167 
1168  in_use:
1169         write_unlock(&hash_table_lock);
1170         spin_unlock(&lru_list_lock);
1171 }
1172 
1173 /*
1174  * bread() reads a specified block and returns the buffer that contains
1175  * it. It returns NULL if the block was unreadable.
1176  */
1177 struct buffer_head * bread(kdev_t dev, int block, int size)
1178 {
1179         struct buffer_head * bh;
1180 
1181         bh = getblk(dev, block, size);
1182         if (buffer_uptodate(bh))
1183                 return bh;
1184         ll_rw_block(READ, 1, &bh);
1185         wait_on_buffer(bh);
1186         if (buffer_uptodate(bh))
1187                 return bh;
1188         brelse(bh);
1189         return NULL;
1190 }
1191 
1192 /*
1193  * Note: the caller should wake up the buffer_wait list if needed.
1194  */
1195 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1196 {
1197         if (bh->b_inode)
1198                 BUG();
1199         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1200                 kmem_cache_free(bh_cachep, bh);
1201         } else {
1202                 bh->b_blocknr = -1;
1203                 init_waitqueue_head(&bh->b_wait);
1204                 nr_unused_buffer_heads++;
1205                 bh->b_next_free = unused_list;
1206                 bh->b_this_page = NULL;
1207                 unused_list = bh;
1208         }
1209 }
1210 
1211 /*
1212  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1213  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1214  * buffer heads is now handled in create_buffers().
1215  */ 
1216 static struct buffer_head * get_unused_buffer_head(int async)
1217 {
1218         struct buffer_head * bh;
1219 
1220         spin_lock(&unused_list_lock);
1221         if (nr_unused_buffer_heads > NR_RESERVED) {
1222                 bh = unused_list;
1223                 unused_list = bh->b_next_free;
1224                 nr_unused_buffer_heads--;
1225                 spin_unlock(&unused_list_lock);
1226                 return bh;
1227         }
1228         spin_unlock(&unused_list_lock);
1229 
1230         /* This is critical.  We can't swap out pages to get
1231          * more buffer heads, because the swap-out may need
1232          * more buffer-heads itself.  Thus SLAB_BUFFER.
1233          */
1234         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1235                 memset(bh, 0, sizeof(*bh));
1236                 init_waitqueue_head(&bh->b_wait);
1237                 return bh;
1238         }
1239 
1240         /*
1241          * If we need an async buffer, use the reserved buffer heads.
1242          */
1243         if (async) {
1244                 spin_lock(&unused_list_lock);
1245                 if (unused_list) {
1246                         bh = unused_list;
1247                         unused_list = bh->b_next_free;
1248                         nr_unused_buffer_heads--;
1249                         spin_unlock(&unused_list_lock);
1250                         return bh;
1251                 }
1252                 spin_unlock(&unused_list_lock);
1253         }
1254 #if 0
1255         /*
1256          * (Pending further analysis ...)
1257          * Ordinary (non-async) requests can use a different memory priority
1258          * to free up pages. Any swapping thus generated will use async
1259          * buffer heads.
1260          */
1261         if(!async &&
1262            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1263                 memset(bh, 0, sizeof(*bh));
1264                 init_waitqueue_head(&bh->b_wait);
1265                 return bh;
1266         }
1267 #endif
1268 
1269         return NULL;
1270 }
1271 
1272 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1273 {
1274         bh->b_page = page;
1275         if (offset >= PAGE_SIZE)
1276                 BUG();
1277         if (PageHighMem(page))
1278                 /*
1279                  * This catches illegal uses and preserves the offset:
1280                  */
1281                 bh->b_data = (char *)(0 + offset);
1282         else
1283                 bh->b_data = page_address(page) + offset;
1284 }
1285 
1286 /*
1287  * Create the appropriate buffers when given a page for data area and
1288  * the size of each buffer.. Use the bh->b_this_page linked list to
1289  * follow the buffers created.  Return NULL if unable to create more
1290  * buffers.
1291  * The async flag is used to differentiate async IO (paging, swapping)
1292  * from ordinary buffer allocations, and only async requests are allowed
1293  * to sleep waiting for buffer heads. 
1294  */
1295 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1296 {
1297         struct buffer_head *bh, *head;
1298         long offset;
1299 
1300 try_again:
1301         head = NULL;
1302         offset = PAGE_SIZE;
1303         while ((offset -= size) >= 0) {
1304                 bh = get_unused_buffer_head(async);
1305                 if (!bh)
1306                         goto no_grow;
1307 
1308                 bh->b_dev = B_FREE;  /* Flag as unused */
1309                 bh->b_this_page = head;
1310                 head = bh;
1311 
1312                 bh->b_state = 0;
1313                 bh->b_next_free = NULL;
1314                 bh->b_pprev = NULL;
1315                 atomic_set(&bh->b_count, 0);
1316                 bh->b_size = size;
1317 
1318                 set_bh_page(bh, page, offset);
1319 
1320                 bh->b_list = BUF_CLEAN;
1321                 bh->b_end_io = NULL;
1322         }
1323         return head;
1324 /*
1325  * In case anything failed, we just free everything we got.
1326  */
1327 no_grow:
1328         if (head) {
1329                 spin_lock(&unused_list_lock);
1330                 do {
1331                         bh = head;
1332                         head = head->b_this_page;
1333                         __put_unused_buffer_head(bh);
1334                 } while (head);
1335                 spin_unlock(&unused_list_lock);
1336 
1337                 /* Wake up any waiters ... */
1338                 wake_up(&buffer_wait);
1339         }
1340 
1341         /*
1342          * Return failure for non-async IO requests.  Async IO requests
1343          * are not allowed to fail, so we have to wait until buffer heads
1344          * become available.  But we don't want tasks sleeping with 
1345          * partially complete buffers, so all were released above.
1346          */
1347         if (!async)
1348                 return NULL;
1349 
1350         /* We're _really_ low on memory. Now we just
1351          * wait for old buffer heads to become free due to
1352          * finishing IO.  Since this is an async request and
1353          * the reserve list is empty, we're sure there are 
1354          * async buffer heads in use.
1355          */
1356         run_task_queue(&tq_disk);
1357 
1358         /* 
1359          * Set our state for sleeping, then check again for buffer heads.
1360          * This ensures we won't miss a wake_up from an interrupt.
1361          */
1362         wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1363         goto try_again;
1364 }
1365 
1366 static void unmap_buffer(struct buffer_head * bh)
1367 {
1368         if (buffer_mapped(bh)) {
1369                 mark_buffer_clean(bh);
1370                 wait_on_buffer(bh);
1371                 clear_bit(BH_Uptodate, &bh->b_state);
1372                 clear_bit(BH_Mapped, &bh->b_state);
1373                 clear_bit(BH_Req, &bh->b_state);
1374                 clear_bit(BH_New, &bh->b_state);
1375         }
1376 }
1377 
1378 /*
1379  * We don't have to release all buffers here, but
1380  * we have to be sure that no dirty buffer is left
1381  * and no IO is going on (no buffer is locked), because
1382  * we have truncated the file and are going to free the
1383  * blocks on-disk..
1384  */
1385 int block_flushpage(struct page *page, unsigned long offset)
1386 {
1387         struct buffer_head *head, *bh, *next;
1388         unsigned int curr_off = 0;
1389 
1390         if (!PageLocked(page))
1391                 BUG();
1392         if (!page->buffers)
1393                 return 1;
1394 
1395         head = page->buffers;
1396         bh = head;
1397         do {
1398                 unsigned int next_off = curr_off + bh->b_size;
1399                 next = bh->b_this_page;
1400 
1401                 /*
1402                  * is this block fully flushed?
1403                  */
1404                 if (offset <= curr_off)
1405                         unmap_buffer(bh);
1406                 curr_off = next_off;
1407                 bh = next;
1408         } while (bh != head);
1409 
1410         /*
1411          * subtle. We release buffer-heads only if this is
1412          * the 'final' flushpage. We have invalidated the get_block
1413          * cached value unconditionally, so real IO is not
1414          * possible anymore.
1415          *
1416          * If the free doesn't work out, the buffers can be
1417          * left around - they just turn into anonymous buffers
1418          * instead.
1419          */
1420         if (!offset) {
1421                 if (!try_to_free_buffers(page, 0)) {
1422                         atomic_inc(&buffermem_pages);
1423                         return 0;
1424                 }
1425         }
1426 
1427         return 1;
1428 }
1429 
1430 static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1431 {
1432         struct buffer_head *bh, *head, *tail;
1433 
1434         head = create_buffers(page, blocksize, 1);
1435         if (page->buffers)
1436                 BUG();
1437 
1438         bh = head;
1439         do {
1440                 bh->b_dev = dev;
1441                 bh->b_blocknr = 0;
1442                 bh->b_end_io = NULL;
1443                 tail = bh;
1444                 bh = bh->b_this_page;
1445         } while (bh);
1446         tail->b_this_page = head;
1447         page->buffers = head;
1448         page_cache_get(page);
1449 }
1450 
1451 /*
1452  * We are taking a block for data and we don't want any output from any
1453  * buffer-cache aliases starting from return from that function and
1454  * until the moment when something will explicitly mark the buffer
1455  * dirty (hopefully that will not happen until we will free that block ;-)
1456  * We don't even need to mark it not-uptodate - nobody can expect
1457  * anything from a newly allocated buffer anyway. We used to used
1458  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1459  * don't want to mark the alias unmapped, for example - it would confuse
1460  * anyone who might pick it with bread() afterwards...
1461  */
1462 
1463 static void unmap_underlying_metadata(struct buffer_head * bh)
1464 {
1465         struct buffer_head *old_bh;
1466 
1467         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1468         if (old_bh) {
1469                 mark_buffer_clean(old_bh);
1470                 wait_on_buffer(old_bh);
1471                 clear_bit(BH_Req, &old_bh->b_state);
1472                 /* Here we could run brelse or bforget. We use
1473                    bforget because it will try to put the buffer
1474                    in the freelist. */
1475                 __bforget(old_bh);
1476         }
1477 }
1478 
1479 /*
1480  * NOTE! All mapped/uptodate combinations are valid:
1481  *
1482  *      Mapped  Uptodate        Meaning
1483  *
1484  *      No      No              "unknown" - must do get_block()
1485  *      No      Yes             "hole" - zero-filled
1486  *      Yes     No              "allocated" - allocated on disk, not read in
1487  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1488  *
1489  * "Dirty" is valid only with the last case (mapped+uptodate).
1490  */
1491 
1492 /*
1493  * block_write_full_page() is SMP-safe - currently it's still
1494  * being called with the kernel lock held, but the code is ready.
1495  */
1496 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1497 {
1498         int err, i;
1499         unsigned long block;
1500         struct buffer_head *bh, *head;
1501 
1502         if (!PageLocked(page))
1503                 BUG();
1504 
1505         if (!page->buffers)
1506                 create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
1507         head = page->buffers;
1508 
1509         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1510 
1511         bh = head;
1512         i = 0;
1513 
1514         /* Stage 1: make sure we have all the buffers mapped! */
1515         do {
1516                 /*
1517                  * If the buffer isn't up-to-date, we can't be sure
1518                  * that the buffer has been initialized with the proper
1519                  * block number information etc..
1520                  *
1521                  * Leave it to the low-level FS to make all those
1522                  * decisions (block #0 may actually be a valid block)
1523                  */
1524                 if (!buffer_mapped(bh)) {
1525                         err = get_block(inode, block, bh, 1);
1526                         if (err)
1527                                 goto out;
1528                         if (buffer_new(bh))
1529                                 unmap_underlying_metadata(bh);
1530                 }
1531                 bh = bh->b_this_page;
1532                 block++;
1533         } while (bh != head);
1534 
1535         /* Stage 2: lock the buffers, mark them clean */
1536         do {
1537                 lock_buffer(bh);
1538                 bh->b_end_io = end_buffer_io_async;
1539                 atomic_inc(&bh->b_count);
1540                 set_bit(BH_Uptodate, &bh->b_state);
1541                 clear_bit(BH_Dirty, &bh->b_state);
1542                 bh = bh->b_this_page;
1543         } while (bh != head);
1544 
1545         /* Stage 3: submit the IO */
1546         do {
1547                 submit_bh(WRITE, bh);
1548                 bh = bh->b_this_page;           
1549         } while (bh != head);
1550 
1551         /* Done - end_buffer_io_async will unlock */
1552         SetPageUptodate(page);
1553         return 0;
1554 
1555 out:
1556         ClearPageUptodate(page);
1557         UnlockPage(page);
1558         return err;
1559 }
1560 
1561 static int __block_prepare_write(struct inode *inode, struct page *page,
1562                 unsigned from, unsigned to, get_block_t *get_block)
1563 {
1564         unsigned block_start, block_end;
1565         unsigned long block;
1566         int err = 0;
1567         unsigned blocksize, bbits;
1568         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1569         char *kaddr = kmap(page);
1570 
1571         blocksize = inode->i_sb->s_blocksize;
1572         if (!page->buffers)
1573                 create_empty_buffers(page, inode->i_dev, blocksize);
1574         head = page->buffers;
1575 
1576         bbits = inode->i_sb->s_blocksize_bits;
1577         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1578 
1579         for(bh = head, block_start = 0; bh != head || !block_start;
1580             block++, block_start=block_end, bh = bh->b_this_page) {
1581                 if (!bh)
1582                         BUG();
1583                 block_end = block_start+blocksize;
1584                 if (block_end <= from)
1585                         continue;
1586                 if (block_start >= to)
1587                         break;
1588                 if (!buffer_mapped(bh)) {
1589                         err = get_block(inode, block, bh, 1);
1590                         if (err)
1591                                 goto out;
1592                         if (buffer_new(bh)) {
1593                                 unmap_underlying_metadata(bh);
1594                                 if (Page_Uptodate(page)) {
1595                                         set_bit(BH_Uptodate, &bh->b_state);
1596                                         continue;
1597                                 }
1598                                 if (block_end > to)
1599                                         memset(kaddr+to, 0, block_end-to);
1600                                 if (block_start < from)
1601                                         memset(kaddr+block_start, 0, from-block_start);
1602                                 if (block_end > to || block_start < from)
1603                                         flush_dcache_page(page);
1604                                 continue;
1605                         }
1606                 }
1607                 if (Page_Uptodate(page)) {
1608                         set_bit(BH_Uptodate, &bh->b_state);
1609                         continue; 
1610                 }
1611                 if (!buffer_uptodate(bh) &&
1612                      (block_start < from || block_end > to)) {
1613                         ll_rw_block(READ, 1, &bh);
1614                         *wait_bh++=bh;
1615                 }
1616         }
1617         /*
1618          * If we issued read requests - let them complete.
1619          */
1620         while(wait_bh > wait) {
1621                 wait_on_buffer(*--wait_bh);
1622                 err = -EIO;
1623                 if (!buffer_uptodate(*wait_bh))
1624                         goto out;
1625         }
1626         return 0;
1627 out:
1628         return err;
1629 }
1630 
1631 static int __block_commit_write(struct inode *inode, struct page *page,
1632                 unsigned from, unsigned to)
1633 {
1634         unsigned block_start, block_end;
1635         int partial = 0, need_balance_dirty = 0;
1636         unsigned blocksize;
1637         struct buffer_head *bh, *head;
1638 
1639         blocksize = inode->i_sb->s_blocksize;
1640 
1641         for(bh = head = page->buffers, block_start = 0;
1642             bh != head || !block_start;
1643             block_start=block_end, bh = bh->b_this_page) {
1644                 block_end = block_start + blocksize;
1645                 if (block_end <= from || block_start >= to) {
1646                         if (!buffer_uptodate(bh))
1647                                 partial = 1;
1648                 } else {
1649                         set_bit(BH_Uptodate, &bh->b_state);
1650                         if (!atomic_set_buffer_dirty(bh)) {
1651                                 __mark_dirty(bh);
1652                                 buffer_insert_inode_queue(bh, inode);
1653                                 need_balance_dirty = 1;
1654                         }
1655                 }
1656         }
1657 
1658         if (need_balance_dirty)
1659                 balance_dirty(bh->b_dev);
1660         /*
1661          * is this a partial write that happened to make all buffers
1662          * uptodate then we can optimize away a bogus readpage() for
1663          * the next read(). Here we 'discover' wether the page went
1664          * uptodate as a result of this (potentially partial) write.
1665          */
1666         if (!partial)
1667                 SetPageUptodate(page);
1668         return 0;
1669 }
1670 
1671 /*
1672  * Generic "read page" function for block devices that have the normal
1673  * get_block functionality. This is most of the block device filesystems.
1674  * Reads the page asynchronously --- the unlock_buffer() and
1675  * mark_buffer_uptodate() functions propagate buffer state into the
1676  * page struct once IO has completed.
1677  */
1678 int block_read_full_page(struct page *page, get_block_t *get_block)
1679 {
1680         struct inode *inode = page->mapping->host;
1681         unsigned long iblock, lblock;
1682         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1683         unsigned int blocksize, blocks;
1684         int nr, i;
1685 
1686         if (!PageLocked(page))
1687                 PAGE_BUG(page);
1688         blocksize = inode->i_sb->s_blocksize;
1689         if (!page->buffers)
1690                 create_empty_buffers(page, inode->i_dev, blocksize);
1691         head = page->buffers;
1692 
1693         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1694         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1695         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1696         bh = head;
1697         nr = 0;
1698         i = 0;
1699 
1700         do {
1701                 if (buffer_uptodate(bh))
1702                         continue;
1703 
1704                 if (!buffer_mapped(bh)) {
1705                         if (iblock < lblock) {
1706                                 if (get_block(inode, iblock, bh, 0))
1707                                         continue;
1708                         }
1709                         if (!buffer_mapped(bh)) {
1710                                 memset(kmap(page) + i*blocksize, 0, blocksize);
1711                                 flush_dcache_page(page);
1712                                 kunmap(page);
1713                                 set_bit(BH_Uptodate, &bh->b_state);
1714                                 continue;
1715                         }
1716                         /* get_block() might have updated the buffer synchronously */
1717                         if (buffer_uptodate(bh))
1718                                 continue;
1719                 }
1720 
1721                 arr[nr] = bh;
1722                 nr++;
1723         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1724 
1725         if (!nr) {
1726                 /*
1727                  * all buffers are uptodate - we can set the page
1728                  * uptodate as well.
1729                  */
1730                 SetPageUptodate(page);
1731                 UnlockPage(page);
1732                 return 0;
1733         }
1734 
1735         /* Stage two: lock the buffers */
1736         for (i = 0; i < nr; i++) {
1737                 struct buffer_head * bh = arr[i];
1738                 lock_buffer(bh);
1739                 bh->b_end_io = end_buffer_io_async;
1740                 atomic_inc(&bh->b_count);
1741         }
1742 
1743         /* Stage 3: start the IO */
1744         for (i = 0; i < nr; i++)
1745                 submit_bh(READ, arr[i]);
1746 
1747         return 0;
1748 }
1749 
1750 /*
1751  * For moronic filesystems that do not allow holes in file.
1752  * We may have to extend the file.
1753  */
1754 
1755 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1756 {
1757         struct address_space *mapping = page->mapping;
1758         struct inode *inode = mapping->host;
1759         struct page *new_page;
1760         unsigned long pgpos;
1761         long status;
1762         unsigned zerofrom;
1763         unsigned blocksize = inode->i_sb->s_blocksize;
1764         char *kaddr;
1765 
1766         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1767                 status = -ENOMEM;
1768                 new_page = grab_cache_page(mapping, pgpos);
1769                 if (!new_page)
1770                         goto out;
1771                 /* we might sleep */
1772                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1773                         UnlockPage(new_page);
1774                         page_cache_release(new_page);
1775                         continue;
1776                 }
1777                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1778                 if (zerofrom & (blocksize-1)) {
1779                         *bytes |= (blocksize-1);
1780                         (*bytes)++;
1781                 }
1782                 status = __block_prepare_write(inode, new_page, zerofrom,
1783                                                 PAGE_CACHE_SIZE, get_block);
1784                 if (status)
1785                         goto out_unmap;
1786                 kaddr = page_address(new_page);
1787                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1788                 flush_dcache_page(new_page);
1789                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1790                 kunmap(new_page);
1791                 UnlockPage(new_page);
1792                 page_cache_release(new_page);
1793         }
1794 
1795         if (page->index < pgpos) {
1796                 /* completely inside the area */
1797                 zerofrom = offset;
1798         } else {
1799                 /* page covers the boundary, find the boundary offset */
1800                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1801 
1802                 /* if we will expand the thing last block will be filled */
1803                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1804                         *bytes |= (blocksize-1);
1805                         (*bytes)++;
1806                 }
1807 
1808                 /* starting below the boundary? Nothing to zero out */
1809                 if (offset <= zerofrom)
1810                         zerofrom = offset;
1811         }
1812         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1813         if (status)
1814                 goto out1;
1815         kaddr = page_address(page);
1816         if (zerofrom < offset) {
1817                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1818                 flush_dcache_page(page);
1819                 __block_commit_write(inode, page, zerofrom, offset);
1820         }
1821         return 0;
1822 out1:
1823         ClearPageUptodate(page);
1824         kunmap(page);
1825         return status;
1826 
1827 out_unmap:
1828         ClearPageUptodate(new_page);
1829         kunmap(new_page);
1830         UnlockPage(new_page);
1831         page_cache_release(new_page);
1832 out:
1833         return status;
1834 }
1835 
1836 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1837                         get_block_t *get_block)
1838 {
1839         struct inode *inode = page->mapping->host;
1840         int err = __block_prepare_write(inode, page, from, to, get_block);
1841         if (err) {
1842                 ClearPageUptodate(page);
1843                 kunmap(page);
1844         }
1845         return err;
1846 }
1847 
1848 int generic_commit_write(struct file *file, struct page *page,
1849                 unsigned from, unsigned to)
1850 {
1851         struct inode *inode = page->mapping->host;
1852         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1853         __block_commit_write(inode,page,from,to);
1854         kunmap(page);
1855         if (pos > inode->i_size) {
1856                 inode->i_size = pos;
1857                 mark_inode_dirty(inode);
1858         }
1859         return 0;
1860 }
1861 
1862 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1863 {
1864         unsigned long index = from >> PAGE_CACHE_SHIFT;
1865         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1866         unsigned blocksize, iblock, length, pos;
1867         struct inode *inode = mapping->host;
1868         struct page *page;
1869         struct buffer_head *bh;
1870         int err;
1871 
1872         blocksize = inode->i_sb->s_blocksize;
1873         length = offset & (blocksize - 1);
1874 
1875         /* Block boundary? Nothing to do */
1876         if (!length)
1877                 return 0;
1878 
1879         length = blocksize - length;
1880         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1881         
1882         page = grab_cache_page(mapping, index);
1883         err = PTR_ERR(page);
1884         if (IS_ERR(page))
1885                 goto out;
1886 
1887         if (!page->buffers)
1888                 create_empty_buffers(page, inode->i_dev, blocksize);
1889 
1890         /* Find the buffer that contains "offset" */
1891         bh = page->buffers;
1892         pos = blocksize;
1893         while (offset >= pos) {
1894                 bh = bh->b_this_page;
1895                 iblock++;
1896                 pos += blocksize;
1897         }
1898 
1899         err = 0;
1900         if (!buffer_mapped(bh)) {
1901                 /* Hole? Nothing to do */
1902                 if (buffer_uptodate(bh))
1903                         goto unlock;
1904                 get_block(inode, iblock, bh, 0);
1905                 /* Still unmapped? Nothing to do */
1906                 if (!buffer_mapped(bh))
1907                         goto unlock;
1908         }
1909 
1910         /* Ok, it's mapped. Make sure it's up-to-date */
1911         if (Page_Uptodate(page))
1912                 set_bit(BH_Uptodate, &bh->b_state);
1913 
1914         if (!buffer_uptodate(bh)) {
1915                 err = -EIO;
1916                 ll_rw_block(READ, 1, &bh);
1917                 wait_on_buffer(bh);
1918                 /* Uhhuh. Read error. Complain and punt. */
1919                 if (!buffer_uptodate(bh))
1920                         goto unlock;
1921         }
1922 
1923         memset(kmap(page) + offset, 0, length);
1924         flush_dcache_page(page);
1925         kunmap(page);
1926 
1927         __mark_buffer_dirty(bh);
1928         err = 0;
1929 
1930 unlock:
1931         UnlockPage(page);
1932         page_cache_release(page);
1933 out:
1934         return err;
1935 }
1936 
1937 int block_write_full_page(struct page *page, get_block_t *get_block)
1938 {
1939         struct inode *inode = page->mapping->host;
1940         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1941         unsigned offset;
1942         int err;
1943 
1944         /* easy case */
1945         if (page->index < end_index)
1946                 return __block_write_full_page(inode, page, get_block);
1947 
1948         /* things got complicated... */
1949         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1950         /* OK, are we completely out? */
1951         if (page->index >= end_index+1 || !offset) {
1952                 UnlockPage(page);
1953                 return -EIO;
1954         }
1955 
1956         /* Sigh... will have to work, then... */
1957         err = __block_prepare_write(inode, page, 0, offset, get_block);
1958         if (!err) {
1959                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1960                 flush_dcache_page(page);
1961                 __block_commit_write(inode,page,0,offset);
1962 done:
1963                 kunmap(page);
1964                 UnlockPage(page);
1965                 return err;
1966         }
1967         ClearPageUptodate(page);
1968         goto done;
1969 }
1970 
1971 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1972 {
1973         struct buffer_head tmp;
1974         struct inode *inode = mapping->host;
1975         tmp.b_state = 0;
1976         tmp.b_blocknr = 0;
1977         get_block(inode, block, &tmp, 0);
1978         return tmp.b_blocknr;
1979 }
1980 
1981 /*
1982  * IO completion routine for a buffer_head being used for kiobuf IO: we
1983  * can't dispatch the kiobuf callback until io_count reaches 0.  
1984  */
1985 
1986 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1987 {
1988         struct kiobuf *kiobuf;
1989         
1990         mark_buffer_uptodate(bh, uptodate);
1991 
1992         kiobuf = bh->b_private;
1993         unlock_buffer(bh);
1994         end_kio_request(kiobuf, uptodate);
1995 }
1996 
1997 
1998 /*
1999  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2000  * for them to complete.  Clean up the buffer_heads afterwards.  
2001  */
2002 
2003 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2004 {
2005         int iosize;
2006         int i;
2007         struct buffer_head *tmp;
2008 
2009 
2010         iosize = 0;
2011         spin_lock(&unused_list_lock);
2012 
2013         for (i = nr; --i >= 0; ) {
2014                 iosize += size;
2015                 tmp = bh[i];
2016                 if (buffer_locked(tmp)) {
2017                         spin_unlock(&unused_list_lock);
2018                         wait_on_buffer(tmp);
2019                         spin_lock(&unused_list_lock);
2020                 }
2021                 
2022                 if (!buffer_uptodate(tmp)) {
2023                         /* We are traversing bh'es in reverse order so
2024                            clearing iosize on error calculates the
2025                            amount of IO before the first error. */
2026                         iosize = 0;
2027                 }
2028                 __put_unused_buffer_head(tmp);
2029         }
2030         
2031         spin_unlock(&unused_list_lock);
2032 
2033         return iosize;
2034 }
2035 
2036 /*
2037  * Start I/O on a physical range of kernel memory, defined by a vector
2038  * of kiobuf structs (much like a user-space iovec list).
2039  *
2040  * The kiobuf must already be locked for IO.  IO is submitted
2041  * asynchronously: you need to check page->locked, page->uptodate, and
2042  * maybe wait on page->wait.
2043  *
2044  * It is up to the caller to make sure that there are enough blocks
2045  * passed in to completely map the iobufs to disk.
2046  */
2047 
2048 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2049                kdev_t dev, unsigned long b[], int size)
2050 {
2051         int             err;
2052         int             length;
2053         int             transferred;
2054         int             i;
2055         int             bufind;
2056         int             pageind;
2057         int             bhind;
2058         int             offset;
2059         unsigned long   blocknr;
2060         struct kiobuf * iobuf = NULL;
2061         struct page *   map;
2062         struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2063 
2064         if (!nr)
2065                 return 0;
2066         
2067         /* 
2068          * First, do some alignment and validity checks 
2069          */
2070         for (i = 0; i < nr; i++) {
2071                 iobuf = iovec[i];
2072                 if ((iobuf->offset & (size-1)) ||
2073                     (iobuf->length & (size-1)))
2074                         return -EINVAL;
2075                 if (!iobuf->nr_pages)
2076                         panic("brw_kiovec: iobuf not initialised");
2077         }
2078 
2079         /* 
2080          * OK to walk down the iovec doing page IO on each page we find. 
2081          */
2082         bufind = bhind = transferred = err = 0;
2083         for (i = 0; i < nr; i++) {
2084                 iobuf = iovec[i];
2085                 offset = iobuf->offset;
2086                 length = iobuf->length;
2087                 iobuf->errno = 0;
2088                 
2089                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2090                         map  = iobuf->maplist[pageind];
2091                         if (!map) {
2092                                 err = -EFAULT;
2093                                 goto error;
2094                         }
2095                         
2096                         while (length > 0) {
2097                                 blocknr = b[bufind++];
2098                                 tmp = get_unused_buffer_head(0);
2099                                 if (!tmp) {
2100                                         err = -ENOMEM;
2101                                         goto error;
2102                                 }
2103                                 
2104                                 tmp->b_dev = B_FREE;
2105                                 tmp->b_size = size;
2106                                 set_bh_page(tmp, map, offset);
2107                                 tmp->b_this_page = tmp;
2108 
2109                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2110                                 tmp->b_dev = dev;
2111                                 tmp->b_blocknr = blocknr;
2112                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2113 
2114                                 if (rw == WRITE) {
2115                                         set_bit(BH_Uptodate, &tmp->b_state);
2116                                         clear_bit(BH_Dirty, &tmp->b_state);
2117                                 }
2118 
2119                                 bh[bhind++] = tmp;
2120                                 length -= size;
2121                                 offset += size;
2122 
2123                                 atomic_inc(&iobuf->io_count);
2124 
2125                                 submit_bh(rw, tmp);
2126                                 /* 
2127                                  * Wait for IO if we have got too much 
2128                                  */
2129                                 if (bhind >= KIO_MAX_SECTORS) {
2130                                         err = wait_kio(rw, bhind, bh, size);
2131                                         if (err >= 0)
2132                                                 transferred += err;
2133                                         else
2134                                                 goto finished;
2135                                         bhind = 0;
2136                                 }
2137                                 
2138                                 if (offset >= PAGE_SIZE) {
2139                                         offset = 0;
2140                                         break;
2141                                 }
2142                         } /* End of block loop */
2143                 } /* End of page loop */                
2144         } /* End of iovec loop */
2145 
2146         /* Is there any IO still left to submit? */
2147         if (bhind) {
2148                 err = wait_kio(rw, bhind, bh, size);
2149                 if (err >= 0)
2150                         transferred += err;
2151                 else
2152                         goto finished;
2153         }
2154 
2155  finished:
2156         if (transferred)
2157                 return transferred;
2158         return err;
2159 
2160  error:
2161         /* We got an error allocating the bh'es.  Just free the current
2162            buffer_heads and exit. */
2163         spin_lock(&unused_list_lock);
2164         for (i = bhind; --i >= 0; ) {
2165                 __put_unused_buffer_head(bh[i]);
2166         }
2167         spin_unlock(&unused_list_lock);
2168         goto finished;
2169 }
2170 
2171 /*
2172  * Start I/O on a page.
2173  * This function expects the page to be locked and may return
2174  * before I/O is complete. You then have to check page->locked,
2175  * page->uptodate, and maybe wait on page->wait.
2176  *
2177  * brw_page() is SMP-safe, although it's being called with the
2178  * kernel lock held - but the code is ready.
2179  *
2180  * FIXME: we need a swapper_inode->get_block function to remove
2181  *        some of the bmap kludges and interface ugliness here.
2182  */
2183 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2184 {
2185         struct buffer_head *head, *bh;
2186 
2187         if (!PageLocked(page))
2188                 panic("brw_page: page not locked for I/O");
2189 
2190         if (!page->buffers)
2191                 create_empty_buffers(page, dev, size);
2192         head = bh = page->buffers;
2193 
2194         /* Stage 1: lock all the buffers */
2195         do {
2196                 lock_buffer(bh);
2197                 bh->b_blocknr = *(b++);
2198                 set_bit(BH_Mapped, &bh->b_state);
2199                 bh->b_end_io = end_buffer_io_async;
2200                 atomic_inc(&bh->b_count);
2201                 bh = bh->b_this_page;
2202         } while (bh != head);
2203 
2204         /* Stage 2: start the IO */
2205         do {
2206                 submit_bh(rw, bh);
2207                 bh = bh->b_this_page;
2208         } while (bh != head);
2209         return 0;
2210 }
2211 
2212 int block_symlink(struct inode *inode, const char *symname, int len)
2213 {
2214         struct address_space *mapping = inode->i_mapping;
2215         struct page *page = grab_cache_page(mapping, 0);
2216         int err = -ENOMEM;
2217         char *kaddr;
2218 
2219         if (!page)
2220                 goto fail;
2221         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2222         if (err)
2223                 goto fail_map;
2224         kaddr = page_address(page);
2225         memcpy(kaddr, symname, len-1);
2226         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2227         /*
2228          * Notice that we are _not_ going to block here - end of page is
2229          * unmapped, so this will only try to map the rest of page, see
2230          * that it is unmapped (typically even will not look into inode -
2231          * ->i_size will be enough for everything) and zero it out.
2232          * OTOH it's obviously correct and should make the page up-to-date.
2233          */
2234         err = mapping->a_ops->readpage(NULL, page);
2235         wait_on_page(page);
2236         page_cache_release(page);
2237         if (err < 0)
2238                 goto fail;
2239         mark_inode_dirty(inode);
2240         return 0;
2241 fail_map:
2242         UnlockPage(page);
2243         page_cache_release(page);
2244 fail:
2245         return err;
2246 }
2247 
2248 /*
2249  * Try to increase the number of buffers available: the size argument
2250  * is used to determine what kind of buffers we want.
2251  */
2252 static int grow_buffers(int size)
2253 {
2254         struct page * page;
2255         struct buffer_head *bh, *tmp;
2256         struct buffer_head * insert_point;
2257         int isize;
2258 
2259         if ((size & 511) || (size > PAGE_SIZE)) {
2260                 printk("VFS: grow_buffers: size = %d\n",size);
2261                 return 0;
2262         }
2263 
2264         page = alloc_page(GFP_BUFFER);
2265         if (!page)
2266                 goto out;
2267         LockPage(page);
2268         bh = create_buffers(page, size, 0);
2269         if (!bh)
2270                 goto no_buffer_head;
2271 
2272         isize = BUFSIZE_INDEX(size);
2273 
2274         spin_lock(&free_list[isize].lock);
2275         insert_point = free_list[isize].list;
2276         tmp = bh;
2277         while (1) {
2278                 if (insert_point) {
2279                         tmp->b_next_free = insert_point->b_next_free;
2280                         tmp->b_prev_free = insert_point;
2281                         insert_point->b_next_free->b_prev_free = tmp;
2282                         insert_point->b_next_free = tmp;
2283                 } else {
2284                         tmp->b_prev_free = tmp;
2285                         tmp->b_next_free = tmp;
2286                 }
2287                 insert_point = tmp;
2288                 if (tmp->b_this_page)
2289                         tmp = tmp->b_this_page;
2290                 else
2291                         break;
2292         }
2293         tmp->b_this_page = bh;
2294         free_list[isize].list = bh;
2295         spin_unlock(&free_list[isize].lock);
2296 
2297         page->buffers = bh;
2298         page->flags &= ~(1 << PG_referenced);
2299         lru_cache_add(page);
2300         UnlockPage(page);
2301         atomic_inc(&buffermem_pages);
2302         return 1;
2303 
2304 no_buffer_head:
2305         UnlockPage(page);
2306         page_cache_release(page);
2307 out:
2308         return 0;
2309 }
2310 
2311 /*
2312  * Sync all the buffers on one page..
2313  *
2314  * If we have old buffers that are locked, we'll
2315  * wait on them, but we won't wait on the new ones
2316  * we're writing out now.
2317  *
2318  * This all is required so that we can free up memory
2319  * later.
2320  *
2321  * Wait:
2322  *      0 - no wait (this does not get called - see try_to_free_buffers below)
2323  *      1 - start IO for dirty buffers
2324  *      2 - wait for completion of locked buffers
2325  */
2326 static void sync_page_buffers(struct buffer_head *bh, int wait)
2327 {
2328         struct buffer_head * tmp = bh;
2329 
2330         do {
2331                 struct buffer_head *p = tmp;
2332                 tmp = tmp->b_this_page;
2333                 if (buffer_locked(p)) {
2334                         if (wait > 1)
2335                                 __wait_on_buffer(p);
2336                 } else if (buffer_dirty(p))
2337                         ll_rw_block(WRITE, 1, &p);
2338         } while (tmp != bh);
2339 }
2340 
2341 /*
2342  * Can the buffer be thrown out?
2343  */
2344 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2345 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2346 
2347 /*
2348  * try_to_free_buffers() checks if all the buffers on this particular page
2349  * are unused, and free's the page if so.
2350  *
2351  * Wake up bdflush() if this fails - if we're running low on memory due
2352  * to dirty buffers, we need to flush them out as quickly as possible.
2353  *
2354  * NOTE: There are quite a number of ways that threads of control can
2355  *       obtain a reference to a buffer head within a page.  So we must
2356  *       lock out all of these paths to cleanly toss the page.
2357  */
2358 int try_to_free_buffers(struct page * page, int wait)
2359 {
2360         struct buffer_head * tmp, * bh = page->buffers;
2361         int index = BUFSIZE_INDEX(bh->b_size);
2362         int loop = 0;
2363 
2364 cleaned_buffers_try_again:
2365         spin_lock(&lru_list_lock);
2366         write_lock(&hash_table_lock);
2367         spin_lock(&free_list[index].lock);
2368         tmp = bh;
2369         do {
2370                 struct buffer_head *p = tmp;
2371 
2372                 tmp = tmp->b_this_page;
2373                 if (buffer_busy(p))
2374                         goto busy_buffer_page;
2375         } while (tmp != bh);
2376 
2377         spin_lock(&unused_list_lock);
2378         tmp = bh;
2379         do {
2380                 struct buffer_head * p = tmp;
2381                 tmp = tmp->b_this_page;
2382 
2383                 /* The buffer can be either on the regular
2384                  * queues or on the free list..
2385                  */
2386                 if (p->b_dev != B_FREE) {
2387                         remove_inode_queue(p);
2388                         __remove_from_queues(p);
2389                 } else
2390                         __remove_from_free_list(p, index);
2391                 __put_unused_buffer_head(p);
2392         } while (tmp != bh);
2393         spin_unlock(&unused_list_lock);
2394 
2395         /* Wake up anyone waiting for buffer heads */
2396         wake_up(&buffer_wait);
2397 
2398         /* And free the page */
2399         page->buffers = NULL;
2400         page_cache_release(page);
2401         spin_unlock(&free_list[index].lock);
2402         write_unlock(&hash_table_lock);
2403         spin_unlock(&lru_list_lock);
2404         return 1;
2405 
2406 busy_buffer_page:
2407         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2408         spin_unlock(&free_list[index].lock);
2409         write_unlock(&hash_table_lock);
2410         spin_unlock(&lru_list_lock);
2411         if (wait) {
2412                 sync_page_buffers(bh, wait);
2413                 /* We waited synchronously, so we can free the buffers. */
2414                 if (wait > 1 && !loop) {
2415                         loop = 1;
2416                         goto cleaned_buffers_try_again;
2417                 }
2418                 wakeup_bdflush(0);
2419         }
2420         return 0;
2421 }
2422 
2423 /* ================== Debugging =================== */
2424 
2425 void show_buffers(void)
2426 {
2427 #ifdef CONFIG_SMP
2428         struct buffer_head * bh;
2429         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2430         int protected = 0;
2431         int nlist;
2432         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2433 #endif
2434 
2435         printk("Buffer memory:   %6dkB\n",
2436                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2437 
2438 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2439         if (!spin_trylock(&lru_list_lock))
2440                 return;
2441         for(nlist = 0; nlist < NR_LIST; nlist++) {
2442                 found = locked = dirty = used = lastused = protected = 0;
2443                 bh = lru_list[nlist];
2444                 if(!bh) continue;
2445 
2446                 do {
2447                         found++;
2448                         if (buffer_locked(bh))
2449                                 locked++;
2450                         if (buffer_protected(bh))
2451                                 protected++;
2452                         if (buffer_dirty(bh))
2453                                 dirty++;
2454                         if (atomic_read(&bh->b_count))
2455                                 used++, lastused = found;
2456                         bh = bh->b_next_free;
2457                 } while (bh != lru_list[nlist]);
2458                 {
2459                         int tmp = nr_buffers_type[nlist];
2460                         if (found != tmp)
2461                                 printk("%9s: BUG -> found %d, reported %d\n",
2462                                        buf_types[nlist], found, tmp);
2463                 }
2464                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2465                        "%d locked, %d protected, %d dirty\n",
2466                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2467                        used, lastused, locked, protected, dirty);
2468         }
2469         spin_unlock(&lru_list_lock);
2470 #endif
2471 }
2472 
2473 /* ===================== Init ======================= */
2474 
2475 /*
2476  * allocate the hash table and init the free list
2477  * Use gfp() for the hash table to decrease TLB misses, use
2478  * SLAB cache for buffer heads.
2479  */
2480 void __init buffer_init(unsigned long mempages)
2481 {
2482         int order, i;
2483         unsigned int nr_hash;
2484 
2485         /* The buffer cache hash table is less important these days,
2486          * trim it a bit.
2487          */
2488         mempages >>= 14;
2489 
2490         mempages *= sizeof(struct buffer_head *);
2491 
2492         for (order = 0; (1 << order) < mempages; order++)
2493                 ;
2494 
2495         /* try to allocate something until we get it or we're asking
2496            for something that is really too small */
2497 
2498         do {
2499                 unsigned long tmp;
2500 
2501                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2502                 bh_hash_mask = (nr_hash - 1);
2503 
2504                 tmp = nr_hash;
2505                 bh_hash_shift = 0;
2506                 while((tmp >>= 1UL) != 0UL)
2507                         bh_hash_shift++;
2508 
2509                 hash_table = (struct buffer_head **)
2510                     __get_free_pages(GFP_ATOMIC, order);
2511         } while (hash_table == NULL && --order > 0);
2512         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2513                nr_hash, order, (PAGE_SIZE << order));
2514 
2515         if (!hash_table)
2516                 panic("Failed to allocate buffer hash table\n");
2517 
2518         /* Setup hash chains. */
2519         for(i = 0; i < nr_hash; i++)
2520                 hash_table[i] = NULL;
2521 
2522         /* Setup free lists. */
2523         for(i = 0; i < NR_SIZES; i++) {
2524                 free_list[i].list = NULL;
2525                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2526         }
2527 
2528         /* Setup lru lists. */
2529         for(i = 0; i < NR_LIST; i++)
2530                 lru_list[i] = NULL;
2531 
2532 }
2533 
2534 
2535 /* ====================== bdflush support =================== */
2536 
2537 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2538  * response to dirty buffers.  Once this process is activated, we write back
2539  * a limited number of buffers to the disks and then go back to sleep again.
2540  */
2541 
2542 /* This is the _only_ function that deals with flushing async writes
2543    to disk.
2544    NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2545    as all dirty buffers lives _only_ in the DIRTY lru list.
2546    As we never browse the LOCKED and CLEAN lru lists they are infact
2547    completly useless. */
2548 static int flush_dirty_buffers(int check_flushtime)
2549 {
2550         struct buffer_head * bh, *next;
2551         int flushed = 0, i;
2552 
2553  restart:
2554         spin_lock(&lru_list_lock);
2555         bh = lru_list[BUF_DIRTY];
2556         if (!bh)
2557                 goto out_unlock;
2558         for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2559                 next = bh->b_next_free;
2560 
2561                 if (!buffer_dirty(bh)) {
2562                         __refile_buffer(bh);
2563                         continue;
2564                 }
2565                 if (buffer_locked(bh))
2566                         continue;
2567 
2568                 if (check_flushtime) {
2569                         /* The dirty lru list is chronologically ordered so
2570                            if the current bh is not yet timed out,
2571                            then also all the following bhs
2572                            will be too young. */
2573                         if (time_before(jiffies, bh->b_flushtime))
2574                                 goto out_unlock;
2575                 } else {
2576                         if (++flushed > bdf_prm.b_un.ndirty)
2577                                 goto out_unlock;
2578                 }
2579 
2580                 /* OK, now we are committed to write it out. */
2581                 atomic_inc(&bh->b_count);
2582                 spin_unlock(&lru_list_lock);
2583                 ll_rw_block(WRITE, 1, &bh);
2584                 atomic_dec(&bh->b_count);
2585 
2586                 if (current->need_resched)
2587                         schedule();
2588                 goto restart;
2589         }
2590  out_unlock:
2591         spin_unlock(&lru_list_lock);
2592 
2593         return flushed;
2594 }
2595 
2596 struct task_struct *bdflush_tsk = 0;
2597 
2598 void wakeup_bdflush(int block)
2599 {
2600         if (current != bdflush_tsk) {
2601                 wake_up_process(bdflush_tsk);
2602 
2603                 if (block)
2604                         flush_dirty_buffers(0);
2605         }
2606 }
2607 
2608 /* 
2609  * Here we attempt to write back old buffers.  We also try to flush inodes 
2610  * and supers as well, since this function is essentially "update", and 
2611  * otherwise there would be no way of ensuring that these quantities ever 
2612  * get written back.  Ideally, we would have a timestamp on the inodes
2613  * and superblocks so that we could write back only the old ones as well
2614  */
2615 
2616 static int sync_old_buffers(void)
2617 {
2618         lock_kernel();
2619         sync_supers(0);
2620         sync_inodes(0);
2621         unlock_kernel();
2622 
2623         flush_dirty_buffers(1);
2624         /* must really sync all the active I/O request to disk here */
2625         run_task_queue(&tq_disk);
2626         return 0;
2627 }
2628 
2629 int block_sync_page(struct page *page)
2630 {
2631         run_task_queue(&tq_disk);
2632         return 0;
2633 }
2634 
2635 /* This is the interface to bdflush.  As we get more sophisticated, we can
2636  * pass tuning parameters to this "process", to adjust how it behaves. 
2637  * We would want to verify each parameter, however, to make sure that it 
2638  * is reasonable. */
2639 
2640 asmlinkage long sys_bdflush(int func, long data)
2641 {
2642         if (!capable(CAP_SYS_ADMIN))
2643                 return -EPERM;
2644 
2645         if (func == 1) {
2646                 /* do_exit directly and let kupdate to do its work alone. */
2647                 do_exit(0);
2648 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2649          a syscall that doesn't care about the current mm context. */
2650                 int error;
2651                 struct mm_struct *user_mm;
2652 
2653                 /*
2654                  * bdflush will spend all of it's time in kernel-space,
2655                  * without touching user-space, so we can switch it into
2656                  * 'lazy TLB mode' to reduce the cost of context-switches
2657                  * to and from bdflush.
2658                  */
2659                 user_mm = start_lazy_tlb();
2660                 error = sync_old_buffers();
2661                 end_lazy_tlb(user_mm);
2662                 return error;
2663 #endif
2664         }
2665 
2666         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2667         if (func >= 2) {
2668                 int i = (func-2) >> 1;
2669                 if (i >= 0 && i < N_PARAM) {
2670                         if ((func & 1) == 0)
2671                                 return put_user(bdf_prm.data[i], (int*)data);
2672 
2673                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2674                                 bdf_prm.data[i] = data;
2675                                 return 0;
2676                         }
2677                 }
2678                 return -EINVAL;
2679         }
2680 
2681         /* Having func 0 used to launch the actual bdflush and then never
2682          * return (unless explicitly killed). We return zero here to 
2683          * remain semi-compatible with present update(8) programs.
2684          */
2685         return 0;
2686 }
2687 
2688 /*
2689  * This is the actual bdflush daemon itself. It used to be started from
2690  * the syscall above, but now we launch it ourselves internally with
2691  * kernel_thread(...)  directly after the first thread in init/main.c
2692  */
2693 int bdflush(void *sem)
2694 {
2695         struct task_struct *tsk = current;
2696         int flushed;
2697         /*
2698          *      We have a bare-bones task_struct, and really should fill
2699          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2700          *      display semi-sane things. Not real crucial though...  
2701          */
2702 
2703         tsk->session = 1;
2704         tsk->pgrp = 1;
2705         strcpy(tsk->comm, "bdflush");
2706         bdflush_tsk = tsk;
2707 
2708         /* avoid getting signals */
2709         spin_lock_irq(&tsk->sigmask_lock);
2710         flush_signals(tsk);
2711         sigfillset(&tsk->blocked);
2712         recalc_sigpending(tsk);
2713         spin_unlock_irq(&tsk->sigmask_lock);
2714 
2715         up((struct semaphore *)sem);
2716 
2717         for (;;) {
2718                 CHECK_EMERGENCY_SYNC
2719 
2720                 flushed = flush_dirty_buffers(0);
2721                 if (free_shortage())
2722                         flushed += page_launder(GFP_KERNEL, 0);
2723 
2724                 /*
2725                  * If there are still a lot of dirty buffers around,
2726                  * skip the sleep and flush some more. Otherwise, we
2727                  * go to sleep waiting a wakeup.
2728                  */
2729                 set_current_state(TASK_INTERRUPTIBLE);
2730                 if (!flushed || balance_dirty_state(NODEV) < 0) {
2731                         run_task_queue(&tq_disk);
2732                         schedule();
2733                 }
2734                 /* Remember to mark us as running otherwise
2735                    the next schedule will block. */
2736                 __set_current_state(TASK_RUNNING);
2737         }
2738 }
2739 
2740 /*
2741  * This is the kernel update daemon. It was used to live in userspace
2742  * but since it's need to run safely we want it unkillable by mistake.
2743  * You don't need to change your userspace configuration since
2744  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2745  */
2746 int kupdate(void *sem)
2747 {
2748         struct task_struct * tsk = current;
2749         int interval;
2750 
2751         tsk->session = 1;
2752         tsk->pgrp = 1;
2753         strcpy(tsk->comm, "kupdate");
2754 
2755         /* sigstop and sigcont will stop and wakeup kupdate */
2756         spin_lock_irq(&tsk->sigmask_lock);
2757         sigfillset(&tsk->blocked);
2758         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2759         recalc_sigpending(tsk);
2760         spin_unlock_irq(&tsk->sigmask_lock);
2761 
2762         up((struct semaphore *)sem);
2763 
2764         for (;;) {
2765                 /* update interval */
2766                 interval = bdf_prm.b_un.interval;
2767                 if (interval) {
2768                         tsk->state = TASK_INTERRUPTIBLE;
2769                         schedule_timeout(interval);
2770                 } else {
2771                 stop_kupdate:
2772                         tsk->state = TASK_STOPPED;
2773                         schedule(); /* wait for SIGCONT */
2774                 }
2775                 /* check for sigstop */
2776                 if (signal_pending(tsk)) {
2777                         int stopped = 0;
2778                         spin_lock_irq(&tsk->sigmask_lock);
2779                         if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2780                                 sigdelset(&tsk->pending.signal, SIGSTOP);
2781                                 stopped = 1;
2782                         }
2783                         recalc_sigpending(tsk);
2784                         spin_unlock_irq(&tsk->sigmask_lock);
2785                         if (stopped)
2786                                 goto stop_kupdate;
2787                 }
2788 #ifdef DEBUG
2789                 printk("kupdate() activated...\n");
2790 #endif
2791                 sync_old_buffers();
2792         }
2793 }
2794 
2795 static int __init bdflush_init(void)
2796 {
2797         DECLARE_MUTEX_LOCKED(sem);
2798         kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2799         down(&sem);
2800         kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2801         down(&sem);
2802         return 0;
2803 }
2804 
2805 module_init(bdflush_init)
2806 
2807 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.