1 /*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
11 */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17 */
18
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
21 */
22
/* Added 32k buffer block sizes - these are required on older ARM systems.
 * - RMK
 */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48
49 #include <asm/uaccess.h>
50 #include <asm/io.h>
51 #include <asm/bitops.h>
52 #include <asm/mmu_context.h>
53
/* Number of distinct buffer sizes handled: 512, 1k, 2k, 4k, 8k, 16k, 32k. */
#define NR_SIZES 7

/*
 * Lookup table indexed by (size >> 9).  Power-of-two sizes from 512 to
 * 32768 bytes map to indices 0..6; every other slot holds -1.
 *
 * The element type is explicitly signed: plain 'char' is unsigned on some
 * architectures (ARM among them), where the -1 sentinel would otherwise be
 * read back as 255.
 */
static signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  6};

/* Translate a buffer size in bytes into its table index (or -1). */
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* Largest number of (512-byte) buffers that fit in one page-cache page. */
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
/*
 * Don't ever have more than this number of unused buffer heads.
 * Parenthesized so the macro expands safely inside larger expressions.
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
67
68 /* Anti-deadlock ordering:
69 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
70 */
71
/* Map a pointer to a b_inode_buffers list node back to its buffer_head. */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)

/*
 * Hash table gook..
 */
static unsigned int bh_hash_mask;	/* ANDed with _hashfn() to select a bucket */
static unsigned int bh_hash_shift;	/* base for the shift counts in _hashfn() */
static struct buffer_head **hash_table;	/* bucket heads, chained via b_next/b_pprev */
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

/* One circular, doubly linked list (b_next_free/b_prev_free) per list type. */
static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
/* Per list type: number of buffers and sum of their b_size, kept by the
 * __insert_into_lru_list()/__remove_from_lru_list() pair. */
static int nr_buffers_type[NR_LIST];
static unsigned long size_buffers_type[NR_LIST];

/* NOTE(review): the users of the three "unused" variables and of buffer_wait
 * are not in this part of the file; presumably they manage spare buffer
 * heads - confirm against the rest of buffer.c. */
static struct buffer_head * unused_list;
static int nr_unused_buffer_heads;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

/* Head of one circular free list plus the lock that protects it. */
struct bh_free_head {
	struct buffer_head *list;
	spinlock_t lock;
};
/* One free list per supported buffer size, indexed by BUFSIZE_INDEX(size). */
static struct bh_free_head free_list[NR_SIZES];

static int grow_buffers(int size);
static void __refile_buffer(struct buffer_head *);

/* This is used by some architectures to estimate available memory. */
atomic_t buffermem_pages = ATOMIC_INIT(0);
103
104 /* Here is the parameter block for the bdflush process. If you add or
105 * remove any of the parameters, make sure to update kernel/sysctl.c.
106 */
107
/* Number of bdflush parameters; also the length of data[] and of the
 * min/max tables below. */
#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
/* The union lets the same storage be addressed by field name (b_un) or by
 * position (data[]). */
union bdflush_param {
	struct {
		int nfract;  /* Percentage of buffer cache dirty to
				activate bdflush */
		int ndirty;  /* Maximum number of dirty blocks to write out per
				wake-cycle */
		int nrefill; /* Number of clean buffers to try to obtain
				each time we call refill */
		int dummy1;   /* unused */
		int interval; /* jiffies delay between kupdate flushes */
		int age_buffer;  /* Time for normal buffer to age before we flush it */
		int nfract_sync; /* Percentage of buffer cache dirty to
				    activate bdflush synchronously */
		int dummy2;    /* unused */
		int dummy3;    /* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}};

/* These are the min and max parameter values that we will allow to be assigned */
/* NOTE(review): entries appear to line up positionally with the fields
 * above; the code that enforces them is not in this part of the file. */
int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   0, 0, 0};
int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0};
135
136 /*
137 * Rewrote the wait-routines to use the "new" wait-queue functionality,
138 * and getting rid of the cli-sti pairs. The wait-queue routines still
139 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 *
141 * Note that the real wait_on_buffer() is an inline function that checks
142 * if 'b_wait' is set before calling this, so that the queues aren't set
143 * up unnecessarily.
144 */
145 void __wait_on_buffer(struct buffer_head * bh)
146 {
147 struct task_struct *tsk = current;
148 DECLARE_WAITQUEUE(wait, tsk);
149
150 atomic_inc(&bh->b_count);
151 add_wait_queue(&bh->b_wait, &wait);
152 do {
153 run_task_queue(&tq_disk);
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 if (!buffer_locked(bh))
156 break;
157 schedule();
158 } while (buffer_locked(bh));
159 tsk->state = TASK_RUNNING;
160 remove_wait_queue(&bh->b_wait, &wait);
161 atomic_dec(&bh->b_count);
162 }
163
164 /* Call sync_buffers with wait!=0 to ensure that the call does not
165 * return until all buffer writes have completed. Sync() may return
166 * before the writes have finished; fsync() may not.
167 */
168
169 /* Godamity-damn. Some buffers (bitmaps for filesystems)
170 * spontaneously dirty themselves without ever brelse being called.
171 * We will ultimately want to put these in a separate list, but for
172 * now we search all of the lists for dirty buffers.
173 */
/*
 * Write out dirty buffers and optionally wait for them to complete.
 *
 * dev:  device to sync; a zero value matches the buffers of every device.
 * wait: zero means one pass that skips locked buffers; non-zero allows up
 *       to three passes and waits on locked buffers from the second pass on.
 *
 * Returns 0, or -EIO when (with wait set) a requested buffer is found
 * unlocked, clean and not uptodate.
 */
static int sync_buffers(kdev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
	do {
		retry = 0;

		/* We search all lists as a failsafe mechanism, not because we expect
		 * there to be dirty buffers on any of the other lists.
		 */
repeat:
		/* Every jump back to 'repeat' arrives with lru_list_lock released. */
		spin_lock(&lru_list_lock);
		bh = lru_list[BUF_DIRTY];
		if (!bh)
			goto repeat2;

		/* Visit at most twice as many entries as the list currently holds. */
		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
			next = bh->b_next_free;

			/* Stop if the dirty list has become empty. */
			if (!lru_list[BUF_DIRTY])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				/* Pin the buffer, drop the lock, sleep, then rescan. */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer (bh);
				atomic_dec(&bh->b_count);
				goto repeat;
			}

			/* If an unlocked buffer is not uptodate, there has
			 * been an IO error. Skip it.
			 */
			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
				err = -EIO;
				continue;
			}

			/* Don't write clean buffers.  Don't write ANY buffers
			 * on the third pass.
			 */
			if (!buffer_dirty(bh) || pass >= 2)
				continue;

			/* Submit the write with the lock dropped, then rescan. */
			atomic_inc(&bh->b_count);
			spin_unlock(&lru_list_lock);
			ll_rw_block(WRITE, 1, &bh);
			atomic_dec(&bh->b_count);
			retry = 1;
			goto repeat;
		}

    repeat2:
		/* Unlike 'repeat', this label is entered with lru_list_lock held. */
		bh = lru_list[BUF_LOCKED];
		if (!bh) {
			/* No locked buffers: leave the pass loop altogether. */
			spin_unlock(&lru_list_lock);
			break;
		}
		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
			next = bh->b_next_free;

			if (!lru_list[BUF_LOCKED])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				/* Re-take the lock after sleeping, since
				 * repeat2 expects it held. */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer (bh);
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
				goto repeat2;
			}
		}
		spin_unlock(&lru_list_lock);

		/* If we are waiting for the sync to succeed, and if any dirty
		 * blocks were written, then repeat; on the second pass, only
		 * wait for buffers being written (do not pass to write any
		 * more buffers on the second pass).
		 */
	} while (wait && retry && ++pass<=2);
	return err;
}
280
281 void sync_dev(kdev_t dev)
282 {
283 sync_supers(dev);
284 sync_inodes(dev);
285 DQUOT_SYNC(dev);
286 /* sync all the dirty buffers out to disk only _after_ all the
287 high level layers finished generated buffer dirty data
288 (or we'll return with some buffer still dirty on the blockdevice
289 so breaking the semantics of this call) */
290 sync_buffers(dev, 0);
291 /*
292 * FIXME(eric) we need to sync the physical devices here.
293 * This is because some (scsi) controllers have huge amounts of
294 * cache onboard (hundreds of Mb), and we need to instruct
295 * them to commit all of the dirty memory to disk, and we should
296 * not return until this has happened.
297 *
298 * This would need to get implemented by going through the assorted
299 * layers so that each block major number can be synced, and this
300 * would call down into the upper and mid-layer scsi.
301 */
302 }
303
304 int fsync_dev(kdev_t dev)
305 {
306 sync_buffers(dev, 0);
307
308 lock_kernel();
309 sync_supers(dev);
310 sync_inodes(dev);
311 DQUOT_SYNC(dev);
312 unlock_kernel();
313
314 return sync_buffers(dev, 1);
315 }
316
317 asmlinkage long sys_sync(void)
318 {
319 fsync_dev(0);
320 return 0;
321 }
322
323 /*
324 * filp may be NULL if called via the msync of a vma.
325 */
326
327 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
328 {
329 struct inode * inode = dentry->d_inode;
330 struct super_block * sb;
331 kdev_t dev;
332 int ret;
333
334 lock_kernel();
335 /* sync the inode to buffers */
336 write_inode_now(inode, 0);
337
338 /* sync the superblock to buffers */
339 sb = inode->i_sb;
340 lock_super(sb);
341 if (sb->s_op && sb->s_op->write_super)
342 sb->s_op->write_super(sb);
343 unlock_super(sb);
344
345 /* .. finally sync the buffers to disk */
346 dev = inode->i_dev;
347 ret = sync_buffers(dev, 1);
348 unlock_kernel();
349 return ret;
350 }
351
352 asmlinkage long sys_fsync(unsigned int fd)
353 {
354 struct file * file;
355 struct dentry * dentry;
356 struct inode * inode;
357 int err;
358
359 err = -EBADF;
360 file = fget(fd);
361 if (!file)
362 goto out;
363
364 dentry = file->f_dentry;
365 inode = dentry->d_inode;
366
367 err = -EINVAL;
368 if (!file->f_op || !file->f_op->fsync)
369 goto out_putf;
370
371 /* We need to protect against concurrent writers.. */
372 down(&inode->i_sem);
373 filemap_fdatasync(inode->i_mapping);
374 err = file->f_op->fsync(file, dentry, 0);
375 filemap_fdatawait(inode->i_mapping);
376 up(&inode->i_sem);
377
378 out_putf:
379 fput(file);
380 out:
381 return err;
382 }
383
384 asmlinkage long sys_fdatasync(unsigned int fd)
385 {
386 struct file * file;
387 struct dentry * dentry;
388 struct inode * inode;
389 int err;
390
391 err = -EBADF;
392 file = fget(fd);
393 if (!file)
394 goto out;
395
396 dentry = file->f_dentry;
397 inode = dentry->d_inode;
398
399 err = -EINVAL;
400 if (!file->f_op || !file->f_op->fsync)
401 goto out_putf;
402
403 down(&inode->i_sem);
404 filemap_fdatasync(inode->i_mapping);
405 err = file->f_op->fsync(file, dentry, 1);
406 filemap_fdatawait(inode->i_mapping);
407 up(&inode->i_sem);
408
409 out_putf:
410 fput(file);
411 out:
412 return err;
413 }
414
415 /* After several hours of tedious analysis, the following hash
416 * function won. Do not mess with it... -DaveM
417 */
/*
 * _hashfn() XORs together shifted copies of 'dev' and 'block'; the shift
 * counts are offsets from bh_hash_shift.  hash() masks the result with
 * bh_hash_mask and expands to the matching hash_table[] slot, so it can be
 * used both as a value and (via &hash(...)) as the bucket head address.
 * Both macros evaluate their arguments more than once.
 */
#define _hashfn(dev,block)	\
	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
	  ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
423
424 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
425 {
426 if ((bh->b_next = *head) != NULL)
427 bh->b_next->b_pprev = &bh->b_next;
428 *head = bh;
429 bh->b_pprev = head;
430 }
431
432 static __inline__ void __hash_unlink(struct buffer_head *bh)
433 {
434 if (bh->b_pprev) {
435 if (bh->b_next)
436 bh->b_next->b_pprev = bh->b_pprev;
437 *(bh->b_pprev) = bh->b_next;
438 bh->b_pprev = NULL;
439 }
440 }
441
442 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
443 {
444 struct buffer_head **bhp = &lru_list[blist];
445
446 if(!*bhp) {
447 *bhp = bh;
448 bh->b_prev_free = bh;
449 }
450 bh->b_next_free = *bhp;
451 bh->b_prev_free = (*bhp)->b_prev_free;
452 (*bhp)->b_prev_free->b_next_free = bh;
453 (*bhp)->b_prev_free = bh;
454 nr_buffers_type[blist]++;
455 size_buffers_type[blist] += bh->b_size;
456 }
457
458 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
459 {
460 if (bh->b_prev_free || bh->b_next_free) {
461 bh->b_prev_free->b_next_free = bh->b_next_free;
462 bh->b_next_free->b_prev_free = bh->b_prev_free;
463 if (lru_list[blist] == bh)
464 lru_list[blist] = bh->b_next_free;
465 if (lru_list[blist] == bh)
466 lru_list[blist] = NULL;
467 bh->b_next_free = bh->b_prev_free = NULL;
468 nr_buffers_type[blist]--;
469 size_buffers_type[blist] -= bh->b_size;
470 }
471 }
472
473 static void __remove_from_free_list(struct buffer_head * bh, int index)
474 {
475 if(bh->b_next_free == bh)
476 free_list[index].list = NULL;
477 else {
478 bh->b_prev_free->b_next_free = bh->b_next_free;
479 bh->b_next_free->b_prev_free = bh->b_prev_free;
480 if (free_list[index].list == bh)
481 free_list[index].list = bh->b_next_free;
482 }
483 bh->b_next_free = bh->b_prev_free = NULL;
484 }
485
486 /* must be called with both the hash_table_lock and the lru_list_lock
487 held */
488 static void __remove_from_queues(struct buffer_head *bh)
489 {
490 __hash_unlink(bh);
491 __remove_from_lru_list(bh, bh->b_list);
492 }
493
494 static void __insert_into_queues(struct buffer_head *bh)
495 {
496 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
497
498 __hash_link(bh, head);
499 __insert_into_lru_list(bh, bh->b_list);
500 }
501
502 /* This function must only run if there are no other
503 * references _anywhere_ to this buffer head.
504 */
505 static void put_last_free(struct buffer_head * bh)
506 {
507 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
508 struct buffer_head **bhp = &head->list;
509
510 bh->b_state = 0;
511
512 spin_lock(&head->lock);
513 bh->b_dev = B_FREE;
514 if(!*bhp) {
515 *bhp = bh;
516 bh->b_prev_free = bh;
517 }
518 bh->b_next_free = *bhp;
519 bh->b_prev_free = (*bhp)->b_prev_free;
520 (*bhp)->b_prev_free->b_next_free = bh;
521 (*bhp)->b_prev_free = bh;
522 spin_unlock(&head->lock);
523 }
524
525 /*
526 * Why like this, I hear you say... The reason is race-conditions.
527 * As we don't lock buffers (unless we are reading them, that is),
528 * something might happen to it while we sleep (ie a read-error
529 * will force it bad). This shouldn't really happen currently, but
530 * the code is ready.
531 */
532 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
533 {
534 struct buffer_head *bh = hash(dev, block);
535
536 for (; bh; bh = bh->b_next)
537 if (bh->b_blocknr == block &&
538 bh->b_size == size &&
539 bh->b_dev == dev)
540 break;
541 if (bh)
542 atomic_inc(&bh->b_count);
543
544 return bh;
545 }
546
547 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
548 {
549 struct buffer_head *bh;
550
551 read_lock(&hash_table_lock);
552 bh = __get_hash_table(dev, block, size);
553 read_unlock(&hash_table_lock);
554
555 return bh;
556 }
557
558 unsigned int get_hardblocksize(kdev_t dev)
559 {
560 /*
561 * Get the hard sector size for the given device. If we don't know
562 * what it is, return 0.
563 */
564 if (hardsect_size[MAJOR(dev)] != NULL) {
565 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
566 if (blksize != 0)
567 return blksize;
568 }
569
570 /*
571 * We don't know what the hardware sector size for this device is.
572 * Return 0 indicating that we don't know.
573 */
574 return 0;
575 }
576
577 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
578 {
579 spin_lock(&lru_list_lock);
580 if (bh->b_inode)
581 list_del(&bh->b_inode_buffers);
582 bh->b_inode = inode;
583 list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
584 spin_unlock(&lru_list_lock);
585 }
586
587 /* The caller must have the lru_list lock before calling the
588 remove_inode_queue functions. */
589 static void __remove_inode_queue(struct buffer_head *bh)
590 {
591 bh->b_inode = NULL;
592 list_del(&bh->b_inode_buffers);
593 }
594
595 static inline void remove_inode_queue(struct buffer_head *bh)
596 {
597 if (bh->b_inode)
598 __remove_inode_queue(bh);
599 }
600
601 int inode_has_buffers(struct inode *inode)
602 {
603 int ret;
604
605 spin_lock(&lru_list_lock);
606 ret = !list_empty(&inode->i_dirty_buffers);
607 spin_unlock(&lru_list_lock);
608
609 return ret;
610 }
611
612
/* If invalidate_buffers() trashes dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that the user expected to be stored on the physical layer.

   Thus invalidate_buffers in general usage is not allowed to trash dirty
   buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.

   NOTE: In the case where the user removed a removable-media disk while
   there was still dirty data not synced to disk (due to a bug in the device
   driver or due to an error of the user), by not destroying the dirty buffers
   we could also corrupt the next media inserted; thus a parameter is
   necessary to handle this case in the safest way possible (trying
   not to also corrupt the newly inserted disk with the data belonging to
   the old, now corrupted disk). Also, for the ramdisk the natural thing
   to do in order to release the ramdisk memory is to destroy dirty buffers.

   These are two special cases. Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers. */
/*
 * Move the unreferenced buffers of 'dev' that are not part of a page
 * mapping onto the free lists.
 *
 * dev:                   device whose buffers are invalidated.
 * destroy_dirty_buffers: when zero, dirty buffers are left in place;
 *                        when non-zero they are freed as well.
 *
 * Buffers with a non-zero b_count are skipped.  Whenever the function had
 * to sleep on a locked buffer, the whole scan is restarted.
 */
void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
{
	int i, nlist, slept;
	struct buffer_head * bh, * bh_next;

 retry:
	slept = 0;
	spin_lock(&lru_list_lock);
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		if (!bh)
			continue;
		/* One trip around the ring, using the count sampled here. */
		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
			/* Fetch the successor first: bh may be unlinked below. */
			bh_next = bh->b_next_free;

			/* Another device? */
			if (bh->b_dev != dev)
				continue;
			/* Part of a mapping? */
			if (bh->b_page->mapping)
				continue;
			if (buffer_locked(bh)) {
				/* Pin the buffer while sleeping without the lock. */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				slept = 1;
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
			}

			write_lock(&hash_table_lock);
			if (!atomic_read(&bh->b_count) &&
			    (destroy_dirty_buffers || !buffer_dirty(bh))) {
				remove_inode_queue(bh);
				__remove_from_queues(bh);
				put_last_free(bh);
			}
			/* else complain loudly? */

			write_unlock(&hash_table_lock);
			/* The lock was dropped while sleeping, so bh_next can no
			 * longer be trusted: abandon this scan and start over. */
			if (slept)
				goto out;
		}
	}
out:
	spin_unlock(&lru_list_lock);
	if (slept)
		goto retry;
}
682
/*
 * Change the block size recorded for 'dev' to 'size' and get rid of the
 * device's cached buffers that have a different size.
 *
 * Does nothing if the device's major has no blksize_size table or the
 * size is already set.  Panics if 'size' is not a power of two between
 * 512 and PAGE_SIZE.  Buffers still referenced cannot be freed; they are
 * marked clean and not uptodate, and a warning is printed.
 */
void set_blocksize(kdev_t dev, int size)
{
	extern int *blksize_size[];
	int i, nlist, slept;
	struct buffer_head * bh, * bh_next;

	if (!blksize_size[MAJOR(dev)])
		return;

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		panic("Invalid blocksize passed to set_blocksize");

	/* Size not set yet and BLOCK_SIZE requested: just record it, no
	 * buffers are touched. */
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
		return;
	}
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
		return;
	/* Flush the device (any non-zero 'wait' selects the waiting mode). */
	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

 retry:
	slept = 0;
	spin_lock(&lru_list_lock);
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		if (!bh)
			continue;
		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
			/* Fetch the successor first: bh may be unlinked below. */
			bh_next = bh->b_next_free;
			/* Only this device's buffers of a different size matter. */
			if (bh->b_dev != dev || bh->b_size == size)
				continue;
			if (buffer_locked(bh)) {
				/* Pin the buffer while sleeping without the lock. */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				slept = 1;
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
			}

			write_lock(&hash_table_lock);
			if (!atomic_read(&bh->b_count)) {
				/* Unreferenced: free it, warning if data is lost. */
				if (buffer_dirty(bh))
					printk(KERN_WARNING
					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
					       kdevname(dev), bh->b_blocknr, bh->b_size);
				remove_inode_queue(bh);
				__remove_from_queues(bh);
				put_last_free(bh);
			} else {
				/* Still in use: clear dirty and uptodate instead. */
				if (atomic_set_buffer_clean(bh))
					__refile_buffer(bh);
				clear_bit(BH_Uptodate, &bh->b_state);
				printk(KERN_WARNING
				       "set_blocksize: "
				       "b_count %d, dev %s, block %lu, from %p\n",
				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
				       bh->b_blocknr, __builtin_return_address(0));
			}
			write_unlock(&hash_table_lock);
			/* After sleeping the saved successor is stale: rescan. */
			if (slept)
				goto out;
		}
	}
 out:
	spin_unlock(&lru_list_lock);
	if (slept)
		goto retry;
}
754
755 /*
756 * We used to try various strange things. Let's not.
757 * We'll just try to balance dirty buffers, and possibly
758 * launder some pages.
759 */
760 static void refill_freelist(int size)
761 {
762 balance_dirty(NODEV);
763 if (free_shortage())
764 page_launder(GFP_BUFFER, 0);
765 grow_buffers(size);
766 }
767
768 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
769 {
770 bh->b_list = BUF_CLEAN;
771 bh->b_end_io = handler;
772 bh->b_private = private;
773 }
774
/*
 * I/O completion handler for buffers used for asynchronous page I/O.
 *
 * bh:       the buffer whose I/O has finished.
 * uptodate: non-zero if the I/O succeeded; zero marks the page in error.
 *
 * Unlocks the buffer and drops one reference.  If no other buffer on the
 * same page that uses this handler is still locked, the page is marked
 * uptodate (unless it is in error) and unlocked.
 */
static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
{
	/* Single lock shared by every invocation of this handler. */
	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;

	mark_buffer_uptodate(bh, uptodate);

	/* This is a temporary buffer used for page I/O. */
	page = bh->b_page;

	if (!uptodate)
		SetPageError(page);

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 *
	 * Async buffer_heads are here only as labels for IO, and get
	 * thrown away once the IO for this page is complete.  IO is
	 * deemed complete once all buffers have been visited
	 * (b_count==0) and are now unlocked. We must make sure that
	 * only the _last_ buffer that decrements its count is the one
	 * that unlock the page..
	 */
	spin_lock_irqsave(&page_uptodate_lock, flags);
	unlock_buffer(bh);
	atomic_dec(&bh->b_count);
	/* Walk the page's other buffers around the b_this_page ring. */
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
			goto still_busy;
		tmp = tmp->b_this_page;
	}

	/* OK, the async IO on this page is complete. */
	spin_unlock_irqrestore(&page_uptodate_lock, flags);

	/*
	 * if none of the buffers had errors then we can set the
	 * page uptodate:
	 */
	if (!PageError(page))
		SetPageUptodate(page);

	/*
	 * Run the hooks that have to be done when a page I/O has completed.
	 */
	if (PageTestandClearDecrAfter(page))
		atomic_dec(&nr_async_pages);

	UnlockPage(page);

	return;

still_busy:
	/* Another buffer on this page is still locked; leave the page alone. */
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	return;
}
836
837 void set_buffer_async_io(struct buffer_head *bh) {
838 bh->b_end_io = end_buffer_io_async ;
839 }
840
841 /*
842 * Synchronise all the inode's dirty buffers to the disk.
843 *
844 * We have conflicting pressures: we want to make sure that all
845 * initially dirty buffers get waited on, but that any subsequently
846 * dirtied buffers don't. After all, we don't want fsync to last
847 * forever if somebody is actively writing to the file.
848 *
849 * Do this in two main stages: first we copy dirty buffers to a
850 * temporary inode list, queueing the writes as we go. Then we clean
851 * up, waiting for those writes to complete.
852 *
853 * During this second stage, any subsequent updates to the file may end
854 * up refiling the buffer on the original inode's dirty list again, so
855 * there is a chance we will end up with a buffer queued for write but
856 * not yet completed on that list. So, as a final cleanup we go through
857 * the osync code to catch these locked, dirty buffers without requeuing
858 * any newly dirty buffers for write.
859 */
860
/*
 * Write out and wait for the buffers on inode->i_dirty_buffers.
 *
 * Returns -EIO if a waited-for buffer ended up not uptodate; otherwise the
 * result of the closing osync_inode_buffers() call.
 */
int fsync_inode_buffers(struct inode *inode)
{
	struct buffer_head *bh;
	/* On-stack inode of which only i_dirty_buffers is initialised and
	 * used; it serves as the temporary list head. */
	struct inode tmp;
	int err = 0, err2;

	INIT_LIST_HEAD(&tmp.i_dirty_buffers);

	spin_lock(&lru_list_lock);

	/* Stage 1: drain the inode's list, queueing writes as we go. */
	while (!list_empty(&inode->i_dirty_buffers)) {
		bh = BH_ENTRY(inode->i_dirty_buffers.next);
		list_del(&bh->b_inode_buffers);
		/* Neither dirty nor locked: just detach it. */
		if (!buffer_dirty(bh) && !buffer_locked(bh))
			bh->b_inode = NULL;
		else {
			bh->b_inode = &tmp;
			list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
			if (buffer_dirty(bh)) {
				/* Submit the write with the lock dropped,
				 * holding a reference meanwhile. */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				ll_rw_block(WRITE, 1, &bh);
				brelse(bh);
				spin_lock(&lru_list_lock);
			}
		}
	}

	/* Stage 2: wait for each buffer, taking them from the tail of tmp. */
	while (!list_empty(&tmp.i_dirty_buffers)) {
		bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
		remove_inode_queue(bh);
		atomic_inc(&bh->b_count);
		spin_unlock(&lru_list_lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(&lru_list_lock);
	}

	spin_unlock(&lru_list_lock);
	/* Final cleanup: wait on anything put back on the inode's list. */
	err2 = osync_inode_buffers(inode);

	/* An error from our own waits takes precedence. */
	if (err)
		return err;
	else
		return err2;
}
909
910
911 /*
912 * osync is designed to support O_SYNC io. It waits synchronously for
913 * all already-submitted IO to complete, but does not queue any new
914 * writes to the disk.
915 *
916 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
917 * you dirty the buffers, and then use osync_inode_buffers to wait for
918 * completion. Any other dirty buffers which are not yet queued for
919 * write will not be flushed to disk by the osync.
920 */
921
/*
 * Wait until no buffer on inode->i_dirty_buffers is locked.  No writes are
 * started here.
 *
 * Returns 0, or -EIO if a buffer was not uptodate after being waited for.
 */
int osync_inode_buffers(struct inode *inode)
{
	struct buffer_head *bh;
	struct list_head *list;
	int err = 0;

	spin_lock(&lru_list_lock);

 repeat:

	/* Walk the list backwards from the tail.  The comma expression
	 * computes bh for every node; the loop ends when 'list' is back at
	 * the list head. */
	for (list = inode->i_dirty_buffers.prev;
	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
	     list = bh->b_inode_buffers.prev) {
		if (buffer_locked(bh)) {
			/* Sleep without the lock, holding a reference, then
			 * restart the walk from the tail. */
			atomic_inc(&bh->b_count);
			spin_unlock(&lru_list_lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(&lru_list_lock);
			goto repeat;
		}
	}

	spin_unlock(&lru_list_lock);
	return err;
}
950
951
952 /*
953 * Invalidate any and all dirty buffers on a given inode. We are
954 * probably unmounting the fs, but that doesn't mean we have already
955 * done a sync(). Just drop the buffers from the inode list.
956 */
957 void invalidate_inode_buffers(struct inode *inode)
958 {
959 struct list_head *list, *next;
960
961 spin_lock(&lru_list_lock);
962 list = inode->i_dirty_buffers.next;
963 while (list != &inode->i_dirty_buffers) {
964 next = list->next;
965 remove_inode_queue(BH_ENTRY(list));
966 list = next;
967 }
968 spin_unlock(&lru_list_lock);
969 }
970
971
972 /*
973 * Ok, this is getblk, and it isn't very clear, again to hinder
974 * race-conditions. Most of the code is seldom used, (ie repeating),
975 * so it should be much more efficient than it looks.
976 *
977 * The algorithm is changed: hopefully better, and an elusive bug removed.
978 *
979 * 14.02.92: changed it to sync dirty buffers a bit: better performance
980 * when the filesystem starts to get full of dirty blocks (I hope).
981 */
/*
 * Return a referenced buffer head for (dev, block, size).
 *
 * An existing hashed buffer is returned if there is one; otherwise a
 * buffer is taken from the free list for 'size', initialised and hashed.
 * When the free list is empty, refill_freelist() is called and the whole
 * lookup is repeated, so the function only returns once it has a buffer.
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	int isize;

repeat:
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	bh = __get_hash_table(dev, block, size);
	/* Already cached: __get_hash_table() has taken the reference. */
	if (bh)
		goto out;

	isize = BUFSIZE_INDEX(size);
	spin_lock(&free_list[isize].lock);
	bh = free_list[isize].list;
	if (bh) {
		__remove_from_free_list(bh, isize);
		atomic_set(&bh->b_count, 1);
	}
	spin_unlock(&free_list[isize].lock);

	/*
	 * OK, FINALLY we know that this buffer is the only one of
	 * its kind, we hold a reference (b_count>0), it is unlocked,
	 * and it is clean.
	 */
	if (bh) {
		init_buffer(bh, NULL, NULL);
		bh->b_dev = dev;
		bh->b_blocknr = block;
		bh->b_state = 1 << BH_Mapped;

		/* Insert the buffer into the regular lists */
		__insert_into_queues(bh);
	/* Shared exit: the hash-hit path above jumps into this block. */
	out:
		write_unlock(&hash_table_lock);
		spin_unlock(&lru_list_lock);
		touch_buffer(bh);
		return bh;
	}

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	refill_freelist(size);
	goto repeat;
}
1032
/* -1 -> no need to flush
    0 -> async flush
    1 -> sync flush (wait for I/O completion) */
1036 int balance_dirty_state(kdev_t dev)
1037 {
1038 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1039 int shortage;
1040
1041 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1042 tot = nr_free_buffer_pages();
1043
1044 dirty *= 100;
1045 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1046 hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1047
1048 /* First, check for the "real" dirty limit. */
1049 if (dirty > soft_dirty_limit) {
1050 if (dirty > hard_dirty_limit)
1051 return 1;
1052 return 0;
1053 }
1054
1055 /*
1056 * If we are about to get low on free pages and
1057 * cleaning the inactive_dirty pages would help
1058 * fix this, wake up bdflush.
1059 */
1060 shortage = free_shortage();
1061 if (shortage && nr_inactive_dirty_pages > shortage &&
1062 nr_inactive_dirty_pages > freepages.high)
1063 return 0;
1064
1065 return -1;
1066 }
1067
1068 /*
1069 * if a new dirty buffer is created we need to balance bdflush.
1070 *
1071 * in the future we might want to make bdflush aware of different
1072 * pressures on different devices - thus the (currently unused)
1073 * 'dev' parameter.
1074 */
1075 void balance_dirty(kdev_t dev)
1076 {
1077 int state = balance_dirty_state(dev);
1078
1079 if (state < 0)
1080 return;
1081 wakeup_bdflush(state);
1082 }
1083
1084 static __inline__ void __mark_dirty(struct buffer_head *bh)
1085 {
1086 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1087 refile_buffer(bh);
1088 }
1089
/* atomic version, the user must call balance_dirty() by hand
   as soon as it becomes possible to block */
/*
 * Set the dirty bit of 'bh'; if atomic_set_buffer_dirty() returns zero
 * (presumably: the bit was not set before - confirm against its
 * definition), also stamp and refile the buffer.  balance_dirty() is not
 * called here.
 */
void __mark_buffer_dirty(struct buffer_head *bh)
{
	if (atomic_set_buffer_dirty(bh))
		return;

	__mark_dirty(bh);
}
1097
1098 void mark_buffer_dirty(struct buffer_head *bh)
1099 {
1100 if (!atomic_set_buffer_dirty(bh)) {
1101 __mark_dirty(bh);
1102 balance_dirty(bh->b_dev);
1103 }
1104 }
1105
1106 /*
1107 * A buffer may need to be moved from one buffer list to another
1108 * (e.g. in case it is not shared any more). Handle this.
1109 */
1110 static void __refile_buffer(struct buffer_head *bh)
1111 {
1112 int dispose = BUF_CLEAN;
1113 if (buffer_locked(bh))
1114 dispose = BUF_LOCKED;
1115 if (buffer_dirty(bh))
1116 dispose = BUF_DIRTY;
1117 if (buffer_protected(bh))
1118 dispose = BUF_PROTECTED;
1119 if (dispose != bh->b_list) {
1120 __remove_from_lru_list(bh, bh->b_list);
1121 bh->b_list = dispose;
1122 if (dispose == BUF_CLEAN)
1123 remove_inode_queue(bh);
1124 __insert_into_lru_list(bh, dispose);
1125 }
1126 }
1127
1128 void refile_buffer(struct buffer_head *bh)
1129 {
1130 spin_lock(&lru_list_lock);
1131 __refile_buffer(bh);
1132 spin_unlock(&lru_list_lock);
1133 }
1134
1135 /*
1136 * Release a buffer head
1137 */
1138 void __brelse(struct buffer_head * buf)
1139 {
1140 if (atomic_read(&buf->b_count)) {
1141 atomic_dec(&buf->b_count);
1142 return;
1143 }
1144 printk("VFS: brelse: Trying to free free buffer\n");
1145 }
1146
1147 /*
1148 * bforget() is like brelse(), except it puts the buffer on the
1149 * free list if it can.. We can NOT free the buffer if:
1150 * - there are other users of it
1151 * - it is locked and thus can have active IO
1152 */
1153 void __bforget(struct buffer_head * buf)
1154 {
1155 /* grab the lru lock here to block bdflush. */
1156 spin_lock(&lru_list_lock);
1157 write_lock(&hash_table_lock);
1158 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
1159 goto in_use;
1160 __hash_unlink(buf);
1161 remove_inode_queue(buf);
1162 write_unlock(&hash_table_lock);
1163 __remove_from_lru_list(buf, buf->b_list);
1164 spin_unlock(&lru_list_lock);
1165 put_last_free(buf);
1166 return;
1167
1168 in_use:
1169 write_unlock(&hash_table_lock);
1170 spin_unlock(&lru_list_lock);
1171 }
1172
1173 /*
1174 * bread() reads a specified block and returns the buffer that contains
1175 * it. It returns NULL if the block was unreadable.
1176 */
1177 struct buffer_head * bread(kdev_t dev, int block, int size)
1178 {
1179 struct buffer_head * bh;
1180
1181 bh = getblk(dev, block, size);
1182 if (buffer_uptodate(bh))
1183 return bh;
1184 ll_rw_block(READ, 1, &bh);
1185 wait_on_buffer(bh);
1186 if (buffer_uptodate(bh))
1187 return bh;
1188 brelse(bh);
1189 return NULL;
1190 }
1191
1192 /*
1193 * Note: the caller should wake up the buffer_wait list if needed.
1194 */
1195 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1196 {
1197 if (bh->b_inode)
1198 BUG();
1199 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1200 kmem_cache_free(bh_cachep, bh);
1201 } else {
1202 bh->b_blocknr = -1;
1203 init_waitqueue_head(&bh->b_wait);
1204 nr_unused_buffer_heads++;
1205 bh->b_next_free = unused_list;
1206 bh->b_this_page = NULL;
1207 unused_list = bh;
1208 }
1209 }
1210
1211 /*
1212 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1213 * no-buffer-head deadlock. Return NULL on failure; waiting for
1214 * buffer heads is now handled in create_buffers().
1215 */
1216 static struct buffer_head * get_unused_buffer_head(int async)
1217 {
1218 struct buffer_head * bh;
1219
1220 spin_lock(&unused_list_lock);
1221 if (nr_unused_buffer_heads > NR_RESERVED) {
1222 bh = unused_list;
1223 unused_list = bh->b_next_free;
1224 nr_unused_buffer_heads--;
1225 spin_unlock(&unused_list_lock);
1226 return bh;
1227 }
1228 spin_unlock(&unused_list_lock);
1229
1230 /* This is critical. We can't swap out pages to get
1231 * more buffer heads, because the swap-out may need
1232 * more buffer-heads itself. Thus SLAB_BUFFER.
1233 */
1234 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1235 memset(bh, 0, sizeof(*bh));
1236 init_waitqueue_head(&bh->b_wait);
1237 return bh;
1238 }
1239
1240 /*
1241 * If we need an async buffer, use the reserved buffer heads.
1242 */
1243 if (async) {
1244 spin_lock(&unused_list_lock);
1245 if (unused_list) {
1246 bh = unused_list;
1247 unused_list = bh->b_next_free;
1248 nr_unused_buffer_heads--;
1249 spin_unlock(&unused_list_lock);
1250 return bh;
1251 }
1252 spin_unlock(&unused_list_lock);
1253 }
1254 #if 0
1255 /*
1256 * (Pending further analysis ...)
1257 * Ordinary (non-async) requests can use a different memory priority
1258 * to free up pages. Any swapping thus generated will use async
1259 * buffer heads.
1260 */
1261 if(!async &&
1262 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1263 memset(bh, 0, sizeof(*bh));
1264 init_waitqueue_head(&bh->b_wait);
1265 return bh;
1266 }
1267 #endif
1268
1269 return NULL;
1270 }
1271
1272 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1273 {
1274 bh->b_page = page;
1275 if (offset >= PAGE_SIZE)
1276 BUG();
1277 if (PageHighMem(page))
1278 /*
1279 * This catches illegal uses and preserves the offset:
1280 */
1281 bh->b_data = (char *)(0 + offset);
1282 else
1283 bh->b_data = page_address(page) + offset;
1284 }
1285
1286 /*
1287 * Create the appropriate buffers when given a page for data area and
1288 * the size of each buffer.. Use the bh->b_this_page linked list to
1289 * follow the buffers created. Return NULL if unable to create more
1290 * buffers.
1291 * The async flag is used to differentiate async IO (paging, swapping)
1292 * from ordinary buffer allocations, and only async requests are allowed
1293 * to sleep waiting for buffer heads.
1294 */
1295 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1296 {
1297 struct buffer_head *bh, *head;
1298 long offset;
1299
1300 try_again:
1301 head = NULL;
1302 offset = PAGE_SIZE;
1303 while ((offset -= size) >= 0) {
1304 bh = get_unused_buffer_head(async);
1305 if (!bh)
1306 goto no_grow;
1307
1308 bh->b_dev = B_FREE; /* Flag as unused */
1309 bh->b_this_page = head;
1310 head = bh;
1311
1312 bh->b_state = 0;
1313 bh->b_next_free = NULL;
1314 bh->b_pprev = NULL;
1315 atomic_set(&bh->b_count, 0);
1316 bh->b_size = size;
1317
1318 set_bh_page(bh, page, offset);
1319
1320 bh->b_list = BUF_CLEAN;
1321 bh->b_end_io = NULL;
1322 }
1323 return head;
1324 /*
1325 * In case anything failed, we just free everything we got.
1326 */
1327 no_grow:
1328 if (head) {
1329 spin_lock(&unused_list_lock);
1330 do {
1331 bh = head;
1332 head = head->b_this_page;
1333 __put_unused_buffer_head(bh);
1334 } while (head);
1335 spin_unlock(&unused_list_lock);
1336
1337 /* Wake up any waiters ... */
1338 wake_up(&buffer_wait);
1339 }
1340
1341 /*
1342 * Return failure for non-async IO requests. Async IO requests
1343 * are not allowed to fail, so we have to wait until buffer heads
1344 * become available. But we don't want tasks sleeping with
1345 * partially complete buffers, so all were released above.
1346 */
1347 if (!async)
1348 return NULL;
1349
1350 /* We're _really_ low on memory. Now we just
1351 * wait for old buffer heads to become free due to
1352 * finishing IO. Since this is an async request and
1353 * the reserve list is empty, we're sure there are
1354 * async buffer heads in use.
1355 */
1356 run_task_queue(&tq_disk);
1357
1358 /*
1359 * Set our state for sleeping, then check again for buffer heads.
1360 * This ensures we won't miss a wake_up from an interrupt.
1361 */
1362 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1363 goto try_again;
1364 }
1365
1366 static void unmap_buffer(struct buffer_head * bh)
1367 {
1368 if (buffer_mapped(bh)) {
1369 mark_buffer_clean(bh);
1370 wait_on_buffer(bh);
1371 clear_bit(BH_Uptodate, &bh->b_state);
1372 clear_bit(BH_Mapped, &bh->b_state);
1373 clear_bit(BH_Req, &bh->b_state);
1374 clear_bit(BH_New, &bh->b_state);
1375 }
1376 }
1377
1378 /*
1379 * We don't have to release all buffers here, but
1380 * we have to be sure that no dirty buffer is left
1381 * and no IO is going on (no buffer is locked), because
1382 * we have truncated the file and are going to free the
1383 * blocks on-disk..
1384 */
1385 int block_flushpage(struct page *page, unsigned long offset)
1386 {
1387 struct buffer_head *head, *bh, *next;
1388 unsigned int curr_off = 0;
1389
1390 if (!PageLocked(page))
1391 BUG();
1392 if (!page->buffers)
1393 return 1;
1394
1395 head = page->buffers;
1396 bh = head;
1397 do {
1398 unsigned int next_off = curr_off + bh->b_size;
1399 next = bh->b_this_page;
1400
1401 /*
1402 * is this block fully flushed?
1403 */
1404 if (offset <= curr_off)
1405 unmap_buffer(bh);
1406 curr_off = next_off;
1407 bh = next;
1408 } while (bh != head);
1409
1410 /*
1411 * subtle. We release buffer-heads only if this is
1412 * the 'final' flushpage. We have invalidated the get_block
1413 * cached value unconditionally, so real IO is not
1414 * possible anymore.
1415 *
1416 * If the free doesn't work out, the buffers can be
1417 * left around - they just turn into anonymous buffers
1418 * instead.
1419 */
1420 if (!offset) {
1421 if (!try_to_free_buffers(page, 0)) {
1422 atomic_inc(&buffermem_pages);
1423 return 0;
1424 }
1425 }
1426
1427 return 1;
1428 }
1429
1430 static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1431 {
1432 struct buffer_head *bh, *head, *tail;
1433
1434 head = create_buffers(page, blocksize, 1);
1435 if (page->buffers)
1436 BUG();
1437
1438 bh = head;
1439 do {
1440 bh->b_dev = dev;
1441 bh->b_blocknr = 0;
1442 bh->b_end_io = NULL;
1443 tail = bh;
1444 bh = bh->b_this_page;
1445 } while (bh);
1446 tail->b_this_page = head;
1447 page->buffers = head;
1448 page_cache_get(page);
1449 }
1450
1451 /*
1452 * We are taking a block for data and we don't want any output from any
1453 * buffer-cache aliases starting from return from that function and
1454 * until the moment when something will explicitly mark the buffer
1455 * dirty (hopefully that will not happen until we will free that block ;-)
1456 * We don't even need to mark it not-uptodate - nobody can expect
1457 * anything from a newly allocated buffer anyway. We used to used
1458 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1459 * don't want to mark the alias unmapped, for example - it would confuse
1460 * anyone who might pick it with bread() afterwards...
1461 */
1462
1463 static void unmap_underlying_metadata(struct buffer_head * bh)
1464 {
1465 struct buffer_head *old_bh;
1466
1467 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1468 if (old_bh) {
1469 mark_buffer_clean(old_bh);
1470 wait_on_buffer(old_bh);
1471 clear_bit(BH_Req, &old_bh->b_state);
1472 /* Here we could run brelse or bforget. We use
1473 bforget because it will try to put the buffer
1474 in the freelist. */
1475 __bforget(old_bh);
1476 }
1477 }
1478
1479 /*
1480 * NOTE! All mapped/uptodate combinations are valid:
1481 *
1482 * Mapped Uptodate Meaning
1483 *
1484 * No No "unknown" - must do get_block()
1485 * No Yes "hole" - zero-filled
1486 * Yes No "allocated" - allocated on disk, not read in
1487 * Yes Yes "valid" - allocated and up-to-date in memory.
1488 *
1489 * "Dirty" is valid only with the last case (mapped+uptodate).
1490 */
1491
1492 /*
1493 * block_write_full_page() is SMP-safe - currently it's still
1494 * being called with the kernel lock held, but the code is ready.
1495 */
1496 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1497 {
1498 int err, i;
1499 unsigned long block;
1500 struct buffer_head *bh, *head;
1501
1502 if (!PageLocked(page))
1503 BUG();
1504
1505 if (!page->buffers)
1506 create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
1507 head = page->buffers;
1508
1509 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1510
1511 bh = head;
1512 i = 0;
1513
1514 /* Stage 1: make sure we have all the buffers mapped! */
1515 do {
1516 /*
1517 * If the buffer isn't up-to-date, we can't be sure
1518 * that the buffer has been initialized with the proper
1519 * block number information etc..
1520 *
1521 * Leave it to the low-level FS to make all those
1522 * decisions (block #0 may actually be a valid block)
1523 */
1524 if (!buffer_mapped(bh)) {
1525 err = get_block(inode, block, bh, 1);
1526 if (err)
1527 goto out;
1528 if (buffer_new(bh))
1529 unmap_underlying_metadata(bh);
1530 }
1531 bh = bh->b_this_page;
1532 block++;
1533 } while (bh != head);
1534
1535 /* Stage 2: lock the buffers, mark them clean */
1536 do {
1537 lock_buffer(bh);
1538 bh->b_end_io = end_buffer_io_async;
1539 atomic_inc(&bh->b_count);
1540 set_bit(BH_Uptodate, &bh->b_state);
1541 clear_bit(BH_Dirty, &bh->b_state);
1542 bh = bh->b_this_page;
1543 } while (bh != head);
1544
1545 /* Stage 3: submit the IO */
1546 do {
1547 submit_bh(WRITE, bh);
1548 bh = bh->b_this_page;
1549 } while (bh != head);
1550
1551 /* Done - end_buffer_io_async will unlock */
1552 SetPageUptodate(page);
1553 return 0;
1554
1555 out:
1556 ClearPageUptodate(page);
1557 UnlockPage(page);
1558 return err;
1559 }
1560
1561 static int __block_prepare_write(struct inode *inode, struct page *page,
1562 unsigned from, unsigned to, get_block_t *get_block)
1563 {
1564 unsigned block_start, block_end;
1565 unsigned long block;
1566 int err = 0;
1567 unsigned blocksize, bbits;
1568 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1569 char *kaddr = kmap(page);
1570
1571 blocksize = inode->i_sb->s_blocksize;
1572 if (!page->buffers)
1573 create_empty_buffers(page, inode->i_dev, blocksize);
1574 head = page->buffers;
1575
1576 bbits = inode->i_sb->s_blocksize_bits;
1577 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1578
1579 for(bh = head, block_start = 0; bh != head || !block_start;
1580 block++, block_start=block_end, bh = bh->b_this_page) {
1581 if (!bh)
1582 BUG();
1583 block_end = block_start+blocksize;
1584 if (block_end <= from)
1585 continue;
1586 if (block_start >= to)
1587 break;
1588 if (!buffer_mapped(bh)) {
1589 err = get_block(inode, block, bh, 1);
1590 if (err)
1591 goto out;
1592 if (buffer_new(bh)) {
1593 unmap_underlying_metadata(bh);
1594 if (Page_Uptodate(page)) {
1595 set_bit(BH_Uptodate, &bh->b_state);
1596 continue;
1597 }
1598 if (block_end > to)
1599 memset(kaddr+to, 0, block_end-to);
1600 if (block_start < from)
1601 memset(kaddr+block_start, 0, from-block_start);
1602 if (block_end > to || block_start < from)
1603 flush_dcache_page(page);
1604 continue;
1605 }
1606 }
1607 if (Page_Uptodate(page)) {
1608 set_bit(BH_Uptodate, &bh->b_state);
1609 continue;
1610 }
1611 if (!buffer_uptodate(bh) &&
1612 (block_start < from || block_end > to)) {
1613 ll_rw_block(READ, 1, &bh);
1614 *wait_bh++=bh;
1615 }
1616 }
1617 /*
1618 * If we issued read requests - let them complete.
1619 */
1620 while(wait_bh > wait) {
1621 wait_on_buffer(*--wait_bh);
1622 err = -EIO;
1623 if (!buffer_uptodate(*wait_bh))
1624 goto out;
1625 }
1626 return 0;
1627 out:
1628 return err;
1629 }
1630
1631 static int __block_commit_write(struct inode *inode, struct page *page,
1632 unsigned from, unsigned to)
1633 {
1634 unsigned block_start, block_end;
1635 int partial = 0, need_balance_dirty = 0;
1636 unsigned blocksize;
1637 struct buffer_head *bh, *head;
1638
1639 blocksize = inode->i_sb->s_blocksize;
1640
1641 for(bh = head = page->buffers, block_start = 0;
1642 bh != head || !block_start;
1643 block_start=block_end, bh = bh->b_this_page) {
1644 block_end = block_start + blocksize;
1645 if (block_end <= from || block_start >= to) {
1646 if (!buffer_uptodate(bh))
1647 partial = 1;
1648 } else {
1649 set_bit(BH_Uptodate, &bh->b_state);
1650 if (!atomic_set_buffer_dirty(bh)) {
1651 __mark_dirty(bh);
1652 buffer_insert_inode_queue(bh, inode);
1653 need_balance_dirty = 1;
1654 }
1655 }
1656 }
1657
1658 if (need_balance_dirty)
1659 balance_dirty(bh->b_dev);
1660 /*
1661 * is this a partial write that happened to make all buffers
1662 * uptodate then we can optimize away a bogus readpage() for
1663 * the next read(). Here we 'discover' wether the page went
1664 * uptodate as a result of this (potentially partial) write.
1665 */
1666 if (!partial)
1667 SetPageUptodate(page);
1668 return 0;
1669 }
1670
1671 /*
1672 * Generic "read page" function for block devices that have the normal
1673 * get_block functionality. This is most of the block device filesystems.
1674 * Reads the page asynchronously --- the unlock_buffer() and
1675 * mark_buffer_uptodate() functions propagate buffer state into the
1676 * page struct once IO has completed.
1677 */
1678 int block_read_full_page(struct page *page, get_block_t *get_block)
1679 {
1680 struct inode *inode = page->mapping->host;
1681 unsigned long iblock, lblock;
1682 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1683 unsigned int blocksize, blocks;
1684 int nr, i;
1685
1686 if (!PageLocked(page))
1687 PAGE_BUG(page);
1688 blocksize = inode->i_sb->s_blocksize;
1689 if (!page->buffers)
1690 create_empty_buffers(page, inode->i_dev, blocksize);
1691 head = page->buffers;
1692
1693 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1694 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1695 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1696 bh = head;
1697 nr = 0;
1698 i = 0;
1699
1700 do {
1701 if (buffer_uptodate(bh))
1702 continue;
1703
1704 if (!buffer_mapped(bh)) {
1705 if (iblock < lblock) {
1706 if (get_block(inode, iblock, bh, 0))
1707 continue;
1708 }
1709 if (!buffer_mapped(bh)) {
1710 memset(kmap(page) + i*blocksize, 0, blocksize);
1711 flush_dcache_page(page);
1712 kunmap(page);
1713 set_bit(BH_Uptodate, &bh->b_state);
1714 continue;
1715 }
1716 /* get_block() might have updated the buffer synchronously */
1717 if (buffer_uptodate(bh))
1718 continue;
1719 }
1720
1721 arr[nr] = bh;
1722 nr++;
1723 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1724
1725 if (!nr) {
1726 /*
1727 * all buffers are uptodate - we can set the page
1728 * uptodate as well.
1729 */
1730 SetPageUptodate(page);
1731 UnlockPage(page);
1732 return 0;
1733 }
1734
1735 /* Stage two: lock the buffers */
1736 for (i = 0; i < nr; i++) {
1737 struct buffer_head * bh = arr[i];
1738 lock_buffer(bh);
1739 bh->b_end_io = end_buffer_io_async;
1740 atomic_inc(&bh->b_count);
1741 }
1742
1743 /* Stage 3: start the IO */
1744 for (i = 0; i < nr; i++)
1745 submit_bh(READ, arr[i]);
1746
1747 return 0;
1748 }
1749
1750 /*
1751 * For moronic filesystems that do not allow holes in file.
1752 * We may have to extend the file.
1753 */
1754
1755 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1756 {
1757 struct address_space *mapping = page->mapping;
1758 struct inode *inode = mapping->host;
1759 struct page *new_page;
1760 unsigned long pgpos;
1761 long status;
1762 unsigned zerofrom;
1763 unsigned blocksize = inode->i_sb->s_blocksize;
1764 char *kaddr;
1765
1766 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1767 status = -ENOMEM;
1768 new_page = grab_cache_page(mapping, pgpos);
1769 if (!new_page)
1770 goto out;
1771 /* we might sleep */
1772 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1773 UnlockPage(new_page);
1774 page_cache_release(new_page);
1775 continue;
1776 }
1777 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1778 if (zerofrom & (blocksize-1)) {
1779 *bytes |= (blocksize-1);
1780 (*bytes)++;
1781 }
1782 status = __block_prepare_write(inode, new_page, zerofrom,
1783 PAGE_CACHE_SIZE, get_block);
1784 if (status)
1785 goto out_unmap;
1786 kaddr = page_address(new_page);
1787 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1788 flush_dcache_page(new_page);
1789 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1790 kunmap(new_page);
1791 UnlockPage(new_page);
1792 page_cache_release(new_page);
1793 }
1794
1795 if (page->index < pgpos) {
1796 /* completely inside the area */
1797 zerofrom = offset;
1798 } else {
1799 /* page covers the boundary, find the boundary offset */
1800 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1801
1802 /* if we will expand the thing last block will be filled */
1803 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1804 *bytes |= (blocksize-1);
1805 (*bytes)++;
1806 }
1807
1808 /* starting below the boundary? Nothing to zero out */
1809 if (offset <= zerofrom)
1810 zerofrom = offset;
1811 }
1812 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1813 if (status)
1814 goto out1;
1815 kaddr = page_address(page);
1816 if (zerofrom < offset) {
1817 memset(kaddr+zerofrom, 0, offset-zerofrom);
1818 flush_dcache_page(page);
1819 __block_commit_write(inode, page, zerofrom, offset);
1820 }
1821 return 0;
1822 out1:
1823 ClearPageUptodate(page);
1824 kunmap(page);
1825 return status;
1826
1827 out_unmap:
1828 ClearPageUptodate(new_page);
1829 kunmap(new_page);
1830 UnlockPage(new_page);
1831 page_cache_release(new_page);
1832 out:
1833 return status;
1834 }
1835
1836 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1837 get_block_t *get_block)
1838 {
1839 struct inode *inode = page->mapping->host;
1840 int err = __block_prepare_write(inode, page, from, to, get_block);
1841 if (err) {
1842 ClearPageUptodate(page);
1843 kunmap(page);
1844 }
1845 return err;
1846 }
1847
1848 int generic_commit_write(struct file *file, struct page *page,
1849 unsigned from, unsigned to)
1850 {
1851 struct inode *inode = page->mapping->host;
1852 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1853 __block_commit_write(inode,page,from,to);
1854 kunmap(page);
1855 if (pos > inode->i_size) {
1856 inode->i_size = pos;
1857 mark_inode_dirty(inode);
1858 }
1859 return 0;
1860 }
1861
1862 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1863 {
1864 unsigned long index = from >> PAGE_CACHE_SHIFT;
1865 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1866 unsigned blocksize, iblock, length, pos;
1867 struct inode *inode = mapping->host;
1868 struct page *page;
1869 struct buffer_head *bh;
1870 int err;
1871
1872 blocksize = inode->i_sb->s_blocksize;
1873 length = offset & (blocksize - 1);
1874
1875 /* Block boundary? Nothing to do */
1876 if (!length)
1877 return 0;
1878
1879 length = blocksize - length;
1880 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1881
1882 page = grab_cache_page(mapping, index);
1883 err = PTR_ERR(page);
1884 if (IS_ERR(page))
1885 goto out;
1886
1887 if (!page->buffers)
1888 create_empty_buffers(page, inode->i_dev, blocksize);
1889
1890 /* Find the buffer that contains "offset" */
1891 bh = page->buffers;
1892 pos = blocksize;
1893 while (offset >= pos) {
1894 bh = bh->b_this_page;
1895 iblock++;
1896 pos += blocksize;
1897 }
1898
1899 err = 0;
1900 if (!buffer_mapped(bh)) {
1901 /* Hole? Nothing to do */
1902 if (buffer_uptodate(bh))
1903 goto unlock;
1904 get_block(inode, iblock, bh, 0);
1905 /* Still unmapped? Nothing to do */
1906 if (!buffer_mapped(bh))
1907 goto unlock;
1908 }
1909
1910 /* Ok, it's mapped. Make sure it's up-to-date */
1911 if (Page_Uptodate(page))
1912 set_bit(BH_Uptodate, &bh->b_state);
1913
1914 if (!buffer_uptodate(bh)) {
1915 err = -EIO;
1916 ll_rw_block(READ, 1, &bh);
1917 wait_on_buffer(bh);
1918 /* Uhhuh. Read error. Complain and punt. */
1919 if (!buffer_uptodate(bh))
1920 goto unlock;
1921 }
1922
1923 memset(kmap(page) + offset, 0, length);
1924 flush_dcache_page(page);
1925 kunmap(page);
1926
1927 __mark_buffer_dirty(bh);
1928 err = 0;
1929
1930 unlock:
1931 UnlockPage(page);
1932 page_cache_release(page);
1933 out:
1934 return err;
1935 }
1936
1937 int block_write_full_page(struct page *page, get_block_t *get_block)
1938 {
1939 struct inode *inode = page->mapping->host;
1940 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1941 unsigned offset;
1942 int err;
1943
1944 /* easy case */
1945 if (page->index < end_index)
1946 return __block_write_full_page(inode, page, get_block);
1947
1948 /* things got complicated... */
1949 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1950 /* OK, are we completely out? */
1951 if (page->index >= end_index+1 || !offset) {
1952 UnlockPage(page);
1953 return -EIO;
1954 }
1955
1956 /* Sigh... will have to work, then... */
1957 err = __block_prepare_write(inode, page, 0, offset, get_block);
1958 if (!err) {
1959 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1960 flush_dcache_page(page);
1961 __block_commit_write(inode,page,0,offset);
1962 done:
1963 kunmap(page);
1964 UnlockPage(page);
1965 return err;
1966 }
1967 ClearPageUptodate(page);
1968 goto done;
1969 }
1970
1971 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1972 {
1973 struct buffer_head tmp;
1974 struct inode *inode = mapping->host;
1975 tmp.b_state = 0;
1976 tmp.b_blocknr = 0;
1977 get_block(inode, block, &tmp, 0);
1978 return tmp.b_blocknr;
1979 }
1980
1981 /*
1982 * IO completion routine for a buffer_head being used for kiobuf IO: we
1983 * can't dispatch the kiobuf callback until io_count reaches 0.
1984 */
1985
1986 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1987 {
1988 struct kiobuf *kiobuf;
1989
1990 mark_buffer_uptodate(bh, uptodate);
1991
1992 kiobuf = bh->b_private;
1993 unlock_buffer(bh);
1994 end_kio_request(kiobuf, uptodate);
1995 }
1996
1997
1998 /*
1999 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2000 * for them to complete. Clean up the buffer_heads afterwards.
2001 */
2002
2003 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2004 {
2005 int iosize;
2006 int i;
2007 struct buffer_head *tmp;
2008
2009
2010 iosize = 0;
2011 spin_lock(&unused_list_lock);
2012
2013 for (i = nr; --i >= 0; ) {
2014 iosize += size;
2015 tmp = bh[i];
2016 if (buffer_locked(tmp)) {
2017 spin_unlock(&unused_list_lock);
2018 wait_on_buffer(tmp);
2019 spin_lock(&unused_list_lock);
2020 }
2021
2022 if (!buffer_uptodate(tmp)) {
2023 /* We are traversing bh'es in reverse order so
2024 clearing iosize on error calculates the
2025 amount of IO before the first error. */
2026 iosize = 0;
2027 }
2028 __put_unused_buffer_head(tmp);
2029 }
2030
2031 spin_unlock(&unused_list_lock);
2032
2033 return iosize;
2034 }
2035
2036 /*
2037 * Start I/O on a physical range of kernel memory, defined by a vector
2038 * of kiobuf structs (much like a user-space iovec list).
2039 *
2040 * The kiobuf must already be locked for IO. IO is submitted
2041 * asynchronously: you need to check page->locked, page->uptodate, and
2042 * maybe wait on page->wait.
2043 *
2044 * It is up to the caller to make sure that there are enough blocks
2045 * passed in to completely map the iobufs to disk.
2046 */
2047
2048 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2049 kdev_t dev, unsigned long b[], int size)
2050 {
2051 int err;
2052 int length;
2053 int transferred;
2054 int i;
2055 int bufind;
2056 int pageind;
2057 int bhind;
2058 int offset;
2059 unsigned long blocknr;
2060 struct kiobuf * iobuf = NULL;
2061 struct page * map;
2062 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2063
2064 if (!nr)
2065 return 0;
2066
2067 /*
2068 * First, do some alignment and validity checks
2069 */
2070 for (i = 0; i < nr; i++) {
2071 iobuf = iovec[i];
2072 if ((iobuf->offset & (size-1)) ||
2073 (iobuf->length & (size-1)))
2074 return -EINVAL;
2075 if (!iobuf->nr_pages)
2076 panic("brw_kiovec: iobuf not initialised");
2077 }
2078
2079 /*
2080 * OK to walk down the iovec doing page IO on each page we find.
2081 */
2082 bufind = bhind = transferred = err = 0;
2083 for (i = 0; i < nr; i++) {
2084 iobuf = iovec[i];
2085 offset = iobuf->offset;
2086 length = iobuf->length;
2087 iobuf->errno = 0;
2088
2089 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2090 map = iobuf->maplist[pageind];
2091 if (!map) {
2092 err = -EFAULT;
2093 goto error;
2094 }
2095
2096 while (length > 0) {
2097 blocknr = b[bufind++];
2098 tmp = get_unused_buffer_head(0);
2099 if (!tmp) {
2100 err = -ENOMEM;
2101 goto error;
2102 }
2103
2104 tmp->b_dev = B_FREE;
2105 tmp->b_size = size;
2106 set_bh_page(tmp, map, offset);
2107 tmp->b_this_page = tmp;
2108
2109 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2110 tmp->b_dev = dev;
2111 tmp->b_blocknr = blocknr;
2112 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2113
2114 if (rw == WRITE) {
2115 set_bit(BH_Uptodate, &tmp->b_state);
2116 clear_bit(BH_Dirty, &tmp->b_state);
2117 }
2118
2119 bh[bhind++] = tmp;
2120 length -= size;
2121 offset += size;
2122
2123 atomic_inc(&iobuf->io_count);
2124
2125 submit_bh(rw, tmp);
2126 /*
2127 * Wait for IO if we have got too much
2128 */
2129 if (bhind >= KIO_MAX_SECTORS) {
2130 err = wait_kio(rw, bhind, bh, size);
2131 if (err >= 0)
2132 transferred += err;
2133 else
2134 goto finished;
2135 bhind = 0;
2136 }
2137
2138 if (offset >= PAGE_SIZE) {
2139 offset = 0;
2140 break;
2141 }
2142 } /* End of block loop */
2143 } /* End of page loop */
2144 } /* End of iovec loop */
2145
2146 /* Is there any IO still left to submit? */
2147 if (bhind) {
2148 err = wait_kio(rw, bhind, bh, size);
2149 if (err >= 0)
2150 transferred += err;
2151 else
2152 goto finished;
2153 }
2154
2155 finished:
2156 if (transferred)
2157 return transferred;
2158 return err;
2159
2160 error:
2161 /* We got an error allocating the bh'es. Just free the current
2162 buffer_heads and exit. */
2163 spin_lock(&unused_list_lock);
2164 for (i = bhind; --i >= 0; ) {
2165 __put_unused_buffer_head(bh[i]);
2166 }
2167 spin_unlock(&unused_list_lock);
2168 goto finished;
2169 }
2170
2171 /*
2172 * Start I/O on a page.
2173 * This function expects the page to be locked and may return
2174 * before I/O is complete. You then have to check page->locked,
2175 * page->uptodate, and maybe wait on page->wait.
2176 *
2177 * brw_page() is SMP-safe, although it's being called with the
2178 * kernel lock held - but the code is ready.
2179 *
2180 * FIXME: we need a swapper_inode->get_block function to remove
2181 * some of the bmap kludges and interface ugliness here.
2182 */
2183 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2184 {
2185 struct buffer_head *head, *bh;
2186
2187 if (!PageLocked(page))
2188 panic("brw_page: page not locked for I/O");
2189
2190 if (!page->buffers)
2191 create_empty_buffers(page, dev, size);
2192 head = bh = page->buffers;
2193
2194 /* Stage 1: lock all the buffers */
2195 do {
2196 lock_buffer(bh);
2197 bh->b_blocknr = *(b++);
2198 set_bit(BH_Mapped, &bh->b_state);
2199 bh->b_end_io = end_buffer_io_async;
2200 atomic_inc(&bh->b_count);
2201 bh = bh->b_this_page;
2202 } while (bh != head);
2203
2204 /* Stage 2: start the IO */
2205 do {
2206 submit_bh(rw, bh);
2207 bh = bh->b_this_page;
2208 } while (bh != head);
2209 return 0;
2210 }
2211
2212 int block_symlink(struct inode *inode, const char *symname, int len)
2213 {
2214 struct address_space *mapping = inode->i_mapping;
2215 struct page *page = grab_cache_page(mapping, 0);
2216 int err = -ENOMEM;
2217 char *kaddr;
2218
2219 if (!page)
2220 goto fail;
2221 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2222 if (err)
2223 goto fail_map;
2224 kaddr = page_address(page);
2225 memcpy(kaddr, symname, len-1);
2226 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2227 /*
2228 * Notice that we are _not_ going to block here - end of page is
2229 * unmapped, so this will only try to map the rest of page, see
2230 * that it is unmapped (typically even will not look into inode -
2231 * ->i_size will be enough for everything) and zero it out.
2232 * OTOH it's obviously correct and should make the page up-to-date.
2233 */
2234 err = mapping->a_ops->readpage(NULL, page);
2235 wait_on_page(page);
2236 page_cache_release(page);
2237 if (err < 0)
2238 goto fail;
2239 mark_inode_dirty(inode);
2240 return 0;
2241 fail_map:
2242 UnlockPage(page);
2243 page_cache_release(page);
2244 fail:
2245 return err;
2246 }
2247
2248 /*
2249 * Try to increase the number of buffers available: the size argument
2250 * is used to determine what kind of buffers we want.
2251 */
2252 static int grow_buffers(int size)
2253 {
2254 struct page * page;
2255 struct buffer_head *bh, *tmp;
2256 struct buffer_head * insert_point;
2257 int isize;
2258
2259 if ((size & 511) || (size > PAGE_SIZE)) {
2260 printk("VFS: grow_buffers: size = %d\n",size);
2261 return 0;
2262 }
2263
2264 page = alloc_page(GFP_BUFFER);
2265 if (!page)
2266 goto out;
2267 LockPage(page);
2268 bh = create_buffers(page, size, 0);
2269 if (!bh)
2270 goto no_buffer_head;
2271
2272 isize = BUFSIZE_INDEX(size);
2273
2274 spin_lock(&free_list[isize].lock);
2275 insert_point = free_list[isize].list;
2276 tmp = bh;
2277 while (1) {
2278 if (insert_point) {
2279 tmp->b_next_free = insert_point->b_next_free;
2280 tmp->b_prev_free = insert_point;
2281 insert_point->b_next_free->b_prev_free = tmp;
2282 insert_point->b_next_free = tmp;
2283 } else {
2284 tmp->b_prev_free = tmp;
2285 tmp->b_next_free = tmp;
2286 }
2287 insert_point = tmp;
2288 if (tmp->b_this_page)
2289 tmp = tmp->b_this_page;
2290 else
2291 break;
2292 }
2293 tmp->b_this_page = bh;
2294 free_list[isize].list = bh;
2295 spin_unlock(&free_list[isize].lock);
2296
2297 page->buffers = bh;
2298 page->flags &= ~(1 << PG_referenced);
2299 lru_cache_add(page);
2300 UnlockPage(page);
2301 atomic_inc(&buffermem_pages);
2302 return 1;
2303
2304 no_buffer_head:
2305 UnlockPage(page);
2306 page_cache_release(page);
2307 out:
2308 return 0;
2309 }
2310
2311 /*
2312 * Sync all the buffers on one page..
2313 *
2314 * If we have old buffers that are locked, we'll
2315 * wait on them, but we won't wait on the new ones
2316 * we're writing out now.
2317 *
2318 * This all is required so that we can free up memory
2319 * later.
2320 *
2321 * Wait:
2322 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2323 * 1 - start IO for dirty buffers
2324 * 2 - wait for completion of locked buffers
2325 */
2326 static void sync_page_buffers(struct buffer_head *bh, int wait)
2327 {
2328 struct buffer_head * tmp = bh;
2329
2330 do {
2331 struct buffer_head *p = tmp;
2332 tmp = tmp->b_this_page;
2333 if (buffer_locked(p)) {
2334 if (wait > 1)
2335 __wait_on_buffer(p);
2336 } else if (buffer_dirty(p))
2337 ll_rw_block(WRITE, 1, &p);
2338 } while (tmp != bh);
2339 }
2340
/*
 * Can the buffer be thrown out?
 *
 * buffer_busy() is non-zero when the buffer has a non-zero b_count or
 * has any of the dirty, lock or protected state bits set.
 */
#define BUFFER_BUSY_BITS \
	((1 << BH_Dirty) | (1 << BH_Lock) | (1 << BH_Protected))
#define buffer_busy(bh) \
	(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2346
2347 /*
2348 * try_to_free_buffers() checks if all the buffers on this particular page
2349 * are unused, and free's the page if so.
2350 *
2351 * Wake up bdflush() if this fails - if we're running low on memory due
2352 * to dirty buffers, we need to flush them out as quickly as possible.
2353 *
2354 * NOTE: There are quite a number of ways that threads of control can
2355 * obtain a reference to a buffer head within a page. So we must
2356 * lock out all of these paths to cleanly toss the page.
2357 */
2358 int try_to_free_buffers(struct page * page, int wait)
2359 {
2360 struct buffer_head * tmp, * bh = page->buffers;
2361 int index = BUFSIZE_INDEX(bh->b_size);
2362 int loop = 0;
2363
2364 cleaned_buffers_try_again:
2365 spin_lock(&lru_list_lock);
2366 write_lock(&hash_table_lock);
2367 spin_lock(&free_list[index].lock);
2368 tmp = bh;
2369 do {
2370 struct buffer_head *p = tmp;
2371
2372 tmp = tmp->b_this_page;
2373 if (buffer_busy(p))
2374 goto busy_buffer_page;
2375 } while (tmp != bh);
2376
2377 spin_lock(&unused_list_lock);
2378 tmp = bh;
2379 do {
2380 struct buffer_head * p = tmp;
2381 tmp = tmp->b_this_page;
2382
2383 /* The buffer can be either on the regular
2384 * queues or on the free list..
2385 */
2386 if (p->b_dev != B_FREE) {
2387 remove_inode_queue(p);
2388 __remove_from_queues(p);
2389 } else
2390 __remove_from_free_list(p, index);
2391 __put_unused_buffer_head(p);
2392 } while (tmp != bh);
2393 spin_unlock(&unused_list_lock);
2394
2395 /* Wake up anyone waiting for buffer heads */
2396 wake_up(&buffer_wait);
2397
2398 /* And free the page */
2399 page->buffers = NULL;
2400 page_cache_release(page);
2401 spin_unlock(&free_list[index].lock);
2402 write_unlock(&hash_table_lock);
2403 spin_unlock(&lru_list_lock);
2404 return 1;
2405
2406 busy_buffer_page:
2407 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2408 spin_unlock(&free_list[index].lock);
2409 write_unlock(&hash_table_lock);
2410 spin_unlock(&lru_list_lock);
2411 if (wait) {
2412 sync_page_buffers(bh, wait);
2413 /* We waited synchronously, so we can free the buffers. */
2414 if (wait > 1 && !loop) {
2415 loop = 1;
2416 goto cleaned_buffers_try_again;
2417 }
2418 wakeup_bdflush(0);
2419 }
2420 return 0;
2421 }
2422
2423 /* ================== Debugging =================== */
2424
2425 void show_buffers(void)
2426 {
2427 #ifdef CONFIG_SMP
2428 struct buffer_head * bh;
2429 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2430 int protected = 0;
2431 int nlist;
2432 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2433 #endif
2434
2435 printk("Buffer memory: %6dkB\n",
2436 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2437
2438 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2439 if (!spin_trylock(&lru_list_lock))
2440 return;
2441 for(nlist = 0; nlist < NR_LIST; nlist++) {
2442 found = locked = dirty = used = lastused = protected = 0;
2443 bh = lru_list[nlist];
2444 if(!bh) continue;
2445
2446 do {
2447 found++;
2448 if (buffer_locked(bh))
2449 locked++;
2450 if (buffer_protected(bh))
2451 protected++;
2452 if (buffer_dirty(bh))
2453 dirty++;
2454 if (atomic_read(&bh->b_count))
2455 used++, lastused = found;
2456 bh = bh->b_next_free;
2457 } while (bh != lru_list[nlist]);
2458 {
2459 int tmp = nr_buffers_type[nlist];
2460 if (found != tmp)
2461 printk("%9s: BUG -> found %d, reported %d\n",
2462 buf_types[nlist], found, tmp);
2463 }
2464 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2465 "%d locked, %d protected, %d dirty\n",
2466 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2467 used, lastused, locked, protected, dirty);
2468 }
2469 spin_unlock(&lru_list_lock);
2470 #endif
2471 }
2472
2473 /* ===================== Init ======================= */
2474
2475 /*
2476 * allocate the hash table and init the free list
2477 * Use gfp() for the hash table to decrease TLB misses, use
2478 * SLAB cache for buffer heads.
2479 */
2480 void __init buffer_init(unsigned long mempages)
2481 {
2482 int order, i;
2483 unsigned int nr_hash;
2484
2485 /* The buffer cache hash table is less important these days,
2486 * trim it a bit.
2487 */
2488 mempages >>= 14;
2489
2490 mempages *= sizeof(struct buffer_head *);
2491
2492 for (order = 0; (1 << order) < mempages; order++)
2493 ;
2494
2495 /* try to allocate something until we get it or we're asking
2496 for something that is really too small */
2497
2498 do {
2499 unsigned long tmp;
2500
2501 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2502 bh_hash_mask = (nr_hash - 1);
2503
2504 tmp = nr_hash;
2505 bh_hash_shift = 0;
2506 while((tmp >>= 1UL) != 0UL)
2507 bh_hash_shift++;
2508
2509 hash_table = (struct buffer_head **)
2510 __get_free_pages(GFP_ATOMIC, order);
2511 } while (hash_table == NULL && --order > 0);
2512 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2513 nr_hash, order, (PAGE_SIZE << order));
2514
2515 if (!hash_table)
2516 panic("Failed to allocate buffer hash table\n");
2517
2518 /* Setup hash chains. */
2519 for(i = 0; i < nr_hash; i++)
2520 hash_table[i] = NULL;
2521
2522 /* Setup free lists. */
2523 for(i = 0; i < NR_SIZES; i++) {
2524 free_list[i].list = NULL;
2525 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2526 }
2527
2528 /* Setup lru lists. */
2529 for(i = 0; i < NR_LIST; i++)
2530 lru_list[i] = NULL;
2531
2532 }
2533
2534
2535 /* ====================== bdflush support =================== */
2536
2537 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2538 * response to dirty buffers. Once this process is activated, we write back
2539 * a limited number of buffers to the disks and then go back to sleep again.
2540 */
2541
2542 /* This is the _only_ function that deals with flushing async writes
2543 to disk.
2544 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2545 as all dirty buffers lives _only_ in the DIRTY lru list.
2546 As we never browse the LOCKED and CLEAN lru lists they are infact
2547 completly useless. */
2548 static int flush_dirty_buffers(int check_flushtime)
2549 {
2550 struct buffer_head * bh, *next;
2551 int flushed = 0, i;
2552
2553 restart:
2554 spin_lock(&lru_list_lock);
2555 bh = lru_list[BUF_DIRTY];
2556 if (!bh)
2557 goto out_unlock;
2558 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2559 next = bh->b_next_free;
2560
2561 if (!buffer_dirty(bh)) {
2562 __refile_buffer(bh);
2563 continue;
2564 }
2565 if (buffer_locked(bh))
2566 continue;
2567
2568 if (check_flushtime) {
2569 /* The dirty lru list is chronologically ordered so
2570 if the current bh is not yet timed out,
2571 then also all the following bhs
2572 will be too young. */
2573 if (time_before(jiffies, bh->b_flushtime))
2574 goto out_unlock;
2575 } else {
2576 if (++flushed > bdf_prm.b_un.ndirty)
2577 goto out_unlock;
2578 }
2579
2580 /* OK, now we are committed to write it out. */
2581 atomic_inc(&bh->b_count);
2582 spin_unlock(&lru_list_lock);
2583 ll_rw_block(WRITE, 1, &bh);
2584 atomic_dec(&bh->b_count);
2585
2586 if (current->need_resched)
2587 schedule();
2588 goto restart;
2589 }
2590 out_unlock:
2591 spin_unlock(&lru_list_lock);
2592
2593 return flushed;
2594 }
2595
2596 struct task_struct *bdflush_tsk = 0;
2597
2598 void wakeup_bdflush(int block)
2599 {
2600 if (current != bdflush_tsk) {
2601 wake_up_process(bdflush_tsk);
2602
2603 if (block)
2604 flush_dirty_buffers(0);
2605 }
2606 }
2607
2608 /*
2609 * Here we attempt to write back old buffers. We also try to flush inodes
2610 * and supers as well, since this function is essentially "update", and
2611 * otherwise there would be no way of ensuring that these quantities ever
2612 * get written back. Ideally, we would have a timestamp on the inodes
2613 * and superblocks so that we could write back only the old ones as well
2614 */
2615
2616 static int sync_old_buffers(void)
2617 {
2618 lock_kernel();
2619 sync_supers(0);
2620 sync_inodes(0);
2621 unlock_kernel();
2622
2623 flush_dirty_buffers(1);
2624 /* must really sync all the active I/O request to disk here */
2625 run_task_queue(&tq_disk);
2626 return 0;
2627 }
2628
2629 int block_sync_page(struct page *page)
2630 {
2631 run_task_queue(&tq_disk);
2632 return 0;
2633 }
2634
2635 /* This is the interface to bdflush. As we get more sophisticated, we can
2636 * pass tuning parameters to this "process", to adjust how it behaves.
2637 * We would want to verify each parameter, however, to make sure that it
2638 * is reasonable. */
2639
2640 asmlinkage long sys_bdflush(int func, long data)
2641 {
2642 if (!capable(CAP_SYS_ADMIN))
2643 return -EPERM;
2644
2645 if (func == 1) {
2646 /* do_exit directly and let kupdate to do its work alone. */
2647 do_exit(0);
2648 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2649 a syscall that doesn't care about the current mm context. */
2650 int error;
2651 struct mm_struct *user_mm;
2652
2653 /*
2654 * bdflush will spend all of it's time in kernel-space,
2655 * without touching user-space, so we can switch it into
2656 * 'lazy TLB mode' to reduce the cost of context-switches
2657 * to and from bdflush.
2658 */
2659 user_mm = start_lazy_tlb();
2660 error = sync_old_buffers();
2661 end_lazy_tlb(user_mm);
2662 return error;
2663 #endif
2664 }
2665
2666 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2667 if (func >= 2) {
2668 int i = (func-2) >> 1;
2669 if (i >= 0 && i < N_PARAM) {
2670 if ((func & 1) == 0)
2671 return put_user(bdf_prm.data[i], (int*)data);
2672
2673 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2674 bdf_prm.data[i] = data;
2675 return 0;
2676 }
2677 }
2678 return -EINVAL;
2679 }
2680
2681 /* Having func 0 used to launch the actual bdflush and then never
2682 * return (unless explicitly killed). We return zero here to
2683 * remain semi-compatible with present update(8) programs.
2684 */
2685 return 0;
2686 }
2687
2688 /*
2689 * This is the actual bdflush daemon itself. It used to be started from
2690 * the syscall above, but now we launch it ourselves internally with
2691 * kernel_thread(...) directly after the first thread in init/main.c
2692 */
2693 int bdflush(void *sem)
2694 {
2695 struct task_struct *tsk = current;
2696 int flushed;
2697 /*
2698 * We have a bare-bones task_struct, and really should fill
2699 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2700 * display semi-sane things. Not real crucial though...
2701 */
2702
2703 tsk->session = 1;
2704 tsk->pgrp = 1;
2705 strcpy(tsk->comm, "bdflush");
2706 bdflush_tsk = tsk;
2707
2708 /* avoid getting signals */
2709 spin_lock_irq(&tsk->sigmask_lock);
2710 flush_signals(tsk);
2711 sigfillset(&tsk->blocked);
2712 recalc_sigpending(tsk);
2713 spin_unlock_irq(&tsk->sigmask_lock);
2714
2715 up((struct semaphore *)sem);
2716
2717 for (;;) {
2718 CHECK_EMERGENCY_SYNC
2719
2720 flushed = flush_dirty_buffers(0);
2721 if (free_shortage())
2722 flushed += page_launder(GFP_KERNEL, 0);
2723
2724 /*
2725 * If there are still a lot of dirty buffers around,
2726 * skip the sleep and flush some more. Otherwise, we
2727 * go to sleep waiting a wakeup.
2728 */
2729 set_current_state(TASK_INTERRUPTIBLE);
2730 if (!flushed || balance_dirty_state(NODEV) < 0) {
2731 run_task_queue(&tq_disk);
2732 schedule();
2733 }
2734 /* Remember to mark us as running otherwise
2735 the next schedule will block. */
2736 __set_current_state(TASK_RUNNING);
2737 }
2738 }
2739
2740 /*
2741 * This is the kernel update daemon. It was used to live in userspace
2742 * but since it's need to run safely we want it unkillable by mistake.
2743 * You don't need to change your userspace configuration since
2744 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2745 */
2746 int kupdate(void *sem)
2747 {
2748 struct task_struct * tsk = current;
2749 int interval;
2750
2751 tsk->session = 1;
2752 tsk->pgrp = 1;
2753 strcpy(tsk->comm, "kupdate");
2754
2755 /* sigstop and sigcont will stop and wakeup kupdate */
2756 spin_lock_irq(&tsk->sigmask_lock);
2757 sigfillset(&tsk->blocked);
2758 siginitsetinv(¤t->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2759 recalc_sigpending(tsk);
2760 spin_unlock_irq(&tsk->sigmask_lock);
2761
2762 up((struct semaphore *)sem);
2763
2764 for (;;) {
2765 /* update interval */
2766 interval = bdf_prm.b_un.interval;
2767 if (interval) {
2768 tsk->state = TASK_INTERRUPTIBLE;
2769 schedule_timeout(interval);
2770 } else {
2771 stop_kupdate:
2772 tsk->state = TASK_STOPPED;
2773 schedule(); /* wait for SIGCONT */
2774 }
2775 /* check for sigstop */
2776 if (signal_pending(tsk)) {
2777 int stopped = 0;
2778 spin_lock_irq(&tsk->sigmask_lock);
2779 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2780 sigdelset(&tsk->pending.signal, SIGSTOP);
2781 stopped = 1;
2782 }
2783 recalc_sigpending(tsk);
2784 spin_unlock_irq(&tsk->sigmask_lock);
2785 if (stopped)
2786 goto stop_kupdate;
2787 }
2788 #ifdef DEBUG
2789 printk("kupdate() activated...\n");
2790 #endif
2791 sync_old_buffers();
2792 }
2793 }
2794
2795 static int __init bdflush_init(void)
2796 {
2797 DECLARE_MUTEX_LOCKED(sem);
2798 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2799 down(&sem);
2800 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2801 down(&sem);
2802 return 0;
2803 }
2804
2805 module_init(bdflush_init)
2806
2807
/*
 * This page was automatically generated by the LXR engine.
 * Visit the LXR main site for more information.
 */