/*
 * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * 
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 * 
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 * 
 * http://www.sgi.com 
 * 
 * For further information regarding this notice, see: 
 * 
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ident "$Id:  $"

/*
 *	page_buf.c
 *
 *	The page_buf module provides an abstract buffer cache model on top of
 *	the Linux page cache.  Cached blocks for a file are hashed to the
 *	inode for that file, and can be held dirty in delayed write mode in
 *	the page cache.  Cached metadata blocks for a file system are hashed
 *	to the inode for the mounted device.  The page_buf module assembles
 *	buffer (page_buf_t) objects on demand to aggregate such cached pages
 *	for I/O.
 *
 *
 *      Written by William J. Earl, Steve Lord, Jim Mostek, Russell Cattelan
 *		    and Rajagopal Ananthanarayanan ("ananth") at SGI.
 *
 *	Added kiobuf-based I/O requests: Chait Tumuluri.
 *
 */

#define _PAGE_BUF_INTERNAL_ 1

#include <linux/config.h>
#include <linux/version.h>

#include <linux/module.h>

#include <linux/stddef.h>

#include <linux/spinlock.h>
#include <linux/avl.h>
#include <linux/page_buf.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/locks.h>
#include <linux/swap.h>
#include <asm/softirq.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>

/*
 * Debug code
 */

#define PB_DEFINE_TRACES
#include <linux/page_buf_trace.h>

MODULE_DESCRIPTION("page_buf file system buffer module");
#ifdef PAGEBUF_TRACE
#define STATIC
static	spinlock_t		pb_trace_lock = SPIN_LOCK_UNLOCKED;
struct pagebuf_trace_buf	pb_trace;
EXPORT_SYMBOL(pb_trace);
EXPORT_SYMBOL(pb_trace_func);
#define CIRC_INC(i)     (((i) + 1) & (PB_TRACE_BUFSIZE - 1))

/*
 * Record one event for a pagebuf in the circular trace buffer.
 *
 *	pb	buffer being traced (dereferenced; must not be NULL)
 *	event	event code to record
 *	misc	caller-defined value stored with the event
 *	ra	return address to record; when NULL the return address
 *		of this function's caller is used instead
 *
 * Does nothing unless pb_params.p_un.debug is set.  Only the slot
 * reservation is done under pb_trace_lock; the slot itself is filled
 * in after the lock has been dropped.
 */
void	pb_trace_func(page_buf_t *pb, int event, void *misc, void *ra)
{
	unsigned long irq_state;
	int	slot;

	if (!pb_params.p_un.debug)
		return;

	if (ra == NULL)
		ra = (void *)__builtin_return_address(0);

	/* claim the next slot and advance the ring index */
	spin_lock_irqsave(&pb_trace_lock, irq_state);
	slot = pb_trace.start;
	pb_trace.start = CIRC_INC(slot);
	spin_unlock_irqrestore(&pb_trace_lock, irq_state);

	/* who / what / from where */
	pb_trace.buf[slot].task = (void *)current;
	pb_trace.buf[slot].event = event;
	pb_trace.buf[slot].misc = misc;
	pb_trace.buf[slot].ra = ra;

	/* snapshot of the buffer's state */
	pb_trace.buf[slot].pb = (unsigned long) pb;
	pb_trace.buf[slot].flags = pb->pb_flags;
	pb_trace.buf[slot].hold = pb->pb_hold;
	pb_trace.buf[slot].lock_value = PBP(pb)->pb_sema.count.counter;
	pb_trace.buf[slot].offset = pb->pb_file_offset;
	pb_trace.buf[slot].size = pb->pb_buffer_length;
}
#define ENTER(x) printk("Entering " #x "\n");
#define EXIT(x)  printk("Exiting  " #x "\n");
#else
#define STATIC   static
#define ENTER(x)
#define EXIT(x)
#endif

#ifdef PB_TRACKING
/*
 * Debug-only table of live pagebufs.  _pagebuf_get_object stores each
 * new page_buf_t in the first empty slot and _pagebuf_free_object
 * clears the slot again when the object is returned to the cache.
 * When all MAX_PB slots are in use, new buffers are simply not recorded.
 */
#define MAX_PB	10000
page_buf_t	*pb_array[MAX_PB];

EXPORT_SYMBOL(pb_array);
#endif

/*
 *	External locking functions
 */

extern int _pagebuf_check_lockable(struct inode *, page_buf_registration_t **);
extern int _pagebuf_get_lockable_buffer(
		    struct inode *, page_buf_registration_t *, loff_t,
		    size_t, page_buf_flags_t, page_buf_t **);
extern int _pagebuf_find_lockable_buffer(
		    struct inode *, page_buf_registration_t *, loff_t,
		    size_t, page_buf_flags_t, page_buf_t **);
extern int _pagebuf_free_lockable_buffer(page_buf_t *, unsigned long);


/*
 *	Forward declarations
 */

STATIC int _pagebuf_get_kiovec(page_buf_t *, int, int);

/*
 *	File wide globals
 */

STATIC kmem_cache_t *pagebuf_cache = NULL;
STATIC pagebuf_daemon_t *pb_daemon = NULL;

/*
 * Pagebuf module configuration parameters, exported via
 * /proc/sys/vm/pagebuf
 */

unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ,  1, 0, 1 };
unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, 1024, 1, 4096 };

pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 512, 0, 256 }};

/*
 * Pagebuf statistics variables
 */

struct pbstats pbstats;

#define REMAPPING_SUPPORT

#ifdef REMAPPING_SUPPORT
STATIC void *pagebuf_mapout_locked(page_buf_t *);

/*
 * Deferred-release list for virtual mappings.  free_address() pushes
 * an address onto the list and purge_addresses() later detaches the
 * whole list and vfree()s every entry.
 */
STATIC  spinlock_t              as_lock = SPIN_LOCK_UNLOCKED;	/* protects as_free_head and as_list_len */
typedef struct a_list {
	void	*vm_addr;		/* mapping address to hand to vfree() */
	struct a_list	*next;		/* next entry, NULL at the tail */
} a_list_t;
STATIC  a_list_t	*as_free_head;	/* head of the pending list */
STATIC  int		as_list_len;	/* number of entries on the list */

STATIC void
free_address(void *addr)
{
	unsigned long flags;
	a_list_t	*aentry;

	spin_lock_irqsave(&as_lock, flags);
	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
	aentry->next = as_free_head;
	aentry->vm_addr = addr;
	as_free_head = aentry;
	as_list_len++;
	spin_unlock_irqrestore(&as_lock, flags);
}

/*
 *	purge_addresses
 *
 *	Detach the whole deferred-release list under as_lock, then
 *	vfree() every queued mapping and kfree() its list entry with
 *	the lock dropped.
 */
STATIC void
purge_addresses(void)
{
	unsigned long irq_state;
	a_list_t	*cur, *following;

	/* unlocked peek: nothing queued, nothing to do */
	if (as_free_head == NULL)
		return;

	spin_lock_irqsave(&as_lock, irq_state);
	cur = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock_irqrestore(&as_lock, irq_state);

	for (; cur != NULL; cur = following) {
		/* fetch the link before the entry is freed */
		following = cur->next;
		vfree(cur->vm_addr);
		kfree(cur);
	}
}
#endif

/*
 *	Locking model:
 *
 *	Buffers associated with inodes for which buffer locking
 *	is not enabled are not protected by semaphores, and are
 *	assumed to be exclusively owned by the caller.  There is a
 *	spinlock in the buffer, for use by the caller when concurrent
 *	access is possible.
 */

/*
 *	Internal pagebuf object manipulation
 */

/*
 *	_pagebuf_get_object
 *
 *	Allocate a page_buf_t from pagebuf_cache and initialize it, with
 *	no other operations implied (no pages are looked up or attached).
 *
 *	ip		inode the buffer targets; its superblock supplies
 *			the sector size and sector shift
 *	range_base	file offset of the start of the buffer
 *	range_length	length of the buffer in bytes
 *	flags		PBF_DONT_BLOCK selects the SLAB_PAGE_IO allocation
 *			mode; _PBF_LOCKABLE makes the buffer semaphore get
 *			initialized in the held state.  PBF_MAPPED and
 *			PBF_DONT_BLOCK are not copied into pb_flags.
 *	pb_p		receives the new buffer on success
 *
 *	Returns 0 on success or -ENOMEM if the allocation fails.  The new
 *	buffer has a hold count of 1 and is in the PBF_NONE state.
 */

int
_pagebuf_get_object(
    struct inode *ip,
    loff_t range_base,
    size_t range_length,
    page_buf_flags_t flags,
    page_buf_t ** pb_p)
{
	page_buf_t *pb;

	pb = kmem_cache_alloc(pagebuf_cache,
		(flags & PBF_DONT_BLOCK) ? SLAB_PAGE_IO : SLAB_KERNEL);
	if (pb == NULL)
		return (-ENOMEM);

#ifdef PB_TRACKING
	{
	int	i;

	/* test the index first so pb_array[MAX_PB] is never read */
	for (i = 0; (i < MAX_PB) && (pb_array[i] != 0); i++);
	if (i == MAX_PB) 
		printk("pb 0x%p not recorded in pb_array\n", pb);
	else 
		pb_array[i] = pb;
	}
#endif

	/* the cache object is a page_buf_private_t; clear all of it */
	bzero(pb, sizeof(page_buf_private_t));
	pb->pb_hold = 1;
	spin_lock_init(&PBP(pb)->pb_lock);
	init_MUTEX_LOCKED(&PBP(pb)->pb_iodonesema);
	INIT_LIST_HEAD(&pb->pb_list);
	/* bit test: only lockable buffers get their semaphore set up */
	if (flags & _PBF_LOCKABLE)
		init_MUTEX_LOCKED(&PBP(pb)->pb_sema); /* held, no waiters */
	PB_SET_OWNER(pb);
	pb->pb_sector = ip->i_sb->s_blocksize;
	pb->pb_sector_bits = ip->i_sb->s_blocksize_bits;
	pb->pb_target = ip;
	pb->pb_file_offset = range_base;
	pb->pb_buffer_length = pb->pb_count_desired = range_length; 
	/* set buffer_length and count_desired to the same value initially 
	 * io routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery)
	 */
	pb->pb_flags = (flags & ~(PBF_MAPPED|PBF_DONT_BLOCK)) | PBF_NONE;
	pb->pb_bn = PAGE_BUF_DADDR_NULL;
	atomic_set(&PBP(pb)->pb_pin_count, 0);
	init_waitqueue_head(&PBP(pb)->pb_waiters);
	
	*pb_p = pb;

	PB_STATS_INC(pb_create);
	PB_TRACE(pb, PB_TRACE_REC(get), ip);
	return (0);
}

/*
 * Make sure the pagebuf has a kiobuf able to hold page_count pages.
 *
 * A pagebuf that already has a kiobuf (pb_mem != NULL) is left
 * untouched.  Otherwise one is allocated, expanded to page_count
 * entries and primed with the page count, the offset of the buffer
 * within its first page and the desired byte count.  PBF_DONT_BLOCK
 * in flags selects SLAB_PAGE_IO for the allocations, otherwise
 * SLAB_KERNEL is used.
 *
 * Returns 0 on success, or the error from _alloc_kiovec /
 * _expand_kiobuf, in which case pb_mem is left NULL.
 */
STATIC int
_pagebuf_get_kiovec(page_buf_t * pb, int page_count, int flags)
{
	struct kiobuf *kiobuf;
	int	alloc_mask;
	int	error;

	/* already have a page list */
	if (pb->pb_mem != NULL)
		return (0);

	if (flags & PBF_DONT_BLOCK)
		alloc_mask = SLAB_PAGE_IO;
	else
		alloc_mask = SLAB_KERNEL;

	error = _alloc_kiovec(1, &pb->pb_mem, alloc_mask);
	if (error != 0)
		return (error);

	error = _expand_kiobuf(pb->pb_mem, page_count, alloc_mask);
	if (error != 0) {
		/* could not grow it; give the kiobuf back */
		free_kiovec(1, &pb->pb_mem);
		pb->pb_mem = NULL;
		return (error);
	}

	kiobuf = pb->pb_mem;
	kiobuf->nr_pages = page_count;
	kiobuf->offset = page_buf_poff(pb->pb_file_offset);
	kiobuf->length = pb->pb_count_desired;

	return (0);
}

/*
 * Drop the page cache reference on every page recorded in a kiobuf,
 * clearing each maplist slot before the page is released.  Empty
 * slots are skipped.
 */
STATIC inline void _kiobuf_freepages(struct kiobuf *kb)
{
	struct page *victim;
	int slot = 0;

	while (slot < kb->nr_pages) {
		victim = kb->maplist[slot];
		if (victim != NULL) {
			kb->maplist[slot] = NULL;
			page_cache_release(victim);
		}
		slot++;
	}
}


/*
 *	_pagebuf_free_object
 *
 *	_pagebuf_free_object releases the contents of the specified buffer
 *	and drops one hold on it.  The modification state of any
 *	associated pages is left unchanged.
 *
 *	The caller must hold pb_lock, taken with interrupts saved in
 *	`flags'; this routine releases the lock on every path.  On the
 *	first call the virtual mapping (if this module allocated one) and
 *	the pages / kiobuf (if _PBF_MEM_ALLOCATED) are released and the
 *	buffer is marked PBF_FREED.  If pb_hold is still non-zero after
 *	being decremented, the page_buf_t itself is not freed, although it
 *	remains marked as having been freed.
 */

void _pagebuf_free_object(
	page_buf_t * pb,	/* buffer to deallocate         */
	unsigned long flags)	/* interrupt state to restore	*/
{
	struct kiobuf *kb;
#ifdef REMAPPING_SUPPORT
	void *vaddr = NULL;
#endif

	PB_TRACE(pb, PB_TRACE_REC(free_obj), 0);
	if (!(pb->pb_flags & PBF_FREED)) {
#ifdef REMAPPING_SUPPORT
		/* release any virtual mapping */
		if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
			vaddr = pagebuf_mapout_locked(pb);
#endif

		if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
			if (pb->pb_mem) {
				/* release the pages in the address list */
				kb = pb->pb_mem;
				if (kb->maplist[0] && 
				    PageSlab(kb->maplist[0])) {
					/*
					 * This came from the slab
					 * allocator free it as such
					 */
					kfree(pb->pb_addr);
				} else {
					_kiobuf_freepages(kb);
				}

				/* release the address list itself */
				free_kiovec(1, &pb->pb_mem);
				pb->pb_mem = NULL;
			}
			pb->pb_flags &= ~_PBF_MEM_ALLOCATED;
		}

		pb->pb_flags |= PBF_FREED;
	}
	pb->pb_hold--;
	if (pb->pb_hold == 0) {
		/* Drop the spinlock before calling free lockable,
		 * as it needs to get the pbr_lock. We have set
		 * PBF_FREED, so anyone doing a lookup should
		 * skip this pagebuf.
		 *
		 * NOTE(review): the lock is dropped here without
		 * restoring the saved interrupt state, and `flags' is
		 * only handed on for _PBF_LOCKABLE buffers - confirm
		 * that the interrupt state is restored for buffers
		 * which are not lockable.
		 */
		spin_unlock(&PBP(pb)->pb_lock);
		if (pb->pb_flags & _PBF_LOCKABLE)
			_pagebuf_free_lockable_buffer(pb, flags);
		kmem_cache_free(pagebuf_cache, pb);
#ifdef PB_TRACKING
		{
		int	i;

		/*
		 * Only the pointer value is compared here, the freed
		 * object is not touched.  Test the index first so
		 * pb_array[MAX_PB] is never read.
		 */
		for (i = 0; (i < MAX_PB) && (pb_array[i] != pb); i++);
		if (i < MAX_PB)
			pb_array[i] = NULL;
		else 
			printk("Freed unmonitored pagebuf 0x%p\n", pb);
		}
#endif
	} else {
		spin_unlock_irqrestore(&PBP(pb)->pb_lock, flags);
	}

#ifdef REMAPPING_SUPPORT
	/* queue the old mapping for release once the lock is dropped */
	if (vaddr) {
		free_address(vaddr);
	}
#endif
}


/*
 *	_pagebuf_lookup_pages
 *
 *	_pagebuf_lookup_pages finds all pages which match the buffer 
 *	in question and the range of file offsets supplied, 
 *	and builds the page list for the buffer, if the
 *	page list is not already formed or if not all of the pages are
 *	already in the list. Invalid pages (pages which have not yet been
 *	read in from disk) are assigned for any pages which are not found.
 *
 *	pb	buffer whose page list is to be built
 *	flags	PBF_DONT_BLOCK / PBF_MAPPABLE choose the page allocation
 *		mask; PBF_MAPPED requests a virtual mapping of the pages;
 *		PBF_READ enables the keep-page-locked case for buffers
 *		smaller than a page
 *	pages	optional array of pages supplied by the caller; when
 *		non-NULL these are installed in the page list instead of
 *		looking pages up, and each must already be locked
 *
 *	Returns 0 on success, the error from _pagebuf_get_kiovec if the
 *	page list could not be allocated, or -ENOMEM if a page could not
 *	be allocated (the page list is then only partially filled in).
 *	On return pb_flags reflects what was found: PBF_NONE is cleared
 *	if any page was up to date and PBF_PARTIAL is set if only some
 *	were.
 */

int
_pagebuf_lookup_pages(
    page_buf_t * pb,
    page_buf_flags_t flags,
    struct page **pages)
{
	loff_t next_buffer_offset;
	unsigned long page_count;
	int rval = 0;
	struct kiobuf *kp;
	unsigned long pi;
	unsigned long index;
	int all_mapped, good_pages;
	struct page *cp, **hash, *cached_page;
	int gfp_mask;

	/* For pagebufs where we want to map an address, do not use
	 * highmem pages - so that we do not need to use kmap resources
	 * to access the data.
	 *
	 * For pages where the caller has indicated there may be resource
	 * contention (e.g. called from a transaction) do not flush
	 * delalloc pages to obtain memory.
	 */

	if (flags & PBF_DONT_BLOCK) {
		gfp_mask = GFP_PAGE_IO;
	} else if (flags & PBF_MAPPABLE) {
		gfp_mask = GFP_KERNEL;
	} else {
		gfp_mask = GFP_HIGHUSER;
	}

	next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;

	/* number of pages spanned by the buffer; good_pages starts out
	 * optimistic and is decremented for each page found not up to date
	 */
	good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
	    page_buf_btoct(pb->pb_file_offset));

	/* Page list already complete: the only work left is to create a
	 * mapping if one is wanted now and does not exist yet.
	 */
	if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
		if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
			all_mapped = 1;
			kp = pb->pb_mem;
			goto mapit;
		}
		return (0);
	}

	/* assure that we have a page list */
	rval = _pagebuf_get_kiovec(pb, page_count, flags);
	if (rval != 0)
		return (rval);

	kp = pb->pb_mem;

	rval = pi = 0;
	cached_page = NULL;
	/* enter the pages in the page list */
	index = (pb->pb_file_offset - kp->offset) >> PAGE_CACHE_SHIFT;
	for (all_mapped = 1; pi < page_count; pi++, index++) {
		if (pages) {
			/* caller-supplied pages must arrive locked */
			if (!PageLocked(*pages))
				PAGE_BUG(*pages);
			kp->maplist[pi] = *pages++;
			continue;
		}
		if (kp->maplist[pi] == 0) {
			hash = page_hash(PB_ADDR_SPACE(pb), index);
		      retry:
			cp = __find_lock_page(PB_ADDR_SPACE(pb), index, hash);
			if (!cp) {
				PB_STATS_INC(pb_page_alloc);
				if (!cached_page) {
					/* allocate a new page */
					cached_page = alloc_pages(gfp_mask, 0);

					if (!cached_page) {
						/* leave this slot empty and
						 * carry on; the error is
						 * returned at the end
						 */
						rval = -ENOMEM;
						all_mapped = 0;
						continue;
					}
				}
				cp = cached_page;
				/* a non-zero return sends us back to look
				 * the page up again; the spare page is kept
				 * for a later slot
				 */
				if (add_to_page_cache_unique(cp,
					PB_ADDR_SPACE(pb), index, hash))
					goto retry;
				cached_page = NULL;
			} else {
				PB_STATS_INC(pb_page_found);
			}

			kp->maplist[pi] = cp;
		} else {
			/* page already in the list: just get it locked */
			cp = kp->maplist[pi];
			while (TryLockPage(cp)) {
				___wait_on_page(cp);
			}
		}

		/* Test for the page being valid. There is a special case
		 * in here for the case where we are reading a pagebuf
		 * smaller than a page. We want to populate the whole page
		 * here rather than just the part the caller wanted. That
		 * way we do not need to deal with partially valid pages.
		 * We keep the page locked, and in the read path fake out
		 * the lower layers to issue an I/O for the whole page.
		 */
		if (!Page_Uptodate(cp)) {
			good_pages--;
			if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
			    (flags & PBF_READ) && !PageSlab(cp)) {
				kp->locked = 1;
			}
		}
		if (!kp->locked)
			UnlockPage(cp);
	}
	/* release the spare page if it was never inserted */
	if (cached_page)
		page_cache_free(cached_page);

	/*
	 * Pages supplied by the caller are locked,
	 * so is the kiobuf ...
	 */
	if (pages)
		kp->locked = 1;


mapit:
	pb->pb_flags |= _PBF_MEM_ALLOCATED;
	if (all_mapped) {
		pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
		/* A single page buffer is always mappable */
		if (page_count == 1) {
			pb->pb_addr =
			    (caddr_t) page_address(kp->maplist[0]) + kp->offset;	
			pb->pb_flags |= PBF_MAPPED;
#ifdef REMAPPING_SUPPORT
		} else if ((flags & PBF_MAPPED) && (kp->offset == 0)) {
			/* trim the backlog of old mappings awaiting release
			 * before creating another one
			 */
			if (as_list_len > 64)
				purge_addresses();
			pb->pb_addr = (caddr_t) remap_page_array(kp->maplist,
			    page_count, gfp_mask);
			/* if no address came back the buffer is simply
			 * left unmapped; no error is returned
			 */
			if (pb->pb_addr) {
				pb->pb_flags |= PBF_MAPPED |
				    _PBF_ADDR_ALLOCATED;
			}
		}
#else
		} else if (flags & PBF_MAPPED) {
			printk("request for a mapped pagebuf > page size\n");
			BUG();
		}
#endif
	}
	/* If some pages were found with data in them
	 * we are not in PBF_NONE state.
	 */
	if (good_pages != 0) {
		pb->pb_flags &= ~(PBF_NONE);
		if (good_pages != page_count) {
			pb->pb_flags |= PBF_PARTIAL;
		}
	}

	PB_TRACE(pb, PB_TRACE_REC(look_pg), good_pages);

	return (rval);
}


/*
 *	Finding and Reading Buffers 
 */

/*
 *	pagebuf_find
 *
 *	pagebuf_find looks up an existing buffer covering the given
 *	range of the given inode.  The lookup is only attempted when
 *	_pagebuf_check_lockable() returns zero for the inode; it is then
 *	delegated, together with the caller's flags, to
 *	_pagebuf_find_lockable_buffer().
 *
 *	Returns the buffer found, or NULL if the inode does not pass the
 *	lockable check or no buffer was handed back.  The status code of
 *	the lookup itself is not reported to the caller.
 */

page_buf_t *pagebuf_find(	/* find buffer for block if     */
				/* the block is in memory       */
    struct inode * ip,		/* inode for block              */
    loff_t ioff,		/* starting offset of range     */
    size_t isize,		/* length of range              */
    page_buf_flags_t flags)	/* PBF_LOCK, PBF_ALWAYS_ALLOC   */
{
	page_buf_registration_t *registration;
	page_buf_t *found = NULL;

	if (_pagebuf_check_lockable(ip, &registration) == 0) {
		/* only the buffer pointer matters; the status is dropped */
		(void) _pagebuf_find_lockable_buffer(ip, registration,
		    ioff, isize, flags, &found);
	}

	/* still NULL unless the lookup filled it in */
	return (found);
}


/*
 *	pagebuf_get
 *
 *	pagebuf_get assembles a buffer covering the specified range.
 *	Some or all of the blocks in the range may be valid.  The file
 *	system may use pagebuf_segment to visit the various segments
 *	of the buffer.  Storage in memory for all portions of the
 *	buffer will be allocated, although backing storage may not be. 
 *
 *	Both ioff and isize must be multiples of the inode's superblock
 *	block size; a misaligned request is reported and hits BUG().
 *	ip is dereferenced and so must not be NULL.
 *
 *	If PBF_READ is set in flags and the buffer is not completely
 *	valid (PBF_NOT_DONE), a read is started with pagebuf_iostart.
 *	If the data is already there, an asynchronous (read-ahead)
 *	request drops the buffer again and returns NULL, while any other
 *	request just has PBF_READ cleared from the buffer flags.
 *
 *	Returns the buffer, or NULL if it could not be obtained, its
 *	pages could not be set up, or a read-ahead was already satisfied.
 */

page_buf_t *pagebuf_get(	/* allocate a buffer            */
    struct inode * ip,		/* inode for buffer             */
    loff_t ioff,		/* starting offset of range     */
    size_t isize,		/* length of range              */
    page_buf_flags_t flags) 	/* PBF_LOCK, PBF_TRYLOCK, PBF_READ, */
				/* PBF_LONG_TERM, PBF_SEQUENTIAL, */
				/* PBF_MAPPED */
{
	page_buf_registration_t *reg;
	int rval;
	page_buf_t *pb;

	/* Enforce alignment of pagebufs on sector boundaries */
	if ((ioff & (ip->i_sb->s_blocksize - 1)) ||
	    (isize & (ip->i_sb->s_blocksize - 1))) {
		/* cast so the arguments match the conversions exactly:
		 * size_t is not an int, and loff_t is printed as long long
		 */
		printk("Bad alignment of pagebuf offset %Ld size %lu\n",
		    (long long) ioff, (unsigned long) isize);

		BUG();
	}

	if (!_pagebuf_check_lockable(ip, &reg)) {
		rval = _pagebuf_get_lockable_buffer(ip,
		    reg, ioff, isize, flags, &pb);
	} else {
		rval = _pagebuf_get_object(ip, ioff, isize, flags, &pb);
	}

	if (rval != 0)
		return (NULL);

	PB_STATS_INC(pb_get);

	/* fill in any missing pages */
	rval = _pagebuf_lookup_pages(pb, flags, NULL);
	if (rval != 0) {
		pagebuf_free(pb);
		return (NULL);
	}

	/* Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */

	pb->pb_bn = pb->pb_file_offset >> PB_SECTOR_BITS(pb);
	pb->pb_count_desired = pb->pb_buffer_length;

	if (flags & PBF_READ) {
		if (PBF_NOT_DONE(pb)) {
			PB_TRACE(pb, PB_TRACE_REC(get_read), flags);
			pagebuf_iostart(pb, flags);
		} else if (flags & PBF_ASYNC) {
			/* Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			if (flags & (PBF_LOCK | PBF_TRYLOCK))
				pagebuf_unlock(pb);
			pagebuf_rele(pb);
			return NULL;
		} else {
			/* We do not want read in the flags */
			pb->pb_flags &= ~PBF_READ;
		}
	}

	PB_TRACE(pb, PB_TRACE_REC(get_obj), flags);

	return (pb);
}

/*
 * Start a read-ahead of the given range, but only when
 * free_shortage() reports no shortage; otherwise the request is
 * silently dropped.  The read is issued through pagebuf_get() with
 * PBF_TRYLOCK, PBF_READ, PBF_ASYNC and PBF_MAPPABLE added to the
 * caller's flags, and whatever pagebuf_get() returns is ignored.
 */
void
pagebuf_readahead(
    struct inode * ip,		/* inode for buffer (or NULL)   */
    loff_t ioff,		/* starting offset of range     */
    size_t isize,		/* length of range              */
    int	   flags)		/* extra flags for the read	*/
{
	/* short on memory: skip the read-ahead altogether */
	if (free_shortage())
		return;

	(void)pagebuf_get(ip, ioff, isize,
		flags | PBF_TRYLOCK | PBF_READ | PBF_ASYNC | PBF_MAPPABLE);
}

/*
 * Allocate a lockable pagebuf with offset 0 and length 0 and no
 * storage attached.  Returns NULL if the object cannot be allocated.
 * The `sleep' argument is not used.
 */
page_buf_t *
pagebuf_get_empty(int sleep, struct inode * ip)
{
	page_buf_t *empty;
	int lockable = _PBF_LOCKABLE;

	if (_pagebuf_get_object(ip, 0, 0, lockable, &empty) != 0)
		return (NULL);

	return (empty);
}

/*
 * Point a pagebuf at a caller-supplied region of kernel virtual
 * memory.
 *
 *	pb	buffer to attach the memory to
 *	mem	start of the region; need not be page aligned
 *	len	length of the region in bytes
 *
 * The buffer's kiobuf is allocated if it does not have one yet, and
 * its page list is filled with the page behind every page-sized step
 * of [mem, mem + len).  The number of pages is computed from the page
 * aligned bounds of the region, so a region which starts part way
 * into a page and therefore touches one more page is fully accounted
 * for.  The buffer length and desired count are set to len and the
 * buffer is marked PBF_MAPPED.
 *
 * Returns 0 on success or the error from _pagebuf_get_kiovec.
 */
int
pagebuf_associate_memory(
	page_buf_t *pb,
	void *mem,
	size_t len)
{
	int	rval;
	int	page_count;
	struct kiobuf *kbp;
	int i = 0;
	size_t ptr; 
	size_t end;

	/* page aligned bounds of the region */
	ptr = (size_t) mem & PAGE_CACHE_MASK;
	end = PAGE_CACHE_ALIGN((size_t) mem + len);

	/* the first page is always entered below, so ask for at least one */
	page_count = (end - ptr) >> PAGE_CACHE_SHIFT;
	if (page_count < 1)
		page_count = 1;

	pb->pb_addr = mem;
	rval = _pagebuf_get_kiovec(pb, page_count, 0);
	if (0 != rval) {
		return (rval);
	}
	kbp = pb->pb_mem;

	/* set up first page */
	kbp->offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
	kbp->length = len;
	kbp->maplist[i] = virt_to_page(mem);

	ptr += PAGE_CACHE_SIZE;
	kbp->nr_pages = ++i;
	/* then every following page up to the aligned end */
	while (ptr < end) {
		kbp->maplist[i] = virt_to_page(ptr);
		kbp->nr_pages = ++i;
		ptr += PAGE_CACHE_SIZE;
	}
	kbp->locked = 0;

	pb->pb_count_desired = pb->pb_buffer_length = len;
	pb->pb_flags |= PBF_MAPPED;

	return 0;
}

int pbgnd_debug = 0;

/*
 * Allocate a lockable pagebuf backed by kmalloc'ed memory rather
 * than by page cache pages.
 *
 *	len	size of the buffer in bytes; requests above 0x20000
 *		are refused
 *	ip	inode the buffer is created against
 *
 * The memory must start on an address with none of the BBMASK bits
 * set.  If an allocation does not, it is freed and the attempt is
 * repeated with the allocation size doubled, until an aligned block
 * is obtained or kmalloc fails.  The buffer itself is always set up
 * for `len' bytes.
 *
 * Returns the new buffer or NULL on any failure.
 */
page_buf_t *
pagebuf_get_no_daddr(size_t len, struct inode * ip)
{
	page_buf_t *pb;
	void *block = NULL;
	size_t alloc_len = 0;
	int error;
	int flags = _PBF_LOCKABLE | PBF_FORCEIO;

	if (len > 0x20000)
		return(NULL);

	error = _pagebuf_get_object(ip, 0, len, flags, &pb);
	if (error != 0)
		return (NULL);

	for (;;) {
		if (alloc_len != 0) {
			/* previous try was misaligned: free it, go bigger */
			kfree(block);
			alloc_len <<= 1;
		} else {
			/* first time */
			alloc_len = len;
		}

		block = kmalloc(alloc_len, GFP_KERNEL);
		if (block == 0) {
			pagebuf_free(pb);
			return (NULL);
		}

		/* done once the address is unchanged by masking */
		if ((size_t)block == ((size_t)block & (size_t)~BBMASK))
			break;
	}

	error = pagebuf_associate_memory(pb, block, len);
	if (error != 0) {
		kfree(block);
		pagebuf_free(pb);
		return (NULL);
	}

	/* otherwise pagebuf_free just ignores it */
	pb->pb_flags |= _PBF_MEM_ALLOCATED;

	PB_TRACE(pb, PB_TRACE_REC(no_daddr), block);

	return (pb);
}


/*
 *	pagebuf_hold
 *
 *	Take an additional reference on a buffer, so that it stays
 *	around even if another thread releases (frees) it
 *	asynchronously.
 *
 *	The caller must already hold the buffer; this is asserted.
 *	A NULL buffer is ignored.
 */

void pagebuf_hold(page_buf_t * pb)
{
	unsigned long irq_state;

	if (pb == NULL)
		return;

	spin_lock_irqsave(&PBP(pb)->pb_lock, irq_state);
	assert(pb->pb_hold > 0);
	pb->pb_hold++;
	spin_unlock_irqrestore(&PBP(pb)->pb_lock, irq_state);

	PB_TRACE(pb, PB_TRACE_REC(hold), 0);
}


/*
 *	pagebuf_free
 *
 *	Release the specified buffer, leaving the modification state
 *	of any associated pages unchanged.  The buffer lock is taken
 *	here and handed to _pagebuf_free_object, which is responsible
 *	for dropping it again.
 */

void
pagebuf_free(
	page_buf_t * pb)	/* buffer to deallocate */
{
	unsigned long irq_state;

	spin_lock_irqsave(&PBP(pb)->pb_lock, irq_state);
	/* unlocks pb_lock on our behalf */
	_pagebuf_free_object(pb, irq_state);
}


/*
 *	pagebuf_rele
 * 
 *	pagebuf_rele releases a hold on the specified buffer.
 *
 *	If other holds remain, the hold count is simply decremented.
 *	If this is the last hold (pb_hold == 1):
 *	  - a pb_relse callback, if set, is called with the buffer lock
 *	    dropped and takes over the release; the hold count is not
 *	    decremented here;
 *	  - a buffer marked PBF_DELWRI is flagged PBF_ASYNC and handed
 *	    to pagebuf_delwri_queue instead of being freed;
 *	  - otherwise the buffer is freed through _pagebuf_free_object,
 *	    which also drops the buffer lock.
 */

void pagebuf_rele(page_buf_t * pb)
{
	int	do_free;	/* still responsible for freeing, lock still held */
	unsigned long flags;

	PB_TRACE(pb, PB_TRACE_REC(rele), pb->pb_relse);
	spin_lock_irqsave(&PBP(pb)->pb_lock, flags);

	if (pb->pb_hold == 1) {
		do_free = 1;
		if (pb->pb_relse) {
			/* the callback runs without the buffer lock */
			spin_unlock_irqrestore(&PBP(pb)->pb_lock, flags);
			(*(pb->pb_relse)) (pb);
			do_free = 0;
		}
		/* NOTE(review): when pb_relse was called above, pb is
		 * examined and modified here after the callback and
		 * without the lock - confirm the callback can never
		 * free the buffer.
		 */
		if (pb->pb_flags & PBF_DELWRI) {
			pb->pb_flags |= PBF_ASYNC;
			/* lock is only still held if no callback ran */
			if (do_free)
				spin_unlock_irqrestore(&PBP(pb)->pb_lock,flags);
			pagebuf_delwri_queue(pb, 0);
			do_free = 0;
		}

		if (do_free) {
			/* drops pb_lock for us */
			_pagebuf_free_object(pb, flags);
		}
	} else {
		pb->pb_hold--;
		spin_unlock_irqrestore(&PBP(pb)->pb_lock, flags);
	}
}


/*
 *	Pinning Buffer Storage in Memory
 */

/*
 *	pagebuf_pin
 *
 *	pagebuf_pin pins the memory represented by a buffer by adding
 *	one to the buffer's pin count.  Pin and unpin requests are
 *	counted, so they may be nested.  The file system should use
 *	pagebuf_pin when it wants to be assured that no attempt will be
 *	made to force the affected memory to disk.
 *
 *	Pinning does not assure that a given logical page will not be
 *	moved to a different physical page; in general only the
 *	raw_count field of mem_map_t can assure that.
 */

void
pagebuf_pin(
	page_buf_t * pb)	/* buffer to pin */
{
	atomic_inc(&PBP(pb)->pb_pin_count);

	PB_TRACE(pb, PB_TRACE_REC(pin), PBP(pb)->pb_pin_count.counter);
}


/*
 *	pagebuf_unpin
 *
 *	pagebuf_unpin undoes one pagebuf_pin by dropping the buffer's
 *	pin count by one.  When the count reaches zero, anyone sleeping
 *	on the buffer's wait queue (see pagebuf_wait_unpin) is woken.
 */

void
pagebuf_unpin(
	page_buf_t * pb)	/* buffer to unpin */
{
	/* last pin gone: let the waiters run */
	if (atomic_dec_and_test(&PBP(pb)->pb_pin_count))
		wake_up(&PBP(pb)->pb_waiters);

	PB_TRACE(pb, PB_TRACE_REC(unpin), PBP(pb)->pb_pin_count.counter);
}

/*
 * Return the buffer's current pin count (zero when it is not pinned).
 */
int
pagebuf_ispin(page_buf_t *pb)
{
	return (atomic_read(&PBP(pb)->pb_pin_count));
}
/*
 *	pagebuf_wait_unpin
 *
 *	pagebuf_wait_unpin sleeps, uninterruptibly, until the buffer's
 *	pin count has dropped to zero.  It returns immediately if the
 *	buffer is not pinned.  Before each sleep the tq_disk task queue
 *	is run.
 */

static inline void	_pagebuf_wait_unpin(page_buf_t * pb)
{
	DECLARE_WAITQUEUE(waiter, current);

	/* fast path: not pinned at all */
	if (atomic_read(&PBP(pb)->pb_pin_count) == 0)
		return;

	add_wait_queue(&PBP(pb)->pb_waiters, &waiter);

	/*
	 * The task state is always set before the count is sampled, so
	 * a wake_up arriving between the test and schedule() is not lost.
	 */
	current->state = TASK_UNINTERRUPTIBLE;
	while (atomic_read(&PBP(pb)->pb_pin_count) != 0) {
		run_task_queue(&tq_disk);
		schedule();
		current->state = TASK_UNINTERRUPTIBLE;
	}

	remove_wait_queue(&PBP(pb)->pb_waiters, &waiter);
	current->state = TASK_RUNNING;
}

void
pagebuf_wait_unpin(		/* wait for buffer to be unpinned */
	page_buf_t * pb)	/* buffer for which to wait       */
{
	_pagebuf_wait_unpin(pb);
}

/*
 * 	Buffer Utility Routines 
 */

/*
 *	pagebuf_geterror
 *
 *	Return the error code recorded in the buffer (its pb_error
 *	field), which is 0 when no error has been stored.
 */

int
pagebuf_geterror(		/* return buffer error */
	page_buf_t * pb)	/* buffer              */
{
	return pb->pb_error;
}


/*
 *	pagebuf_iodone
 *
 *	pagebuf_iodone marks a buffer, for which I/O is in progress, as
 *	done with respect to that I/O:
 *
 *	  - PBF_READ and PBF_WRITE are cleared; if no error is recorded
 *	    PBF_PARTIAL and PBF_NONE are cleared as well;
 *	  - an asynchronous buffer still linked on a list through
 *	    pb_list is taken off the delayed write queue;
 *	  - if the buffer has a pb_iodone routine it is called and
 *	    nothing further is done here;
 *	  - otherwise an asynchronous buffer is unlocked (when lockable
 *	    and without a pb_relse routine) and released, while a
 *	    synchronous one has its I/O-done semaphore signalled.
 */

void
pagebuf_iodone(			/* mark buffer I/O complete */
	page_buf_t * pb)	/* buffer to mark           */
{
	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
	if (pb->pb_error == 0)
		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);

	PB_TRACE(pb, PB_TRACE_REC(done), pb->pb_iodone);

	/* If we were on the delwri list, dequeue if there is no one waiting */
	if (pb->pb_flags & PBF_ASYNC) {
		if (pb->pb_list.next != &pb->pb_list)
			pagebuf_delwri_dequeue(pb);
	}

	/* a completion routine takes over from here */
	if (pb->pb_iodone) {
		(*(pb->pb_iodone)) (pb);
		return;
	}

	/* synchronous: wake whoever is waiting for the I/O */
	if (!(pb->pb_flags & PBF_ASYNC)) {
		up(&PBP(pb)->pb_iodonesema);
		return;
	}

	/* asynchronous: nobody is waiting, so let go of the buffer */
	if (!pb->pb_relse && (pb->pb_flags & _PBF_LOCKABLE))
		pagebuf_unlock(pb);
	pagebuf_rele(pb);
}

/*
 *	pagebuf_ioerror
 *
 *	Store an error code in the buffer.  Passing 0 records that
 *	there is no error.
 */

void
pagebuf_ioerror(		/* mark buffer in error (or not) */
	page_buf_t * pb,	/* buffer to mark                */
	int serror)		/* error to store (0 if none)    */
{
	pb->pb_error = serror;

	PB_TRACE(pb, PB_TRACE_REC(ioerror), serror);
}

/*
 *	pagebuf_iostart
 *
 *	pagebuf_iostart initiates I/O on a buffer, based on the flags
 *	supplied.
 *
 *	A PBF_DELWRI request performs no I/O: the buffer's flags are
 *	updated and it is put on the delayed write queue.
 *
 *	Otherwise the buffer must have a disk address (a buffer whose
 *	pb_bn is PAGE_BUF_DADDR_NULL hits BUG()).  Writes are started
 *	through __pagebuf_iorequest, which checks for a filesystem
 *	specific callout; everything else goes through
 *	pagebuf_iorequest.  Unless PBF_ASYNC is given, the routine then
 *	waits for the I/O to finish and, for writes, releases the
 *	buffer.
 *
 *	Returns 0 for a delayed write, otherwise the status of the
 *	request or, for synchronous I/O that was started successfully,
 *	the status from pagebuf_iowait.
 */

int pagebuf_iostart(		/* start I/O on a buffer          */
    page_buf_t * pb,		/* buffer to start                */
    page_buf_flags_t flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
				/* PBF_WRITE, PBF_ALLOCATE,       */
				/* PBF_DELWRI, PBF_SEQUENTIAL,	  */
				/* PBF_SYNC, PBF_DONT_BLOCK       */
				/* PBF_RELEASE			  */
{
	int error;

	PB_TRACE(pb, PB_TRACE_REC(iostart), flags);

	/* delayed write: just queue it, nothing is issued now */
	if (flags & PBF_DELWRI) {
		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
		pb->pb_flags |= flags &
				(PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
		pagebuf_delwri_queue(pb, 1);
		return 0;
	}

	/* replace the I/O related flags with the caller's */
	pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI);
	pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_SYNC);

	if (pb->pb_bn == PAGE_BUF_DADDR_NULL)
		BUG();

	/* For writes call internal function which checks for
	 * filesystem specific callout function and execute it.
	 */
	if (flags & PBF_WRITE)
		error = __pagebuf_iorequest(pb);
	else
		error = pagebuf_iorequest(pb);

	/* failed to start, or the caller does not want to wait */
	if (error != 0 || (flags & PBF_ASYNC) != 0)
		return error;

	error = pagebuf_iowait(pb);
	if (flags & (PBF_WRITE | PBF_DELWRI))
		pagebuf_rele(pb);

	return error;
}

/* Helper routines for pagebuf_iorequest */

/*
 * Per-page bookkeeping for a page whose I/O is split over several
 * buffer heads.  It is reached through bh->b_private in
 * _end_pagebuf_page_io_multi, which frees it when `remain' drops
 * to zero.
 */
typedef struct {
	page_buf_t *pb;		/* pointer to pagebuf page is within */
	int locking;		/* are pages locked */
	atomic_t remain;	/* count of remaining I/O requests */
} pagesync_t;

/*
 * Drop one outstanding-I/O reference on the pagebuf.  When the last one
 * goes away, clear the kiobuf's locked flag and run pagebuf_iodone().
 */
static inline void _pb_io_done(page_buf_t *pb)
{
	/* Other requests are still in flight: nothing more to do yet. */
	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) != 1)
		return;

	pb->pb_mem->locked = 0;
	pagebuf_iodone(pb);
}


/*
 * I/O completion routine for a page handled by a single buffer head when
 * the page itself is not locked for the I/O.
 *
 *	bh		buffer head that completed; b_private holds the
 *			owning page_buf_t
 *	uptodate	non-zero if the I/O succeeded
 *
 * The pagebuf pointer is dereferenced unconditionally, so a pagebuf must
 * always be attached.  On failure the page is flagged with PG_error, the
 * pagebuf error is set to -EIO and the page is NOT marked uptodate, so a
 * failed read can never be mistaken for valid cached data.
 */
STATIC void _end_pagebuf_page_io(struct buffer_head *bh, int uptodate)
{
	struct page *page = bh->b_page;
	page_buf_t *pb = (page_buf_t *) bh->b_private;
	int io_failed;

	mark_buffer_uptodate(bh, uptodate);
	atomic_dec(&bh->b_count);

	io_failed = !test_bit(BH_Uptodate, &bh->b_state);
	if (io_failed) {
		set_bit(PG_error, &page->flags);
		pb->pb_error = -EIO;
	}

	/* The buffer head was allocated just for this request. */
	unlock_buffer(bh);
	kmem_cache_free(bh_cachep, bh);

	/* Only advertise the page contents as valid if the I/O worked. */
	if (!io_failed)
		SetPageUptodate(page);
	_pb_io_done(pb);
}

/*
 * I/O completion routine for a page handled by a single buffer head when
 * the page was locked for the duration of the I/O.
 *
 *	bh		buffer head that completed; b_private holds the
 *			owning page_buf_t
 *	uptodate	non-zero if the I/O succeeded
 *
 * On failure the page is flagged with PG_error, the pagebuf error is set
 * to -EIO and the page is left not-uptodate.  The page is unlocked in
 * either case so waiters are released.
 */
STATIC void _end_pagebuf_page_io_locked(struct buffer_head *bh, int uptodate)
{
	struct page *page = bh->b_page;
	page_buf_t *pb = (page_buf_t *) bh->b_private;
	int io_failed;

	mark_buffer_uptodate(bh, uptodate);
	atomic_dec(&bh->b_count);

	io_failed = !test_bit(BH_Uptodate, &bh->b_state);
	if (io_failed) {
		set_bit(PG_error, &page->flags);
		pb->pb_error = -EIO;
	}

	/* The buffer head was allocated just for this request. */
	unlock_buffer(bh);
	kmem_cache_free(bh_cachep, bh);

	/* Only advertise the page contents as valid if the I/O worked. */
	if (!io_failed)
		SetPageUptodate(page);
	UnlockPage(page);
	_pb_io_done(pb);
}

/*
 * I/O completion routine used when one page is covered by several buffer
 * heads.  b_private points at the shared pagesync_t; page-level work is
 * done only by whichever fragment completes last.
 *
 *	bh		buffer head that completed
 *	uptodate	non-zero if this fragment's I/O succeeded
 *
 * A failed fragment flags the page with PG_error and sets the pagebuf
 * error to -EIO.  When the last fragment completes, the page is marked
 * uptodate only if PG_error is not set, so a partially failed read does
 * not leave the page looking valid.
 * NOTE(review): PG_error is not cleared anywhere in this file's visible
 * I/O path; confirm a stale flag from an earlier I/O cannot linger.
 */
STATIC void _end_pagebuf_page_io_multi(struct buffer_head *bh, int uptodate)
{
	pagesync_t *psync = (pagesync_t *) bh->b_private;
	page_buf_t *pb = psync->pb;
	struct page *page = bh->b_page;

	mark_buffer_uptodate(bh, uptodate);
	atomic_dec(&bh->b_count);

	if (!test_bit(BH_Uptodate, &bh->b_state)) {
		set_bit(PG_error, &page->flags);
		pb->pb_error = -EIO;
	}

	/* The buffer head was allocated just for this request. */
	unlock_buffer(bh);
	kmem_cache_free(bh_cachep, bh);

	/* Last fragment for this page: finish the page and the pagebuf. */
	if (atomic_dec_and_test(&psync->remain) == 1) {
		if (!test_bit(PG_error, &page->flags))
			SetPageUptodate(page);
		if (psync->locking)
			UnlockPage(page);
		kfree(psync);
		_pb_io_done(pb);
	}
}

/*
 * Initiate I/O on part of a page we are interested in
 *
 * This will attempt to make a request bigger than the sector
 * size if we are not running on an MD or LVM device.
 *
 * If you think this change is causing problems, initializing the
 * concat_ok variable to 0 will turn it off again.
 */
/*
 * Build and submit the buffer heads needed to transfer part of a page.
 *
 * Returns 0 on success or -ENOMEM if a buffer head could not be
 * allocated.  On the failure path nothing has been submitted: buffer
 * heads that were already built are completed by hand as failed, and if
 * none were built the pagesync_t is freed, the page is unlocked (when
 * locking is in use) and the error is recorded in the pagebuf.
 */
STATIC int
_pagebuf_page_io(
    struct page *page,		/* Page structure we are dealing with */
    page_buf_t * pb,		/* pagebuf holding it, can be NULL */
    page_buf_daddr_t bn,	/* starting block number */
    kdev_t dev,			/* device for I/O */
    size_t sector,		/* device block size */
    int sshift,			/* device block shift */
    off_t pg_offset,		/* starting offset in page */
    size_t pg_length,		/* count of data to process */
    int locking,		/* page locking in use */
    int rw)			/* read/write operation */
{
	int cnt,itr;
	pagesync_t *psync = NULL;
	struct buffer_head *bh, *bufferlist[8];
	size_t blk_length;
	int err=0;
	int concat_ok = ((MAJOR(dev) != LVM_BLK_MAJOR) && (MAJOR(dev) != MD_MAJOR));

	/* Calculate the block offsets and length we will be using */
	if (pg_offset) {
		size_t block_offset;

		/* block_offset = pg_offset modulo the sector size */
		block_offset = pg_offset >> sshift;
		block_offset = pg_offset - (block_offset << sshift);
		blk_length = (pg_length + block_offset + sector - 1) >> sshift;
	} else {
		blk_length = (pg_length + sector - 1) >> sshift;
	}

	if (concat_ok) {
		/* This should just create one buffer head */
		sector *= blk_length;
		blk_length = 1;
	}

	/*
	 * bufferlist[] is a fixed-size on-stack array; refuse to run past
	 * it rather than silently corrupting the stack.
	 */
	if (blk_length > sizeof(bufferlist) / sizeof(bufferlist[0]))
		BUG();

	/* Allocate pagesync_t and buffer heads for portions of the
	 * page which need I/O.
	 * Call generic_make_request
	 */

	if (blk_length != 1) {
		psync = (pagesync_t *) kmalloc(sizeof(pagesync_t), GFP_PAGE_IO);

		/* Ugh - out of memory condition here */
		if (psync == NULL)
			BUG();

		psync->pb = pb;
		psync->locking = locking;
		atomic_set(&psync->remain, 0);
	}

	for (cnt = 0; blk_length > 0;
	     blk_length--, pg_offset += sector) {
		bh = kmem_cache_alloc(bh_cachep, SLAB_PAGE_IO);
		if (bh == NULL){
			err = -ENOMEM;
			goto  error;
		}
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);

		/* Pick the completion handler matching how the page is tracked */
		if (psync) {
			init_buffer(bh, _end_pagebuf_page_io_multi, psync);
			atomic_inc(&psync->remain);
		} else if (locking) {
			init_buffer(bh, _end_pagebuf_page_io_locked, pb);
		} else {
			init_buffer(bh, _end_pagebuf_page_io, pb);
		}

		bh->b_size = sector;
		set_bh_page(bh, page, pg_offset);
		atomic_set(&bh->b_count, 1);
		bh->b_dev = dev;
		bh->b_blocknr = bn++;

		bh->b_rdev = bh->b_dev;
		bh->b_rsector = bh->b_blocknr;
		set_bit(BH_Lock, &bh->b_state);
		set_bit(BH_Mapped, &bh->b_state);

		if (rw == WRITE ) {
			set_bit(BH_Uptodate, &bh->b_state);
			set_bit(BH_Dirty, &bh->b_state);
		}
		bufferlist[cnt++] = bh;
	}

	if (cnt) {
		/* Indicate that there is another page in progress */
		atomic_inc(&PBP(pb)->pb_io_remaining);

		for (itr=0; itr < cnt; itr++){
			generic_make_request(rw, bufferlist[itr]);
		}
	} else {
		if (psync)
			kfree(psync);
		if (locking)
			UnlockPage(page);
	}

	return err;
error:
	if (cnt) {
		/*
		 * Some buffer heads were built (this only happens on the
		 * pagesync_t path).  Completing them by hand makes the last
		 * one unlock the page, free psync and call _pb_io_done(),
		 * which drops one pb_io_remaining reference - so take that
		 * reference first to keep the count balanced.
		 */
		atomic_inc(&PBP(pb)->pb_io_remaining);
		for (itr=0; itr < cnt; itr++) {
			atomic_set_buffer_clean (bufferlist[itr]);
			bufferlist[itr]->b_end_io(bufferlist[itr], 0);
		}
	} else {
		/*
		 * Nothing was built, so no completion handler will run:
		 * undo the setup here, mirroring the no-I/O case above.
		 */
		if (psync)
			kfree(psync);
		if (locking)
			UnlockPage(page);
		if (pb)
			pb->pb_error = err;
	}
	return err;
}

/*
 * Apply function for pagebuf_segment_apply: start the read or write for
 * one page segment of a pagebuf.
 *
 *	pb		pagebuf the segment belongs to
 *	offset		file offset of the segment
 *	page		page backing the segment (asserted non-NULL)
 *	pg_offset	offset of the segment within the page
 *	pg_length	length of the segment in bytes
 *
 * Always returns the pg_length that was passed in.
 * NOTE(review): the status returned by _pagebuf_page_io() is not examined
 * here, so an allocation failure there is not reported to the caller.
 */
STATIC int _page_buf_page_apply(
    page_buf_t * pb,
    loff_t offset,
    struct page *page,
    size_t pg_offset,
    size_t pg_length)
{
	page_buf_daddr_t bn = pb->pb_bn;
	kdev_t dev = pb->pb_target->i_dev;
	size_t sector = PB_SECTOR_SIZE(pb);
	int sshift = PB_SECTOR_BITS(pb);
	loff_t pb_offset;
	size_t	ret_len = pg_length;	/* returned unchanged, whatever I/O is issued */
	assert(page);

	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
	    (pb->pb_flags & PBF_READ) && pb->pb_mem->locked) {
		/* Sub-page read with the pages already locked: widen the
		 * request to the whole page, backing the block number up
		 * to the start of the page.
		 */
		bn -= (pb->pb_mem->offset >> sshift);
		pg_offset = 0;
		pg_length = PAGE_CACHE_SIZE;
	} else {
		/* Advance the block number by this segment's distance from
		 * the start of the pagebuf, rounded up to whole sectors.
		 */
		pb_offset = offset - pb->pb_file_offset;
		if (pb_offset) {
			bn += (pb_offset + sector - 1) >> sshift;
		}
	}

	if (pb->pb_flags & PBF_READ) {
		/* We only need to do I/O on pages which are not upto date */
		while (!Page_Uptodate(page) || (pb->pb_flags & PBF_FORCEIO)) {
			/* Attempt to lock page */
			if (pb->pb_mem->locked || !TryLockPage(page)) {
				_pagebuf_page_io(page, pb, bn, dev,
				    sector, sshift,
				    (off_t) pg_offset, pg_length, 1, READ);

				/* When doing the I/O we need to setup a
				 * completion routine which gets called for
				 * each buffer_head completion. On completion
				 * we will need to decrement a count and
				 * mark the section of the page we deal
				 * with upto date. Once the count reaches
				 * zero we can update the PG_uptodate
				 * and PG_partial fields, unlock the page
				 * and wake up page waiters.
				 *
				 * We also need to locate the pagebuf and
				 * decrement the count of pending I/O,
				 * should this count reach zero we need
				 * to call the pagebuf_iodone() function.
				 */

				break;
			} else {
				/* we could not lock the page, this means
				 * someone else is doing I/O in this page,
				 * the possibilities are that either their
				 * I/O will satisfy our I/O, or that we will
				 * still need to do I/O when they are done.
				 * There is no way to tell this until we can
				 * lock the page - ugh!
				 */
				___wait_on_page(page);
			}
		}
	} else if (pb->pb_flags & PBF_WRITE) {
		/* Pages are locked for the write only when the pagebuf
		 * itself is not lockable.
		 */
		int locking = (pb->pb_flags & _PBF_LOCKABLE) == 0;

		/* Check we need to lock pages */
		if (locking && (pb->pb_mem->locked == 0))
			lock_page(page);
		_pagebuf_page_io(page, pb, bn, dev, sector, sshift,
			    (off_t) pg_offset, pg_length, locking, WRITE);
	}

	return (ret_len);
}

/*
 *	pagebuf_iorequest
 *
 * 	pagebuf_iorequest is the core I/O request routine.
 *	It assumes that the buffer is well-formed and
 *	mapped and ready for physical I/O, unlike
 *	pagebuf_iostart() and pagebuf_iophysio().  Those
 *	routines call the inode pagebuf_ioinitiate routine to start I/O,
 *	if it is present, or else call pagebuf_iorequest()
 *	directly if the inode pagebuf_ioinitiate routine is not present.  
 *
 *	This function will be responsible for ensuring access to the 
 *	pages is restricted whilst I/O is in progress - for locking
 *	pagebufs the pagebuf lock is the mediator, for non-locking
 *	pagebufs the pages will be locked. In the locking case we
 *	need to use the pagebuf lock as multiple meta-data buffers
 *	will reference the same page.
 */

int pagebuf_iorequest(		/* start real I/O               */
	page_buf_t * pb)	/* buffer to convey to device   */
{
	int applied;

	assert(pb->pb_flags & _PBF_ALL_PAGES_MAPPED);

	PB_TRACE(pb, PB_TRACE_REC(ioreq), 0);

	/* Delayed writes are only queued; no device I/O is started here. */
	if (pb->pb_flags & PBF_DELWRI) {
		pagebuf_delwri_queue(pb, 1);
		return 0;
	}

	/* A pinned buffer must not be written until it is unpinned. */
	if (pb->pb_flags & PBF_WRITE) {
		_pagebuf_wait_unpin(pb);
	}

	/*
	 * Hold one reference of our own on the outstanding-I/O count while
	 * the per-page requests are issued, so that an early completion
	 * cannot run pagebuf_iodone() before everything has been started.
	 */
	atomic_set(&PBP(pb)->pb_io_remaining, 1);
	applied = pagebuf_segment_apply(_page_buf_page_apply, pb);

	/*
	 * Drop our reference.  If it was the last one all I/O is already
	 * finished; otherwise push the disk queue for purely synchronous
	 * requests.
	 */
	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
		pagebuf_iodone(pb);
	} else if ((pb->pb_flags & (PBF_SYNC|PBF_ASYNC)) == PBF_SYNC)  {
		run_task_queue(&tq_disk);
	}

	/* Positive results are byte counts; callers only want errors. */
	if (applied < 0)
		return applied;
	return 0;
}

/*
 *	pagebuf_iowait
 *
 *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
 *	It returns immediately if no I/O is pending.  In any case, it returns
 *	the error code, if any, or 0 if there is no error.
 */

/*
 * Runs the tq_disk task queue, then sleeps on the buffer's pb_iodonesema
 * semaphore and returns pb->pb_error as it stands after the wait.
 * NOTE(review): the semaphore is presumably released by pagebuf_iodone();
 * confirm against its definition.
 */
int pagebuf_iowait(page_buf_t * pb) /* buffer to wait on              */
{
	PB_TRACE(pb, PB_TRACE_REC(iowait), 0);
	/* Run the disk task queue before going to sleep on the result. */
	run_task_queue(&tq_disk);
	down(&PBP(pb)->pb_iodonesema);
	PB_TRACE(pb, PB_TRACE_REC(iowaited), pb->pb_error);
	return (pb->pb_error);
}


/* reverse pagebuf_mapin()      */ 
/*
 * Clear the buffer's mapping state.  Returns the address the caller
 * must free, or NULL if there is nothing to free.
 */
STATIC void
*pagebuf_mapout_locked(
   page_buf_t * pb)	/* buffer to unmap                */
{
	void *to_free = NULL;

	/* Not mapped: nothing to undo. */
	if (!(pb->pb_flags & PBF_MAPPED))
		return (to_free);

	/* Only an address we allocated ourselves needs freeing. */
	if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
		to_free = pb->pb_addr;

	pb->pb_addr = NULL;
	pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);

	/*
	 * The caller must free the address space: we are under a spin
	 * lock, and it is probably not safe to do vfree here.
	 */
	return (to_free);
}

/*
 * Return the kernel virtual address of the byte at "offset" within the
 * buffer, located through the buffer's kiobuf page list.
 */
caddr_t
pagebuf_offset(page_buf_t *pb, off_t offset)
{
	struct kiobuf *kiobuf = pb->pb_mem;
	/* Offset measured from the start of the kiobuf's first page. */
	off_t kio_off = offset + kiobuf->offset;
	struct page *page = kiobuf->maplist[kio_off >> PAGE_CACHE_SHIFT];

	return (caddr_t) page_address(page) + (kio_off & (PAGE_CACHE_SIZE - 1));
}

/*
 *	pagebuf_segment
 *
 *	pagebuf_segment is used to retrieve the various contiguous
 *	segments of a buffer.  The variable addressed by the
 *	loff_t * should be initialized to 0, and successive
 *	calls will update to point to the segment following the one
 *	returned.  pagebuf_segment returns 0 on a successful
 *	retrieval, and a negative error code on any error (including
 *	-ENOENT when the loff_t is out of range). 
 *
 *	The mem_map_t * return value may be set to NULL if the
 *	page is outside of main memory (as in the case of memory on a controller
 *	card).  The page_buf_pgno_t may be set to PAGE_BUF_PGNO_NULL
 *	as well, if the page is not actually allocated, unless the
 *	PBF_ALWAYS_ALLOC flag is set in the page_buf_flags_t,
 *	in which allocation of storage will be forced.
 */

int pagebuf_segment(		/* return next segment of buffer */
    page_buf_t * pb,		/* buffer to examine            */
    loff_t * boff_p,		/* offset in buffer of next     */
				/* segment (updated)            */
    mem_map_t ** spage_p,	/* page (updated)               */
				/* (NULL if not in mem_map[])   */
    size_t * soff_p,		/* offset in page (updated)     */
    size_t * ssize_p,		/* length of segment (updated)  */
    page_buf_flags_t flags)	/* PBF_ALWAYS_ALLOC             */
{
	struct kiobuf *kb;	/* current kiobuf               */
	loff_t kpboff;		/* offset in current kiobuf     */
	int kpi;		/* page index in current kiobuf */
	size_t slen;		/* segment length               */

	kb = pb->pb_mem;
	kpboff = *boff_p;

	/*
	 * Reject offsets outside the buffer with -ENOENT.  Without this
	 * check an offset at or past the end produces a zero or wrapped
	 * segment length, and a caller advancing by that length (such as
	 * pagebuf_iomove) never makes progress.  Nothing is written to
	 * the output arguments in this case.
	 */
	if (kpboff < 0 || kpboff >= kb->length)
		return (-ENOENT);

	kpi = page_buf_btoct(kpboff + kb->offset);
	assert(kpi < kb->nr_pages);

	*spage_p = kb->maplist[kpi];

	/* The segment runs to the end of the page or of the buffer,
	 * whichever comes first.
	 */
	*soff_p = page_buf_poff(kpboff + kb->offset);
	slen = PAGE_CACHE_SIZE - *soff_p;
	if (slen > (kb->length - kpboff))
		slen = (kb->length - kpboff);
	*ssize_p = slen;

	/* Leave the cursor at the start of the following segment */
	*boff_p = *boff_p + slen;

	return (0);
}


/*
 * Walk the byte range [boff, boff + bsize) of the buffer one page
 * segment at a time and zero it (PBRW_ZERO), copy it out to "data"
 * (PBRW_READ) or fill it from "data" (PBRW_WRITE).  Returns 0, or
 * -ENOMEM if a segment cannot be retrieved.
 */
int pagebuf_iomove(			/* move data in/out of buffer	*/
    page_buf_t		*pb,		/* buffer to process		*/
    off_t		boff,		/* starting buffer offset	*/
    size_t		bsize,		/* length to copy		*/
    caddr_t		data,		/* data address			*/
    page_buf_rw_t	mode)		/* read/write flag		*/
{
	loff_t cursor = boff;		/* next buffer offset to process */
	off_t end = boff + bsize;	/* first offset past the range	 */

	while (cursor < end) {
		struct page *seg_page;
		size_t seg_off;
		size_t seg_len;

		if (pagebuf_segment(pb, &cursor, &seg_page,
				    &seg_off, &seg_len, 0)) {
			/* XXX allocate missing page */
			return (-ENOMEM);
		}
		assert(((seg_len + seg_off) <= PAGE_CACHE_SIZE));

		if (mode == PBRW_ZERO) {
			memset(page_address(seg_page) + seg_off, 0, seg_len);
		} else if (mode == PBRW_READ) {
			memcpy(data, page_address(seg_page) + seg_off, seg_len);
		} else if (mode == PBRW_WRITE) {
			memcpy(page_address(seg_page) + seg_off, data, seg_len);
		}

		/* The caller's data pointer tracks the buffer cursor. */
		data += seg_len;
	}
	return 0;
}

#ifdef KIOBUF_IO
/*
 * I/O completion routine for kiobuf-based I/O requests.
 */
/*
 * Completion handler for a kiobuf request: records any error on the
 * owning pagebuf, releases bounce buffers, updates the state of every
 * page the transfer touched, unlocks the kiobuf pages and drops one
 * outstanding-I/O reference on the pagebuf.
 */
static void pagebuf_end_kiobuf_io(struct kiobuf *kiobuf)
{
	page_buf_t *pb = (page_buf_t *) kiobuf->k_dev_id;
	unsigned int total_bytes;	/* bytes covered by the request   */
	unsigned int done_bytes;	/* bytes accounted for so far     */
	unsigned int chunk;		/* bytes falling in current page  */
	size_t first_off;		/* offset into first I/O page     */
	struct page *page;
	int idx;			/* index into kiobuf->maplist     */
	int rw;

	if (kiobuf->errno != 0) {
             printk("pagebuf_end_kiobuf_io: got error %d\n", kiobuf->errno);
	     pb->pb_error = -EIO;
	}

	total_bytes = kiobuf->length;
	first_off = kiobuf->offset;

	/* Skip whole pages that lie before the start of the transfer. */
	idx = 0;
	while (idx < kiobuf->nr_pages && first_off >= PAGE_CACHE_SIZE) {
		first_off -= PAGE_CACHE_SIZE;
		idx++;
	}

        /* Take care of any bounce buffers allocated */
	rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
        cleanup_bounce_buffers(rw, 1, &kiobuf, total_bytes);

	/*
	 * Visit only the pages the transfer touched: flag each one with
	 * PG_error on failure, or mark it uptodate on success.
	 */
	for (done_bytes = 0; done_bytes != total_bytes;
	     done_bytes += chunk, idx++) {
		/* extent of I/O in this page */
		if (first_off != 0) {
			chunk = PAGE_CACHE_SIZE - first_off;
			if (chunk > total_bytes)
				chunk = total_bytes;
			/* only the first page starts part way in */
			first_off = 0;
		} else if ((done_bytes + PAGE_CACHE_SIZE) > total_bytes) {
			chunk = total_bytes - done_bytes;
		} else {
			chunk = PAGE_CACHE_SIZE;
		}

		/* I/O successful? */
		page = kiobuf->maplist[idx];
		if (kiobuf->errno != 0) {
			set_bit(PG_error, &page->flags);
		} else {
			SetPageUptodate(page);
		}
	}

	/* Sanity Check */
	if (idx > kiobuf->nr_pages)
		BUG();

	/* Finally, indicate kiobuf pages are unlocked. */
	if (kiobuf->locked) {
		unlock_kiovec(1, &kiobuf, kiobuf->nr_pages);
	}

	/* All kiobuf I/O done? */
	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
		pagebuf_iodone(pb);
	}
}


/*
 * Function:    pagebuf_kiobuf_io()
 *
 * Purpose:     Marshalls offset and length fields of kiobuf and locks
 *		down pages before sending it down to ll_rw_kio().
 *
 * Arguments:   rw      - read/write
 *              kiobuf  - collection of pages
 *		dev	- device against which I/O requested
 *		blocknr - dev block number at which to start I/O
 *              blksize - units (512B or other) of blocknr
 *              error   - return status
 *
 * Lock status: Assumed no lock held upon entry. Will lock down the
 *		pages against which I/O is being requested.
 *
 * Returns:     Nothing
 *
 * Notes:       This function is a composite of _page_buf_page_apply()
 *		and _pagebuf_page_io().
 *
 *		The `error' argument is passed in and set in ll_rw_kio().
 *
 *		Currently, this code does not check for valid pages in the
 *		kiobuf, but blindly issues I/O on the entire kiobuf->length.
 *		A potential optimization would be to issue multiple requests 
 *		over chunks of invalid pages, while skipping the intervening
 *		valid pages (unless (pb->flags & FORCEIO) is true). This would 
 *		be in consonance with the current I/O path thro' buffer_heads. 
 *		The multiple requests would be serialized since the same kiobuf
 *		(i.e., kiobuf->length) has to be reused for each request. Sounds
 *		like over-engineering for now.
 */
void pagebuf_kiobuf_io(int rw,
		       struct kiobuf *kb,
		       kdev_t dev, 
		       unsigned long blocknr,
		       size_t blksize,
		       int locking,
		       int *error)
{
	/* Lock the pages down first, if the caller asked for that. */
	if (locking)
		*error = lock_kiovec(1, &kb, 1);
	if (*error)
		return;

	/* Bounce ... ouch! */
	*error = setup_kiobuf_bounce_pages(kb, GFP_USER);
	if (*error == 0) {
		if (rw & WRITE)
			kiobuf_copy_bounce(kb, COPY_TO_BOUNCE, -1);

		kb->end_io = pagebuf_end_kiobuf_io;
		kb->errno = 0;
		ll_rw_kio(rw, kb, dev, blocknr, blksize, error);
	}

	if (*error == 0)
		return;

	/*
	 * Release locked pages on error.  Failed retries would have
	 * reset pgcnt and have released all the locks, so do nothing
	 * for that case.  Also release any allocated bounce buffers.
	 */
	cleanup_bounce_buffers(rw, 1, &kb, -1);
	if (locking)
		unlock_kiovec(1, &kb, kb->nr_pages);
}
#endif

/*
 *	pagebuf_segment_apply
 *
 *	pagebuf_segment_apply applies the page_buf_apply_t function
 *	to each segment of the page_buf_t.  It may be used to walk
 *	the segments of a buffer, as when building 
 *	a driver scatter-gather list.
 */

/*
 * Returns the total number of bytes reported by func (0 if the buffer
 * has no memory attached), or the first zero/negative value func
 * returned.  A reference is held on the buffer for the duration of the
 * walk and is always dropped again before returning.
 */
int pagebuf_segment_apply(	/* apply function to segments   */
    page_buf_apply_t func,	/* function to call             */
    page_buf_t * pb)		/* buffer to examine            */
{
	int buf_index;
	struct kiobuf *kb;
	int status = 0;
	int sval;
	loff_t buffer_offset = pb->pb_file_offset;
	size_t buffer_len = pb->pb_count_desired;
	size_t page_offset;
	size_t len;
	size_t total = 0;
	size_t cur_offset;
	size_t cur_len;

#ifdef KIOBUF_IO
	loff_t pb_offset;
	unsigned long blocknr;
	int error=0, rw;
	int locking;
#endif

	/*
	 * Earlier _page_buf_page_apply() would align the kiobuf length, 
	 * but pagebuf_segment_apply() didn't align the pagebuf length.
	 * -Chait.
	 */
	if((pb->pb_file_offset & (PB_SECTOR_SIZE(pb) - 1)) != 0 ||
	   (pb->pb_buffer_length & (PB_SECTOR_SIZE(pb) - 1)) != 0) {
		panic("pb offset/length unaligned!");
	}

	/*
	 * Nothing to walk if no memory is attached.  Check this before
	 * taking the hold below, so that the early return cannot leak a
	 * buffer reference.
	 */
	kb = pb->pb_mem;
	if (kb == NULL) {
		return 0;
	}

	pagebuf_hold(pb);

	cur_offset = kb->offset;
	cur_len = kb->length = buffer_len;
	/* this is assuming one kiobuf per pagebuf...
	 * if multiple this will need to be updated
	 */

#ifdef KIOBUF_IO
	/*
	 * For a READ kiobuf io can be used only if ALL
	 * pages in kiobuf are invalid. If some are valid
	 * then we will erase in-memory data such as those
	 * contained in valid HOLE or delalloc pages.
	 */
	if (IS_KIOBUFIO(pb->pb_target) &&
	    (pb->pb_buffer_length > PAGE_CACHE_SIZE) &&
	    ((pb->pb_flags & PBF_WRITE) || (pb->pb_flags & PBF_NONE))) {
		  /* Works for ide & scsi-disks, else get back -ENOSYS */
		  if ((func == _page_buf_page_apply) && error == 0) {
			/* Read/Write */
			if (pb->pb_flags & PBF_READ) {
				rw = READ;
				locking = 1;
			} else {	
				rw = WRITE;
				locking = (pb->pb_flags & _PBF_LOCKABLE) == 0;
			}
			/* Offset into kiobuf */
			pb_offset = buffer_offset - pb->pb_file_offset;
			/* 
			 * In the original code, the blocknr calc.
			 * was made to be sector aligned. This should
			 * be redundant given my alignment checks.
			 */
			blocknr = pb->pb_bn +
				(pb_offset >> PB_SECTOR_BITS(pb));
			/* Store pagebuf for completion fn. */
			kb->k_dev_id = 	pb;
			/*
			 * If ll_rw_kio() sets error we won't try this
			 * path again. Need to check for which error.
			 */
repeat:			
			atomic_inc(&PBP(pb)->pb_io_remaining);
			pagebuf_kiobuf_io(rw, kb, pb->pb_target->i_dev, blocknr,
					  PB_SECTOR_SIZE(pb), locking, &error);
			if (error == 0) {
				/* Success case - we are done */
				buffer_offset += kb->length;
				buffer_len -= kb->length;
				goto out;
			} else {
				/* Failure, either try again, or fall back
				 * to the buffer head path.
				 */
				atomic_dec(&PBP(pb)->pb_io_remaining);
				if (error == -EAGAIN) {
					error = 0;
					goto repeat;
				}
			} 
		}
	}
#endif

	/* This is the buffer head based I/O path */
	for (buf_index = 0; buf_index < kb->nr_pages; buf_index++) {
		if (cur_len == 0)
			break;
		/* Skip pages that lie wholly before the buffer's data */
		if (cur_offset >= PAGE_CACHE_SIZE) {
			cur_offset -= PAGE_CACHE_SIZE;
			continue;
		}

		/* Only the first page can start part way in */
		page_offset = cur_offset;
		cur_offset = 0;

		len = PAGE_CACHE_SIZE - page_offset;
		if (len > cur_len)
			len = cur_len;
		cur_len -= len;
		/* func probably = _page_buf_page_apply */
		sval = func(pb, buffer_offset,
			    kb->maplist[buf_index], page_offset, len);

		/* A zero or negative result stops the walk */
		if (sval <= 0) {
			status = sval;
			goto out;
		} else {
			len = sval;
			total += len;
		}

		buffer_offset += len;
		buffer_len -= len;
	}

out:
	pagebuf_rele(pb);

	if (!status)
		status = total;

	return (status);
}

/*
 * Pagebuf delayed write buffer handling
 */


/*
 * Put a buffer at the tail of the delayed-write queue and restamp its
 * flush time.  A buffer that is already queued is moved to the tail.
 * If "unlock" is set and the buffer is lockable, it is unlocked after
 * the queue has been updated.
 */
void
pagebuf_delwri_queue(page_buf_t *pb, int unlock)
{
	unsigned long irq_state;
	int already_queued;

	PB_TRACE(pb, PB_TRACE_REC(delwri_q), unlock);
	spin_lock_irqsave(&pb_daemon->pb_delwrite_lock, irq_state);

	/* A list head that does not point at itself is on the queue. */
	already_queued = (pb->pb_list.next != &pb->pb_list);
	if (!already_queued) {
		pb_daemon->pb_delwri_cnt++;
	} else {
		/* Re-queue: give back one hold, but never the last one. */
		if (unlock) {
			spin_lock(&PBP(pb)->pb_lock);
			if (pb->pb_hold > 1) pb->pb_hold--;
			spin_unlock(&PBP(pb)->pb_lock);
		}
		list_del(&pb->pb_list);
	}

	list_add_tail(&pb->pb_list, &pb_daemon->pb_delwrite_l);
	PBP(pb)->pb_flushtime = jiffies + pb_params.p_un.age_buffer;
	spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock, irq_state);

	if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
		pagebuf_unlock(pb);
	}
}

/*
 * Remove a buffer from the delayed-write queue: unlink it, reset its
 * list head so it reads as "not queued", clear PBF_DELWRI and decrement
 * the queue count, all under pb_delwrite_lock.
 * NOTE(review): there is no check that the buffer is actually queued;
 * presumably callers guarantee it, otherwise pb_delwri_cnt would drift.
 */
void
pagebuf_delwri_dequeue(page_buf_t *pb)
{
	unsigned long	flags;

	PB_TRACE(pb, PB_TRACE_REC(delwri_uq), 0);
	spin_lock_irqsave(&pb_daemon->pb_delwrite_lock, flags);
	list_del(&pb->pb_list);
	/* pagebuf_delwri_queue() tests for a self-pointing list head */
	INIT_LIST_HEAD(&pb->pb_list);
	pb->pb_flags &= ~PBF_DELWRI;
	pb_daemon->pb_delwri_cnt--;
	spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock, flags);
}

/* Defines for page buf daemon */
/* Wait queue the daemon sleeps on; pagebuf_daemon_stop() sleeps on it too */
DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);

/* Wake whoever is sleeping on pbd_waitq; installed as the daemon's timer handler */
STATIC void
pb_daemon_wakeup(void)
{
	wake_up_interruptible(&pbd_waitq);
}

/* Timer callback signature; pb_daemon_wakeup is cast to this in pagebuf_daemon() */
typedef void (*timeout_fn)(unsigned long);


/*
 * Kernel thread that writes out aged delayed-write buffers.
 *
 * While pb_daemon->active is 1 it sleeps for flush_interval, then walks
 * the delayed-write queue from the head and starts a write for every
 * unpinned delwri buffer whose flush time has passed.  The walk stops at
 * the first buffer that is still too young.  When active leaves 1, one
 * final pass is made, active is set to -1 and waiters on pbd_waitq are
 * woken.  Always returns 0.
 */
STATIC int
pagebuf_daemon(void *data)
{
	u_long		flags;
	int		count;
	page_buf_t	*pb = NULL;
	struct list_head *head, *curr;
	pagebuf_marker_t *pb_marker_ptr;
	struct timer_list pb_daemon_timer = 
		{ {NULL, NULL}, 0, 0, (timeout_fn)pb_daemon_wakeup };


	/*
	 * The marker keeps our place in the queue while the queue lock is
	 * dropped; its pb_flags of 0 is what identifies it as a marker.
	 * NOTE(review): the allocation result is not checked; a failure
	 * here would oops on the next line.
	 */
	pb_marker_ptr = kmalloc(sizeof(pagebuf_marker_t), GFP_KERNEL);
	
	pb_marker_ptr->pb_flags = 0; 

	/*  Set up the thread  */
	exit_files(current);
	daemonize();

	spin_lock_irqsave(&current->sigmask_lock, flags);	
	flush_signals(current);
	sigfillset(&current->blocked);
	recalc_sigpending(current);
	spin_unlock_irqrestore(&current->sigmask_lock, flags);

	strcpy(current->comm, "pagebuf_daemon");
	current->flags |= PF_MEMALLOC;

	do {
		if (pb_daemon->active == 1) {
			/* Re-arm the wakeup timer and sleep until it fires
			 * or someone wakes us explicitly.
			 */
			del_timer(&pb_daemon_timer);
			pb_daemon_timer.expires = jiffies + pb_params.p_un.flush_interval;
			add_timer(&pb_daemon_timer);
			interruptible_sleep_on(&pbd_waitq);
		}

		if (pb_daemon->active == 0) {
			del_timer(&pb_daemon_timer);
		}

		spin_lock_irqsave(&pb_daemon->pb_delwrite_lock, flags);

		head = curr = &pb_daemon->pb_delwrite_l;
		curr = curr->next; /* need to walk off the list head,
				    * since it just a global place holder */

		count = 0;
		while (curr != head) {
			pb = list_entry(curr, page_buf_t, pb_list);

			/*
			 * Skip other markers.
			 */
			if (pb->pb_flags == 0 ) { 
				curr = curr->next;
				continue;
			}

			PB_TRACE(pb, PB_TRACE_REC(walkq1), pagebuf_ispin(pb));

			/*
			 * Candidates are unpinned delwri buffers that are
			 * either not lockable or whose lock we obtained
			 * (pagebuf_cond_lock() returning 0).
			 */
			if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
			    (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
			     !pagebuf_cond_lock(pb))) {

				if (time_before(jiffies,
						PBP(pb)->pb_flushtime)) {
					/*
					 * Too young to flush.  Only drop the
					 * lock if one was taken above: a
					 * non-lockable buffer never went
					 * through pagebuf_cond_lock().
					 */
					if (pb->pb_flags & _PBF_LOCKABLE)
						pagebuf_unlock(pb);
					break;
				}

				pb->pb_flags &= ~PBF_DELWRI;
				pb->pb_flags |= PBF_WRITE;

				/* insert a place holder */
				list_add(&pb_marker_ptr->pb_list, curr);

				spin_unlock_irqrestore(
						&pb_daemon->pb_delwrite_lock,
						flags);

				__pagebuf_iorequest(pb);
				count++;

				spin_lock_irqsave(
						&pb_daemon->pb_delwrite_lock,
						flags);
				/*
				 * ok got the lock back; pick up the place
				 * holder and continue on
				 */
				curr = pb_marker_ptr->pb_list.next;
				list_del(&pb_marker_ptr->pb_list);

			} else {
				/* not doing anything with current...
				 * move on to the next one */
				curr = curr->next;
			}
		}

		spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock, flags);

#ifdef REMAPPING_SUPPORT
		if (as_list_len > 0)
			purge_addresses();
#endif

		/* Push the disk queue if any writes were started */
		if (count)
			run_task_queue(&tq_disk);
	} while (pb_daemon->active == 1);

	/* Tell pagebuf_daemon_stop() that we are finished */
	pb_daemon->active = -1;
	wake_up_interruptible(&pbd_waitq);
	kfree(pb_marker_ptr);

	return(0);
}


/*
 * Start writes for every delayed-write buffer belonging to "target".
 *
 *	target	only buffers whose pb_target matches are flushed
 *	flags	PBDF_TRYLOCK: skip buffers whose lock cannot be taken
 *		immediately instead of sleeping for it
 *		PBDF_WAIT: make the writes synchronous and wait for them
 *	pinptr	if non-NULL, receives the number of buffers skipped
 *		because they were pinned or could not be locked
 *
 * A marker entry (pb_flags == 0) keeps our place in the queue whenever
 * the queue lock has to be dropped.
 */
void
pagebuf_delwri_flush(struct inode *target, u_long flags, int *pinptr)
{
	page_buf_t *pb = NULL;
	struct list_head *head, *curr;
	unsigned long save;
	int locked;
	int pincount = 0;
	pagebuf_marker_t *pb_marker_ptr;


	/*
	 * NOTE(review): the allocation result is not checked; a failure
	 * here would oops on the next line, and this function has no way
	 * to report an error to its caller.
	 */
	pb_marker_ptr = kmalloc(sizeof(pagebuf_marker_t), GFP_KERNEL);

	pb_marker_ptr->pb_flags = 0;

	spin_lock_irqsave(&pb_daemon->pb_delwrite_lock, save);
	locked = 1;

	head = curr = &pb_daemon->pb_delwrite_l;
	curr = curr->next; /* need to walk off the list head,
			    * since it just a global place holder */

	while (curr != head) {
		pb = list_entry(curr, page_buf_t, pb_list);

		/*
		 * Skip other targets, markers and in progress buffers 
		 */

		if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
		    !(pb->pb_flags & PBF_DELWRI)) {
			curr = curr->next;
			continue;
		}

		PB_TRACE(pb, PB_TRACE_REC(walkq2), pagebuf_ispin(pb));

		if (flags & PBDF_TRYLOCK) {
			/*
			 * pagebuf_cond_lock() returns 0 when the lock was
			 * obtained (pagebuf_daemon() unlocks after a zero
			 * return), so a non-zero result means the buffer is
			 * busy and must be skipped.
			 */
			if (pagebuf_cond_lock(pb)) {
				pincount++;
				curr = curr->next;
				continue;
			}
		} else {
			/* Sleeping for the lock: drop the queue lock first */
			list_add(&pb_marker_ptr->pb_list, curr);
			spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock,
						save);
			locked = 0;
			pagebuf_lock(pb);
		}

		if (pagebuf_ispin(pb)) {
			pincount++;
			pagebuf_unlock(pb);
			if (!locked)
				goto relock;
			curr = curr->next;
			continue;
		}

		pb->pb_flags &= ~PBF_DELWRI;
		pb->pb_flags |= PBF_WRITE;
		if (flags & PBDF_WAIT)
			pb->pb_flags &= ~PBF_ASYNC;

		if (locked) {
			list_add(&pb_marker_ptr->pb_list, curr);
			spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock,
						save);
		}

		__pagebuf_iorequest(pb);

relock:
		spin_lock_irqsave( &pb_daemon->pb_delwrite_lock,
					save);
		/*
		 * ok got the lock back; pick up the place
		 * holder and continue on
		 */
		curr = pb_marker_ptr->pb_list.next;
		list_del(&pb_marker_ptr->pb_list);
		locked = 1;
	}

	spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock, save);

	run_task_queue(&tq_disk);

	if (pinptr)
		*pinptr = pincount;

	if ((flags & PBDF_WAIT) == 0 ){
		kfree(pb_marker_ptr);
		return;
	}

	/*
	 * The problem to solve here:  if you find a buffer on the
	 * delwri queue, under protection of "pb_delwrite_lock",
	 * and it's had I/O initiated via the above loop, as soon
	 * as you drop "pb_delwrite_lock" it can turn into somebody
	 * else's buffer before you can try to lock/unlock it in
	 * order to synchronize with it.
	 */


	/*
	 * Now do that again, just waiting for the lock.  The interrupt
	 * state is saved in "save", not in the "flags" argument, so the
	 * caller's PBDF_* bits are left intact.
	 */
	spin_lock_irqsave(&pb_daemon->pb_delwrite_lock, save);

	head = curr = &pb_daemon->pb_delwrite_l;
	curr = curr->next;


	while (curr != head) {

		pb = list_entry(curr, page_buf_t, pb_list);

		/*
		 * Skip stuff we do not care about
		 */
		if ((pb->pb_flags == 0) || (pb->pb_flags & PBF_DELWRI) ||
		    (pb->pb_target != target)) { 
			curr = curr->next;
			continue;
		}

		PB_TRACE(pb, PB_TRACE_REC(walkq3), pagebuf_ispin(pb));

		/* Asynchronous writes are not waited for */
		if (pb->pb_flags & PBF_ASYNC) {
			curr = curr->next;
			continue;
		}

		list_add(&pb_marker_ptr->pb_list, curr);

		spin_unlock_irqrestore( &pb_daemon->pb_delwrite_lock, save);
		pagebuf_iowait(pb);
		pagebuf_delwri_dequeue(pb);
		if (!pb->pb_relse)
			pagebuf_unlock(pb);
		pagebuf_rele(pb);

		spin_lock_irqsave(&pb_daemon->pb_delwrite_lock, save);

		curr = pb_marker_ptr->pb_list.next;
		list_del(&pb_marker_ptr->pb_list);
	}

	spin_unlock_irqrestore(&pb_daemon->pb_delwrite_lock, save);
	
	kfree(pb_marker_ptr);
}

int
pagebuf_daemon_start(void)
{
  
	if (!pb_daemon){
		pb_daemon = (pagebuf_daemon_t *)
				kmalloc(sizeof(pagebuf_daemon_t), GFP_KERNEL);
		if (!pb_daemon){
			return -1; /* error */
		}

		pb_daemon->active = 1;
		pb_daemon->pb_delwri_cnt = 0;
		pb_daemon->pb_delwrite_lock = SPIN_LOCK_UNLOCKED;

		INIT_LIST_HEAD(&pb_daemon->pb_delwrite_l);

		if (0 > kernel_thread(pagebuf_daemon, (void *)pb_daemon,
				CLONE_FS|CLONE_FILES|CLONE_SIGHAND)) {
			printk("Can't start pagebuf daemon\n");
			kfree(pb_daemon);
			return -1; /* error */
		}
	}
	return 0;
}	

/*
 * Ask the daemon thread to exit, wait until it has acknowledged by
 * moving "active" away from 0, then free the daemon state.  Always
 * returns 0.
 */
int 
pagebuf_daemon_stop(void)
{
	/* Never started, or already stopped. */
	if (!pb_daemon)
		return 0;

	/* Request shutdown and kick the daemon out of its sleep. */
	pb_daemon->active = 0;
	wake_up_interruptible(&pbd_waitq);

	/* The daemon changes "active" and wakes us when it is done. */
	while (pb_daemon->active == 0) {
		interruptible_sleep_on(&pbd_waitq);
	}

	kfree(pb_daemon);
	pb_daemon = NULL;

	return 0;
}

/*
 * Pagebuf sysctl interface
 */

/* Handle returned by register_sysctl_table(); needed to unregister */
static struct ctl_table_header *pagebuf_table_header;


/*
 * Tunables exported under the "pagebuf" sysctl directory.  Each entry
 * maps one element of pb_params.data[] and is bounded by the matching
 * pagebuf_min[]/pagebuf_max[] element.
 *
 * The length of each entry is taken from the element itself rather
 * than hard-coded as sizeof(int): the proc_doulongvec_* handlers size
 * their work from this length, so it must match the element width.
 * NOTE(review): the handlers operate on unsigned long values - confirm
 * that pb_params.data[] is declared with that type.
 */
static ctl_table pagebuf_table[] = {
	{PB_FLUSH_INT, "flush_int", &pb_params.data[0],
	sizeof(pb_params.data[0]), 0644, NULL,
	&proc_doulongvec_ms_jiffies_minmax,
	&sysctl_intvec, NULL, &pagebuf_min[0], &pagebuf_max[0]},
	{PB_FLUSH_AGE, "flush_age", &pb_params.data[1],
	sizeof(pb_params.data[1]), 0644, NULL,
	&proc_doulongvec_ms_jiffies_minmax,
	&sysctl_intvec, NULL, &pagebuf_min[1], &pagebuf_max[1]},
	{PB_CLUSTER_LIMIT, "cluster_limit", &pb_params.data[2],
	sizeof(pb_params.data[2]), 0644, NULL,
	&proc_doulongvec_minmax, &sysctl_intvec, NULL,
	&pagebuf_min[2], &pagebuf_max[2]},
	{PB_DEBUG, "debug", &pb_params.data[3],
	sizeof(pb_params.data[3]), 0644, NULL,
	&proc_doulongvec_minmax, &sysctl_intvec, NULL,
	&pagebuf_min[3], &pagebuf_max[3]},
	{PB_DIO_MAX, "max_dio_pages", &pb_params.data[4],
	sizeof(pb_params.data[4]), 0644, NULL,
	&proc_doulongvec_minmax, &sysctl_intvec, NULL,
	&pagebuf_min[4], &pagebuf_max[4]},
	{0}
};

/* "pagebuf" directory holding the tunables above */
static ctl_table pagebuf_dir_table[] = {
	{VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
	{0}
};

/* Root entry: hangs the pagebuf directory under "vm" */
static ctl_table pagebuf_root_table[] = {
	{CTL_VM, "vm",  NULL, 0, 0555, pagebuf_dir_table},
	{0}
};

#ifdef CONFIG_PROC_FS
/*
 * /proc read handler: formats the pbstats counters as one line,
 * "pagebuf" followed by each 32-bit counter in decimal, and returns the
 * slice of that line selected by offset/count.
 */
static int
pagebuf_readstats(char *buffer, char **start, off_t offset,
			int count, int *eof, void *data)
{
	u_int32_t *counter = (u_int32_t *)&pbstats;
	int	idx;
	int	len;
	int	remaining;

	len = sprintf(buffer, "pagebuf");
	for (idx = 0; idx < sizeof(pbstats) / sizeof(u_int32_t); idx++)
		len += sprintf(buffer + len, " %u", counter[idx]);
	buffer[len++] = '\n';

	/* Reading at or past the end of the text: nothing left. */
	if (offset >= len) {
		*start = buffer;
		*eof = 1;
		return 0;
	}

	*start = buffer + offset;
	remaining = len - offset;

	/* More text than the caller asked for: hand back a full chunk. */
	if (remaining > count)
		return count;

	*eof = 1;
	return remaining;
}
#endif  /* CONFIG_PROC_FS */


/*
 *	Initialization and Termination
 */

/*
 *	pagebuf_init
 */

/*
 * Register the sysctl table and /proc entries, initialise the AVL and
 * locking support, create the page_buf_t slab cache (if it does not
 * already exist) and, when tracing is compiled in, allocate the trace
 * buffer.
 *
 * Returns 0 on success or -ENOMEM if the cache or the trace buffer
 * cannot be allocated.  On failure the sysctl table, the /proc entries
 * and any cache created here are removed again.
 * NOTE(review): avl_init() and pagebuf_locking_init() are not unwound
 * on failure; confirm whether their terminate routines may be called
 * from here.
 */
int __init pagebuf_init(void)
{
	int created_cache = 0;

	pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);

#ifdef  CONFIG_PROC_FS
	if (proc_mkdir("fs/pagebuf", 0))
		create_proc_read_entry("fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
#endif

	avl_init();
	pagebuf_locking_init();
	if (pagebuf_cache == NULL) {
		pagebuf_cache = kmem_cache_create("page_buf_t",
		    sizeof(page_buf_private_t),
		    0, SLAB_HWCACHE_ALIGN, NULL, NULL);
		if (pagebuf_cache == NULL)
			goto fail;
		created_cache = 1;
	}

#ifdef PAGEBUF_TRACE
	pb_trace.buf = (pagebuf_trace_t *)kmalloc(PB_TRACE_BUFSIZE *
				sizeof(pagebuf_trace_t), GFP_KERNEL);
/* For really really long trace bufs */
/*	pb_trace.buf = (pagebuf_trace_t *)vmalloc(PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t)); */
	/* Do not memset through a failed allocation */
	if (pb_trace.buf == NULL)
		goto fail;
	memset(pb_trace.buf, 0, PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
	pb_trace.start = 0;
	pb_trace.end = PB_TRACE_BUFSIZE - 1;
#endif

	return (0);

fail:
	/* Only tear down a cache that this call created */
	if (created_cache) {
		kmem_cache_destroy(pagebuf_cache);
		pagebuf_cache = NULL;
	}
#ifdef  CONFIG_PROC_FS
	/* Leave no /proc entry pointing at code that failed to load */
	remove_proc_entry("fs/pagebuf/stat", NULL);
	remove_proc_entry("fs/pagebuf", NULL);
#endif
	if (pagebuf_table_header != NULL) {
		unregister_sysctl_table(pagebuf_table_header);
		pagebuf_table_header = NULL;
	}
	return (-ENOMEM);
}


/*
 *	Module management
 */

/* Delayed-write queue interface */
EXPORT_SYMBOL(pagebuf_delwri_flush);
EXPORT_SYMBOL(pagebuf_delwri_queue);
EXPORT_SYMBOL(pagebuf_delwri_dequeue);

/* Buffer lookup, reference, I/O, data access and pinning interface */
EXPORT_SYMBOL(pagebuf_find);
EXPORT_SYMBOL(pagebuf_get);
EXPORT_SYMBOL(pagebuf_associate_memory);
EXPORT_SYMBOL(pagebuf_get_no_daddr);
EXPORT_SYMBOL(pagebuf_get_empty);
EXPORT_SYMBOL(pagebuf_daemon_start);
EXPORT_SYMBOL(pagebuf_hold);
EXPORT_SYMBOL(pagebuf_free);
EXPORT_SYMBOL(pagebuf_rele);
EXPORT_SYMBOL(pagebuf_geterror);
EXPORT_SYMBOL(pagebuf_iodone);
EXPORT_SYMBOL(pagebuf_ioerror);
EXPORT_SYMBOL(pagebuf_iostart);
EXPORT_SYMBOL(pagebuf_iorequest);
EXPORT_SYMBOL(pagebuf_iowait);
EXPORT_SYMBOL(pagebuf_offset);
EXPORT_SYMBOL(pagebuf_segment);
EXPORT_SYMBOL(pagebuf_iomove);
EXPORT_SYMBOL(pagebuf_pin);
EXPORT_SYMBOL(pagebuf_ispin);
EXPORT_SYMBOL(pagebuf_unpin);
EXPORT_SYMBOL(pagebuf_wait_unpin);
EXPORT_SYMBOL(pagebuf_readahead);

#ifdef MODULE

/*
 *	pagebuf_terminate
 */

/*
 * Undo pagebuf_init() at module unload.
 *
 * The daemon thread is stopped before the slab cache is destroyed, so
 * it cannot still be working on buffers when the cache goes away
 * (NOTE(review): buffers are presumably allocated from pagebuf_cache;
 * the allocation sites are outside this part of the file).  Pointers
 * are cleared once their objects are gone, and the trace buffer
 * allocated by pagebuf_init() is freed.
 */
STATIC void pagebuf_terminate(void)
{
	pagebuf_daemon_stop();

	if (pagebuf_cache != NULL) {
		kmem_cache_destroy(pagebuf_cache);
		pagebuf_cache = NULL;
	}

	pagebuf_locking_terminate();
	avl_terminate();

	/* pagebuf_init() does not check that registration succeeded */
	if (pagebuf_table_header != NULL) {
		unregister_sysctl_table(pagebuf_table_header);
		pagebuf_table_header = NULL;
	}
#ifdef  CONFIG_PROC_FS
	remove_proc_entry("fs/pagebuf/stat", NULL);
	remove_proc_entry("fs/pagebuf", NULL);
#endif
#ifdef PAGEBUF_TRACE
	/* Allocated with kmalloc() in pagebuf_init() */
	kfree(pb_trace.buf);
	pb_trace.buf = NULL;
#endif
}


/* Set once pagebuf_init() has succeeded; checked by cleanup_module() */
static int loaded = 0;

/*
 * Module entry point: print the banner and initialise the pagebuf
 * layer.  Returns 0 on success or the negative status from
 * pagebuf_init().
 */
int init_module(void)
{
	int status;

	printk(KERN_INFO
	    "page_buf cache Copyright (c) 2000 Silicon Graphics, Inc.\n");

	status = pagebuf_init();
	if (status >= 0) {
		loaded = 1;
		return 0;
	}

	/* The message shows the error number as a positive value. */
	printk(KERN_ERR
	    "page_buf module initialization failed (%d)\n", -status);
	return (status);
}



/*
 * Module exit point: tear the pagebuf layer down, but only if
 * init_module() completed successfully.
 */
void cleanup_module(void)
{
	if (!loaded)
		return;

	pagebuf_terminate();
	loaded = 0;
}

#endif 	/* MODULE */
