/*
 * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * 
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 * 
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 * 
 * http://www.sgi.com 
 * 
 * For further information regarding this notice, see: 
 * 
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ident "$Id:  $"

/*
 *	page_buf_locking.c
 *
 *	The page_buf module provides an abstract buffer cache model on top of
 *	the Linux page cache.  Cached blocks for a file are hashed to the
 *	inode for that file, and can be held dirty in delayed write mode in
 *	the page cache.  Cached metadata blocks for a file system are hashed
 *	to the inode for the mounted device.  The page_buf module assembles
 *	buffer (page_buf_t) objects on demand to aggregate such cached pages
 *	for I/O.  The page_buf_locking module adds support for locking such
 *      page buffers.
 *
 *      Written by William J. Earl and Steve Lord at SGI 
 *
 *
 */

#define _PAGE_BUF_INTERNAL_ 1

#include <linux/config.h>
#include <linux/version.h>

#include <linux/module.h>

#include <linux/stddef.h>

#include <linux/page_buf.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#define PB_DEFINE_TRACES
#include <linux/page_buf_trace.h>

/*
 *	Locking model:
 *
 *	Buffers associated with inodes for which buffer locking
 *	is not enabled are not protected by semaphores, and are
 *	assumed to be exclusively owned by the caller.  There is a
 *	spinlock in the buffer, for use by the caller when concurrent
 *	access is possible.
 *
 *	Buffers associated with inodes for which buffer locking is
 *	enabled are protected by semaphores in the page_buf_lockable_t
 *	structure, but only between different callers.  For a given
 *	caller, the buffer is exclusively owned by the caller, but
 *	the caller must still use the spinlock when concurrent access
 *	is possible.
 *
 *	Internally, when implementing buffer locking, page_buf uses
 *	a spinlock_t (pagebuf_registered_inodes_lock) to protect the
 *	pagebuf_registered_inodes tree, and a spinlock_t (pbr_lock in the
 *	page_buf_registration_t) to protect the buffer tree associated
 *	with an inode, as well as to protect the hold count in the
 *	page_buf_lockable_t.
 *	The locking order is the pagebuf_registered_inodes tree lock
 *	first, then the page_buf_registration_t lock.  The semaphore
 *	in the page_buf_lockable_t should be acquired only after acquiring
 *	a hold on the page_buf_lockable_t (and of course releasing the
 *	page_buf_registration_t spinlock_t).
 */

/*
 *	AVL tree of inodes for which locking is enabled
 */

/* Spinlock taken around every lookup/insert/delete on pagebuf_registered_inodes. */
static	spinlock_t   pagebuf_registered_inodes_lock;
/* Tree of registrations keyed by inode pointer; created by pagebuf_locking_init(). */
static	avl_handle_t pagebuf_registered_inodes = NULL;

/* Slab cache from which page_buf_registration_t objects are allocated. */
static kmem_cache_t *pagebuf_registration_cache = NULL;


/*
 *	Initialization and Termination
 */

/*
 *	pagebuf_locking_init
 *
 *	Set up the global state used for buffer locking: the lock and
 *	tree of registered inodes, and the slab cache used for
 *	page_buf_registration_t objects.  Pieces that already exist
 *	(non-NULL) are left untouched.
 *
 *	Returns 0 on success, the negative avl_create() status if the
 *	tree cannot be created, or -ENOMEM if the slab cache cannot be
 *	created.
 */

int __init pagebuf_locking_init(void)
{
	if (pagebuf_registered_inodes == NULL) {
		int rc;

		spin_lock_init(&pagebuf_registered_inodes_lock);
		rc = avl_create(&pagebuf_registered_inodes, avl_opt_nolock,
				NULL, NULL);
		if (rc < 0)
			return rc;
	}

	/* nothing more to do if the cache survives from an earlier call */
	if (pagebuf_registration_cache != NULL)
		return 0;

	pagebuf_registration_cache = kmem_cache_create("page_buf_reg_t",
					sizeof(page_buf_registration_t),
					0,
					SLAB_HWCACHE_ALIGN,
					NULL,
					NULL);

	return (pagebuf_registration_cache != NULL) ? 0 : -ENOMEM;
}


/*
 *	Buffer Locking Control
 */

/*
 *	_pagebuf_registration_free
 *
 *	Release a page_buf_registration_t: destroy its buffer tree, if
 *	one was created, and hand the object back to the registration
 *	slab cache.  A NULL registration is silently ignored.
 *
 *	Documented contract: the caller holds the
 *	pagebuf_registered_inodes_lock.
 *	NOTE(review): pagebuf_lock_enable() also calls this, without
 *	that lock, on registrations that never made it into the tree.
 */

static void
_pagebuf_registration_free(page_buf_registration_t *reg)
{
	if (reg == NULL)
		return;

	if (reg->pbr_buffers != NULL)
		avl_destroy(reg->pbr_buffers);

	kmem_cache_free(pagebuf_registration_cache, reg);
}


/*
 *	_pagebuf_check_lockable
 *
 *	Check if an inode supports lockable buffers.
 *
 *	Returns 0 if the inode supports lockable buffers,
 *	and -ENOENT otherwise.  If the inode supports lockable
 *	buffers, the page_buf_registration_t address is returned
 *	in the referenced pointer.  The registration is returned
 *	with the pbr_lock spinlock held and interrupts disabled.
 */

/*
 *	_pagebuf_check_lockable_core
 *
 *	Shared lookup step.  Disables local interrupts when do_irq is
 *	non-zero, takes the pagebuf_registered_inodes_lock and looks the
 *	inode up in the registration tree.  When the lookup succeeds
 *	(return value 0) the registration's pbr_lock is taken as well.
 *
 *	In every case this returns with the
 *	pagebuf_registered_inodes_lock still held; dropping it (and
 *	restoring the interrupt state on failure) is left to the caller.
 */
static __inline__ int
_pagebuf_check_lockable_core(struct inode *ip,
			page_buf_registration_t **reg_p,
			int do_irq)
{
	int	rc;

	if (do_irq)
		local_irq_disable();
	spin_lock(&pagebuf_registered_inodes_lock);

	rc = avl_lookup(pagebuf_registered_inodes,
			(avl_key_t) ip,
			(avl_value_t *) reg_p);
	if (rc != 0)
		return rc;

	/* a successful lookup must have produced a registration */
	assert(*reg_p != NULL);
	spin_lock(&((*reg_p)->pbr_lock));
	return 0;
}

/*
 *	Look up the registration for an inode with interrupts disabled.
 *	On success (0) *reg_p is returned with its pbr_lock held and
 *	interrupts still off; on failure no lock is held and interrupts
 *	are re-enabled.
 */
int
_pagebuf_check_lockable(struct inode *ip,
			page_buf_registration_t **reg_p)
{
	const int rc = _pagebuf_check_lockable_core(ip, reg_p, 1);

	/* the tree lock is only needed for the lookup itself */
	spin_unlock(&pagebuf_registered_inodes_lock);

	/* nothing is left locked on failure, so interrupts come back on */
	if (rc != 0)
		local_irq_enable();

	return rc;
}

/*
 *	_pagebuf_free_lockable_buffer
 *
 *	Remove a buffer from the buffer tree of the registration found
 *	for pb->pb_target.
 *
 *	`flags' is a saved interrupt state which is restored on every
 *	return path.  Interrupts are not disabled here (the lookup is
 *	done with do_irq == 0).
 *	NOTE(review): presumably the caller has already disabled
 *	interrupts and saved the previous state in `flags' - confirm
 *	against the caller.
 *
 *	Returns the lookup status if pb->pb_target has no registration,
 *	otherwise the status of avl_delete().
 */
int
_pagebuf_free_lockable_buffer(page_buf_t *pb, unsigned long flags)
{
	page_buf_registration_t *reg_p;
	int	status;

	PB_TRACE(pb, PB_TRACE_REC(free_lk), 0);

	/* on success this leaves reg_p->pbr_lock held */
	status = _pagebuf_check_lockable_core(pb->pb_target, &reg_p, 0);
	spin_unlock(&pagebuf_registered_inodes_lock);
	if (status != 0) {
		/* not registered: no lock left to drop, just restore irqs */
		local_irq_restore(flags);
		return status;
	}

	/* NOTE(review): pb_lock is acquired here and is not released
	 * anywhere in this function - confirm that the caller expects
	 * the buffer back with pb_lock held (and does not already hold
	 * it on entry).
	 */
	spin_lock(&PBP(pb)->pb_lock);
	status = (avl_delete(reg_p->pbr_buffers,
			  (avl_key_t) pb, (avl_value_t) pb));

	/* drop the registration lock and restore the caller's irq state */
	spin_unlock_irqrestore(&reg_p->pbr_lock, flags);

	PB_TRACE(pb, PB_TRACE_REC(freed_l), status);

	return status;
}



/*
 *	_pagebuf_lockable_compare_key
 *
 *	Ordering function for the per-inode buffer tree.  Keys are
 *	page_buf_t pointers and are ordered by pb_file_offset:
 *	returns -1, 0 or 1 when key_a sorts before, equal to or after
 *	key_b.
 *
 *	A NULL key_b compares equal to a NULL key_a, and any non-NULL
 *	key_a yields -1 against it.
 *	NOTE(review): a NULL key_a paired with a non-NULL key_b is
 *	dereferenced - presumably the tree code never passes that
 *	combination; confirm.
 */

static int
_pagebuf_lockable_compare_key(avl_key_t key_a,
			      avl_key_t key_b)
{
	page_buf_t *lhs = (page_buf_t *) key_a;
	page_buf_t *rhs = (page_buf_t *) key_b;

	if (rhs == NULL)
		return (lhs == NULL) ? 0 : -1;

	/* both buffers must belong to the same target */
	assert(lhs->pb_target == rhs->pb_target);

	if (lhs->pb_file_offset < rhs->pb_file_offset)
		return -1;
	if (lhs->pb_file_offset > rhs->pb_file_offset)
		return 1;
	return 0;
}

/*
 *	_pagebuf_lockable_increment_key
 *
 *	Advance a search key past a given buffer: the page_buf_t that
 *	*next_key points at gets its pb_file_offset set to the first
 *	byte after `key' (offset + length).  The search key must be a
 *	scratch buffer flagged with _PBF_NEXT_KEY.
 */

static void
_pagebuf_lockable_increment_key(avl_key_t *next_key,avl_key_t key)
{
	page_buf_t *cur = (page_buf_t *) key;
	page_buf_t *scratch = (page_buf_t *) (*next_key);

	assert((scratch != NULL) &&
	       (scratch->pb_flags & _PBF_NEXT_KEY) &&
	       (cur != NULL));

	scratch->pb_file_offset = cur->pb_file_offset + cur->pb_buffer_length;
}


/*
 *	_pagebuf_get_lockable_buffer
 *
 *	Looks up, and creates if absent, a lockable buffer for
 * 	a given range of an inode.  The buffer is returned
 *	locked.  If other overlapping buffers exist, they are
 *	released before the new buffer is created and locked,
 *	which may imply that this call will block until those buffers
 *	are unlocked.  No I/O is implied by this call.
 *
 *	The caller must have previously called _pagebuf_check_lockable()
 *	successfully, and must pass in the page_buf_registration_t pointer
 *	obtained via that call, with the pbr_lock spinlock held and interrupts
 *	disabled.
 */

/*
 *	_pagebuf_grab_lock
 *
 *	Acquire the buffer's semaphore (pb_sema) with down().
 */
void
_pagebuf_grab_lock(page_buf_t *pb)
{
	down(&PBP(pb)->pb_sema);
}


/*
 *	_pagebuf_find_lockable_buffer
 *
 *	Walk the registration's buffer tree with avl_lookup_next(),
 *	starting from a search key at range_base, looking for an existing
 *	buffer for the range [range_base, range_base + range_length).
 *
 *	The code below releases reg->pbr_lock on every return path, so
 *	the caller must enter with it held.  `ip' is only used to
 *	sanity-check `reg'.
 *
 *	Returns:
 *	   0, *pb_p = buffer	a buffer with exactly this offset and
 *				length was found; its semaphore has been
 *				acquired and pb_hold was incremented
 *	   0, *pb_p = NULL	no buffer was found before the end of
 *				the range
 *	   -EBUSY		PBF_TRYLOCK was set and the semaphore
 *				could not be taken without sleeping, or
 *				the first usable buffer found was not an
 *				exact match (or was freed meanwhile);
 *				*pb_p is not written in this case
 */
int
_pagebuf_find_lockable_buffer(struct inode *ip,
			     page_buf_registration_t *reg,
			     loff_t range_base,
			     size_t range_length,
			     page_buf_flags_t flags,
			     page_buf_t **pb_p)
{
	page_buf_t next_key_buf;
	page_buf_t *pb;
	avl_key_t next_key;
	avl_key_t key;
	avl_value_t value;
	int	not_locked;

	assert((reg != NULL) && (reg->pbr_inode == ip));

	/* Build an on-stack search key; _PBF_NEXT_KEY marks it as a key
	 * rather than a real buffer (_pagebuf_lockable_increment_key
	 * asserts on that flag).
	 */
	next_key_buf.pb_flags = _PBF_NEXT_KEY;
	next_key_buf.pb_file_offset = range_base;
	next_key_buf.pb_buffer_length = range_length;
	next_key = (avl_key_t) &next_key_buf;
	while (avl_lookup_next(reg->pbr_buffers,
			       &next_key,
			       &key,
			       &value) == 0) {
		pb = (page_buf_t *)value;
		assert(pb != NULL);

		if (pb->pb_file_offset >= (range_base + range_length))
			break;	/* no overlap found - allocate buffer */

		/* unlocked peek; the flag is re-checked under pb_lock below */
		if (pb->pb_flags & PBF_FREED)
			continue;

		spin_lock(&PBP(pb)->pb_lock);
		if (pb->pb_flags & PBF_FREED) {
			spin_unlock(&PBP(pb)->pb_lock);
			continue;
		}
		/* take a hold; it is dropped again with pagebuf_rele() on
		 * the failure paths below
		 */
		pb->pb_hold++;

		PB_TRACE(pb, PB_TRACE_REC(avl_ret), 0);

		/* Attempt to get the semaphore without sleeping,
		 * if this does not work then we need to drop the
		 * spinlocks and do a hard attempt on the semaphore.
		 */
		not_locked = down_trylock(&PBP(pb)->pb_sema);
		if (not_locked) {
			/* both spinlocks are dropped and interrupts enabled */
			spin_unlock(&PBP(pb)->pb_lock);
			spin_unlock_irq(&(reg->pbr_lock));

			if (!(flags & PBF_TRYLOCK)) {
				/* wait for buffer ownership */
				PB_TRACE(pb, PB_TRACE_REC(get_lk), 0);

				/* If this buffer has I/O outstanding, push */
				if (atomic_read(&PBP(pb)->pb_io_remaining))
					run_task_queue(&tq_disk);
				_pagebuf_grab_lock(pb);
				/** down(&PBP(pb)->pb_sema); **/
				PB_SET_OWNER(pb);
				PB_STATS_INC(pb_get_locked_waited);
			} else {
				/* We asked for a trylock and failed, no need
				 * to look at file offset and length here, we
				 * know that this pagebuf at least overlaps our
				 * pagebuf and is locked, therefore our buffer
				 * either does not exist, or is this buffer
				 */

				pagebuf_rele(pb);

				PB_STATS_INC(pb_busy_locked);
				return -EBUSY;
			}
		} else {
			/* trylock worked */
			PB_SET_OWNER(pb);
			/* pb_lock stays held with interrupts still off; the
			 * spin_unlock_irq(pb_lock) calls below release both
			 */
			spin_unlock(&reg->pbr_lock);
		}


		if (pb->pb_file_offset == range_base &&
		    pb->pb_buffer_length == range_length) {
			/* the sleeping path dropped pb_lock, so retake it
			 * before re-checking PBF_FREED
			 */
			if (not_locked)
				spin_lock_irq(&PBP(pb)->pb_lock);
			if (!(pb->pb_flags & PBF_FREED)) {
				spin_unlock_irq(&PBP(pb)->pb_lock);
				PB_TRACE(pb, PB_TRACE_REC(got_lk), 0);
				*pb_p = pb;
				PB_STATS_INC(pb_get_locked);
				return(0);
			}
			spin_unlock_irq(&PBP(pb)->pb_lock);
		} else if (!not_locked) {
			/* not an exact match: release the pb_lock still held
			 * from the trylock path
			 */
			spin_unlock_irq(&PBP(pb)->pb_lock);
		}

		/* Let go of the buffer - if the count goes to zero
		 * this will remove it from the tree. The problem here
		 * is that if there is a hold on the pagebuf without a
		 * lock then we just threw away the contents....
		 * Which means that if someone else comes along and
		 * locks the pagebuf which they have a hold on they
		 * can discover that the memory has gone away on them.
		 */
		PB_CLEAR_OWNER(pb);
		PB_TRACE(pb, PB_TRACE_REC(skip), 0);
		up(&PBP(pb)->pb_sema);
		pagebuf_rele(pb);

		return -EBUSY;
	}

	/* tree exhausted, or the next buffer starts beyond the range */
	spin_unlock_irq(&reg->pbr_lock);

	/* No match found */
	PB_STATS_INC(pb_miss_locked);
	*pb_p = NULL;
	return 0;
}

/*
 *	Find the lockable buffer for a range of an inode, creating and
 *	inserting a new one when none is found.
 *
 *	The search is done by _pagebuf_find_lockable_buffer(); any
 *	non-zero status from it (e.g. -EBUSY) is returned as is.  When
 *	no buffer exists, a new object is obtained with
 *	_pagebuf_get_object() and inserted into the registration's
 *	buffer tree.  If the insert fails with -EEXIST another thread
 *	got there first: the new object is freed and the search is
 *	repeated.  Any other insert failure frees the object and is
 *	returned.
 *
 *	On success 0 is returned and *pb_p refers to the buffer.
 */
int
_pagebuf_get_lockable_buffer(struct inode *ip,
			     page_buf_registration_t *reg,
			     loff_t range_base,
			     size_t range_length,
			     page_buf_flags_t flags,
			     page_buf_t **pb_p)
{
	unsigned long irq_state;
	page_buf_t *pb;
	int	rc;

	for (;;) {
		rc = _pagebuf_find_lockable_buffer(ip, reg, range_base,
					range_length, flags, pb_p);

		/* failure, or an existing buffer was found */
		if (rc != 0 || *pb_p != NULL)
			return rc;

		rc = _pagebuf_get_object(ip, range_base, range_length,
					 flags | _PBF_LOCKABLE, &pb);
		if (rc != 0)
			return rc;

		/* Tree manipulation requires the registration spinlock */
		spin_lock_irq(&reg->pbr_lock);
		rc = avl_insert(reg->pbr_buffers,
				(avl_key_t) pb,
				(avl_value_t) pb);
		spin_unlock_irq(&reg->pbr_lock);
		PB_TRACE(pb, PB_TRACE_REC(avl_ins), rc);

		if (rc == 0) {
			*pb_p = pb;
			return 0;
		}

		/* insert failed: the object never made it into the tree */
		spin_lock_irqsave(&PBP(pb)->pb_lock, irq_state);
		pb->pb_flags &= ~_PBF_LOCKABLE;	/* we are not in the avl */
		_pagebuf_free_object(pb, irq_state);

		if (rc != -EEXIST)
			return rc;

		/* Race condition with another thread - try again,
		 * set up locking state first.
		 */
		spin_lock_irq(&reg->pbr_lock);
	}
}

/*
 *	Locking and Unlocking Buffers 
 */

/*
 *	pagebuf_cond_lock
 *
 *	Try to lock a buffer object without sleeping.  Returns 0 when
 *	the buffer's semaphore was acquired (the caller is then recorded
 *	as owner via PB_SET_OWNER) and -EBUSY when it is already held.
 *
 *	This in no way locks the underlying pages, so it is only useful
 *	for synchronizing concurrent use of page buffer objects, not for
 *	synchronizing independent access to the underlying pages.
 */

int
pagebuf_cond_lock(page_buf_t *pb)	/* buffer to lock */
{
	int	got_it;

	assert(pb->pb_flags & _PBF_LOCKABLE);

	if (down_trylock(&PBP(pb)->pb_sema) == 0) {
		got_it = 1;
		PB_SET_OWNER(pb);
	} else {
		got_it = 0;
	}

	PB_TRACE(pb, PB_TRACE_REC(condlck), got_it);

	if (!got_it)
		return -EBUSY;
	return 0;
}


/*
 *	pagebuf_is_locked
 *
 *	Report whether the buffer is locked: 1 when the semaphore count
 *	is zero or negative, 0 otherwise.  Only suitable for assertions
 *	that the buffer is locked, since the state can change at any
 *	time when the caller does not hold the lock.
 */

int
pagebuf_is_locked(page_buf_t *pb)	/* buffer to test */
{
	int	count;

	assert(pb->pb_flags & _PBF_LOCKABLE);

	count = atomic_read(&PBP(pb)->pb_sema.count);
	return (count <= 0);
}

/*
 *	pagebuf_lock_value
 *
 *	Return the current count of the buffer's semaphore.
 */

int
pagebuf_lock_value(page_buf_t *pb)
{
	int	count;

	assert(pb->pb_flags & _PBF_LOCKABLE);

	count = atomic_read(&PBP(pb)->pb_sema.count);
	return count;
}



/*
 *	pagebuf_lock
 *
 *	Lock a buffer object, sleeping until its semaphore is available.
 *	If the buffer still has I/O outstanding, the disk task queue is
 *	run first to push that I/O along.  Always returns 0.
 *
 *	This in no way locks the underlying pages, so it is only useful
 *	for synchronizing concurrent use of page buffer objects, not for
 *	synchronizing independent access to the underlying pages.
 */

int
pagebuf_lock(page_buf_t *pb)		/* buffer to lock */
{
	assert(pb->pb_flags & _PBF_LOCKABLE);

	PB_TRACE(pb, PB_TRACE_REC(lock), 0);

	if (atomic_read(&PBP(pb)->pb_io_remaining) != 0) {
		run_task_queue(&tq_disk);
	}

	down(&PBP(pb)->pb_sema);
	PB_SET_OWNER(pb);

	PB_TRACE(pb, PB_TRACE_REC(locked), 0);
	return 0;
}


/*
 *	pagebuf_lock_disable
 *
 *	pagebuf_lock_disable drops one "lock enable" reference for an
 *	inode.  When the last reference is dropped the registration is
 *	removed from the pagebuf_registered_inodes tree and freed.
 *
 *	Returns 0 on success, the lookup status if the inode is not
 *	registered, and -EBUSY if this is the last reference but the
 *	registration's buffer tree still contains a buffer.
 *
 *	Interrupts are disabled for the duration of the call and are
 *	only re-enabled together with the release of the outermost lock
 *	(pagebuf_registered_inodes_lock).
 */

int
pagebuf_lock_disable(			/* disable buffer locking	*/
		     struct inode *ip)  /* inode for buffers	        */
{
	page_buf_registration_t *reg;
	int	status;
	avl_key_t next_key;
	avl_key_t key;
	avl_value_t value;

	/* on success both the tree lock and reg->pbr_lock are held */
	status = _pagebuf_check_lockable_core(ip, &reg, 1);
	if (status != 0) {
		spin_unlock_irq(&pagebuf_registered_inodes_lock);
		return(status);
	}

	assert(reg != NULL);

	if (reg->pbr_lock_enable_count == 1) {
		if (reg->pbr_buffers != NULL) {
			/* any entry at all means buffers are still in use */
			next_key = 0;
			if (avl_lookup_next(reg->pbr_buffers,
					    &next_key,
					    &key,
					    &value) == 0) {
				/* Release the inner lock without touching
				 * the interrupt state; interrupts must stay
				 * off until the outer lock is dropped too.
				 */
				spin_unlock(&reg->pbr_lock);
				spin_unlock_irq(&pagebuf_registered_inodes_lock);
				return(-EBUSY);
			}
		}
		avl_delete(pagebuf_registered_inodes,
				  (avl_key_t) ip,
				  (avl_value_t) reg);
		/* reg->pbr_lock is still held here; it goes away with reg */
		_pagebuf_registration_free(reg);
		MOD_DEC_USE_COUNT;
	} else {
		reg->pbr_lock_enable_count--;
		spin_unlock(&reg->pbr_lock);
	}
	spin_unlock_irq(&pagebuf_registered_inodes_lock);

	return(0);
}


/*
 *	pagebuf_lock_enable
 *
 *	pagebuf_lock_enable enables buffer object locking for an inode.
 *	If the inode is already registered its enable count is simply
 *	incremented.  Otherwise a new registration (enable count 1, with
 *	an empty buffer tree) is allocated and inserted into the
 *	pagebuf_registered_inodes tree.
 *
 *	The allocation is done with the tree lock dropped, so another
 *	thread may register the inode in the meantime; in that case the
 *	insert fails, the loop goes round again and the existing entry's
 *	count is incremented instead.
 *
 *	Returns 0 on success, -ENOMEM if the registration or its tree
 *	entry cannot be allocated, or the avl_create() status if the
 *	buffer tree cannot be created.
 *	NOTE(review): earlier documentation mentioned -EBUSY; no path in
 *	this function returns it.
 */

int
pagebuf_lock_enable(			/* enable buffer locking	*/
		    struct inode *ip)   /* inode for buffers	        */
{
	page_buf_registration_t *reg = NULL;
	page_buf_registration_t *match;
	int	status = 0;

	while (1) {
		/* Check to see if this inode is already marked
		 * lockable, if it is then increment the count.
		 */
		if (_pagebuf_check_lockable_core(ip, &match, 1) == 0) {
			assert(match != NULL);
			match->pbr_lock_enable_count++;
			/* Release the inner lock without touching the
			 * interrupt state; interrupts must stay off until
			 * the outer lock is dropped too.
			 */
			spin_unlock(&match->pbr_lock);
			spin_unlock_irq(&pagebuf_registered_inodes_lock);

			/* if we allocated space then we need to
			 * give it back again, we lost the race.
			 */
			if (reg) {
				_pagebuf_registration_free(reg);
			}

			return (0);
		}
		if (reg == NULL) {
			/* drop the tree lock (left held by the failed
			 * lookup) before allocating
			 */
			spin_unlock_irq(&pagebuf_registered_inodes_lock);
			reg = kmem_cache_alloc(pagebuf_registration_cache,
						SLAB_KERNEL);
			if (reg == NULL) {
				return(-ENOMEM);
			}
			bzero(reg,sizeof(page_buf_registration_t));
			spin_lock_init(&reg->pbr_lock);
			reg->pbr_inode = ip;
			reg->pbr_lock_enable_count = 1;
			status = avl_create(&reg->pbr_buffers,
					    avl_opt_nolock,
					    _pagebuf_lockable_compare_key,
					    _pagebuf_lockable_increment_key);
			if (status) {
				_pagebuf_registration_free(reg);
				return(status);
			}
			spin_lock_irq(&pagebuf_registered_inodes_lock);
		}
		status = avl_insert(pagebuf_registered_inodes,
				    (avl_key_t) ip,
				    (avl_value_t) reg);
		spin_unlock_irq(&pagebuf_registered_inodes_lock);
		if (status == 0) {
			/* We inserted into the tree successfully */
			MOD_INC_USE_COUNT;
			break;
		}
		if (status == -ENOMEM) {
			/* insert failed, out of memory - give up */
			_pagebuf_registration_free(reg);
			break;
		}
		/* lost the race to insert, go get the entry in the
		 * tree and increment the count. Keep hold of the
		 * memory we allocated just in case the entry we
		 * lost the race with is gone when we look it up.
		 */
	}

	return(status);
}


/*
 *	pagebuf_unlock
 *
 *	Release the lock on a buffer object taken by pagebuf_lock or
 *	pagebuf_cond_lock: the owner record is cleared and the buffer's
 *	semaphore is released.  This does not undo any pinning of the
 *	underlying pages created by pagebuf_pin.
 */

void
pagebuf_unlock(page_buf_t *pb)		/* buffer to unlock */
{
	assert(pb->pb_flags & _PBF_LOCKABLE);

	/* drop ownership before the semaphore becomes available again */
	PB_CLEAR_OWNER(pb);
	up(&PBP(pb)->pb_sema);

	PB_TRACE(pb, PB_TRACE_REC(unlock), 0);
}


/*
 *	Module management
 */

/* Buffer locking entry points exported for use by other kernel modules. */
EXPORT_SYMBOL(pagebuf_cond_lock);
EXPORT_SYMBOL(pagebuf_lock);
EXPORT_SYMBOL(pagebuf_is_locked);
EXPORT_SYMBOL(pagebuf_lock_value);
EXPORT_SYMBOL(pagebuf_lock_disable);
EXPORT_SYMBOL(pagebuf_lock_enable);
EXPORT_SYMBOL(pagebuf_unlock);

#ifdef MODULE
/*
 *	pagebuf_terminate
 */

void
pagebuf_locking_terminate(void)
{
	if (pagebuf_registered_inodes != NULL) {
		avl_destroy(pagebuf_registered_inodes);
		pagebuf_registered_inodes = NULL;
	}
	if (pagebuf_registration_cache != NULL)
		kmem_cache_destroy(pagebuf_registration_cache);
}

#endif /* MODULE */
