/*
 * Copyright © INRIA 2009
 * Brice Goglin <Brice.Goglin@inria.fr>
 *
 * This software is a computer program whose purpose is to provide
 * a fast inter-process communication subsystem.
 *
 * This software is governed by the CeCILL-B license under French law and
 * abiding by the rules of distribution of free software.  You can  use,
 * modify and/ or redistribute the software under the terms of the CeCILL-B
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info".
 *
 * As a counterpart to the access to the source code and  rights to copy,
 * modify and redistribute granted by the license, users are provided only
 * with a limited warranty  and the software's author,  the holder of the
 * economic rights,  and the successive licensors  have only  limited
 * liability.
 *
 * In this respect, the user's attention is drawn to the risks associated
 * with loading,  using,  modifying and/or developing or reproducing the
 * software by the user in light of its specific status of free software,
 * that may mean  that it is complicated to manipulate,  and  that  also
 * therefore means  that it is reserved for developers  and  experienced
 * professionals having in-depth computer knowledge. Users are therefore
 * encouraged to load and test the software's suitability as regards their
 * requirements in conditions enabling the security of their systems and/or
 * data to be ensured and,  more generally, to use and operate it in the
 * same conditions as regards security.
 *
 * The fact that you are presently reading this means that you have had
 * knowledge of the CeCILL-B license and that you accept its terms.
 */

#include <linux/kernel.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <asm/uaccess.h>

#include "knem_io.h"
#include "knem_hal.h"

/********************
 * Module parameters
 */

/*
 * Every parameter registered below with the "uint" parameter type is backed
 * by an unsigned int variable, so that the variable type matches the type
 * the parameter is registered with. "binding" is the only signed parameter.
 */
#ifdef KNEM_DRIVER_DEBUG
static unsigned int knem_debug = 1;
#else
static unsigned int knem_debug = 0;
#endif
module_param_named(debug, knem_debug, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(debug, "Verbose debug messages");

/* when 0, knem_setup_flags() adds KNEM_FLAG_ANY_THREAD_MASK to the ignored flags */
#define KNEM__THREAD_DEFAULT 1
static unsigned int knem__thread = KNEM__THREAD_DEFAULT;
module_param_named(thread, knem__thread, uint, S_IRUGO);
MODULE_PARM_DESC(thread, "Support offloading of work to a kernel thread");

static int knem_binding = 1;
module_param_named(binding, knem_binding, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(binding, "Bind the kernel thread with-the-user-process (1), anywhere-else (-1) or anywhere (0)");

/* when 0, knem_setup_flags() adds KNEM_FLAG_DMA to the ignored flags */
#define KNEM__DMACPY_DEFAULT 1
static unsigned int knem__dmacpy = KNEM__DMACPY_DEFAULT;
module_param_named(dma, knem__dmacpy, uint, S_IRUGO);
MODULE_PARM_DESC(dma, "Support offloading of copies to dma engine");

/* page-bounded chunks of at most this many bytes are copied with memcpy() instead of DMA */
static unsigned int knem_dma_chunk_min = 1024;
module_param_named(dmamin, knem_dma_chunk_min, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(dmamin, "Minimal chunk size to offload copy on dma engine");

/* when non-zero, knem_setup_flags() adds KNEM_FLAG_PINRECV to the forced flags */
#define KNEM__PINRECV_DEFAULT 0
static unsigned int knem__pinrecv = KNEM__PINRECV_DEFAULT;
module_param_named(pinrecv, knem__pinrecv, uint, S_IRUGO);
MODULE_PARM_DESC(pinrecv, "Enforce page pinning on the receiver side");

/* when non-zero, knem_setup_flags() adds KNEM_FLAG_ANY_ASYNC_MASK to the ignored flags */
#define KNEM__SYNC_DEFAULT 0
static unsigned int knem__sync = KNEM__SYNC_DEFAULT;
module_param_named(sync, knem__sync, uint, S_IRUGO);
MODULE_PARM_DESC(sync, "Enforce synchronous copy");

/* masks applied by KNEM_FIX_FLAGS(); knem_setup_flags() may add bits to them */
static unsigned int knem_force_flags = 0;
module_param_named(forceflags, knem_force_flags, uint, S_IRUGO);
MODULE_PARM_DESC(forceflags, "Mask of flags to be forced on");

static unsigned int knem_ignore_flags = 0;
module_param_named(ignoreflags, knem_ignore_flags, uint, S_IRUGO);
MODULE_PARM_DESC(ignoreflags, "Mask of flags to be ignored");

/************************************
 * Types, constants, macros, globals
 */

/* Print a "knem: "-prefixed KERN_INFO message, but only when the debug module parameter is non-zero */
#define dprintk(args...) do { if (knem_debug) printk(KERN_INFO "knem: " args); } while (0)

#define KNEM_VERSION_STR PACKAGE_VERSION

typedef u32 pid_id_t;
typedef u32 lid_id_t;
/* knem_cookie_t is a u64 */

/*
 * A cookie stores the lid id in its upper 32 bits and the pid id in its lower 32 bits.
 * KNEM_BUILD_COOKIE() takes pointers to objects having an "id" field.
 * Every macro expansion is fully parenthesized so that it can be used inside any expression.
 */
#define KNEM_BUILD_COOKIE(pid,lid) ((((knem_cookie_t) (lid)->id) << 32) | ((knem_cookie_t) (pid)->id))
#define KNEM_COOKIE_TO_PID_ID(cookie) ((pid_id_t) (cookie))
#define KNEM_COOKIE_TO_LID_ID(cookie) ((lid_id_t) ((cookie) >> 32))

/*
 * Context attached to one slot of the knem_pids array.
 * NOTE(review): presumably one slot is used per open of the device file - confirm against the open handler.
 */
struct knem_pid {
	struct file * file; /* a NULL file makes knem_find_dequeue_send_lid() consider the slot as not found */
	pid_id_t id; /* stored in the lower 32 bits of cookies by KNEM_BUILD_COOKIE() */
	knem_status_t * status_array; /* non-NULL in async mode (see kthread_task below) */
	struct page ** status_pages; /* indexed by (status_offset >> PAGE_SHIFT) when DMA-copying a status */
	unsigned long status_index_max; /* NOTE(review): presumably the number of status slots - not used in this part of the file */
	lid_id_t next_lid; /* id given to the next queued send lid, protected by lid_list_lock */
	struct list_head lid_list_head; /* queued struct knem_send_lid, linked by lid_list_elt */
	spinlock_t lid_list_lock; /* protects lid_list_head and next_lid */

	cpumask_t kthread_cpumask; /* NOTE(review): presumably related to the "binding" parameter - confirm */
	struct task_struct *kthread_task; /* only if in async mode, i.e. if status_array!=NULL */
	wait_queue_head_t kthread_work_wq; /* woken up by knem_init_recv() after queueing an offloaded work */
	struct list_head kthread_work_list; /* offloaded struct knem_work, linked by list_elt */
	spinlock_t kthread_work_lock; /* protects kthread_work_list */

#ifdef KNEM_HAVE_DMA_ENGINE
	struct dma_chan *dmacpy_chan; /* DMA channel used for copies, DMA is only used when non-NULL */
	struct list_head dmacpy_cleanup_work_list; /* works freed by the dmacpy cleanup routines once their last cookie is complete */
	spinlock_t dmacpy_cleanup_work_lock; /* protects dmacpy_cleanup_work_list */
#endif
};
/* pid id of slot 0: knem_find_dequeue_send_lid() subtracts it from a cookie pid id to get the slot index */
static pid_id_t knem_pid_base;
/* held while a slot of knem_pids is looked up */
static spinlock_t knem_pids_lock;
/* array of slots, indexed from 0 to KNEM_PID_MAX-1 */
static struct knem_pid *knem_pids;
/* NOTE(review): presumably the number of unused slots - not used in this part of the file */
static int knem_pids_nr_free;
#define KNEM_PID_MAX 256

/*
 * Header of a queued send. It is allocated together with the description
 * of the pinned send iovecs, which is reached through (lid + 1).
 */
struct knem_send_lid {
	struct list_head lid_list_elt; /* links into the owning knem_pid lid_list_head */
	lid_id_t id; /* taken from knem_pid next_lid when queued, stored in the upper 32 bits of the cookie */
	/* a struct knem_piovecs_desc is appended here */
};

/*
 * Description of a set of pinned iovecs.
 * Before knem_pin_iovecs_desc() runs, iovec_nr is the number of user iovecs
 * (stored as struct knem_cmd_param_iovec right after the piovecs[] slots).
 * Afterwards, iovec_nr is the number of piovecs[] slots that own a page array.
 */
struct knem_piovecs_desc {
	unsigned long iovec_nr;
	struct knem_pinned_iovec {
		unsigned long aligned_vaddr; /* user base address rounded down to a page boundary */
		unsigned first_page_offset; /* offset of the first byte within the first page */
		unsigned long len; /* length in bytes, as given by the user iovec */
		unsigned long page_nr; /* number of entries of pages[] that are actually pinned */
		int vmalloced; /* non-zero if pages[] must be released with vfree() instead of kfree() */
		struct page ** pages; /* array of pinned pages */
	} piovecs[0];
};

/* How a receive work copies the data, chosen by knem_init_recv() */
enum knem_work_type {
	KNEM_WORK_MEMCPY_PINNED, /* memcpy between pinned send pages and pinned recv pages */
	KNEM_WORK_MEMCPY_TO_USER, /* cannot be offloaded */
	KNEM_WORK_DMACPY, /* DMA engine copy between pinned send pages and pinned recv pages */
};

/*
 * A receive work. knem_init_recv() allocates it together with a
 * struct knem_piovecs_desc describing the receive iovecs, reached through (work + 1).
 */
struct knem_work {
	struct list_head list_elt; /* links into a knem_pid kthread_work_list or dmacpy_cleanup_work_list */
	enum knem_work_type type;
	unsigned int flags; /* request flags, as passed to knem_init_recv() */
	struct knem_send_lid *send_lid; /* the dequeued send that this work receives from */
	knem_status_t *status; /* where the final status of the work is written */
	unsigned long status_offset; /* only valid for async requests */
	union {
#ifdef KNEM_HAVE_DMA_ENGINE
		struct {
			dma_cookie_t last_cookie; /* cookie polled by the dmacpy cleanup routines before freeing the work */
		} dmacpy;
#endif
	};
};

/*
 * Forward declarations of helpers that are called before their definition:
 * knem_free_work() by the dmacpy cleanup routines, knem_do_work() by knem_init_recv().
 */
static void knem_free_work(struct knem_work *work);
static void knem_do_work(struct knem_pid *pid, struct knem_work *work);

/*******
 * Misc
 */

/*
 * Translate the boolean module parameters into bits of the global
 * knem_force_flags and knem_ignore_flags masks.
 *
 * Returns 0 on success, or -EINVAL if some flag ends up
 * being both forced and ignored.
 */
static inline int
knem_setup_flags(void)
{
	unsigned int conflict;

	/* DMA support disabled: DMA requests are ignored */
	if (knem__dmacpy == 0) {
		dprintk("Adding DMA (0x%x) to ignored flags (%s)\n",
			(unsigned) KNEM_FLAG_DMA,
			knem__dmacpy == KNEM__DMACPY_DEFAULT ? "default" : "module param");
		knem_ignore_flags |= KNEM_FLAG_DMA;
	}

	/* receive-side pinning enforced */
	if (knem__pinrecv != 0) {
		dprintk("Adding PINRECV (0x%x) to forced flags (%s)\n",
			(unsigned) KNEM_FLAG_PINRECV,
			knem__pinrecv == KNEM__PINRECV_DEFAULT ? "default" : "module param");
		knem_force_flags |= KNEM_FLAG_PINRECV;
	}

	/* synchronous copy enforced: asynchronous requests are ignored */
	if (knem__sync != 0) {
		dprintk("Adding ANY_ASYNC_MASK (0x%x) to ignored flags (%s)\n",
			(unsigned) KNEM_FLAG_ANY_ASYNC_MASK,
			knem__sync == KNEM__SYNC_DEFAULT ? "default" : "module param");
		knem_ignore_flags |= KNEM_FLAG_ANY_ASYNC_MASK;
	}

	/* kernel thread support disabled: thread offload requests are ignored */
	if (knem__thread == 0) {
		dprintk("Adding ANY_THREAD_MASK (0x%x) to ignored flags (%s)\n",
			(unsigned) KNEM_FLAG_ANY_THREAD_MASK,
			knem__thread == KNEM__THREAD_DEFAULT ? "default" : "module param");
		knem_ignore_flags |= KNEM_FLAG_ANY_THREAD_MASK;
	}

	dprintk("Forcing flags 0x%x, ignoring 0x%x\n",
		knem_force_flags, knem_ignore_flags);

	/* a flag cannot be forced and ignored at the same time */
	conflict = knem_force_flags & knem_ignore_flags;
	if (conflict == 0)
		return 0;

	dprintk("Cannot ignore and force flags 0x%x\n",
		conflict);
	return -EINVAL;
}

/* Apply the global masks to request flags: set the forced bits, then clear the ignored bits */
#define KNEM_FIX_FLAGS(flags) (((flags) | knem_force_flags) & ~knem_ignore_flags)

/***********
 * Counters
 */

/* Global statistics, updated with knem_counter_inc() */
static struct knem_counters {
	unsigned long long submitted; /* calls to knem_init_recv() */
	unsigned long long processed_dma; /* receives set up for a DMA engine copy */
	unsigned long long processed_thread; /* receives offloaded to the kernel thread */
	unsigned long long failed_nomem; /* receive work allocation failures */
	unsigned long long failed_readcmd; /* failures to read the ioctl iovecs from user-space */
	unsigned long long failed_findlid; /* send cookies that did not match any queued send */
	unsigned long long failed_pin; /* receive-side pinning failures */
	unsigned long long failed_memcpytouser; /* failed copies from pinned pages to user-space */
	unsigned long long failed_memcpypinned; /* failed copies between pinned pages */
	unsigned long long failed_dmacpy; /* failed DMA copies */
} knem_counters;

/*
 * Counter updates are only serialized by a spinlock in debug builds.
 * Otherwise the lock macros expand to nothing and updates are unsynchronized.
 */
#ifdef KNEM_DRIVER_DEBUG
static spinlock_t knem_counters_spinlock;
#define knem_counters_lock_init()	spin_lock_init(&knem_counters_spinlock)
#define knem_counters_lock()		spin_lock(&knem_counters_spinlock)
#define knem_counters_unlock()		spin_unlock(&knem_counters_spinlock)
#else
#define knem_counters_lock_init()
#define knem_counters_lock()
#define knem_counters_unlock()
#endif

/* Increment the field called name in knem_counters */
#define knem_counter_inc(name) do {	\
	knem_counters_lock();		\
	knem_counters.name++;		\
	knem_counters_unlock();		\
} while (0)

/* Read the field called name in knem_counters, without taking the lock */
#define knem_counter_read(name) knem_counters.name

static void
knem_clear_counters(void)
{
	knem_counters_lock();
	memset(&knem_counters, 0, sizeof(knem_counters));
	knem_counters_unlock();
}

/* Prepare the statistics: the lock (if any) must exist before the first clear takes it */
static void
knem_init_counters(void)
{
	knem_counters_lock_init();

	/* start from all-zero counters */
	knem_clear_counters();
}

/************************
 * Pinning and unpinning
 */

#define KNEM_IOVEC_VMALLOC_PAGES_THRESHOLD 4096

/* always returns the pdesc in a coherent state that knem_unpin_iovecs_desc() can free */
static int
knem_pin_iovecs_desc(struct knem_piovecs_desc *pdesc, int write)
{
	struct knem_cmd_param_iovec *uiovec;
	struct knem_pinned_iovec *piovec;
	unsigned long uiovec_nr = pdesc->iovec_nr;
	unsigned long i;
	int err = 0;

	/* put pdesc in a coherent state */
	pdesc->iovec_nr = 0;

	uiovec = (void *) &pdesc->piovecs[uiovec_nr]; /* the user ioctl iovecs are stored at the very end */
	piovec = &pdesc->piovecs[0];

	for(i=0; i<uiovec_nr; i++, uiovec++) {
		unsigned long aligned_vaddr, offset, len, aligned_len, page_nr;
		struct page **pages;

		if (unlikely(!uiovec->len))
			continue;

		aligned_vaddr = uiovec->base & PAGE_MASK;
		len = uiovec->len;
		offset = uiovec->base & (~PAGE_MASK);
		aligned_len = PAGE_ALIGN(offset + len);
		page_nr = aligned_len >> PAGE_SHIFT;

		piovec->aligned_vaddr = aligned_vaddr;
		piovec->first_page_offset = offset;
		piovec->len = len;

		if (page_nr > KNEM_IOVEC_VMALLOC_PAGES_THRESHOLD) {
			pages = vmalloc(page_nr * sizeof(struct page *));
			piovec->vmalloced = 1;
		} else {
			pages = kmalloc(page_nr * sizeof(struct page *), GFP_KERNEL);
			piovec->vmalloced = 0;
		}
		if (unlikely(!pages)) {
			dprintk("Failed to allocate iovec array for %ld pages\n", page_nr);
			err = -ENOMEM;
			goto out;
		}

		/* keep pdesc in a coherent state */
		piovec->pages = pages;
		piovec->page_nr = 0;
		pdesc->iovec_nr++;

		down_read(&current->mm->mmap_sem);
		err = get_user_pages(current, current->mm, aligned_vaddr, page_nr,
				     write, 1, pages, NULL);
		up_read(&current->mm->mmap_sem);
		if (unlikely(err != page_nr)) {
			dprintk("Failed to pin iovec, got %d instead of %ld\n", err, page_nr);
			if (err > 0)
				piovec->page_nr = err;
			err = -EFAULT;
			goto out;
		}
		piovec->page_nr = page_nr;

		piovec++;
	}

 out:
	return err;
}

/*
 * Release everything that knem_pin_iovecs_desc() acquired in pdesc:
 * drop the reference on each pinned page and free each page array.
 * Only the first pdesc->iovec_nr slots are looked at, and only the first
 * page_nr pages of each of them. pdesc itself is not freed.
 */
static void
knem_unpin_iovecs_desc(struct knem_piovecs_desc *pdesc)
{
	struct knem_pinned_iovec *piovec, *piovec_max;
	unsigned long iovec_nr = pdesc->iovec_nr;
	unsigned long j; /* same type as page_nr, a signed int index could not cover all its values */

	piovec_max = &pdesc->piovecs[iovec_nr];
	for(piovec = &pdesc->piovecs[0]; piovec < piovec_max; piovec++) {
		struct page **pages = piovec->pages;
		for(j=0; j<piovec->page_nr; j++)
			put_page(pages[j]);
		/* the array must be released by the allocator it came from */
		if (piovec->vmalloced)
			vfree(pages);
		else
			kfree(pages);
	}
}

/*********************
 * Managing send lids
 */

/* Unpin the iovecs of a send lid and free it. The lid must not be queued in any list anymore. */
static void
knem_free_send_lid(struct knem_send_lid *lid)
{
	/* the description of the pinned iovecs is stored right behind the lid header */
	knem_unpin_iovecs_desc((struct knem_piovecs_desc *) (lid + 1));
	kfree(lid);
}

static void
knem_queue_send_lid(struct knem_pid * pid, struct knem_send_lid *lid)
{
	spin_lock(&pid->lid_list_lock);
	list_add_tail(&lid->lid_list_elt, &pid->lid_list_head);
	lid->id = pid->next_lid++;
	spin_unlock(&pid->lid_list_lock);
}

/*
 * Find the send lid designated by cookie and remove it from the lid list
 * of its owner, so that the caller becomes responsible for it.
 *
 * The lower 32 bits of the cookie select the knem_pids slot (relative to
 * knem_pid_base), the upper 32 bits are the lid id within that slot.
 *
 * Returns the lid, or ERR_PTR(-EINVAL) if the slot index is out of range,
 * if the slot has no file, or if no queued lid has this id.
 */
static struct knem_send_lid *
knem_find_dequeue_send_lid(knem_cookie_t cookie)
{
	const pid_id_t slot = KNEM_COOKIE_TO_PID_ID(cookie) - knem_pid_base;
	const lid_id_t wanted = KNEM_COOKIE_TO_LID_ID(cookie);
	struct knem_send_lid *found = ERR_PTR(-EINVAL);
	struct knem_send_lid *cur;
	struct knem_pid *owner;

	if (slot >= KNEM_PID_MAX) {
		dprintk("Invalid send lid pid #%lx\n",
			(unsigned long) slot);
		return found;
	}

	/* the pids lock is held during the whole lookup, the lid list lock nests inside it */
	spin_lock(&knem_pids_lock);

	owner = &knem_pids[slot];
	if (likely(owner->file)) {
		spin_lock(&owner->lid_list_lock);

		list_for_each_entry(cur, &owner->lid_list_head, lid_list_elt) {
			if (likely(cur->id == wanted)) {
				list_del(&cur->lid_list_elt);
				found = cur;
				break;
			}
		}
		if (IS_ERR(found))
			dprintk("Failed to find send lid #%lx\n",
				(unsigned long) wanted);

		spin_unlock(&owner->lid_list_lock);
	} else {
		dprintk("Failed to find send lid pid #%ld\n",
			(unsigned long) slot);
	}

	spin_unlock(&knem_pids_lock);

	return found;
}

/**********************************************************
 * Receiving by copying from pinned iovecs into user-space
 */

/*
 * Copy remaining bytes from pinned send pages into the user-space
 * buffer starting at recv_addr.
 *
 * send_page points to the first source page and send_first_page_offset is
 * the offset of the first byte within it. Following pages are read from
 * their beginning. Each page is mapped with kmap() while it is copied.
 *
 * Returns 0 on success, -EFAULT if copy_to_user() could not copy everything.
 */
static int
knem_memcpy_pinned_to_user(unsigned long recv_addr,
			   struct page ** send_page, unsigned send_first_page_offset,
			   unsigned long remaining)
{
	while (remaining != 0) {
		struct page *src_page = *send_page;
		unsigned long chunk;
		unsigned long not_copied;
		void *src;

		/* never read past the end of the current source page */
		chunk = (likely(send_first_page_offset + remaining > PAGE_SIZE))
			? PAGE_SIZE - send_first_page_offset
			: remaining;

		src = kmap(src_page);
		not_copied = copy_to_user((void __user *)(unsigned long) recv_addr,
					  src + send_first_page_offset,
					  chunk);
		kunmap(src_page);
		if (unlikely(not_copied)) {
			dprintk("Failed to write into recv user-space %lx-%ld\n",
				recv_addr, chunk);
			return -EFAULT;
		}

		/* next source page, read from its first byte */
		send_page++;
		send_first_page_offset = 0;
		recv_addr += chunk;
		remaining -= chunk;
	}

	return 0;
}

/*
 * Vectorial copy from the pinned send iovecs of send_pdesc into the
 * user-space receive iovecs of recv_iovec_array.
 *
 * Both lists are walked in parallel. Each step copies as much as both the
 * current send iovec and the current receive iovec allow, and the copy
 * stops as soon as either list is exhausted.
 * Both lists must contain at least one iovec.
 *
 * Returns 0 on success or the negative error of knem_memcpy_pinned_to_user().
 */
static int
knem_vectmemcpy_pinned_to_user(struct knem_cmd_param_iovec * recv_iovec_array,
			       unsigned long recv_iovec_nr,
			       struct knem_piovecs_desc *send_pdesc)
{
	/* cursor in the receive side, made of plain user-space addresses */
	struct knem_cmd_param_iovec *rvec = recv_iovec_array;
	struct knem_cmd_param_iovec *rvec_end = recv_iovec_array + recv_iovec_nr;
	unsigned long raddr = rvec->base;
	unsigned long rleft = rvec->len;
	/* cursor in the send side, made of pinned pages */
	struct knem_pinned_iovec *svec = send_pdesc->piovecs;
	struct knem_pinned_iovec *svec_end = svec + send_pdesc->iovec_nr;
	struct page **spages = svec->pages;
	unsigned soff = svec->first_page_offset;
	unsigned long sleft = svec->len;

	for (;;) {
		unsigned long chunk = min(rleft, sleft);
		int err;

		err = knem_memcpy_pinned_to_user(raddr, spages, soff, chunk);
		if (unlikely(err < 0))
			return err;

		if (chunk != sleft) {
			/* stay in the current send iovec, skipping the pages that were fully read */
			unsigned long end = soff + chunk;
			spages += end >> PAGE_SHIFT;
			soff = end & (~PAGE_MASK);
			sleft -= chunk;
		} else {
			/* current send iovec is consumed, switch to the next one if any */
			svec++;
			if (unlikely(svec == svec_end))
				break;
			spages = svec->pages;
			soff = svec->first_page_offset;
			sleft = svec->len;
		}

		if (chunk != rleft) {
			/* stay in the current recv iovec */
			raddr += chunk;
			rleft -= chunk;
		} else {
			/* current recv iovec is filled, switch to the next one if any */
			rvec++;
			if (unlikely(rvec == rvec_end))
				break;
			raddr = rvec->base;
			rleft = rvec->len;
		}
	}

	return 0;
}

/*
 * Process a KNEM_WORK_MEMCPY_TO_USER work: copy from the pinned send iovecs
 * directly into the (unpinned) user-space receive iovecs, then write
 * KNEM_STATUS_SUCCESS or KNEM_STATUS_FAILED into the work status.
 *
 * The receive descriptor of such a work only carries the number of user
 * iovecs and the user iovecs themselves. Its iovec_nr is reset to 0 here
 * since there is no pinned receive iovec to release.
 *
 * pid is not used.
 */
static void
knem_do_work_memcpy_to_user(struct knem_pid *pid,
			    struct knem_work *work)
{
	struct knem_piovecs_desc *recv_pdesc = (void *) (work + 1);
	struct knem_piovecs_desc *send_pdesc = (void *) (work->send_lid + 1);
	knem_status_t *status = work->status;
	const unsigned long recv_nr = recv_pdesc->iovec_nr;
	const unsigned long send_nr = send_pdesc->iovec_nr;
	/* the user ioctl iovecs are stored at the very end */
	struct knem_cmd_param_iovec *recv_uiovecs = (void *) &recv_pdesc->piovecs[recv_nr];
	int err = 0;

	/* put pdesc in a coherent state */
	recv_pdesc->iovec_nr = 0;

	if (likely(send_nr == 1 && recv_nr == 1)) {
		/* optimize the contiguous case */
		struct knem_pinned_iovec *src = &send_pdesc->piovecs[0];
		unsigned long len = min((unsigned long) recv_uiovecs[0].len, src->len);

		err = knem_memcpy_pinned_to_user(recv_uiovecs[0].base,
						 src->pages, src->first_page_offset,
						 len);
	} else if (likely(send_nr > 0 && recv_nr > 0)) {
		/* generic vectorial case */
		err = knem_vectmemcpy_pinned_to_user(recv_uiovecs, recv_nr,
						     send_pdesc);
	}
	/* nothing to copy if one side has no iovec at all */

	if (likely(!err)) {
		*status = KNEM_STATUS_SUCCESS;
	} else {
		knem_counter_inc(failed_memcpytouser);
		*status = KNEM_STATUS_FAILED;
	}
}

/*********************************************
 * Receiving by copying between pinned iovecs
 */

/*
 * Copy remaining bytes from pinned send pages to pinned recv pages.
 *
 * recv_page/send_page point to the first page of each side and the
 * *_first_page_offset arguments give the offset of the first byte within it.
 * Pages are mapped with kmap_atomic(), the send side in the KM_USER0 slot
 * and the recv side in the KM_USER1 slot. A page is only remapped when the
 * copy crosses into the next page of that side.
 *
 * Always returns 0.
 */
static int
knem_memcpy_pinned(struct page **recv_page, unsigned recv_first_page_offset,
		   struct page **send_page, unsigned send_first_page_offset,
		   unsigned long remaining)
{
	void *src = kmap_atomic(*send_page, KM_USER0);
	void *dst = kmap_atomic(*recv_page, KM_USER1);

	for (;;) {
		unsigned long chunk = remaining;

		/* a chunk must fit in both the current send page and the current recv page */
		if (likely(send_first_page_offset + chunk > PAGE_SIZE))
			chunk = PAGE_SIZE - send_first_page_offset;
		if (likely(recv_first_page_offset + chunk > PAGE_SIZE))
			chunk = PAGE_SIZE - recv_first_page_offset;

		memcpy(dst + recv_first_page_offset,
		       src + send_first_page_offset,
		       chunk);

		remaining -= chunk;
		if (unlikely(remaining == 0))
			break;

		if (chunk != PAGE_SIZE - send_first_page_offset) {
			send_first_page_offset += chunk;
		} else {
			/* the send page was read up to its end, map the next one */
			kunmap_atomic(src, KM_USER0);
			send_page++;
			send_first_page_offset = 0;
			src = kmap_atomic(*send_page, KM_USER0);
		}

		if (chunk != PAGE_SIZE - recv_first_page_offset) {
			recv_first_page_offset += chunk;
		} else {
			/* the recv page was written up to its end, map the next one */
			kunmap_atomic(dst, KM_USER1);
			recv_page++;
			recv_first_page_offset = 0;
			dst = kmap_atomic(*recv_page, KM_USER1);
		}
	}

	kunmap_atomic(src, KM_USER0);
	kunmap_atomic(dst, KM_USER1);

	return 0;
}

/*
 * Vectorial copy from the pinned send iovecs of send_pdesc into the pinned
 * receive iovecs of recv_pdesc.
 *
 * Both lists are walked in parallel. Each step copies as much as both the
 * current send iovec and the current receive iovec allow, and the copy
 * stops as soon as either list is exhausted.
 * Both descriptors must contain at least one iovec.
 *
 * Returns 0 on success or the negative error of knem_memcpy_pinned().
 */
static int
knem_vectmemcpy_pinned(struct knem_piovecs_desc *recv_pdesc,
		       struct knem_piovecs_desc *send_pdesc)
{
	/* cursor in the receive side */
	struct knem_pinned_iovec *rvec = recv_pdesc->piovecs;
	struct knem_pinned_iovec *rvec_end = rvec + recv_pdesc->iovec_nr;
	struct page **rpages = rvec->pages;
	unsigned roff = rvec->first_page_offset;
	unsigned long rleft = rvec->len;
	/* cursor in the send side */
	struct knem_pinned_iovec *svec = send_pdesc->piovecs;
	struct knem_pinned_iovec *svec_end = svec + send_pdesc->iovec_nr;
	struct page **spages = svec->pages;
	unsigned soff = svec->first_page_offset;
	unsigned long sleft = svec->len;

	for (;;) {
		unsigned long chunk = min(rleft, sleft);
		int err;

		err = knem_memcpy_pinned(rpages, roff, spages, soff, chunk);
		if (unlikely(err < 0))
			return err;

		if (chunk != sleft) {
			/* stay in the current send iovec, skipping the pages that were fully read */
			unsigned long end = soff + chunk;
			spages += end >> PAGE_SHIFT;
			soff = end & (~PAGE_MASK);
			sleft -= chunk;
		} else {
			/* current send iovec is consumed, switch to the next one if any */
			svec++;
			if (unlikely(svec == svec_end))
				break;
			spages = svec->pages;
			soff = svec->first_page_offset;
			sleft = svec->len;
		}

		if (chunk != rleft) {
			/* stay in the current recv iovec, skipping the pages that were fully written */
			unsigned long end = roff + chunk;
			rpages += end >> PAGE_SHIFT;
			roff = end & (~PAGE_MASK);
			rleft -= chunk;
		} else {
			/* current recv iovec is filled, switch to the next one if any */
			rvec++;
			if (unlikely(rvec == rvec_end))
				break;
			rpages = rvec->pages;
			roff = rvec->first_page_offset;
			rleft = rvec->len;
		}
	}

	return 0;
}

/*
 * Process a KNEM_WORK_MEMCPY_PINNED work: memcpy from the pinned send iovecs
 * to the pinned receive iovecs, then write KNEM_STATUS_SUCCESS or
 * KNEM_STATUS_FAILED into the work status.
 *
 * pid is not used.
 */
static void
knem_do_work_memcpy_pinned(struct knem_pid *pid,
			   struct knem_work *work)
{
	struct knem_piovecs_desc *recv_pdesc = (void *) (work + 1);
	struct knem_piovecs_desc *send_pdesc = (void *) (work->send_lid + 1);
	knem_status_t *status = work->status;
	int err = 0;

	if (likely(send_pdesc->iovec_nr == 1 && recv_pdesc->iovec_nr == 1)) {
		/* optimize the contiguous case */
		struct knem_pinned_iovec *src = &send_pdesc->piovecs[0];
		struct knem_pinned_iovec *dst = &recv_pdesc->piovecs[0];
		unsigned long len = min(dst->len, src->len);

		err = knem_memcpy_pinned(dst->pages, dst->first_page_offset,
					 src->pages, src->first_page_offset,
					 len);
	} else if (likely(send_pdesc->iovec_nr > 0 && recv_pdesc->iovec_nr > 0)) {
		/* generic vectorial case */
		err = knem_vectmemcpy_pinned(recv_pdesc, send_pdesc);
	}
	/* nothing to copy if one side has no iovec at all */

	if (likely(!err)) {
		*status = KNEM_STATUS_SUCCESS;
	} else {
		knem_counter_inc(failed_memcpypinned);
		*status = KNEM_STATUS_FAILED;
	}
}

/*************************************************
 * Receiving by DMA-copying between pinned iovecs
 */

#ifdef KNEM_HAVE_DMA_ENGINE

/*
 * Page holding constant status values that the DMA engine copies into the
 * status array of a process, and offset of the KNEM_STATUS_SUCCESS value within it.
 */
static struct page *knem_dmacpy_status_src_page;
static unsigned int knem_dmacpy_status_src_success_page_offset;

/*
 * Allocate the source page of DMA status updates and store
 * KNEM_STATUS_SUCCESS in its first slot.
 *
 * Returns 0 on success, -ENOMEM if the page cannot be allocated.
 */
static int
knem_dmacpy_init(void)
{
	struct page *page = alloc_page(GFP_KERNEL);
	knem_status_t *slots;

	knem_dmacpy_status_src_page = page;
	if (!page) {
		dprintk("Failed to allocate the dmacpy status page\n");
		return -ENOMEM;
	}

	/* the success value lives in slot 0, i.e. at offset 0 within the page */
	slots = (knem_status_t*)page_address(page);
	slots[0] = KNEM_STATUS_SUCCESS;
	knem_dmacpy_status_src_success_page_offset = 0;

	return 0;
}

/* Release the page that knem_dmacpy_init() allocated for DMA status updates */
static void
knem_dmacpy_exit(void)
{
	struct page *page = knem_dmacpy_status_src_page;

	__free_page(page);
}

/*
 * Copy remaining bytes from pinned send pages to pinned recv pages,
 * one page-bounded chunk at a time.
 *
 * Chunks of at most knem_dma_chunk_min bytes (dmamin module parameter) are
 * copied immediately with memcpy(). Larger chunks are submitted to the DMA
 * channel chan.
 *
 * Returns the cookie of the last submitted DMA copy, 0 if no DMA copy was
 * submitted, or the negative error of a failed submission.
 *
 * On success, this function does not call dma_async_memcpy_issue_pending()
 * and does not wait for the submitted copies, this is left to the caller.
 * On failure, the copies that were already submitted are issued and waited
 * for before returning.
 */
static dma_cookie_t
knem_dmacpy_pinned(struct dma_chan *chan,
		   struct page **recv_page, unsigned recv_first_page_offset,
		   struct page **send_page, unsigned send_first_page_offset,
		   unsigned long remaining)
{
	dma_cookie_t last_cookie = 0;
	int err;

	while (1) {
		unsigned long chunk = remaining;
		/* a chunk must fit in both the current send page and the current recv page */
		if (likely(send_first_page_offset + chunk > PAGE_SIZE))
			chunk = PAGE_SIZE - send_first_page_offset;
		if (likely(recv_first_page_offset + chunk > PAGE_SIZE))
			chunk = PAGE_SIZE - recv_first_page_offset;

		if (chunk <= knem_dma_chunk_min) {
			/* chunk is small, use a regular copy */
			void *send_addr = kmap_atomic(*send_page, KM_USER0);
			void *recv_addr = kmap_atomic(*recv_page, KM_USER1);
			memcpy(recv_addr + recv_first_page_offset,
			       send_addr + send_first_page_offset,
			       chunk);
			kunmap_atomic(send_addr, KM_USER0);
			kunmap_atomic(recv_addr, KM_USER1);
		} else {
			err = dma_async_memcpy_pg_to_pg(chan,
							*recv_page, recv_first_page_offset,
							*send_page, send_first_page_offset,
							chunk);
			if (err < 0)
				goto failure;
			last_cookie = err;
		}

		remaining -= chunk;
		if (unlikely(!remaining))
			break;

		if (chunk == PAGE_SIZE - send_first_page_offset) {
			send_page++;
			send_first_page_offset = 0;
		} else {
			send_first_page_offset += chunk;
		}

		if (chunk == PAGE_SIZE - recv_first_page_offset) {
			recv_page++;
			recv_first_page_offset = 0;
		} else {
			recv_first_page_offset += chunk;
		}
	}

	return last_cookie;

 failure:
	if (last_cookie) {
		/*
		 * complete pending DMA before returning the error.
		 * The submitted copies must be issued first: nothing guarantees that
		 * the engine has started them, and polling for the completion of a
		 * copy that was never issued would spin forever.
		 */
		dma_async_memcpy_issue_pending(chan);
		while (dma_async_memcpy_complete(chan, last_cookie, NULL, NULL) == DMA_IN_PROGRESS);
	}
	return err;
}

/*
 * Vectorial DMA copy from the pinned send iovecs of send_pdesc into the
 * pinned receive iovecs of recv_pdesc, using knem_dmacpy_pinned() on the
 * channel chan for each segment.
 *
 * Both lists are walked in parallel and the copy stops as soon as either
 * list is exhausted. Both descriptors must contain at least one iovec.
 *
 * Returns the cookie of the last submitted DMA copy, 0 if no DMA copy was
 * submitted at all, or a negative error. On failure, the copies that were
 * already submitted are issued and waited for before returning.
 */
static int
knem_vectdmacpy_pinned(struct dma_chan *chan,
		       struct knem_piovecs_desc *recv_pdesc,
		       struct knem_piovecs_desc *send_pdesc)
{
	struct knem_pinned_iovec * cur_recv_piovec = &recv_pdesc->piovecs[0];
	struct knem_pinned_iovec * recv_piovec_max = cur_recv_piovec + recv_pdesc->iovec_nr;
	struct page ** cur_recv_pages = cur_recv_piovec->pages;
	unsigned cur_recv_first_page_offset = cur_recv_piovec->first_page_offset;
	unsigned long cur_recv_len = cur_recv_piovec->len;
	struct knem_pinned_iovec * cur_send_piovec = &send_pdesc->piovecs[0];
	struct knem_pinned_iovec * send_piovec_max = cur_send_piovec + send_pdesc->iovec_nr;
	struct page ** cur_send_pages = cur_send_piovec->pages;
	unsigned cur_send_first_page_offset = cur_send_piovec->first_page_offset;
	unsigned long cur_send_len = cur_send_piovec->len;
	dma_cookie_t last_cookie = 0;
	int err;

	while (1) {
		unsigned long chunk = min(cur_recv_len, cur_send_len);

		err = knem_dmacpy_pinned(chan,
					 cur_recv_pages, cur_recv_first_page_offset,
					 cur_send_pages, cur_send_first_page_offset,
					 chunk);
		if (unlikely(err < 0))
			goto failure;
		/*
		 * knem_dmacpy_pinned() returns 0 when the whole segment was memcpy'ed.
		 * Do not let that forget the cookie of an earlier segment whose DMA
		 * copies are still in flight.
		 */
		if (err > 0)
			last_cookie = err;

		if (chunk == cur_send_len) {
			/* next send iovec */
			cur_send_piovec++;
			if (unlikely(cur_send_piovec == send_piovec_max))
				break;
			cur_send_pages = cur_send_piovec->pages;
			cur_send_first_page_offset = cur_send_piovec->first_page_offset;
			cur_send_len = cur_send_piovec->len;
		} else {
			/* advance in current send iovec */
			cur_send_pages += ((cur_send_first_page_offset + chunk) >> PAGE_SHIFT);
			cur_send_first_page_offset = (cur_send_first_page_offset + chunk) & (~PAGE_MASK);
			cur_send_len -= chunk;
		}

		if (chunk == cur_recv_len) {
			/* next recv iovec */
			cur_recv_piovec++;
			if (unlikely(cur_recv_piovec == recv_piovec_max))
				break;
			cur_recv_pages = cur_recv_piovec->pages;
			cur_recv_first_page_offset = cur_recv_piovec->first_page_offset;
			cur_recv_len = cur_recv_piovec->len;
		} else {
			/* advance in current recv iovec */
			cur_recv_pages += ((cur_recv_first_page_offset + chunk) >> PAGE_SHIFT);
			cur_recv_first_page_offset = (cur_recv_first_page_offset + chunk) & (~PAGE_MASK);
			cur_recv_len -= chunk;
		}
	}

	return last_cookie;

 failure:
	if (last_cookie) {
		/*
		 * complete pending DMA before returning the error.
		 * The copies submitted for earlier segments must be issued first,
		 * otherwise polling for their completion could spin forever.
		 */
		dma_async_memcpy_issue_pending(chan);
		while (dma_async_memcpy_complete(chan, last_cookie, NULL, NULL) == DMA_IN_PROGRESS);
	}
	return err;
}

static int
knem_do_work_dmacpy_pinned(struct knem_pid *pid,
			   struct knem_work *work)
{
	struct knem_piovecs_desc *recv_pdesc = (void *) (work + 1);
	struct knem_send_lid *send_lid = work->send_lid;
	struct knem_piovecs_desc * send_pdesc = (void *)(send_lid + 1);
	knem_status_t *status = work->status;
	struct dma_chan *chan  = pid->dmacpy_chan;
	int ret = 0;
	int err = 0;

	if (likely(send_pdesc->iovec_nr == 1 && recv_pdesc->iovec_nr == 1)) {
		/* optimize the contigous case */
		struct knem_pinned_iovec *send_piovec = &send_pdesc->piovecs[0];
		struct knem_pinned_iovec *recv_piovec = &recv_pdesc->piovecs[0];
		unsigned long len = min(recv_piovec->len, send_piovec->len);
		err = knem_dmacpy_pinned(chan,
					 recv_piovec->pages, recv_piovec->first_page_offset,
					 send_piovec->pages, send_piovec->first_page_offset,
					 len);

	} else if (likely(send_pdesc->iovec_nr > 0 && recv_pdesc->iovec_nr > 0)) {
		/* generic vectorial case */
		err = knem_vectdmacpy_pinned(chan, recv_pdesc, send_pdesc);
	}

	if (unlikely(err < 0)) {
		/* got an error, all submitted copies have been waited for */
		knem_counter_inc(failed_dmacpy);
		*status = KNEM_STATUS_FAILED;

	} else if (unlikely(!err)) {
		/* no copy submitted, we are done */
		*status = KNEM_STATUS_SUCCESS;

	} else {
		/* some copies were submitted */
		dma_cookie_t last_cookie = err;

		if (!(work->flags & KNEM_FLAG_ASYNCDMACOMPLETE)) {
			/* synchronous wait mode, wait for completion and set the status directly */
 sync:
			dma_async_memcpy_issue_pending(chan);
			while (dma_async_memcpy_complete(chan, last_cookie, NULL, NULL) == DMA_IN_PROGRESS);
			*status = KNEM_STATUS_SUCCESS;
		} else {
			/* asynchronous wait mode, queue the status update as well */
			unsigned long status_offset = work->status_offset;
			struct page *status_page = pid->status_pages[status_offset>>PAGE_SHIFT];
			unsigned status_page_offset = status_offset & ~PAGE_MASK;

			err = dma_async_memcpy_pg_to_pg(chan,
							status_page, status_page_offset,
							knem_dmacpy_status_src_page, knem_dmacpy_status_src_success_page_offset,
							sizeof(knem_status_t));
			if (err < 0)
				/* failed to queue the async status update, revert to sync */
				goto sync;

			dma_async_memcpy_issue_pending(chan);
			work->dmacpy.last_cookie = err;
			ret = 1; /* tell the caller to queue for deferred cleanup */
		}
	}

	return ret;
}

static void
knem_dmacpy_partial_cleanup(struct knem_pid *pid)
{
	struct knem_work *work;

	spin_lock(&pid->dmacpy_cleanup_work_lock);
	if (!list_empty(&pid->dmacpy_cleanup_work_list)) {
		dma_cookie_t done, used;

		/* see if the first work is done */
		work = list_entry(pid->dmacpy_cleanup_work_list.next, struct knem_work, list_elt);
		if (dma_async_memcpy_complete(pid->dmacpy_chan, work->dmacpy.last_cookie, &done, &used) != DMA_IN_PROGRESS) {
			/* cleanup this first work */
			list_del(&work->list_elt);
			spin_unlock(&pid->dmacpy_cleanup_work_lock);
			knem_free_work(work);
			spin_lock(&pid->dmacpy_cleanup_work_lock);

			/* see if the next works are in the same done-used interval */
			while (!list_empty(&pid->dmacpy_cleanup_work_list)) {
				work = list_entry(pid->dmacpy_cleanup_work_list.next, struct knem_work, list_elt);
				if (dma_async_is_complete(work->dmacpy.last_cookie, done, used) == DMA_IN_PROGRESS)
					break;

				list_del(&work->list_elt);
				spin_unlock(&pid->dmacpy_cleanup_work_lock);
				knem_free_work(work);
				spin_lock(&pid->dmacpy_cleanup_work_lock);
			}
		}
	}

	spin_unlock(&pid->dmacpy_cleanup_work_lock);
}

static void
knem_dmacpy_full_cleanup(struct knem_pid *pid)
{
	struct knem_work *work;

	/* kthread and apps are gone, no need to lock the work list */
	if (!list_empty(&pid->dmacpy_cleanup_work_list)) {
		dma_cookie_t done, used;

		/* wait until the last work is done */
		work = list_entry(pid->dmacpy_cleanup_work_list.prev, struct knem_work, list_elt);
		while (dma_async_memcpy_complete(pid->dmacpy_chan, work->dmacpy.last_cookie, &done, &used) == DMA_IN_PROGRESS);

		/* cleanup all works now */
		while (!list_empty(&pid->dmacpy_cleanup_work_list)) {
			work = list_entry(pid->dmacpy_cleanup_work_list.next, struct knem_work, list_elt);
			list_del(&work->list_elt);
			knem_free_work(work);
		}
	}
}

#else /* KNEM_HAVE_DMA_ENGINE */

/* No DMA engine support was compiled in: there is nothing to set up, always succeeds */
static inline int
knem_dmacpy_init(void)
{
	return 0;
}

/* No DMA engine support was compiled in: there is nothing to release */
static inline void
knem_dmacpy_exit(void)
{
	/* do nothing */
}

#endif /* !KNEM_HAVE_DMA_ENGINE */

/********************************
 * Common routines for receiving
 */

/*
 * Start a receive into the uiovec_nr user iovecs of uiovec_array, from the
 * send designated by send_cookie.
 *
 * The receive work is allocated as a single buffer containing, in order:
 * the struct knem_work, a struct knem_piovecs_desc, uiovec_nr pinned iovec
 * slots, and a copy of the uiovec_nr user iovecs.
 *
 * The copy method depends on flags:
 * - DMA engine copy if KNEM_FLAG_DMA is set and pid has a DMA channel
 *   (receive pages are pinned, offloaded if KNEM_FLAG_DMATHREAD is set);
 * - direct copy to user-space if neither KNEM_FLAG_MEMCPYTHREAD nor
 *   KNEM_FLAG_PINRECV is set (no pinning on the receive side);
 * - memcpy between pinned pages otherwise.
 * The work is then either queued for the kernel thread of pid or processed
 * right away by knem_do_work().
 *
 * status is set to KNEM_STATUS_PENDING once the send has been found.
 * status_offset is saved in the work for asynchronous status updates.
 *
 * Returns 0 once the work has been queued or processed, or a negative errno
 * (-ENOMEM, -EFAULT, -EINVAL, or the error of knem_pin_iovecs_desc()).
 * In case of pinning failure, the dequeued send is freed as well.
 */
static int
knem_init_recv(struct knem_pid * pid,
	       void __user * uiovec_array, unsigned long uiovec_nr,
	       knem_cookie_t send_cookie,
	       knem_status_t * status, unsigned status_offset,
	       unsigned int flags)
{
	struct knem_work *work;
	struct knem_piovecs_desc * recv_pdesc;
	struct knem_cmd_param_iovec *uiovecs;
	struct knem_send_lid * send_lid;
	/* size of the fixed part of the work buffer, and of what each iovec adds to it */
	const unsigned long fixed_size = sizeof(*work) + sizeof(*recv_pdesc);
	const unsigned long per_iovec_size = sizeof(struct knem_pinned_iovec)
					     + sizeof(struct knem_cmd_param_iovec);
	int offload = flags & KNEM_FLAG_MEMCPYTHREAD;
	int pinrecv = flags & KNEM_FLAG_PINRECV;
	int err;

	knem_counter_inc(submitted);

	/*
	 * uiovec_nr comes from user-space. If the size computation below wrapped
	 * around, the buffer would be too small for uiovec_nr iovecs and the
	 * copy of the user iovecs would be written outside of it.
	 * Such a request is handled like any other allocation that cannot be satisfied.
	 */
	if (unlikely(uiovec_nr > (ULONG_MAX - fixed_size) / per_iovec_size)) {
		dprintk("Too many recv iovecs (%lu)\n", uiovec_nr);
		knem_counter_inc(failed_nomem);
		err = -ENOMEM;
		goto out;
	}

	work = kmalloc(fixed_size + uiovec_nr * per_iovec_size, GFP_KERNEL);
	if (unlikely(!work)) {
		dprintk("Failed to allocate recv work\n");
		knem_counter_inc(failed_nomem);
		err = -ENOMEM;
		goto out;
	}
	recv_pdesc = (void *) (work + 1);
	recv_pdesc->iovec_nr = 0; /* make sure the recv_pdesc is in a coherent state */

	uiovecs = (void *) &recv_pdesc->piovecs[uiovec_nr]; /* store the user ioctl uiovecs at the very end */
	err = copy_from_user(uiovecs, uiovec_array,
			     uiovec_nr * sizeof(struct knem_cmd_param_iovec));
	if (unlikely(err)) {
		dprintk("Failed to read recv ioctl iovecs from user-space\n");
		knem_counter_inc(failed_readcmd);
		err = -EFAULT;
		goto out_with_work;
	}

	/* from now on, the send lid is not queued anymore and belongs to this receive */
	send_lid = knem_find_dequeue_send_lid(send_cookie);
	if (IS_ERR(send_lid)) {
		knem_counter_inc(failed_findlid);
		err = PTR_ERR(send_lid);
		goto out_with_piovecs;
	}

	*status = KNEM_STATUS_PENDING;

	work->flags = flags;
	work->send_lid = send_lid;
	work->status = status;
	work->status_offset = status_offset;

	/* prepare the work depending on its type (pinning, ...) */
#ifdef KNEM_HAVE_DMA_ENGINE
	if (pid->dmacpy_chan && (flags & KNEM_FLAG_DMA)) {
		/* use DMA engine for copying */
		work->type = KNEM_WORK_DMACPY;
		offload = flags & KNEM_FLAG_DMATHREAD;

		/* need to pin on the recv side first */
		recv_pdesc->iovec_nr = uiovec_nr; /* store the number of user iovecs for knem_pin_iovecs_desc() */
		err = knem_pin_iovecs_desc(recv_pdesc, 1 /* write */);
		if (unlikely(err < 0)) {
			knem_counter_inc(failed_pin);
			goto out_with_send_lid;
		}

		knem_counter_inc(processed_dma);
	} else
#endif
	if (!offload && !pinrecv) {
		/* if not offloading, we can memcpy to user-space without pinning on the receive side */
		work->type = KNEM_WORK_MEMCPY_TO_USER;

		/* no need to pin */

		recv_pdesc->iovec_nr = uiovec_nr; /* store the number of user iovecs for knem_do_work_memcpy_to_user() */

	} else {
		/* fallback to pinned copy */
		work->type = KNEM_WORK_MEMCPY_PINNED;

		/* need to pin on the recv side first */
		recv_pdesc->iovec_nr = uiovec_nr; /* store the number of user iovecs for knem_pin_iovecs_desc() */
		err = knem_pin_iovecs_desc(recv_pdesc, 1 /* write */);
		if (unlikely(err < 0)) {
			knem_counter_inc(failed_pin);
			goto out_with_send_lid;
		}
	}

	/* actually perform or offload the work */
	if (offload) {
		/* offload the work in the kthread and let it work and free everything */
		spin_lock(&pid->kthread_work_lock);
		list_add_tail(&work->list_elt, &pid->kthread_work_list);
		spin_unlock(&pid->kthread_work_lock);
		wake_up(&pid->kthread_work_wq);
		/* the work must not be accessed anymore, the kthread may already have freed it */
		knem_counter_inc(processed_thread);

	} else {
		/* synchronous work, and free everything */
		knem_do_work(pid, work);
	}

	return 0;

 out_with_send_lid:
	knem_free_send_lid(send_lid);
 out_with_piovecs:
	/* releases whatever was pinned so far, nothing if iovec_nr is still 0 */
	knem_unpin_iovecs_desc(recv_pdesc);
 out_with_work:
	kfree(work);
 out:
	return err;
}

/**********
 * Sending
 */

/*
 * Declare a send: copy the user iovecs, pin them read-only, queue the
 * resulting send lid on the pid and return its cookie in *cookie.
 *
 * A single allocation holds the lid, its piovecs descriptor, the pinned
 * iovecs and, at the very end, a copy of the user iovecs.
 *
 * Returns 0 on success or a negative errno; *cookie is only written on
 * success.
 */
static int
knem_init_send(struct knem_pid * pid,
	       void __user * uiovec_array, unsigned long uiovec_nr,
	       knem_cookie_t *cookie,
	       unsigned int flags)
{
	struct knem_cmd_param_iovec *uiovecs;
	struct knem_send_lid *lid;
	struct knem_piovecs_desc *pdesc;
	int err;

	/* uiovec_nr comes straight from user-space: refuse values for which the
	 * size computations below would wrap around, otherwise the allocation
	 * would be too small for the iovecs copied into it.
	 */
	if (unlikely(uiovec_nr > (ULONG_MAX - sizeof(*lid) - sizeof(*pdesc))
				 / (sizeof(struct knem_pinned_iovec)
				    + sizeof(struct knem_cmd_param_iovec)))) {
		dprintk("Too many send iovecs (%lu)\n", uiovec_nr);
		err = -ENOMEM;
		goto out;
	}

	lid = kmalloc(sizeof(*lid) + sizeof(*pdesc)
		      + uiovec_nr * sizeof(struct knem_pinned_iovec)
		      + uiovec_nr * sizeof(struct knem_cmd_param_iovec), GFP_KERNEL);
	if (unlikely(!lid)) {
		dprintk("Failed to allocate send lid\n");
		err = -ENOMEM;
		goto out;
	}
	pdesc = (void *)(lid + 1);
	pdesc->iovec_nr = 0; /* make sure the lid is in a coherent state */

	uiovecs = (void *) &pdesc->piovecs[uiovec_nr]; /* store the user ioctl uiovecs at the very end */
	err = copy_from_user(uiovecs, uiovec_array,
			     uiovec_nr * sizeof(struct knem_cmd_param_iovec));
	if (unlikely(err)) {
		dprintk("Failed to read send ioctl iovecs from user-space\n");
		err = -EFAULT;
		goto out_with_lid;
	}

	pdesc->iovec_nr = uiovec_nr; /* store the number of user iovecs for knem_pin_iovecs_desc() */
	err = knem_pin_iovecs_desc(pdesc, 0 /* read-only */);
	if (unlikely(err < 0))
		goto out_with_lid;

	knem_queue_send_lid(pid, lid);

	*cookie = KNEM_BUILD_COOKIE(pid, lid);

	return 0;

 out_with_lid:
	knem_free_send_lid(lid);
 out:
	return err;
}

/******************
 * Kthread routine
 */

/*
 * Release everything attached to a work: its send lid, the receive side
 * iovecs, and finally the work itself.
 */
static void
knem_free_work(struct knem_work *work)
{
	/* the receive descriptor sits right behind the work, in the same allocation */
	struct knem_piovecs_desc *pdesc = (struct knem_piovecs_desc *)(work + 1);

	knem_free_send_lid(work->send_lid);
	knem_unpin_iovecs_desc(pdesc);

	/* this also releases the descriptor since they share one allocation */
	kfree(work);
}

/*
 * Execute one work according to its type, then free it unless the copy
 * was handed to the DMA engine, in which case the work is parked on the
 * pid's DMA cleanup list to be released later.
 */
static void
knem_do_work(struct knem_pid *pid, struct knem_work *work)
{
	int deferred = 0; /* set when the work must outlive this call */

	switch (work->type) {

	case KNEM_WORK_MEMCPY_PINNED:
		knem_do_work_memcpy_pinned(pid, work);
		break;

	case KNEM_WORK_MEMCPY_TO_USER:
		knem_do_work_memcpy_to_user(pid, work);
		break;

#ifdef KNEM_HAVE_DMA_ENGINE
	case KNEM_WORK_DMACPY:
		if (knem_do_work_dmacpy_pinned(pid, work)) {
			/* the copy was offloaded, it will be cleaned up later */
			spin_lock(&pid->dmacpy_cleanup_work_lock);
			list_add_tail(&work->list_elt, &pid->dmacpy_cleanup_work_list);
			spin_unlock(&pid->dmacpy_cleanup_work_lock);
			deferred = 1;
		}
		break;
#endif /* KNEM_HAVE_DMA_ENGINE */

	default:
		BUG();
		return; /* an unknown work type is never freed here */
	}

	if (!deferred)
		knem_free_work(work);
}

/*
 * Main routine of the per-pid kernel thread.
 *
 * data is the struct knem_pid this thread serves. The thread binds itself
 * to pid->kthread_cpumask, then loops until kthread_should_stop(): it
 * processes every work queued on pid->kthread_work_list, and sleeps for
 * at most HZ jiffies on pid->kthread_work_wq when the list is empty.
 * Always returns 0.
 */
static int
knem_kthread_func(void *data)
{
	struct knem_pid *pid = data;
	DECLARE_WAITQUEUE(wait, current);
#define KNEM_CPUS_STR_LEN 64
	char cpus_str[KNEM_CPUS_STR_LEN];
	int err;

	/* buf len is KNEM_CPUS_STR_LEN, need 9 chars per 32 bits. */
	BUILD_BUG_ON((NR_CPUS/32 * 9) > (KNEM_CPUS_STR_LEN-1));
	/* print the cpumask the thread starts with, i.e. before the binding below */
	knem_cpumask_scnprintf(cpus_str, sizeof(cpus_str), &current->cpus_allowed);
	dprintk("Starting kthread for pid #%lx, with cpumask %s\n",
		(unsigned long) (pid->id - knem_pid_base), cpus_str);

	/* bind the thread */
	err = set_cpus_allowed(current, pid->kthread_cpumask);
	if (err < 0)
		/* a binding failure is only reported, the thread keeps running */
		dprintk("Failed to bind kthread\n");

	/* the task state is set before looking at the work list, so that a
	 * wake_up() issued after a work was queued is not missed
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	while(!kthread_should_stop()) {
		struct knem_work *work;

		add_wait_queue(&pid->kthread_work_wq, &wait);

		spin_lock(&pid->kthread_work_lock);
		if (list_empty(&pid->kthread_work_list)) {
			/* nothing to do, sleep without holding the lock until
			 * woken up or until HZ jiffies have elapsed
			 */
			spin_unlock(&pid->kthread_work_lock);
			schedule_timeout(HZ);
			spin_lock(&pid->kthread_work_lock);
		} else {
			/* work is already pending, do not sleep */
			__set_current_state(TASK_RUNNING);
		}

		/* drain the list, dropping the lock while each work is processed */
		while (!list_empty(&pid->kthread_work_list)) {
			work = list_entry(pid->kthread_work_list.next, struct knem_work, list_elt);
			list_del(&work->list_elt);
			spin_unlock(&pid->kthread_work_lock);
			knem_do_work(pid, work);
			spin_lock(&pid->kthread_work_lock);
		}
		spin_unlock(&pid->kthread_work_lock);

#ifdef KNEM_HAVE_DMA_ENGINE
		/* only pids that got a DMA channel may have offloaded copies to clean up */
		if (pid->dmacpy_chan)
			knem_dmacpy_partial_cleanup(pid);
#endif

		remove_wait_queue(&pid->kthread_work_wq, &wait);
		/* get ready to sleep again before re-checking the stop condition */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	dprintk("Stopping kthread for pid #%lx\n",
		(unsigned long) (pid->id - knem_pid_base));
	return 0;
}

/******************
 * File operations
 */

/*
 * open() handler: reserve a free entry of the knem_pids array for this
 * file and initialize it. Returns -EBUSY when every pid is in use.
 */
static int
knem_miscdev_open(struct inode * inode, struct file * file)
{
	struct knem_pid * pid;
	int slot;

	spin_lock(&knem_pids_lock);

	/* bail out early when no pid is available */
	if (!knem_pids_nr_free) {
		dprintk("No more pids available\n");
		spin_unlock(&knem_pids_lock);
		return -EBUSY;
	}

	/* pick the first pid without a file and mark it as used */
	slot = 0;
	while (slot < KNEM_PID_MAX && knem_pids[slot].file)
		slot++;
	pid = &knem_pids[slot];
	knem_pids_nr_free--;
	pid->file = file;
	file->private_data = pid;

	/* no status array and no kthread until the file gets mapped */
	pid->status_array = NULL;
	pid->status_index_max = 0;
	pid->kthread_task = NULL;

	/* send lids */
	spin_lock_init(&pid->lid_list_lock);
	INIT_LIST_HEAD(&pid->lid_list_head);
	pid->next_lid = jiffies; /* let's say it's random */

	/* kthread work queue */
	spin_lock_init(&pid->kthread_work_lock);
	INIT_LIST_HEAD(&pid->kthread_work_list);
	init_waitqueue_head(&pid->kthread_work_wq);

	dprintk("Using pid #%lx (%p), starting lid ids at offset %lx\n",
		(unsigned long) (pid->id - knem_pid_base), pid, (unsigned long) pid->next_lid);

	/* the kthread may run anywhere for now */
	cpus_setall(pid->kthread_cpumask);

	spin_unlock(&knem_pids_lock);

#ifdef KNEM_HAVE_DMA_ENGINE
	/* try to get a dma chan, and setup the matching cleanup list */
	spin_lock_init(&pid->dmacpy_cleanup_work_lock);
	INIT_LIST_HEAD(&pid->dmacpy_cleanup_work_list);
	pid->dmacpy_chan = knem_get_dma_channel();
#endif

	return 0;
}

/*
 * release() handler: destroy what is still attached to the pid (queued
 * send lids, works that were not processed, the kthread, the DMA channel,
 * the status array) and give the pid back to the pool of free pids.
 * Always returns 0.
 */
static int
knem_miscdev_release(struct inode * inode, struct file * file)
{
	struct knem_pid * pid = file->private_data;
	struct knem_send_lid * lid, * nlid;
	struct knem_work * work, * nwork;
	void * status_array;
	void * status_pages;
	int i;

	BUG_ON(!pid);
	file->private_data = NULL;

	/* release unused send lids.
	 * take the lock since other pids might still access the list.
	 */
	spin_lock(&pid->lid_list_lock);
	i=0;
	list_for_each_entry_safe(lid, nlid, &pid->lid_list_head, lid_list_elt) {
		dprintk("Destroying send lid %#lx (%p) on pid #%lx close\n",
			(unsigned long) lid->id, lid,
			(unsigned long) (pid->id - knem_pid_base));
		list_del(&lid->lid_list_elt);
		knem_free_send_lid(lid);
		i++;
	}
	spin_unlock(&pid->lid_list_lock);
	if (i)
		dprintk("Destroyed %d send lids while releasing pid #%lx\n",
			i, (unsigned long) (pid->id - knem_pid_base));

	/* stop the kthread */
	if (pid->kthread_task)
		kthread_stop(pid->kthread_task);

	/* kthread is gone, no need to lock the work list */
	i=0;
	list_for_each_entry_safe(work, nwork, &pid->kthread_work_list, list_elt) {
		struct knem_piovecs_desc * recv_pdesc = (void *)(work + 1);

		dprintk("Destroying work %p on pid #%lx close\n",
			work,
			(unsigned long) (pid->id - knem_pid_base));
		list_del(&work->list_elt);

		knem_free_send_lid(work->send_lid);
		knem_unpin_iovecs_desc(recv_pdesc);
		/* recv_pdesc is freed/allocated with the work */
		kfree(work);

		i++;
	}
	if (i)
		dprintk("Destroyed %d works while releasing pid #%lx\n",
			i, (unsigned long) (pid->id - knem_pid_base));

#ifdef KNEM_HAVE_DMA_ENGINE
	/* cleanup the pending work. don't check whether we got a dma_chan
	 * since the work list has been properly initialized
	 */
	knem_dmacpy_full_cleanup(pid);
	/* release the channel now that all works are done */
	knem_put_dma_channel(pid->dmacpy_chan);
#endif

	/* release the pid.
	 * only detach the status array while holding the spinlock: vfree()
	 * may sleep, so the actual freeing is done once the lock is dropped.
	 * status_pages is only meaningful when a status array was attached.
	 */
	spin_lock(&knem_pids_lock);
	knem_pids_nr_free++;
	pid->file = NULL;
	status_array = pid->status_array;
	status_pages = status_array ? pid->status_pages : NULL;
	pid->status_array = NULL;
	pid->status_index_max = 0;
	spin_unlock(&knem_pids_lock);

	if (status_array) {
		vfree(status_array);
		kfree(status_pages);
	}

	return 0;
}

/*
 * ioctl() handler, also used for compat ioctls.
 *
 * Supported commands:
 *  KNEM_CMD_GET_INFO:        report ABI version, features and forced/ignored flags
 *  KNEM_CMD_BIND_OFFLOAD:    choose the cpumask of the pid kthread; refused with
 *                            -EBUSY once a status array has been attached
 *  KNEM_CMD_INIT_SEND:       declare a send and return its cookie in the param
 *  KNEM_CMD_INIT_ASYNC_RECV: post a receive whose status goes in the status array
 *  KNEM_CMD_SYNC_RECV:       perform a receive with asynchronous flags masked
 *                            out and return its status in the param
 *
 * Returns 0 on success, -EFAULT when a parameter cannot be read from or
 * written back to user-space, -ENOSYS for an unknown command, or the error
 * reported by the command itself.
 */
static long
knem_miscdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	int err;

	switch (cmd) {

	case KNEM_CMD_GET_INFO: {
		struct knem_pid * pid = file->private_data;
		struct knem_cmd_info info;

		BUG_ON(!pid);

		info.abi = KNEM_ABI_VERSION;
		info.features = 0
#ifdef KNEM_HAVE_DMA_ENGINE
				| (knem__dmacpy && pid->dmacpy_chan ? KNEM_FEATURE_DMA : 0)
#endif
				;
		info.forced_flags = knem_force_flags;
		info.ignored_flags = knem_ignore_flags;
		err = copy_to_user((void __user *) arg, &info,
				   sizeof(info));
		if (unlikely(err)) {
			dprintk("Failed to write get_info ioctl user-space param\n");
			err = -EFAULT;
		}
		break;
	}

	case KNEM_CMD_BIND_OFFLOAD: {
		struct knem_pid * pid = file->private_data;
		struct knem_cmd_bind_offload bind_offload;

		BUG_ON(!pid);

		/* the kthread is started when the status array is attached */
		if (pid->status_array) {
			dprintk("Cannot bind offload after kthread startup\n");
			err = -EBUSY;
			goto out;
		}

		err = copy_from_user(&bind_offload, (void __user *) arg, sizeof(bind_offload));
		if (unlikely(err)) {
			dprintk("Failed to read bind-offload ioctl user-space param\n");
			err = -EFAULT;
			goto out;
		}

		switch (bind_offload.flags) {
		case KNEM_BIND_FLAG_CUSTOM:
			if (bind_offload.mask_len < sizeof(cpumask_t)) {
				/* short user mask, the missing bits are cleared */
				memset(&pid->kthread_cpumask, 0, sizeof(cpumask_t));
			} else if (bind_offload.mask_len > sizeof(cpumask_t)) {
				/* long user mask, the extra bits are ignored */
				bind_offload.mask_len = sizeof(cpumask_t);
			}
			err = copy_from_user(&pid->kthread_cpumask, (void __user *)(unsigned long) bind_offload.mask_ptr, bind_offload.mask_len);
			if (unlikely(err)) {
				dprintk("Failed to read bind-offload mask from ioctl user-space param\n");
				err = -EFAULT;
				goto out;
			}
			break;
		case KNEM_BIND_FLAG_CURRENT:
			pid->kthread_cpumask = current->cpus_allowed;
			break;
		case KNEM_BIND_FLAG_CURRENT_REVERSED:
			cpus_complement(pid->kthread_cpumask, current->cpus_allowed);
			break;
		default:
			dprintk("Unknown bind flags %ld\n", (unsigned long) bind_offload.flags);
			err = -EINVAL;
			goto out;
		}

		break;
	}

	case KNEM_CMD_INIT_SEND: {
		struct knem_pid * pid = file->private_data;
		struct knem_cmd_init_send_param send_param;
		unsigned int flags;

		BUG_ON(!pid);

		err = copy_from_user(&send_param, (void __user *) arg, sizeof(send_param));
		if (unlikely(err)) {
			dprintk("Failed to read send ioctl user-space param\n");
			err = -EFAULT;
			goto out;
		}

		flags = KNEM_FIX_FLAGS(send_param.flags);

		err = knem_init_send(pid,
				     (void __user *)(unsigned long) send_param.send_iovec_array, send_param.send_iovec_nr,
				     &send_param.send_cookie, flags);
		if (unlikely(err))
			goto out;

		/* the user mapping may have changed since the copy_from_user()
		 * above, so report a failure instead of crashing the kernel.
		 * NOTE(review): the send lid presumably stays queued until the
		 * file is released in that case - confirm.
		 */
		if (unlikely(copy_to_user((void __user *) arg, &send_param, sizeof(send_param)))) {
			dprintk("Failed to write send ioctl user-space param\n");
			err = -EFAULT;
		}

		break;
	}

	case KNEM_CMD_INIT_ASYNC_RECV: {
		struct knem_pid * pid = file->private_data;
		struct knem_cmd_init_async_recv_param recv_param;
		unsigned long status_offset;
		knem_status_t *status;
		unsigned int flags;

		BUG_ON(!pid);

		err = copy_from_user(&recv_param, (void __user *) arg, sizeof(recv_param));
		if (unlikely(err)) {
			dprintk("Failed to read recv ioctl user-space param\n");
			err = -EFAULT;
			goto out;
		}

		if (unlikely(!pid->status_array)) {
			/* not mapped yet */
			dprintk("Cannot post a recv without a mapped status array\n");
			err = -EINVAL;
			goto out;
		}

		if (unlikely(recv_param.status_index >= pid->status_index_max)) {
			/* invalid index */
			dprintk("Invalid status array index in recv ioctl\n");
			err = -EINVAL;
			goto out;
		}

		flags = KNEM_FIX_FLAGS(recv_param.flags);

		status = &pid->status_array[recv_param.status_index];
		status_offset = recv_param.status_index * sizeof(knem_status_t);

		err = knem_init_recv(pid,
				     (void __user *)(unsigned long) recv_param.recv_iovec_array, recv_param.recv_iovec_nr,
				     recv_param.send_cookie,
				     status, status_offset,
				     flags);
		break;
	}

	case KNEM_CMD_SYNC_RECV: {
		struct knem_pid * pid = file->private_data;
		struct knem_cmd_sync_recv_param recv_param;
		knem_status_t status;
		unsigned int flags;

		BUG_ON(!pid);

		err = copy_from_user(&recv_param, (void __user *) arg, sizeof(recv_param));
		if (unlikely(err)) {
			dprintk("Failed to read recv ioctl user-space param\n");
			err = -EFAULT;
			goto out;
		}

		/* no need for the status array to be mapped, status is returned in the param */

		flags = KNEM_FIX_FLAGS(recv_param.flags);

		/* synchronous may use pinning on send side only, or both */
		err = knem_init_recv(pid,
				     (void __user *)(unsigned long) recv_param.recv_iovec_array, recv_param.recv_iovec_nr,
				     recv_param.send_cookie,
				     &status, 0 /* status_offset ignored in sync mode */,
				     flags & ~KNEM_FLAG_ANY_ASYNC_MASK);
		if (unlikely(err)) {
			/* report the failure as is; status may not have been
			 * written at all, so it must not reach user-space
			 */
			goto out;
		}

		recv_param.status = status;

		/* the user mapping may have changed since the copy_from_user()
		 * above, so report a failure instead of crashing the kernel
		 */
		if (unlikely(copy_to_user((void __user *) arg, &recv_param, sizeof(recv_param)))) {
			dprintk("Failed to write recv ioctl user-space param\n");
			err = -EFAULT;
		}

		break;
	}

	default:
		dprintk("Cannot handle unknown ioctl command %d\n", cmd);
		err = -ENOSYS;
		break;
	}

 out:
	return err;
}

/*
 * mmap() handler: allocate the status array of the pid, map it in the
 * caller's address space and start the pid kthread.
 *
 * Only mappings at file offset KNEM_STATUS_ARRAY_FILE_OFFSET are accepted,
 * and a single status array may be attached to a pid (-EBUSY otherwise).
 * Returns 0 on success or a negative errno, in which case the buffer and
 * the pages array allocated here are freed exactly once.
 */
static int
knem_miscdev_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct knem_pid * pid = file->private_data;
	struct task_struct * kthread_task;
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long i;
	struct page **pages;
	void * buffer;
	int err;

	if (offset != KNEM_STATUS_ARRAY_FILE_OFFSET) {
		dprintk("Cannot mapped at file offset %lx\n", offset);
		err = -EINVAL;
		goto out;
	}

	buffer = knem_vmalloc_user(size);
	if (!buffer) {
		dprintk("Failed to allocate status array size %ld\n", size);
		err = -ENOMEM;
		goto out;
	}

	/* one struct page pointer per page of the buffer */
	pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		dprintk("Failed to allocate status array pages array size %ld\n",
			nr_pages);
		err = -ENOMEM;
		goto out_with_buffer;
	}

	for(i=0; i<size; i+=PAGE_SIZE)
		pages[i>>PAGE_SHIFT] = vmalloc_to_page(buffer + i);

	err = knem_remap_vmalloc_range(vma, buffer, 0);
	if (err < 0) {
		dprintk("Failed to remap vmalloc'ed status array, got error %d\n", err);
		goto out_with_pages;
	}
	/* the caller will unmap if need on error return below */

	spin_lock(&knem_pids_lock);
	if (pid->status_array) {
		dprintk("Cannot attached another status array\n");
		err = -EBUSY;
		goto out_with_pids_lock;
	}
	pid->status_array = buffer;
	pid->status_pages = pages;
	pid->status_index_max = size / sizeof(knem_status_t);
	spin_unlock(&knem_pids_lock);

	/* enforce binding mask if needed */
	if (knem_binding > 0)
		pid->kthread_cpumask = current->cpus_allowed;
	else if (knem_binding < 0)
		cpus_complement(pid->kthread_cpumask, current->cpus_allowed);

	kthread_task = kthread_run(knem_kthread_func, pid, "knem-pid-%lx",
				   (unsigned long) (pid->id - knem_pid_base));
	if (IS_ERR(kthread_task)) {
		err = PTR_ERR(kthread_task);
		dprintk("Failed to start pid kthread, error %d\n", err);
		goto out_with_pid_setup;
	}
	pid->kthread_task = kthread_task;

	return 0;

 out_with_pid_setup:
	/* only detach the status array from the pid here: pid->status_array and
	 * pid->status_pages are the very same buffer and pages that are freed
	 * below, once the spinlock has been dropped
	 */
	spin_lock(&knem_pids_lock);
	pid->status_array = NULL;
	pid->status_pages = NULL;
	pid->status_index_max = 0;
 out_with_pids_lock:
	spin_unlock(&knem_pids_lock);
 out_with_pages:
	kfree(pages);
 out_with_buffer:
	vfree(buffer);
 out:
	return err;
}

/*
 * read() handler: build a textual description of the driver (version, ABI,
 * forced/ignored flags, DMA engine and debug support, request counters)
 * and copy the part selected by *offp and count to user-space.
 *
 * Returns the number of bytes copied (0 at or past the end of the text),
 * -ENOMEM when the temporary buffer cannot be allocated, or -EFAULT when
 * the user buffer cannot be written. *offp is only advanced on success.
 */
static ssize_t
knem_miscdev_read(struct file* filp, char __user * buff, size_t count, loff_t* offp)
{
	ssize_t ret = 0;
	char *buffer, *tmp;
	unsigned int len, tmplen;

#define KNEM_DRIVER_STRING_LEN 1024
	buffer = kmalloc(KNEM_DRIVER_STRING_LEN, GFP_KERNEL);
	if (!buffer) {
		/* do not pretend that the end of the text was reached */
		ret = -ENOMEM;
		goto out;
	}

	tmp = buffer;
	len = 0;

	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  "knem " KNEM_VERSION_STR "\n");
	len += tmplen;
	tmp += tmplen;

	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  " Driver ABI=0x%x\n", KNEM_ABI_VERSION);
	len += tmplen;
	tmp += tmplen;

	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  " Flags: forcing 0x%x, ignoring 0x%x\n", knem_force_flags, knem_ignore_flags);
	len += tmplen;
	tmp += tmplen;

#ifdef CONFIG_NET_DMA
	if (!knem__dmacpy)
		tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
				  " DMAEngine: KernelSupported Disabled\n");
	else if (!knem_dma_channel_avail())
		tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
				  " DMAEngine: KernelSupported Enabled NoChannelAvailable\n");
	else
		tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
				  " DMAEngine: KernelSupported Enabled ChansAvail ChunkMin=%dB\n",
				  knem_dma_chunk_min);
#else
	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  " DMAEngine: NoKernelSupport\n");
#endif
	len += tmplen;
	tmp += tmplen;

#ifdef KNEM_DRIVER_DEBUG
	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  " Debug: BuiltIn %s\n",
			  knem_debug ? "Enabled" : "Disabled");
#else
	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  " Debug: NotBuilt\n");
#endif
	len += tmplen;
	tmp += tmplen;

	tmplen = snprintf(tmp, KNEM_DRIVER_STRING_LEN-len,
			  " Requests Submitted          : %lld\n"
			  " Requests Processed/DMA      : %lld\n"
			  " Requests Processed/Thread   : %lld\n"
			  " Requests Failed/NoMemory    : %lld\n"
			  " Requests Failed/ReadCmd     : %lld\n"
			  " Requests Failed/FindLID     : %lld\n"
			  " Requests Failed/Pin         : %lld\n"
			  " Requests Failed/MemcpyToUser: %lld\n"
			  " Requests Failed/MemcpyPinned: %lld\n"
			  " Requests Failed/DMACopy     : %lld\n",
			  knem_counter_read(submitted),
			  knem_counter_read(processed_dma),
			  knem_counter_read(processed_thread),
			  knem_counter_read(failed_nomem),
			  knem_counter_read(failed_readcmd),
			  knem_counter_read(failed_findlid),
			  knem_counter_read(failed_pin),
			  knem_counter_read(failed_memcpytouser),
			  knem_counter_read(failed_memcpypinned),
			  knem_counter_read(failed_dmacpy));
	len += tmplen;
	tmp += tmplen;

	/* nothing to return at or past the end of the text (ret is 0) */
	if (*offp < 0 || *offp >= len)
		goto out_with_buffer;

	/* clamp against what is left, without computing *offp + count
	 * which could wrap around
	 */
	if (count > (size_t) (len - *offp))
		count = len - *offp;

	if (copy_to_user(buff, buffer + *offp, count)) {
		/* nothing was reported as read, leave the offset alone */
		ret = -EFAULT;
		goto out_with_buffer;
	}

	*offp += count;
	ret = count;

 out_with_buffer:
	kfree(buffer);
 out:
	return ret;
}

/*
 * write() handler: any write from a task with CAP_SYS_ADMIN clears the
 * driver counters. The written data itself is not looked at, and the
 * whole count is reported as consumed. Other tasks get -EPERM.
 */
static ssize_t
knem_miscdev_write(struct file* filp, const char __user * buff, size_t count, loff_t* offp)
{
	if (capable(CAP_SYS_ADMIN)) {
		knem_clear_counters();
		return count;
	}

	return -EPERM;
}

/* File operations of the knem misc device. */
static struct file_operations
knem_miscdev_fops = {
	.owner		= THIS_MODULE,
	/* per-file pid setup and teardown */
	.open		= knem_miscdev_open,
	.release	= knem_miscdev_release,
	/* driver summary and counters */
	.read		= knem_miscdev_read,
	.write		= knem_miscdev_write,
	/* commands; the same handler serves compat ioctls */
	.unlocked_ioctl	= knem_miscdev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= knem_miscdev_ioctl,
#endif
	/* status array */
	.mmap		= knem_miscdev_mmap,
};

/* The "knem" misc device, registered with a dynamically assigned minor. */
static struct miscdevice
knem_miscdev = {
	.name	= "knem",
	.minor	= MISC_DYNAMIC_MINOR,
	.fops	= &knem_miscdev_fops,
};

/**************
 * Module core
 */

static int
knem_init(void)
{
	int err;
	int i;

	knem_init_counters();

	err = knem_setup_flags();
	if (err < 0)
		goto out;

	spin_lock_init(&knem_pids_lock);

	err = knem_dmacpy_init();
	if (err < 0)
		goto out;

	knem_pids = kmalloc(KNEM_PID_MAX * sizeof(struct knem_pid), GFP_KERNEL);
	if (!knem_pids) {
		dprintk("Failed to alloc the pid array\n");
		err = -ENOMEM;
		goto out_with_dmacpy;
	}

	knem_pid_base = jiffies; /* let's say it's random */;

	dprintk("Supporting %ld pids, with ids starting at offset %lx\n",
		(unsigned long) KNEM_PID_MAX, (unsigned long) knem_pid_base);

	for(i=0; i<KNEM_PID_MAX; i++) {
		knem_pids[i].file = NULL;
		knem_pids[i].id = knem_pid_base + i;
	}
	knem_pids_nr_free = KNEM_PID_MAX;

	err = misc_register(&knem_miscdev);
	if (err < 0) {
		dprintk("Failed to register misc device, error %d\n", err);
		goto out_with_pids;
	}

	printk(KERN_INFO "knem " KNEM_VERSION_STR ": initialized\n");

	return 0;

 out_with_pids:
	kfree(knem_pids);
 out_with_dmacpy:
	knem_dmacpy_exit();
 out:
	return err;
}
module_init(knem_init);

/*
 * Module exit point: undo knem_init() in reverse order.
 */
static void
knem_exit(void)
{
	printk(KERN_INFO "knem " KNEM_VERSION_STR ": terminating\n");
	/* unregister the device first, then free the pid array */
	misc_deregister(&knem_miscdev);
	kfree(knem_pids);
	/* DMA copy support was set up before the pid array, release it last */
	knem_dmacpy_exit();
}
module_exit(knem_exit);

/* Module metadata; version and name come from the PACKAGE_* build macros. */
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Brice Goglin <Brice.Goglin@inria.fr>");
MODULE_VERSION(PACKAGE_VERSION);
MODULE_DESCRIPTION(PACKAGE_NAME ": kernel-side Nemesis subsystem");
