From 0cc8bd8387f5987ff453b48605811ac535c3d8ab Mon Sep 17 00:00:00 2001
From: Jin Budelmann
Date: Mon, 24 Jan 2011 02:52:16 +0000
Subject: [PATCH 01/24] New branch for CUDA integration.

Tracker item 2180

git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@1907 61e36070-c886-4827-b7c0-ed9e4dc59cc8

From d17fe30576a2e2820929d7a182fccb3508cb1256 Mon Sep 17 00:00:00 2001
From: Jin Budelmann
Date: Mon, 24 Jan 2011 22:54:24 +0000
Subject: [PATCH 02/24] Initial commit of CUDA integration.

Makefiles are hacked together as I am pressed for time. I wouldn't
recommend anyone test this yet as makefiles have some hardcoded paths from
my computer and code is tweaked for my Tesla board.

Tracker item 2180

git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@1916 61e36070-c886-4827-b7c0-ed9e4dc59cc8
---
 ExampleMakefile                               |   2 +-
 Makefile                                      |  10 +-
 src/cuda_solver_routines.cu                   | 489 ++++++++++++++++++
 ...ines.c => external_dae_solver_routines.cu} |  16 +-
 src/external_dae_solver_routines.h            |  17 +-
 5 files changed, 524 insertions(+), 10 deletions(-)
 create mode 100644 src/cuda_solver_routines.cu
 rename src/{external_dae_solver_routines.c => external_dae_solver_routines.cu} (89%)

diff --git a/ExampleMakefile b/ExampleMakefile
index 8d914430..83aa6100 100644
--- a/ExampleMakefile
+++ b/ExampleMakefile
@@ -882,7 +882,7 @@ $(EXE_DIR) :
 	mkdir -p $@;

 $(EXECUTABLE) : $(OBJECTS) $(OPENCMISS_LIBRARY)
-	$(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES)
+	$(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES) -L/usr/local/cuda/lib64 -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -L/usr/local/cuda/lib64 -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -lcutil_x86_64 -lshrutil_x86_64

 $(OPENCMISS_LIBRARY) :
 	( cd $(GLOBAL_ROOT); $(MAKE) )

diff --git a/Makefile b/Makefile
index 0bc73738..d192cbe0 100644
--- a/Makefile
+++ b/Makefile
@@ -727,7 +727,7 @@ FPPFLAGS += $(EXTERNAL_INCLUDE_PATH)

 ELFLAGS += $(EXTERNAL_LIB_PATH)

-.SUFFIXES: .f90 .c
+.SUFFIXES: .f90 .c .cu

 $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.f90
 	( cd $(OBJECT_DIR) ; $(FC) -o $@ $(FFLAGS) $(FPPFLAGS) -c $< )

 $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.c
 	( cd $(OBJECT_DIR) ; $(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< )

+$(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.cu
+	( cd $(OBJECT_DIR) ; nvcc -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v -I$(SOURCE_DIR) -I/usr/local/cuda/include -I/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/inc -I/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//inc --use_fast_math -DUNIX -O2 -o $@ -c $< )
+
+
 ifeq ($(USEFIELDML),true)
 FIELDML_OBJECT = \
 	$(OBJECT_DIR)/fieldml_util_routines.o \
@@ -1403,8 +1407,8 @@ $(OBJECT_DIR)/equations_set_routines.o : $(SOURCE_DIR)/equations_set_routines.f9
 	$(OBJECT_DIR)/timer_f.o \
 	$(OBJECT_DIR)/types.o

-$(OBJECT_DIR)/external_dae_solver_routines.o : $(SOURCE_DIR)/external_dae_solver_routines.c \
-	$(SOURCE_DIR)/external_dae_solver_routines.h
+$(OBJECT_DIR)/external_dae_solver_routines.o : $(SOURCE_DIR)/external_dae_solver_routines.cu \
+	$(SOURCE_DIR)/external_dae_solver_routines.h \

 $(OBJECT_DIR)/field_routines.o : $(SOURCE_DIR)/field_routines.f90 \
 	$(OBJECT_DIR)/base_routines.o \

diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu
new file mode 100644
index 00000000..afe660b4
--- /dev/null
+++ b/src/cuda_solver_routines.cu
@@ -0,0 +1,489 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutil_inline.h>
+
+extern __shared__ double shared_array[];
+const int algebraicCount = 25;
+const int rateStateCount = 8;
+const int constantCount = 0;
+const double FLOPSPerFunction = 193.0f;
+const int DEFAULT_TESTING_THREADS = 2000000;
+const int sharedMemoryCellModel = 0;
+const char* cellModelName = "LR R3";
+
+////////////////////////////////////////////////////////////////////////////////
+// Cell Model Device Functions
+////////////////////////////////////////////////////////////////////////////////
+__device__ void computeRates(float VOI, double* DUMMY, double* STATES, double* ALGEBRAIC, double* RATES)
+{
+  ALGEBRAIC[0] = -25.5; // Add stimulus in proper
+  ALGEBRAIC[1] = (0.32f*STATES[0]+15.0816f)/(1.0f - (expf(- 0.1f*STATES[0]-4.713f))); // 7 ops
+
+  if (STATES[0] < -40.0f) {
+    ALGEBRAIC[2] = 0.135f*(expf(((80.0f+STATES[0])/- 6.8f))); // 4 ops
+    ALGEBRAIC[3] = (( - 127140*(expf(0.2444f*STATES[0])) - 3.47400e-05*(expf(-0.04391f*STATES[0])))*(STATES[0]+37.78f))/(1.0f+(expf(0.311f*STATES[0]+24.64053))); // 14 ops
+    ALGEBRAIC[9] = 3.56f*(expf(0.079f*STATES[0]))+ 310000*(expf(0.35f*STATES[0])); // 7 ops
+    ALGEBRAIC[10] = 0.1212f*(expf(-0.01052f*STATES[0]))/(1.0f+(expf(-0.1378f*STATES[0]-5.531292f))); // 8 ops
+  } else {
+    ALGEBRAIC[2] = 0.00000;
+    ALGEBRAIC[3] = 0.00000;
+    ALGEBRAIC[9] = 1.0f/( 0.13f*(1.0f+(expf(((STATES[0]+10.66f)/- 11.1f)))));
+    ALGEBRAIC[10] = ( 0.3f*(expf(-2.53500e-07*STATES[0])))/(1.0f+(expf(-0.1f*STATES[0]-3.2f)));
+  }
+  if (STATES[0] < -100.0f) {
+    ALGEBRAIC[16] = 2.837f*(expf(0.04f*STATES[0]+3.08f) - 1.0f)/((STATES[0]+77.0f)*(expf(0.04f*STATES[0]+1.4f))); // 11 ops
+  } else {
+    ALGEBRAIC[16] = 1.0f;
+  }
+
+  ALGEBRAIC[4] = (0.095f*(expf(-0.01f*STATES[0] + 0.5f)))/(1.0f+(expf(-0.072*STATES[0] + 0.36f))); // 9 ops
+  ALGEBRAIC[5] = (0.012f*(expf(-0.008f*STATES[0]-0.224f)))/(1.0f+(expf(0.15f*STATES[0]+4.2f))); // 9 ops
+  ALGEBRAIC[6] = (0.0005f*(expf(0.083f*STATES[0]+4.15f)))/(1.0f+(expf(0.057f*STATES[0]+2.85f))); // 9 ops
+  ALGEBRAIC[7] = 23*(powf(STATES[1], 3.0f))*STATES[2]*STATES[3]*(STATES[0] - 54.794463f); // 6 ops
+  ALGEBRAIC[8] = 0.08f*(expf(-STATES[0]/11.0000)); // 3 ops
+  ALGEBRAIC[11] = (0.07f*(expf(-0.017f*STATES[0]-0.748f)))/(1.0f+(expf(0.05f*STATES[0]+2.2f))); // 9 ops
+  ALGEBRAIC[12] = (0.0065f*(expf(-0.02f*STATES[0]-0.6f)))/(1.0f+(expf(-0.2f*STATES[0]-6.0f))); // 9 ops
+  ALGEBRAIC[13] = (0.0013f*(expf(-0.06f*STATES[0]-1.2f)))/(1.0f+(expf(-0.04f*STATES[0]-0.8f))); // 9 ops
+  ALGEBRAIC[14] = 7.7f - 13.0287f*logf(STATES[4]); // 3 ops
+  ALGEBRAIC[15] = 0.09f*STATES[5]*STATES[6]*(STATES[0] - ALGEBRAIC[14]); // 4 ops
+  ALGEBRAIC[17] = 0.282f*STATES[7]*ALGEBRAIC[16]*(STATES[0] + 77.56758f); // 4 ops
+  ALGEBRAIC[18] = 1.02f/(1.0f+(expf(0.2385f*STATES[0] + 6.83967915f))); // 4 ops
+  ALGEBRAIC[19] = (0.49124f*(expf( 0.08032f *STATES[0] + 7.49939f) + expf(0.06175f*STATES[0] - 31.271255925f)))/(1.00000+expf(-0.514300*STATES[0] - 214.85137268791f)); // 13 ops
+  ALGEBRAIC[20] = ALGEBRAIC[18]/(ALGEBRAIC[18]+ALGEBRAIC[19]); // 2 ops
+  ALGEBRAIC[21] = 0.6047f*ALGEBRAIC[20]*(STATES[0] + 87.89290f); // 3 ops
+  ALGEBRAIC[22] = 1.0f/(1.0f+(exp(((7.488f - STATES[0])/5.98f)))); // 5 ops
+  ALGEBRAIC[23] =
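+  // The "N ops" notes on each ALGEBRAIC assignment are hand-counted
+  // floating-point operation tallies; FLOPSPerFunction (193 for this
+  // "LR R3" Luo-Rudy cell model) is their sum, and it feeds the GFLOPS
+  // estimate that the timing code in solve() reports.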
0.0183f*ALGEBRAIC[22]*(STATES[0] + 87.89290f); // 3 ops + ALGEBRAIC[24] = 0.03921f*STATES[0] +2.3475027f; // 3 ops + + RATES[0] = -(ALGEBRAIC[0]+ALGEBRAIC[7]+ALGEBRAIC[15]+ALGEBRAIC[17]+ALGEBRAIC[21]+ALGEBRAIC[23]+ALGEBRAIC[24]); // 7 ops + RATES[1] = ALGEBRAIC[1]*(1.00000 - STATES[1]) - ALGEBRAIC[8]*STATES[1]; // 4 ops + RATES[2] = ALGEBRAIC[2]*(1.00000 - STATES[2]) - ALGEBRAIC[9]*STATES[2]; // 4 ops + RATES[3] = ALGEBRAIC[3]*(1.00000 - STATES[3]) - ALGEBRAIC[10]*STATES[3]; // 4 ops + RATES[4] = - 0.0001f*ALGEBRAIC[15]+ 0.000007f - 0.07f*STATES[4]; // 4 ops + RATES[5] = ALGEBRAIC[4]*(1.00000 - STATES[5]) - ALGEBRAIC[11]*STATES[5]; // 4 ops + RATES[6] = ALGEBRAIC[5]*(1.00000 - STATES[6]) - ALGEBRAIC[12]*STATES[6]; // 4 ops + RATES[7] = ALGEBRAIC[6]*(1.00000 - STATES[7]) - ALGEBRAIC[13]*STATES[7]; // 4 ops +} + +//////////////////////////////////////////////////////////////////////////////// +// Cell Model Host Functions ////////// Should Not Be Needed Later ///////////// +//////////////////////////////////////////////////////////////////////////////// +void initProblem(int num_threads, double* STATES) +{ + int i; + + STATES[0] = -84.3801107371; + STATES[1] = 0.00171338077730188; + STATES[2] = 0.982660523699656; + STATES[3] = 0.989108212766685; + STATES[4] = 0.00017948816388306; + STATES[5] = 0.00302126301779861; + STATES[6] = 0.999967936476325; + STATES[7] = 0.0417603108167287; + + for (i=1; i 0 + double threadConstants[constantCount]; + + intitialiseConstants(threadConstants); +#endif + + + for (i=0; i 0 + integrator(timeSteps, stepSize, threadConstants, threadStates + threadIdx.x*rateStateCount, threadAlgebraic); +#else + integrator(timeSteps, stepSize, NULL, threadStates + threadIdx.x*rateStateCount, threadAlgebraic); +#endif + + for (i=0; i freeGPUMemory && count < 200 ) { + num_partitions++; + count++; + if (num_threads%(num_partitions*num_streams)==0) { + pagedMemorySize = sizeof(double)*rateStateCount*num_threads/num_partitions; + } + } + + // Adjust threads per blocks so that states variables fit in shared memory + count=0; + sharedMem = (size_t) (sharedMemoryIntegrator + sharedMemoryDevice + sharedMemoryCellModel) * threads_per_block * sizeof(double); + while ( sharedMem > deviceProp.sharedMemPerBlock*0.95 && count < 200 ) { + if (threads_per_block > 32 ) { + threads_per_block-=32; + } else { + fprintf(stderr, "Cannot fit variables in shared memory"); + exit(EXIT_FAILURE); + } + count++; + sharedMem = (size_t) (sharedMemoryIntegrator + sharedMemoryDevice + sharedMemoryCellModel) * threads_per_block * sizeof(double); + } + num_blocks=num_threads/threads_per_block; + if (num_threads % threads_per_block !=0) num_blocks++; // Round up calculation + +//#ifdef DEBUG + if (timing_file) { + printf("> Device name : %s\n", deviceProp.name ); + printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n", + deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); + printf("> Cell Model = %s, Integrator = %s\n", cellModelName, integratorName); + printf("> num_threads = %d, num_blocks = %d, threads_per_block = %d, num_partitions = %d, timeSteps = %d, num_streams = %d\n", + num_threads, num_blocks, threads_per_block, num_partitions, timeSteps, num_streams); + } +//#endif + + // Allocate CUDA device and host pinned memory + cutilSafeCall( cudaMallocHost((void**) &h_paged_states, pagedMemorySize) ); + cutilSafeCall( cudaMalloc((void **) &d_states, pagedMemorySize) ); + + // Create streams + streams = (cudaStream_t*) malloc(num_streams * sizeof(cudaStream_t)); + for(i = 0; i < 
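+ // One stream per slice of the pinned buffer: cudaMemcpyAsync only overlaps
+ // with kernel execution when the host memory is page-locked (allocated above
+ // with cudaMallocHost), which is what lets the partition loop below hide
+ // host/device transfer time behind computation.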
num_streams; i++) + cutilSafeCall( cudaStreamCreate(&(streams[i])) ); + + // Setup execution parameters + dim3 grid(num_blocks/num_streams/num_partitions, 1, 1); + printf("blox %d streams %d parts %d\n", num_blocks, num_streams, num_partitions); + dim3 threads(threads_per_block, 1, 1); + + if (timing_file) { + // Setup and start global timer + timer = 0; + cutCreateTimer(&timer); + cutilCheckError(cutStartTimer(timer)); + + // Test kernel speed in default stream (timing is more accurate in default stream) + memcpy(h_paged_states, h_states, pagedMemorySize/num_streams); + cutilSafeCall( cudaMemcpy(d_states, h_paged_states, pagedMemorySize/num_streams, + cudaMemcpyHostToDevice) ); + // Start kernel timer + kernel_timer = 0; + cutCreateTimer(&kernel_timer); + cutilCheckError(cutStartTimer(kernel_timer)); + // Start kernel + printf("grid.x %d threads.x %d mem %d\n", grid.x, threads.x, sharedMem); + solveSystem<<>>(timeSteps, stepSize, d_states); + checkCUDAError("Single Kernel Execution"); + cutilSafeCall( cudaThreadSynchronize() ); + // Stop kernel Timer + cutilCheckError(cutStopTimer(kernel_timer)); + cutilSafeCall( cudaMemcpy(h_paged_states, d_states, pagedMemorySize/num_streams, + cudaMemcpyDeviceToHost) ); + memcpy(h_states, h_paged_states, pagedMemorySize/num_streams); + + // Prefetch data for next partition in first stream + if (num_partitions>1) { + global_offset = rateStateCount * num_streams * grid.x * threads.x; + memcpy(h_paged_states, h_states + global_offset, pagedMemorySize/num_streams); + } + } else { + memcpy(h_paged_states, h_states, pagedMemorySize); + } + + // Queue kernel calls into streams to hide memory transfers (num_partitions sets of kernel calls in each stream) + for(i = 0; i < num_partitions; i++) { + // Asynchronously launch num_streams memcopies + for(j = 0; j < num_streams; j++) { + if (timing_file==NULL || !(i==0 && j==0)) { + local_offset = j * rateStateCount * grid.x * threads.x ; + cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset, + (size_t)pagedMemorySize/num_streams, + cudaMemcpyHostToDevice, streams[j]) ); + } + } + // Execute the kernel + // Asynchronously launch num_streams kernels, each operating on its own portion of data + for(j = 0; j < num_streams; j++) { + if (timing_file==NULL || !(i==0 && j==0)) { + local_offset = j * grid.x * threads.x ; + solveSystem<<>>(timeSteps, stepSize, + d_states + rateStateCount*local_offset); + } + } + + // Asynchronoously launch num_streams memcopies + for(j = 0; j < num_streams; j++){ + if (timing_file==NULL || !(i==0 && j==0)) { + local_offset = j * rateStateCount * grid.x * threads.x ; + cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset, + pagedMemorySize/num_streams, + cudaMemcpyDeviceToHost, streams[j]) ); + } + } + + // Execute memcpys in and out of paged memory when CUDA calls in the streams have finished + for(j = 0; j < num_streams; j++){ + if (timing_file==NULL || !(i==0 && j==0)) { + cudaStreamSynchronize(streams[j]); + + local_offset = j * rateStateCount * grid.x * threads.x ; + global_offset = i * num_streams * grid.x * threads.x; + + memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset, + pagedMemorySize/num_streams); + + if (i < num_partitions - 1) { + global_offset = (i + 1) * num_streams * grid.x * threads.x; + memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset, + pagedMemorySize/num_streams); + } + } + } + } + + if (timing_file) { + // Stop 
global timer + cutilCheckError(cutStopTimer(timer)); + + // Calculate timing statistics + dSeconds = cutGetTimerValue(timer)/1000.0; + nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads; + gflops = 1.0e-9 * nFLOPS/dSeconds; + + kernel_dSeconds = cutGetTimerValue(kernel_timer)/1000.0; + kernel_nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads/num_streams/num_partitions; + kernel_gflops = 1.0e-9 * kernel_nFLOPS/kernel_dSeconds; + + // Store Stats + fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks, + threads_per_block, num_partitions, num_streams, dSeconds, gflops, kernel_dSeconds, + kernel_gflops, gflops/kernel_gflops*100); + } + + // Deallocate Memory and Release Threads + for(i = 0; i < num_streams; i++) + cutilSafeCall( cudaStreamDestroy(streams[i]) ); + cutilSafeCall( cudaFree(d_states) ); + cutilSafeCall( cudaFreeHost(h_paged_states) ); + cudaThreadExit(); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Auxilliary Testing Functions +//////////////////////////////////////////////////////////////////////////////// + +void solveProblem(unsigned int timeSteps, unsigned int num_threads, unsigned int threads_per_block, + unsigned int num_partitions, unsigned int num_streams, FILE *timing_file, FILE *results_file) +{ + unsigned int i, j; + float startTime = 0.0f; + float endTime = 0.2f; + + double* h_states = NULL; + + h_states = (double *) safeMalloc(sizeof(double)*rateStateCount*num_threads); + + initProblem(num_threads, h_states); + + solve(h_states, startTime, endTime, timeSteps, num_threads, threads_per_block, num_partitions, num_streams, timing_file); + + if (results_file) { + fprintf(results_file,"\n\n"); + + for (i=0; i #include "external_dae_solver_routines.h" +#include "cuda_solver_routines.cu" /* Type definitions */ /* Function Definitions */ - +extern "C" +{ void SolverDAEExternalIntegrate(const int NumberOfDofs, const double StartTime, const double EndTime, @@ -81,8 +83,12 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, int *err) { - printf("Hello World!\n"); - -} + unsigned int timesteps = 10; + + printf("start %f end %f int step %f dofs %d", StartTime, EndTime, InitialStep[0], NumberOfDofs); + solve(StateData, StartTime, EndTime, timesteps, NumberOfDofs, 1024, 10, 2, NULL); + +} +} diff --git a/src/external_dae_solver_routines.h b/src/external_dae_solver_routines.h index fa00d6c5..7f8682a3 100644 --- a/src/external_dae_solver_routines.h +++ b/src/external_dae_solver_routines.h @@ -42,4 +42,19 @@ * the terms of any one of the MPL, the GPL or the LGPL. * */ - +extern "C" +{ +void SolverDAEExternalIntegrate(const int NumberOfDofs, + const double StartTime, + const double EndTime, + double *InitialStep, + const int OnlyOneModelIndex, + int *ModelsData, + int NumberOfState, + double *StateData, + int NumberOfParameters, + double *ParametersData, + int NumberOfIntermediate, + double *IntermediateData, + int *err); +} From 5d014f354cdddcd36c826e5016dd11e454ef5ebf Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Mon, 24 Jan 2011 23:33:12 +0000 Subject: [PATCH 03/24] Updated cellml to opencmiss field mapping calls. 
Tracker item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@1918 61e36070-c886-4827-b7c0-ed9e4dc59cc8 From 7557b3f573aebe9b8884f82bcc34365eb96a4887 Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Tue, 1 Feb 2011 22:02:33 +0000 Subject: [PATCH 04/24] CMGUI heart mesh files added. Tracker item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@1988 61e36070-c886-4827-b7c0-ed9e4dc59cc8 From 5707beb20dd51dd81a03e9c3dbb8aca98569f647 Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Thu, 3 Feb 2011 01:33:59 +0000 Subject: [PATCH 05/24] Implemented VTK reading in MonodomainCUDA example. Getting Segment violations when accesssing converted C pointers. No Tracker Item git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2001 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- Makefile | 11 +++++++--- src/field_IO_routines.f90 | 16 +++++++------- src/opencmiss.f90 | 45 ++++++++++++++++++++++++++++++++++++--- src/vtk_import_routines.c | 20 ++++++++--------- 4 files changed, 68 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index f07f6f16..39e29084 100644 --- a/Makefile +++ b/Makefile @@ -855,6 +855,7 @@ OBJECTS = $(OBJECT_DIR)/advection_diffusion_equation_routines.o \ $(OBJECT_DIR)/trees.o \ $(OBJECT_DIR)/types.o \ $(OBJECT_DIR)/util_array.o \ + $(OBJECT_DIR)/vtk_import_routines.o \ $(FIELDML_OBJECT) ifeq ($(OPERATING_SYSTEM),linux)# Linux @@ -1416,8 +1417,8 @@ $(OBJECT_DIR)/equations_set_routines.o : $(SOURCE_DIR)/equations_set_routines.f9 $(OBJECT_DIR)/timer_f.o \ $(OBJECT_DIR)/types.o -$(OBJECT_DIR)/external_dae_solver_routines.o : $(SOURCE_DIR)/external_dae_solver_routines.c \ - $(SOURCE_DIR)/external_dae_solver_routines.h \ +$(OBJECT_DIR)/external_dae_solver_routines.o : $(SOURCE_DIR)/external_dae_solver_routines.c \ + $(SOURCE_DIR)/external_dae_solver_routines.h $(OBJECT_DIR)/field_routines.o : $(SOURCE_DIR)/field_routines.f90 \ $(OBJECT_DIR)/base_routines.o \ @@ -1455,7 +1456,8 @@ $(OBJECT_DIR)/field_IO_routines.o : $(SOURCE_DIR)/field_IO_routines.f90 \ $(OBJECT_DIR)/node_routines.o \ $(OBJECT_DIR)/region_routines.o \ $(OBJECT_DIR)/strings.o \ - $(OBJECT_DIR)/types.o + $(OBJECT_DIR)/types.o \ + $(OBJECT_DIR)/vtk_import_routines.o $(OBJECT_DIR)/finite_elasticity_routines.o : $(SOURCE_DIR)/finite_elasticity_routines.f90 \ $(OBJECT_DIR)/base_routines.o \ @@ -2126,6 +2128,9 @@ $(OBJECT_DIR)/util_array.o : $(SOURCE_DIR)/util_array.f90 \ $(OBJECT_DIR)/base_routines.o \ $(OBJECT_DIR)/types.o +$(OBJECT_DIR)/vtk_import_routines.o : $(SOURCE_DIR)/vtk_import_routines.c \ + $(SOURCE_DIR)/vtk_import_routines.h + # ---------------------------------------------------------------------------- # # clean and clobber for removing objects and executable. 
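The segmentation violations mentioned in the patch 05 message are the usual symptom of handing an allocated array back to Fortran through a plain double* argument: the malloc result assigned to readVTK's local points parameter is lost when the function returns, so the Fortran TYPE(C_PTR) never receives a valid address. Patch 06 below fixes the Fortran declarations; the C side correspondingly needs a pointer-to-pointer. A minimal sketch of the convention, using a hypothetical readPoints helper rather than the real readVTK signature:

#include <stdlib.h>

/* Output-pointer convention for a Fortran TYPE(C_PTR), INTENT(OUT) argument:
   allocate in C, write the address through the double**, and let the Fortran
   caller wrap it with C_F_POINTER once the element count is known. */
void readPoints(const char *filePath, double **points, int *numPoints)
{
    *numPoints = 8; /* stands in for the count parsed from the VTK header */
    *points = (double *) malloc(sizeof(double) * (*numPoints) * 3);
}

On the Fortran side the returned address then becomes an array pointer with CALL C_F_POINTER(points, pointArray, [numPoints*3]).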
diff --git a/src/field_IO_routines.f90 b/src/field_IO_routines.f90 index a08b6a73..6c93bc45 100644 --- a/src/field_IO_routines.f90 +++ b/src/field_IO_routines.f90 @@ -353,16 +353,16 @@ FUNCTION FieldExport_DerivativeIndices( handle, componentNumber, fieldType, vari INTEGER(C_INT) :: FieldExport_DerivativeIndices END FUNCTION FieldExport_DerivativeIndices - SUBROUTINE ReadVTK(filePath, points, numPoints, cells, numCells) & + SUBROUTINE READ_VTK(filePath, points, numPoints, cells, numCells) & & BIND(C, NAME="readVTK") USE TYPES USE ISO_C_BINDING - CHARACTER(C_CHAR), INTENT(IN) :: filePath(*) - REAL(C_DOUBLE), INTENT(OUT) :: points(*) - INTEGER(C_INT), INTENT(OUT) :: numPoints(*) - INTEGER(C_INT), INTENT(OUT) :: cells(*) - INTEGER(C_INT), INTENT(OUT) :: numCells(*) - END SUBROUTINE ReadVTK + CHARACTER(LEN=1,KIND=C_CHAR), INTENT(IN) :: filePath(*) + TYPE(C_PTR), INTENT(OUT):: points(*) + INTEGER(C_INT), INTENT(OUT) :: numPoints + TYPE(C_PTR), INTENT(OUT) :: cells(*) + INTEGER(C_INT), INTENT(OUT) :: numCells + END SUBROUTINE READ_VTK END INTERFACE @@ -391,7 +391,7 @@ END SUBROUTINE ReadVTK MODULE PROCEDURE CHECKED_DEALLOCATE_BASIS END INTERFACE !CHECKED_DEALLOCATE - PUBLIC :: FIELD_IO_FIELDS_IMPORT, FIELD_IO_NODES_EXPORT, FIELD_IO_ELEMENTS_EXPORT + PUBLIC :: FIELD_IO_FIELDS_IMPORT, FIELD_IO_NODES_EXPORT, FIELD_IO_ELEMENTS_EXPORT, READ_VTK CONTAINS diff --git a/src/opencmiss.f90 b/src/opencmiss.f90 index 9b4d3b3a..6d69081f 100644 --- a/src/opencmiss.f90 +++ b/src/opencmiss.f90 @@ -3406,7 +3406,7 @@ MODULE OPENCMISS MODULE PROCEDURE CMISSFieldIONodesExportVSVSObj END INTERFACE !CMISSFieldIONodesExport - PUBLIC CMISSFieldIOElementsExport,CMISSFieldIONodesExport,CMISSFieldIOFieldsImport + PUBLIC CMISSFieldIOElementsExport,CMISSFieldIONodesExport,CMISSFieldIOFieldsImport, CMISSReadVTK !!================================================================================================================================== !! @@ -29086,7 +29086,7 @@ SUBROUTINE CMISSFieldIONodesExportVSVSObj(Fields,FileName,Method,Err) END SUBROUTINE CMISSFieldIONodesExportVSVSObj - ! + ! !================================================================================================================================ ! @@ -29127,9 +29127,48 @@ SUBROUTINE CMISSFieldIOFieldsImport(FileName, Method, Region, Mesh, MeshUserNumb CALL EXITS("CMISSFieldIOFieldsImport") CALL CMISS_HANDLE_ERROR(Err,ERROR) RETURN - END SUBROUTINE CMISSFieldIOFieldsImport + ! + !================================================================================================================================ + ! 
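+ !The points and cells arguments come back as raw C pointers allocated by
+ !the C reader; the caller converts them with C_F_POINTER once numPoints
+ !and numCells are known.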
+ + !> Import mesh from vtk file + SUBROUTINE CMISSReadVTK(filePath, points, numPoints, cells, numCells, Err) + !Argument variables + USE ISO_C_BINDING + CHARACTER(LEN=*), INTENT(IN) :: filePath + TYPE(C_PTR), INTENT(OUT) :: points(*) + INTEGER(C_INT), INTENT(OUT) :: numPoints + TYPE(C_PTR), INTENT(OUT) :: cells(*) + INTEGER(C_INT), INTENT(OUT) :: numCells + INTEGER(INTG), INTENT(OUT) :: Err !LEN_TRIM(filePath)) THEN + LENGTH=LEN_TRIM(filePath) + ELSE + LENGTH=SIZE(Cstring,1)-1 + ENDIF + DO i=1,LENGTH + Cstring(i)=filePath(i:i) + ENDDO !i + !Null terminate the string + Cstring(LENGTH+1)=C_NULL_CHAR + + CALL READ_VTK(Cstring, points, numPoints, cells, numCells) + + CALL EXITS("CMISSReadVTK") + RETURN +999 CALL ERRORS("CMISSReadVTK",Err,ERROR) + CALL EXITS("CMISSReadVTK") + CALL CMISS_HANDLE_ERROR(Err,ERROR) + RETURN + END SUBROUTINE CMISSReadVTK !!================================================================================================================================== !! diff --git a/src/vtk_import_routines.c b/src/vtk_import_routines.c index c454c006..c0be415e 100644 --- a/src/vtk_import_routines.c +++ b/src/vtk_import_routines.c @@ -34,37 +34,37 @@ void readVTK(const char* filePath, double* points, int* numPoints, int* cells, i fgets(buffer, 100, file); fgets(buffer, 100, file); - FSCANF(file, "%*s %d %*s\n", &numPoints); + FSCANF(file, "%*s %d %*s\n", numPoints); - printf("Number of Points = %d\n", numPoints); + printf("Number of Points = %d\n", (*numPoints)); - points = (double *) malloc(sizeof(double)*numPoints*3); + points = (double *) malloc(sizeof(double)*(*numPoints)*3); - while (count Date: Thu, 3 Feb 2011 02:21:46 +0000 Subject: [PATCH 06/24] Bug fixes. git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2002 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- src/field_IO_routines.f90 | 4 ++-- src/opencmiss.f90 | 4 ++-- src/vtk_import_routines.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/field_IO_routines.f90 b/src/field_IO_routines.f90 index 6c93bc45..f603c763 100644 --- a/src/field_IO_routines.f90 +++ b/src/field_IO_routines.f90 @@ -358,9 +358,9 @@ SUBROUTINE READ_VTK(filePath, points, numPoints, cells, numCells) & USE TYPES USE ISO_C_BINDING CHARACTER(LEN=1,KIND=C_CHAR), INTENT(IN) :: filePath(*) - TYPE(C_PTR), INTENT(OUT):: points(*) + TYPE(C_PTR), INTENT(OUT):: points INTEGER(C_INT), INTENT(OUT) :: numPoints - TYPE(C_PTR), INTENT(OUT) :: cells(*) + TYPE(C_PTR), INTENT(OUT) :: cells INTEGER(C_INT), INTENT(OUT) :: numCells END SUBROUTINE READ_VTK diff --git a/src/opencmiss.f90 b/src/opencmiss.f90 index 6d69081f..3ef43a10 100644 --- a/src/opencmiss.f90 +++ b/src/opencmiss.f90 @@ -29137,9 +29137,9 @@ SUBROUTINE CMISSReadVTK(filePath, points, numPoints, cells, numCells, Err) !Argument variables USE ISO_C_BINDING CHARACTER(LEN=*), INTENT(IN) :: filePath - TYPE(C_PTR), INTENT(OUT) :: points(*) + TYPE(C_PTR), INTENT(OUT) :: points INTEGER(C_INT), INTENT(OUT) :: numPoints - TYPE(C_PTR), INTENT(OUT) :: cells(*) + TYPE(C_PTR), INTENT(OUT) :: cells INTEGER(C_INT), INTENT(OUT) :: numCells INTEGER(INTG), INTENT(OUT) :: Err ! Date: Thu, 3 Feb 2011 03:20:48 +0000 Subject: [PATCH 07/24] VTK importing properly into example. 
No Tracker Item git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2003 61e36070-c886-4827-b7c0-ed9e4dc59cc8 From 61394ccd7f75bcc442f3b59a46e188b7fe2efa6b Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Mon, 7 Feb 2011 02:30:06 +0000 Subject: [PATCH 08/24] Changes made to Monodomain files. Tracker item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2008 61e36070-c886-4827-b7c0-ed9e4dc59cc8 From e1d55156e4a8c8ed3c164a95ae6ff26f9ab47448 Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Wed, 9 Feb 2011 01:26:14 +0000 Subject: [PATCH 09/24] Updated CUDA example, changed to fitzhugh nagumo for simplicity. See Tracker Item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2021 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- ExampleMakefile | 3 +- Makefile | 9 +- src/Darcy_equations_routines.f90 | 82 ++++----- src/Poisson_equations_routines.f90 | 62 ++++--- src/advection_diffusion_equation_routines.f90 | 119 +++++++++++- src/cmiss_c.c | 18 +- src/control_loop_routines.f90 | 58 +++--- src/cuda_solver_routines.cu | 169 ++++++++++-------- src/external_dae_solver_routines.c | 94 ---------- src/external_dae_solver_routines.cu | 6 +- src/external_dae_solver_routines.h | 6 +- src/field_IO_routines.f90 | 70 +++++--- src/finite_elasticity_routines.f90 | 159 ++++++++-------- src/fluid_mechanics_IO_routines.f90 | 4 + src/solver_routines.f90 | 1 + 15 files changed, 461 insertions(+), 399 deletions(-) delete mode 100644 src/external_dae_solver_routines.c diff --git a/ExampleMakefile b/ExampleMakefile index 41e0ea90..a67943b2 100644 --- a/ExampleMakefile +++ b/ExampleMakefile @@ -887,8 +887,7 @@ $(EXE_DIR) : mkdir -p $@; $(EXECUTABLE) : $(OBJECTS) $(OPENCMISS_LIBRARY) - $(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES) -#-L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -lcutil_x86_64 -lshrutil_x86_64 + $(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES) -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -lcutil_x86_64 -lshrutil_x86_64 $(OPENCMISS_LIBRARY) : ( cd $(GLOBAL_ROOT); $(MAKE) ) diff --git a/Makefile b/Makefile index 39e29084..457353ce 100644 --- a/Makefile +++ b/Makefile @@ -732,8 +732,7 @@ FPPFLAGS += $(EXTERNAL_INCLUDE_PATH) ELFLAGS += $(EXTERNAL_LIB_PATH) -.SUFFIXES: .f90 .c -#.cu +.SUFFIXES: .f90 .c .cu $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.f90 ( cd $(OBJECT_DIR) ; $(FC) -o $@ $(FFLAGS) $(FPPFLAGS) -c $< ) @@ -741,8 +740,8 @@ $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.f90 $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.c ( cd $(OBJECT_DIR) ; $(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< ) -#(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.cu -# ( cd $(OBJECT_DIR) ; nvcc -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v 
-I$(SOURCE_DIR) -I/usr/local/cuda/include -I/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/inc -I/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//inc --use_fast_math -DUNIX -O2 -o $@ -c $< ) +$(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.cu + ( cd $(OBJECT_DIR) ; nvcc -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v -I$(SOURCE_DIR) -I/usr/local/cuda/include -I/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/inc -I/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//inc --use_fast_math -DUNIX -O2 -o $@ -c $< ) ifeq ($(USEFIELDML),true) @@ -1417,7 +1416,7 @@ $(OBJECT_DIR)/equations_set_routines.o : $(SOURCE_DIR)/equations_set_routines.f9 $(OBJECT_DIR)/timer_f.o \ $(OBJECT_DIR)/types.o -$(OBJECT_DIR)/external_dae_solver_routines.o : $(SOURCE_DIR)/external_dae_solver_routines.c \ +$(OBJECT_DIR)/external_dae_solver_routines.o : $(SOURCE_DIR)/external_dae_solver_routines.cu \ $(SOURCE_DIR)/external_dae_solver_routines.h $(OBJECT_DIR)/field_routines.o : $(SOURCE_DIR)/field_routines.f90 \ diff --git a/src/Darcy_equations_routines.f90 b/src/Darcy_equations_routines.f90 index e5174dee..e2f939e8 100644 --- a/src/Darcy_equations_routines.f90 +++ b/src/Darcy_equations_routines.f90 @@ -3041,7 +3041,7 @@ SUBROUTINE DARCY_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,ELEMENT_NUMBER, IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_INCOMPRESSIBLE_ELASTICITY_DRIVEN_DARCY_SUBTYPE .OR. & & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_INCOMPRESSIBLE_ELAST_MULTI_COMP_DARCY_SUBTYPE) THEN - C_PARAM=EQUATIONS%INTERPOLATION%SOURCE_INTERP_POINT(FIELD_U_VARIABLE_TYPE)%PTR%VALUES(4, NO_PART_DERIV) + C_PARAM=EQUATIONS%INTERPOLATION%SOURCE_INTERP_POINT(FIELD_U_VARIABLE_TYPE)%PTR%VALUES(mh, NO_PART_DERIV) !IF(ABS(C_PARAM)>1.0E-08) WRITE(*,*)'C_PARAM = ',C_PARAM @@ -5375,26 +5375,27 @@ SUBROUTINE DARCY_EQUATION_POST_SOLVE_OUTPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERROR,* !Subiteration intermediate solutions / iterates output: - IF(CONTROL_LOOP%PARENT_LOOP%LOOP_TYPE==PROBLEM_CONTROL_WHILE_LOOP_TYPE) THEN !subiteration exists - IF(CURRENT_LOOP_ITERATION<10) THEN - IF(SUBITERATION_NUMBER<10) THEN - WRITE(OUTPUT_FILE,'("T_00",I0,"_SB_0",I0,"_C",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER, & - & equations_set_idx - ELSE IF(SUBITERATION_NUMBER<100) THEN - WRITE(OUTPUT_FILE,'("T_00",I0,"_SB_",I0,"_C",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER, & - & equations_set_idx - END IF - FILE=OUTPUT_FILE - METHOD="FORTRAN" - EXPORT_FIELD=.TRUE. - IF(EXPORT_FIELD) THEN - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Darcy export subiterates ...",ERR,ERROR,*999) - CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & - & ERR,ERROR,*999) - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) - ENDIF - ENDIF - ENDIF +! IF(CONTROL_LOOP%PARENT_LOOP%LOOP_TYPE==PROBLEM_CONTROL_WHILE_LOOP_TYPE) THEN !subiteration exists +! IF(CURRENT_LOOP_ITERATION<10) THEN +! IF(SUBITERATION_NUMBER<10) THEN +! WRITE(OUTPUT_FILE,'("T_00",I0,"_SB_0",I0,"_C",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER, & +! & equations_set_idx +! ELSE IF(SUBITERATION_NUMBER<100) THEN +! WRITE(OUTPUT_FILE,'("T_00",I0,"_SB_",I0,"_C",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER, & +! & equations_set_idx +! END IF +! FILE=OUTPUT_FILE +! METHOD="FORTRAN" +! EXPORT_FIELD=.TRUE. +! IF(EXPORT_FIELD) THEN +! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Darcy export subiterates ...",ERR,ERROR,*999) +! 
CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & +! & ERR,ERROR,*999) +! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) +! ENDIF +! ENDIF +! ENDIF + ELSE !for single compartment (i.e. standary Darcy flow) equations sets CONTROL_TIME_LOOP=>CONTROL_LOOP DO loop_idx=1,CONTROL_LOOP%CONTROL_LOOP_LEVEL @@ -5445,25 +5446,26 @@ SUBROUTINE DARCY_EQUATION_POST_SOLVE_OUTPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERROR,* ENDIF - !Subiteration intermediate solutions / iterates output: - IF(CONTROL_LOOP%PARENT_LOOP%LOOP_TYPE==PROBLEM_CONTROL_WHILE_LOOP_TYPE) THEN !subiteration exists - IF(CURRENT_LOOP_ITERATION<10) THEN - IF(SUBITERATION_NUMBER<10) THEN - WRITE(OUTPUT_FILE,'("T_00",I0,"_SUB_000",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER - ELSE IF(SUBITERATION_NUMBER<100) THEN - WRITE(OUTPUT_FILE,'("T_00",I0,"_SUB_00",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER - END IF - FILE=OUTPUT_FILE - METHOD="FORTRAN" - EXPORT_FIELD=.TRUE. - IF(EXPORT_FIELD) THEN - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Darcy export subiterates ...",ERR,ERROR,*999) - CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & - & ERR,ERROR,*999) - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) - ENDIF - ENDIF - ENDIF +! !Subiteration intermediate solutions / iterates output: +! IF(CONTROL_LOOP%PARENT_LOOP%LOOP_TYPE==PROBLEM_CONTROL_WHILE_LOOP_TYPE) THEN !subiteration exists +! IF(CURRENT_LOOP_ITERATION<10) THEN +! IF(SUBITERATION_NUMBER<10) THEN +! WRITE(OUTPUT_FILE,'("T_00",I0,"_SUB_000",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER +! ELSE IF(SUBITERATION_NUMBER<100) THEN +! WRITE(OUTPUT_FILE,'("T_00",I0,"_SUB_00",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER +! END IF +! FILE=OUTPUT_FILE +! METHOD="FORTRAN" +! EXPORT_FIELD=.TRUE. +! IF(EXPORT_FIELD) THEN +! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Darcy export subiterates ...",ERR,ERROR,*999) +! CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & +! & ERR,ERROR,*999) +! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) +! ENDIF +! ENDIF +! 
ENDIF + ENDIF ENDIF ENDDO diff --git a/src/Poisson_equations_routines.f90 b/src/Poisson_equations_routines.f90 index 55fef7ce..71aa1e30 100644 --- a/src/Poisson_equations_routines.f90 +++ b/src/Poisson_equations_routines.f90 @@ -1165,7 +1165,7 @@ SUBROUTINE POISSON_EQUATION_EQUATIONS_SET_SETUP(EQUATIONS_SET,EQUATIONS_SET_SETU CASE(EQUATIONS_SET_LINEAR_SOURCE_POISSON_SUBTYPE) CALL POISSON_EQUATION_EQUATIONS_SET_LINEAR_SOURCE_SETUP(EQUATIONS_SET,EQUATIONS_SET_SETUP,ERR,ERROR,*999) CASE(EQUATIONS_SET_LINEAR_PRESSURE_POISSON_SUBTYPE,EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE, & - & EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE) + & EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE,EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE) CALL POISSON_EQUATION_EQUATIONS_SET_PRESSURE_POISSON_SETUP(EQUATIONS_SET,EQUATIONS_SET_SETUP,ERR,ERROR,*999) CASE(EQUATIONS_SET_QUADRATIC_SOURCE_POISSON_SUBTYPE) CALL POISSON_EQUATION_EQUATIONS_SET_NONLINEAR_SOURCE_SETUP(EQUATIONS_SET,EQUATIONS_SET_SETUP,ERR,ERROR,*999) @@ -1225,7 +1225,8 @@ SUBROUTINE POISSON_EQUATION_EQUATIONS_SET_SOLUTION_METHOD_SET(EQUATIONS_SET,SOLU CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) END SELECT CASE(EQUATIONS_SET_LINEAR_SOURCE_POISSON_SUBTYPE,EQUATIONS_SET_LINEAR_PRESSURE_POISSON_SUBTYPE, & - & EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE,EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE) + & EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE,EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE, & + & EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE) SELECT CASE(SOLUTION_METHOD) CASE(EQUATIONS_SET_FEM_SOLUTION_METHOD) EQUATIONS_SET%SOLUTION_METHOD=EQUATIONS_SET_FEM_SOLUTION_METHOD @@ -1330,6 +1331,10 @@ SUBROUTINE POISSON_EQUATION_EQUATIONS_SET_SUBTYPE_SET(EQUATIONS_SET,EQUATIONS_SE EQUATIONS_SET%CLASS=EQUATIONS_SET_CLASSICAL_FIELD_CLASS EQUATIONS_SET%TYPE=EQUATIONS_SET_POISSON_EQUATION_TYPE EQUATIONS_SET%SUBTYPE=EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE + CASE(EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE) + EQUATIONS_SET%CLASS=EQUATIONS_SET_CLASSICAL_FIELD_CLASS + EQUATIONS_SET%TYPE=EQUATIONS_SET_POISSON_EQUATION_TYPE + EQUATIONS_SET%SUBTYPE=EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE CASE(EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE) EQUATIONS_SET%CLASS=EQUATIONS_SET_CLASSICAL_FIELD_CLASS EQUATIONS_SET%TYPE=EQUATIONS_SET_POISSON_EQUATION_TYPE @@ -1395,6 +1400,7 @@ SUBROUTINE POISSON_EQUATION_EQUATIONS_SET_PRESSURE_POISSON_SETUP(EQUATIONS_SET,E IF(ASSOCIATED(EQUATIONS_SET)) THEN IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_LINEAR_PRESSURE_POISSON_SUBTYPE.OR. & & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE.OR. & + & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE.OR. 
& & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE) THEN SELECT CASE(EQUATIONS_SET_SETUP%SETUP_TYPE) CASE(EQUATIONS_SET_SETUP_INITIAL_TYPE) @@ -1649,7 +1655,7 @@ SUBROUTINE POISSON_EQUATION_EQUATIONS_SET_PRESSURE_POISSON_SETUP(EQUATIONS_SET,E CASE(EQUATIONS_SET_SETUP_SOURCE_TYPE) SELECT CASE(EQUATIONS_SET%SUBTYPE) CASE(EQUATIONS_SET_LINEAR_PRESSURE_POISSON_SUBTYPE,EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE, & - & EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE) + & EQUATIONS_SET_FITTED_PRESSURE_POISSON_SUBTYPE,EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE) SELECT CASE(EQUATIONS_SET_SETUP%ACTION_TYPE) !Set start action CASE(EQUATIONS_SET_SETUP_START_ACTION) @@ -2107,19 +2113,19 @@ SUBROUTINE POISSON_PRE_SOLVE_UPDATE_PPE_MESH(CONTROL_LOOP,SOLVER,ERR,ERROR,*) CALL ENTERS("POISSON_PRE_SOLVE_UPDATE_PPE_MESH",ERR,ERROR,*999) IF(ASSOCIATED(CONTROL_LOOP)) THEN - CALL CONTROL_LOOP_CURRENT_TIMES_GET(CONTROL_LOOP,CURRENT_TIME,TIME_INCREMENT,ERR,ERROR,*999) + CALL CONTROL_LOOP_CURRENT_TIMES_GET(CONTROL_LOOP%PARENT_LOOP,CURRENT_TIME,TIME_INCREMENT,ERR,ERROR,*999) NULLIFY(SOLVER_LAPLACE) NULLIFY(SOLVER_ALE_PPE) IF(ASSOCIATED(SOLVER)) THEN - IF(ASSOCIATED(CONTROL_LOOP%PROBLEM)) THEN - SELECT CASE(CONTROL_LOOP%PROBLEM%SUBTYPE) - CASE(EQUATIONS_SET_LINEAR_PRESSURE_POISSON_SUBTYPE) + IF(ASSOCIATED(CONTROL_LOOP%PARENT_LOOP%PROBLEM)) THEN + SELECT CASE(CONTROL_LOOP%PARENT_LOOP%PROBLEM%SUBTYPE) + CASE(PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE) ! do nothing ??? - CASE(EQUATIONS_SET_NONLINEAR_PRESSURE_POISSON_SUBTYPE) + CASE(PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE) ! do nothing ??? - CASE(EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE) + CASE(PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE) !Update mesh within the dynamic solver - IF(SOLVER%SOLVE_TYPE==SOLVER_DYNAMIC_TYPE) THEN + IF(SOLVER%SOLVE_TYPE==SOLVER_LINEAR_TYPE) THEN !Get the independent field for the ALE PPE problem CALL SOLVERS_SOLVER_GET(SOLVER%SOLVERS,1,SOLVER_ALE_PPE,ERR,ERROR,*999) SOLVER_EQUATIONS_ALE_PPE=>SOLVER_ALE_PPE%SOLVER_EQUATIONS @@ -2143,7 +2149,8 @@ SUBROUTINE POISSON_PRE_SOLVE_UPDATE_PPE_MESH(CONTROL_LOOP,SOLVER,ERR,ERROR,*) CALL FIELD_PARAMETER_SET_DATA_GET(EQUATIONS_SET_ALE_PPE%INDEPENDENT%INDEPENDENT_FIELD, & & FIELD_U_VARIABLE_TYPE,FIELD_MESH_DISPLACEMENT_SET_TYPE,MESH_DISPLACEMENT_VALUES,ERR,ERROR,*999) CALL FLUID_MECHANICS_IO_READ_DATA(SOLVER_LINEAR_TYPE,MESH_DISPLACEMENT_VALUES, & - & NUMBER_OF_DIMENSIONS_ALE_PPE,INPUT_TYPE,INPUT_OPTION,CONTROL_LOOP%TIME_LOOP%ITERATION_NUMBER,1.0_DP) + & NUMBER_OF_DIMENSIONS_ALE_PPE,INPUT_TYPE,INPUT_OPTION,CONTROL_LOOP%PARENT_LOOP%TIME_LOOP% & + & ITERATION_NUMBER,1.0_DP) CALL FIELD_PARAMETER_SET_UPDATE_START(EQUATIONS_SET_ALE_PPE%INDEPENDENT%INDEPENDENT_FIELD, & & FIELD_U_VARIABLE_TYPE,FIELD_MESH_DISPLACEMENT_SET_TYPE,ERR,ERROR,*999) CALL FIELD_PARAMETER_SET_UPDATE_FINISH(EQUATIONS_SET_ALE_PPE%INDEPENDENT%INDEPENDENT_FIELD, & @@ -2200,7 +2207,7 @@ SUBROUTINE POISSON_PRE_SOLVE_UPDATE_PPE_MESH(CONTROL_LOOP,SOLVER,ERR,ERROR,*) CALL FIELD_PARAMETER_SET_UPDATE_FINISH(EQUATIONS_SET_ALE_PPE%GEOMETRY%GEOMETRIC_FIELD, & & FIELD_U_VARIABLE_TYPE,FIELD_VALUES_SET_TYPE,ERR,ERROR,*999) !Now use displacement values to calculate velocity values - TIME_INCREMENT=CONTROL_LOOP%TIME_LOOP%TIME_INCREMENT + TIME_INCREMENT=CONTROL_LOOP%PARENT_LOOP%TIME_LOOP%TIME_INCREMENT ALPHA=1.0_DP/TIME_INCREMENT CALL FIELD_PARAMETER_SETS_COPY(INDEPENDENT_FIELD_ALE_PPE,FIELD_U_VARIABLE_TYPE, & & FIELD_MESH_DISPLACEMENT_SET_TYPE,FIELD_MESH_VELOCITY_SET_TYPE,ALPHA,ERR,ERROR,*999) @@ -3724,7 +3731,7 @@ SUBROUTINE 
POISSON_EQUATION_PROBLEM_SETUP(PROBLEM,PROBLEM_SETUP,ERR,ERROR,*) CASE(PROBLEM_LINEAR_SOURCE_POISSON_SUBTYPE) CALL POISSON_EQUATION_PROBLEM_LINEAR_SOURCE_SETUP(PROBLEM,PROBLEM_SETUP,ERR,ERROR,*999) CASE(PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE,PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE, & - & PROBLEM_FITTED_PRESSURE_POISSON_SUBTYPE) + & PROBLEM_FITTED_PRESSURE_POISSON_SUBTYPE,PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE) CALL POISSON_EQUATION_PROBLEM_PRESSURE_POISSON_SETUP(PROBLEM,PROBLEM_SETUP,ERR,ERROR,*999) CASE(PROBLEM_NONLINEAR_SOURCE_POISSON_SUBTYPE) CALL POISSON_EQUATION_PROBLEM_NONLINEAR_SOURCE_SETUP(PROBLEM,PROBLEM_SETUP,ERR,ERROR,*999) @@ -3760,7 +3767,7 @@ SUBROUTINE POISSON_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,ELEMENT_NUMBE INTEGER(INTG) FIELD_VAR_TYPE,ng,mh,mhs,mi,ms,nh,nhs,ni,ns,I,J,K,L,H,element_node_identity !,k1sum,k2sum REAL(DP) :: RWG,SUM,PGMSI(3),PGNSI(3),SUM2,DXI_DX(3,3),DELTA_T,DXI_DX_DX(3,3) REAL(DP) :: U_VALUE(3),U_DERIV(3,3),RHO_PARAM,MU_PARAM,U_OLD(3),U_SECOND(3,3,3),X(3),B(3),P_DERIV(3),DIFF_COEFF,W_VALUE(3) - TYPE(BASIS_TYPE), POINTER :: DEPENDENT_BASIS,GEOMETRIC_BASIS,SOURCE_BASIS + TYPE(BASIS_TYPE), POINTER :: DEPENDENT_BASIS,GEOMETRIC_BASIS,SOURCE_BASIS,INDEPENDENT_BASIS TYPE(EQUATIONS_TYPE), POINTER :: EQUATIONS TYPE(EQUATIONS_MAPPING_TYPE), POINTER :: EQUATIONS_MAPPING TYPE(EQUATIONS_MAPPING_LINEAR_TYPE), POINTER :: LINEAR_MAPPING @@ -3769,7 +3776,7 @@ SUBROUTINE POISSON_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,ELEMENT_NUMBE TYPE(EQUATIONS_MATRICES_RHS_TYPE), POINTER :: RHS_VECTOR TYPE(EQUATIONS_MATRICES_SOURCE_TYPE), POINTER :: SOURCE_VECTOR TYPE(EQUATIONS_MATRIX_TYPE), POINTER :: EQUATIONS_MATRIX - TYPE(FIELD_TYPE), POINTER :: DEPENDENT_FIELD,GEOMETRIC_FIELD,SOURCE_FIELD,MATERIALS_FIELD + TYPE(FIELD_TYPE), POINTER :: DEPENDENT_FIELD,GEOMETRIC_FIELD,SOURCE_FIELD,MATERIALS_FIELD,INDEPENDENT_FIELD TYPE(FIELD_VARIABLE_TYPE), POINTER :: FIELD_VARIABLE TYPE(QUADRATURE_SCHEME_TYPE), POINTER :: QUADRATURE_SCHEME TYPE(VARYING_STRING) :: LOCAL_ERROR @@ -3903,6 +3910,14 @@ SUBROUTINE POISSON_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,ELEMENT_NUMBE & MATERIALS_INTERP_PARAMETERS(FIELD_U_VARIABLE_TYPE)%PTR,ERR,ERROR,*999) CALL FIELD_INTERPOLATION_PARAMETERS_ELEMENT_GET(FIELD_VALUES_SET_TYPE,ELEMENT_NUMBER,EQUATIONS%INTERPOLATION% & & DEPENDENT_INTERP_PARAMETERS(FIELD_VAR_TYPE)%PTR,ERR,ERROR,*999) + IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_ALE_PRESSURE_POISSON_SUBTYPE) THEN + INDEPENDENT_FIELD=>EQUATIONS%INTERPOLATION%INDEPENDENT_FIELD + INDEPENDENT_BASIS=>INDEPENDENT_FIELD%DECOMPOSITION%DOMAIN(INDEPENDENT_FIELD% & + & DECOMPOSITION%MESH_COMPONENT_NUMBER)%PTR% & + & TOPOLOGY%ELEMENTS%ELEMENTS(ELEMENT_NUMBER)%BASIS + CALL FIELD_INTERPOLATION_PARAMETERS_ELEMENT_GET(FIELD_MESH_VELOCITY_SET_TYPE,ELEMENT_NUMBER,EQUATIONS%INTERPOLATION% & + & INDEPENDENT_INTERP_PARAMETERS(FIELD_VAR_TYPE)%PTR,ERR,ERROR,*999) + ENDIF !\todo: Check the influence of DIFF_COEFF DIFF_COEFF=1.0_DP @@ -3940,7 +3955,7 @@ SUBROUTINE POISSON_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,ELEMENT_NUMBE & INDEPENDENT_INTERP_POINT(FIELD_U_VARIABLE_TYPE)%PTR,ERR,ERROR,*999) W_VALUE(1)=EQUATIONS%INTERPOLATION%INDEPENDENT_INTERP_POINT(FIELD_U_VARIABLE_TYPE)%PTR%VALUES(1,NO_PART_DERIV) W_VALUE(2)=EQUATIONS%INTERPOLATION%INDEPENDENT_INTERP_POINT(FIELD_U_VARIABLE_TYPE)%PTR%VALUES(2,NO_PART_DERIV) - IF(FIELD_VARIABLE%NUMBER_OF_COMPONENTS==4) THEN + IF(INDEPENDENT_BASIS%NUMBER_OF_XI==3) THEN W_VALUE(3)=EQUATIONS%INTERPOLATION%INDEPENDENT_INTERP_POINT(FIELD_U_VARIABLE_TYPE)%PTR%VALUES(3,NO_PART_DERIV) 
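!Only a three-dimensional independent-field basis carries a third mesh-velocity component, hence the NUMBER_OF_XI test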
END IF ELSE @@ -4700,6 +4715,10 @@ SUBROUTINE POISSON_EQUATION_PROBLEM_SUBTYPE_SET(PROBLEM,PROBLEM_SUBTYPE,ERR,ERRO PROBLEM%CLASS=PROBLEM_CLASSICAL_FIELD_CLASS PROBLEM%TYPE=PROBLEM_POISSON_EQUATION_TYPE PROBLEM%SUBTYPE=PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE + CASE(PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE) + PROBLEM%CLASS=PROBLEM_CLASSICAL_FIELD_CLASS + PROBLEM%TYPE=PROBLEM_POISSON_EQUATION_TYPE + PROBLEM%SUBTYPE=PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE CASE(PROBLEM_FITTED_PRESSURE_POISSON_SUBTYPE) PROBLEM%CLASS=PROBLEM_CLASSICAL_FIELD_CLASS PROBLEM%TYPE=PROBLEM_POISSON_EQUATION_TYPE @@ -5001,7 +5020,8 @@ SUBROUTINE POISSON_EQUATION_PROBLEM_PRESSURE_POISSON_SETUP(PROBLEM,PROBLEM_SETUP NULLIFY(SOLVERS) IF(ASSOCIATED(PROBLEM)) THEN IF(PROBLEM%SUBTYPE==PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE.OR.PROBLEM%SUBTYPE== & - & PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE) THEN + & PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE.OR.PROBLEM%SUBTYPE== & + & PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE) THEN SELECT CASE(PROBLEM_SETUP%SETUP_TYPE) CASE(PROBLEM_SETUP_INITIAL_TYPE) SELECT CASE(PROBLEM_SETUP%ACTION_TYPE) @@ -5393,7 +5413,8 @@ SUBROUTINE POISSON_POST_SOLVE(CONTROL_LOOP,SOLVER,ERR,ERROR,*) ! do nothing CASE(PROBLEM_NONLINEAR_SOURCE_POISSON_SUBTYPE) ! do nothing - CASE(PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE,PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE) + CASE(PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE,PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE, & + & PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE) CALL POISSON_POST_SOLVE_OUTPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERROR,*999) CASE(PROBLEM_FITTED_PRESSURE_POISSON_SUBTYPE) !Post solve for the linear solver @@ -5500,7 +5521,7 @@ SUBROUTINE POISSON_PRE_SOLVE(CONTROL_LOOP,SOLVER,ERR,ERROR,*) !First update source field for PPE calculation CALL POISSON_PRE_SOLVE_UPDATE_PPE_SOURCE(CONTROL_LOOP,SOLVER,ERR,ERROR,*999) !Read in the lable information - CALL POISSON_PRE_SOLVE_UPDATE_INPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERROR,*999) +! CALL POISSON_PRE_SOLVE_UPDATE_INPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERROR,*999) CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Solving PPE problem... ",ERR,ERROR,*999) ELSE CALL FLAG_ERROR("Solver global number not associated for PPE problem.",ERR,ERROR,*999) @@ -5811,7 +5832,8 @@ SUBROUTINE POISSON_POST_SOLVE_OUTPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERROR,*) ! do nothing CASE(PROBLEM_NONLINEAR_SOURCE_POISSON_SUBTYPE) ! do nothing - CASE(PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE,PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE) + CASE(PROBLEM_LINEAR_PRESSURE_POISSON_SUBTYPE,PROBLEM_NONLINEAR_PRESSURE_POISSON_SUBTYPE, & + & PROBLEM_ALE_PRESSURE_POISSON_SUBTYPE) IF(CONTROL_LOOP%WHILE_LOOP%ITERATION_NUMBER==CONTROL_LOOP%WHILE_LOOP%MAXIMUM_NUMBER_OF_ITERATIONS)THEN CONTROL_TIME_LOOP=>CONTROL_LOOP%PARENT_LOOP CALL CONTROL_LOOP_CURRENT_TIMES_GET(CONTROL_TIME_LOOP,CURRENT_TIME,TIME_INCREMENT,ERR,ERROR,*999) diff --git a/src/advection_diffusion_equation_routines.f90 b/src/advection_diffusion_equation_routines.f90 index 6511ace4..9e3833f0 100644 --- a/src/advection_diffusion_equation_routines.f90 +++ b/src/advection_diffusion_equation_routines.f90 @@ -2254,9 +2254,11 @@ SUBROUTINE ADVECTION_DIFFUSION_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,E INTEGER(INTG), INTENT(OUT) :: ERR !EQUATIONS%EQUATIONS_MAPPING + IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_MULTI_COMP_TRANSPORT_ADVEC_DIFF_SUBTYPE .OR. 
& + & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_MULTI_COMP_TRANSPORT_ADVEC_DIFF_SUPG_SUBTYPE) THEN + EQUATIONS_SET_FIELD=>EQUATIONS_SET%EQUATIONS_SET_FIELD%EQUATIONS_SET_FIELD_FIELD + CALL FIELD_PARAMETER_SET_DATA_GET(EQUATIONS_SET_FIELD,FIELD_U_VARIABLE_TYPE, & + & FIELD_VALUES_SET_TYPE,EQUATIONS_SET_FIELD_DATA,ERR,ERROR,*999) + my_compartment = EQUATIONS_SET_FIELD_DATA(1) + Ncompartments = EQUATIONS_SET_FIELD_DATA(2) + LINEAR_MATRICES=>EQUATIONS_MATRICES%LINEAR_MATRICES + LINEAR_MAPPING=>EQUATIONS_MAPPING%LINEAR_MAPPING + num_var_count=0 + DO imatrix = 1,Ncompartments + IF(imatrix/=my_compartment)THEN + num_var_count=num_var_count+1 + COUPLING_MATRICES(num_var_count)%PTR=>LINEAR_MATRICES%MATRICES(num_var_count)%PTR + FIELD_VARIABLES(num_var_count)%PTR=>LINEAR_MAPPING%EQUATIONS_MATRIX_TO_VAR_MAPS(num_var_count)%VARIABLE + FIELD_VAR_TYPES(num_var_count)=FIELD_VARIABLES(num_var_count)%PTR%VARIABLE_TYPE + COUPLING_MATRICES(num_var_count)%PTR%ELEMENT_MATRIX%MATRIX=0.0_DP + ENDIF + END DO + ENDIF IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_NO_SOURCE_ADVECTION_DIFFUSION_SUBTYPE .OR. & & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_CONSTANT_SOURCE_ADVECTION_DIFFUSION_SUBTYPE .OR. & & EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_LINEAR_SOURCE_ADVECTION_DIFFUSION_SUBTYPE .OR. & @@ -2481,6 +2507,8 @@ SUBROUTINE ADVECTION_DIFFUSION_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,E DIFFUSION_DEPENDENT_PREVIOUS_INTERPOLATED_POINT=> & & EQUATIONS%INTERPOLATION%DEPENDENT_INTERP_POINT(FIELD_V_VARIABLE_TYPE)%PTR ENDIF + + !Select whether using standard Galerkin scheme, or the stabilised streamwise-upwinding Petrov-Galerkin scheme SELECT CASE(EQUATIONS_SET%SUBTYPE) CASE(EQUATIONS_SET_NO_SOURCE_ADVECTION_DIFFUSION_SUBTYPE,EQUATIONS_SET_CONSTANT_SOURCE_ADVECTION_DIFFUSION_SUBTYPE, & @@ -2571,8 +2599,11 @@ SUBROUTINE ADVECTION_DIFFUSION_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,E ! A_PARAM is the material parameter that multiplies the linear source u ELSEIF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_MULTI_COMP_TRANSPORT_ADVEC_DIFF_SUBTYPE) THEN !for multi-compartment model must include additional terms into the - - + COUPLING_PARAM=EQUATIONS%INTERPOLATION%MATERIALS_INTERP_POINT(FIELD_V_VARIABLE_TYPE)%PTR% & + & VALUES(my_compartment,NO_PART_DERIV) + STIFFNESS_MATRIX%ELEMENT_MATRIX%MATRIX(mhs,nhs)=STIFFNESS_MATRIX%ELEMENT_MATRIX%MATRIX(mhs,nhs)+ & + & SUM*RWG + QUADRATURE_SCHEME%GAUSS_BASIS_FNS(ms,NO_PART_DERIV,ng)* & + & QUADRATURE_SCHEME%GAUSS_BASIS_FNS(ns,NO_PART_DERIV,ng)*RWG*COUPLING_PARAM ENDIF ENDIF IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_NO_SOURCE_ADVECTION_DIFFUSION_SUBTYPE .OR. 
& @@ -2632,6 +2663,82 @@ SUBROUTINE ADVECTION_DIFFUSION_EQUATION_FINITE_ELEMENT_CALCULATE(EQUATIONS_SET,E ENDIF ENDIF IF(UPDATE_RHS_VECTOR) RHS_VECTOR%ELEMENT_VECTOR%VECTOR(mhs)=0.0_DP + + IF(EQUATIONS_SET%SUBTYPE==EQUATIONS_SET_MULTI_COMP_TRANSPORT_ADVEC_DIFF_SUBTYPE) THEN + !Calculate the coupling matrices + + !Loop over element rows + mhs=0 + DO mh=1,FIELD_VARIABLE%NUMBER_OF_COMPONENTS !field_variable is the variable associated with the equations set under consideration + + MESH_COMPONENT_1 = FIELD_VARIABLE%COMPONENTS(mh)%MESH_COMPONENT_NUMBER + DEPENDENT_BASIS_1 => DEPENDENT_FIELD%DECOMPOSITION%DOMAIN(MESH_COMPONENT_1)%PTR% & + & TOPOLOGY%ELEMENTS%ELEMENTS(ELEMENT_NUMBER)%BASIS + QUADRATURE_SCHEME_1 => DEPENDENT_BASIS_1%QUADRATURE% & + & QUADRATURE_SCHEME_MAP(BASIS_DEFAULT_QUADRATURE_SCHEME)%PTR + RWG = EQUATIONS%INTERPOLATION%GEOMETRIC_INTERP_POINT_METRICS(FIELD_U_VARIABLE_TYPE)%PTR%JACOBIAN * & + & QUADRATURE_SCHEME_1%GAUSS_WEIGHTS(ng) + + DO ms=1,DEPENDENT_BASIS_1%NUMBER_OF_ELEMENT_PARAMETERS + mhs=mhs+1 + + num_var_count=0 + DO imatrix = 1,Ncompartments + IF(imatrix/=my_compartment)THEN + num_var_count=num_var_count+1 + +!need to test for the case where imatrix==mycompartment +!the coupling terms then needs to be added into the stiffness matrix + IF(COUPLING_MATRICES(num_var_count)%PTR%UPDATE_MATRIX) THEN + +! !Loop over element columns + nhs=0 +! ! DO nh=1,FIELD_VARIABLE%NUMBER_OF_COMPONENTS + DO nh=1,FIELD_VARIABLES(num_var_count)%PTR%NUMBER_OF_COMPONENTS + + MESH_COMPONENT_2 = FIELD_VARIABLE%COMPONENTS(nh)%MESH_COMPONENT_NUMBER + DEPENDENT_BASIS_2 => DEPENDENT_FIELD%DECOMPOSITION%DOMAIN(MESH_COMPONENT_2)%PTR% & + & TOPOLOGY%ELEMENTS%ELEMENTS(ELEMENT_NUMBER)%BASIS + !--- We cannot use two different quadrature schemes here !!! + QUADRATURE_SCHEME_2 => DEPENDENT_BASIS_2%QUADRATURE% & + & QUADRATURE_SCHEME_MAP(BASIS_DEFAULT_QUADRATURE_SCHEME)%PTR + !RWG = EQUATIONS%INTERPOLATION%GEOMETRIC_INTERP_POINT_METRICS%JACOBIAN * & + ! & QUADRATURE_SCHEME_2%GAUSS_WEIGHTS(ng) + + DO ns=1,DEPENDENT_BASIS_2%NUMBER_OF_ELEMENT_PARAMETERS + nhs=nhs+1 + +! !------------------------------------------------------------------------------------------------------------- +! !concentration test function, concentration trial function +! !For now, this is only a dummy implementation - this still has to be properly set up. +! 
IF(mh==nh.AND.nh>FATAL ERROR: Bus error occured.\n"); + fprintf(stderr,">>FATAL ERROR: Bus error occurred.\n"); } break; #endif /* defined (SIGBUS) */ #if defined (SIGEMT) case SIGEMT: { - fprintf(stderr,">>FATAL ERROR: EMT occured.\n"); + fprintf(stderr,">>FATAL ERROR: EMT occurred.\n"); } break; #endif /* defined (SIGEMT) */ case SIGFPE: { - fprintf(stderr,">>FATAL ERROR: Floating point execption occured.\n"); + fprintf(stderr,">>FATAL ERROR: Floating point execption occurred.\n"); } break; case SIGILL: { - fprintf(stderr,">>FATAL ERROR: Illegal instruction occured.\n"); + fprintf(stderr,">>FATAL ERROR: Illegal instruction occurred.\n"); } break; case SIGINT: { - fprintf(stderr,">>FATAL ERROR: Interrupt occured.\n"); + fprintf(stderr,">>FATAL ERROR: Interrupt occurred.\n"); } break; case SIGABRT: { - fprintf(stderr,">>FATAL ERROR: Abort occured.\n"); + fprintf(stderr,">>FATAL ERROR: Abort occurred.\n"); } break; case SIGSEGV: { - fprintf(stderr,">>FATAL ERROR: Segment violation occured.\n"); + fprintf(stderr,">>FATAL ERROR: Segment violation occurred.\n"); } break; #if defined (SIGTRAP) case SIGTRAP: { - fprintf(stderr,">>FATAL ERROR: Trace trap occured.\n"); + fprintf(stderr,">>FATAL ERROR: Trace trap occurred.\n"); } break; #endif /* defined (SIGTRAP) */ default: { - fprintf(stderr,">>FATAL ERROR: Unknown signal %d occured.\n",code); + fprintf(stderr,">>FATAL ERROR: Unknown signal %d occurred.\n",code); } break; } #endif diff --git a/src/control_loop_routines.f90 b/src/control_loop_routines.f90 index bcaf96b8..45800823 100644 --- a/src/control_loop_routines.f90 +++ b/src/control_loop_routines.f90 @@ -1400,43 +1400,39 @@ SUBROUTINE CONTROL_LOOP_TIMES_SET(CONTROL_LOOP,START_TIME,STOP_TIME,TIME_INCREME CALL ENTERS("CONTROL_LOOP_TIMES_SET",ERR,ERROR,*999) IF(ASSOCIATED(CONTROL_LOOP)) THEN - IF(CONTROL_LOOP%CONTROL_LOOP_FINISHED) THEN - CALL FLAG_ERROR("Control loop has been finished.",ERR,ERROR,*999) - ELSE - IF(CONTROL_LOOP%LOOP_TYPE==PROBLEM_CONTROL_TIME_LOOP_TYPE) THEN - TIME_LOOP=>CONTROL_LOOP%TIME_LOOP - IF(ASSOCIATED(TIME_LOOP)) THEN - IF(ABS(TIME_INCREMENT)<=ZERO_TOLERANCE) THEN - LOCAL_ERROR="The specified time increment of "//TRIM(NUMBER_TO_VSTRING(TIME_INCREMENT,"*",ERR,ERROR))// & - & " is invalid. The time increment must not be zero." - CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) + IF(CONTROL_LOOP%LOOP_TYPE==PROBLEM_CONTROL_TIME_LOOP_TYPE) THEN + TIME_LOOP=>CONTROL_LOOP%TIME_LOOP + IF(ASSOCIATED(TIME_LOOP)) THEN + IF(ABS(TIME_INCREMENT)<=ZERO_TOLERANCE) THEN + LOCAL_ERROR="The specified time increment of "//TRIM(NUMBER_TO_VSTRING(TIME_INCREMENT,"*",ERR,ERROR))// & + & " is invalid. The time increment must not be zero." + CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) + ELSE + IF(TIME_INCREMENT>0.0_DP) THEN + IF(STOP_TIME<=START_TIME) THEN + LOCAL_ERROR="The specified stop time of "//TRIM(NUMBER_TO_VSTRING(STOP_TIME,"*",ERR,ERROR))// & + & " is incompatiable with a specified start time of "//TRIM(NUMBER_TO_VSTRING(START_TIME,"*",ERR,ERROR))// & + & ". For a positive time increment the stop time must be > than the start time." + CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) + ENDIF ELSE - IF(TIME_INCREMENT>0.0_DP) THEN - IF(STOP_TIME<=START_TIME) THEN - LOCAL_ERROR="The specified stop time of "//TRIM(NUMBER_TO_VSTRING(STOP_TIME,"*",ERR,ERROR))// & - & " is incompatiable with a specified start time of "//TRIM(NUMBER_TO_VSTRING(START_TIME,"*",ERR,ERROR))// & - & ". For a positive time increment the stop time must be > than the start time." 
- CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) - ENDIF - ELSE - IF(START_TIME<=STOP_TIME) THEN - LOCAL_ERROR="The specified start time of "//TRIM(NUMBER_TO_VSTRING(START_TIME,"*",ERR,ERROR))// & - & " is incompatiable with a specified stop time of "//TRIM(NUMBER_TO_VSTRING(STOP_TIME,"*",ERR,ERROR))// & - & ". For a negative time increment the stop time must be < than the start time." - CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) - ENDIF + IF(START_TIME<=STOP_TIME) THEN + LOCAL_ERROR="The specified start time of "//TRIM(NUMBER_TO_VSTRING(START_TIME,"*",ERR,ERROR))// & + & " is incompatiable with a specified stop time of "//TRIM(NUMBER_TO_VSTRING(STOP_TIME,"*",ERR,ERROR))// & + & ". For a negative time increment the stop time must be < than the start time." + CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) ENDIF ENDIF - TIME_LOOP%START_TIME=START_TIME - TIME_LOOP%STOP_TIME=STOP_TIME - TIME_LOOP%TIME_INCREMENT=TIME_INCREMENT - ELSE - CALL FLAG_ERROR("Control loop time loop is not associated.",ERR,ERROR,*999) ENDIF + TIME_LOOP%START_TIME=START_TIME + TIME_LOOP%STOP_TIME=STOP_TIME + TIME_LOOP%TIME_INCREMENT=TIME_INCREMENT ELSE - CALL FLAG_ERROR("The specified control loop is not a time control loop.",ERR,ERROR,*999) + CALL FLAG_ERROR("Control loop time loop is not associated.",ERR,ERROR,*999) ENDIF - ENDIF + ELSE + CALL FLAG_ERROR("The specified control loop is not a time control loop.",ERR,ERROR,*999) + ENDIF ELSE CALL FLAG_ERROR("Control loop is not associated.",ERR,ERROR,*999) ENDIF diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu index afe660b4..97c6e6d3 100644 --- a/src/cuda_solver_routines.cu +++ b/src/cuda_solver_routines.cu @@ -3,89 +3,111 @@ #include +//extern __shared__ double shared_array[]; +//const int algebraicCount = 25; +//const int rateStateCount = 8; +//const int constantCount = 0; +//const double FLOPSPerFunction = 193.0f; +//const int DEFAULT_TESTING_THREADS = 2000000; +//const int sharedMemoryCellModel = 0; +//const char* cellModelName = "LR R3"; +// +////////////////////////////////////////////////////////////////////////////////// +//// Cell Model Device Functions +////////////////////////////////////////////////////////////////////////////////// +//__device__ void computeRates(float VOI, double* DUMMY, double* STATES, double* ALGEBRAIC, double* RATES) +//{ +// ALGEBRAIC[0] = -25.5; // Add stimulus in proper +// ALGEBRAIC[1] = (0.32f*STATES[0]+15.0816f)/(1.0f - (expf(- 0.1f*STATES[0]-4.713f))); // 7 ops +// +// if (STATES[0] < -40.0f) { +// ALGEBRAIC[2] = 0.135f*(expf(((80.0f+STATES[0])/- 6.8f))); // 4 ops +// ALGEBRAIC[3] = (( - 127140*(expf(0.2444f*STATES[0])) - 3.47400e-05*(expf(-0.04391f*STATES[0])))*(STATES[0]+37.78f))/(1.0f+(expf(0.311f*STATES[0]+24.64053))); // 14 ops +// ALGEBRAIC[9] = 3.56f*(expf(0.079f*STATES[0]))+ 310000*(expf(0.35f*STATES[0])); // 7 ops +// ALGEBRAIC[10] = 0.1212f*(expf(-0.01052f*STATES[0]))/(1.0f+(expf(-0.1378f*STATES[0]-5.531292f))); // 8 ops +// } else { +// ALGEBRAIC[2] = 0.00000; +// ALGEBRAIC[3] = 0.00000; +// ALGEBRAIC[9] = 1.0f/( 0.13f*(1.0f+(expf(((STATES[0]+10.66f)/- 11.1f))))); +// ALGEBRAIC[10] = ( 0.3f*(expf(-2.53500e-07*STATES[0])))/(1.0f+(expf(-0.1f*STATES[0]-3.2f))); +// } +// if (STATES[0] < -100.0f) { +// ALGEBRAIC[16] = 2.837f*(expf(0.04f*STATES[0]+3.08f) - 1.0f)/((STATES[0]+77.0f)*(expf(0.04f*STATES[0]+1.4f))); // 11 ops +// } else { +// ALGEBRAIC[16] = 1.0f; +// } +// +// ALGEBRAIC[4] = (0.095f*(expf(-0.01f*STATES[0] + 0.5f)))/(1.0f+(expf(-0.072*STATES[0] + 0.36f))); // 9 ops +// ALGEBRAIC[5] = 
(0.012f*(expf(-0.008f*STATES[0]-0.224f)))/(1.0f+(expf(0.15f*STATES[0]+4.2f))); // 9 ops +// ALGEBRAIC[6] = (0.0005f*(expf(0.083f*STATES[0]+4.15f)))/(1.0f+(expf(0.057f*STATES[0]+2.85f))); // 9 ops +// ALGEBRAIC[7] = 23*(powf(STATES[1], 3.0f))*STATES[2]*STATES[3]*(STATES[0] - 54.794463f); // 6 ops +// ALGEBRAIC[8] = 0.08f*(expf(-STATES[0]/11.0000)); // 3 ops +// ALGEBRAIC[11] = (0.07f*(expf(-0.017f*STATES[0]-0.748f)))/(1.0f+(expf(0.05f*STATES[0]+2.2f))); // 9 ops +// ALGEBRAIC[12] = (0.0065f*(expf(-0.02f*STATES[0]-0.6f)))/(1.0f+(expf(-0.2f*STATES[0]-6.0f))); // 9 ops +// ALGEBRAIC[13] = (0.0013f*(expf(-0.06f*STATES[0]-1.2f)))/(1.0f+(expf(-0.04f*STATES[0]-0.8f))); // 9 ops +// ALGEBRAIC[14] = 7.7f - 13.0287f*logf(STATES[4]); // 3 ops +// ALGEBRAIC[15] = 0.09f*STATES[5]*STATES[6]*(STATES[0] - ALGEBRAIC[14]); // 4 ops +// ALGEBRAIC[17] = 0.282f*STATES[7]*ALGEBRAIC[16]*(STATES[0] + 77.56758f); // 4 ops +// ALGEBRAIC[18] = 1.02f/(1.0f+(expf(0.2385f*STATES[0] + 6.83967915f))); // 4 ops +// ALGEBRAIC[19] = (0.49124f*(expf( 0.08032f *STATES[0] + 7.49939f) + expf(0.06175f*STATES[0] - 31.271255925f)))/(1.00000+expf(-0.514300*STATES[0] - 214.85137268791f)); // 13 ops +// ALGEBRAIC[20] = ALGEBRAIC[18]/(ALGEBRAIC[18]+ALGEBRAIC[19]); // 2 ops +// ALGEBRAIC[21] = 0.6047f*ALGEBRAIC[20]*(STATES[0] + 87.89290f); // 3 ops +// ALGEBRAIC[22] = 1.0f/(1.0f+(exp(((7.488f - STATES[0])/5.98f)))); // 5 ops +// ALGEBRAIC[23] = 0.0183f*ALGEBRAIC[22]*(STATES[0] + 87.89290f); // 3 ops +// ALGEBRAIC[24] = 0.03921f*STATES[0] +2.3475027f; // 3 ops +// +// RATES[0] = -(ALGEBRAIC[0]+ALGEBRAIC[7]+ALGEBRAIC[15]+ALGEBRAIC[17]+ALGEBRAIC[21]+ALGEBRAIC[23]+ALGEBRAIC[24]); // 7 ops +// RATES[1] = ALGEBRAIC[1]*(1.00000 - STATES[1]) - ALGEBRAIC[8]*STATES[1]; // 4 ops +// RATES[2] = ALGEBRAIC[2]*(1.00000 - STATES[2]) - ALGEBRAIC[9]*STATES[2]; // 4 ops +// RATES[3] = ALGEBRAIC[3]*(1.00000 - STATES[3]) - ALGEBRAIC[10]*STATES[3]; // 4 ops +// RATES[4] = - 0.0001f*ALGEBRAIC[15]+ 0.000007f - 0.07f*STATES[4]; // 4 ops +// RATES[5] = ALGEBRAIC[4]*(1.00000 - STATES[5]) - ALGEBRAIC[11]*STATES[5]; // 4 ops +// RATES[6] = ALGEBRAIC[5]*(1.00000 - STATES[6]) - ALGEBRAIC[12]*STATES[6]; // 4 ops +// RATES[7] = ALGEBRAIC[6]*(1.00000 - STATES[7]) - ALGEBRAIC[13]*STATES[7]; // 4 ops +//} + extern __shared__ double shared_array[]; -const int algebraicCount = 25; -const int rateStateCount = 8; +const int rateStateCount = 2; const int constantCount = 0; -const double FLOPSPerFunction = 193.0f; -const int DEFAULT_TESTING_THREADS = 2000000; +const int algebraicCount = 1; +const double FLOPSPerFunction = 9.0f; +const int DEFAULT_TESTING_THREADS = 20000000; const int sharedMemoryCellModel = 0; -const char* cellModelName = "LR R3"; +const char* cellModelName = "FN R1"; //////////////////////////////////////////////////////////////////////////////// // Cell Model Device Functions //////////////////////////////////////////////////////////////////////////////// -__device__ void computeRates(float VOI, double* DUMMY, double* STATES, double* ALGEBRAIC, double* RATES) +__device__ void computeRates(double time, double* constants, double* states, double* algebraic, double* rates) { - ALGEBRAIC[0] = -25.5; // Add stimulus in proper - ALGEBRAIC[1] = (0.32f*STATES[0]+15.0816f)/(1.0f - (expf(- 0.1f*STATES[0]-4.713f))); // 7 ops - - if (STATES[0] < -40.0f) { - ALGEBRAIC[2] = 0.135f*(expf(((80.0f+STATES[0])/- 6.8f))); // 4 ops - ALGEBRAIC[3] = (( - 127140*(expf(0.2444f*STATES[0])) - 
3.47400e-05*(expf(-0.04391f*STATES[0])))*(STATES[0]+37.78f))/(1.0f+(expf(0.311f*STATES[0]+24.64053))); // 14 ops - ALGEBRAIC[9] = 3.56f*(expf(0.079f*STATES[0]))+ 310000*(expf(0.35f*STATES[0])); // 7 ops - ALGEBRAIC[10] = 0.1212f*(expf(-0.01052f*STATES[0]))/(1.0f+(expf(-0.1378f*STATES[0]-5.531292f))); // 8 ops - } else { - ALGEBRAIC[2] = 0.00000; - ALGEBRAIC[3] = 0.00000; - ALGEBRAIC[9] = 1.0f/( 0.13f*(1.0f+(expf(((STATES[0]+10.66f)/- 11.1f))))); - ALGEBRAIC[10] = ( 0.3f*(expf(-2.53500e-07*STATES[0])))/(1.0f+(expf(-0.1f*STATES[0]-3.2f))); - } - if (STATES[0] < -100.0f) { - ALGEBRAIC[16] = 2.837f*(expf(0.04f*STATES[0]+3.08f) - 1.0f)/((STATES[0]+77.0f)*(expf(0.04f*STATES[0]+1.4f))); // 11 ops - } else { - ALGEBRAIC[16] = 1.0f; - } - - ALGEBRAIC[4] = (0.095f*(expf(-0.01f*STATES[0] + 0.5f)))/(1.0f+(expf(-0.072*STATES[0] + 0.36f))); // 9 ops - ALGEBRAIC[5] = (0.012f*(expf(-0.008f*STATES[0]-0.224f)))/(1.0f+(expf(0.15f*STATES[0]+4.2f))); // 9 ops - ALGEBRAIC[6] = (0.0005f*(expf(0.083f*STATES[0]+4.15f)))/(1.0f+(expf(0.057f*STATES[0]+2.85f))); // 9 ops - ALGEBRAIC[7] = 23*(powf(STATES[1], 3.0f))*STATES[2]*STATES[3]*(STATES[0] - 54.794463f); // 6 ops - ALGEBRAIC[8] = 0.08f*(expf(-STATES[0]/11.0000)); // 3 ops - ALGEBRAIC[11] = (0.07f*(expf(-0.017f*STATES[0]-0.748f)))/(1.0f+(expf(0.05f*STATES[0]+2.2f))); // 9 ops - ALGEBRAIC[12] = (0.0065f*(expf(-0.02f*STATES[0]-0.6f)))/(1.0f+(expf(-0.2f*STATES[0]-6.0f))); // 9 ops - ALGEBRAIC[13] = (0.0013f*(expf(-0.06f*STATES[0]-1.2f)))/(1.0f+(expf(-0.04f*STATES[0]-0.8f))); // 9 ops - ALGEBRAIC[14] = 7.7f - 13.0287f*logf(STATES[4]); // 3 ops - ALGEBRAIC[15] = 0.09f*STATES[5]*STATES[6]*(STATES[0] - ALGEBRAIC[14]); // 4 ops - ALGEBRAIC[17] = 0.282f*STATES[7]*ALGEBRAIC[16]*(STATES[0] + 77.56758f); // 4 ops - ALGEBRAIC[18] = 1.02f/(1.0f+(expf(0.2385f*STATES[0] + 6.83967915f))); // 4 ops - ALGEBRAIC[19] = (0.49124f*(expf( 0.08032f *STATES[0] + 7.49939f) + expf(0.06175f*STATES[0] - 31.271255925f)))/(1.00000+expf(-0.514300*STATES[0] - 214.85137268791f)); // 13 ops - ALGEBRAIC[20] = ALGEBRAIC[18]/(ALGEBRAIC[18]+ALGEBRAIC[19]); // 2 ops - ALGEBRAIC[21] = 0.6047f*ALGEBRAIC[20]*(STATES[0] + 87.89290f); // 3 ops - ALGEBRAIC[22] = 1.0f/(1.0f+(exp(((7.488f - STATES[0])/5.98f)))); // 5 ops - ALGEBRAIC[23] = 0.0183f*ALGEBRAIC[22]*(STATES[0] + 87.89290f); // 3 ops - ALGEBRAIC[24] = 0.03921f*STATES[0] +2.3475027f; // 3 ops - - RATES[0] = -(ALGEBRAIC[0]+ALGEBRAIC[7]+ALGEBRAIC[15]+ALGEBRAIC[17]+ALGEBRAIC[21]+ALGEBRAIC[23]+ALGEBRAIC[24]); // 7 ops - RATES[1] = ALGEBRAIC[1]*(1.00000 - STATES[1]) - ALGEBRAIC[8]*STATES[1]; // 4 ops - RATES[2] = ALGEBRAIC[2]*(1.00000 - STATES[2]) - ALGEBRAIC[9]*STATES[2]; // 4 ops - RATES[3] = ALGEBRAIC[3]*(1.00000 - STATES[3]) - ALGEBRAIC[10]*STATES[3]; // 4 ops - RATES[4] = - 0.0001f*ALGEBRAIC[15]+ 0.000007f - 0.07f*STATES[4]; // 4 ops - RATES[5] = ALGEBRAIC[4]*(1.00000 - STATES[5]) - ALGEBRAIC[11]*STATES[5]; // 4 ops - RATES[6] = ALGEBRAIC[5]*(1.00000 - STATES[6]) - ALGEBRAIC[12]*STATES[6]; // 4 ops - RATES[7] = ALGEBRAIC[6]*(1.00000 - STATES[7]) - ALGEBRAIC[13]*STATES[7]; // 4 ops + rates[1] = 0.005f*(states[0] - 3.0f*states[1]); + rates[0] = ((states[0]*(states[0] - -0.08f)*(1.0f - states[0]) - states[1])+ algebraic[0]); } -//////////////////////////////////////////////////////////////////////////////// -// Cell Model Host Functions ////////// Should Not Be Needed Later ///////////// -//////////////////////////////////////////////////////////////////////////////// -void initProblem(int num_threads, double* STATES) -{ - int i; - - STATES[0] = 
-84.3801107371; - STATES[1] = 0.00171338077730188; - STATES[2] = 0.982660523699656; - STATES[3] = 0.989108212766685; - STATES[4] = 0.00017948816388306; - STATES[5] = 0.00302126301779861; - STATES[6] = 0.999967936476325; - STATES[7] = 0.0417603108167287; - - for (i=1; i>>(timeSteps, stepSize, d_states); checkCUDAError("Single Kernel Execution"); cutilSafeCall( cudaThreadSynchronize() ); // Stop kernel Timer - cutilCheckError(cutStopTimer(kernel_timer)); + cutilCheckError(cutStopTimer(kernel_timer)); cutilSafeCall( cudaMemcpy(h_paged_states, d_states, pagedMemorySize/num_streams, cudaMemcpyDeviceToHost) ); memcpy(h_states, h_paged_states, pagedMemorySize/num_streams); diff --git a/src/external_dae_solver_routines.c b/src/external_dae_solver_routines.c deleted file mode 100644 index f365f7fa..00000000 --- a/src/external_dae_solver_routines.c +++ /dev/null @@ -1,94 +0,0 @@ -/* \file - * $Id: external_dae_solver_routines.c 1836 2010-12-20 17:25:14Z chrispbradley $ - * \author Chris Bradley - * \brief This file provides the routines for solving differential-algebraic equations with an external solver. - *. - * \section LICENSE - * - * Version: MPL 1.1/GPL 2.0/LGPL 2.1 - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the - * License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is OpenCMISS - * - * The Initial Developer of the Original Code is University of Auckland, - * Auckland, New Zealand, the University of Oxford, Oxford, United - * Kingdom and King's College, London, United Kingdom. Portions created - * by the University of Auckland, the University of Oxford and King's - * College, London are Copyright (C) 2007-2010 by the University of - * Auckland, the University of Oxford and King's College, London. - * All Rights Reserved. - * - * Contributor(s): - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - */ - -/* -File: external_dae_solver_routines.c -=================== - -This file provides provides the routines for solving differential-algebraic equations with an external solver. - -Functions included: - -SolverDAEExternalIntegrate Solves the differential-algebraic equation. 
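The kernel body behind the solveSystem launch above is not part of this patch series, so for orientation only: a minimal sketch of what the FN R1 path amounts to, assuming a forward Euler integrator, one cell per thread, and per-cell contiguous state storage. fnRates and solveEuler are illustrative names, not routines in this tree.

    #include <cuda_runtime.h>

    // FitzHugh-Nagumo rates as in the FN R1 computeRates above; the
    // stimulus stands in for algebraic[0].
    __device__ void fnRates(const double *states, double stimulus, double *rates)
    {
        rates[1] = 0.005 * (states[0] - 3.0 * states[1]);
        rates[0] = states[0] * (states[0] + 0.08) * (1.0 - states[0]) - states[1] + stimulus;
    }

    // Assumed integrator: explicit (forward) Euler, one cell per thread,
    // states stored contiguously per cell (layout is an assumption).
    __global__ void solveEuler(int timeSteps, double stepSize, double *states)
    {
        const int nStates = 2;                    // rateStateCount for FN R1
        int cell = blockIdx.x * blockDim.x + threadIdx.x;
        double s[2], r[2];
        s[0] = states[cell * nStates + 0];
        s[1] = states[cell * nStates + 1];
        for (int step = 0; step < timeSteps; ++step) {
            fnRates(s, 0.0, r);                   // no stimulus in this sketch
            s[0] += stepSize * r[0];
            s[1] += stepSize * r[1];
        }
        states[cell * nStates + 0] = s[0];
        states[cell * nStates + 1] = s[1];
    }

    // Host side, matching the launch configuration built above:
    //   dim3 grid(blocksPerDomain, 1, 1), threads(threads_per_block, 1, 1);
    //   solveEuler<<<grid, threads, sharedMem>>>(timeSteps, stepSize, d_states);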
- -*/ - -/* Included files */ - -#include -#include - -#include "external_dae_solver_routines.h" -//#include "cuda_solver_routines.cu" - -/* Type definitions */ - -/* Function Definitions -extern "C" -{*/ -void SolverDAEExternalIntegrate(const int NumberOfDofs, - const double StartTime, - const double EndTime, - double *InitialStep, - const int OnlyOneModelIndex, - int *ModelsData, - int NumberOfState, - double *StateData, - int NumberOfParameters, - double *ParametersData, - int NumberOfIntermediate, - double *IntermediateData, - int *err) -{ - - //unsigned int timesteps = 10; - - //printf("start %f end %f int step %f dofs %d", StartTime, EndTime, InitialStep[0], NumberOfDofs); - - //solve(StateData, StartTime, EndTime, timesteps, NumberOfDofs, 1024, 10, 2, NULL); - -} -//} - diff --git a/src/external_dae_solver_routines.cu b/src/external_dae_solver_routines.cu index cd76e408..5479e992 100644 --- a/src/external_dae_solver_routines.cu +++ b/src/external_dae_solver_routines.cu @@ -83,11 +83,9 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, int *err) { - unsigned int timesteps = 10; + printf("start %f end %f steps %d\n", StartTime, EndTime, (int)((EndTime-StartTime)/InitialStep[0])); - printf("start %f end %f int step %f dofs %d", StartTime, EndTime, InitialStep[0], NumberOfDofs); - - solve(StateData, StartTime, EndTime, timesteps, NumberOfDofs, 1024, 10, 2, NULL); + solve(StateData, StartTime, EndTime, (int)((EndTime-StartTime)/InitialStep[0]), NumberOfDofs, 1024, 10, 2, NULL); } } diff --git a/src/external_dae_solver_routines.h b/src/external_dae_solver_routines.h index ac39b779..d09ea0c9 100644 --- a/src/external_dae_solver_routines.h +++ b/src/external_dae_solver_routines.h @@ -43,8 +43,8 @@ * */ -//extern "C" -//{ +extern "C" +{ void SolverDAEExternalIntegrate(const int NumberOfDofs, const double StartTime, const double EndTime, @@ -58,4 +58,4 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, int NumberOfIntermediate, double *IntermediateData, int *err); -//} +} diff --git a/src/field_IO_routines.f90 b/src/field_IO_routines.f90 index f603c763..c5eeade7 100644 --- a/src/field_IO_routines.f90 +++ b/src/field_IO_routines.f90 @@ -353,16 +353,16 @@ FUNCTION FieldExport_DerivativeIndices( handle, componentNumber, fieldType, vari INTEGER(C_INT) :: FieldExport_DerivativeIndices END FUNCTION FieldExport_DerivativeIndices - SUBROUTINE READ_VTK(filePath, points, numPoints, cells, numCells) & - & BIND(C, NAME="readVTK") - USE TYPES - USE ISO_C_BINDING - CHARACTER(LEN=1,KIND=C_CHAR), INTENT(IN) :: filePath(*) - TYPE(C_PTR), INTENT(OUT):: points - INTEGER(C_INT), INTENT(OUT) :: numPoints - TYPE(C_PTR), INTENT(OUT) :: cells - INTEGER(C_INT), INTENT(OUT) :: numCells - END SUBROUTINE READ_VTK + SUBROUTINE READ_VTK(filePath, points, numPoints, cells, numCells) & + & BIND(C, NAME="readVTK") + USE TYPES + USE ISO_C_BINDING + CHARACTER(LEN=1,KIND=C_CHAR), INTENT(IN) :: filePath(*) + TYPE(C_PTR), INTENT(OUT):: points + INTEGER(C_INT), INTENT(OUT) :: numPoints + TYPE(C_PTR), INTENT(OUT) :: cells + INTEGER(C_INT), INTENT(OUT) :: numCells + END SUBROUTINE READ_VTK END INTERFACE @@ -1955,7 +1955,6 @@ SUBROUTINE FIELD_IO_IMPORT_GLOBAL_MESH(NAME, REGION, MESH, MESH_USER_NUMBER, MAS ENDIF DO idx_comp=1, NUMBER_OF_MESH_COMPONENTS - NULLIFY(ELEMENTS_PTR(idx_comp)%PTR) CALL MESH_TOPOLOGY_ELEMENTS_CREATE_START(MESH,idx_comp,BASIS_FUNCTIONS%BASES(1)%PTR,ELEMENTS_PTR(idx_comp)%PTR, & & ERR,ERROR,*999) ENDDO @@ -5224,14 +5223,15 @@ SUBROUTINE FIELD_IO_EXPORT_NODES_INTO_LOCAL_FILE(NODAL_INFO_SET, NAME, 
my_comput TYPE(VARYING_STRING) :: FILE_NAME !the prefix name of file. TYPE(FIELD_VARIABLE_COMPONENT_TYPE), POINTER :: COMPONENT !the prefix name of file. TYPE(DOMAIN_NODES_TYPE), POINTER :: DOMAIN_NODES ! domain nodes - INTEGER(INTG) :: local_number, global_number, sessionHandle, paddingCount, DERIVATIVE_INDEXES(PART_DERIV_S4_S4_S4) + INTEGER(INTG) :: local_number, global_number, sessionHandle, paddingCount,DERIVATIVE_INDEXES(PART_DERIV_S4_S4_S4) INTEGER(INTG), ALLOCATABLE :: paddingInfo(:) INTEGER(INTG) :: nn, comp_idx, dev_idx, NUM_OF_NODAL_DEV, MAX_NUM_OF_NODAL_DERIVATIVES, total_nodal_values + INTEGER(INTG), POINTER :: GEOMETRIC_PARAMETERS_INTG(:) LOGICAL :: FOUND REAL(C_DOUBLE), ALLOCATABLE, TARGET :: NODAL_BUFFER(:), TOTAL_NODAL_BUFFER(:) - REAL(DP), POINTER :: GEOMETRIC_PARAMETERS(:) + REAL(DP), POINTER :: GEOMETRIC_PARAMETERS_DP(:) !INTEGER(INTG), POINTER :: GEOMETRIC_PARAMETERS_INTG(:) - REAL(DP) :: padding(1) + REAL(DP) :: padding(1),VALUE padding(1) = 1.23456789 @@ -5334,16 +5334,18 @@ SUBROUTINE FIELD_IO_EXPORT_NODES_INTO_LOCAL_FILE(NODAL_INFO_SET, NAME, my_comput ENDDO !paddingCount -! SELECT CASE(COMPONENT%FIELD_VARIABLE%DATA_TYPE) -! CASE(FIELD_INTG_TYPE) -! NULLIFY(GEOMETRIC_PARAMETERS_INTG) -! CALL FIELD_PARAMETER_SET_DATA_GET(COMPONENT%FIELD_VARIABLE%FIELD,FIELD_U_VARIABLE_TYPE, & -! & FIELD_VALUES_SET_TYPE,GEOMETRIC_PARAMETERS_INTG,ERR,ERROR,*999) -! CASE(FIELD_DP_TYPE) - NULLIFY(GEOMETRIC_PARAMETERS) - CALL FIELD_PARAMETER_SET_DATA_GET(COMPONENT%FIELD_VARIABLE%FIELD,FIELD_U_VARIABLE_TYPE, & - & FIELD_VALUES_SET_TYPE,GEOMETRIC_PARAMETERS,ERR,ERROR,*999) -! END SELECT + SELECT CASE(COMPONENT%FIELD_VARIABLE%DATA_TYPE) + CASE(FIELD_INTG_TYPE) + NULLIFY(GEOMETRIC_PARAMETERS_INTG) + CALL FIELD_PARAMETER_SET_DATA_GET(COMPONENT%FIELD_VARIABLE%FIELD,FIELD_U_VARIABLE_TYPE, & + & FIELD_VALUES_SET_TYPE,GEOMETRIC_PARAMETERS_INTG,ERR,ERROR,*999) + CASE(FIELD_DP_TYPE) + NULLIFY(GEOMETRIC_PARAMETERS_DP) + CALL FIELD_PARAMETER_SET_DATA_GET(COMPONENT%FIELD_VARIABLE%FIELD,FIELD_U_VARIABLE_TYPE, & + & FIELD_VALUES_SET_TYPE,GEOMETRIC_PARAMETERS_DP,ERR,ERROR,*999) + CASE DEFAULT + CALL FLAG_ERROR("Not implemented.",ERR,ERROR,*999) + END SELECT !get the nodal partial derivatives NUM_OF_NODAL_DEV=DOMAIN_NODES%NODES(local_number)%NUMBER_OF_DERIVATIVES @@ -5362,8 +5364,17 @@ SUBROUTINE FIELD_IO_EXPORT_NODES_INTO_LOCAL_FILE(NODAL_INFO_SET, NAME, my_comput NUM_OF_NODAL_DEV = NUM_OF_NODAL_DEV + 1 - NODAL_BUFFER( NUM_OF_NODAL_DEV ) = GEOMETRIC_PARAMETERS( COMPONENT%PARAM_TO_DOF_MAP% & - & NODE_PARAM2DOF_MAP( DERIVATIVE_INDEXES( dev_idx ), local_number ) ) + SELECT CASE(COMPONENT%FIELD_VARIABLE%DATA_TYPE) + CASE(FIELD_INTG_TYPE) + VALUE=REAL(GEOMETRIC_PARAMETERS_INTG( COMPONENT%PARAM_TO_DOF_MAP%NODE_PARAM2DOF_MAP( & + & DERIVATIVE_INDEXES( dev_idx ), local_number ) ),DP) + CASE(FIELD_DP_TYPE) + VALUE=GEOMETRIC_PARAMETERS_DP( COMPONENT%PARAM_TO_DOF_MAP%NODE_PARAM2DOF_MAP( & + & DERIVATIVE_INDEXES( dev_idx ), local_number ) ) + CASE DEFAULT + CALL FLAG_ERROR("Not implemented.",ERR,ERROR,*999) + END SELECT + NODAL_BUFFER( NUM_OF_NODAL_DEV ) = VALUE ENDDO !dev_idx CALL GROW_ARRAY( TOTAL_NODAL_BUFFER, NUM_OF_NODAL_DEV, "Insufficient memory during I/O", ERR, ERROR, *999 ) @@ -5412,7 +5423,8 @@ SUBROUTINE FIELD_IO_EXPORT_NODES_INTO_LOCAL_FILE(NODAL_INFO_SET, NAME, my_comput !release the temporary memory CALL CHECKED_DEALLOCATE( NODAL_BUFFER ) CALL CHECKED_DEALLOCATE( TOTAL_NODAL_BUFFER ) - IF(ASSOCIATED(GEOMETRIC_PARAMETERS)) NULLIFY(GEOMETRIC_PARAMETERS) + IF(ASSOCIATED(GEOMETRIC_PARAMETERS_DP)) 
NULLIFY(GEOMETRIC_PARAMETERS_DP) + IF(ASSOCIATED(GEOMETRIC_PARAMETERS_INTG)) NULLIFY(GEOMETRIC_PARAMETERS_INTG) CALL EXITS("FIELD_IO_EXPORT_NODES_INTO_LOCAL_FILE") RETURN @@ -6066,4 +6078,8 @@ SUBROUTINE FIELD_IO_ELEMENTS_EXPORT(FIELDS, FILE_NAME, METHOD,ERR,ERROR,*) RETURN 1 END SUBROUTINE FIELD_IO_ELEMENTS_EXPORT + ! + !================================================================================================================================ + ! + END MODULE FIELD_IO_ROUTINES diff --git a/src/finite_elasticity_routines.f90 b/src/finite_elasticity_routines.f90 index 96502604..de20d74a 100644 --- a/src/finite_elasticity_routines.f90 +++ b/src/finite_elasticity_routines.f90 @@ -2997,28 +2997,29 @@ SUBROUTINE FINITE_ELASTICITY_EQUATIONS_SET_SETUP(EQUATIONS_SET,EQUATIONS_SET_SET !Check the user specified field CALL FIELD_TYPE_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_GENERAL_TYPE,ERR,ERROR,*999) CALL FIELD_DEPENDENT_TYPE_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_INDEPENDENT_TYPE,ERR,ERROR,*999) + !Question:Better to leave it up for the user to play around? CALL FIELD_NUMBER_OF_VARIABLES_CHECK(EQUATIONS_SET_SETUP%FIELD,2,ERR,ERROR,*999) CALL FIELD_VARIABLE_TYPES_CHECK(EQUATIONS_SET_SETUP%FIELD,(/FIELD_U_VARIABLE_TYPE,FIELD_DELUDELN_VARIABLE_TYPE/), & - & ERR,ERROR,*999) + & ERR,ERROR,*999) CALL FIELD_DIMENSION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_U_VARIABLE_TYPE,FIELD_VECTOR_DIMENSION_TYPE,ERR,ERROR,*999) CALL FIELD_DIMENSION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_DELUDELN_VARIABLE_TYPE,FIELD_VECTOR_DIMENSION_TYPE, & - & ERR,ERROR,*999) + & ERR,ERROR,*999) CALL FIELD_DATA_TYPE_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_U_VARIABLE_TYPE,FIELD_DP_TYPE,ERR,ERROR,*999) CALL FIELD_DATA_TYPE_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_DELUDELN_VARIABLE_TYPE,FIELD_DP_TYPE,ERR,ERROR,*999) CALL FIELD_NUMBER_OF_COMPONENTS_GET(EQUATIONS_SET%GEOMETRY%GEOMETRIC_FIELD,FIELD_U_VARIABLE_TYPE, & - & NUMBER_OF_DIMENSIONS,ERR,ERROR,*999) + & NUMBER_OF_DIMENSIONS,ERR,ERROR,*999) CALL FIELD_NUMBER_OF_COMPONENTS_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_U_VARIABLE_TYPE,NUMBER_OF_COMPONENTS, & - & ERR,ERROR,*999) + & ERR,ERROR,*999) CALL FIELD_NUMBER_OF_COMPONENTS_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_DELUDELN_VARIABLE_TYPE,NUMBER_OF_COMPONENTS, & - & ERR,ERROR,*999) + & ERR,ERROR,*999) SELECT CASE(EQUATIONS_SET%SOLUTION_METHOD) CASE(EQUATIONS_SET_FEM_SOLUTION_METHOD) - DO component_idx=1,NUMBER_OF_DIMENSIONS - CALL FIELD_COMPONENT_INTERPOLATION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_U_VARIABLE_TYPE,component_idx, & - & FIELD_NODE_BASED_INTERPOLATION,ERR,ERROR,*999) - CALL FIELD_COMPONENT_INTERPOLATION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_DELUDELN_VARIABLE_TYPE,component_idx, & - & FIELD_NODE_BASED_INTERPOLATION,ERR,ERROR,*999) - ENDDO !component_idx + !DO component_idx=1,NUMBER_OF_DIMENSIONS + ! CALL FIELD_COMPONENT_INTERPOLATION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_U_VARIABLE_TYPE,component_idx, & + ! & FIELD_NODE_BASED_INTERPOLATION,ERR,ERROR,*999) + ! CALL FIELD_COMPONENT_INTERPOLATION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_DELUDELN_VARIABLE_TYPE,component_idx, & + ! & FIELD_NODE_BASED_INTERPOLATION,ERR,ERROR,*999) + !ENDDO !component_idx !kmith :09.06.09 - Hydrostatic pressure could be node-based as well ! 
CALL FIELD_COMPONENT_INTERPOLATION_CHECK(EQUATIONS_SET_SETUP%FIELD,FIELD_U_VARIABLE_TYPE,4, & @@ -3756,7 +3757,8 @@ SUBROUTINE FINITE_ELASTICITY_POST_SOLVE_OUTPUT_DATA(CONTROL_LOOP,SOLVER,ERR,ERRO TYPE(SOLVER_EQUATIONS_TYPE), POINTER :: SOLVER_EQUATIONS !SOLVER_MAPPING%EQUATIONS_SETS(equations_set_idx)%PTR - IF(EQUATIONS_SET%TYPE==EQUATIONS_SET_FINITE_ELASTICITY_TYPE)THEN - CONTROL_TIME_LOOP=>CONTROL_LOOP - DO loop_idx=1,CONTROL_LOOP%CONTROL_LOOP_LEVEL - IF(CONTROL_TIME_LOOP%LOOP_TYPE==PROBLEM_CONTROL_TIME_LOOP_TYPE) THEN - EXIT - ENDIF - IF (ASSOCIATED(CONTROL_LOOP%PARENT_LOOP)) THEN - CONTROL_TIME_LOOP=>CONTROL_TIME_LOOP%PARENT_LOOP - ELSE - CALL FLAG_ERROR("Could not find a time control loop.",ERR,ERROR,*999) - ENDIF - ENDDO - - CURRENT_LOOP_ITERATION=CONTROL_TIME_LOOP%TIME_LOOP%ITERATION_NUMBER - OUTPUT_ITERATION_NUMBER=CONTROL_TIME_LOOP%TIME_LOOP%OUTPUT_NUMBER - - IF(OUTPUT_ITERATION_NUMBER/=0) THEN - IF(CONTROL_TIME_LOOP%TIME_LOOP%CURRENT_TIME<=CONTROL_TIME_LOOP%TIME_LOOP%STOP_TIME) THEN - IF(CURRENT_LOOP_ITERATION<10) THEN - WRITE(OUTPUT_FILE,'("S_TIMESTP_000",I0)') CURRENT_LOOP_ITERATION - ELSE IF(CURRENT_LOOP_ITERATION<100) THEN - WRITE(OUTPUT_FILE,'("S_TIMESTP_00",I0)') CURRENT_LOOP_ITERATION - ELSE IF(CURRENT_LOOP_ITERATION<1000) THEN - WRITE(OUTPUT_FILE,'("S_TIMESTP_0",I0)') CURRENT_LOOP_ITERATION - ELSE IF(CURRENT_LOOP_ITERATION<10000) THEN - WRITE(OUTPUT_FILE,'("S_TIMESTP_",I0)') CURRENT_LOOP_ITERATION - END IF - FILE=OUTPUT_FILE - ! FILE="TRANSIENT_OUTPUT" - METHOD="FORTRAN" - EXPORT_FIELD=.TRUE. - IF(EXPORT_FIELD) THEN - IF(MOD(CURRENT_LOOP_ITERATION,OUTPUT_ITERATION_NUMBER)==0) THEN - IF(SOLVER%OUTPUT_TYPE>=SOLVER_PROGRESS_OUTPUT) THEN - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Finite Elasticity export fields ...",ERR,ERROR,*999) - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) - ENDIF - CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & - & ERR,ERROR,*999) - IF(SOLVER%OUTPUT_TYPE>=SOLVER_PROGRESS_OUTPUT) THEN - CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Finite Elasticity all fields exported ...",ERR,ERROR,*999) + IF(EQUATIONS_SET%TYPE==EQUATIONS_SET_FINITE_ELASTICITY_TYPE) THEN + TIME_LOOP=>CONTROL_LOOP !Initialise time loop (load increment loop on first) + !Move up to find outer time loop + DO loop_idx=1,CONTROL_LOOP%CONTROL_LOOP_LEVEL-1 + IF(ASSOCIATED(TIME_LOOP%PARENT_LOOP)) THEN + TIME_LOOP=>TIME_LOOP%PARENT_LOOP + ELSE + CALL FLAG_ERROR("Could not find a time control loop.",ERR,ERROR,*999) + ENDIF + ENDDO + WHILE_LOOP=>TIME_LOOP%SUB_LOOPS(1)%PTR + CURRENT_LOOP_ITERATION=TIME_LOOP%TIME_LOOP%ITERATION_NUMBER + OUTPUT_ITERATION_NUMBER=TIME_LOOP%TIME_LOOP%OUTPUT_NUMBER + + IF(WHILE_LOOP%WHILE_LOOP%CONTINUE_LOOP) THEN + IF(TIME_LOOP%TIME_LOOP%CURRENT_TIME<=TIME_LOOP%TIME_LOOP%STOP_TIME) THEN + WRITE(OUTPUT_FILE,'("S_TIMESTP_",I4.4)') CURRENT_LOOP_ITERATION + FILE=OUTPUT_FILE + METHOD="FORTRAN" + EXPORT_FIELD=.TRUE. 
+ IF(EXPORT_FIELD) THEN + IF(MOD(CURRENT_LOOP_ITERATION,OUTPUT_ITERATION_NUMBER)==0) THEN + IF(SOLVER%OUTPUT_TYPE>=SOLVER_PROGRESS_OUTPUT) THEN + CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Finite Elasticity export fields ...",ERR,ERROR,*999) + CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) + ENDIF + CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & + & ERR,ERROR,*999) + IF(SOLVER%OUTPUT_TYPE>=SOLVER_PROGRESS_OUTPUT) THEN + CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Finite Elasticity all fields exported ...",ERR,ERROR,*999) + ENDIF + CALL WRITE_STRING(DIAGNOSTIC_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) ENDIF - CALL WRITE_STRING(DIAGNOSTIC_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) - ENDIF - ENDIF - ENDIF - ENDIF - - !Subiteration intermediate solutions / iterates output: -! IF(CONTROL_LOOP%PARENT_LOOP%LOOP_TYPE==PROBLEM_CONTROL_WHILE_LOOP_TYPE) THEN !subiteration exists -! SUBITERATION_NUMBER=CONTROL_LOOP%PARENT_LOOP%WHILE_LOOP%ITERATION_NUMBER -! IF(CURRENT_LOOP_ITERATION<10) THEN -! IF(SUBITERATION_NUMBER<10) THEN -! WRITE(OUTPUT_FILE,'("S_00",I0,"_SUB_000",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER -! ELSE IF(SUBITERATION_NUMBER<100) THEN -! WRITE(OUTPUT_FILE,'("S_00",I0,"_SUB_00",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER -! END IF -! FILE=OUTPUT_FILE -! METHOD="FORTRAN" -! EXPORT_FIELD=.TRUE. -! IF(EXPORT_FIELD) THEN -! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Finite Elasticity export subiterates ...",ERR,ERROR,*999) -! CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & -! & ERR,ERROR,*999) -! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) -! CALL WRITE_STRING(DIAGNOSTIC_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) + ENDIF + ENDIF !stop_time + ENDIF !continue_loop + +! !Subiteration intermediate solutions / iterates output: +! IF(CONTROL_LOOP%PARENT_LOOP%LOOP_TYPE==PROBLEM_CONTROL_WHILE_LOOP_TYPE) THEN !subiteration exists +! SUBITERATION_NUMBER=CONTROL_LOOP%PARENT_LOOP%WHILE_LOOP%ITERATION_NUMBER +! IF(CURRENT_LOOP_ITERATION<10) THEN +! IF(SUBITERATION_NUMBER<10) THEN +! WRITE(OUTPUT_FILE,'("S_00",I0,"_SUB_000",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER +! ELSE IF(SUBITERATION_NUMBER<100) THEN +! WRITE(OUTPUT_FILE,'("S_00",I0,"_SUB_00",I0)') CURRENT_LOOP_ITERATION,SUBITERATION_NUMBER +! END IF +! FILE=OUTPUT_FILE +! METHOD="FORTRAN" +! EXPORT_FIELD=.TRUE. +! IF(EXPORT_FIELD) THEN +! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,"Finite Elasticity export subiterates ...",ERR,ERROR,*999) +! CALL FLUID_MECHANICS_IO_WRITE_CMGUI(EQUATIONS_SET%REGION,EQUATIONS_SET%GLOBAL_NUMBER,FILE, & +! & ERR,ERROR,*999) +! CALL WRITE_STRING(GENERAL_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) +! CALL WRITE_STRING(DIAGNOSTIC_OUTPUT_TYPE,OUTPUT_FILE,ERR,ERROR,*999) +! ENDIF ! ENDIF ! ENDIF -! ENDIF - ENDIF - ENDDO - ENDIF - ENDIF + ENDIF !EQUATIONS_SET_FINITE_ELASTICITY_TYPE + ENDDO !equations_set_idx + ENDIF !Solver_mapping + ENDIF !Solver_equations CASE DEFAULT LOCAL_ERROR="Problem subtype "//TRIM(NUMBER_TO_VSTRING(CONTROL_LOOP%PROBLEM%SUBTYPE,"*",ERR,ERROR))// & & " is not valid for a finite elasticity problem class." 
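The rewritten output block above collapses the cascaded width tests into a single zero-padded edit descriptor and gates exporting on the output interval. The same naming and gating logic, transcribed loosely to C for illustration (writeStepFile is hypothetical, not a routine in this tree):

    #include <stdio.h>

    /* The I4.4 edit descriptor in WRITE(OUTPUT_FILE,'("S_TIMESTP_",I4.4)')
     * maps to %04d, and export fires only when the iteration divides the
     * output interval; an interval of 0 is treated as "no output", as the
     * old OUTPUT_ITERATION_NUMBER/=0 guard did. */
    static void writeStepFile(int iteration, int outputInterval)
    {
        char name[32];
        if (outputInterval != 0 && iteration % outputInterval == 0) {
            snprintf(name, sizeof name, "S_TIMESTP_%04d", iteration);
            printf("Finite Elasticity export fields ... %s\n", name);
        }
    }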
@@ -4401,9 +4392,9 @@ SUBROUTINE FINITE_ELASTICITY_PRE_SOLVE_UPDATE_BOUNDARY_CONDITIONS(CONTROL_LOOP,S CALL FIELD_PARAMETER_SET_DATA_RESTORE(DEPENDENT_FIELD,FIELD_DELUDELN_VARIABLE_TYPE, & & FIELD_PRESSURE_VALUES_SET_TYPE,DUMMY_VALUES1,ERR,ERROR,*999) ENDIF + CALL FIELD_PARAMETER_SET_DATA_RESTORE(DEPENDENT_FIELD,FIELD_DELUDELN_VARIABLE_TYPE, & + & FIELD_PRESSURE_VALUES_SET_TYPE,CURRENT_PRESSURE_VALUES,ERR,ERROR,*999) ENDIF !Pressure_condition_used - CALL FIELD_PARAMETER_SET_DATA_RESTORE(DEPENDENT_FIELD,FIELD_DELUDELN_VARIABLE_TYPE, & - & FIELD_PRESSURE_VALUES_SET_TYPE,CURRENT_PRESSURE_VALUES,ERR,ERROR,*999) ELSE CALL FLAG_ERROR("Boundary condition variable is not associated.",ERR,ERROR,*999) END IF diff --git a/src/fluid_mechanics_IO_routines.f90 b/src/fluid_mechanics_IO_routines.f90 index e984a23c..53776ef2 100644 --- a/src/fluid_mechanics_IO_routines.f90 +++ b/src/fluid_mechanics_IO_routines.f90 @@ -3712,6 +3712,10 @@ SUBROUTINE FLUID_MECHANICS_IO_READ_DATA(SOLVER_TYPE,INPUT_VALUES,NUMBER_OF_DIMEN WRITE(INPUT_FILE,'("./input/motion/DISPLACEMENT_",I0,".dat")') TIME_STEP ENDIF OPEN(UNIT=TIME_STEP, FILE=INPUT_FILE,STATUS='unknown') + READ(TIME_STEP,*) CHECK + IF(CHECK/=ENDI) THEN + STOP 'Error during data input - probably wrong Lagrangian/Hermite input file!' + ENDIF DO I=1,ENDI READ(TIME_STEP,*) INPUT_VALUES(I) ENDDO diff --git a/src/solver_routines.f90 b/src/solver_routines.f90 index 2b41a3a3..2d5912a5 100644 --- a/src/solver_routines.f90 +++ b/src/solver_routines.f90 @@ -2951,6 +2951,7 @@ SUBROUTINE SOLVER_DAE_EXTERNAL_SOLVE(EXTERNAL_SOLVER,ERR,ERROR,*) ENDIF ELSE NULLIFY(INTERMEDIATE_DATA) + NULLIFY(INTERMEDIATE_FIELD) ENDIF !Call the external solver to integrate these CellML equations From 80d84675e9480b545af17f82f8c244818f90e483 Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Mon, 18 Apr 2011 01:00:15 +0000 Subject: [PATCH 10/24] Fixed issues with recent changes to field variables. 
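The field-variable changes this patch tracks show up in the hunks below: the nonlinear mapping's single RESIDUAL_VARIABLE (and the matrices' single JACOBIAN) became arrays, and single-residual call sites now take the first entry. Schematically, in C with illustrative types only (Fortran indexes the first entry as 1 where C uses 0):

    /* Illustrative shape of the single-member -> array refactor. */
    typedef struct FieldVariable FieldVariable;   /* opaque here */

    typedef struct {
        int             numberOfResiduals;
        FieldVariable **residualVariables;        /* was: FieldVariable *residualVariable */
    } NonlinearMapping;

    /* Existing single-residual code paths now read the first entry. */
    static FieldVariable *firstResidual(const NonlinearMapping *map)
    {
        return (map && map->numberOfResiduals > 0) ? map->residualVariables[0] : 0;
    }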
No relevant Tracker Item git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2122 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- src/cuda_solver_routines.cu | 20 ++++++++++---------- src/diffusion_equation_routines.f90 | 14 +++++++------- src/equations_set_routines.f90 | 2 +- src/field_routines.f90 | 8 ++++++++ src/solver_routines.f90 | 2 +- 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu index 8d34faaa..3d98036c 100644 --- a/src/cuda_solver_routines.cu +++ b/src/cuda_solver_routines.cu @@ -423,9 +423,8 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, dim3 grid(blocksPerDomain, 1, 1); dim3 threads(threads_per_block, 1, 1); - //#ifdef DEBUG - if (timing_file) { + printf("> Device name : %s\n", deviceProp.name ); printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n", deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); @@ -436,6 +435,7 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, printf("grid.x %d threads.x %d sharedMem %d\n", grid.x, threads.x, sharedMem); printf("Spills %d %d %d\n", spill[0], spill[1], spill[2]); + if (timing_file) { // Setup and start global timer timer = 0; cutCreateTimer(&timer); @@ -476,11 +476,11 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, if (timing_file==NULL || !(i==0 && j==0) || (domainIndex+1)*grid.x<=num_blocks) { local_offset = j * rateStateCount * grid.x * threads.x ; if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { - //printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize); + printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize); cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset, lastStreamMemorySize, cudaMemcpyHostToDevice, streams[j]) ); } else { - //printf("normal async in %d, size %d\n", domainIndex, pagedMemorySize/num_streams); + printf("normal async in %d, size %d\n", domainIndex, pagedMemorySize/num_streams); cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset, pagedMemorySize/num_streams, cudaMemcpyHostToDevice, streams[j]) ); } @@ -509,11 +509,11 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, if (timing_file==NULL || !(i==0 && j==0) || (domainIndex+1)*grid.x<=num_blocks) { local_offset = j * rateStateCount * grid.x * threads.x ; if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0) ) { - //printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize); + printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize); cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset, lastStreamMemorySize, cudaMemcpyDeviceToHost, streams[j]) ); } else { - //printf("normal async out %d, size %d\n", domainIndex, pagedMemorySize/num_streams); + printf("normal async out %d, size %d\n", domainIndex, pagedMemorySize/num_streams); cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset, pagedMemorySize/num_streams, cudaMemcpyDeviceToHost, streams[j]) ); } @@ -530,23 +530,23 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, global_offset = i * num_streams * grid.x * threads.x; if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { - //printf("last memcpy out %d\n", domainIndex); + 
printf("last memcpy out %d\n", domainIndex); memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset, lastStreamMemorySize); } else { - //printf("normal memcpy out %d\n", domainIndex); + printf("normal memcpy out %d\n", domainIndex); memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset, pagedMemorySize/num_streams); } global_offset = (i + 1) * num_streams * grid.x * threads.x; if (i == num_partitions - 2 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { - //printf("last memcpy in %d\n", domainIndex); + printf("last memcpy in %d\n", domainIndex); lastStreamMemorySize = sizeof(double)*rateStateCount*(spill[1]*threads_per_block+spill[2]); memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset, lastStreamMemorySize); } else if (i < num_partitions - 2 || j < spill[0] || (i == num_partitions - 2 && j == spill[0])) { - //printf("normal memcpy in %d\n", domainIndex); + printf("normal memcpy in %d\n", domainIndex); memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset, pagedMemorySize/num_streams); } diff --git a/src/diffusion_equation_routines.f90 b/src/diffusion_equation_routines.f90 index 6bf7641e..6cb29525 100644 --- a/src/diffusion_equation_routines.f90 +++ b/src/diffusion_equation_routines.f90 @@ -3012,7 +3012,7 @@ SUBROUTINE DIFFUSION_EQUATION_EQUATIONS_SET_NONLINEAR_SETUP(EQUATIONS_SET,EQUATI CALL EQUATIONS_MAPPING_DYNAMIC_MATRICES_SET(EQUATIONS_MAPPING,.TRUE.,.TRUE.,ERR,ERROR,*999) CALL EQUATIONS_MAPPING_DYNAMIC_VARIABLE_TYPE_SET(EQUATIONS_MAPPING,FIELD_U_VARIABLE_TYPE,ERR,ERROR,*999) CALL EQUATIONS_MAPPING_RHS_VARIABLE_TYPE_SET(EQUATIONS_MAPPING,FIELD_DELUDELN_VARIABLE_TYPE,ERR,ERROR,*999) - CALL EQUATIONS_MAPPING_RESIDUAL_VARIABLE_TYPE_SET(EQUATIONS_MAPPING,FIELD_U_VARIABLE_TYPE,ERR,ERROR,*999) + CALL EQUATIONS_MAPPING_RESIDUAL_VARIABLE_TYPES_SET(EQUATIONS_MAPPING,[FIELD_U_VARIABLE_TYPE],ERR,ERROR,*999) CALL EQUATIONS_MAPPING_CREATE_FINISH(EQUATIONS_MAPPING,ERR,ERROR,*999) !Create the equations matrices CALL EQUATIONS_MATRICES_CREATE_START(EQUATIONS,EQUATIONS_MATRICES,ERR,ERROR,*999) @@ -5458,7 +5458,7 @@ SUBROUTINE DIFFUSION_EQUATION_FINITE_ELEMENT_JACOBIAN_EVALUATE(EQUATIONS_SET,ELE CASE(EQUATIONS_SET_QUADRATIC_SOURCE_DIFFUSION_SUBTYPE) EQUATIONS_MATRICES=>EQUATIONS%EQUATIONS_MATRICES NONLINEAR_MATRICES=>EQUATIONS_MATRICES%NONLINEAR_MATRICES - JACOBIAN_MATRIX=>NONLINEAR_MATRICES%JACOBIAN + JACOBIAN_MATRIX=>NONLINEAR_MATRICES%JACOBIANS(1)%PTR IF(JACOBIAN_MATRIX%UPDATE_JACOBIAN) THEN !Store all these in equations matrices/somewhere else????? 
DEPENDENT_FIELD=>EQUATIONS%INTERPOLATION%DEPENDENT_FIELD @@ -5466,7 +5466,7 @@ SUBROUTINE DIFFUSION_EQUATION_FINITE_ELEMENT_JACOBIAN_EVALUATE(EQUATIONS_SET,ELE MATERIALS_FIELD=>EQUATIONS%INTERPOLATION%MATERIALS_FIELD EQUATIONS_MAPPING=>EQUATIONS%EQUATIONS_MAPPING NONLINEAR_MAPPING=>EQUATIONS_MAPPING%NONLINEAR_MAPPING - DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLE + DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLES(1)%PTR FIELD_VAR_TYPE=DEPENDENT_VARIABLE%VARIABLE_TYPE GEOMETRIC_VARIABLE=>GEOMETRIC_FIELD%VARIABLE_TYPE_MAP(FIELD_U_VARIABLE_TYPE)%PTR DEPENDENT_BASIS=>DEPENDENT_FIELD%DECOMPOSITION%DOMAIN(DEPENDENT_FIELD%DECOMPOSITION%MESH_COMPONENT_NUMBER)%PTR% & @@ -5520,7 +5520,7 @@ SUBROUTINE DIFFUSION_EQUATION_FINITE_ELEMENT_JACOBIAN_EVALUATE(EQUATIONS_SET,ELE CASE(EQUATIONS_SET_EXPONENTIAL_SOURCE_DIFFUSION_SUBTYPE) EQUATIONS_MATRICES=>EQUATIONS%EQUATIONS_MATRICES NONLINEAR_MATRICES=>EQUATIONS_MATRICES%NONLINEAR_MATRICES - JACOBIAN_MATRIX=>NONLINEAR_MATRICES%JACOBIAN + JACOBIAN_MATRIX=>NONLINEAR_MATRICES%JACOBIANS(1)%PTR IF(JACOBIAN_MATRIX%UPDATE_JACOBIAN) THEN !Store all these in equations matrices/somewhere else????? DEPENDENT_FIELD=>EQUATIONS%INTERPOLATION%DEPENDENT_FIELD @@ -5528,7 +5528,7 @@ SUBROUTINE DIFFUSION_EQUATION_FINITE_ELEMENT_JACOBIAN_EVALUATE(EQUATIONS_SET,ELE MATERIALS_FIELD=>EQUATIONS%INTERPOLATION%MATERIALS_FIELD EQUATIONS_MAPPING=>EQUATIONS%EQUATIONS_MAPPING NONLINEAR_MAPPING=>EQUATIONS_MAPPING%NONLINEAR_MAPPING - DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLE + DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLES(1)%PTR FIELD_VAR_TYPE=DEPENDENT_VARIABLE%VARIABLE_TYPE GEOMETRIC_VARIABLE=>GEOMETRIC_FIELD%VARIABLE_TYPE_MAP(FIELD_U_VARIABLE_TYPE)%PTR DEPENDENT_BASIS=>DEPENDENT_FIELD%DECOMPOSITION%DOMAIN(DEPENDENT_FIELD%DECOMPOSITION%MESH_COMPONENT_NUMBER)%PTR% & @@ -5669,7 +5669,7 @@ SUBROUTINE DIFFUSION_EQUATION_FINITE_ELEMENT_RESIDUAL_EVALUATE(EQUATIONS_SET,ELE EQUATIONS_MAPPING=>EQUATIONS%EQUATIONS_MAPPING DYNAMIC_MAPPING=>EQUATIONS_MAPPING%DYNAMIC_MAPPING NONLINEAR_MAPPING=>EQUATIONS_MAPPING%NONLINEAR_MAPPING - DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLE + DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLES(1)%PTR FIELD_VAR_TYPE=DEPENDENT_VARIABLE%VARIABLE_TYPE GEOMETRIC_VARIABLE=>GEOMETRIC_FIELD%VARIABLE_TYPE_MAP(FIELD_U_VARIABLE_TYPE)%PTR DEPENDENT_BASIS=>DEPENDENT_FIELD%DECOMPOSITION%DOMAIN(DEPENDENT_FIELD%DECOMPOSITION%MESH_COMPONENT_NUMBER)%PTR% & @@ -5790,7 +5790,7 @@ SUBROUTINE DIFFUSION_EQUATION_FINITE_ELEMENT_RESIDUAL_EVALUATE(EQUATIONS_SET,ELE EQUATIONS_MAPPING=>EQUATIONS%EQUATIONS_MAPPING DYNAMIC_MAPPING=>EQUATIONS_MAPPING%DYNAMIC_MAPPING NONLINEAR_MAPPING=>EQUATIONS_MAPPING%NONLINEAR_MAPPING - DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLE + DEPENDENT_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLES(1)%PTR FIELD_VAR_TYPE=DEPENDENT_VARIABLE%VARIABLE_TYPE GEOMETRIC_VARIABLE=>GEOMETRIC_FIELD%VARIABLE_TYPE_MAP(FIELD_U_VARIABLE_TYPE)%PTR DEPENDENT_BASIS=>DEPENDENT_FIELD%DECOMPOSITION%DOMAIN(DEPENDENT_FIELD%DECOMPOSITION%MESH_COMPONENT_NUMBER)%PTR% & diff --git a/src/equations_set_routines.f90 b/src/equations_set_routines.f90 index b4012b27..781dda06 100644 --- a/src/equations_set_routines.f90 +++ b/src/equations_set_routines.f90 @@ -4942,7 +4942,7 @@ SUBROUTINE EQUATIONS_SET_RESIDUAL_EVALUATE(EQUATIONS_SET,ERR,ERROR,*) IF(ASSOCIATED(EQUATIONS_MAPPING)) THEN NONLINEAR_MAPPING=>EQUATIONS_MAPPING%NONLINEAR_MAPPING IF(ASSOCIATED(NONLINEAR_MAPPING)) THEN - RESIDUAL_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLE + 
RESIDUAL_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLES(1)%PTR IF(ASSOCIATED(RESIDUAL_VARIABLE)) THEN RESIDUAL_PARAMETER_SET=>RESIDUAL_VARIABLE%PARAMETER_SETS%SET_TYPE(FIELD_RESIDUAL_SET_TYPE)%PTR IF(ASSOCIATED(RESIDUAL_PARAMETER_SET)) THEN diff --git a/src/field_routines.f90 b/src/field_routines.f90 index f77a4100..16a48dcd 100644 --- a/src/field_routines.f90 +++ b/src/field_routines.f90 @@ -20463,6 +20463,14 @@ SUBROUTINE FIELD_PARAMETER_SET_UPDATE_NODE_DP(FIELD,VARIABLE_TYPE,FIELD_SET_TYPE !Only checks if version number is greater than zero IF(DERIVATIVE_NUMBER>0.AND.DERIVATIVE_NUMBER<=DOMAIN_NODES%NODES(DOMAIN_LOCAL_NODE_NUMBER)% & NUMBER_OF_DERIVATIVES.AND.VERSION_NUMBER>0) THEN + WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS, 1) + WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & + & NODE_PARAM2DOF_MAP%NODES, 1) + WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & + & NODE_PARAM2DOF_MAP%NODES(DOMAIN_LOCAL_NODE_NUMBER)%DERIVATIVES, 1) + WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & + & NODE_PARAM2DOF_MAP%NODES(DOMAIN_LOCAL_NODE_NUMBER)%DERIVATIVES(DERIVATIVE_NUMBER)% & + & VERSIONS, 1) dof_idx=FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & & NODE_PARAM2DOF_MAP%NODES(DOMAIN_LOCAL_NODE_NUMBER)%DERIVATIVES(DERIVATIVE_NUMBER)% & & VERSIONS(VERSION_NUMBER) diff --git a/src/solver_routines.f90 b/src/solver_routines.f90 index e1702a92..5c6a3624 100644 --- a/src/solver_routines.f90 +++ b/src/solver_routines.f90 @@ -15318,7 +15318,7 @@ SUBROUTINE SOLVER_VARIABLES_DYNAMIC_FIELD_PREVIOUS_VALUES_UPDATE(SOLVER,ERR,ERRO IF(ASSOCIATED(EQUATIONS_MAPPING)) THEN NONLINEAR_MAPPING=>EQUATIONS_MAPPING%NONLINEAR_MAPPING IF(ASSOCIATED(NONLINEAR_MAPPING)) THEN - RESIDUAL_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLE + RESIDUAL_VARIABLE=>NONLINEAR_MAPPING%RESIDUAL_VARIABLES(1)%PTR IF(ASSOCIATED(RESIDUAL_VARIABLE)) THEN CALL FIELD_PARAMETER_SETS_COPY(RESIDUAL_VARIABLE%FIELD,RESIDUAL_VARIABLE%VARIABLE_TYPE, & & FIELD_RESIDUAL_SET_TYPE,FIELD_PREVIOUS_RESIDUAL_SET_TYPE,1.0_DP,ERR,ERROR,*999) From b6c36290fa547752bd20c6fdc8af9e2df87fef9b Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Tue, 19 Apr 2011 02:14:33 +0000 Subject: [PATCH 11/24] Further optimisations and fixes to partitioning algorithm. Added FHN model definition. 
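The partitioning fixes below stop rounding num_blocks up for a partial block and recompute the leftover directly from the block count. Stripped of the GPU-memory fitting loop, the bookkeeping is plain integer division; a condensed sketch, assuming num_partitions and blocksPerDomain stay at least 1 and with the patch's float ceil() calls shown as integer round-ups:

    /* spill[0]: leftover full domains, spill[1]: leftover blocks within a
     * domain, spill[2]: threads beyond the last full block. */
    static void partitionBlocks(unsigned numThreads, unsigned threadsPerBlock,
                                unsigned numStreams, unsigned *numPartitions,
                                unsigned *blocksPerDomain, unsigned spill[3])
    {
        unsigned numBlocks = numThreads / threadsPerBlock;
        spill[2] = numThreads % threadsPerBlock;       /* no round-up of numBlocks */
        *blocksPerDomain = numBlocks / (numStreams * *numPartitions);
        unsigned remaining = numBlocks % (numStreams * *numPartitions);
        if (remaining > numStreams * *blocksPerDomain) {
            /* grow the partition count to absorb the overflow */
            unsigned extraDomains = (remaining + *blocksPerDomain - 1) / *blocksPerDomain;
            *numPartitions += (extraDomains + numStreams - 1) / numStreams;
        }
        if (numBlocks / (numStreams * *numPartitions) == 0)
            --*numPartitions;                          /* correction added below */
        remaining = numBlocks % *numPartitions;
        spill[0] = remaining / *blocksPerDomain;
        spill[1] = remaining % *blocksPerDomain;
    }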
See Tracker Item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2123 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- src/cuda_solver_routines.cu | 57 +++++++++++++++++++++---------------- src/field_routines.f90 | 8 ------ 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu index 3d98036c..b2bb747d 100644 --- a/src/cuda_solver_routines.cu +++ b/src/cuda_solver_routines.cu @@ -275,7 +275,7 @@ void domainDecomposition(unsigned int* threads_per_domain, unsigned int* spill, } (*num_blocks)=num_threads/(*threads_per_block); spill[2]=num_threads % (*threads_per_block); - if (spill[2] !=0) (*num_blocks)++; // Round up calculation + //if (spill[2] !=0) (*num_blocks)++; // Round up calculation (*blocksPerDomain) = (*num_blocks)/(num_streams*(*num_partitions)); if ((*blocksPerDomain) == 0) { @@ -291,9 +291,14 @@ void domainDecomposition(unsigned int* threads_per_domain, unsigned int* spill, } else if (remainingBlocks > num_streams*(*blocksPerDomain)) { numberRemainingDomains = ceil((float)remainingBlocks/(*blocksPerDomain)); (*num_partitions) += ceil((float)numberRemainingDomains/num_streams); - remainingBlocks = ((spill[2]!=0) ? (*num_blocks)-1 : (*num_blocks))%(num_streams*((*num_partitions)-1)); } + if ((*num_blocks)/(num_streams*(*num_partitions))==0) { + (*num_partitions)--; + } + + remainingBlocks = (*num_blocks)%(*num_partitions); + // Adjust number of partitions so that data fits in GPU memory count = 0; cutilSafeCall( cudaMemGetInfo(&freeGPUMemory, &test) ); @@ -307,8 +312,8 @@ void domainDecomposition(unsigned int* threads_per_domain, unsigned int* spill, } else if (remainingBlocks > num_streams*(*blocksPerDomain)) { numberRemainingDomains = ceil((float)remainingBlocks/(*blocksPerDomain)); (*num_partitions) += ceil((float)numberRemainingDomains/num_streams); - remainingBlocks = ((spill[2]!=0) ? 
(*num_blocks)-1 : (*num_blocks))%(num_streams*((*num_partitions)-1)); } + remainingBlocks = (*num_blocks)%(*num_partitions); } else { fprintf(stderr, "Cannot fit variables in GPU device memory\nReduce threads per block and try again"); exit(EXIT_FAILURE); @@ -327,7 +332,6 @@ void domainDecomposition(unsigned int* threads_per_domain, unsigned int* spill, spill[0] = remainingBlocks/(*blocksPerDomain); spill[1] = remainingBlocks%(*blocksPerDomain); - // if (renainingBlocks != 0) { // blocksLastStream = // } @@ -375,6 +379,7 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, unsigned int kernel_timer = 0; unsigned int domainIndex = 0; unsigned int blocksPerDomain = 0; + unsigned int lastFullDomain = 0; unsigned int threads_per_domain = 0; unsigned int spill[3] = { 0 }; @@ -410,6 +415,9 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, domainDecomposition(&threads_per_domain, spill, num_threads, num_streams, &pagedMemorySize, &threads_per_block, &num_partitions, &blocksPerDomain, &num_blocks, &sharedMem); + + lastFullDomain = num_blocks/blocksPerDomain; + // Allocate CUDA device and host pinned memory cutilSafeCall( cudaMallocHost((void**) &h_paged_states, pagedMemorySize) ); cutilSafeCall( cudaMalloc((void **) &d_states, pagedMemorySize) ); @@ -424,7 +432,7 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, dim3 threads(threads_per_block, 1, 1); //#ifdef DEBUG - + if (timing_file) { printf("> Device name : %s\n", deviceProp.name ); printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n", deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); @@ -435,7 +443,6 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, printf("grid.x %d threads.x %d sharedMem %d\n", grid.x, threads.x, sharedMem); printf("Spills %d %d %d\n", spill[0], spill[1], spill[2]); - if (timing_file) { // Setup and start global timer timer = 0; cutCreateTimer(&timer); @@ -469,18 +476,18 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, } // Queue kernel calls into streams to hide memory transfers (num_partitions sets of kernel calls in each stream) - for(i = 0; i < num_partitions; i++) { + for(i = 0; i < num_partitions+1; i++) { // Asynchronously launch num_streams memcopies for(j = 0; j < num_streams; j++) { domainIndex = j + i*num_streams; - if (timing_file==NULL || !(i==0 && j==0) || (domainIndex+1)*grid.x<=num_blocks) { + if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { local_offset = j * rateStateCount * grid.x * threads.x ; - if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { - printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize); + if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { + //printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize); cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset, lastStreamMemorySize, cudaMemcpyHostToDevice, streams[j]) ); } else { - printf("normal async in %d, size %d\n", domainIndex, pagedMemorySize/num_streams); + //printf("normal async in %d, size %d\n", domainIndex, pagedMemorySize/num_streams); cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset, pagedMemorySize/num_streams, cudaMemcpyHostToDevice, streams[j]) ); } @@ -490,9 +497,9 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, // 
Asynchronously launch num_streams kernels, each operating on its own portion of data for(j = 0; j < num_streams; j++) { domainIndex = j + i*num_streams; - if (timing_file==NULL || !(i==0 && j==0) || (domainIndex+1)*grid.x<=num_blocks) { + if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { local_offset = j * grid.x * threads.x ; - if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { + if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { grid.x = spill[1]+1; solveSystem<<>>(timeSteps, stepSize, d_states + rateStateCount*local_offset); @@ -506,14 +513,14 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, // Asynchronoously launch num_streams memcopies for(j = 0; j < num_streams; j++){ domainIndex = j + i*num_streams; - if (timing_file==NULL || !(i==0 && j==0) || (domainIndex+1)*grid.x<=num_blocks) { + if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { local_offset = j * rateStateCount * grid.x * threads.x ; - if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0) ) { - printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize); + if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { + //printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize); cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset, lastStreamMemorySize, cudaMemcpyDeviceToHost, streams[j]) ); } else { - printf("normal async out %d, size %d\n", domainIndex, pagedMemorySize/num_streams); + //printf("normal async out %d, size %d\n", domainIndex, pagedMemorySize/num_streams); cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset, pagedMemorySize/num_streams, cudaMemcpyDeviceToHost, streams[j]) ); } @@ -523,30 +530,30 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, // Execute memcpys in and out of paged memory when CUDA calls in the streams have finished for(j = 0; j < num_streams; j++){ domainIndex = j + i*num_streams; - if (timing_file==NULL || !(i==0 && j==0) || (domainIndex+1)*grid.x<=num_blocks) { + if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { cudaStreamSynchronize(streams[j]); local_offset = j * rateStateCount * grid.x * threads.x ; global_offset = i * num_streams * grid.x * threads.x; - if (i == num_partitions - 1 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { - printf("last memcpy out %d\n", domainIndex); + if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { + //printf("last memcpy out %d\n", domainIndex); memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset, lastStreamMemorySize); } else { - printf("normal memcpy out %d\n", domainIndex); + //printf("normal memcpy out %d\n", domainIndex); memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset, pagedMemorySize/num_streams); } global_offset = (i + 1) * num_streams * grid.x * threads.x; - if (i == num_partitions - 2 && j == spill[0] && (spill[1]!=0 || spill[2]!=0)) { - printf("last memcpy in %d\n", domainIndex); + if (domainIndex == lastFullDomain - num_streams && (spill[1]!=0 || spill[2]!=0)) { + //printf("last memcpy in %d\n", domainIndex); lastStreamMemorySize = sizeof(double)*rateStateCount*(spill[1]*threads_per_block+spill[2]); memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset, lastStreamMemorySize); 
- } else if (i < num_partitions - 2 || j < spill[0] || (i == num_partitions - 2 && j == spill[0])) { - printf("normal memcpy in %d\n", domainIndex); + } else if (domainIndex < lastFullDomain - num_streams) { + //printf("normal memcpy in %d\n", domainIndex); memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset, pagedMemorySize/num_streams); } diff --git a/src/field_routines.f90 b/src/field_routines.f90 index 16a48dcd..f77a4100 100644 --- a/src/field_routines.f90 +++ b/src/field_routines.f90 @@ -20463,14 +20463,6 @@ SUBROUTINE FIELD_PARAMETER_SET_UPDATE_NODE_DP(FIELD,VARIABLE_TYPE,FIELD_SET_TYPE !Only checks if version number is greater than zero IF(DERIVATIVE_NUMBER>0.AND.DERIVATIVE_NUMBER<=DOMAIN_NODES%NODES(DOMAIN_LOCAL_NODE_NUMBER)% & NUMBER_OF_DERIVATIVES.AND.VERSION_NUMBER>0) THEN - WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS, 1) - WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & - & NODE_PARAM2DOF_MAP%NODES, 1) - WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & - & NODE_PARAM2DOF_MAP%NODES(DOMAIN_LOCAL_NODE_NUMBER)%DERIVATIVES, 1) - WRITE (*,*) SIZE(FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & - & NODE_PARAM2DOF_MAP%NODES(DOMAIN_LOCAL_NODE_NUMBER)%DERIVATIVES(DERIVATIVE_NUMBER)% & - & VERSIONS, 1) dof_idx=FIELD_VARIABLE%COMPONENTS(COMPONENT_NUMBER)%PARAM_TO_DOF_MAP% & & NODE_PARAM2DOF_MAP%NODES(DOMAIN_LOCAL_NODE_NUMBER)%DERIVATIVES(DERIVATIVE_NUMBER)% & & VERSIONS(VERSION_NUMBER) From fb3328a9924215b6a16c6e88d9ef91cad92fb693 Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Wed, 20 Apr 2011 00:17:58 +0000 Subject: [PATCH 12/24] Added per-time step solver timing data dump and total solve timing recording in example code. See tracker item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2124 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- src/cuda_solver_routines.cu | 18 +++++++++--------- src/external_dae_solver_routines.cu | 23 +++++++++++++++++++---- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu index b2bb747d..a39c9076 100644 --- a/src/cuda_solver_routines.cu +++ b/src/cuda_solver_routines.cu @@ -433,15 +433,15 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, //#ifdef DEBUG if (timing_file) { - printf("> Device name : %s\n", deviceProp.name ); - printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n", - deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); - printf("> Cell Model = %s, Integrator = %s\n", cellModelName, integratorName); - printf("> num_threads = %d, num_blocks = %d, threads_per_block = %d, num_partitions = %d, timeSteps = %d, num_streams = %d\n", - num_threads, num_blocks, threads_per_block, num_partitions, timeSteps, num_streams); - //#endif - printf("grid.x %d threads.x %d sharedMem %d\n", grid.x, threads.x, sharedMem); - printf("Spills %d %d %d\n", spill[0], spill[1], spill[2]); +// printf("> Device name : %s\n", deviceProp.name ); +// printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n", +// deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); +// printf("> Cell Model = %s, Integrator = %s\n", cellModelName, integratorName); +// printf("> num_threads = %d, num_blocks = %d, threads_per_block = %d, num_partitions = %d, timeSteps = %d, num_streams = %d\n", +// num_threads, num_blocks, threads_per_block, num_partitions, timeSteps, 
num_streams); +// //#endif +// printf("grid.x %d threads.x %d sharedMem %d\n", grid.x, threads.x, sharedMem); +// printf("Spills %d %d %d\n", spill[0], spill[1], spill[2]); // Setup and start global timer timer = 0; diff --git a/src/external_dae_solver_routines.cu b/src/external_dae_solver_routines.cu index 449e90c1..172af625 100644 --- a/src/external_dae_solver_routines.cu +++ b/src/external_dae_solver_routines.cu @@ -56,7 +56,6 @@ SolverDAEExternalIntegrate Solves the differential-algebraic equation. */ /* Included files */ - #include #include @@ -87,14 +86,30 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, { FILE* timing_file = NULL; - //timing_file = fopen("performance_data.txt","w"); + char *filename = NULL; + + asprintf(&filename,"Results/MonodomainExample-CUDAON-%d-%d-%d-%d.txt",(NumberOfDofs/101) - 1,ThreadsPerBlock,NumberOfPartitions,NumberOfStreams); + timing_file = fopen(filename, "rt"); + + if (!timing_file) { + timing_file = fopen(filename, "wt"); + fprintf(timing_file,"Cell Model\tIntegrator\tNumber of Threads\tNumber of Blocks\tThreads Per Block\tNumber of Partitions\tNumber of Streams\tTotal Computational Time(s)\tTotal GFLOPS\tSingle Kernel Computational Time(s)\tKernel GFLOPS\tDevice Utilisation\n"); + } else { + fclose(timing_file); + timing_file = fopen(filename, "at"); + if (!timing_file) { + fprintf(stderr, "Timing file could not be opened or created."); + exit(EXIT_FAILURE); + } + } //printf("start %f end %f steps %d\n", StartTime, EndTime, (int)((EndTime-StartTime)/InitialStep[0])); // timeSteps = (int)ceil(((EndTime-StartTime)/InitialStep[0])); - solve(StateData, StartTime, EndTime, InitialStep[0], NumberOfDofs, ThreadsPerBlock, NumberOfPartitions, NumberOfStreams, timing_file); + solve(StateData, StartTime, EndTime, InitialStep[0], NumberOfDofs, ThreadsPerBlock, NumberOfPartitions, NumberOfStreams, NULL);// timing_file); - //if (timing_file != NULL ) fclose(timing_file); + if (timing_file != NULL ) fclose(timing_file); + free(filename); } } From 7c52a10da2af56e2b2dfd4e5c641b5a1da902512 Mon Sep 17 00:00:00 2001 From: Jin Budelmann Date: Wed, 20 Apr 2011 05:35:33 +0000 Subject: [PATCH 13/24] Changed some WRITEs to WRITE_STRING_VALUE to clean up output. Added a hack DAE solver timer using COMMON variables in SOLVER_DAE_SOLVE. Temporarily removed single-domain timing as it was producing NaNs for some reason.
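[Editor's note] The timing-file logic added in the hunk above follows a probe-then-open pattern: open the file for reading to learn whether it already exists, write the tab-separated column header only on first creation, and append on every later run. A minimal self-contained sketch of that pattern, assuming nothing beyond standard C (the helper name is the editor's, not the patch's):

#include <stdio.h>

/* Open a per-run results file: create it and emit the column header on
 * first use, otherwise append. Returns NULL on failure. */
static FILE *openTimingFile(const char *filename, const char *header)
{
    FILE *f = fopen(filename, "rt");   /* probe: does the file exist yet? */
    if (!f) {
        f = fopen(filename, "wt");     /* first run: create and write header */
        if (f) fputs(header, f);
    } else {
        fclose(f);
        f = fopen(filename, "at");     /* later runs: append data rows */
    }
    return f;
}

Note that fopen(filename, "at") would create a missing file by itself, so the read probe exists only to decide whether the header still needs to be written.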
See tracker item 2180 git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2125 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- src/cmiss_cellml.f90 | 32 +++++++++---- src/cuda_solver_routines.cu | 74 +++++++++++++++-------------- src/external_dae_solver_routines.cu | 6 +-- src/solver_routines.f90 | 11 +++++ 4 files changed, 75 insertions(+), 48 deletions(-) diff --git a/src/cmiss_cellml.f90 b/src/cmiss_cellml.f90 index 36d6adb7..91989d98 100644 --- a/src/cmiss_cellml.f90 +++ b/src/cmiss_cellml.f90 @@ -439,7 +439,9 @@ SUBROUTINE CELLML_CREATE_FINISH(CELLML,ERR,ERROR,*) !Check that we have set up the models IF(CELLML%NUMBER_OF_MODELS>0) THEN DO model_idx=1,CELLML%NUMBER_OF_MODELS - write(*,*) 'model_idx = ',model_idx + IF(DIAGNOSTICS1) THEN + CALL WRITE_STRING_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'model_idx = ',model_idx,ERR,ERROR,*999) + ENDIF CELLML_MODEL => CELLML%MODELS(model_idx)%PTR !CALL CELLML_MODEL_DEFINITION_SET_SAVE_TEMP_FILES(CELLML_MODEL%PTR,1) ERROR_CODE = CELLML_MODEL_DEFINITION_INSTANTIATE(CELLML_MODEL%PTR) @@ -2719,8 +2721,11 @@ SUBROUTINE CELLML_STATE_FIELD_CREATE_FINISH(CELLML,ERR,ERROR,*) & TRIM(NUMBER_TO_VSTRING(state_component_idx,"*",ERR,ERROR))//"." CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) ENDIF - WRITE(*,*) '(single model) Initial value for state variable: ',state_component_idx,'; type: ',& - & CELLML_VARIABLE_TYPE,'; value = ',INITIAL_VALUE + IF(DIAGNOSTICS1) THEN + CALL WRITE_STRING_TWO_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'(single model) Initial value for state variable: ', & + & state_component_idx,'; type: ',CELLML_VARIABLE_TYPE,ERR,ERROR,*999) + CALL WRITE_STRING_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'value = ',INITIAL_VALUE,ERR,ERROR,*999) + ENDIF CALL FIELD_COMPONENT_VALUES_INITIALISE(CELLML%STATE_FIELD%STATE_FIELD,FIELD_U_VARIABLE_TYPE, & & FIELD_VALUES_SET_TYPE,state_component_idx,INITIAL_VALUE,ERR,ERROR,*999) ENDDO !state_component_idx @@ -2750,8 +2755,11 @@ SUBROUTINE CELLML_STATE_FIELD_CREATE_FINISH(CELLML,ERR,ERROR,*) & TRIM(NUMBER_TO_VSTRING(state_component_idx,"*",ERR,ERROR))//"." CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) ENDIF - WRITE(*,*) '(multiple models) Initial value for state variable: ',state_component_idx,'; type: ',& - & CELLML_VARIABLE_TYPE,'; value = ',INITIAL_VALUE + IF(DIAGNOSTICS1) THEN + CALL WRITE_STRING_TWO_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'(multiple models) Initial value for state variable: '& + &,state_component_idx,'; type: ', CELLML_VARIABLE_TYPE,ERR,ERROR,*999) + CALL WRITE_STRING_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'value = ',INITIAL_VALUE,ERR,ERROR,*999) + ENDIF !\todo make diagnostic output CALL CELLML_FIELD_VARIABLE_SOURCE_DOF_SET_CONSTANT(STATE_VARIABLE,FIELD_VALUES_SET_TYPE,source_dof_idx, & & state_component_idx,INITIAL_VALUE,ERR,ERROR,*999) ENDDO !state_component_idx @@ -3593,8 +3601,11 @@ SUBROUTINE CELLML_PARAMETERS_FIELD_CREATE_FINISH(CELLML,ERR,ERROR,*) & TRIM(NUMBER_TO_VSTRING(parameter_component_idx,"*",ERR,ERROR))//"." 
CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) ENDIF - WRITE(*,*) '(single model) Initial value for parameter variable: ',parameter_component_idx,'; type: ',& - & CELLML_VARIABLE_TYPE,'; value = ',INITIAL_VALUE + IF(DIAGNOSTICS1) THEN + CALL WRITE_STRING_TWO_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'(single model) Initial value for parameter variable: ', & + & parameter_component_idx,'; type: ', CELLML_VARIABLE_TYPE,ERR,ERROR,*999) + CALL WRITE_STRING_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'value = ',INITIAL_VALUE,ERR,ERROR,*999) + ENDIF CALL FIELD_COMPONENT_VALUES_INITIALISE(CELLML%PARAMETERS_FIELD%PARAMETERS_FIELD,FIELD_U_VARIABLE_TYPE, & & FIELD_VALUES_SET_TYPE,parameter_component_idx,INITIAL_VALUE,ERR,ERROR,*999) ENDDO !parameter_component_idx @@ -3624,8 +3635,11 @@ SUBROUTINE CELLML_PARAMETERS_FIELD_CREATE_FINISH(CELLML,ERR,ERROR,*) & TRIM(NUMBER_TO_VSTRING(parameter_component_idx,"*",ERR,ERROR))//"." CALL FLAG_ERROR(LOCAL_ERROR,ERR,ERROR,*999) ENDIF - WRITE(*,*) '(multiple models) Initial value for parameter variable: ',parameter_component_idx,'; type: ',& - & CELLML_VARIABLE_TYPE,'; value = ',INITIAL_VALUE + IF(DIAGNOSTICS1) THEN + CALL WRITE_STRING_TWO_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'(multiple models) Initial value for parameter variable: '& + &, parameter_component_idx,'; type: ', CELLML_VARIABLE_TYPE,ERR,ERROR,*999) + CALL WRITE_STRING_VALUE(DIAGNOSTIC_OUTPUT_TYPE,'value = ',INITIAL_VALUE,ERR,ERROR,*999) + ENDIF !\todo make diagnostic output CALL CELLML_FIELD_VARIABLE_SOURCE_DOF_SET_CONSTANT(PARAMETERS_VARIABLE,FIELD_VALUES_SET_TYPE, & & source_dof_idx, parameter_component_idx,INITIAL_VALUE,ERR,ERROR,*999) ENDDO !parameter_component_idx diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu index a39c9076..dcf65120 100644 --- a/src/cuda_solver_routines.cu +++ b/src/cuda_solver_routines.cu @@ -448,39 +448,39 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, cutCreateTimer(&timer); cutilCheckError(cutStartTimer(timer)); - // Test kernel speed in default stream (timing is more accurate in default stream) - memcpy(h_paged_states, h_states, pagedMemorySize/num_streams); - cutilSafeCall( cudaMemcpy(d_states, h_paged_states, pagedMemorySize/num_streams, - cudaMemcpyHostToDevice) ); - // Start kernel timer - kernel_timer = 0; - cutCreateTimer(&kernel_timer); - cutilCheckError(cutStartTimer(kernel_timer)); - // Start kernel - solveSystem<<>>(timeSteps, stepSize, d_states); - checkCUDAError("Single Kernel Execution"); - cutilSafeCall( cudaThreadSynchronize() ); - // Stop kernel Timer - cutilCheckError(cutStopTimer(kernel_timer)); - cutilSafeCall( cudaMemcpy(h_paged_states, d_states, pagedMemorySize/num_streams, - cudaMemcpyDeviceToHost) ); - memcpy(h_states, h_paged_states, pagedMemorySize/num_streams); +// // Test kernel speed in default stream (timing is more accurate in default stream) +// memcpy(h_paged_states, h_states, pagedMemorySize/num_streams); +// cutilSafeCall( cudaMemcpy(d_states, h_paged_states, pagedMemorySize/num_streams, +// cudaMemcpyHostToDevice) ); +// // Start kernel timer +// kernel_timer = 0; +// cutCreateTimer(&kernel_timer); +// cutilCheckError(cutStartTimer(kernel_timer)); +// // Start kernel +// solveSystem<<>>(timeSteps, stepSize, d_states); +// checkCUDAError("Single Kernel Execution"); +// cutilSafeCall( cudaThreadSynchronize() ); +// // Stop kernel Timer +// cutilCheckError(cutStopTimer(kernel_timer)); +// cutilSafeCall( cudaMemcpy(h_paged_states, d_states, pagedMemorySize/num_streams, +// cudaMemcpyDeviceToHost) ); +// memcpy(h_states,
h_paged_states, pagedMemorySize/num_streams); // Prefetch data for next partition in first stream - if (num_partitions>1) { - global_offset = rateStateCount * num_streams * grid.x * threads.x; - memcpy(h_paged_states, h_states + global_offset, pagedMemorySize/num_streams); - } - } else { +// if (num_partitions>1) { +// global_offset = rateStateCount * num_streams * grid.x * threads.x; +// memcpy(h_paged_states, h_states + global_offset, pagedMemorySize/num_streams); +// } +// } else { memcpy(h_paged_states, h_states, pagedMemorySize); } // Queue kernel calls into streams to hide memory transfers (num_partitions sets of kernel calls in each stream) for(i = 0; i < num_partitions+1; i++) { // Asynchronously launch num_streams memcopies - for(j = 0; j < num_streams; j++) { + for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){ domainIndex = j + i*num_streams; - if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { + if (domainIndex <= lastFullDomain) { local_offset = j * rateStateCount * grid.x * threads.x ; if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { //printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize); @@ -495,9 +495,9 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, } // Execute the kernel // Asynchronously launch num_streams kernels, each operating on its own portion of data - for(j = 0; j < num_streams; j++) { + for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){ domainIndex = j + i*num_streams; - if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { + if (domainIndex <= lastFullDomain) { local_offset = j * grid.x * threads.x ; if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { grid.x = spill[1]+1; @@ -511,9 +511,9 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, } // Asynchronoously launch num_streams memcopies - for(j = 0; j < num_streams; j++){ + for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){ domainIndex = j + i*num_streams; - if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { + if (domainIndex <= lastFullDomain) { local_offset = j * rateStateCount * grid.x * threads.x ; if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) { //printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize); @@ -528,9 +528,9 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, } // Execute memcpys in and out of paged memory when CUDA calls in the streams have finished - for(j = 0; j < num_streams; j++){ + for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 
1 : 0); j < num_streams; j++){ domainIndex = j + i*num_streams; - if (domainIndex <= lastFullDomain && (timing_file==NULL || !(i==0 && j==0))) { + if (domainIndex <= lastFullDomain) { cudaStreamSynchronize(streams[j]); local_offset = j * rateStateCount * grid.x * threads.x ; @@ -570,14 +570,16 @@ void solve(double* h_states, double startTime, double endTime, double stepSize, nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads; gflops = 1.0e-9 * nFLOPS/dSeconds; - kernel_dSeconds = cutGetTimerValue(kernel_timer)/1000.0; - kernel_nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads/num_streams/num_partitions; - kernel_gflops = 1.0e-9 * kernel_nFLOPS/kernel_dSeconds; +// kernel_dSeconds = cutGetTimerValue(kernel_timer)/1000.0; +// kernel_nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads/num_streams/num_partitions; +// kernel_gflops = 1.0e-9 * kernel_nFLOPS/kernel_dSeconds; // Store Stats - fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks, - threads_per_block, num_partitions, num_streams, dSeconds, gflops, kernel_dSeconds, - kernel_gflops, gflops/kernel_gflops*100); + fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks, + threads_per_block, num_partitions, num_streams, dSeconds, gflops); +// fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks, +// threads_per_block, num_partitions, num_streams, dSeconds, gflops, kernel_dSeconds, +// kernel_gflops, gflops/kernel_gflops*100); } // Deallocate Memory and Release Threads diff --git a/src/external_dae_solver_routines.cu b/src/external_dae_solver_routines.cu index 172af625..12acb39f 100644 --- a/src/external_dae_solver_routines.cu +++ b/src/external_dae_solver_routines.cu @@ -84,7 +84,6 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, double *IntermediateData, int *err) { - FILE* timing_file = NULL; char *filename = NULL; @@ -93,7 +92,8 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, if (!timing_file) { timing_file = fopen(filename, "wt"); - fprintf(timing_file,"Cell Model\tIntegrator\tNumber of Threads\tNumber of Blocks\tThreads Per Block\tNumber of Partitions\tNumber of Streams\tTotal Computational Time(s)\tTotal GFLOPS\tSingle Kernel Computational Time(s)\tKernel GFLOPS\tDevice Utilisation\n"); + fprintf(timing_file,"Cell Model\tIntegrator\tNumber of Threads\tNumber of Blocks\tThreads Per Block\tNumber of Partitions\tNumber of Streams\tTotal Computational Time(s)\tTotal GFLOPS\n"); +// fprintf(timing_file,"Cell Model\tIntegrator\tNumber of Threads\tNumber of Blocks\tThreads Per Block\tNumber of Partitions\tNumber of Streams\tTotal Computational Time(s)\tTotal GFLOPS\tSingle Kernel Computational Time(s)\tKernel GFLOPS\tDevice Utilisation\n"); } else { fclose(timing_file); timing_file = fopen(filename, "at"); @@ -106,7 +106,7 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, //printf("start %f end %f steps %d\n", StartTime, EndTime, (int)((EndTime-StartTime)/InitialStep[0])); // timeSteps = (int)ceil(((EndTime-StartTime)/InitialStep[0])); - solve(StateData, StartTime, EndTime, InitialStep[0], NumberOfDofs, ThreadsPerBlock, NumberOfPartitions, NumberOfStreams, NULL);// timing_file); + solve(StateData, StartTime, EndTime, InitialStep[0], NumberOfDofs, ThreadsPerBlock,
NumberOfPartitions, NumberOfStreams, timing_file); if (timing_file != NULL ) fclose(timing_file); free(filename); diff --git a/src/solver_routines.f90 b/src/solver_routines.f90 index 5c6a3624..94787d30 100644 --- a/src/solver_routines.f90 +++ b/src/solver_routines.f90 @@ -3254,6 +3254,13 @@ SUBROUTINE SOLVER_DAE_SOLVE(DAE_SOLVER,ERR,ERROR,*) TYPE(SOLVER_TYPE), POINTER :: SOLVER TYPE(VARYING_STRING) :: LOCAL_ERROR + + ! TEMP ADDED BY JIN + REAL TEMP, ODE_TIME + REAL TIMEARRAY(2) + COMMON ODE_TIME + TEMP = DTIME(TIMEARRAY) + CALL ENTERS("SOLVER_DAE_SOLVE",ERR,ERROR,*999) IF(ASSOCIATED(DAE_SOLVER)) THEN @@ -3320,6 +3327,10 @@ SUBROUTINE SOLVER_DAE_SOLVE(DAE_SOLVER,ERR,ERROR,*) ELSE CALL FLAG_ERROR("Differential-algebraic equation solver is not associated.",ERR,ERROR,*999) ENDIF + + + ! TEMP CODE ADDED BY JIN + ODE_TIME = ODE_TIME + DTIME(TIMEARRAY) CALL EXITS("SOLVER_DAE_SOLVE") RETURN From 102334e161defb758c2da4b0e696c2ace27bd03f Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Tue, 14 Jun 2011 22:33:44 +0000 Subject: [PATCH 14/24] CUDA changes. git-svn-id: https://opencmiss.svn.sourceforge.net/svnroot/opencmiss/cm/branches/developers/jin@2186 61e36070-c886-4827-b7c0-ed9e4dc59cc8 --- ExampleMakefile | 50 +++++++++++++++++++++++++++++++++++++++++++++++-- Makefile | 4 ++-- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/ExampleMakefile b/ExampleMakefile index 1fbd5fb7..377a68b1 100644 --- a/ExampleMakefile +++ b/ExampleMakefile @@ -85,6 +85,10 @@ ifndef USEFIELDML USEFIELDML := false endif +ifndef USECUDA + USECUDA := false +endif + ifeq ($(MPI),mpich2) MPI := mpich2 else @@ -122,6 +126,17 @@ ifeq ($(MPIPROF),true) endif endif +ifeq ($(USECUDA),true) + ifndef CUDA_ROOT + $(error CUDA_ROOT is not defined) + endif + CUDA_DIR := ${CUDA_ROOT} + ifndef CUDA_SDK_ROOT + $(error CUDA_SDK_ROOT is not defined) + endif + CUDA_SDK_DIR := ${CUDA_SDK_ROOT} +endif + #---------------------------------------------------------------------------------------------------------------------------------- BASE_EXAMPLE_NAME = $(EXAMPLE_NAME)Example @@ -839,6 +854,32 @@ else endif endif +#CUDA +CUDA_LIBRARIES =# +CUDA_LIB_PATH =# +CUDA_SDK_LIBRARIES =# +CUDA_SDK_LIB_PATH =# +ifeq ($(USECUDA),true) + ifeq ($(OPERATING_SYSTEM),linux)# Linux + CUDA_LIBRARIES += -lcudart + ifeq ($(API),32) + CUDA_LIB_PATH += $(addprefix -L, $(CUDA_DIR)/lib ) + CUDA_SDK_LIBRARIES += -lcutil -lshrutil + else + CUDA_LIB_PATH += $(addprefix -L, $(CUDA_DIR)/lib64 ) + CUDA_SDK_LIBRARIES += -lcutil_x86_64 -lshrutil_x86_64 + endif + CUDA_SDK_LIB_PATH += $(addprefix -L, $(CUDA_SDK_DIR)/C/common/lib/linux ) + CUDA_SDK_LIB_PATH += $(addprefix -L, $(CUDA_SDK_DIR)/shared/lib/linux ) + else + ifeq ($(OPERATING_SYSTEM),aix)# AIX + $(error CUDA not implemented for AIX) + else# windows + $(error CUDA not implemented for Windows) + endif + endif +endif + OPENCMISS_LIBRARY = $(LIBRARY) OPENCMISS_INCLUDE_PATH = $(addprefix -I, $(INC_DIR) ) @@ -857,6 +898,11 @@ ifeq ($(USEFIELDML),true) EXTERNAL_INCLUDE_PATH += $(FIELDML_INCLUDE_PATH) endif +ifeq ($(USECUDA),true) + EXTERNAL_LIBRARIES += $(CUDA_SDK_LIBRARIES) $(CUDA_LIBRARIES) + EXTERNAL_LIB_PATH += $(CUDA_SDK_LIB_PATH) $(CUDA_LIB_PATH) +endif + EXTERNAL_LIBRARIES += $(strip $(TAO_LIBRARIES) $(PETSC_LIBRARIES) $(SUNDIALS_LIBRARIES) $(HYPRE_LIBRARIES) $(SUPERLU_LIBRARIES) $(PASTIX_LIBRARIES) $(SCOTCH_LIBRARIES) $(MUMPS_LIBRARIES) $(SCALAPACK_LIBRARIES) $(BLACS_LIBRARIES) $(PARMETIS_LIBRARIES) $(TAU_LIBRARIES) $(MPI_LIBRARIES) $(BLAS_LIBRARIES)) EXTERNAL_LIB_PATH += $(strip $(TAO_LIB_PATH) 
$(PETSC_LIB_PATH) $(SUNDIALS_LIB_PATH) $(HYPRE_LIB_PATH) $(SUPERLU_LIB_PATH) $(PASTIX_LIB_PATH) $(SCOTCH_LIB_PATH) $(MUMPS_LIB_PATH) $(SCALAPACK_LIB_PATH) $(BLACS_LIB_PATH) $(PARMETIS_LIB_PATH) $(TAU_LIB_PATH) $(MPI_LIB_PATH) $(BLAS_LIB_PATH)) EXTERNAL_INCLUDE_PATH += $(strip $(MPI_INCLUDE_PATH) ) @@ -892,12 +938,12 @@ $(EXE_DIR) : mkdir -p $@; $(EXECUTABLE) : $(OBJECTS) $(OPENCMISS_LIBRARY) - $(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES) -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -L/usr/local/cuda/lib64/ -lcudart -lcutil_x86_64 -lshrutil_x86_64 + $(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES) +# $(EXE_LINK) -o $@ $(OBJECTS) $(OPENCMISS_LIBRARY) $(ELFLAGS) $(EXTERNAL_LIBRARIES) -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -lcudart -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/lib -L/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/lib/linux -L/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//lib -L/usr/local/cuda/lib64/ -lcudart -lcutil_x86_64 -lshrutil_x86_64 $(OPENCMISS_LIBRARY) : ( cd $(GLOBAL_ROOT); $(MAKE) ) - # Place the list of dependencies for the objects here. # # ---------------------------------------------------------------------------- diff --git a/Makefile b/Makefile index 1e458d82..57a24484 100644 --- a/Makefile +++ b/Makefile @@ -741,10 +741,10 @@ $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.c ( cd $(OBJECT_DIR) ; $(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< ) $(OBJECT_DIR)/%.o : $(SOURCE_DIR)/%.cu - ( cd $(OBJECT_DIR) ; nvcc -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v -I$(SOURCE_DIR) -I/usr/local/cuda/include -I/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/inc -I/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//inc --use_fast_math -DUNIX -O2 -o $@ -c $< ) + ( cd $(OBJECT_DIR) ; nvcc -g -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v -I$(SOURCE_DIR) -I${CUDA_ROOT}/include -I${CUDA_SDK_ROOT}/C/common/inc -I${CUDA_SDK_ROOT}/shared/inc --use_fast_math -DUNIX -O2 -o $@ -c $< ) +# ( cd $(OBJECT_DIR) ; nvcc -g -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v -I$(SOURCE_DIR) -I/usr/local/cuda/include -I/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/inc -I/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//inc --use_fast_math -DUNIX -O2 -o $@ -c $< ) #-gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --compile -m64 --compiler-options -fno-strict-aliasing --ptxas-options=-v -I$(SOURCE_DIR) -I/usr/local/cuda/include -I/people/vbud003/NVIDIA_GPU_Computing_SDK/C/common/inc -I/people/vbud003/NVIDIA_GPU_Computing_SDK/shared//inc --use_fast_math -DUNIX -O2 -o $@ -c $< ) - ifeq ($(USEFIELDML),true) FIELDML_OBJECT = \ $(OBJECT_DIR)/fieldml_util_routines.o \ From 9584a6336d254f1ee975674cf921148b21e5fe9d Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Tue, 18 Oct 2011 09:04:47 +1300 Subject: [PATCH 15/24] Initialise pointers bug fix. 
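[Editor's note] This patch and the two that follow it (patches 16 and 18) apply the same class of fix: Fortran POINTER locals are undefined on entry to a routine, so code that later branches on their association status must NULLIFY them first. A C analogue of the bug, for illustration only (not code from the repository):

#include <stddef.h>

/* An uninitialised pointer holds an indeterminate value, so testing it is
 * undefined behaviour; initialising to NULL (NULLIFY in Fortran) makes the
 * "not yet associated" state well defined and testable. */
void analyticAnalysisSketch(void)
{
    double *numericalValues = NULL;   /* cf. NULLIFY(NUMERICAL_VALUES) */
    double *analyticValues  = NULL;   /* cf. NULLIFY(ANALYTIC_VALUES) */

    if (numericalValues == NULL && analyticValues == NULL) {
        /* safe: both pointers are in a defined, testable state */
    }
}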
--- src/analytic_analysis_routines.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/analytic_analysis_routines.f90 b/src/analytic_analysis_routines.f90 index a894fc8f..b6bc4575 100644 --- a/src/analytic_analysis_routines.f90 +++ b/src/analytic_analysis_routines.f90 @@ -158,6 +158,8 @@ SUBROUTINE ANALYTIC_ANALYSIS_OUTPUT(FIELD,FILENAME,ERR,ERROR,*) LOCAL_STRING="Field "//TRIM(NUMBER_TO_VSTRING(FIELD%USER_NUMBER,"*",ERR,ERROR))//" : "//FIELD%LABEL IF(ERR/=0) GOTO 999 CALL WRITE_STRING(OUTPUT_ID,LOCAL_STRING,ERR,ERROR,*999) + NULLIFY(NUMERICAL_VALUES) + NULLIFY(ANALYTIC_VALUES) !Loop over the variables DO var_idx=1,FIELD%NUMBER_OF_VARIABLES variable_type=FIELD%VARIABLES(var_idx)%VARIABLE_TYPE From 5175b077c514a36f56fdf0d0c270a154de08c0f2 Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Tue, 18 Oct 2011 14:47:13 +1300 Subject: [PATCH 16/24] More uninitialised pointers. --- src/equations_set_routines.f90 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/equations_set_routines.f90 b/src/equations_set_routines.f90 index 1f12324e..5a6466f1 100644 --- a/src/equations_set_routines.f90 +++ b/src/equations_set_routines.f90 @@ -1838,6 +1838,10 @@ SUBROUTINE EQUATIONS_SET_BACKSUBSTITUTE(EQUATIONS_SET,BOUNDARY_CONDITIONS,ERR,ER TYPE(FIELD_VARIABLE_TYPE), POINTER :: DEPENDENT_VARIABLE,RHS_VARIABLE TYPE(VARYING_STRING) :: LOCAL_ERROR + NULLIFY(DEPENDENT_PARAMETERS) + NULLIFY(EQUATIONS_MATRIX_DATA) + NULLIFY(SOURCE_VECTOR_DATA) + CALL ENTERS("EQUATIONS_SET_BACKSUBSTITUTE",ERR,ERROR,*999) IF(ASSOCIATED(EQUATIONS_SET)) THEN From 3fcb2e1856a634c6019fbcc5a3e2caa34d2b9059 Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Tue, 18 Oct 2011 14:47:38 +1300 Subject: [PATCH 17/24] OpenMPI include path corrections. --- ExampleMakefile | 1 + Makefile | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/ExampleMakefile b/ExampleMakefile index 7f2ded5f..f18c0557 100644 --- a/ExampleMakefile +++ b/ExampleMakefile @@ -872,6 +872,7 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux else ifeq ($(MPI),openmpi) MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) MPI_LIBRARIES += -lmpi_f90 -lmpi_f77 -lmpi -libverbs -lnsl -lutil -ldl -lnsl -lutil MPI_LIB_PATH += $(addprefix -L, $(EXTERNAL_CM_DIR)/lib/ ) else diff --git a/Makefile b/Makefile index 3319190c..cb178798 100644 --- a/Makefile +++ b/Makefile @@ -708,18 +708,24 @@ endif MPI_INCLUDE_PATH =# ifeq ($(OPERATING_SYSTEM),linux)# Linux ifeq ($(MPI),mpich2) + MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) else - ifeq ($(MPI),intel) - ifeq ($(MPIPROF),true) - MPI_INCLUDE_PATH += $(addprefix -I, $(VT_ROOT)/include/ ) - endif - ifeq ($(ABI),64) - MPI_INCLUDE_PATH += $(addprefix -I, $(I_MPI_ROOT)/include64/ ) + ifeq ($(MPI),openmpi) + MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) + else + ifeq ($(MPI),intel) + ifeq ($(MPIPROF),true) + MPI_INCLUDE_PATH += $(addprefix -I, $(VT_ROOT)/include/ ) + endif + ifeq ($(ABI),64) + MPI_INCLUDE_PATH += $(addprefix -I, $(I_MPI_ROOT)/include64/ ) + else + MPI_INCLUDE_PATH += $(addprefix -I, $(I_MPI_ROOT)/include/ ) + endif else - MPI_INCLUDE_PATH += $(addprefix -I, $(I_MPI_ROOT)/include/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) endif - else - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) endif endif else From 24e3f1fdf6cbfca2769e9ecd6fddf9b1ded34ba5 Mon Sep 17 00:00:00 2001 
From: Chris Bradley Date: Tue, 18 Oct 2011 14:50:48 +1300 Subject: [PATCH 18/24] Uninitialised pointer. --- src/finite_elasticity_routines.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finite_elasticity_routines.f90 b/src/finite_elasticity_routines.f90 index 402c758f..9a9d9bc1 100644 --- a/src/finite_elasticity_routines.f90 +++ b/src/finite_elasticity_routines.f90 @@ -150,6 +150,8 @@ SUBROUTINE FINITE_ELASTICITY_ANALYTIC_CALCULATE(EQUATIONS_SET,BOUNDARY_CONDITION LOGICAL :: X_FIXED,Y_FIXED,NODE_EXISTS, X_OKAY,Y_OKAY TYPE(VARYING_STRING) :: LOCAL_ERROR + NULLIFY(GEOMETRIC_PARAMETERS) + CALL ENTERS("FINITE_ELASTICITY_ANALYTIC_CALCULATE",ERR,ERROR,*999) MY_COMPUTATIONAL_NODE_NUMBER=COMPUTATIONAL_NODE_NUMBER_GET(ERR,ERROR) From e72f935e3e4c08db41cfc901f189f506e43cecfd Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Tue, 22 Nov 2011 17:57:53 +1300 Subject: [PATCH 19/24] Moving MPI to its own repository. --- ExampleMakefile | 39 +++++++++++++++++++++++++++------------ Makefile | 24 ++++++++++++++++++++---- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/ExampleMakefile b/ExampleMakefile index f18c0557..d204b055 100644 --- a/ExampleMakefile +++ b/ExampleMakefile @@ -58,6 +58,9 @@ endif ifndef EXTERNAL_CM_ROOT EXTERNAL_CM_ROOT := ${OPENCMISSEXTRAS_ROOT}/cm/external endif +ifndef GLOBAL_MPI_ROOT + GLOBAL_MPI_ROOT := ${OPENCMISSEXTRAS_ROOT}/mpi +endif ifndef EXTERNAL_CELLML_ROOT EXTERNAL_CELLML_ROOT := ${OPENCMISSEXTRAS_ROOT}/cellml endif @@ -159,6 +162,9 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) + endif EXTERNAL_CELLML_DIR := $(EXTERNAL_CELLML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER)_$(COMPILER_VERSION) EXTERNAL_FIELDML_DIR := $(EXTERNAL_FIELDML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER)_$(COMPILER_VERSION) EXTERNAL_COMMON_DIR := $(EXTERNAL_COMMON_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER)_$(COMPILER_VERSION) @@ -171,6 +177,9 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) + endif EXTERNAL_CELLML_DIR := $(EXTERNAL_CELLML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER) EXTERNAL_FIELDML_DIR := $(EXTERNAL_FIELDML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER) EXTERNAL_COMMON_DIR := $(EXTERNAL_COMMON_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER) @@ -185,6 +194,9 @@ else ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) + endif EXTERNAL_CELLML_DIR := $(EXTERNAL_CELLML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER) EXTERNAL_FIELDML_DIR := $(EXTERNAL_FIELDML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(COMPILER) EXTERNAL_COMMON_DIR := $(EXTERNAL_COMMON_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) @@ -192,6 +204,9 @@ else ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) 
endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) + endif EXTERNAL_CELLML_DIR := $(EXTERNAL_CELLML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) EXTERNAL_FIELDML_DIR := $(EXTERNAL_FIELDML_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) EXTERNAL_COMMON_DIR := $(EXTERNAL_COMMON_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) @@ -846,14 +861,14 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux ifeq ($(MPIPROF),true) MPI_LIBRARIES += -lmpe_f2cmpi -llmpe -lmpe endif - MPL_LIB = $(call searchdirs, $(EXTERNAL_CM_DIR)/lib/, libmpl* ) + MPL_LIB = $(call searchdirs, $(MPI_DIR)/lib/, libmpl* ) MPI_LIBRARIES += -lmpich ifneq (,$(MPL_LIB)) MPI_LIBRARIES += -lmpl endif MPI_LIBRARIES += -lpthread -lrt - MPI_LIB_PATH += $(addprefix -L, $(EXTERNAL_CM_DIR)/lib/ ) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) + MPI_LIB_PATH += $(addprefix -L, $(MPI_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) else ifeq ($(MPI),intel) ifeq ($(MPIPROF),true) @@ -871,19 +886,19 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux endif else ifeq ($(MPI),openmpi) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/lib/ ) MPI_LIBRARIES += -lmpi_f90 -lmpi_f77 -lmpi -libverbs -lnsl -lutil -ldl -lnsl -lutil - MPI_LIB_PATH += $(addprefix -L, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_LIB_PATH += $(addprefix -L, $(MPI_DIR)/lib/ ) else ifeq ($(MPI),mvapich2) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) MPI_LIBRARIES += -lmpich -lmpichf90 -lpthread -lrt -luuid -lrt - MPI_LIB_PATH += $(addprefix -L, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_LIB_PATH += $(addprefix -L, $(MPI_DIR)/lib/ ) else - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/lib/ ) MPI_LIBRARIES += -lmpi_f90 -lmpi_f77 -lmpi -lopen-rte -lopen-pal -ldl -lnsl -lutil - MPI_LIB_PATH += $(addprefix -L, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_LIB_PATH += $(addprefix -L, $(MPI_DIR)/lib/ ) endif endif endif @@ -892,11 +907,11 @@ else ifeq ($(OPERATING_SYSTEM),aix)# AIX MPI_LIBRARIES = -lmpi MPI_LIB_PATH += $(addprefix -L, /usr/lpp/ppe.poe/lib/ ) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) else# windows MPI_LIBRARIES = -lmpichf90 -lmpich -lpthread -lrt MPI_LIB_PATH += $(addprefix -L, /home/users/local/lib/ ) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) endif endif diff --git a/Makefile b/Makefile index cb178798..92114c2a 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,10 @@ endif GLOBAL_CM_ROOT = $(CURDIR) +ifndef GLOBAL_MPI_ROOT + GLOBAL_MPI_ROOT := ${OPENCMISSEXTRAS_ROOT}/mpi +endif + ifndef GLOBAL_CELLML_ROOT GLOBAL_CELLML_ROOT := ${OPENCMISS_ROOT}/cellml endif @@ -148,6 +152,9 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) + endif OBJECT_DIR := 
$(GLOBAL_CM_ROOT)/object/$(LIB_ARCH_DIR)$(MT_SUFFIX)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) INC_DIR := $(GLOBAL_CM_ROOT)/include/$(BIN_ARCH_DIR)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) LIB_DIR := $(GLOBAL_CM_ROOT)/lib/$(BIN_ARCH_DIR)/$(MPI)/$(COMPILER)_$(COMPILER_VERSION) @@ -155,6 +162,9 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) + endif OBJECT_DIR := $(GLOBAL_CM_ROOT)/object/$(LIB_ARCH_DIR)$(MT_SUFFIX)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) INC_DIR := $(GLOBAL_CM_ROOT)/include/$(BIN_ARCH_DIR)/$(MPI)/$(COMPILER) LIB_DIR := $(GLOBAL_CM_ROOT)/lib/$(BIN_ARCH_DIR)/$(MPI)/$(COMPILER) @@ -164,10 +174,16 @@ else ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) + endif else# windows ifndef EXTERNAL_CM_DIR EXTERNAL_CM_DIR := $(EXTERNAL_CM_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) endif + ifndef MPI_DIR + MPI_DIR := $(GLOBAL_MPI_ROOT)/$(LIB_ARCH_DIR)$(DEBUG_SUFFIX)$(PROF_SUFFIX) + endif endif OBJECT_DIR := $(GLOBAL_CM_ROOT)/object/$(LIB_ARCH_DIR)$(MT_SUFFIX)$(DEBUG_SUFFIX)$(PROF_SUFFIX)/$(MPI)/$(COMPILER) INC_DIR := $(GLOBAL_CM_ROOT)/include/$(BIN_ARCH_DIR)/$(MPI)/$(COMPILER) @@ -708,11 +724,11 @@ endif MPI_INCLUDE_PATH =# ifeq ($(OPERATING_SYSTEM),linux)# Linux ifeq ($(MPI),mpich2) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) else ifeq ($(MPI),openmpi) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/include/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) else ifeq ($(MPI),intel) ifeq ($(MPIPROF),true) @@ -724,7 +740,7 @@ ifeq ($(OPERATING_SYSTEM),linux)# Linux MPI_INCLUDE_PATH += $(addprefix -I, $(I_MPI_ROOT)/include/ ) endif else - MPI_INCLUDE_PATH += $(addprefix -I, $(EXTERNAL_CM_DIR)/lib/ ) + MPI_INCLUDE_PATH += $(addprefix -I, $(MPI_DIR)/include/ ) endif endif endif From a0ceb30204a990360ae0c47a5f93a38f4a7dbefb Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Mon, 28 Nov 2011 10:48:54 +1300 Subject: [PATCH 20/24] Remove nodal from FieldML input parameters. --- src/fieldml_input_routines.f90 | 66 +++++++++++++++++++++++++++- src/opencmiss.f90 | 80 +++++++++++++++++----------------- 2 files changed, 104 insertions(+), 42 deletions(-) diff --git a/src/fieldml_input_routines.f90 b/src/fieldml_input_routines.f90 index 674588ec..a796dd2e 100644 --- a/src/fieldml_input_routines.f90 +++ b/src/fieldml_input_routines.f90 @@ -73,14 +73,14 @@ MODULE FIELDML_INPUT_ROUTINES PUBLIC :: FIELDML_INPUT_INITIALISE_FROM_FILE, FIELDML_INPUT_MESH_CREATE_START, & & FIELDML_INPUT_COORDINATE_SYSTEM_CREATE_START, FIELDML_INPUT_BASIS_CREATE_START, FIELDML_INPUT_CREATE_MESH_COMPONENT, & - & FIELDML_INPUT_FIELD_CREATE_START, FIELDML_INPUT_FIELD_NODAL_PARAMETERS_UPDATE, FIELDML_INPUT_NODES_CREATE_START + & FIELDML_INPUT_FIELD_CREATE_START, FIELDML_INPUT_FIELD_PARAMETERS_UPDATE, FIELDML_INPUT_NODES_CREATE_START CONTAINS ! 
!================================================================================================================================ ! - +uu SUBROUTINE FIELDML_ASSERT_IS_IN( FIELDML_INFO, ERR, ERROR, * ) !Argument variables TYPE(FIELDML_IO_TYPE), INTENT(IN) :: FIELDML_INFO !Inputs from a FieldML file the parameters for a field variable parameter set. + SUBROUTINE FIELDML_INPUT_FIELD_PARAMETERS_UPDATE(FIELDML_INFO, EVALUATOR_NAME, FIELD, VARIABLE_TYPE, SET_TYPE, & + & ERR, ERROR, * ) + !Argument variables + TYPE(FIELDML_IO_TYPE), INTENT(INOUT) :: FIELDML_INFO !1) THEN + CALL FIELD_COMPONENT_INTERPOLATION_GET(FIELD,VARIABLE_TYPE,1,INTERPOLATION_TYPE,ERR,ERROR,*999) + CALL FIELD_COMPONENT_MESH_COMPONENT_GET(FIELD,VARIABLE_TYPE,1,MESH_COMPONENT1,ERR,ERROR,*999) + IS_ALL_NODAL_INTERPOLATION=INTERPOLATION_TYPE==FIELD_NODE_BASED_INTERPOLATION + IS_SAME_MESH_COMPONENTS=.TRUE. + DO component_idx=2,NUMBER_OF_COMPONENTS + CALL FIELD_COMPONENT_INTERPOLATION_GET(FIELD,VARIABLE_TYPE,component_idx,INTERPOLATION_TYPE,ERR,ERROR,*999) + CALL FIELD_COMPONENT_MESH_COMPONENT_GET(FIELD,VARIABLE_TYPE,component_idx,MESH_COMPONENT2,ERR,ERROR,*999) + IS_ALL_NODAL_INTERPOLATION=IS_ALL_NODAL_INTERPOLATION.AND.INTERPOLATION_TYPE==FIELD_NODE_BASED_INTERPOLATION + IS_SAME_MESH_COMPONENTS=IS_SAME_MESH_COMPONENTS.AND.MESH_COMPONENT2==MESH_COMPONENT1 + ENDDO !component_idx + IF(IS_ALL_NODAL_INTERPOLATION) THEN + IF(IS_ALL_MESH_COMPONENTS) THEN + CALL FIELDML_INPUT_FIELD_NODAL_PARAMETERS_UPDATE(FIELDML_INFO,EVALUATOR_NAME,FIELD,VARIABLE_TYPE,SET_TYPE, & + & ERR,ERROR,*999) + ELSE + CALL FLAG_ERROR( & + & "FieldML input parameters only implemented for fields where all components have the same mesh component.", & + & ERR,ERROR,*999) + ENDIF + ELSE + CALL FLAG_ERROR("FieldML input parameters only implemented for fields where all components are nodally interpolated.", & + & ERR,ERROR,*999) + ENDIF + ELSE + CALL FLAG_ERROR("Field does not have any components.",ERR,ERROR,*999) + ENDIF + ELSE + CALL FLAG_ERROR("Field is not associated.",ERR,ERROR,*999) + ENDIF + + CALL EXITS("FIELDML_INPUT_FIELD_PARAMETERS_UPDATE") + RETURN +999 CALL ERRORS("FIELDML_INPUT_FIELD_PARAMETERS_UPDATE",ERR,ERROR) + CALL EXITS("FIELDML_INPUT_FIELD_PARAMETERS_UPDATE") + RETURN 1 + END SUBROUTINE FIELDML_INPUT_FIELD_PARAMETERS_UPDATE + + ! + !================================================================================================================================ + ! + !>Update the given field's nodal parameters using the given parameter evaluator. SUBROUTINE FIELDML_INPUT_FIELD_NODAL_PARAMETERS_UPDATE( FIELDML_INFO, EVALUATOR_NAME, FIELD, VARIABLE_TYPE, SET_TYPE, & & ERR, ERROR, * ) diff --git a/src/opencmiss.f90 b/src/opencmiss.f90 index 639bea7e..cf16b2cd 100644 --- a/src/opencmiss.f90 +++ b/src/opencmiss.f90 @@ -5739,13 +5739,13 @@ MODULE OPENCMISS MODULE PROCEDURE CMISSFieldMLInputFieldCreateStartNumberC END INTERFACE CMISSFieldMLInputFieldCreateStart - !> Updates the given field's nodal dofs using the given parameter evaluator. - INTERFACE CMISSFieldMLInputFieldNodalParametersUpdate - MODULE PROCEDURE CMISSFieldMLInputFieldNodalParametersUpdateObjVS - MODULE PROCEDURE CMISSFieldMLInputFieldNodalParametersUpdateNumberVS - MODULE PROCEDURE CMISSFieldMLInputFieldNodalParametersUpdateObjC - MODULE PROCEDURE CMISSFieldMLInputFieldNodalParametersUpdateNumberC - END INTERFACE CMISSFieldMLInputFieldNodalParametersUpdate + !> Updates the given field's dofs using the given parameter evaluator. 
+ INTERFACE CMISSFieldMLInputFieldParametersUpdate + MODULE PROCEDURE CMISSFieldMLInputFieldParametersUpdateObjVS + MODULE PROCEDURE CMISSFieldMLInputFieldParametersUpdateNumberVS + MODULE PROCEDURE CMISSFieldMLInputFieldParametersUpdateObjC + MODULE PROCEDURE CMISSFieldMLInputFieldParametersUpdateNumberC + END INTERFACE CMISSFieldMLInputFieldParametersUpdate !> Creates a basis using the given FieldML evaluator. INTERFACE CMISSFieldMLInputBasisCreateStart @@ -5788,7 +5788,7 @@ MODULE OPENCMISS PUBLIC :: CMISSFieldMLInputCreateFromFile, CMISSFieldMLInputMeshCreateStart, & & CMISSFieldMLInputCoordinateSystemCreateStart, CMISSFieldMLInputCreateMeshComponent, & & CMISSFieldMLInputFieldCreateStart, CMISSFieldMLInputBasisCreateStart, CMISSFieldMLInputNodesCreateStart, & - & CMISSFieldMLInputFieldNodalParametersUpdate + & CMISSFieldMLInputFieldParametersUpdate PUBLIC :: CMISSFieldMLIOTypeFinalise, CMISSFieldMLIOTypeInitialise, CMISSFieldMLIOGetSession @@ -47414,8 +47414,8 @@ END SUBROUTINE CMISSFieldMLInputFieldCreateStartNumberC !================================================================================================================================ ! - !> Update the nodal parameters of the given field, using the given FieldML evaluator. - SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateObjVS( fieldml, field, evaluatorName, variableType, & + !> Update the DOF parameters of the given field, using the given FieldML evaluator. + SUBROUTINE CMISSFieldMLInputFieldParametersUpdateObjVS( fieldml, field, evaluatorName, variableType, & & setType, err ) !Arguments TYPE(CMISSFieldMLIOType), INTENT(INOUT) :: FIELDML !< The FieldML context containing the evaluator to use. @@ -47425,26 +47425,26 @@ SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateObjVS( fieldml, field, eva INTEGER(INTG), INTENT(IN) :: setType ! Update the nodal parameters of field with the given user number, using the given FieldML evaluator. - SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateNumberVS( fieldml, regionNumber, fieldNumber, & + !> Update the DOF parameters of field with the given user number, using the given FieldML evaluator. + SUBROUTINE CMISSFieldMLInputFieldParametersUpdateNumberVS( fieldml, regionNumber, fieldNumber, & & evaluatorName, variableType, setType, err ) !Arguments TYPE(CMISSFieldMLIOType), INTENT(INOUT) :: FIELDML !< The FieldML context containing the evaluator to use. 
@@ -47459,29 +47459,29 @@ SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateNumberVS( fieldml, regionN TYPE(REGION_TYPE), POINTER :: region TYPE(FIELD_TYPE), POINTER :: field - CALL ENTERS("CMISSFieldMLInputFieldNodalParametersUpdateNumberVS",Err,ERROR,*999) + CALL ENTERS("CMISSFieldMLInputFieldParametersUpdateNumberVS",Err,ERROR,*999) CALL REGION_USER_NUMBER_TO_REGION( regionNumber, region, err, error, *999 ) CALL FIELD_USER_NUMBER_TO_FIELD( fieldNumber, region, field, err, error, *999 ) - CALL FIELDML_INPUT_FIELD_NODAL_PARAMETERS_UPDATE( fieldml%fieldmlInfo, evaluatorName, field, variableType, setType, & + CALL FIELDML_INPUT_FIELD_PARAMETERS_UPDATE( fieldml%fieldmlInfo, evaluatorName, field, variableType, setType, & & err, error, *999 ) - CALL EXITS("CMISSFieldMLInputFieldNodalParametersUpdateNumberVS") + CALL EXITS("CMISSFieldMLInputFieldParametersUpdateNumberVS") RETURN -999 CALL ERRORS("CMISSFieldMLInputFieldNodalParametersUpdateNumberVS",Err,ERROR) - CALL EXITS("CMISSFieldMLInputFieldNodalParametersUpdateNumberVS") +999 CALL ERRORS("CMISSFieldMLInputFieldParametersUpdateNumberVS",Err,ERROR) + CALL EXITS("CMISSFieldMLInputFieldParametersUpdateNumberVS") CALL CMISS_HANDLE_ERROR(Err,ERROR) RETURN - END SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateNumberVS + END SUBROUTINE CMISSFieldMLInputFieldParametersUpdateNumberVS ! !================================================================================================================================ ! - !> Update the nodal parameters of the given field, using the given FieldML evaluator. - SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateObjC( fieldml, field, evaluatorName, & + !> Update the DOF parameters of the given field, using the given FieldML evaluator. + SUBROUTINE CMISSFieldMLInputFieldParametersUpdateObjC( fieldml, field, evaluatorName, & & variableType, setType, err ) !Arguments TYPE(CMISSFieldMLIOType), INTENT(INOUT) :: FIELDML !< The FieldML context containing the evaluator to use. @@ -47491,26 +47491,26 @@ SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateObjC( fieldml, field, eval INTEGER(INTG), INTENT(IN) :: setType ! Update the nodal parameters of field with the given user number, using the given FieldML evaluator. - SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateNumberC( fieldml, regionNumber, fieldNumber, & + !> Update the DOF parameters of field with the given user number, using the given FieldML evaluator. + SUBROUTINE CMISSFieldMLInputFieldParametersUpdateNumberC( fieldml, regionNumber, fieldNumber, & & evaluatorName, variableType, setType, err ) !Arguments TYPE(CMISSFieldMLIOType), INTENT(INOUT) :: FIELDML !< The FieldML context containing the evaluator to use. 
@@ -47525,22 +47525,22 @@ SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateNumberC( fieldml, regionNu TYPE(REGION_TYPE), POINTER :: region TYPE(FIELD_TYPE), POINTER :: field - CALL ENTERS("CMISSFieldMLInputFieldNodalParametersUpdateNumberC",Err,ERROR,*999) + CALL ENTERS("CMISSFieldMLInputFieldParametersUpdateNumberC",Err,ERROR,*999) CALL REGION_USER_NUMBER_TO_REGION( regionNumber, region, err, error, *999 ) CALL FIELD_USER_NUMBER_TO_FIELD( fieldNumber, region, field, err, error, *999 ) - CALL FIELDML_INPUT_FIELD_NODAL_PARAMETERS_UPDATE( fieldml%fieldmlInfo, var_str(evaluatorName), field, variableType, & + CALL FIELDML_INPUT_FIELD_PARAMETERS_UPDATE( fieldml%fieldmlInfo, var_str(evaluatorName), field, variableType, & & setType, err, error, *999 ) - CALL EXITS("CMISSFieldMLInputFieldNodalParametersUpdateNumberC") + CALL EXITS("CMISSFieldMLInputFieldParametersUpdateNumberC") RETURN -999 CALL ERRORS("CMISSFieldMLInputFieldNodalParametersUpdateNumberC",Err,ERROR) - CALL EXITS("CMISSFieldMLInputFieldNodalParametersUpdateNumberC") +999 CALL ERRORS("CMISSFieldMLInputFieldParametersUpdateNumberC",Err,ERROR) + CALL EXITS("CMISSFieldMLInputFieldParametersUpdateNumberC") CALL CMISS_HANDLE_ERROR(Err,ERROR) RETURN - END SUBROUTINE CMISSFieldMLInputFieldNodalParametersUpdateNumberC + END SUBROUTINE CMISSFieldMLInputFieldParametersUpdateNumberC ! !================================================================================================================================ From e6cfb557139322835a55ce2cb08a6e8481b1c1ba Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Mon, 28 Nov 2011 11:31:26 +1300 Subject: [PATCH 21/24] Fix typo. --- src/fieldml_input_routines.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fieldml_input_routines.f90 b/src/fieldml_input_routines.f90 index a796dd2e..669a7eef 100644 --- a/src/fieldml_input_routines.f90 +++ b/src/fieldml_input_routines.f90 @@ -80,7 +80,7 @@ MODULE FIELDML_INPUT_ROUTINES ! !================================================================================================================================ ! -uu + SUBROUTINE FIELDML_ASSERT_IS_IN( FIELDML_INFO, ERR, ERROR, * ) !Argument variables TYPE(FIELDML_IO_TYPE), INTENT(IN) :: FIELDML_INFO ! Date: Mon, 28 Nov 2011 11:46:17 +1300 Subject: [PATCH 22/24] Fix more typos. --- src/fieldml_input_routines.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fieldml_input_routines.f90 b/src/fieldml_input_routines.f90 index 669a7eef..548cd0a7 100644 --- a/src/fieldml_input_routines.f90 +++ b/src/fieldml_input_routines.f90 @@ -1151,7 +1151,7 @@ SUBROUTINE FIELDML_INPUT_FIELD_PARAMETERS_UPDATE(FIELDML_INFO, EVALUATOR_NAME, F INTEGER(INTG), INTENT(OUT) :: ERR ! Date: Fri, 2 Dec 2011 17:34:07 +1300 Subject: [PATCH 23/24] Error message for fopen problems. --- src/external_dae_solver_routines.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/external_dae_solver_routines.cu b/src/external_dae_solver_routines.cu index 766838ef..4bb67595 100644 --- a/src/external_dae_solver_routines.cu +++ b/src/external_dae_solver_routines.cu @@ -57,6 +57,8 @@ SolverDAEExternalIntegrate Solves the differential-algebraic equation. 
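/* [Editor's note: illustration only, not part of the patch. The change below
 * adds the OS-level cause to the previously bare failure message. A minimal
 * sketch of the errno-reporting pattern it introduces; the filename argument
 * is the editor's addition, the patch itself prints only the strerror() text: */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void reportFopenFailureAndExit(const char *filename)
{
    /* strerror(errno) describes why the preceding fopen() failed, e.g.
     * "No such file or directory" when the Results/ directory is absent. */
    fprintf(stderr, "File error: %s: %s\n", filename, strerror(errno));
    fprintf(stderr, "Timing file could not be opened or created.\n");
    exit(EXIT_FAILURE);
}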
/* Included files */ #include #include +#include + +#include "external_dae_solver_routines.h" #include "cuda_solver_routines.cu" @@ -91,12 +93,18 @@ void SolverDAEExternalIntegrate(const int NumberOfDofs, if (!timing_file) { timing_file = fopen(filename, "wt"); + if (!timing_file) { + fprintf(stderr,"File error: %s\n",strerror(errno)); + fprintf(stderr, "Timing file could not be opened or created."); + exit(EXIT_FAILURE); + } fprintf(timing_file,"Cell Model\tIntegrator\tNumber of Threads\tNumber of Blocks\tThreads Per Block\tNumber of Partitions\tNumber of Streams\tTotal Computational Time(s)\tTotal GFLOPS\n"); // fprintf(timing_file,"Cell Model\tIntegrator\tNumber of Threads\tNumber of Blocks\tThreads Per Block\tNumber of Partitions\tNumber of Streams\tTotal Computational Time(s)\tTotal GFLOPS\tSingle Kernel Computational Time(s)\tKernel GFLOPS\tDevice Utilisation\n"); } else { - fclose(timing_file); + fclose(timing_file); timing_file = fopen(filename, "at"); if (!timing_file) { + fprintf(stderr,"File error: %s\n",strerror(errno)); fprintf(stderr, "Timing file could not be opened or created."); exit(EXIT_FAILURE); } From 4a03384cf931338d17c4bf11748529680c416d17 Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Sun, 4 Dec 2011 11:09:46 +1300 Subject: [PATCH 24/24] Tidy ups --- src/cuda_solver_routines.cu | 803 +++++++++++++++++++----------------- utils/MakefileCommon.inc | 2 +- 2 files changed, 430 insertions(+), 375 deletions(-) diff --git a/src/cuda_solver_routines.cu b/src/cuda_solver_routines.cu index dcf65120..5d174a90 100644 --- a/src/cuda_solver_routines.cu +++ b/src/cuda_solver_routines.cu @@ -1,3 +1,58 @@ +/* \file + * \author Jin Budelmann + * \brief This file provides the routines for solving differential-algebraic equations with an external solver. + * + * \section LICENSE + * + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + * License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is OpenCMISS + * + * The Initial Developer of the Original Code is University of Auckland, + * Auckland, New Zealand, the University of Oxford, Oxford, United + * Kingdom and King's College, London, United Kingdom. Portions created + * by the University of Auckland, the University of Oxford and King's + * College, London are Copyright (C) 2007-2010 by the University of + * Auckland, the University of Oxford and King's College, London. + * All Rights Reserved. + * + * Contributor(s): Chris Bradley + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above.
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + */ + +/* +File: cuda_solver_routines.cu +============================= + +This file provides the routines for CUDA acceleration with an external solver. + +Functions included: + + +*/ + #include #include @@ -260,334 +315,334 @@ void domainDecomposition(unsigned int* threads_per_domain, unsigned int* spill, (*threads_per_block) = (*threads_per_block) > deviceProp.maxThreadsPerBlock ? deviceProp.maxThreadsPerBlock : (*threads_per_block); - // Adjust threads per blocks so that states variables fit in shared memory - count=0; - (*sharedMem) = (size_t) (sharedMemoryIntegrator + sharedMemoryDevice + sharedMemoryCellModel) * (*threads_per_block) * sizeof(double); - while ( (*sharedMem) > deviceProp.sharedMemPerBlock*0.5 && count < 200 ) { - if ((*threads_per_block) > 32 ) { - (*threads_per_block)-=32; - } else { - fprintf(stderr, "Cannot fit variables in shared memory"); - exit(EXIT_FAILURE); - } - count++; - (*sharedMem) = (size_t) (sharedMemoryIntegrator + sharedMemoryDevice + sharedMemoryCellModel) * (*threads_per_block) * sizeof(double); - } - (*num_blocks)=num_threads/(*threads_per_block); - spill[2]=num_threads % (*threads_per_block); - //if (spill[2] !=0) (*num_blocks)++; // Round up calculation - - (*blocksPerDomain) = (*num_blocks)/(num_streams*(*num_partitions)); - if ((*blocksPerDomain) == 0) { - fprintf(stderr, "Too many streams and partitions for your problem size\nReduce these and try again"); - exit(EXIT_FAILURE); - } else if ((*blocksPerDomain) > deviceProp.maxGridSize[0]) { - fprintf(stderr, "Too many blocks allocated to each domain for your problem size\nIncrease number of partitions and try again"); - exit(EXIT_FAILURE); - } + // Adjust threads per blocks so that states variables fit in shared memory + count=0; + (*sharedMem) = (size_t) (sharedMemoryIntegrator + sharedMemoryDevice + sharedMemoryCellModel) * (*threads_per_block) * sizeof(double); + while ( (*sharedMem) > deviceProp.sharedMemPerBlock*0.5 && count < 200 ) { + if ((*threads_per_block) > 32 ) { + (*threads_per_block)-=32; + } else { + fprintf(stderr, "Cannot fit variables in shared memory"); + exit(EXIT_FAILURE); + } + count++; + (*sharedMem) = (size_t) (sharedMemoryIntegrator + sharedMemoryDevice + sharedMemoryCellModel) * (*threads_per_block) * sizeof(double); + } + (*num_blocks)=num_threads/(*threads_per_block); + spill[2]=num_threads % (*threads_per_block); + //if (spill[2] !=0) (*num_blocks)++; // Round up calculation + + (*blocksPerDomain) = (*num_blocks)/(num_streams*(*num_partitions)); + if ((*blocksPerDomain) == 0) { + fprintf(stderr, "Too many streams and partitions for your problem size\nReduce these and try again"); + exit(EXIT_FAILURE); + } else if ((*blocksPerDomain) > deviceProp.maxGridSize[0]) { + fprintf(stderr, "Too many blocks allocated to each domain for your problem size\nIncrease number of partitions and try again"); + exit(EXIT_FAILURE); + } + remainingBlocks = (*num_blocks)%(num_streams*(*num_partitions));
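/* [Editor's note: illustration only, not part of the patch. The block being
 * re-indented here is the heart of domainDecomposition(); a compact sketch of
 * its core arithmetic, using the variable meanings from the surrounding code: */
static void decompositionSketch(unsigned int num_threads, unsigned int num_streams,
                                unsigned int num_partitions, unsigned int threads_per_block)
{
    unsigned int num_blocks      = num_threads / threads_per_block;   /* whole blocks */
    unsigned int spillThreads    = num_threads % threads_per_block;   /* becomes spill[2] */
    unsigned int blocksPerDomain = num_blocks / (num_streams * num_partitions);
    unsigned int remainingBlocks = num_blocks % (num_streams * num_partitions);
    /* The real routine grows num_partitions until remainingBlocks fits into
     * extra domains (spill[0] full extra domains plus spill[1] blocks in one
     * partial domain), shrinks threads_per_block in warp-sized 32-thread steps
     * until the per-block state variables fit in shared memory, and shrinks
     * blocksPerDomain until each domain's state data fits in 90% of free
     * device memory. */
    (void)spillThreads; (void)blocksPerDomain; (void)remainingBlocks;
}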
num_streams*(*blocksPerDomain)) { + (*num_partitions)++; + } else if (remainingBlocks > num_streams*(*blocksPerDomain)) { + numberRemainingDomains = ceil((float)remainingBlocks/(*blocksPerDomain)); + (*num_partitions) += ceil((float)numberRemainingDomains/num_streams); + } + + if ((*num_blocks)/(num_streams*(*num_partitions))==0) { + (*num_partitions)--; + } + + remainingBlocks = (*num_blocks)%(*num_partitions); + + // Adjust number of partitions so that data fits in GPU memory + count = 0; + cutilSafeCall( cudaMemGetInfo(&freeGPUMemory, &test) ); + *pagedMemorySize = sizeof(double)*rateStateCount*(*blocksPerDomain)*num_streams*(*threads_per_block); + while ( *pagedMemorySize > freeGPUMemory*0.9 && count < 200 ) { + if ((*blocksPerDomain) > 1 ) { + (*num_partitions) = (*num_blocks)/num_streams/--(*blocksPerDomain); remainingBlocks = (*num_blocks)%(num_streams*(*num_partitions)); - if (remainingBlocks > 0 && remainingBlocks <= num_streams*(*blocksPerDomain)) { - (*num_partitions)++; + if (remainingBlocks != 0) { + (*num_partitions)++; } else if (remainingBlocks > num_streams*(*blocksPerDomain)) { - numberRemainingDomains = ceil((float)remainingBlocks/(*blocksPerDomain)); - (*num_partitions) += ceil((float)numberRemainingDomains/num_streams); + numberRemainingDomains = ceil((float)remainingBlocks/(*blocksPerDomain)); + (*num_partitions) += ceil((float)numberRemainingDomains/num_streams); } - - if ((*num_blocks)/(num_streams*(*num_partitions))==0) { - (*num_partitions)--; - } - remainingBlocks = (*num_blocks)%(*num_partitions); - - // Adjust number of partitions so that data fits in GPU memory - count = 0; - cutilSafeCall( cudaMemGetInfo(&freeGPUMemory, &test) ); - *pagedMemorySize = sizeof(double)*rateStateCount*(*blocksPerDomain)*num_streams*(*threads_per_block); - while ( *pagedMemorySize > freeGPUMemory*0.9 && count < 200 ) { - if ((*blocksPerDomain) > 1 ) { - (*num_partitions) = (*num_blocks)/num_streams/--(*blocksPerDomain); - remainingBlocks = (*num_blocks)%(num_streams*(*num_partitions)); - if (remainingBlocks != 0) { - (*num_partitions)++; - } else if (remainingBlocks > num_streams*(*blocksPerDomain)) { - numberRemainingDomains = ceil((float)remainingBlocks/(*blocksPerDomain)); - (*num_partitions) += ceil((float)numberRemainingDomains/num_streams); - } - remainingBlocks = (*num_blocks)%(*num_partitions); - } else { - fprintf(stderr, "Cannot fit variables in GPU device memory\nReduce threads per block and try again"); - exit(EXIT_FAILURE); - } - count++; - - *pagedMemorySize = sizeof(double)*rateStateCount*(*blocksPerDomain)*num_streams*(*threads_per_block); - } - - if (*pagedMemorySize > freeGPUMemory*0.9) { - fprintf(stderr, "Cannot fit variables in GPU device memory\nReduce threads per block and try again"); - exit(EXIT_FAILURE); - } - - (*threads_per_domain) = (*threads_per_block)*(*blocksPerDomain); - spill[0] = remainingBlocks/(*blocksPerDomain); - spill[1] = remainingBlocks%(*blocksPerDomain); - -// if (renainingBlocks != 0) { -// blocksLastStream = -// } -// remainingBlocks = num_threads%(num_blocks*(*num_partitions)) -// for (i=0; i<(*num_partitions); i++) { -// if (i == (*num_partitions) && remainingBlocks != 0) { -// -// for (j=0; j freeGPUMemory*0.9) { + fprintf(stderr, "Cannot fit variables in GPU device memory\nReduce threads per block and try again"); + exit(EXIT_FAILURE); + } + + (*threads_per_domain) = (*threads_per_block)*(*blocksPerDomain); + spill[0] = remainingBlocks/(*blocksPerDomain); + spill[1] = remainingBlocks%(*blocksPerDomain); + + // if (renainingBlocks != 
+	// if (remainingBlocks != 0) {
+	// 	blocksLastStream =
+	// }
+	// remainingBlocks = num_threads%(num_blocks*(*num_partitions))
+	// for (i=0; i<(*num_partitions); i++) {
+	// 	if (i == (*num_partitions) && remainingBlocks != 0) {
+	//
+	// 		for (j=0; j
+	// printf("> Device name : %s\n", deviceProp.name );
+	// printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n",
+	//        deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+	// printf("> Cell Model = %s, Integrator = %s\n", cellModelName, integratorName);
+	// printf("> num_threads = %d, num_blocks = %d, threads_per_block = %d, num_partitions = %d, timeSteps = %d, num_streams = %d\n",
+	//        num_threads, num_blocks, threads_per_block, num_partitions, timeSteps, num_streams);
+	// //#endif
+	// printf("grid.x %d threads.x %d sharedMem %d\n", grid.x, threads.x, sharedMem);
+	// printf("Spills %d %d %d\n", spill[0], spill[1], spill[2]);
+
+	// Setup and start global timer
+	timer = 0;
+	cutCreateTimer(&timer);
+	cutilCheckError(cutStartTimer(timer));
+
+	// // Test kernel speed in default stream (timing is more accurate in the default stream)
+	// memcpy(h_paged_states, h_states, pagedMemorySize/num_streams);
+	// cutilSafeCall( cudaMemcpy(d_states, h_paged_states, pagedMemorySize/num_streams,
+	//                cudaMemcpyHostToDevice) );
+	// // Start kernel timer
+	// kernel_timer = 0;
+	// cutCreateTimer(&kernel_timer);
+	// cutilCheckError(cutStartTimer(kernel_timer));
+	// // Start kernel
+	// solveSystem<<<grid, threads, sharedMem>>>(timeSteps, stepSize, d_states);
+	// checkCUDAError("Single Kernel Execution");
+	// cutilSafeCall( cudaThreadSynchronize() );
+	// // Stop kernel timer
+	// cutilCheckError(cutStopTimer(kernel_timer));
+	// cutilSafeCall( cudaMemcpy(h_paged_states, d_states, pagedMemorySize/num_streams,
+	//                cudaMemcpyDeviceToHost) );
+	// memcpy(h_states, h_paged_states, pagedMemorySize/num_streams);
+
+	// Prefetch data for next partition in first stream
+	// if (num_partitions>1) {
+	// 	global_offset = rateStateCount * num_streams * grid.x * threads.x;
+	// 	memcpy(h_paged_states, h_states + global_offset, pagedMemorySize/num_streams);
+	// }
+	// } else {
+	memcpy(h_paged_states, h_states, pagedMemorySize);
+	}
+
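One prerequisite the staging above leans on quietly: h_paged_states must be page-locked (allocated with cudaMallocHost) for the cudaMemcpyAsync calls in the pipeline below to overlap with kernel execution; async copies from ordinary pageable memory silently fall back to synchronous behaviour. A minimal sketch of that setup, with illustrative names and sizes rather than the patch's:

#include <cuda_runtime.h>

#define NUM_STREAMS 4

int main(void)
{
    const size_t bytes = 1 << 20;              /* illustrative slice size */
    double *hPinned = NULL, *dStates = NULL;
    cudaStream_t streams[NUM_STREAMS];

    /* Page-locked host buffer: required for truly asynchronous H2D/D2H copies */
    cudaMallocHost((void **)&hPinned, bytes);
    cudaMalloc((void **)&dStates, bytes);

    for (int i = 0; i < NUM_STREAMS; i++)
        cudaStreamCreate(&streams[i]);

    /* ... queue cudaMemcpyAsync / kernel / cudaMemcpyAsync per stream ... */

    for (int i = 0; i < NUM_STREAMS; i++)
        cudaStreamDestroy(streams[i]);
    cudaFree(dStates);
    cudaFreeHost(hPinned);
    return 0;
}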
+	// Queue kernel calls into streams to hide memory transfers (num_partitions sets of kernel calls in each stream)
+	for(i = 0; i < num_partitions+1; i++) {
+		// Asynchronously launch num_streams memcopies
+		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
+			domainIndex = j + i*num_streams;
+			if (domainIndex <= lastFullDomain) {
+				local_offset = j * rateStateCount * grid.x * threads.x ;
+				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
+					//printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize);
+					cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset,
+					               lastStreamMemorySize, cudaMemcpyHostToDevice, streams[j]) );
+				} else {
+					//printf("normal async in %d, size %d\n", domainIndex, pagedMemorySize/num_streams);
+					cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset,
+					               pagedMemorySize/num_streams, cudaMemcpyHostToDevice, streams[j]) );
 				}
-
-	// Set appropriate device flags and print relevant information
-	cutilSafeCall( cudaSetDevice( cuda_device ) );
-	cutilSafeCall( cudaSetDeviceFlags(cudaDeviceBlockingSync) );
-	cudaDeviceProp deviceProp;
-	cutilSafeCall( cudaGetDeviceProperties(&deviceProp, cuda_device) );
-	if( (1 == deviceProp.major) && (deviceProp.minor < 1))
-		printf("%s does not have compute capability 1.1 or later\n", deviceProp.name);
-
-	domainDecomposition(&threads_per_domain, spill, num_threads, num_streams, &pagedMemorySize, &threads_per_block,
-	                    &num_partitions, &blocksPerDomain, &num_blocks, &sharedMem);
-
-	lastFullDomain = num_blocks/blocksPerDomain;
-
-	// Allocate CUDA device and host pinned memory
-	cutilSafeCall( cudaMallocHost((void**) &h_paged_states, pagedMemorySize) );
-	cutilSafeCall( cudaMalloc((void **) &d_states, pagedMemorySize) );
-
-	// Create streams
-	streams = (cudaStream_t*) malloc(num_streams * sizeof(cudaStream_t));
-	for(i = 0; i < num_streams; i++)
-		cutilSafeCall( cudaStreamCreate(&(streams[i])) );
-
-	// Setup execution parameters
-	dim3 grid(blocksPerDomain, 1, 1);
-	dim3 threads(threads_per_block, 1, 1);
-
-	//#ifdef DEBUG
-	if (timing_file) {
-//	printf("> Device name : %s\n", deviceProp.name );
-//	printf("> CUDA Capable SM %d.%d hardware with %d multi-processors\n",
-//	       deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
-//	printf("> Cell Model = %s, Integrator = %s\n", cellModelName, integratorName);
-//	printf("> num_threads = %d, num_blocks = %d, threads_per_block = %d, num_partitions = %d, timeSteps = %d, num_streams = %d\n",
-//	       num_threads, num_blocks, threads_per_block, num_partitions, timeSteps, num_streams);
-//	//#endif
-//	printf("grid.x %d threads.x %d sharedMem %d\n", grid.x, threads.x, sharedMem);
-//	printf("Spills %d %d %d\n", spill[0], spill[1], spill[2]);
-
-	// Setup and start global timer
-	timer = 0;
-	cutCreateTimer(&timer);
-	cutilCheckError(cutStartTimer(timer));
-
-//	// Test kernel speed in default stream (timing is more accurate in default stream)
-//	memcpy(h_paged_states, h_states, pagedMemorySize/num_streams);
-//	cutilSafeCall( cudaMemcpy(d_states, h_paged_states, pagedMemorySize/num_streams,
-//	               cudaMemcpyHostToDevice) );
-//	// Start kernel timer
-//	kernel_timer = 0;
-//	cutCreateTimer(&kernel_timer);
-//	cutilCheckError(cutStartTimer(kernel_timer));
-//	// Start kernel
-//	solveSystem<<<grid, threads, sharedMem>>>(timeSteps, stepSize, d_states);
-//	checkCUDAError("Single Kernel Execution");
-//	cutilSafeCall( cudaThreadSynchronize() );
-//	// Stop kernel timer
-//	cutilCheckError(cutStopTimer(kernel_timer));
-//	cutilSafeCall( cudaMemcpy(h_paged_states, d_states, pagedMemorySize/num_streams,
-//	               cudaMemcpyDeviceToHost) );
-//	memcpy(h_states, h_paged_states, pagedMemorySize/num_streams);
-	// Prefetch data for next partition in first stream
-//	if (num_partitions>1) {
-//		global_offset = rateStateCount * num_streams * grid.x * threads.x;
-//		memcpy(h_paged_states, h_states + global_offset, pagedMemorySize/num_streams);
-//	}
-//	} else {
-	memcpy(h_paged_states, h_states, pagedMemorySize);
+			}
+		}
+		// Execute the kernel
+		// Asynchronously launch num_streams kernels, each operating on its own portion of data
+		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
+			domainIndex = j + i*num_streams;
+			if (domainIndex <= lastFullDomain) {
+				local_offset = j * grid.x * threads.x ;
+				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
+					grid.x = spill[1]+1;
+					solveSystem<<<grid, threads, sharedMem, streams[j]>>>(timeSteps, stepSize,
+					            d_states + rateStateCount*local_offset);
+				} else {
+					solveSystem<<<grid, threads, sharedMem, streams[j]>>>(timeSteps, stepSize,
+					            d_states + rateStateCount*local_offset);
 				}
-
-	// Queue kernel calls into streams to hide memory transfers (num_partitions sets of kernel calls in each stream)
-	for(i = 0; i < num_partitions+1; i++) {
-		// Asynchronously launch num_streams memcopies
-		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
-			domainIndex = j + i*num_streams;
-			if (domainIndex <= lastFullDomain) {
-				local_offset = j * rateStateCount * grid.x * threads.x ;
-				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
-					//printf("last async in %d, size %d\n", domainIndex, lastStreamMemorySize);
-					cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset,
-					               lastStreamMemorySize, cudaMemcpyHostToDevice, streams[j]) );
-				} else {
-					//printf("normal async in %d, size %d\n", domainIndex, pagedMemorySize/num_streams);
-					cutilSafeCall( cudaMemcpyAsync(d_states + local_offset, h_paged_states + local_offset,
-					               pagedMemorySize/num_streams, cudaMemcpyHostToDevice, streams[j]) );
-				}
-			}
-		}
-		// Execute the kernel
-		// Asynchronously launch num_streams kernels, each operating on its own portion of data
-		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
-			domainIndex = j + i*num_streams;
-			if (domainIndex <= lastFullDomain) {
-				local_offset = j * grid.x * threads.x ;
-				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
-					grid.x = spill[1]+1;
-					solveSystem<<<grid, threads, sharedMem, streams[j]>>>(timeSteps, stepSize,
-					            d_states + rateStateCount*local_offset);
-				} else {
-					solveSystem<<<grid, threads, sharedMem, streams[j]>>>(timeSteps, stepSize,
-					            d_states + rateStateCount*local_offset);
-				}
-			}
-		}
-
-		// Asynchronously launch num_streams memcopies
-		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
-			domainIndex = j + i*num_streams;
-			if (domainIndex <= lastFullDomain) {
-				local_offset = j * rateStateCount * grid.x * threads.x ;
-				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
-					//printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize);
-					cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset,
-					               lastStreamMemorySize, cudaMemcpyDeviceToHost, streams[j]) );
-				} else {
-					//printf("normal async out %d, size %d\n", domainIndex, pagedMemorySize/num_streams);
-					cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset,
-					               pagedMemorySize/num_streams, cudaMemcpyDeviceToHost, streams[j]) );
-				}
-			}
-		}
-
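The fourth launch-configuration argument is what ties each kernel to its stream, so the launch queues behind that stream's pending host-to-device copy rather than the default stream. A compilable sketch of the pattern, where dummyKernel and launchPerStream are illustrative stand-ins for the patch's solveSystem and its calling loop:

#include <cuda_runtime.h>

/* Stand-in for the patch's solveSystem kernel. */
__global__ void dummyKernel(unsigned int timeSteps, double stepSize, double *states)
{
    /* integrate states[blockIdx.x*blockDim.x + threadIdx.x] here ... */
}

/* Launch one kernel per stream, each on its own slice of dStates. */
void launchPerStream(int numStreams, cudaStream_t *streams, double *dStates,
                     size_t sliceElems, dim3 grid, dim3 threads, size_t sharedMem,
                     unsigned int timeSteps, double stepSize)
{
    for (int j = 0; j < numStreams; j++) {
        /* 4th <<<>>> argument: queue behind this stream's pending H2D copy */
        dummyKernel<<<grid, threads, sharedMem, streams[j]>>>(
            timeSteps, stepSize, dStates + j * sliceElems);
    }
}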
-		// Execute memcpys in and out of paged memory when CUDA calls in the streams have finished
-		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
-			domainIndex = j + i*num_streams;
-			if (domainIndex <= lastFullDomain) {
-				cudaStreamSynchronize(streams[j]);
-
-				local_offset = j * rateStateCount * grid.x * threads.x ;
-				global_offset = i * num_streams * grid.x * threads.x;
-
-				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
-					//printf("last memcpy out %d\n", domainIndex);
-					memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset,
-					       lastStreamMemorySize);
-				} else {
-					//printf("normal memcpy out %d\n", domainIndex);
-					memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset,
-					       pagedMemorySize/num_streams);
-				}
-
-				global_offset = (i + 1) * num_streams * grid.x * threads.x;
-				if (domainIndex == lastFullDomain - num_streams && (spill[1]!=0 || spill[2]!=0)) {
-					//printf("last memcpy in %d\n", domainIndex);
-					lastStreamMemorySize = sizeof(double)*rateStateCount*(spill[1]*threads_per_block+spill[2]);
-					memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset,
-					       lastStreamMemorySize);
-				} else if (domainIndex < lastFullDomain - num_streams) {
-					//printf("normal memcpy in %d\n", domainIndex);
-					memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset,
-					       pagedMemorySize/num_streams);
-				}
-			}
-		}
+			}
+		}
+
+		// Asynchronously launch num_streams memcopies
+		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
+			domainIndex = j + i*num_streams;
+			if (domainIndex <= lastFullDomain) {
+				local_offset = j * rateStateCount * grid.x * threads.x ;
+				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
+					//printf("last async out %d, size %d\n", domainIndex, lastStreamMemorySize);
+					cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset,
+					               lastStreamMemorySize, cudaMemcpyDeviceToHost, streams[j]) );
+				} else {
+					//printf("normal async out %d, size %d\n", domainIndex, pagedMemorySize/num_streams);
+					cutilSafeCall( cudaMemcpyAsync(h_paged_states + local_offset, d_states + local_offset,
+					               pagedMemorySize/num_streams, cudaMemcpyDeviceToHost, streams[j]) );
 				}
-
-	if (timing_file) {
-		// Stop global timer
-		cutilCheckError(cutStopTimer(timer));
-
-		// Calculate timing statistics
-		dSeconds = cutGetTimerValue(timer)/1000.0;
-		nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads;
-		gflops = 1.0e-9 * nFLOPS/dSeconds;
-
-//		kernel_dSeconds = cutGetTimerValue(kernel_timer)/1000.0;
-//		kernel_nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads/num_streams/num_partitions;
-//		kernel_gflops = 1.0e-9 * kernel_nFLOPS/kernel_dSeconds;
-
-		// Store Stats
-		fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks,
-		        threads_per_block, num_partitions, num_streams, dSeconds, gflops);
-//		fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks,
-//		        threads_per_block, num_partitions, num_streams, dSeconds, gflops, kernel_dSeconds,
-//		        kernel_gflops, gflops/kernel_gflops*100);
+			}
+		}
+
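Each pass of the loop that follows drains one stream and refills the same pinned slice with the next partition's input, which is what lets a fixed-size staging buffer serve an arbitrarily large problem. A minimal sketch of that drain/refill step under assumed names (drainAndRefill is illustrative, not from the patch):

#include <string.h>
#include <cuda_runtime.h>

/* Wait for a stream's D2H copy, unload its results from the pinned staging
 * buffer, then stage the next partition's input into the same slice. */
static void drainAndRefill(cudaStream_t stream, char *hStates, char *hPinned,
                           size_t sliceBytes, size_t outOffset, size_t inOffset,
                           int hasNextPartition)
{
    cudaStreamSynchronize(stream);                       /* copies + kernel done */
    memcpy(hStates + outOffset, hPinned, sliceBytes);    /* results out          */
    if (hasNextPartition)
        memcpy(hPinned, hStates + inOffset, sliceBytes); /* next input in        */
}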
+		// Execute memcpys in and out of paged memory when CUDA calls in the streams have finished
+		for(j = 0; j < num_streams; j++) { //((timing_file!=NULL && i==0) ? 1 : 0); j < num_streams; j++){
+			domainIndex = j + i*num_streams;
+			if (domainIndex <= lastFullDomain) {
+				cudaStreamSynchronize(streams[j]);
+
+				local_offset = j * rateStateCount * grid.x * threads.x ;
+				global_offset = i * num_streams * grid.x * threads.x;
+
+				if (domainIndex == lastFullDomain && (spill[1]!=0 || spill[2]!=0)) {
+					//printf("last memcpy out %d\n", domainIndex);
+					memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset,
+					       lastStreamMemorySize);
+				} else {
+					//printf("normal memcpy out %d\n", domainIndex);
+					memcpy(h_states + rateStateCount * global_offset + local_offset, h_paged_states + local_offset,
+					       pagedMemorySize/num_streams);
 				}
-
-	// Deallocate Memory and Release Threads
-	for(i = 0; i < num_streams; i++)
-		cutilSafeCall( cudaStreamDestroy(streams[i]) );
-	cutilSafeCall( cudaFree(d_states) );
-	cutilSafeCall( cudaFreeHost(h_paged_states) );
-	cudaThreadExit();
+
+				global_offset = (i + 1) * num_streams * grid.x * threads.x;
+				if (domainIndex == lastFullDomain - num_streams && (spill[1]!=0 || spill[2]!=0)) {
+					//printf("last memcpy in %d\n", domainIndex);
+					lastStreamMemorySize = sizeof(double)*rateStateCount*(spill[1]*threads_per_block+spill[2]);
+					memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset,
+					       lastStreamMemorySize);
+				} else if (domainIndex < lastFullDomain - num_streams) {
+					//printf("normal memcpy in %d\n", domainIndex);
+					memcpy(h_paged_states + local_offset, h_states + rateStateCount * global_offset + local_offset,
+					       pagedMemorySize/num_streams);
+				}
+			}
+		}
+	}
+
+	if (timing_file) {
+		// Stop global timer
+		cutilCheckError(cutStopTimer(timer));
+
+		// Calculate timing statistics
+		dSeconds = cutGetTimerValue(timer)/1000.0;
+		nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads;
+		gflops = 1.0e-9 * nFLOPS/dSeconds;
+
+		// kernel_dSeconds = cutGetTimerValue(kernel_timer)/1000.0;
+		// kernel_nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)*timeSteps*num_threads/num_streams/num_partitions;
+		// kernel_gflops = 1.0e-9 * kernel_nFLOPS/kernel_dSeconds;
+
+		// Store Stats
+		fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks,
+		        threads_per_block, num_partitions, num_streams, dSeconds, gflops);
+		// fprintf(timing_file,"%s\t%s\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\n", cellModelName, integratorName, num_threads, num_blocks,
+		//         threads_per_block, num_partitions, num_streams, dSeconds, gflops, kernel_dSeconds,
+		//         kernel_gflops, gflops/kernel_gflops*100);
+	}
+
+	// Deallocate Memory and Release Threads
+	for(i = 0; i < num_streams; i++)
+		cutilSafeCall( cudaStreamDestroy(streams[i]) );
+	cutilSafeCall( cudaFree(d_states) );
+	cutilSafeCall( cudaFreeHost(h_paged_states) );
+	cudaThreadExit();
 }
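The cut* timing calls above come from the NVIDIA SDK's cutil utilities; the same statistic can be gathered with plain CUDA events, as in the sketch below. The arithmetic mirrors the patch's FLOP accounting; gflopsFromEvents is an illustrative name, not part of the patch.

#include <cuda_runtime.h>

/* GFLOP/s from two recorded CUDA events, using the patch's FLOP model:
 * nFLOPS = (FLOPSPerTimeStep*rateStateCount + FLOPSPerFunction*FunctionEvals)
 *          * timeSteps * num_threads                                         */
double gflopsFromEvents(cudaEvent_t start, cudaEvent_t stop,
                        double flopsPerTimeStep, double rateStateCount,
                        double flopsPerFunction, double functionEvals,
                        double timeSteps, double numThreads)
{
    float ms = 0.0f;
    cudaEventSynchronize(stop);                 /* make sure 'stop' has fired */
    cudaEventElapsedTime(&ms, start, stop);     /* elapsed time, milliseconds */
    double seconds = ms / 1000.0;
    double nFLOPS = (flopsPerTimeStep * rateStateCount
                   + flopsPerFunction * functionEvals) * timeSteps * numThreads;
    return 1.0e-9 * nFLOPS / seconds;
}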
@@ -596,71 +651,71 @@ void solve(double* h_states, double startTime, double endTime, double stepSize,
 ////////////////////////////////////////////////////////////////////////////////
 void solveProblem(unsigned int timeSteps, unsigned int num_threads, unsigned int threads_per_block,
-                  unsigned int num_partitions, unsigned int num_streams, FILE *timing_file, FILE *results_file)
+                  unsigned int num_partitions, unsigned int num_streams, FILE *timing_file, FILE *results_file)
 {
-	unsigned int i, j;
-	float startTime = 0.0f;
-	float endTime = 0.2f;
-
-	double* h_states = NULL;
-
-	h_states = (double *) safeMalloc(sizeof(double)*rateStateCount*num_threads);
-
-	initProblem(num_threads, h_states);
-
-	solve(h_states, startTime, endTime, timeSteps, num_threads, threads_per_block, num_partitions, num_streams, timing_file);
-
-	if (results_file) {
-		fprintf(results_file,"\n\n");
-
-		for (i=0; i