package edu.stanford.rsl.conrad.cuda;
import static edu.stanford.rsl.conrad.filtering.multiprojection.anisotropic.AnisotropicFilterFunction.MAX_FILTERS;
import static edu.stanford.rsl.conrad.filtering.multiprojection.anisotropic.AnisotropicFilterFunction.filt_get_filt_dirs;
import static edu.stanford.rsl.conrad.filtering.multiprojection.anisotropic.AnisotropicFilterFunction.filt_get_n_filters;
import ij.IJ;
import ij.ImagePlus;
import java.util.ArrayList;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUdevprop;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;
import jcuda.runtime.JCuda;
import jcuda.runtime.dim3;
import edu.stanford.rsl.conrad.utils.CONRAD;
import edu.stanford.rsl.conrad.volume3d.Volume3D;
import edu.stanford.rsl.conrad.volume3d.VolumeOperator;
/**
* Class to implement all functions of VolumeOperator on CUDA hardware. Code is made CUDA compatible by replacing the VolumeOperator with CUDAVolumeOperator. As a result all code is executed on CUDA instead of the CPU. On good hardware usually a substantial speed-up is achieved.
* Development and debugging should be done using Volume3D, as CUDA problems are usually hard to debug.
* @author akmaier
* @see edu.stanford.rsl.conrad.volume3d.VolumeOperator
*/
public class CUDAVolumeOperator extends VolumeOperator {
/**
* The CUDA module containing the kernel
*/
private CUmodule module = null;
private static boolean debug = false;
private boolean inited = false;
/**
* the context
*/
private CUcontext cuCtx = null;
static long last = System.currentTimeMillis();
private static void printTime(){
long time = System.currentTimeMillis();
long diff = time - last;
System.out.println("Time passed: " + diff);
last = time;
}
/**
* The grid size of the kernel execution
*/
private dim3 gridSize = null;
private void callCUDAFunction(CUfunction function, ArrayList<Object> arguments){
int offset = 0;
if (debug) System.out.println("Working on Parameter set.");
// send the parameters to the function
for (int i = 0; i < arguments.size(); i++) {
if (debug) System.out.print("Parameter " + i);
Object argument = arguments.get(i);
if (argument instanceof CUdeviceptr) {
if (debug) System.out.println(" is CUdeviceptr");
Pointer pointer = Pointer.to((Pointer) argument);
offset = CUDAUtil.align(offset, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, pointer, Sizeof.POINTER);
offset += Sizeof.POINTER;
}
if (argument instanceof Integer) {
if (debug) System.out.println(" is Integer");
int [] integer = {((Integer) argument).intValue()};
Pointer intPointer = Pointer.to(integer);
offset = CUDAUtil.align(offset, Sizeof.INT);
JCudaDriver.cuParamSetv(function, offset, intPointer, Sizeof.INT);
offset += Sizeof.INT;
}
if (argument instanceof Float){
if (debug) System.out.println(" is Float");
float [] array = {((Float) argument).floatValue()};
Pointer floatPointer = Pointer.to(array);
offset = CUDAUtil.align(offset, Sizeof.FLOAT);
JCudaDriver.cuParamSetv(function, offset, floatPointer, Sizeof.FLOAT);
offset += Sizeof.FLOAT;
}
}
// set parameter space
JCudaDriver.cuParamSetSize(function, offset);
if (debug) System.out.println("Parameters set.");
// Call the CUDA kernel, writing the results into the volume which is pointed at
if (debug) System.out.println("Setting blocks.");
JCudaDriver.cuFuncSetBlockShape(function, CUDAUtil.gridBlockSize[0], CUDAUtil.gridBlockSize[1], 1);
if (debug) System.out.println("Staring grid: " + gridSize.x + "x"+ gridSize.y);
int revan = JCudaDriver.cuLaunchGrid(function, gridSize.x, gridSize.y);
if (debug) System.out.println("Exit code launch: "+revan);
revan = JCudaDriver.cuCtxSynchronize();
if (debug) System.out.println("Exit code sync: "+revan);
}
public void initCUDA(){
if (!inited) {
// Initialize the JCudaDriver. Note that this has to be done from
// the same thread that will later use the JCudaDriver API.
JCudaDriver.setExceptionsEnabled(true);
JCudaDriver.cuInit(0);
CUdevice dev = CUDAUtil.getBestDevice();
cuCtx = new CUcontext();
JCudaDriver.cuCtxCreate(cuCtx, 0, dev);
// check space on device:
int [] memory = new int [1];
JCudaDriver.cuDeviceTotalMem(memory, dev);
int availableMemory = (int) (CUDAUtil.correctMemoryValue(memory[0]) / ((long)1024 * 1024));
if (debug) {
System.out.println("Total available Memory on CUDA card:" + availableMemory);
}
if (debug) {
CUdevprop prop = new CUdevprop();
JCudaDriver.cuDeviceGetProperties(prop, dev);
System.out.println(prop.toFormattedString());
}
// Load the CUBIN file containing the kernel
module = new CUmodule();
JCudaDriver.cuModuleLoad(module, "CUDAVolumeFunctions.sm_10.cubin");
// Obtain a function pointer to the kernel function. This function
// will later be called.
//
if (debug) System.out.println("Initialized.");
inited = true;
}
}
public void cleanup(){
//JCudaDriver.cuCtxDestroy(cuCtx);
JCudaDriver.cuModuleUnload(module);
}
public void fill(Volume3D vol, float number){
initCUDA();
CUdeviceptr sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z4fillPfPifi");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Float(number));
arguments.add(new Integer(vol.getInternalDimension()));
// Calculate new grid size
gridSize = getGrid(vol.size);
if (debug) System.out.println("Calling.");
callCUDAFunction(function, arguments);
if (debug) System.out.println("Freeing.");
JCuda.cudaFree(sizePointer);
//((CUDAVolume3D) vol).fetch();
}
private dim3 getGrid(int [] size){
return new dim3(
CUDAUtil.iDivUp(size[2], CUDAUtil.gridBlockSize[0]),
CUDAUtil.iDivUp(size[1], CUDAUtil.gridBlockSize[1]),
1);
}
@Override
public int divideByVolume(Volume3D vol1, Volume3D vol2)
{
int dim_loop;
if (DEBUG_FLAG)
fprintf("vol_div\n");
for (dim_loop=0; dim_loop<vol1.in_dim; dim_loop++)
if (vol1.size[dim_loop] != vol2.size[dim_loop]) {
fprintf( "vol_div: Volumes have different sizes\n");
return(-1);
}
if (vol1.in_dim==2 && vol2.in_dim==1) {
makeComplex(vol2);
CONRAD.gc();
}
if (vol1.in_dim==1 && vol2.in_dim==2) {
makeComplex(vol1);
CONRAD.gc();
}
if (vol2.in_dim>2) {
fprintf( "vol_div: Invalid dimension\n");
return(0);
}
initCUDA();
CUdeviceptr sizePointer = CUDAUtil.copyToDeviceMemory(vol1.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z6dividePfS_Pii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol1).getDevicePointer());
arguments.add(((CUDAVolume3D) vol2).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol1.getInternalDimension()));
// Calculate new grid size
gridSize = getGrid(vol1.size);
if (debug) System.out.println("Calling.");
callCUDAFunction(function, arguments);
if (debug) System.out.println("Freeing.");
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public Volume3D solveMaximumEigenvalue(Volume3D [][] structureTensor)
{
CUDAVolume3D a11 = (CUDAVolume3D) structureTensor[0][0];
CUDAVolume3D a12 = (CUDAVolume3D) structureTensor[0][1];
CUDAVolume3D a13 = (CUDAVolume3D) structureTensor[0][2];
CUDAVolume3D a22 = (CUDAVolume3D) structureTensor[1][1];
CUDAVolume3D a23 = (CUDAVolume3D) structureTensor[1][2];
CUDAVolume3D a33 = (CUDAVolume3D) structureTensor[2][2];
CUDAVolume3D vol = (CUDAVolume3D) createVolume(a11.size, a11.spacing, a11.getInternalDimension());
initCUDA();
CUdeviceptr sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z25filt_solve_max_eigenvaluePfS_S_S_S_S_PiiS_");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(a11.getDevicePointer());
arguments.add(a12.getDevicePointer());
arguments.add(a13.getDevicePointer());
arguments.add(a22.getDevicePointer());
arguments.add(a23.getDevicePointer());
arguments.add(a33.getDevicePointer());
arguments.add(sizePointer);
arguments.add(a11.getInternalDimension());
arguments.add(vol.getDevicePointer());
CUDAUtil.gridBlockSize[0] /= 2;
gridSize = getGrid(vol.size);
if (debug) System.out.println("Calling.");
callCUDAFunction(function, arguments);
if (debug) System.out.println("Freeing.");
CUDAUtil.gridBlockSize[0] *= 2;
JCuda.cudaFree(sizePointer);
return(vol);
}
@Override
public Volume3D createDirectionalWeights(int dimensions, int size[],
float dim[], float dir[], int A, FILTER_TYPE t_filt)
{
Volume3D vol = null;
float [] f_max = new float [Volume3D.MAX_DIM];
float [] f_delta = new float [Volume3D.MAX_DIM];
float r_abs;
int dim_loop;
if (DEBUG_FLAG)
fprintf("filt_cos2\n"+t_filt);
vol=createVolume(size, dim, 1);
/* normalize filter direction */
r_abs = 0.0f;
r_abs += dir[0] * dir[0];
r_abs += dir[1] * dir[1];
r_abs += dir[2] * dir[2];
r_abs = (float) Math.sqrt(r_abs);
for (dim_loop=0; dim_loop<dimensions; dim_loop++)
dir[dim_loop] /= r_abs;
if (DEBUG_FLAG) {
fprintf(" direction = ");
for (dim_loop=0; dim_loop<dimensions; dim_loop++) {
fprintf(dir[dim_loop]);
if (dim_loop<dimensions-1)
fprintf(", ");
}
fprintf("\n");
}
/* calculate filter boundings */
getFrequencyBoundings(dimensions, size, dim, f_max, f_delta);
// load CUDA stuff.
initCUDA();
CUdeviceptr sizePointer = CUDAUtil.copyToDeviceMemory(size);
CUdeviceptr dirPointer = CUDAUtil.copyToDeviceMemory(dir);
CUdeviceptr fDeltaPointer = CUDAUtil.copyToDeviceMemory(f_delta);
CUdeviceptr fMaxPointer = CUDAUtil.copyToDeviceMemory(f_max);
Integer filt = 0;
if (t_filt == FILTER_TYPE.QUADRATIC) filt = new Integer(1);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z9filt_cos2PfPiS_S_S_ii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(dirPointer);
arguments.add(fDeltaPointer);
arguments.add(fMaxPointer);
arguments.add(new Integer(A));
arguments.add(filt);
gridSize = getGrid(vol.size);
if (debug) System.out.println("Calling.");
callCUDAFunction(function, arguments);
if (debug) System.out.println("Freeing.");
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(dirPointer);
JCuda.cudaFree(fDeltaPointer);
JCuda.cudaFree(fMaxPointer);
return(vol);
}
@Override
public Volume3D createVolume(int size[],
float dim[],
int in_dim)
{
Volume3D volume;
volume= new CUDAVolume3D(size, dim, in_dim);
return(volume);
}
@Override
public Volume3D createExponentialDirectionalHighPassFilter(int dimensions, int size[],
float dim[], float dir[], int A, float B, float ri,
FILTER_TYPE t_filt)
{
Volume3D vol;
float [] f_max = new float [Volume3D.MAX_DIM];
float [] f_delta = new float [Volume3D.MAX_DIM];
float r_abs;
int dim_loop;
if (DEBUG_FLAG)
fprintf("filt_cos2_r\n"+t_filt);
vol=createVolume(size, dim, 1);
/* normalize filter direction */
r_abs=0;
for (dim_loop=0; dim_loop<dimensions; dim_loop++)
r_abs += dir[dim_loop] * dir[dim_loop];
r_abs = (float) Math.sqrt(r_abs);
for (dim_loop=0; dim_loop<dimensions; dim_loop++) {
dir[dim_loop] /= r_abs;
if (DEBUG_FLAG)
fprintf(" direction "+dim_loop+" = "+dir[dim_loop]+"\n");
}
/* calculate filter boudings */
getFrequencyBoundings(dimensions, size, dim, f_max, f_delta);
initCUDA();
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(size);
Pointer dirPointer = CUDAUtil.copyToDeviceMemory(dir);
Pointer fDeltaPointer = CUDAUtil.copyToDeviceMemory(f_delta);
Pointer fMaxPointer = CUDAUtil.copyToDeviceMemory(f_max);
Integer filt = 0;
if (t_filt == FILTER_TYPE.QUADRATIC) filt = 1;
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z11filt_cos2_rPfPiS_S_S_iffi");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(dirPointer);
arguments.add(fDeltaPointer);
arguments.add(fMaxPointer);
arguments.add(new Integer(A));
arguments.add(new Float(B));
arguments.add(new Float(ri));
arguments.add(filt);
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(dirPointer);
JCuda.cudaFree(fDeltaPointer);
JCuda.cudaFree(fMaxPointer);
//((CUDAVolume3D) vol).fetch();
//vol.getImagePlus(dir[0] + " " +dir[1] + " " +dir[2] +" Cosine Square R").show();
fftShift(vol);
//vol.getImagePlus(dir[0] + " " +dir[1] + " " +dir[2] +"Cosine Square R shifted").show();
return(vol);
}
@Override
public int fftShift(Volume3D vol)
{
if (DEBUG_FLAG)
fprintf("vol_fftshift\n");
initCUDA();
if (vol.in_dim == 1) makeComplex(vol);
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z8fftshiftPfPii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.getInternalDimension()));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public Volume3D createHighPassFilter(int dimensions, int [] size, float [] dim, int filt_loop, float lp_upper){
float [][] dir = new float[MAX_FILTERS][Volume3D.MAX_DIM];
float hp_lower = (float) (10f*Math.PI); /* was PI*10 LW 2006-01-31 */
float hp_upper = (float) (10f*Math.PI); /* was PI LW 2006-01-31 */
//float lp_upper = 1.50f; /* was 1.5 LW 2006-01-31 */
CUDAVolume3D vol= (CUDAVolume3D) createVolume(size, dim, 1);
int n_filters;
float [] f_max = new float [Volume3D.MAX_DIM];
float [] f_delta = new float [Volume3D.MAX_DIM];
VolumeOperator.getFrequencyBoundings(dimensions, size, dim, f_max, f_delta);
/* create HP filter */
initCUDA();
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(size);
Pointer fDeltaPointer = CUDAUtil.copyToDeviceMemory(f_delta);
Pointer fMaxPointer = CUDAUtil.copyToDeviceMemory(f_max);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z20createHighPassFilterPfPiS_S_fff");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(fDeltaPointer);
arguments.add(fMaxPointer);
arguments.add(new Float(lp_upper));
arguments.add(new Float(hp_lower));
arguments.add(new Float(hp_upper));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(fDeltaPointer);
JCuda.cudaFree(fMaxPointer);
//Volume3D.vol_fftshift(vol);
filt_get_filt_dirs(vol.dimensions, dir);
n_filters = filt_get_n_filters(vol.dimensions);
IJ.showStatus("Computing High Pass Filters");
IJ.showProgress((((float)(filt_loop))/n_filters));
CUDAVolume3D filt = (CUDAVolume3D) createDirectionalWeights(vol.dimensions, vol.size, vol.spacing,
dir[filt_loop] , 1, FILTER_TYPE.NORMAL);
if (filt==null) {
fprintf( "filt_make_enhance_filters: Out of memory\n");
return(null);
}
multiply(filt,vol);
vol.destroy();
fftShift(filt);
//filt.getImagePlus("filter"+ filt_loop).show();
return filt;
}
@Override
public int multiply(Volume3D vol1, Volume3D vol2)
{
int dim_loop;
if (debug) {
System.out.print("Called multiply ");
printTime();
}
if (DEBUG_FLAG)
fprintf("vol_mult\n");
for (dim_loop=0; dim_loop<vol1.in_dim; dim_loop++)
if (vol1.size[dim_loop] != vol2.size[dim_loop]) {
fprintf( "vol_mult: Volumes have different sizes\n");
return(-1);
}
if (vol1.in_dim==2 && vol2.in_dim==1) {
makeComplex(vol2);
CONRAD.gc();
}
if (vol1.in_dim==1 && vol2.in_dim==2){
makeComplex(vol1);
CONRAD.gc();
}
if (vol1.in_dim>2 || vol2.in_dim>2) {
fprintf( "vol_mult: Invalid dimension\n");
return(0);
}
if (debug) {
System.out.print("prechecks ");
printTime();
}
initCUDA();
if (debug) {
System.out.print("first init step ");
printTime();
}
// Calculate new grid size
gridSize = getGrid(vol1.size);
if (debug) {
System.out.print("get Grid");
printTime();
}
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol1.size);
if (debug) {
System.out.print("mem cpy");
printTime();
}
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z8multiplyPfS_Pii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol1).getDevicePointer());
arguments.add(((CUDAVolume3D) vol2).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol1.getInternalDimension()));
if (debug) {
System.out.print("init done ");
printTime();
}
callCUDAFunction(function, arguments);
if (debug) {
System.out.print("CUDA done ");
printTime();
}
JCuda.cudaFree(sizePointer);
if (debug) {
System.out.print("clean up done ");
printTime();
}
return(0);
}
@Override
public Volume3D createVolume(ImagePlus image, int mirror, int cuty, boolean uneven)
{
Volume3D volume;
volume= new CUDAVolume3D(image, mirror, cuty, uneven);
return(volume);
}
@Override
public void makeComplex(Volume3D vol)
{
if (vol.getInternalDimension() == 2) return;
if (vol.getInternalDimension() != 1) {
fprintf("vol_make_comlex: Invalid dimension\n");
return;
}
initCUDA();
int adaptedWidth = CUDAUtil.iDivUp(vol.size[2], CUDAUtil.gridBlockSize[0]) * CUDAUtil.gridBlockSize[0];
int adaptedHeight = CUDAUtil.iDivUp(vol.size[1], CUDAUtil.gridBlockSize[1]) * CUDAUtil.gridBlockSize[1];
int memorySize = adaptedWidth*adaptedHeight*vol.size[0]* 2 * Sizeof.FLOAT;
CUdeviceptr deviceX = new CUdeviceptr();
JCuda.cudaMalloc(deviceX, memorySize);
JCuda.cudaMemset(deviceX, 0, memorySize);
CUDAVolume3D cudaVol = (CUDAVolume3D) vol;
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z11makeComplexPfS_Pi");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(cudaVol.getDevicePointer());
arguments.add(deviceX);
arguments.add(sizePointer);
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(cudaVol.getDevicePointer());
cudaVol.setDevicePointer(deviceX);
float [][][] temp = new float [vol.size[0]][vol.size[1]][vol.size[2]*2];
vol.data = null;
vol.data = temp;
temp = null;
vol.in_dim = 2;
return;
}
@Override
public Volume3D createLowPassFilter(int dimensions, int size[], float dim [], float lp_upper){
float [] f_max = new float [Volume3D.MAX_DIM];
float [] f_delta = new float [Volume3D.MAX_DIM];
CUDAVolume3D vol = (CUDAVolume3D) createVolume(size, dim, 1);
/* calculate filter boudings */
VolumeOperator.getFrequencyBoundings(dimensions, size, dim, f_max, f_delta);
/* create LP filter */
initCUDA();
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(size);
Pointer fDeltaPointer = CUDAUtil.copyToDeviceMemory(f_delta);
Pointer fMaxPointer = CUDAUtil.copyToDeviceMemory(f_max);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z19createLowPassFilterPfPiS_S_f");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(fDeltaPointer);
arguments.add(fMaxPointer);
arguments.add(new Float(lp_upper));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(fDeltaPointer);
JCuda.cudaFree(fMaxPointer);
fftShift(vol);
return vol;
}
@Override
public Volume3D createGaussLowPassFilter(int dimensions, int size[],
float dim[], float alpha)
{
Volume3D vol;
float [] f_max = new float [Volume3D.MAX_DIM];
float [] f_delta = new float [Volume3D.MAX_DIM];
if (DEBUG_FLAG)
fprintf("filt_gauss\n");
vol=createVolume(size, dim, 1);
/* calculate filter boudings */
VolumeOperator.getFrequencyBoundings(dimensions, size, dim, f_max, f_delta);
/* create LP filter */
initCUDA();
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(size);
Pointer fDeltaPointer = CUDAUtil.copyToDeviceMemory(f_delta);
Pointer fMaxPointer = CUDAUtil.copyToDeviceMemory(f_max);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z10filt_gaussPfPiS_S_f");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(fDeltaPointer);
arguments.add(fMaxPointer);
arguments.add(new Float(alpha));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(fDeltaPointer);
JCuda.cudaFree(fMaxPointer);
fftShift(vol);
return(vol);
}
@Override
public int sigmoid(Volume3D vol,
float smoothing, float lowValue, float highValue,
float highPassLowerLevel, float highPassUpperLevel)
{
if (DEBUG_FLAG)
fprintf("vol_sigmoid_th\n");
if (vol.in_dim != 1) {
fprintf( "vol_abs: Invalid dimension\n");
return(-1);
}
initCUDA();
// Calculate new grid size
// Sigmoid is requires a lot of registers. Hence we have to reduce the block size a bit.
CUDAUtil.gridBlockSize[0] /= 2;
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z7sigmoidPfPiifffff");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(new Float(smoothing));
arguments.add(new Float(lowValue));
arguments.add(new Float(highValue));
arguments.add(new Float(highPassLowerLevel));
arguments.add(new Float(highPassUpperLevel));
callCUDAFunction(function, arguments);
// restore original block size.
CUDAUtil.gridBlockSize[0] *= 2;
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public int addVolume(Volume3D vol1, Volume3D vol2)
{
int dim_loop;
if (DEBUG_FLAG)
fprintf("vol_add\n");
for (dim_loop=0; dim_loop<vol1.in_dim; dim_loop++)
if (vol1.size[dim_loop] != vol2.size[dim_loop]) {
fprintf( "vol_add: Volumes have different sizes\n");
return(-1);
}
/* OBS !!! borde inte behova konvertera vol2 . komplex */
if (vol1.in_dim==2 && vol2.in_dim==1) {
makeComplex(vol2);
CONRAD.gc();
}
if (vol1.in_dim==1 && vol2.in_dim==2){
makeComplex(vol1);
CONRAD.gc();
}
if (vol2.in_dim>2 || vol1.in_dim>2) {
fprintf( "vol_add: Invalid dimension\n");
return(-1);
}
initCUDA();
gridSize = getGrid(vol1.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol1.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z9addVolumePfS_Pii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol1).getDevicePointer());
arguments.add(((CUDAVolume3D) vol2).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol1.in_dim));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public int real(Volume3D vol)
{
if (debug) {
System.out.print("Called real ");
printTime();
}
if (DEBUG_FLAG)
fprintf("vol_real\n");
if (vol.in_dim == 1) return(0);
if (vol.in_dim != 2) {
fprintf( "vol_real: Invalid dimension\n");
return(-1);
}
int adaptedWidth = CUDAUtil.iDivUp(vol.size[2], CUDAUtil.gridBlockSize[0]) * CUDAUtil.gridBlockSize[0];
int adaptedHeight = CUDAUtil.iDivUp(vol.size[1], CUDAUtil.gridBlockSize[1]) * CUDAUtil.gridBlockSize[1];
int memorySize = adaptedWidth*adaptedHeight*vol.size[0]* 1 * Sizeof.FLOAT;
CUdeviceptr deviceX = new CUdeviceptr();
JCuda.cudaMalloc(deviceX, memorySize);
CUDAVolume3D cudaVol = (CUDAVolume3D) vol;
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z4realPfS_Pi");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(cudaVol.getDevicePointer());
arguments.add(deviceX);
arguments.add(sizePointer);
if (debug) {
System.out.print("Called init done ");
printTime();
}
callCUDAFunction(function, arguments);
if (debug) {
System.out.print("CUDA done ");
printTime();
}
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(cudaVol.getDevicePointer());
cudaVol.setDevicePointer(deviceX);
float [][][] temp = new float[vol.size[0]][vol.size[1]][vol.size[2]];
vol.data = null;
vol.data = temp;
temp = null;
vol.in_dim = 1;
if (debug) {
System.out.print("Clean up done ");
printTime();
}
return(0);
}
@Override
public int imag(Volume3D vol)
{
if (debug) {
System.out.print("Called real ");
printTime();
}
if (DEBUG_FLAG)
fprintf("vol_real\n");
if (vol.in_dim == 1) return(0);
if (vol.in_dim != 2) {
fprintf( "vol_real: Invalid dimension\n");
return(-1);
}
int adaptedWidth = CUDAUtil.iDivUp(vol.size[2], CUDAUtil.gridBlockSize[0]) * CUDAUtil.gridBlockSize[0];
int adaptedHeight = CUDAUtil.iDivUp(vol.size[1], CUDAUtil.gridBlockSize[1]) * CUDAUtil.gridBlockSize[1];
int memorySize = adaptedWidth*adaptedHeight*vol.size[0]* 1 * Sizeof.FLOAT;
CUdeviceptr deviceX = new CUdeviceptr();
JCuda.cudaMalloc(deviceX, memorySize);
CUDAVolume3D cudaVol = (CUDAVolume3D) vol;
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z4imagPfS_Pi");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(cudaVol.getDevicePointer());
arguments.add(deviceX);
arguments.add(sizePointer);
if (debug) {
System.out.print("Called init done ");
printTime();
}
callCUDAFunction(function, arguments);
if (debug) {
System.out.print("CUDA done ");
printTime();
}
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(cudaVol.getDevicePointer());
cudaVol.setDevicePointer(deviceX);
float [][][] temp = new float[vol.size[0]][vol.size[1]][vol.size[2]];
vol.data = null;
vol.data = temp;
temp = null;
vol.in_dim = 1;
if (debug) {
System.out.print("Clean up done ");
printTime();
}
return(0);
}
@Override
public int addVolume(Volume3D vol1, Volume3D vol2, double weight)
{
int dim_loop;
if (DEBUG_FLAG)
fprintf("vol_add\n");
for (dim_loop=0; dim_loop<vol1.in_dim; dim_loop++)
if (vol1.size[dim_loop] != vol2.size[dim_loop]) {
fprintf( "vol_add: Volumes have different sizes\n");
return(-1);
}
/* OBS !!! borde inte behova konvertera vol2 . komplex */
if (vol1.in_dim==2 && vol2.in_dim==1){
makeComplex(vol2);
CONRAD.gc();
}
if (vol1.in_dim==1 && vol2.in_dim==2){
makeComplex(vol1);
CONRAD.gc();
}
if (vol2.in_dim>2 || vol1.in_dim>2) {
fprintf( "vol_add: Invalid dimension\n");
return(-1);
}
initCUDA();
gridSize = getGrid(vol1.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol1.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z9addVolumePfS_Piif");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol1).getDevicePointer());
arguments.add(((CUDAVolume3D) vol2).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol1.in_dim));
arguments.add(new Float(weight));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public float min(Volume3D vol)
{
/* defined for non-complex volumes only */
if (vol.in_dim != 1) {
fprintf("vol_max: Invalid dimension\n");
return(0);
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
float [] results = new float [vol.size[2] * vol.size[1]];
CUdeviceptr resultPointer = CUDAUtil.copyToDeviceMemory(results);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z3minPfPiiS_");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(resultPointer);
callCUDAFunction(function, arguments);
CUDAUtil.fetchFromDeviceMemory(results, resultPointer);
JCuda.cudaFree(sizePointer);
float m = results[0];
for (int i = 1; i < results.length; i++){
if (results[i] < m) m = results[i];
}
return(m);
}
@Override
public int minOfTwoVolumes(Volume3D vol1, Volume3D vol2)
{
int dim_loop;
if (DEBUG_FLAG)
fprintf("vol_get_min\n");
for (dim_loop=0; dim_loop<vol1.in_dim; dim_loop++)
if (vol1.size[dim_loop] != vol2.size[dim_loop]) {
fprintf( "vol_get_min: Volumes have different sizes\n");
return(-1);
}
if (vol1.in_dim!=1 || vol2.in_dim!=1) {
fprintf( "vol_get_min: Invalid dimension\n");
return(-1);
}
initCUDA();
gridSize = getGrid(vol1.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol1.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z15minOfTwoVolumesPfS_Pii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol1).getDevicePointer());
arguments.add(((CUDAVolume3D) vol2).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol1.in_dim));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public float mean(Volume3D vol)
{
/* defined for non-complex volumes only */
if (vol.in_dim != 1) {
fprintf("vol_max_pos: Invalid inner dimension\n");
return(0);
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
float [] results = new float [vol.size[2] * vol.size[1]];
CUdeviceptr resultPointer = CUDAUtil.copyToDeviceMemory(results);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z4meanPfPiiS_");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(resultPointer);
callCUDAFunction(function, arguments);
CUDAUtil.fetchFromDeviceMemory(results, resultPointer);
JCuda.cudaFree(sizePointer);
float m = 0;
for (int i = 0; i < results.length; i++){
m += results[i];
}
return(m/results.length);
}
@Override
public float max(Volume3D vol)
{
/* defined for non-complex volumes only */
if (vol.in_dim != 1) {
fprintf("vol_max: Invalid dimension\n");
return(0);
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
float [] results = new float [vol.size[2] * vol.size[1]];
CUdeviceptr resultPointer = CUDAUtil.copyToDeviceMemory(results);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z3maxPfPiiS_");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(resultPointer);
callCUDAFunction(function, arguments);
CUDAUtil.fetchFromDeviceMemory(results, resultPointer);
JCuda.cudaFree(sizePointer);
float m = results[0];
for (int i = 1; i < results.length; i++){
if (results[i] > m) m = results[i];
}
return(m);
}
@Override
public int addScalar(Volume3D vol,
float realPart,
float imagPart )
{
if (DEBUG_FLAG)
fprintf("vol_add_sc\n");
if (imagPart!=0){
makeComplex(vol);
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z9addScalarPfPiiff");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(new Float(realPart));
arguments.add(new Float(imagPart));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public int abs(Volume3D vol)
{
if (DEBUG_FLAG)
fprintf("vol_abs\n");
if (vol.in_dim != 1 && vol.in_dim != 2) {
fprintf( "vol_abs: Invalid dimension\n");
return(-1);
}
int adaptedWidth = CUDAUtil.iDivUp(vol.size[2], CUDAUtil.gridBlockSize[0]) * CUDAUtil.gridBlockSize[0];
int adaptedHeight = CUDAUtil.iDivUp(vol.size[1], CUDAUtil.gridBlockSize[1]) * CUDAUtil.gridBlockSize[1];
int memorySize = adaptedWidth*adaptedHeight*vol.size[0]* 1 * Sizeof.FLOAT;
CUdeviceptr deviceX = new CUdeviceptr();
JCuda.cudaMalloc(deviceX, memorySize);
initCUDA();
CUDAVolume3D cudaVol = (CUDAVolume3D) vol;
// Calculate new grid size
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z3absPfS_Pii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(cudaVol.getDevicePointer());
arguments.add(deviceX);
arguments.add(sizePointer);
arguments.add(cudaVol.in_dim);
if (debug) {
System.out.print("Called init done ");
printTime();
}
callCUDAFunction(function, arguments);
if (debug) {
System.out.print("CUDA done ");
printTime();
}
JCuda.cudaFree(sizePointer);
JCuda.cudaFree(cudaVol.getDevicePointer());
cudaVol.setDevicePointer(deviceX);
float [][][] temp = new float[vol.size[0]][vol.size[1]][vol.size[2]];
vol.data = null;
vol.data = temp;
temp = null;
vol.in_dim = 1;
if (debug) {
System.out.print("Clean up done ");
printTime();
}
return(0);
}
@Override
public int multiplyScalar(Volume3D vol,
float realPart,
float imagPart )
{
if (DEBUG_FLAG)
fprintf("vol_mult_sc\n");
if (vol.in_dim == 1) {
if (imagPart!=0) {
makeComplex(vol);
CONRAD.gc();
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z14multiplyScalarPfPiif");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(new Float(realPart));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
// _Z14multiplyScalarPfPiif
} else {
// _Z14multiplyScalarPfPiiff
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z14multiplyScalarPfPiiff");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(new Float(realPart));
arguments.add(new Float(imagPart));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
}
return(0);
}
@Override
public int sqrt(Volume3D vol)
{
if (DEBUG_FLAG)
fprintf("vol_sqrt\n");
if (vol.in_dim != 1) {
fprintf( "vol_sqrt: Invalid dimension\n");
return(-1);
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z4sqrtPfPii");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
@Override
public int upperLimit(Volume3D vol, float max)
{
if (DEBUG_FLAG)
fprintf("vol_cut_upper\n");
if (vol.in_dim != 1) {
fprintf( "vol_abs: Invalid dimension\n");
return(-1);
}
initCUDA();
gridSize = getGrid(vol.size);
Pointer sizePointer = CUDAUtil.copyToDeviceMemory(vol.size);
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module,
"_Z10upperLimitPfPiif");
ArrayList<Object> arguments = new ArrayList<Object>();
arguments.add(((CUDAVolume3D) vol).getDevicePointer());
arguments.add(sizePointer);
arguments.add(new Integer(vol.in_dim));
arguments.add(new Float(max));
callCUDAFunction(function, arguments);
JCuda.cudaFree(sizePointer);
return(0);
}
}
/*
* Copyright (C) 2010-2014 - Andreas Maier
* CONRAD is developed as an Open Source project under the GNU General Public License (GPL).
*/