package edu.stanford.rsl.conrad.geometry.motion; import java.io.IOException; import java.io.InputStream; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.util.ArrayList; import com.jogamp.opencl.CLBuffer; import com.jogamp.opencl.CLCommandQueue; import com.jogamp.opencl.CLContext; import com.jogamp.opencl.CLDevice; import com.jogamp.opencl.CLKernel; import com.jogamp.opencl.CLProgram; import com.jogamp.opencl.CLMemory.Mem; import edu.stanford.rsl.conrad.geometry.shapes.simple.PointND; import edu.stanford.rsl.conrad.geometry.trajectories.Trajectory; import edu.stanford.rsl.conrad.utils.Configuration; /** * General Class to map any of the ParzenWindowMotionField classes to GPU. We use the original class to create * the raster points. The expensive motionfield evaluation is then done on GPU. * @author akmaier * */ public class OpenCLParzenWindowMotionField extends ParzenWindowMotionField { /** * */ private static final long serialVersionUID = -322305647900119904L; ParzenWindowMotionField originalMotionField; CLContext context = null; CLDevice device = null; CLProgram program; static int bpBlockSize[] = {32, 16}; public static CLBuffer<FloatBuffer> generateTimeSamplingPoints(float tIndex, int elementCountU, int elementCountV, CLContext context, CLDevice device){ // prepare sampling points CLBuffer<FloatBuffer> samplingPoints = context.createFloatBuffer(elementCountU * elementCountV*3, Mem.READ_ONLY); for (int j = 0; j < elementCountV; j++){ for (int i = 0; i < elementCountU; i++){ samplingPoints.getBuffer().put(i*(1.0f / elementCountU)); samplingPoints.getBuffer().put(j*(1.0f / elementCountV)); samplingPoints.getBuffer().put(tIndex); //System.out.println(i*(1.0f / elementCountU) + " " +j*(1.0f / elementCountV)+ " "+t*(1.0f / elementCountT) ); } } samplingPoints.getBuffer().rewind(); CLCommandQueue clc = device.createCommandQueue(); clc.putWriteBuffer(samplingPoints, true).finish(); clc.release(); return samplingPoints; } public OpenCLParzenWindowMotionField(ParzenWindowMotionField motion, CLContext context, CLDevice device) throws IOException { super(motion.sigma); originalMotionField = motion; this.device = device; this.context = context; InputStream programFile = OpenCLParzenWindowMotionField.class.getResourceAsStream("MotionField.cl"); program = context.createProgram(programFile).build(); } @Override public PointND[] getRasterPoints(double time) { return originalMotionField.getRasterPoints(time); } private void fillBuffer(CLBuffer<IntBuffer> searchIdx, int[] searchIndicies) { IntBuffer buffer = searchIdx.getBuffer(); for(int i= 0; i < searchIndicies.length; i++){ buffer.put(searchIndicies[i]); } buffer.rewind(); } private void fillBuffer(CLBuffer<FloatBuffer> clbuffer, PointND[] points){ FloatBuffer buffer = clbuffer.getBuffer(); for(int i= 0; i < points.length; i++){ buffer.put((float) points[i].get(0)); buffer.put((float) points[i].get(1)); buffer.put((float) points[i].get(2)); } buffer.rewind(); } private void fillBuffer(CLBuffer<FloatBuffer> clbuffer, ArrayList<PointND> points){ FloatBuffer buffer = clbuffer.getBuffer(); for(int i= 0; i < points.size(); i++){ buffer.put((float) points.get(i).get(0)); buffer.put((float) points.get(i).get(1)); buffer.put((float) points.get(i).get(2)); } buffer.rewind(); } private void fillBuffer(CLBuffer<FloatBuffer> search, float[] searchCandidates) { FloatBuffer buffer = search.getBuffer(); for(int i= 0; i < searchCandidates.length; i++){ buffer.put((float) searchCandidates[i]); } buffer.rewind(); } @Override public ArrayList<PointND> getPositions(double from, double to, PointND ... initialPositions){ PointND[] input = getRasterPoints(from); PointND[] output = getRasterPoints(to); // create command queue on device. CLCommandQueue queue = device.createCommandQueue(); // Allocate buffers CLBuffer<FloatBuffer> inputPoint = context.createFloatBuffer(input.length * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> outputPoint = context.createFloatBuffer(output.length * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> motionField = context.createFloatBuffer(initialPositions.length*3, Mem.READ_WRITE); fillBuffer(inputPoint, input); fillBuffer(outputPoint, output); fillBuffer(motionField, initialPositions); CLKernel kernel = program.createCLKernel("evaluateParzen1D"); int[] realLocalSize = {Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0])}; // rounded up to the nearest multiple of localWorkSize int[] globalWorkSize = {(int) (Math.ceil(initialPositions.length/(double)realLocalSize[0]))*realLocalSize[0]}; long time = System.currentTimeMillis(); queue.putWriteBuffer(inputPoint, true) .putWriteBuffer(outputPoint, true) .putWriteBuffer(motionField, true); kernel .putArg(inputPoint) .putArg(outputPoint) .putArg(motionField) .putArg(initialPositions.length) .putArg(output.length) .putArg((float) sigma); queue.put1DRangeKernel(kernel, 0, globalWorkSize[0], realLocalSize[0]) .putReadBuffer(motionField, true) .finish(); time = System.currentTimeMillis() - time; //System.out.println("Kernel execution times: " + time); ArrayList<PointND> result = new ArrayList<PointND>(); motionField.getBuffer().rewind(); for(int i=0; i < initialPositions.length; i++){ PointND newPointND = new PointND (initialPositions[i].get(0) + motionField.getBuffer().get(), initialPositions[i].get(1) +motionField.getBuffer().get(), initialPositions[i].get(2) +motionField.getBuffer().get()); result.add(newPointND); } // release buffers inputPoint.release(); outputPoint.release(); motionField.release(); queue.release(); kernel.release(); return result; } public float [] getMotionFieldAsArrayNN(double from, double to){ Trajectory geom = Configuration.getGlobalConfiguration().getGeometry(); PointND[] input = getRasterPoints(from); PointND[] output = getRasterPoints(to); // create command queue on device. CLCommandQueue queue = device.createCommandQueue(); // Allocate buffers CLBuffer<FloatBuffer> inputPoint = context.createFloatBuffer(input.length * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> outputPoint = context.createFloatBuffer(output.length * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> motionField = context.createFloatBuffer(geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3, Mem.WRITE_ONLY); fillBuffer(inputPoint, input); fillBuffer(outputPoint, output); CLKernel kernel = program.createCLKernel("evaluateNN"); // determine local worksize. "getMaxWorkGroupSize()" returns the overall number of workers! Thus the product over all elements of bpBlockSize // must be smaller then "getMaxWorkGroupSize()" int[] realLocalSize = new int[2]; realLocalSize[0] = Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0]); realLocalSize[1] = Math.max(1, Math.min(device.getMaxWorkGroupSize()/realLocalSize[0], bpBlockSize[1])); // rounded up to the nearest multiple of localWorkSize int[] globalWorkSize = {geom.getReconDimensionX(), geom.getReconDimensionY()}; if ((globalWorkSize[0] % realLocalSize[0] ) != 0){ globalWorkSize[0] = ((globalWorkSize[0] / realLocalSize[0]) + 1) * realLocalSize[0]; } if ((globalWorkSize[1] % realLocalSize[1] ) != 0){ globalWorkSize[1] = ((globalWorkSize[1] / realLocalSize[1]) + 1) * realLocalSize[1]; } long time = System.currentTimeMillis(); queue.putWriteBuffer(inputPoint, true) .putWriteBuffer(outputPoint, true); double originx = geom.getOriginX(); kernel.rewind(); kernel .putArg(inputPoint) .putArg(outputPoint) .putArg(motionField) .putArg(geom.getReconDimensionX()) .putArg(geom.getReconDimensionY()) .putArg(0)//dummy integer - will be overwritten in loop .putArg(output.length) .putArg((float) sigma) .putArg((float) geom.getVoxelSpacingX()) .putArg((float) geom.getVoxelSpacingY()) .putArg((float) geom.getVoxelSpacingZ()) .putArg((float) originx) .putArg((float) geom.getOriginY()) .putArg((float) geom.getOriginZ()); for(int i=0; i < geom.getReconDimensionZ();i++){ System.out.println("Computing Slice " + i); kernel.setArg(5, i); queue.put2DRangeKernel(kernel, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1]) .finish(); } queue.putReadBuffer(motionField, true).finish(); time = System.currentTimeMillis() - time; System.out.println("Kernel execution times: " + time); float [] result = new float [geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3]; motionField.getBuffer().rewind(); for(int i=0; i < result.length; i++){ result[i] = motionField.getBuffer().get(); } // release buffers inputPoint.release(); outputPoint.release(); motionField.release(); queue.release(); kernel.release(); return result; } public float [] getMotionFieldAsArray(double from, double to){ Trajectory geom = Configuration.getGlobalConfiguration().getGeometry(); PointND[] input = getRasterPoints(from); PointND[] output = getRasterPoints(to); // create command queue on device. CLCommandQueue queue = device.createCommandQueue(); // Allocate buffers CLBuffer<FloatBuffer> inputPoint = context.createFloatBuffer(input.length * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> outputPoint = context.createFloatBuffer(output.length * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> motionField = context.createFloatBuffer(geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3, Mem.WRITE_ONLY); fillBuffer(inputPoint, input); fillBuffer(outputPoint, output); CLKernel kernel = program.createCLKernel("evaluateParzen"); // determine local worksize. "getMaxWorkGroupSize()" returns the overall number of workers! Thus the product over all elements of bpBlockSize // must be smaller then "getMaxWorkGroupSize()" int[] realLocalSize = new int[2]; realLocalSize[0] = Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0]); realLocalSize[1] = Math.max(1, Math.min(device.getMaxWorkGroupSize()/realLocalSize[0], bpBlockSize[1])); // rounded up to the nearest multiple of localWorkSize int[] globalWorkSize = {geom.getReconDimensionX(), geom.getReconDimensionY()}; if ((globalWorkSize[0] % realLocalSize[0] ) != 0){ globalWorkSize[0] = ((globalWorkSize[0] / realLocalSize[0]) + 1) * realLocalSize[0]; } if ((globalWorkSize[1] % realLocalSize[1] ) != 0){ globalWorkSize[1] = ((globalWorkSize[1] / realLocalSize[1]) + 1) * realLocalSize[1]; } long time = System.currentTimeMillis(); queue.putWriteBuffer(inputPoint, true) .putWriteBuffer(outputPoint, true); kernel.rewind(); kernel .putArg(inputPoint) .putArg(outputPoint) .putArg(motionField) .putArg(geom.getReconDimensionX()) .putArg(geom.getReconDimensionY()) .putArg(0)//dummy integer will be overwritten inside loop .putArg(output.length) .putArg((float) sigma) .putArg((float) geom.getVoxelSpacingX()) .putArg((float) geom.getVoxelSpacingY()) .putArg((float) geom.getVoxelSpacingZ()) .putArg((float) geom.getOriginX()) .putArg((float) geom.getOriginY()) .putArg((float) geom.getOriginZ()); for(int i=0; i < geom.getReconDimensionZ();i++){ System.out.println("Computing Slice " + i); kernel.setArg(5, i); queue.put2DRangeKernel(kernel, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1]) .finish(); } queue.putReadBuffer(motionField, true).finish(); time = System.currentTimeMillis() - time; System.out.println("Kernel execution times: " + time); float [] result = new float [geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3]; motionField.getBuffer().rewind(); for(int i=0; i < result.length; i++){ result[i] = motionField.getBuffer().get(); } // release buffers inputPoint.release(); outputPoint.release(); motionField.release(); queue.release(); kernel.release(); return result; } public float [] getMotionFieldAsArrayReduceZ(double from, double to){ Trajectory geom = Configuration.getGlobalConfiguration().getGeometry(); PointND[] input = getRasterPoints(from); PointND[] output = getRasterPoints(to); // create command queue on device. CLCommandQueue queue = device.createCommandQueue(); // Allocate buffers CLBuffer<FloatBuffer> motionField = context.createFloatBuffer(geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3, Mem.WRITE_ONLY); CLKernel kernel = program.createCLKernel("evaluateParzen"); // determine local worksize. "getMaxWorkGroupSize()" returns the overall number of workers! Thus the product over all elements of bpBlockSize // must be smaller then "getMaxWorkGroupSize()" int[] realLocalSize = new int[2]; realLocalSize[0] = Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0]); realLocalSize[1] = Math.max(1, Math.min(device.getMaxWorkGroupSize()/realLocalSize[0], bpBlockSize[1])); // rounded up to the nearest multiple of localWorkSize int[] globalWorkSize = {geom.getReconDimensionX(), geom.getReconDimensionY()}; if ((globalWorkSize[0] % realLocalSize[0] ) != 0){ globalWorkSize[0] = ((globalWorkSize[0] / realLocalSize[0]) + 1) * realLocalSize[0]; } if ((globalWorkSize[1] % realLocalSize[1] ) != 0){ globalWorkSize[1] = ((globalWorkSize[1] / realLocalSize[1]) + 1) * realLocalSize[1]; } ArrayList<ArrayList<PointND>> inputCandidates = new ArrayList<ArrayList<PointND>>(); ArrayList<ArrayList<PointND>> outputCandidates = new ArrayList<ArrayList<PointND>>(); double zMargin = (geom.getVoxelSpacingZ()*geom.getReconDimensionZ()*0.1) + 6 * sigma; for(int i=0; i < geom.getReconDimensionZ();i++){ inputCandidates.add(new ArrayList<PointND>()); outputCandidates.add(new ArrayList<PointND>()); double zValue = i * geom.getVoxelSpacingZ() + geom.getOriginZ(); for (int j=0; j < input.length; j++){ double distance = Math.abs(input[j].get(2) - zValue); //System.out.println(distance + " " + zValue + " " +input[j].get(2)); if (distance < zMargin){ inputCandidates.get(i).add(input[j]); outputCandidates.get(i).add(output[j]); } } } long time = System.currentTimeMillis(); double total = 0; // put static kernel arguments before loop kernel .setArg(2,motionField) .setArg(3,geom.getReconDimensionX()) .setArg(4,geom.getReconDimensionY()) .setArg(7,(float) sigma) .setArg(8,(float) geom.getVoxelSpacingX()) .setArg(9,(float) geom.getVoxelSpacingY()) .setArg(10,(float) geom.getVoxelSpacingZ()) .setArg(11,(float) geom.getOriginX()) .setArg(12,(float) geom.getOriginY()) .setArg(13,(float) geom.getOriginZ()); for(int i=0; i < geom.getReconDimensionZ();i++){ if (inputCandidates.get(i).size()==0) continue; CLBuffer<FloatBuffer> inputPoint = context.createFloatBuffer(inputCandidates.get(i).size() * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> outputPoint = context.createFloatBuffer(outputCandidates.get(i).size() * 3, Mem.READ_ONLY); fillBuffer(inputPoint, inputCandidates.get(i)); fillBuffer(outputPoint, outputCandidates.get(i)); queue.putWriteBuffer(inputPoint, true) .putWriteBuffer(outputPoint, true); //variable kernel arguments inside loop kernel .setArg(0,inputPoint) .setArg(1, outputPoint) .setArg(5, i) .setArg(6, inputCandidates.get(i).size()); System.out.println("Computing Slice " + i + " (" + inputCandidates.get(i).size() + " points)"); total += inputCandidates.get(i).size(); queue.put2DRangeKernel(kernel, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1]) .finish(); inputPoint.release(); outputPoint.release(); } System.out.println("Evaluation quota: " + total/(input.length*geom.getReconDimensionZ())); queue.putReadBuffer(motionField, true).finish(); time = System.currentTimeMillis() - time; System.out.println("Kernel execution times: " + time); float [] result = new float [geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3]; motionField.getBuffer().rewind(); for(int i=0; i < result.length; i++){ result[i] = motionField.getBuffer().get(); } // release buffers motionField.release(); queue.release(); kernel.release(); return result; } public float [] getMotionFieldAsArrayReduceZGridXY(double from, double to, int nx, int ny){ Trajectory geom = Configuration.getGlobalConfiguration().getGeometry(); // create command queue on device. CLCommandQueue queue = device.createCommandQueue(); CLBuffer<FloatBuffer> motionField = getMotionFieldAsArrayReduceZGridXY(from, to, nx, ny, queue, true); queue.putReadBuffer(motionField, true).finish(); float [] result = new float [geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3]; motionField.getBuffer().rewind(); for(int i=0; i < result.length; i++){ result[i] = motionField.getBuffer().get(); } // release buffers motionField.release(); queue.release(); return result; } public CLBuffer<FloatBuffer> getMotionFieldAsArrayReduceZGridXY(double from, double to, int nx, int ny, CLCommandQueue queue, boolean print){ Trajectory geom = Configuration.getGlobalConfiguration().getGeometry(); PointND[] input = getRasterPoints(from); PointND[] output = getRasterPoints(to); // Allocate buffers CLBuffer<FloatBuffer> motionField = context.createFloatBuffer(geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3, Mem.WRITE_ONLY); CLKernel kernel = program.createCLKernel("evaluateParzenLocal"); int[] realLocalSize = new int[2]; realLocalSize[0] = Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0]); realLocalSize[1] = Math.max(1, Math.min(device.getMaxWorkGroupSize()/realLocalSize[0], bpBlockSize[1])); // rounded up to the nearest multiple of localWorkSize int[] globalWorkSize = {geom.getReconDimensionX(), geom.getReconDimensionY()}; if ((globalWorkSize[0] % realLocalSize[0] ) != 0){ globalWorkSize[0] = ((globalWorkSize[0] / realLocalSize[0]) + 1) * realLocalSize[0]; } if ((globalWorkSize[1] % realLocalSize[1] ) != 0){ globalWorkSize[1] = ((globalWorkSize[1] / realLocalSize[1]) + 1) * realLocalSize[1]; } ArrayList<ArrayList<PointND>> inputCandidates = new ArrayList<ArrayList<PointND>>(); ArrayList<ArrayList<PointND>> outputCandidates = new ArrayList<ArrayList<PointND>>(); int total = 0; double zMargin = (geom.getVoxelSpacingZ()*geom.getReconDimensionZ()*0.1) + 6 * sigma; for(int i=0; i < geom.getReconDimensionZ();i++){ inputCandidates.add(new ArrayList<PointND>()); outputCandidates.add(new ArrayList<PointND>()); double zValue = i * geom.getVoxelSpacingZ() + geom.getOriginZ(); for (int j=0; j < input.length; j++){ double distance = Math.abs(input[j].get(2) - zValue); if (distance < zMargin){ inputCandidates.get(i).add(input[j]); outputCandidates.get(i).add(output[j]); } } } long time = System.currentTimeMillis(); double stepSizeX = (geom.getReconDimensionX() * geom.getVoxelSpacingX()) / (nx); double stepSizeY = (geom.getReconDimensionY() * geom.getVoxelSpacingY()) / (ny); // set all static kernel parameters kernel .setArg(4,motionField) .setArg(5,geom.getReconDimensionX()) .setArg(6,geom.getReconDimensionY()) .setArg(7,nx*ny) .setArg(9,(float) sigma) .setArg(10,(float) geom.getVoxelSpacingX()) .setArg(11,(float) geom.getVoxelSpacingY()) .setArg(12,(float) geom.getVoxelSpacingZ()) .setArg(13,(float) geom.getOriginX()) .setArg(14,(float) geom.getOriginY()) .setArg(15,(float) geom.getOriginZ()); for(int i=0; i < geom.getReconDimensionZ();i++){ double zValue = i * geom.getVoxelSpacingZ() + geom.getOriginZ(); float [] searchCandidates = new float[nx*ny*3]; int [] searchIndicies = new int[nx*ny*2]; ArrayList<PointND> localList = new ArrayList<PointND>(); ArrayList<PointND> localList2 = new ArrayList<PointND>(); for (int x = 0; x < nx; x++){ for (int y = 0; y < ny; y++){ int idx = y*nx+x; searchCandidates[idx*3] = (float) ((stepSizeX / 2.0) + stepSizeX*x); searchCandidates[idx*3+1] = (float) ((stepSizeY / 2.0) + stepSizeY*y); searchCandidates[idx*3+2] = (float) zValue; searchIndicies[idx*2] = localList.size(); int count = 0; for (int k = 0 ; k<inputCandidates.get(i).size(); k++){ // compute euclidean distance; double distance = Math.pow(searchCandidates[idx*3] - inputCandidates.get(i).get(k).get(0),2); distance += Math.pow(searchCandidates[idx*3+1] - inputCandidates.get(i).get(k).get(1),2); distance += Math.pow(searchCandidates[idx*3+2] - inputCandidates.get(i).get(k).get(2),2); distance = Math.sqrt(distance); if (distance < 2 * stepSizeX + sigma * 6){ localList.add(inputCandidates.get(i).get(k)); localList2.add(outputCandidates.get(i).get(k)); count++; } } searchIndicies[idx*2+1] = count; } } if (localList.size() == 0) continue; CLBuffer<FloatBuffer> search = context.createFloatBuffer(searchCandidates.length, Mem.READ_ONLY); CLBuffer<IntBuffer> searchIdx = context.createIntBuffer(searchIndicies.length, Mem.READ_ONLY); CLBuffer<FloatBuffer> local = context.createFloatBuffer(localList.size() * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> local2 = context.createFloatBuffer(localList2.size() * 3, Mem.READ_ONLY); fillBuffer(search, searchCandidates); fillBuffer(searchIdx, searchIndicies); fillBuffer(local, localList); fillBuffer(local2, localList2); queue.putWriteBuffer(search, true) .putWriteBuffer(searchIdx, true) .putWriteBuffer(local, true) .putWriteBuffer(local2, true); total += localList.size(); if (print) System.out.println("Computing Slice " + i + " (" + inputCandidates.get(i).size() + " points, local size "+localList.size()+")"); kernel .setArg(0,search) .setArg(1,searchIdx) .setArg(2,local) .setArg(3,local2) .setArg(8,i); queue.put2DRangeKernel(kernel, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1]) .finish(); search.release(); searchIdx.release(); local.release(); local2.release(); } if (print) System.out.println("Evaluation quota [0.1 %]: " + total*1000/(input.length*geom.getReconDimensionZ()*nx*ny)); time = System.currentTimeMillis() - time; if (print) System.out.println("Kernel execution times: " + time); // release buffers kernel.release(); return motionField; } public float [] getMotionFieldAsArrayRandomBallCover(double from, double to, int n){ Trajectory geom = Configuration.getGlobalConfiguration().getGeometry(); PointND[] input = getRasterPoints(from); PointND[] output = getRasterPoints(to); // create command queue on device. CLCommandQueue queue = device.createCommandQueue(); // Allocate buffers CLBuffer<FloatBuffer> motionField = context.createFloatBuffer(geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3, Mem.WRITE_ONLY); CLKernel kernel = program.createCLKernel("evaluateParzenLocal"); int[] realLocalSize = new int[2]; realLocalSize[0] = Math.min(device.getMaxWorkGroupSize(),bpBlockSize[0]); realLocalSize[1] = Math.max(1, Math.min(device.getMaxWorkGroupSize()/realLocalSize[0], bpBlockSize[1])); // rounded up to the nearest multiple of localWorkSize int[] globalWorkSize = {geom.getReconDimensionX(), geom.getReconDimensionY()}; if ((globalWorkSize[0] % realLocalSize[0] ) != 0){ globalWorkSize[0] = ((globalWorkSize[0] / realLocalSize[0]) + 1) * realLocalSize[0]; } if ((globalWorkSize[1] % realLocalSize[1] ) != 0){ globalWorkSize[1] = ((globalWorkSize[1] / realLocalSize[1]) + 1) * realLocalSize[1]; } float [] searchCandidates = new float[n*3]; int [] searchIndicies = new int[n*2]; // Select Random Points for (int x = 0; x < n; x++){ int idx = x; int random = (int)(Math.random() * input.length); searchCandidates[idx*3] = (float) input[random].get(0); searchCandidates[idx*3+1] = (float) input[random].get(1); searchCandidates[idx*3+2] = (float) input[random].get(2); } // Compute average distance estimate double avgDist = 0; for (int x = 0; x < 100; x++){ double mindist = Double.MAX_VALUE; int x2 = (int) (Math.random() * n); for (int y = 0; y < n; y++){ double dist = Math.sqrt(Math.pow(searchCandidates[x2*3] - searchCandidates[(y)*3],2) +Math.pow(searchCandidates[x2*3+1] - searchCandidates[(y)*3+1],2) +Math.pow(searchCandidates[x2*3+2] - searchCandidates[(y)*3+2],2)); if (dist == 0) continue; mindist =Math.min(dist, mindist); } avgDist = Math.max(mindist, avgDist); } avgDist += Math.min(6*sigma, avgDist); System.out.println("Average Distance:" + avgDist + " Sigma: " + sigma); // Compute Point lists ArrayList<PointND> localList = new ArrayList<PointND>(); ArrayList<PointND> localList2 = new ArrayList<PointND>(); for (int x = 0; x < n; x++){ int idx = x; searchIndicies[idx*2] = localList.size(); int count = 0; for (int k = 0 ; k<input.length; k++){ // compute euclidean distance; double distance = Math.pow(searchCandidates[idx*3] - input[k].get(0),2); distance += Math.pow(searchCandidates[idx*3+1] - input[k].get(1),2); distance += Math.pow(searchCandidates[idx*3+2] - input[k].get(2),2); distance = Math.sqrt(distance); // selection needs to be independent of sigma if (distance < avgDist){ localList.add(input[k]); localList2.add(output[k]); count++; } } searchIndicies[idx*2+1] = count; } if (localList.size() == 0) return null; CLBuffer<FloatBuffer> search = context.createFloatBuffer(searchCandidates.length, Mem.READ_ONLY); CLBuffer<IntBuffer> searchIdx = context.createIntBuffer(searchIndicies.length, Mem.READ_ONLY); CLBuffer<FloatBuffer> local = context.createFloatBuffer(localList.size() * 3, Mem.READ_ONLY); CLBuffer<FloatBuffer> local2 = context.createFloatBuffer(localList2.size() * 3, Mem.READ_ONLY); fillBuffer(search, searchCandidates); fillBuffer(searchIdx, searchIndicies); fillBuffer(local, localList); fillBuffer(local2, localList2); queue.putWriteBuffer(search, true) .putWriteBuffer(searchIdx, true) .putWriteBuffer(local, true) .putWriteBuffer(local2, true); long time = System.currentTimeMillis(); // set static kernel variables kernel.rewind(); kernel .putArg(search) .putArg(searchIdx) .putArg(local) .putArg(local2) .putArg(motionField) .putArg(geom.getReconDimensionX()) .putArg(geom.getReconDimensionY()) .putArg(n) .putArg(0)// dummy slice variable - will be overwritten inside loop .putArg((float) sigma) .putArg((float) geom.getVoxelSpacingX()) .putArg((float) geom.getVoxelSpacingY()) .putArg((float) geom.getVoxelSpacingZ()) .putArg((float) geom.getOriginX()) .putArg((float) geom.getOriginY()) .putArg((float) geom.getOriginZ()); for(int i=0; i < geom.getReconDimensionZ();i++){ System.out.println("Computing Slice " + i + " (" + input.length + " points, local size "+localList.size()+")"); kernel.setArg(8, i); queue.put2DRangeKernel(kernel, 0, 0, globalWorkSize[0], globalWorkSize[1], realLocalSize[0], realLocalSize[1]) .finish(); } double total = localList.size(); System.out.println("Evaluation quota: " + total/(input.length*n)); queue.putReadBuffer(motionField, true).finish(); time = System.currentTimeMillis() - time; System.out.println("Kernel execution times: " + time); float [] result = new float [geom.getReconDimensionX()*geom.getReconDimensionY()*geom.getReconDimensionZ()*3]; motionField.getBuffer().rewind(); for(int i=0; i < result.length; i++){ result[i] = motionField.getBuffer().get(); } // release buffers search.release(); searchIdx.release(); local.release(); local2.release(); motionField.release(); queue.release(); kernel.release(); return result; } /** * @return the originalMotionField */ public ParzenWindowMotionField getOriginalMotionField() { return originalMotionField; } /** * @param originalMotionField the originalMotionField to set */ public void setOriginalMotionField(ParzenWindowMotionField originalMotionField) { this.originalMotionField = originalMotionField; } } /* * Copyright (C) 2010-2014 Andreas Maier * CONRAD is developed as an Open Source project under the GNU General Public License (GPL). */