Line data Source code
1 : #include "JosephsMethodCUDA.h"
2 : #include "LogGuard.h"
3 : #include "Timer.h"
4 : #include "TypeCasts.hpp"
5 :
6 : namespace elsa
7 : {
8 : template <typename data_t>
9 0 : JosephsMethodCUDA<data_t>::JosephsMethodCUDA(const VolumeDescriptor& domainDescriptor,
10 : const DetectorDescriptor& rangeDescriptor,
11 : bool fast)
12 : : base_type(domainDescriptor, rangeDescriptor),
13 0 : _detectorDescriptor(static_cast<DetectorDescriptor&>(*_rangeDescriptor)),
14 0 : _volumeDescriptor(static_cast<VolumeDescriptor&>(*_domainDescriptor)),
15 0 : _fast{fast}
16 : {
17 0 : auto dim = static_cast<std::size_t>(_domainDescriptor->getNumberOfDimensions());
18 0 : if (dim != static_cast<std::size_t>(_rangeDescriptor->getNumberOfDimensions())) {
19 0 : throw LogicError(
20 0 : std::string("JosephsMethodCUDA: domain and range dimension need to match"));
21 : }
22 :
23 0 : if (dim != 2 && dim != 3) {
24 0 : throw LogicError("JosephsMethodCUDA: only supporting 2d/3d operations");
25 : }
26 :
27 0 : if (_detectorDescriptor.getNumberOfGeometryPoses() == 0) {
28 0 : throw LogicError("JosephsMethodCUDA: geometry list was empty");
29 : }
30 :
31 0 : const index_t numGeometry = _detectorDescriptor.getNumberOfGeometryPoses();
32 :
33 : // allocate device memory and copy ray origins and the inverse of projection matrices to
34 : // device
35 0 : cudaExtent extent = make_cudaExtent(dim * sizeof(real_t), dim, asUnsigned(numGeometry));
36 :
37 0 : if (cudaMallocPitch(&_rayOrigins.ptr, &_rayOrigins.pitch, dim * sizeof(real_t),
38 : asUnsigned(numGeometry))
39 0 : != cudaSuccess)
40 0 : throw std::bad_alloc();
41 :
42 0 : _rayOrigins.xsize = dim;
43 0 : _rayOrigins.ysize = asUnsigned(numGeometry);
44 :
45 0 : if (cudaMalloc3D(&_projInvMatrices, extent) != cudaSuccess)
46 0 : throw std::bad_alloc();
47 :
48 0 : if (_fast && cudaMalloc3D(&_projMatrices, extent) != cudaSuccess)
49 0 : throw std::bad_alloc();
50 :
51 0 : auto projPitch = _projInvMatrices.pitch;
52 0 : auto* rayBasePtr = (int8_t*) _rayOrigins.ptr;
53 0 : auto rayPitch = _rayOrigins.pitch;
54 :
55 0 : for (unsigned i = 0; i < numGeometry; i++) {
56 0 : auto geometry = _detectorDescriptor.getGeometryAt(i);
57 :
58 0 : if (!geometry)
59 0 : throw LogicError("JosephsMethodCUDA: Access not existing geometry pose");
60 :
61 0 : RealMatrix_t P = geometry->getInverseProjectionMatrix().block(0, 0, dim, dim);
62 0 : auto* slice = (int8_t*) _projInvMatrices.ptr + i * projPitch * dim;
63 :
64 : // transfer inverse of projection matrix
65 0 : if (cudaMemcpy2D(slice, projPitch, P.data(), dim * sizeof(real_t), dim * sizeof(real_t),
66 : dim, cudaMemcpyDefault)
67 0 : != cudaSuccess)
68 0 : throw LogicError(
69 : "JosephsMethodCUDA: Could not transfer inverse projection matrices to GPU.");
70 :
71 : // transfer projection matrix if _fast flag is set
72 0 : if (_fast) {
73 0 : P = geometry->getProjectionMatrix().block(0, 0, dim, dim);
74 0 : slice = (int8_t*) _projMatrices.ptr + i * projPitch * dim;
75 0 : if (cudaMemcpy2D(slice, projPitch, P.data(), dim * sizeof(real_t),
76 : dim * sizeof(real_t), dim, cudaMemcpyDefault)
77 0 : != cudaSuccess)
78 0 : throw LogicError("JosephsMethodCUDA: Could not transfer "
79 : "projection matrices to GPU.");
80 : }
81 :
82 0 : int8_t* rayPtr = rayBasePtr + i * rayPitch;
83 : // get ray origin using direct inverse
84 0 : RealVector_t ro =
85 0 : -geometry->getInverseProjectionMatrix().block(0, 0, dim, dim)
86 0 : * geometry->getProjectionMatrix().block(0, static_cast<index_t>(dim), dim, 1);
87 : // transfer ray origin
88 0 : if (cudaMemcpyAsync(rayPtr, ro.data(), dim * sizeof(real_t), cudaMemcpyDefault)
89 0 : != cudaSuccess)
90 0 : throw LogicError("JosephsMethodCUDA: Could not transfer ray origins to GPU.");
91 : }
92 0 : }
93 :
94 : template <typename data_t>
95 0 : JosephsMethodCUDA<data_t>::~JosephsMethodCUDA()
96 : {
97 : // Free CUDA resources
98 0 : if (cudaFree(_rayOrigins.ptr) != cudaSuccess
99 0 : || cudaFree(_projInvMatrices.ptr) != cudaSuccess)
100 : Logger::get("JosephsMethodCUDA")
101 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
102 :
103 0 : if (_fast)
104 0 : if (cudaFree(_projMatrices.ptr) != cudaSuccess)
105 : Logger::get("JosephsMethodCUDA")
106 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
107 0 : }
108 :
109 : template <typename data_t>
110 0 : JosephsMethodCUDA<data_t>* JosephsMethodCUDA<data_t>::_cloneImpl() const
111 : {
112 0 : return new JosephsMethodCUDA(*this);
113 : }
114 :
115 : template <typename data_t>
116 0 : bool JosephsMethodCUDA<data_t>::_isEqual(const LinearOperator<data_t>& other) const
117 : {
118 : // TODO: only need Geometry vector stored internally for comparisons, use kernels instead
119 0 : if (!LinearOperator<data_t>::isEqual(other))
120 0 : return false;
121 :
122 0 : auto otherJM = downcast_safe<JosephsMethodCUDA>(&other);
123 0 : if (!otherJM)
124 0 : return false;
125 :
126 0 : if (_fast != otherJM->_fast)
127 0 : return false;
128 :
129 0 : return true;
130 : }
131 :
132 : template <typename data_t>
133 0 : void JosephsMethodCUDA<data_t>::forward(const BoundingBox& aabb, const DataContainer<data_t>& x,
134 : DataContainer<data_t>& Ax) const
135 : {
136 0 : Timer timeGuard("JosephsMethodCUDA", "apply");
137 :
138 : // transfer volume as texture
139 0 : auto [volumeTex, volume] = copyTextureToGPU(x);
140 :
141 : // allocate memory for projections
142 : cudaPitchedPtr dsinoPtr;
143 0 : index_t dim = _domainDescriptor->getNumberOfDimensions();
144 0 : IndexVector_t rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
145 :
146 0 : auto rangeDimsui = rangeDims.template cast<unsigned int>();
147 :
148 0 : if (dim == 3) {
149 : cudaExtent sinoExt =
150 0 : make_cudaExtent(rangeDimsui[0] * sizeof(data_t), rangeDimsui[1], rangeDimsui[2]);
151 0 : if (cudaMalloc3D(&dsinoPtr, sinoExt) != cudaSuccess)
152 0 : throw std::bad_alloc();
153 :
154 0 : IndexVector_t bmax = aabb._max.template cast<index_t>();
155 : typename TraverseJosephsCUDA<data_t, 3>::BoundingBox boxMax;
156 0 : boxMax._max[0] = static_cast<real_t>(bmax[0]);
157 0 : boxMax._max[1] = static_cast<real_t>(bmax[1]);
158 0 : boxMax._max[2] = static_cast<real_t>(bmax[2]);
159 :
160 : // synchronize because we are using multiple streams
161 0 : cudaDeviceSynchronize();
162 : // perform projection
163 0 : const dim3 sinogramDims(rangeDimsui[2], rangeDimsui[1], rangeDimsui[0]);
164 0 : TraverseJosephsCUDA<data_t, 3>::traverseForward(
165 0 : sinogramDims, THREADS_PER_BLOCK, volumeTex, (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch,
166 0 : (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
167 0 : (int8_t*) _projInvMatrices.ptr, static_cast<uint32_t>(_projInvMatrices.pitch),
168 : boxMax);
169 0 : cudaDeviceSynchronize();
170 :
171 : // retrieve results from GPU
172 0 : copy3DDataContainer<ContainerCpyKind::cpyRawGPUToContainer, false>((void*) &Ax[0],
173 : dsinoPtr, sinoExt);
174 0 : } else {
175 0 : if (cudaMallocPitch(&dsinoPtr.ptr, &dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t),
176 : rangeDimsui[1])
177 0 : != cudaSuccess)
178 0 : throw std::bad_alloc();
179 :
180 0 : IndexVector_t bmax = aabb._max.template cast<index_t>();
181 : typename TraverseJosephsCUDA<data_t, 2>::BoundingBox boxMax;
182 0 : boxMax._max[0] = static_cast<real_t>(bmax[0]);
183 0 : boxMax._max[1] = static_cast<real_t>(bmax[1]);
184 :
185 : // synchronize because we are using multiple streams
186 0 : cudaDeviceSynchronize();
187 0 : const dim3 sinogramDims(rangeDimsui[1], 1, rangeDimsui[0]);
188 0 : TraverseJosephsCUDA<data_t, 2>::traverseForward(
189 0 : sinogramDims, THREADS_PER_BLOCK, volumeTex, (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch,
190 0 : (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
191 0 : (int8_t*) _projInvMatrices.ptr, static_cast<uint32_t>(_projInvMatrices.pitch),
192 : boxMax);
193 0 : cudaDeviceSynchronize();
194 :
195 : // retrieve results from GPU
196 0 : if (cudaMemcpy2D((void*) &Ax[0], rangeDimsui[0] * sizeof(data_t), dsinoPtr.ptr,
197 0 : dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t), rangeDimsui[1],
198 : cudaMemcpyDefault)
199 0 : != cudaSuccess)
200 0 : throw LogicError("JosephsMethodCUDA::apply: Couldn't retrieve results from GPU");
201 0 : }
202 :
203 0 : if (cudaDestroyTextureObject(volumeTex) != cudaSuccess)
204 : Logger::get("JosephsMethodCUDA")
205 0 : ->error("Couldn't destroy texture object; This may cause problems later.");
206 :
207 0 : if (cudaFreeArray(volume) != cudaSuccess)
208 : Logger::get("JosephsMethodCUDA")
209 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
210 :
211 0 : if (cudaFree(dsinoPtr.ptr) != cudaSuccess)
212 : Logger::get("JosephsMethodCUDA")
213 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
214 0 : }
215 :
216 : template <typename data_t>
217 0 : void JosephsMethodCUDA<data_t>::backward(const BoundingBox& aabb,
218 : const DataContainer<data_t>& y,
219 : DataContainer<data_t>& Aty) const
220 : {
221 0 : Timer timeguard("JosephsMethodCUDA", "applyAdjoint");
222 :
223 : // allocate memory for volume
224 0 : auto domainDims = _domainDescriptor->getNumberOfCoefficientsPerDimension();
225 0 : auto domainDimsui = domainDims.template cast<unsigned int>();
226 :
227 0 : auto rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
228 0 : auto rangeDimsui = rangeDims.template cast<unsigned int>();
229 :
230 0 : index_t dim = _domainDescriptor->getNumberOfDimensions();
231 :
232 : cudaPitchedPtr dvolumePtr;
233 0 : if (dim == 3) {
234 : cudaExtent volExt =
235 0 : make_cudaExtent(domainDimsui[0] * sizeof(data_t), domainDimsui[1], domainDimsui[2]);
236 0 : if (cudaMalloc3D(&dvolumePtr, volExt) != cudaSuccess)
237 0 : throw std::bad_alloc();
238 :
239 : // transfer projections
240 0 : if (_fast) {
241 0 : auto [sinoTex, sino] = copyTextureToGPU<cudaArrayLayered>(y);
242 :
243 0 : cudaDeviceSynchronize();
244 0 : const dim3 volumeDims(domainDimsui[2], domainDimsui[1], domainDimsui[0]);
245 0 : const int threads = THREADS_PER_BLOCK;
246 :
247 0 : TraverseJosephsCUDA<data_t, 3>::traverseAdjointFast(
248 0 : volumeDims, threads, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch, sinoTex,
249 0 : (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
250 0 : (int8_t*) _projMatrices.ptr, static_cast<uint32_t>(_projMatrices.pitch),
251 : rangeDimsui[2]);
252 0 : cudaDeviceSynchronize();
253 :
254 0 : if (cudaDestroyTextureObject(sinoTex) != cudaSuccess)
255 : Logger::get("JosephsMethodCUDA")
256 0 : ->error("Couldn't destroy texture object; This may cause problems later.");
257 :
258 0 : if (cudaFreeArray(sino) != cudaSuccess)
259 : Logger::get("JosephsMethodCUDA")
260 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
261 : } else {
262 0 : if (cudaMemset3DAsync(dvolumePtr, 0, volExt) != cudaSuccess)
263 0 : throw LogicError("JosephsMethodCUDA::applyAdjoint: Could not "
264 : "zero-initialize volume on GPU.");
265 :
266 : cudaPitchedPtr dsinoPtr;
267 0 : cudaExtent sinoExt = make_cudaExtent(rangeDimsui[0] * sizeof(data_t),
268 0 : rangeDimsui[1], rangeDimsui[2]);
269 :
270 0 : if (cudaMalloc3D(&dsinoPtr, sinoExt) != cudaSuccess)
271 0 : throw std::bad_alloc();
272 :
273 0 : copy3DDataContainer<ContainerCpyKind::cpyContainerToRawGPU>((void*) &y[0], dsinoPtr,
274 : sinoExt);
275 :
276 : // perform projection
277 :
278 0 : IndexVector_t bmax = aabb._max.template cast<index_t>();
279 : typename TraverseJosephsCUDA<data_t, 3>::BoundingBox boxMax;
280 0 : boxMax._max[0] = static_cast<real_t>(bmax[0]);
281 0 : boxMax._max[1] = static_cast<real_t>(bmax[1]);
282 0 : boxMax._max[2] = static_cast<real_t>(bmax[2]);
283 :
284 : // synchronize because we are using multiple streams
285 0 : cudaDeviceSynchronize();
286 0 : const dim3 sinogramDims(rangeDimsui[2], rangeDimsui[1], rangeDimsui[0]);
287 0 : TraverseJosephsCUDA<data_t, 3>::traverseAdjoint(
288 0 : sinogramDims, THREADS_PER_BLOCK, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
289 0 : (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
290 0 : static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
291 0 : static_cast<uint32_t>(_projInvMatrices.pitch), boxMax);
292 : // synchronize because we are using multiple streams
293 0 : cudaDeviceSynchronize();
294 :
295 : // free allocated memory
296 0 : if (cudaFree(dsinoPtr.ptr) != cudaSuccess)
297 : Logger::get("JosephsMethodCUDA")
298 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
299 0 : }
300 :
301 : // retrieve results from GPU
302 0 : copy3DDataContainer<ContainerCpyKind::cpyRawGPUToContainer, false>((void*) &Aty[0],
303 : dvolumePtr, volExt);
304 : } else {
305 0 : if (cudaMallocPitch(&dvolumePtr.ptr, &dvolumePtr.pitch,
306 0 : domainDimsui[0] * sizeof(data_t), domainDimsui[1])
307 0 : != cudaSuccess)
308 0 : throw std::bad_alloc();
309 :
310 0 : if (_fast) {
311 : // transfer projections
312 0 : auto [sinoTex, sino] = copyTextureToGPU<cudaArrayLayered>(y);
313 :
314 0 : cudaDeviceSynchronize();
315 0 : const dim3 volumeDims(1, domainDimsui[1], domainDimsui[0]);
316 0 : const int threads = THREADS_PER_BLOCK;
317 :
318 0 : TraverseJosephsCUDA<data_t, 2>::traverseAdjointFast(
319 0 : volumeDims, threads, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch, sinoTex,
320 0 : (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
321 0 : (int8_t*) _projMatrices.ptr, static_cast<uint32_t>(_projMatrices.pitch),
322 : rangeDimsui[dim - 1]);
323 0 : cudaDeviceSynchronize();
324 :
325 0 : if (cudaDestroyTextureObject(sinoTex) != cudaSuccess)
326 : Logger::get("JosephsMethodCUDA")
327 0 : ->error("Couldn't destroy texture object; This may cause problems later.");
328 :
329 0 : if (cudaFreeArray(sino) != cudaSuccess)
330 : Logger::get("JosephsMethodCUDA")
331 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
332 : } else {
333 0 : if (cudaMemset2DAsync(dvolumePtr.ptr, dvolumePtr.pitch, 0,
334 0 : domainDimsui[0] * sizeof(data_t), domainDimsui[1])
335 0 : != cudaSuccess)
336 0 : throw LogicError("JosephsMethodCUDA::applyAdjoint: Could not "
337 : "zero-initialize volume on GPU.");
338 :
339 : cudaPitchedPtr dsinoPtr;
340 0 : IndexVector_t rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
341 0 : if (cudaMallocPitch(&dsinoPtr.ptr, &dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t),
342 0 : rangeDimsui[1])
343 0 : != cudaSuccess)
344 0 : throw std::bad_alloc();
345 :
346 0 : if (cudaMemcpy2DAsync(dsinoPtr.ptr, dsinoPtr.pitch, (void*) &y[0],
347 0 : rangeDimsui[0] * sizeof(data_t),
348 0 : rangeDimsui[0] * sizeof(data_t), rangeDimsui[1],
349 : cudaMemcpyDefault)
350 0 : != cudaSuccess)
351 0 : throw LogicError(
352 : "JosephsMethodCUDA::applyAdjoint: Couldn't transfer sinogram to GPU.");
353 :
354 0 : IndexVector_t bmax = aabb._max.template cast<index_t>();
355 : typename TraverseJosephsCUDA<data_t, 2>::BoundingBox boxMax;
356 0 : boxMax._max[0] = static_cast<real_t>(bmax[0]);
357 0 : boxMax._max[1] = static_cast<real_t>(bmax[1]);
358 : // synchronize because we are using multiple streams
359 0 : cudaDeviceSynchronize();
360 0 : const dim3 sinogramDims(rangeDimsui[1], 1, rangeDimsui[0]);
361 0 : const int threads = THREADS_PER_BLOCK;
362 0 : TraverseJosephsCUDA<data_t, 2>::traverseAdjoint(
363 0 : sinogramDims, threads, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
364 0 : (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
365 0 : static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
366 0 : static_cast<uint32_t>(_projInvMatrices.pitch), boxMax);
367 : // synchronize because we are using multiple streams
368 0 : cudaDeviceSynchronize();
369 :
370 : // free allocated memory
371 0 : if (cudaFree(dsinoPtr.ptr) != cudaSuccess)
372 : Logger::get("JosephsMethodCUDA")
373 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
374 0 : }
375 :
376 : // retrieve results from GPU
377 0 : if (cudaMemcpy2D((void*) &Aty[0], domainDimsui[0] * sizeof(data_t), dvolumePtr.ptr,
378 0 : dvolumePtr.pitch, sizeof(data_t) * domainDimsui[0], domainDimsui[1],
379 : cudaMemcpyDefault)
380 0 : != cudaSuccess)
381 0 : throw LogicError(
382 : "JosephsMethodCUDA::applyAdjoint: Couldn't retrieve results from GPU");
383 : }
384 :
385 : // free allocated memory
386 0 : if (cudaFree(dvolumePtr.ptr) != cudaSuccess)
387 : Logger::get("JosephsMethodCUDA")
388 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
389 0 : }
390 :
391 : template <typename data_t>
392 0 : JosephsMethodCUDA<data_t>::JosephsMethodCUDA(const JosephsMethodCUDA<data_t>& other)
393 0 : : base_type(other._volumeDescriptor, other._detectorDescriptor),
394 0 : _detectorDescriptor(downcast<DetectorDescriptor>(*_rangeDescriptor)),
395 0 : _volumeDescriptor(downcast<VolumeDescriptor>(*_domainDescriptor)),
396 0 : _fast{other._fast}
397 : {
398 0 : auto dim = static_cast<std::size_t>(_domainDescriptor->getNumberOfDimensions());
399 0 : auto numAngles = static_cast<std::size_t>(
400 0 : _rangeDescriptor->getNumberOfCoefficientsPerDimension()[static_cast<index_t>(dim) - 1]);
401 :
402 0 : cudaExtent extent = make_cudaExtent(dim * sizeof(real_t), dim, numAngles);
403 :
404 0 : if (cudaMallocPitch(&_rayOrigins.ptr, &_rayOrigins.pitch, dim * sizeof(real_t), numAngles)
405 0 : != cudaSuccess)
406 0 : throw std::bad_alloc();
407 :
408 0 : _rayOrigins.xsize = dim;
409 0 : _rayOrigins.ysize = asUnsigned(_detectorDescriptor.getNumberOfGeometryPoses());
410 :
411 0 : if (cudaMalloc3D(&_projInvMatrices, extent) != cudaSuccess)
412 0 : throw std::bad_alloc();
413 :
414 0 : if (cudaMemcpyAsync(_projInvMatrices.ptr, other._projInvMatrices.ptr,
415 0 : _projInvMatrices.pitch * dim * numAngles, cudaMemcpyDefault)
416 0 : != cudaSuccess)
417 0 : throw LogicError(
418 : "JosephsMethodCUDA: Could not transfer inverse projection matrices to GPU.");
419 :
420 0 : if (cudaMemcpyAsync(_rayOrigins.ptr, other._rayOrigins.ptr, _rayOrigins.pitch * numAngles,
421 : cudaMemcpyDefault)
422 0 : != cudaSuccess)
423 0 : throw LogicError("JosephsMethodCUDA: Could not transfer ray origins to GPU.");
424 :
425 0 : if (_fast) {
426 0 : if (cudaMalloc3D(&_projMatrices, extent) != cudaSuccess)
427 0 : throw std::bad_alloc();
428 :
429 0 : if (cudaMemcpyAsync(_projMatrices.ptr, other._projMatrices.ptr,
430 0 : _projMatrices.pitch * dim * numAngles, cudaMemcpyDefault)
431 0 : != cudaSuccess)
432 0 : throw LogicError(
433 : "JosephsMethodCUDA: Could not transfer projection matrices to GPU.");
434 : }
435 0 : }
436 :
437 : template <typename data_t>
438 : template <typename JosephsMethodCUDA<data_t>::ContainerCpyKind direction, bool async>
439 0 : void JosephsMethodCUDA<data_t>::copy3DDataContainer(void* hostData,
440 : const cudaPitchedPtr& gpuData,
441 : const cudaExtent& extent) const
442 : {
443 0 : cudaMemcpy3DParms cpyParams = {};
444 0 : cpyParams.extent = extent;
445 0 : cpyParams.kind = cudaMemcpyDefault;
446 : cudaPitchedPtr tmp =
447 0 : make_cudaPitchedPtr(hostData, extent.width, extent.width, extent.height);
448 :
449 : if (direction == ContainerCpyKind::cpyContainerToRawGPU) {
450 0 : cpyParams.dstPtr = gpuData;
451 0 : cpyParams.srcPtr = tmp;
452 : } else if (direction == ContainerCpyKind::cpyRawGPUToContainer) {
453 0 : cpyParams.srcPtr = gpuData;
454 0 : cpyParams.dstPtr = tmp;
455 : }
456 :
457 : if (async) {
458 0 : if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
459 0 : throw LogicError("Failed copying data between device and host");
460 : } else {
461 0 : if (cudaMemcpy3D(&cpyParams) != cudaSuccess)
462 0 : throw LogicError("Failed copying data between device and host");
463 : }
464 0 : }
465 :
466 : template <typename data_t>
467 : template <unsigned int flags>
468 : std::pair<cudaTextureObject_t, cudaArray*>
469 0 : JosephsMethodCUDA<data_t>::copyTextureToGPU(const DataContainer<data_t>& hostData) const
470 : {
471 : // transfer volume as texture
472 0 : auto domainDims = hostData.getDataDescriptor().getNumberOfCoefficientsPerDimension();
473 0 : auto domainDimsui = domainDims.template cast<unsigned int>();
474 :
475 : cudaArray* volume;
476 0 : cudaTextureObject_t volumeTex = 0;
477 :
478 : cudaChannelFormatDesc channelDesc;
479 :
480 : if constexpr (sizeof(data_t) == 4)
481 : channelDesc =
482 0 : cudaCreateChannelDesc(sizeof(data_t) * 8, 0, 0, 0, cudaChannelFormatKindFloat);
483 : else if (sizeof(data_t) == 8)
484 0 : channelDesc = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
485 : else
486 : throw InvalidArgumentError("JosephsMethodCUDA::copyTextureToGPU: only supports "
487 : "DataContainer<data_t> with data_t of length 4 or 8 bytes");
488 :
489 0 : if (hostData.getDataDescriptor().getNumberOfDimensions() == 3) {
490 : cudaExtent volumeExtent =
491 0 : make_cudaExtent(domainDimsui[0], domainDimsui[1], domainDimsui[2]);
492 0 : if (cudaMalloc3DArray(&volume, &channelDesc, volumeExtent, flags) != cudaSuccess)
493 0 : throw std::bad_alloc();
494 0 : cudaMemcpy3DParms cpyParams = {};
495 0 : cpyParams.srcPtr.ptr = (void*) &hostData[0];
496 0 : cpyParams.srcPtr.pitch = domainDimsui[0] * sizeof(data_t);
497 0 : cpyParams.srcPtr.xsize = domainDimsui[0] * sizeof(data_t);
498 0 : cpyParams.srcPtr.ysize = domainDimsui[1];
499 0 : cpyParams.dstArray = volume;
500 0 : cpyParams.extent = volumeExtent;
501 0 : cpyParams.kind = cudaMemcpyDefault;
502 :
503 0 : if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
504 0 : throw LogicError(
505 : "JosephsMethodCUDA::copyTextureToGPU: Could not transfer data to GPU.");
506 : } else {
507 : // CUDA has a very weird way of handling layered 1D arrays
508 : if (flags == cudaArrayLayered) {
509 :
510 : // must be allocated as a 3D Array of height 0
511 0 : cudaExtent volumeExtent = make_cudaExtent(domainDimsui[0], 0, domainDimsui[1]);
512 0 : if (cudaMalloc3DArray(&volume, &channelDesc, volumeExtent, flags) != cudaSuccess)
513 0 : throw std::bad_alloc();
514 :
515 : // adjust height to 1 for copy
516 0 : volumeExtent.height = 1;
517 0 : cudaMemcpy3DParms cpyParams = {};
518 0 : cpyParams.srcPos = make_cudaPos(0, 0, 0);
519 0 : cpyParams.dstPos = make_cudaPos(0, 0, 0);
520 0 : cpyParams.srcPtr = make_cudaPitchedPtr(
521 0 : (void*) &hostData[0], domainDimsui[0] * sizeof(data_t), domainDimsui[0], 1);
522 0 : cpyParams.dstArray = volume;
523 0 : cpyParams.extent = volumeExtent;
524 0 : cpyParams.kind = cudaMemcpyDefault;
525 :
526 0 : if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
527 0 : throw LogicError(
528 : "JosephsMethodCUDA::copyTextureToGPU: Could not transfer data to GPU.");
529 : } else {
530 0 : if (cudaMallocArray(&volume, &channelDesc, domainDimsui[0], domainDimsui[1], flags)
531 0 : != cudaSuccess)
532 0 : throw std::bad_alloc();
533 :
534 0 : if (cudaMemcpy2DToArrayAsync(
535 0 : volume, 0, 0, &hostData[0], domainDimsui[0] * sizeof(data_t),
536 0 : domainDimsui[0] * sizeof(data_t), domainDimsui[1], cudaMemcpyDefault)
537 0 : != cudaSuccess)
538 0 : throw LogicError(
539 : "JosephsMethodCUDA::copyTextureToGPU: Could not transfer data to GPU.");
540 : }
541 : }
542 :
543 : cudaResourceDesc resDesc;
544 0 : std::memset(&resDesc, 0, sizeof(resDesc));
545 0 : resDesc.resType = cudaResourceTypeArray;
546 0 : resDesc.res.array.array = volume;
547 :
548 : cudaTextureDesc texDesc;
549 0 : std::memset(&texDesc, 0, sizeof(texDesc));
550 0 : texDesc.addressMode[0] = flags ? cudaAddressModeBorder : cudaAddressModeClamp;
551 0 : texDesc.addressMode[1] = flags ? cudaAddressModeBorder : cudaAddressModeClamp;
552 0 : texDesc.addressMode[2] = flags ? cudaAddressModeBorder : cudaAddressModeClamp;
553 0 : texDesc.filterMode = sizeof(data_t) == 4 ? cudaFilterModeLinear : cudaFilterModePoint;
554 0 : texDesc.readMode = cudaReadModeElementType;
555 0 : texDesc.normalizedCoords = 0;
556 :
557 0 : if (cudaCreateTextureObject(&volumeTex, &resDesc, &texDesc, nullptr) != cudaSuccess)
558 0 : throw LogicError("Couldn't create texture object");
559 :
560 0 : return std::pair<cudaTextureObject_t, cudaArray*>(volumeTex, volume);
561 0 : }
562 :
563 : // ------------------------------------------
564 : // explicit template instantiation
565 : template class JosephsMethodCUDA<float>;
566 : template class JosephsMethodCUDA<double>;
567 : } // namespace elsa
|