Line data Source code
1 : #include "SiddonsMethodCUDA.h"
2 : #include "LogGuard.h"
3 : #include "Timer.h"
4 : #include "TypeCasts.hpp"
5 :
6 : #include "Logger.h"
7 : namespace elsa
8 : {
9 : template <typename data_t>
10 0 : SiddonsMethodCUDA<data_t>::SiddonsMethodCUDA(const VolumeDescriptor& domainDescriptor,
11 : const DetectorDescriptor& rangeDescriptor)
12 : : base_type(domainDescriptor, rangeDescriptor),
13 0 : _detectorDescriptor(static_cast<DetectorDescriptor&>(*_rangeDescriptor)),
14 0 : _volumeDescriptor(static_cast<VolumeDescriptor&>(*_domainDescriptor))
15 : {
16 0 : auto dim = static_cast<std::size_t>(_domainDescriptor->getNumberOfDimensions());
17 0 : if (dim != static_cast<std::size_t>(_rangeDescriptor->getNumberOfDimensions())) {
18 0 : throw LogicError(
19 0 : std::string("SiddonsMethodCUDA: domain and range dimension need to match"));
20 : }
21 :
22 0 : if (dim != 2 && dim != 3) {
23 0 : throw LogicError("SiddonsMethodCUDA: only supporting 2d/3d operations");
24 : }
25 :
26 0 : if (_detectorDescriptor.getNumberOfGeometryPoses() == 0) {
27 0 : throw LogicError("SiddonsMethodCUDA: geometry list was empty");
28 : }
29 :
30 0 : const index_t numGeometry = _detectorDescriptor.getNumberOfGeometryPoses();
31 :
32 : // allocate device memory and copy ray origins and the inverse of the significant part of
33 : // projection matrices to device
34 0 : cudaExtent extent = make_cudaExtent(dim * sizeof(real_t), dim, asUnsigned(numGeometry));
35 :
36 0 : if (cudaMallocPitch(&_rayOrigins.ptr, &_rayOrigins.pitch, dim * sizeof(real_t),
37 : asUnsigned(numGeometry))
38 0 : != cudaSuccess)
39 0 : throw std::bad_alloc();
40 0 : _rayOrigins.xsize = dim;
41 0 : _rayOrigins.ysize = asUnsigned(numGeometry);
42 :
43 0 : if (cudaMalloc3D(&_projInvMatrices, extent) != cudaSuccess)
44 0 : throw std::bad_alloc();
45 :
46 0 : auto* projPtr = (int8_t*) _projInvMatrices.ptr;
47 0 : auto projPitch = _projInvMatrices.pitch;
48 0 : auto* rayBasePtr = (int8_t*) _rayOrigins.ptr;
49 0 : auto rayPitch = _rayOrigins.pitch;
50 :
51 0 : for (unsigned i = 0; i < numGeometry; i++) {
52 0 : auto geometry = _detectorDescriptor.getGeometryAt(i);
53 :
54 0 : if (!geometry)
55 0 : throw LogicError("JosephsMethodCUDA: Access not existing geometry pose");
56 :
57 0 : RealMatrix_t P = geometry->getInverseProjectionMatrix().block(0, 0, dim, dim);
58 0 : int8_t* slice = projPtr + i * projPitch * dim;
59 : // CUDA also uses a column-major representation, directly transfer matrix
60 : // transfer inverse of projection matrix
61 0 : if (cudaMemcpy2DAsync(slice, projPitch, P.data(), dim * sizeof(real_t),
62 : dim * sizeof(real_t), dim, cudaMemcpyDefault)
63 0 : != cudaSuccess)
64 0 : throw LogicError(
65 : "SiddonsMethodCUDA: Could not transfer inverse projection matrices to GPU.");
66 :
67 0 : int8_t* rayPtr = rayBasePtr + i * rayPitch;
68 : // get camera center using direct inverse
69 0 : RealVector_t ro =
70 0 : -P * geometry->getProjectionMatrix().block(0, static_cast<index_t>(dim), dim, 1);
71 : // transfer ray origin
72 0 : if (cudaMemcpyAsync(rayPtr, ro.data(), dim * sizeof(real_t), cudaMemcpyDefault)
73 0 : != cudaSuccess)
74 0 : throw LogicError("SiddonsMethodCUDA: Could not transfer ray origins to GPU.");
75 : }
76 0 : }
77 :
78 : template <typename data_t>
79 0 : SiddonsMethodCUDA<data_t>::~SiddonsMethodCUDA()
80 : {
81 : // Free CUDA resources
82 0 : if (cudaFree(_rayOrigins.ptr) != cudaSuccess
83 0 : || cudaFree(_projInvMatrices.ptr) != cudaSuccess)
84 : Logger::get("SiddonsMethodCUDA")
85 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
86 0 : }
87 :
88 : template <typename data_t>
89 0 : SiddonsMethodCUDA<data_t>* SiddonsMethodCUDA<data_t>::_cloneImpl() const
90 : {
91 0 : return new SiddonsMethodCUDA<data_t>(_volumeDescriptor, _detectorDescriptor);
92 : }
93 :
94 : template <typename data_t>
95 0 : void SiddonsMethodCUDA<data_t>::forward(const BoundingBox& aabb, const DataContainer<data_t>& x,
96 : DataContainer<data_t>& Ax) const
97 : {
98 0 : Timer timeGuard("SiddonsMethodCUDA", "apply");
99 :
100 0 : traverseVolume<false>(aabb, (void*) &(x[0]), (void*) &(Ax[0]));
101 0 : }
102 :
103 : template <typename data_t>
104 0 : void SiddonsMethodCUDA<data_t>::backward(const BoundingBox& aabb,
105 : const DataContainer<data_t>& y,
106 : DataContainer<data_t>& Aty) const
107 : {
108 0 : Timer timeguard("SiddonsMethodCUDA", "applyAdjoint");
109 :
110 0 : traverseVolume<true>(aabb, (void*) &(Aty[0]), (void*) &(y[0]));
111 0 : }
112 :
113 : template <typename data_t>
114 0 : bool SiddonsMethodCUDA<data_t>::_isEqual(const LinearOperator<data_t>& other) const
115 : {
116 0 : if (!LinearOperator<data_t>::isEqual(other))
117 0 : return false;
118 :
119 0 : auto otherSM = downcast_safe<SiddonsMethodCUDA>(&other);
120 0 : if (!otherSM)
121 0 : return false;
122 :
123 0 : return true;
124 : }
125 :
126 : template <typename data_t>
127 : template <bool adjoint>
128 0 : void SiddonsMethodCUDA<data_t>::traverseVolume(const BoundingBox& aabb, void* volumePtr,
129 : void* sinoPtr) const
130 : {
131 0 : auto domainDims = _domainDescriptor->getNumberOfCoefficientsPerDimension();
132 0 : auto domainDimsui = domainDims.template cast<unsigned int>();
133 0 : IndexVector_t rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
134 0 : auto rangeDimsui = rangeDims.template cast<unsigned int>();
135 : cudaPitchedPtr dvolumePtr;
136 : cudaPitchedPtr dsinoPtr;
137 :
138 0 : if (_domainDescriptor->getNumberOfDimensions() == 3) {
139 : typename TraverseSiddonsCUDA<data_t, 3>::BoundingBox boundingBox;
140 0 : boundingBox._max[0] = aabb._max.template cast<uint32_t>()[0];
141 0 : boundingBox._max[1] = aabb._max.template cast<uint32_t>()[1];
142 0 : boundingBox._max[2] = aabb._max.template cast<uint32_t>()[2];
143 :
144 : // transfer volume and sinogram
145 : cudaExtent volExt =
146 0 : make_cudaExtent(domainDimsui[0] * sizeof(data_t), domainDimsui[1], domainDimsui[2]);
147 0 : if (cudaMalloc3D(&dvolumePtr, volExt) != cudaSuccess)
148 0 : throw std::bad_alloc();
149 :
150 : cudaExtent sinoExt =
151 0 : make_cudaExtent(rangeDimsui[0] * sizeof(data_t), rangeDimsui[1], rangeDimsui[2]);
152 0 : if (cudaMalloc3D(&dsinoPtr, sinoExt) != cudaSuccess)
153 0 : throw std::bad_alloc();
154 :
155 : if (adjoint) {
156 0 : copy3DDataContainerGPU<ContainerCpyKind::cpyContainerToRawGPU>(sinoPtr, dsinoPtr,
157 : sinoExt);
158 0 : if (cudaMemset3DAsync(dvolumePtr, 0, volExt) != cudaSuccess)
159 0 : throw LogicError("SiddonsMethodCUDA::traverseVolume: Could not "
160 : "zero-initialize volume on GPU.");
161 : } else {
162 0 : copy3DDataContainerGPU<ContainerCpyKind::cpyContainerToRawGPU>(volumePtr,
163 : dvolumePtr, volExt);
164 : }
165 :
166 0 : dim3 sinogramDims(rangeDimsui[2], rangeDimsui[1], rangeDimsui[0]);
167 : // synchronize because we are using multiple streams
168 0 : cudaDeviceSynchronize();
169 : if (adjoint) {
170 0 : TraverseSiddonsCUDA<data_t, 3>::traverseAdjoint(
171 0 : sinogramDims, THREADS_PER_BLOCK, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
172 0 : (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
173 0 : static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
174 0 : static_cast<uint32_t>(_projInvMatrices.pitch), boundingBox);
175 : } else {
176 0 : TraverseSiddonsCUDA<data_t, 3>::traverseForward(
177 0 : sinogramDims, THREADS_PER_BLOCK, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
178 0 : (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
179 0 : static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
180 0 : static_cast<uint32_t>(_projInvMatrices.pitch), boundingBox);
181 : }
182 : // synchonize because we are using multiple streams
183 0 : cudaDeviceSynchronize();
184 :
185 : // retrieve results from GPU
186 : if (adjoint)
187 0 : copy3DDataContainerGPU<ContainerCpyKind::cpyRawGPUToContainer, false>(
188 : volumePtr, dvolumePtr, volExt);
189 : else
190 0 : copy3DDataContainerGPU<ContainerCpyKind::cpyRawGPUToContainer, false>(
191 : sinoPtr, dsinoPtr, sinoExt);
192 :
193 : } else {
194 : typename TraverseSiddonsCUDA<data_t, 2>::BoundingBox boundingBox;
195 0 : boundingBox._max[0] = aabb._max.template cast<uint32_t>()[0];
196 0 : boundingBox._max[1] = aabb._max.template cast<uint32_t>()[1];
197 :
198 : // transfer volume and sinogram
199 :
200 0 : if (cudaMallocPitch(&dvolumePtr.ptr, &dvolumePtr.pitch,
201 0 : domainDimsui[0] * sizeof(data_t), domainDimsui[1])
202 0 : != cudaSuccess)
203 0 : throw std::bad_alloc();
204 :
205 0 : if (cudaMallocPitch(&dsinoPtr.ptr, &dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t),
206 : rangeDimsui[1])
207 0 : != cudaSuccess)
208 0 : throw std::bad_alloc();
209 :
210 : if (adjoint) {
211 0 : if (cudaMemcpy2DAsync(
212 0 : dsinoPtr.ptr, dsinoPtr.pitch, sinoPtr, rangeDimsui[0] * sizeof(data_t),
213 0 : rangeDimsui[0] * sizeof(data_t), rangeDimsui[1], cudaMemcpyDefault)
214 0 : != cudaSuccess)
215 0 : throw LogicError(
216 : "SiddonsMethodCUDA::traverseVolume: Couldn't transfer sinogram to GPU.");
217 :
218 0 : if (cudaMemset2DAsync(dvolumePtr.ptr, dvolumePtr.pitch, 0,
219 0 : domainDimsui[0] * sizeof(data_t), domainDimsui[1])
220 0 : != cudaSuccess)
221 0 : throw LogicError("SiddonsMethodCUDA::traverseVolume: Could not "
222 : "zero-initialize volume on GPU.");
223 : } else {
224 0 : if (cudaMemcpy2DAsync(dvolumePtr.ptr, dvolumePtr.pitch, volumePtr,
225 0 : domainDimsui[0] * sizeof(data_t),
226 0 : domainDimsui[0] * sizeof(data_t), domainDimsui[1],
227 : cudaMemcpyDefault)
228 0 : != cudaSuccess)
229 0 : throw LogicError(
230 : "SiddonsMethodCUDA::traverseVolume: Couldn't transfer volume to GPU.");
231 : }
232 :
233 : // perform projection
234 0 : dim3 sinogramDims(rangeDimsui[1], 1, rangeDimsui[0]);
235 : // synchronize because we are using multiple streams
236 0 : cudaDeviceSynchronize();
237 : if (adjoint) {
238 0 : TraverseSiddonsCUDA<data_t, 2>::traverseAdjoint(
239 0 : sinogramDims, THREADS_PER_BLOCK, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
240 0 : (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
241 0 : static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
242 0 : static_cast<uint32_t>(_projInvMatrices.pitch), boundingBox);
243 : } else {
244 0 : TraverseSiddonsCUDA<data_t, 2>::traverseForward(
245 0 : sinogramDims, THREADS_PER_BLOCK, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
246 0 : (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
247 0 : static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
248 0 : static_cast<uint32_t>(_projInvMatrices.pitch), boundingBox);
249 : }
250 : // synchonize because we are using multiple streams
251 0 : cudaDeviceSynchronize();
252 :
253 : // retrieve results from GPU
254 : if (adjoint) {
255 0 : if (cudaMemcpy2D(volumePtr, domainDimsui[0] * sizeof(data_t), dvolumePtr.ptr,
256 0 : dvolumePtr.pitch, domainDimsui[0] * sizeof(data_t),
257 0 : domainDimsui[1], cudaMemcpyDefault)
258 0 : != cudaSuccess)
259 0 : throw LogicError(
260 : "SiddonsMethodCUDA::traverseVolume: Couldn't retrieve results from GPU.");
261 : } else {
262 0 : if (cudaMemcpy2D(sinoPtr, rangeDimsui[0] * sizeof(data_t), dsinoPtr.ptr,
263 0 : dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t), rangeDimsui[1],
264 : cudaMemcpyDefault)
265 0 : != cudaSuccess)
266 0 : throw LogicError(
267 : "SiddonsMethodCUDA::traverseVolume: Couldn't retrieve results from GPU");
268 : }
269 : }
270 :
271 : // free allocated memory
272 0 : if (cudaFree(dvolumePtr.ptr) != cudaSuccess || cudaFree(dsinoPtr.ptr) != cudaSuccess)
273 : Logger::get("SiddonsMethodCUDA")
274 0 : ->error("Couldn't free GPU memory; This may cause problems later.");
275 0 : }
276 :
277 : template <typename data_t>
278 : template <typename SiddonsMethodCUDA<data_t>::ContainerCpyKind direction, bool async>
279 0 : void SiddonsMethodCUDA<data_t>::copy3DDataContainerGPU(void* hostData,
280 : const cudaPitchedPtr& gpuData,
281 : const cudaExtent& extent) const
282 : {
283 0 : cudaMemcpy3DParms cpyParams = {};
284 0 : cpyParams.extent = extent;
285 0 : cpyParams.kind = cudaMemcpyDefault;
286 :
287 : cudaPitchedPtr tmp =
288 0 : make_cudaPitchedPtr(hostData, extent.width, extent.width, extent.height);
289 :
290 : if (direction == ContainerCpyKind::cpyContainerToRawGPU) {
291 0 : cpyParams.dstPtr = gpuData;
292 0 : cpyParams.srcPtr = tmp;
293 : } else if (direction == ContainerCpyKind::cpyRawGPUToContainer) {
294 0 : cpyParams.srcPtr = gpuData;
295 0 : cpyParams.dstPtr = tmp;
296 : }
297 :
298 : if (async) {
299 0 : if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
300 0 : throw LogicError("Failed copying data between device and host");
301 : } else {
302 0 : if (cudaMemcpy3D(&cpyParams) != cudaSuccess)
303 0 : throw LogicError("Failed copying data between device acudaMemcpyKindnd host");
304 : }
305 0 : }
306 :
307 : // ------------------------------------------
308 : // explicit template instantiation
309 : template class SiddonsMethodCUDA<float>;
310 : template class SiddonsMethodCUDA<double>;
311 : } // namespace elsa
|