LCOV - code coverage report
Current view: top level - projectors_cuda - JosephsMethodCUDA.cpp (source / functions) Hit Total Coverage
Test: test_coverage.info.cleaned Lines: 0 319 0.0 %
Date: 2022-08-04 03:43:28 Functions: 0 24 0.0 %

          Line data    Source code
       1             : #include "JosephsMethodCUDA.h"
       2             : #include "LogGuard.h"
       3             : #include "Timer.h"
       4             : #include "TypeCasts.hpp"
       5             : 
       6             : namespace elsa
       7             : {
       8             :     template <typename data_t>
       9           0 :     JosephsMethodCUDA<data_t>::JosephsMethodCUDA(const VolumeDescriptor& domainDescriptor,
      10             :                                                  const DetectorDescriptor& rangeDescriptor,
      11             :                                                  bool fast)
      12             :         : base_type(domainDescriptor, rangeDescriptor),
      13           0 :           _detectorDescriptor(static_cast<DetectorDescriptor&>(*_rangeDescriptor)),
      14           0 :           _volumeDescriptor(static_cast<VolumeDescriptor&>(*_domainDescriptor)),
      15           0 :           _fast{fast}
      16             :     {
      17           0 :         auto dim = static_cast<std::size_t>(_domainDescriptor->getNumberOfDimensions());
      18           0 :         if (dim != static_cast<std::size_t>(_rangeDescriptor->getNumberOfDimensions())) {
      19           0 :             throw LogicError(
      20           0 :                 std::string("JosephsMethodCUDA: domain and range dimension need to match"));
      21             :         }
      22             : 
      23           0 :         if (dim != 2 && dim != 3) {
      24           0 :             throw LogicError("JosephsMethodCUDA: only supporting 2d/3d operations");
      25             :         }
      26             : 
      27           0 :         if (_detectorDescriptor.getNumberOfGeometryPoses() == 0) {
      28           0 :             throw LogicError("JosephsMethodCUDA: geometry list was empty");
      29             :         }
      30             : 
      31           0 :         const index_t numGeometry = _detectorDescriptor.getNumberOfGeometryPoses();
      32             : 
      33             :         // allocate device memory and copy ray origins and the inverse of projection matrices to
      34             :         // device
      35           0 :         cudaExtent extent = make_cudaExtent(dim * sizeof(real_t), dim, asUnsigned(numGeometry));
      36             : 
      37           0 :         if (cudaMallocPitch(&_rayOrigins.ptr, &_rayOrigins.pitch, dim * sizeof(real_t),
      38             :                             asUnsigned(numGeometry))
      39           0 :             != cudaSuccess)
      40           0 :             throw std::bad_alloc();
      41             : 
      42           0 :         _rayOrigins.xsize = dim;
      43           0 :         _rayOrigins.ysize = asUnsigned(numGeometry);
      44             : 
      45           0 :         if (cudaMalloc3D(&_projInvMatrices, extent) != cudaSuccess)
      46           0 :             throw std::bad_alloc();
      47             : 
      48           0 :         if (_fast && cudaMalloc3D(&_projMatrices, extent) != cudaSuccess)
      49           0 :             throw std::bad_alloc();
      50             : 
      51           0 :         auto projPitch = _projInvMatrices.pitch;
      52           0 :         auto* rayBasePtr = (int8_t*) _rayOrigins.ptr;
      53           0 :         auto rayPitch = _rayOrigins.pitch;
      54             : 
      55           0 :         for (unsigned i = 0; i < numGeometry; i++) {
      56           0 :             auto geometry = _detectorDescriptor.getGeometryAt(i);
      57             : 
      58           0 :             if (!geometry)
      59           0 :                 throw LogicError("JosephsMethodCUDA: Access not existing geometry pose");
      60             : 
      61           0 :             RealMatrix_t P = geometry->getInverseProjectionMatrix().block(0, 0, dim, dim);
      62           0 :             auto* slice = (int8_t*) _projInvMatrices.ptr + i * projPitch * dim;
      63             : 
      64             :             // transfer inverse of projection matrix
      65           0 :             if (cudaMemcpy2D(slice, projPitch, P.data(), dim * sizeof(real_t), dim * sizeof(real_t),
      66             :                              dim, cudaMemcpyDefault)
      67           0 :                 != cudaSuccess)
      68           0 :                 throw LogicError(
      69             :                     "JosephsMethodCUDA: Could not transfer inverse projection matrices to GPU.");
      70             : 
      71             :             // transfer projection matrix if _fast flag is set
      72           0 :             if (_fast) {
      73           0 :                 P = geometry->getProjectionMatrix().block(0, 0, dim, dim);
      74           0 :                 slice = (int8_t*) _projMatrices.ptr + i * projPitch * dim;
      75           0 :                 if (cudaMemcpy2D(slice, projPitch, P.data(), dim * sizeof(real_t),
      76             :                                  dim * sizeof(real_t), dim, cudaMemcpyDefault)
      77           0 :                     != cudaSuccess)
      78           0 :                     throw LogicError("JosephsMethodCUDA: Could not transfer "
      79             :                                      "projection matrices to GPU.");
      80             :             }
      81             : 
      82           0 :             int8_t* rayPtr = rayBasePtr + i * rayPitch;
      83             :             // get ray origin using direct inverse
      84           0 :             RealVector_t ro =
      85           0 :                 -geometry->getInverseProjectionMatrix().block(0, 0, dim, dim)
      86           0 :                 * geometry->getProjectionMatrix().block(0, static_cast<index_t>(dim), dim, 1);
      87             :             // transfer ray origin
      88           0 :             if (cudaMemcpyAsync(rayPtr, ro.data(), dim * sizeof(real_t), cudaMemcpyDefault)
      89           0 :                 != cudaSuccess)
      90           0 :                 throw LogicError("JosephsMethodCUDA: Could not transfer ray origins to GPU.");
      91             :         }
      92           0 :     }
      93             : 
      94             :     template <typename data_t>
      95           0 :     JosephsMethodCUDA<data_t>::~JosephsMethodCUDA()
      96             :     {
      97             :         // Free CUDA resources
      98           0 :         if (cudaFree(_rayOrigins.ptr) != cudaSuccess
      99           0 :             || cudaFree(_projInvMatrices.ptr) != cudaSuccess)
     100             :             Logger::get("JosephsMethodCUDA")
     101           0 :                 ->error("Couldn't free GPU memory; This may cause problems later.");
     102             : 
     103           0 :         if (_fast)
     104           0 :             if (cudaFree(_projMatrices.ptr) != cudaSuccess)
     105             :                 Logger::get("JosephsMethodCUDA")
     106           0 :                     ->error("Couldn't free GPU memory; This may cause problems later.");
     107           0 :     }
     108             : 
     109             :     template <typename data_t>
     110           0 :     JosephsMethodCUDA<data_t>* JosephsMethodCUDA<data_t>::_cloneImpl() const
     111             :     {
     112           0 :         return new JosephsMethodCUDA(*this);
     113             :     }
     114             : 
     115             :     template <typename data_t>
     116           0 :     bool JosephsMethodCUDA<data_t>::_isEqual(const LinearOperator<data_t>& other) const
     117             :     {
     118             :         // TODO: only need Geometry vector stored internally for comparisons, use kernels instead
     119           0 :         if (!LinearOperator<data_t>::isEqual(other))
     120           0 :             return false;
     121             : 
     122           0 :         auto otherJM = downcast_safe<JosephsMethodCUDA>(&other);
     123           0 :         if (!otherJM)
     124           0 :             return false;
     125             : 
     126           0 :         if (_fast != otherJM->_fast)
     127           0 :             return false;
     128             : 
     129           0 :         return true;
     130             :     }
     131             : 
     132             :     template <typename data_t>
     133           0 :     void JosephsMethodCUDA<data_t>::forward(const BoundingBox& aabb, const DataContainer<data_t>& x,
     134             :                                             DataContainer<data_t>& Ax) const
     135             :     {
     136           0 :         Timer timeGuard("JosephsMethodCUDA", "apply");
     137             : 
     138             :         // transfer volume as texture
     139           0 :         auto [volumeTex, volume] = copyTextureToGPU(x);
     140             : 
     141             :         // allocate memory for projections
     142             :         cudaPitchedPtr dsinoPtr;
     143           0 :         index_t dim = _domainDescriptor->getNumberOfDimensions();
     144           0 :         IndexVector_t rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
     145             : 
     146           0 :         auto rangeDimsui = rangeDims.template cast<unsigned int>();
     147             : 
     148           0 :         if (dim == 3) {
     149             :             cudaExtent sinoExt =
     150           0 :                 make_cudaExtent(rangeDimsui[0] * sizeof(data_t), rangeDimsui[1], rangeDimsui[2]);
     151           0 :             if (cudaMalloc3D(&dsinoPtr, sinoExt) != cudaSuccess)
     152           0 :                 throw std::bad_alloc();
     153             : 
     154           0 :             IndexVector_t bmax = aabb._max.template cast<index_t>();
     155             :             typename TraverseJosephsCUDA<data_t, 3>::BoundingBox boxMax;
     156           0 :             boxMax._max[0] = static_cast<real_t>(bmax[0]);
     157           0 :             boxMax._max[1] = static_cast<real_t>(bmax[1]);
     158           0 :             boxMax._max[2] = static_cast<real_t>(bmax[2]);
     159             : 
     160             :             // synchronize because we are using multiple streams
     161           0 :             cudaDeviceSynchronize();
     162             :             // perform projection
     163           0 :             const dim3 sinogramDims(rangeDimsui[2], rangeDimsui[1], rangeDimsui[0]);
     164           0 :             TraverseJosephsCUDA<data_t, 3>::traverseForward(
     165           0 :                 sinogramDims, THREADS_PER_BLOCK, volumeTex, (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch,
     166           0 :                 (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
     167           0 :                 (int8_t*) _projInvMatrices.ptr, static_cast<uint32_t>(_projInvMatrices.pitch),
     168             :                 boxMax);
     169           0 :             cudaDeviceSynchronize();
     170             : 
     171             :             // retrieve results from GPU
     172           0 :             copy3DDataContainer<ContainerCpyKind::cpyRawGPUToContainer, false>((void*) &Ax[0],
     173             :                                                                                dsinoPtr, sinoExt);
     174           0 :         } else {
     175           0 :             if (cudaMallocPitch(&dsinoPtr.ptr, &dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t),
     176             :                                 rangeDimsui[1])
     177           0 :                 != cudaSuccess)
     178           0 :                 throw std::bad_alloc();
     179             : 
     180           0 :             IndexVector_t bmax = aabb._max.template cast<index_t>();
     181             :             typename TraverseJosephsCUDA<data_t, 2>::BoundingBox boxMax;
     182           0 :             boxMax._max[0] = static_cast<real_t>(bmax[0]);
     183           0 :             boxMax._max[1] = static_cast<real_t>(bmax[1]);
     184             : 
     185             :             // synchronize because we are using multiple streams
     186           0 :             cudaDeviceSynchronize();
     187           0 :             const dim3 sinogramDims(rangeDimsui[1], 1, rangeDimsui[0]);
     188           0 :             TraverseJosephsCUDA<data_t, 2>::traverseForward(
     189           0 :                 sinogramDims, THREADS_PER_BLOCK, volumeTex, (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch,
     190           0 :                 (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
     191           0 :                 (int8_t*) _projInvMatrices.ptr, static_cast<uint32_t>(_projInvMatrices.pitch),
     192             :                 boxMax);
     193           0 :             cudaDeviceSynchronize();
     194             : 
     195             :             // retrieve results from GPU
     196           0 :             if (cudaMemcpy2D((void*) &Ax[0], rangeDimsui[0] * sizeof(data_t), dsinoPtr.ptr,
     197           0 :                              dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t), rangeDimsui[1],
     198             :                              cudaMemcpyDefault)
     199           0 :                 != cudaSuccess)
     200           0 :                 throw LogicError("JosephsMethodCUDA::apply: Couldn't retrieve results from GPU");
     201           0 :         }
     202             : 
     203           0 :         if (cudaDestroyTextureObject(volumeTex) != cudaSuccess)
     204             :             Logger::get("JosephsMethodCUDA")
     205           0 :                 ->error("Couldn't destroy texture object; This may cause problems later.");
     206             : 
     207           0 :         if (cudaFreeArray(volume) != cudaSuccess)
     208             :             Logger::get("JosephsMethodCUDA")
     209           0 :                 ->error("Couldn't free GPU memory; This may cause problems later.");
     210             : 
     211           0 :         if (cudaFree(dsinoPtr.ptr) != cudaSuccess)
     212             :             Logger::get("JosephsMethodCUDA")
     213           0 :                 ->error("Couldn't free GPU memory; This may cause problems later.");
     214           0 :     }
     215             : 
     216             :     template <typename data_t>
     217           0 :     void JosephsMethodCUDA<data_t>::backward(const BoundingBox& aabb,
     218             :                                              const DataContainer<data_t>& y,
     219             :                                              DataContainer<data_t>& Aty) const
     220             :     {
     221           0 :         Timer timeguard("JosephsMethodCUDA", "applyAdjoint");
     222             : 
     223             :         // allocate memory for volume
     224           0 :         auto domainDims = _domainDescriptor->getNumberOfCoefficientsPerDimension();
     225           0 :         auto domainDimsui = domainDims.template cast<unsigned int>();
     226             : 
     227           0 :         auto rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
     228           0 :         auto rangeDimsui = rangeDims.template cast<unsigned int>();
     229             : 
     230           0 :         index_t dim = _domainDescriptor->getNumberOfDimensions();
     231             : 
     232             :         cudaPitchedPtr dvolumePtr;
     233           0 :         if (dim == 3) {
     234             :             cudaExtent volExt =
     235           0 :                 make_cudaExtent(domainDimsui[0] * sizeof(data_t), domainDimsui[1], domainDimsui[2]);
     236           0 :             if (cudaMalloc3D(&dvolumePtr, volExt) != cudaSuccess)
     237           0 :                 throw std::bad_alloc();
     238             : 
     239             :             // transfer projections
     240           0 :             if (_fast) {
     241           0 :                 auto [sinoTex, sino] = copyTextureToGPU<cudaArrayLayered>(y);
     242             : 
     243           0 :                 cudaDeviceSynchronize();
     244           0 :                 const dim3 volumeDims(domainDimsui[2], domainDimsui[1], domainDimsui[0]);
     245           0 :                 const int threads = THREADS_PER_BLOCK;
     246             : 
     247           0 :                 TraverseJosephsCUDA<data_t, 3>::traverseAdjointFast(
     248           0 :                     volumeDims, threads, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch, sinoTex,
     249           0 :                     (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
     250           0 :                     (int8_t*) _projMatrices.ptr, static_cast<uint32_t>(_projMatrices.pitch),
     251             :                     rangeDimsui[2]);
     252           0 :                 cudaDeviceSynchronize();
     253             : 
     254           0 :                 if (cudaDestroyTextureObject(sinoTex) != cudaSuccess)
     255             :                     Logger::get("JosephsMethodCUDA")
     256           0 :                         ->error("Couldn't destroy texture object; This may cause problems later.");
     257             : 
     258           0 :                 if (cudaFreeArray(sino) != cudaSuccess)
     259             :                     Logger::get("JosephsMethodCUDA")
     260           0 :                         ->error("Couldn't free GPU memory; This may cause problems later.");
     261             :             } else {
     262           0 :                 if (cudaMemset3DAsync(dvolumePtr, 0, volExt) != cudaSuccess)
     263           0 :                     throw LogicError("JosephsMethodCUDA::applyAdjoint: Could not "
     264             :                                      "zero-initialize volume on GPU.");
     265             : 
     266             :                 cudaPitchedPtr dsinoPtr;
     267           0 :                 cudaExtent sinoExt = make_cudaExtent(rangeDimsui[0] * sizeof(data_t),
     268           0 :                                                      rangeDimsui[1], rangeDimsui[2]);
     269             : 
     270           0 :                 if (cudaMalloc3D(&dsinoPtr, sinoExt) != cudaSuccess)
     271           0 :                     throw std::bad_alloc();
     272             : 
     273           0 :                 copy3DDataContainer<ContainerCpyKind::cpyContainerToRawGPU>((void*) &y[0], dsinoPtr,
     274             :                                                                             sinoExt);
     275             : 
     276             :                 // perform projection
     277             : 
     278           0 :                 IndexVector_t bmax = aabb._max.template cast<index_t>();
     279             :                 typename TraverseJosephsCUDA<data_t, 3>::BoundingBox boxMax;
     280           0 :                 boxMax._max[0] = static_cast<real_t>(bmax[0]);
     281           0 :                 boxMax._max[1] = static_cast<real_t>(bmax[1]);
     282           0 :                 boxMax._max[2] = static_cast<real_t>(bmax[2]);
     283             : 
     284             :                 // synchronize because we are using multiple streams
     285           0 :                 cudaDeviceSynchronize();
     286           0 :                 const dim3 sinogramDims(rangeDimsui[2], rangeDimsui[1], rangeDimsui[0]);
     287           0 :                 TraverseJosephsCUDA<data_t, 3>::traverseAdjoint(
     288           0 :                     sinogramDims, THREADS_PER_BLOCK, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
     289           0 :                     (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
     290           0 :                     static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
     291           0 :                     static_cast<uint32_t>(_projInvMatrices.pitch), boxMax);
     292             :                 // synchronize because we are using multiple streams
     293           0 :                 cudaDeviceSynchronize();
     294             : 
     295             :                 // free allocated memory
     296           0 :                 if (cudaFree(dsinoPtr.ptr) != cudaSuccess)
     297             :                     Logger::get("JosephsMethodCUDA")
     298           0 :                         ->error("Couldn't free GPU memory; This may cause problems later.");
     299           0 :             }
     300             : 
     301             :             // retrieve results from GPU
     302           0 :             copy3DDataContainer<ContainerCpyKind::cpyRawGPUToContainer, false>((void*) &Aty[0],
     303             :                                                                                dvolumePtr, volExt);
     304             :         } else {
     305           0 :             if (cudaMallocPitch(&dvolumePtr.ptr, &dvolumePtr.pitch,
     306           0 :                                 domainDimsui[0] * sizeof(data_t), domainDimsui[1])
     307           0 :                 != cudaSuccess)
     308           0 :                 throw std::bad_alloc();
     309             : 
     310           0 :             if (_fast) {
     311             :                 // transfer projections
     312           0 :                 auto [sinoTex, sino] = copyTextureToGPU<cudaArrayLayered>(y);
     313             : 
     314           0 :                 cudaDeviceSynchronize();
     315           0 :                 const dim3 volumeDims(1, domainDimsui[1], domainDimsui[0]);
     316           0 :                 const int threads = THREADS_PER_BLOCK;
     317             : 
     318           0 :                 TraverseJosephsCUDA<data_t, 2>::traverseAdjointFast(
     319           0 :                     volumeDims, threads, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch, sinoTex,
     320           0 :                     (int8_t*) _rayOrigins.ptr, static_cast<uint32_t>(_rayOrigins.pitch),
     321           0 :                     (int8_t*) _projMatrices.ptr, static_cast<uint32_t>(_projMatrices.pitch),
     322             :                     rangeDimsui[dim - 1]);
     323           0 :                 cudaDeviceSynchronize();
     324             : 
     325           0 :                 if (cudaDestroyTextureObject(sinoTex) != cudaSuccess)
     326             :                     Logger::get("JosephsMethodCUDA")
     327           0 :                         ->error("Couldn't destroy texture object; This may cause problems later.");
     328             : 
     329           0 :                 if (cudaFreeArray(sino) != cudaSuccess)
     330             :                     Logger::get("JosephsMethodCUDA")
     331           0 :                         ->error("Couldn't free GPU memory; This may cause problems later.");
     332             :             } else {
     333           0 :                 if (cudaMemset2DAsync(dvolumePtr.ptr, dvolumePtr.pitch, 0,
     334           0 :                                       domainDimsui[0] * sizeof(data_t), domainDimsui[1])
     335           0 :                     != cudaSuccess)
     336           0 :                     throw LogicError("JosephsMethodCUDA::applyAdjoint: Could not "
     337             :                                      "zero-initialize volume on GPU.");
     338             : 
     339             :                 cudaPitchedPtr dsinoPtr;
     340           0 :                 IndexVector_t rangeDims = _rangeDescriptor->getNumberOfCoefficientsPerDimension();
     341           0 :                 if (cudaMallocPitch(&dsinoPtr.ptr, &dsinoPtr.pitch, rangeDimsui[0] * sizeof(data_t),
     342           0 :                                     rangeDimsui[1])
     343           0 :                     != cudaSuccess)
     344           0 :                     throw std::bad_alloc();
     345             : 
     346           0 :                 if (cudaMemcpy2DAsync(dsinoPtr.ptr, dsinoPtr.pitch, (void*) &y[0],
     347           0 :                                       rangeDimsui[0] * sizeof(data_t),
     348           0 :                                       rangeDimsui[0] * sizeof(data_t), rangeDimsui[1],
     349             :                                       cudaMemcpyDefault)
     350           0 :                     != cudaSuccess)
     351           0 :                     throw LogicError(
     352             :                         "JosephsMethodCUDA::applyAdjoint: Couldn't transfer sinogram to GPU.");
     353             : 
     354           0 :                 IndexVector_t bmax = aabb._max.template cast<index_t>();
     355             :                 typename TraverseJosephsCUDA<data_t, 2>::BoundingBox boxMax;
     356           0 :                 boxMax._max[0] = static_cast<real_t>(bmax[0]);
     357           0 :                 boxMax._max[1] = static_cast<real_t>(bmax[1]);
     358             :                 // synchronize because we are using multiple streams
     359           0 :                 cudaDeviceSynchronize();
     360           0 :                 const dim3 sinogramDims(rangeDimsui[1], 1, rangeDimsui[0]);
     361           0 :                 const int threads = THREADS_PER_BLOCK;
     362           0 :                 TraverseJosephsCUDA<data_t, 2>::traverseAdjoint(
     363           0 :                     sinogramDims, threads, (int8_t*) dvolumePtr.ptr, dvolumePtr.pitch,
     364           0 :                     (int8_t*) dsinoPtr.ptr, dsinoPtr.pitch, (int8_t*) _rayOrigins.ptr,
     365           0 :                     static_cast<uint32_t>(_rayOrigins.pitch), (int8_t*) _projInvMatrices.ptr,
     366           0 :                     static_cast<uint32_t>(_projInvMatrices.pitch), boxMax);
     367             :                 // synchronize because we are using multiple streams
     368           0 :                 cudaDeviceSynchronize();
     369             : 
     370             :                 // free allocated memory
     371           0 :                 if (cudaFree(dsinoPtr.ptr) != cudaSuccess)
     372             :                     Logger::get("JosephsMethodCUDA")
     373           0 :                         ->error("Couldn't free GPU memory; This may cause problems later.");
     374           0 :             }
     375             : 
     376             :             // retrieve results from GPU
     377           0 :             if (cudaMemcpy2D((void*) &Aty[0], domainDimsui[0] * sizeof(data_t), dvolumePtr.ptr,
     378           0 :                              dvolumePtr.pitch, sizeof(data_t) * domainDimsui[0], domainDimsui[1],
     379             :                              cudaMemcpyDefault)
     380           0 :                 != cudaSuccess)
     381           0 :                 throw LogicError(
     382             :                     "JosephsMethodCUDA::applyAdjoint: Couldn't retrieve results from GPU");
     383             :         }
     384             : 
     385             :         // free allocated memory
     386           0 :         if (cudaFree(dvolumePtr.ptr) != cudaSuccess)
     387             :             Logger::get("JosephsMethodCUDA")
     388           0 :                 ->error("Couldn't free GPU memory; This may cause problems later.");
     389           0 :     }
     390             : 
     391             :     template <typename data_t>
     392           0 :     JosephsMethodCUDA<data_t>::JosephsMethodCUDA(const JosephsMethodCUDA<data_t>& other)
     393           0 :         : base_type(other._volumeDescriptor, other._detectorDescriptor),
     394           0 :           _detectorDescriptor(downcast<DetectorDescriptor>(*_rangeDescriptor)),
     395           0 :           _volumeDescriptor(downcast<VolumeDescriptor>(*_domainDescriptor)),
     396           0 :           _fast{other._fast}
     397             :     {
     398           0 :         auto dim = static_cast<std::size_t>(_domainDescriptor->getNumberOfDimensions());
     399           0 :         auto numAngles = static_cast<std::size_t>(
     400           0 :             _rangeDescriptor->getNumberOfCoefficientsPerDimension()[static_cast<index_t>(dim) - 1]);
     401             : 
     402           0 :         cudaExtent extent = make_cudaExtent(dim * sizeof(real_t), dim, numAngles);
     403             : 
     404           0 :         if (cudaMallocPitch(&_rayOrigins.ptr, &_rayOrigins.pitch, dim * sizeof(real_t), numAngles)
     405           0 :             != cudaSuccess)
     406           0 :             throw std::bad_alloc();
     407             : 
     408           0 :         _rayOrigins.xsize = dim;
     409           0 :         _rayOrigins.ysize = asUnsigned(_detectorDescriptor.getNumberOfGeometryPoses());
     410             : 
     411           0 :         if (cudaMalloc3D(&_projInvMatrices, extent) != cudaSuccess)
     412           0 :             throw std::bad_alloc();
     413             : 
     414           0 :         if (cudaMemcpyAsync(_projInvMatrices.ptr, other._projInvMatrices.ptr,
     415           0 :                             _projInvMatrices.pitch * dim * numAngles, cudaMemcpyDefault)
     416           0 :             != cudaSuccess)
     417           0 :             throw LogicError(
     418             :                 "JosephsMethodCUDA: Could not transfer inverse projection matrices to GPU.");
     419             : 
     420           0 :         if (cudaMemcpyAsync(_rayOrigins.ptr, other._rayOrigins.ptr, _rayOrigins.pitch * numAngles,
     421             :                             cudaMemcpyDefault)
     422           0 :             != cudaSuccess)
     423           0 :             throw LogicError("JosephsMethodCUDA: Could not transfer ray origins to GPU.");
     424             : 
     425           0 :         if (_fast) {
     426           0 :             if (cudaMalloc3D(&_projMatrices, extent) != cudaSuccess)
     427           0 :                 throw std::bad_alloc();
     428             : 
     429           0 :             if (cudaMemcpyAsync(_projMatrices.ptr, other._projMatrices.ptr,
     430           0 :                                 _projMatrices.pitch * dim * numAngles, cudaMemcpyDefault)
     431           0 :                 != cudaSuccess)
     432           0 :                 throw LogicError(
     433             :                     "JosephsMethodCUDA: Could not transfer projection matrices to GPU.");
     434             :         }
     435           0 :     }
     436             : 
     437             :     template <typename data_t>
     438             :     template <typename JosephsMethodCUDA<data_t>::ContainerCpyKind direction, bool async>
     439           0 :     void JosephsMethodCUDA<data_t>::copy3DDataContainer(void* hostData,
     440             :                                                         const cudaPitchedPtr& gpuData,
     441             :                                                         const cudaExtent& extent) const
     442             :     {
     443           0 :         cudaMemcpy3DParms cpyParams = {};
     444           0 :         cpyParams.extent = extent;
     445           0 :         cpyParams.kind = cudaMemcpyDefault;
     446             :         cudaPitchedPtr tmp =
     447           0 :             make_cudaPitchedPtr(hostData, extent.width, extent.width, extent.height);
     448             : 
     449             :         if (direction == ContainerCpyKind::cpyContainerToRawGPU) {
     450           0 :             cpyParams.dstPtr = gpuData;
     451           0 :             cpyParams.srcPtr = tmp;
     452             :         } else if (direction == ContainerCpyKind::cpyRawGPUToContainer) {
     453           0 :             cpyParams.srcPtr = gpuData;
     454           0 :             cpyParams.dstPtr = tmp;
     455             :         }
     456             : 
     457             :         if (async) {
     458           0 :             if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
     459           0 :                 throw LogicError("Failed copying data between device and host");
     460             :         } else {
     461           0 :             if (cudaMemcpy3D(&cpyParams) != cudaSuccess)
     462           0 :                 throw LogicError("Failed copying data between device and host");
     463             :         }
     464           0 :     }
     465             : 
     466             :     template <typename data_t>
     467             :     template <unsigned int flags>
     468             :     std::pair<cudaTextureObject_t, cudaArray*>
     469           0 :         JosephsMethodCUDA<data_t>::copyTextureToGPU(const DataContainer<data_t>& hostData) const
     470             :     {
     471             :         // transfer volume as texture
     472           0 :         auto domainDims = hostData.getDataDescriptor().getNumberOfCoefficientsPerDimension();
     473           0 :         auto domainDimsui = domainDims.template cast<unsigned int>();
     474             : 
     475             :         cudaArray* volume;
     476           0 :         cudaTextureObject_t volumeTex = 0;
     477             : 
     478             :         cudaChannelFormatDesc channelDesc;
     479             : 
     480             :         if constexpr (sizeof(data_t) == 4)
     481             :             channelDesc =
     482           0 :                 cudaCreateChannelDesc(sizeof(data_t) * 8, 0, 0, 0, cudaChannelFormatKindFloat);
     483             :         else if (sizeof(data_t) == 8)
     484           0 :             channelDesc = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
     485             :         else
     486             :             throw InvalidArgumentError("JosephsMethodCUDA::copyTextureToGPU: only supports "
     487             :                                        "DataContainer<data_t> with data_t of length 4 or 8 bytes");
     488             : 
     489           0 :         if (hostData.getDataDescriptor().getNumberOfDimensions() == 3) {
     490             :             cudaExtent volumeExtent =
     491           0 :                 make_cudaExtent(domainDimsui[0], domainDimsui[1], domainDimsui[2]);
     492           0 :             if (cudaMalloc3DArray(&volume, &channelDesc, volumeExtent, flags) != cudaSuccess)
     493           0 :                 throw std::bad_alloc();
     494           0 :             cudaMemcpy3DParms cpyParams = {};
     495           0 :             cpyParams.srcPtr.ptr = (void*) &hostData[0];
     496           0 :             cpyParams.srcPtr.pitch = domainDimsui[0] * sizeof(data_t);
     497           0 :             cpyParams.srcPtr.xsize = domainDimsui[0] * sizeof(data_t);
     498           0 :             cpyParams.srcPtr.ysize = domainDimsui[1];
     499           0 :             cpyParams.dstArray = volume;
     500           0 :             cpyParams.extent = volumeExtent;
     501           0 :             cpyParams.kind = cudaMemcpyDefault;
     502             : 
     503           0 :             if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
     504           0 :                 throw LogicError(
     505             :                     "JosephsMethodCUDA::copyTextureToGPU: Could not transfer data to GPU.");
     506             :         } else {
     507             :             // CUDA has a very weird way of handling layered 1D arrays
     508             :             if (flags == cudaArrayLayered) {
     509             : 
     510             :                 // must be allocated as a 3D Array of height 0
     511           0 :                 cudaExtent volumeExtent = make_cudaExtent(domainDimsui[0], 0, domainDimsui[1]);
     512           0 :                 if (cudaMalloc3DArray(&volume, &channelDesc, volumeExtent, flags) != cudaSuccess)
     513           0 :                     throw std::bad_alloc();
     514             : 
     515             :                 // adjust height to 1 for copy
     516           0 :                 volumeExtent.height = 1;
     517           0 :                 cudaMemcpy3DParms cpyParams = {};
     518           0 :                 cpyParams.srcPos = make_cudaPos(0, 0, 0);
     519           0 :                 cpyParams.dstPos = make_cudaPos(0, 0, 0);
     520           0 :                 cpyParams.srcPtr = make_cudaPitchedPtr(
     521           0 :                     (void*) &hostData[0], domainDimsui[0] * sizeof(data_t), domainDimsui[0], 1);
     522           0 :                 cpyParams.dstArray = volume;
     523           0 :                 cpyParams.extent = volumeExtent;
     524           0 :                 cpyParams.kind = cudaMemcpyDefault;
     525             : 
     526           0 :                 if (cudaMemcpy3DAsync(&cpyParams) != cudaSuccess)
     527           0 :                     throw LogicError(
     528             :                         "JosephsMethodCUDA::copyTextureToGPU: Could not transfer data to GPU.");
     529             :             } else {
     530           0 :                 if (cudaMallocArray(&volume, &channelDesc, domainDimsui[0], domainDimsui[1], flags)
     531           0 :                     != cudaSuccess)
     532           0 :                     throw std::bad_alloc();
     533             : 
     534           0 :                 if (cudaMemcpy2DToArrayAsync(
     535           0 :                         volume, 0, 0, &hostData[0], domainDimsui[0] * sizeof(data_t),
     536           0 :                         domainDimsui[0] * sizeof(data_t), domainDimsui[1], cudaMemcpyDefault)
     537           0 :                     != cudaSuccess)
     538           0 :                     throw LogicError(
     539             :                         "JosephsMethodCUDA::copyTextureToGPU: Could not transfer data to GPU.");
     540             :             }
     541             :         }
     542             : 
     543             :         cudaResourceDesc resDesc;
     544           0 :         std::memset(&resDesc, 0, sizeof(resDesc));
     545           0 :         resDesc.resType = cudaResourceTypeArray;
     546           0 :         resDesc.res.array.array = volume;
     547             : 
     548             :         cudaTextureDesc texDesc;
     549           0 :         std::memset(&texDesc, 0, sizeof(texDesc));
     550           0 :         texDesc.addressMode[0] = flags ? cudaAddressModeBorder : cudaAddressModeClamp;
     551           0 :         texDesc.addressMode[1] = flags ? cudaAddressModeBorder : cudaAddressModeClamp;
     552           0 :         texDesc.addressMode[2] = flags ? cudaAddressModeBorder : cudaAddressModeClamp;
     553           0 :         texDesc.filterMode = sizeof(data_t) == 4 ? cudaFilterModeLinear : cudaFilterModePoint;
     554           0 :         texDesc.readMode = cudaReadModeElementType;
     555           0 :         texDesc.normalizedCoords = 0;
     556             : 
     557           0 :         if (cudaCreateTextureObject(&volumeTex, &resDesc, &texDesc, nullptr) != cudaSuccess)
     558           0 :             throw LogicError("Couldn't create texture object");
     559             : 
     560           0 :         return std::pair<cudaTextureObject_t, cudaArray*>(volumeTex, volume);
     561           0 :     }
     562             : 
     563             :     // ------------------------------------------
     564             :     // explicit template instantiation
     565             :     template class JosephsMethodCUDA<float>;
     566             :     template class JosephsMethodCUDA<double>;
     567             : } // namespace elsa

Generated by: LCOV version 1.14