MC-GPU
MC-GPU_kernel_v1.3.cu
Go to the documentation of this file.
00001 
00002 ////////////////////////////////////////////////////////////////////////////////
00003 //
00004 //              ****************************
00005 //              *** MC-GPU , version 1.3 ***
00006 //              ****************************
00007 //                                          
00008 //!  Definition of the CUDA GPU kernel for the simulation of x ray tracks in a voxelized geometry.
00009 //!  This kernel has been optimized to yield a good performance in the GPU but can still be
00010 //!  compiled in the CPU without problems. All the CUDA especific commands are enclosed in
00011 //!  pre-processor directives that are skipped if the parameter "USING_CUDA" is not defined
00012 //!  at compilation time.
00013 //
00014 //        ** DISCLAIMER **
00015 //
00016 // This software and documentation (the "Software") were developed at the Food and
00017 // Drug Administration (FDA) by employees of the Federal Government in the course
00018 // of their official duties. Pursuant to Title 17, Section 105 of the United States
00019 // Code, this work is not subject to copyright protection and is in the public
00020 // domain. Permission is hereby granted, free of charge, to any person obtaining a
00021 // copy of the Software, to deal in the Software without restriction, including
00022 // without limitation the rights to use, copy, modify, merge, publish, distribute,
00023 // sublicense, or sell copies of the Software or derivatives, and to permit persons
00024 // to whom the Software is furnished to do so. FDA assumes no responsibility
00025 // whatsoever for use by other parties of the Software, its source code,
00026 // documentation or compiled executables, and makes no guarantees, expressed or
00027 // implied, about its quality, reliability, or any other characteristic. Further,
00028 // use of this code in no way implies endorsement by the FDA or confers any
00029 // advantage in regulatory decisions.  Although this software can be redistributed
00030 // and/or modified freely, we ask that any derivative works bear some notice that
00031 // they are derived from it, and any modified versions bear some notice that they
00032 // have been modified.
00033 //                                                                            
00034 //
00035 //!                     @file    MC-GPU_kernel_v1.3.cu
00036 //!                     @author  Andreu Badal (Andreu.Badal-Soler@fda.hhs.gov)
00037 //!                     @date    2012/12/12
00038 //                       -- Original code started on:  2009/04/14
00039 //
00040 ////////////////////////////////////////////////////////////////////////////////
00041 
00042 
00043 
00044 ////////////////////////////////////////////////////////////////////////////////
00045 //!  Initialize the image array, ie, set all pixels to zero
00046 //!  Essentially, this function has the same effect as the command: 
00047 //!   "cutilSafeCall(cudaMemcpy(image_device, image, image_bytes, cudaMemcpyHostToDevice))";
00048 //!  
00049 //!  CUDA performs some initialization work the first time a GPU kernel is called.
00050 //!  Therefore, calling a short kernel before the real particle tracking is performed
00051 //!  may improve the accuracy of the timing measurements in the relevant kernel.
00052 //!  
00053 //!       @param[in,out] image   Pointer to the image array.
00054 //!       @param[in] pixels_per_image  Number of pixels in the image (ie, elements in the array).
00055 ////////////////////////////////////////////////////////////////////////////////
00056 #ifdef USING_CUDA
00057 __global__
00058 void init_image_array_GPU(unsigned long long int* image, int pixels_per_image)
00059 {
00060   int my_pixel = threadIdx.x + blockIdx.x*blockDim.x;
00061   if (my_pixel < pixels_per_image)
00062   {
00063     // -- Set the current pixel to 0 and return, avoiding overflow when more threads than pixels are used:
00064     image[my_pixel] = (unsigned long long int)(0);    // Initialize non-scatter image
00065     my_pixel += pixels_per_image;                     //  (advance to next image)
00066     image[my_pixel] = (unsigned long long int)(0);    // Initialize Compton image
00067     my_pixel += pixels_per_image;                     //  (advance to next image)
00068     image[my_pixel] = (unsigned long long int)(0);    // Initialize Rayleigh image
00069     my_pixel += pixels_per_image;                     //  (advance to next image)
00070     image[my_pixel] = (unsigned long long int)(0);    // Initialize multi-scatter image
00071   }
00072 }
00073 
00074 // ////////////////////////////////////////////////////////////////////////////////
00075 // //!  Initialize the dose deposition array, ie, set all voxel doses to zero
00076 // //!  
00077 // //!       @param[in,out] dose   Pointer to the dose mean and sigma arrays.
00078 // //!       @param[in] num_voxels_dose  Number of voxels in the dose ROI (ie, elements in the arrays).
00079 // ////////////////////////////////////////////////////////////////////////////////
00080 // __global__
00081 // void init_dose_array_GPU(ulonglong2* voxels_Edep, int num_voxels_dose)
00082 // {  
00083 //   int my_voxel = threadIdx.x + blockIdx.x*blockDim.x;
00084 //   register ulonglong2 ulonglong2_zero;
00085 //   ulonglong2_zero.x = ulonglong2_zero.y = (unsigned long long int) 0;
00086 //   if (my_voxel < num_voxels_dose)
00087 //   {
00088 //     dose[my_voxel] = ulonglong2_zero;    // Set the current voxel to (0,0) and return, avoiding overflow
00089 //   }
00090 // }
00091 
00092 #endif
00093 
00094  
00095 ////////////////////////////////////////////////////////////////////////////////
00096 //!  Main function to simulate x-ray tracks inside a voxelized geometry.
00097 //!  Secondary electrons are not simulated (in photoelectric and Compton 
00098 //!  events the energy is locally deposited).
00099 //!
00100 //!  The following global variables, in  the GPU __constant__ memory are used:
00101 //!           voxel_data_CONST, 
00102 //!           source_energy_data_CONST,
00103 //!           detector_data_CONST, 
00104 //!           mfp_table_data_CONST.
00105 //!
00106 //!       @param[in] history_batch  Particle batch number (only used in the CPU version when CUDA is disabled!, the GPU uses the built-in variable threadIdx)
00107 //!       @param[in] num_p  Projection number in the CT simulation. This variable defines a specific angle and the corresponding source and detector will be used.
00108 //!       @param[in] histories_per_thread   Number of histories to simulate for each call to this function (ie, for GPU thread).
00109 //!       @param[in] seed_input   Random number generator seed (the same seed is used to initialize the two MLCGs of RANECU).
00110 //!       @param[in] voxel_mat_dens   Pointer to the voxel densities and material vector (the voxelized geometry), stored in GPU glbal memory.
00111 //!       @param[in] mfp_Woodcock_table    Two parameter table for the linear interpolation of the Woodcock mean free path (MFP) (stored in GPU global memory).
00112 //!       @param[in] mfp_table_a   First element for the linear interpolation of the interaction mean free paths (stored in GPU global memory).
00113 //!       @param[in] mfp_table_b   Second element for the linear interpolation of the interaction mean free paths (stored in GPU global memory).
00114 //!       @param[in] rayleigh_table   Pointer to the table with the data required by the Rayleigh interaction sampling, stored in GPU global memory.
00115 //!       @param[in] compton_table   Pointer to the table with the data required by the Compton interaction sampling, stored in GPU global memory.
00116 //!       @param[in,out] image   Pointer to the image vector in the GPU global memory.
00117 //!       @param[in,out] dose   Pointer to the array containing the 3D voxel dose (and its uncertainty) in the GPU global memory.
00118 ////////////////////////////////////////////////////////////////////////////////
00119 #ifdef USING_CUDA
00120 __global__ void track_particles(int histories_per_thread,
00121                                 int num_p,      // For a CT simulation: allocate space for up to MAX_NUM_PROJECTIONS projections.
00122                                 int seed_input,
00123                                 unsigned long long int* image,
00124                                 ulonglong2* voxels_Edep,
00125                                 float2* voxel_mat_dens,
00126                                 float2* mfp_Woodcock_table,
00127                                 float3* mfp_table_a,
00128                                 float3* mfp_table_b,
00129                                 struct rayleigh_struct* rayleigh_table,
00130                                 struct compton_struct* compton_table,
00131                                 struct detector_struct* detector_data_array,
00132                                 struct source_struct* source_data_array, 
00133                                 ulonglong2* materials_dose)
00134 #else
00135            void track_particles(int history_batch,             // This variable is not required in the GPU, it uses the thread ID           
00136                                 int histories_per_thread,
00137                                 int num_p,
00138                                 int seed_input,
00139                                 unsigned long long int* image,
00140                                 ulonglong2* voxels_Edep,
00141                                 float2* voxel_mat_dens,
00142                                 float2* mfp_Woodcock_table,
00143                                 float3* mfp_table_a,
00144                                 float3* mfp_table_b,
00145                                 struct rayleigh_struct* rayleigh_table,
00146                                 struct compton_struct* compton_table,
00147                                 struct detector_struct* detector_data_array,
00148                                 struct source_struct* source_data_array, 
00149                                 ulonglong2* materials_dose)
00150 #endif
00151 {
00152   // -- Declare the track state variables:
00153   float3 position, direction;
00154   float energy, step, prob, randno, mfp_density, mfp_Woodcock;
00155   float3 mfp_table_read_a, mfp_table_read_b;
00156   int2 seed;
00157   int index;
00158   int material0,        // Current material, starting at 0 for 1st material
00159       material_old;     // Flag to mark a material or energy change
00160   signed char scatter_state;    // Flag for scatter images: scatter_state=0 for non-scattered, =1 for Compton, =2 for Rayleigh, and =3 for multiple scatter.
00161 
00162   // -- Store the Compton table in shared memory from global memory:
00163   //    For Compton and Rayleigh the access to memory is not coherent and the caching capability do not speeds up the accesses, they actually slows down the acces to other data.
00164 #ifdef USING_CUDA
00165   __shared__
00166 #endif
00167   struct compton_struct cgco_SHARED;  
00168 #ifdef USING_CUDA
00169   __shared__
00170 #endif
00171   struct detector_struct detector_data_SHARED;
00172 #ifdef USING_CUDA
00173   __shared__
00174 #endif 
00175   struct source_struct source_data_SHARED;    
00176 
00177     
00178 #ifdef USING_CUDA
00179   if (0==threadIdx.x)  // First GPU thread copies the variables to shared memory
00180   {
00181 #endif
00182 
00183     // -Copy the current source, detector data from global to shared memory for fast access:
00184     source_data_SHARED    = source_data_array[num_p];      
00185     detector_data_SHARED  = detector_data_array[num_p];    // Copy the long array to a single instance in shared memory for the current projection
00186         
00187     // -Copy the compton data to shared memory:
00188     cgco_SHARED = *compton_table;
00189     
00190 #ifdef USING_CUDA
00191   }
00192   __syncthreads();     // Make sure all threads will see the initialized shared variable  
00193 #endif
00194 
00195 
00196   // -- Initialize the RANECU generator in a position far away from the previous history:
00197 #ifdef USING_CUDA
00198   init_PRNG((threadIdx.x + blockIdx.x*blockDim.x), histories_per_thread, seed_input, &seed);   // Using a 1D block
00199 #else
00200   init_PRNG(history_batch, histories_per_thread, seed_input, &seed);
00201 #endif
00202 
00203   
00204   // -- Loop for the "histories_per_thread" particles in the current history_batch:
00205 
00206   for( ; histories_per_thread>0; histories_per_thread--)
00207   {
00208         //  printf("\n\n********* NEW HISTORY:  %d    [seeds: %d, %d]\n\n", histories_per_thread, seed.x, seed.y); //  fflush(stdout);  // !!Verbose!! calling printf from the GPU is possible but if multiple threads call it at the same time some output will be lost.
00209 
00210     int absvox = 1;
00211     
00212     // -- Call the source function to get a primary x ray:
00213     source(&position, &direction, &energy, &seed, &absvox, &source_data_SHARED, &detector_data_SHARED);
00214 
00215     scatter_state = (signed char)0;     // Reset previous scatter state: new non-scattered particle loaded
00216 
00217     // -- Find the current energy bin by truncation (this could be pre-calculated for a monoenergetic beam):    
00218     //    The initialization host code made sure that the sampled energy will always be within the tabulated energies (index never negative or too large).
00219 #ifdef USING_CUDA
00220     index = __float2int_rd((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide);  // Using CUDA function to convert float to integer rounding down (towards minus infinite)
00221 #else
00222     index = (int)((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide + 0.00001f);    // Adding EPSILON to truncate to INT towards minus infinite. There may be a small error for energy<=mfp_table_data_CONST.e0 but this case is irrelevant (particles will always have more energy than e0).
00223 #endif          
00224 
00225   
00226     // -- Get the minimum mfp at the current energy using linear interpolation (Woodcock tracking):      
00227     {
00228       float2 mfp_Woodcock_read = mfp_Woodcock_table[index];   // Read the 2 parameters for the linear interpolation in a single read from global memory
00229       mfp_Woodcock = mfp_Woodcock_read.x + energy * mfp_Woodcock_read.y;   // Interpolated minimum MFP          
00230     }
00231 
00232 
00233     // -- Reset previous material to force a recalculation of the MFPs (negative materials are not allowed in the voxels):
00234     material_old  = -1;
00235 
00236     // *** X-ray interaction loop:
00237     for(;;)
00238     {
00239       
00240       if (absvox<0)   // !!DeBuG!!  MC-GPU_v1.3 ==> if I move this "if" above the code runs much slower!? Why???
00241           break;    // -- Primary particle was not pointing to the voxel region! (but may still be detected after moving in vacuum in a straight line).      
00242 
00243 
00244       // *** Virtual interaction loop:  // New loop structure in MC-GPU_v1.3: simulate all virtual events before sampling Compton & Rayleigh:  // !!DeBuG!!
00245       
00246       float2 matdens;
00247       short3 voxel_coord;    // Variable used only by DOSE TALLY
00248 
00249       do
00250       {     
00251         step = -(mfp_Woodcock)*logf(ranecu(&seed));   // Using the minimum MFP in the geometry for the input energy (Woodcock tracking)
00252           
00253         position.x += step*direction.x;
00254         position.y += step*direction.y;
00255         position.z += step*direction.z;
00256 
00257         // -- Locate the new particle in the voxel geometry:      
00258         absvox = locate_voxel(&position, &voxel_coord);   // Get the voxel number at the current position and the voxel coordinates (used to check if inside the dose ROI in DOSE TALLY).
00259         if (absvox<0)
00260           break;    // -- Particle escaped the voxel region! ("index" is still >0 at this moment)
00261           
00262         matdens = voxel_mat_dens[absvox];     // Get the voxel material and density in a single read from global memory
00263         material0 = (int)(matdens.x - 1);   // Set the current material by truncation, and set 1st material to value '0'.
00264 
00265         // -- Get the data for the linear interpolation of the interaction MFPs, in case the energy or material have changed:
00266         if (material0 != material_old)
00267         {
00268           mfp_table_read_a = mfp_table_a[index*(MAX_MATERIALS)+material0];
00269           mfp_table_read_b = mfp_table_b[index*(MAX_MATERIALS)+material0];
00270           material_old = material0;                                              // Store the new material
00271         }
00272         
00273         // *** Apply Woodcock tracking:
00274         mfp_density = mfp_Woodcock * matdens.y;
00275         // -- Calculate probability of delta scattering, using the total mean free path for the current material and energy (linear interpolation):
00276         prob = 1.0f - mfp_density * (mfp_table_read_a.x + energy * mfp_table_read_b.x);
00277         randno = ranecu(&seed);    // Sample uniform PRN
00278       }
00279       while (randno<prob);   // [Iterate if there is a delta scattering event]
00280 
00281       if (absvox<0)
00282         break;    // -- Particle escaped the voxel region! Break the interaction loop to call tally image.
00283 
00284         
00285       // The GPU threads will be stopped and waiting here until ALL threads have a REAL event: 
00286 
00287       // -- Real event takes place! Check the kind of event and sample the effects of the interaction:
00288       
00289       prob += mfp_density * (mfp_table_read_a.y + energy * mfp_table_read_b.y);    // Interpolate total Compton MFP ('y' component)
00290       if (randno<prob)   // [Checking Compton scattering]
00291       {
00292         // *** Compton interaction:
00293 
00294         //  -- Sample new direction and energy:
00295         double costh_Compton;
00296         randno = energy;     // Save temporal copy of the particle energy (variable randno not necessary until next sampling). DOSE TALLY
00297         
00298         GCOa(&energy, &costh_Compton, &material0, &seed, &cgco_SHARED);
00299         rotate_double(&direction, costh_Compton, /*phi=2*pi*PRN=*/ 6.28318530717958647693*ranecu_double(&seed));
00300 
00301         randno = energy - randno;   // Save temporal copy of the negative of the energy lost in the interaction.  DOSE TALLY
00302 
00303         // -- Find the new energy interval:
00304 #ifdef USING_CUDA
00305         index = __float2int_rd((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide);  // Using CUDA function to convert float to integer rounding down (towards minus infinite)
00306 #else
00307         index = (int)((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide + 0.00001f);    // Adding EPSILON to truncate to INT
00308 #endif          
00309 
00310         
00311         if (index>-1)  // 'index' will be negative only when the energy is below the tabulated minimum energy: particle will be then absorbed (rejected) after tallying the dose.
00312         {          
00313           // -- Get the Woodcock MFP for the new energy (energy above minimum cutoff):
00314           float2 mfp_Woodcock_read = mfp_Woodcock_table[index];   // Read the 2 parameters for the linear interpolation in a single read from global memory
00315           mfp_Woodcock = mfp_Woodcock_read.x + energy * mfp_Woodcock_read.y;   // Interpolated minimum MFP
00316 
00317           material_old = -2;    // Set an impossible material to force an update of the MFPs data for the nex energy interval
00318 
00319           // -- Update scatter state:
00320           if (scatter_state==(signed char)0)
00321             scatter_state = (signed char)1;   // Set scatter_state == 1: Compton scattered particle
00322           else
00323             scatter_state = (signed char)3;   // Set scatter_state == 3: Multi-scattered particle
00324         }
00325 
00326       }
00327       else
00328       {
00329         prob += mfp_density * (mfp_table_read_a.z + energy * mfp_table_read_b.z);    // Interpolate total Rayleigh MFP ('z' component)
00330         if (randno<prob)   // [Checking Rayleigh scattering]
00331         {
00332           // *** Rayleigh interaction:
00333 
00334           //  -- Sample angular deflection:
00335           double costh_Rayleigh;
00336           float pmax_current = rayleigh_table->pmax[(index+1)*MAX_MATERIALS+material0];   // Get max (ie, value for next bin?) cumul prob square form factor for Rayleigh sampling
00337 
00338           GRAa(&energy, &costh_Rayleigh, &material0, &pmax_current, &seed, rayleigh_table);
00339           rotate_double(&direction, costh_Rayleigh, /*phi=2*pi*PRN=*/ 6.28318530717958647693*ranecu_double(&seed));
00340 
00341           // -- Update scatter state:
00342           if (scatter_state==(signed char)0)
00343             scatter_state = (signed char)2;   // Set scatter_state == 1: Rayleigh scattered particle
00344           else
00345             scatter_state = (signed char)3;   // Set scatter_state == 3: Multi-scattered particle
00346 
00347         }
00348         else
00349         {
00350           // *** Photoelectric interaction (or pair production): mark particle for absorption after dose tally (ie, index<0)!
00351           randno = -energy;   // Save temporal copy of the (negative) energy deposited in the interaction (variable randno not necessary anymore).
00352           index = -11;       // A negative "index" marks that the particle was absorved and that it will never arrive at the detector.
00353         }
00354       }
00355     
00356       //  -- Tally the dose deposited in Compton and photoelectric interactions:
00357       if (randno<-0.001f)
00358       {
00359         float Edep = -1.0f*randno;   // If any energy was deposited, this variable will temporarily store the negative value of Edep.
00360         
00361         //  -- Tally the dose deposited in the current material, if enabled (ie, array allocated and not null):
00362         if (materials_dose!=NULL)
00363           tally_materials_dose(&Edep, &material0, materials_dose);    // !!tally_materials_dose!!
00364 
00365         //  -- Tally the energy deposited in the current voxel, if enabled (tally disabled when dose_ROI_x_max_CONST is negative). DOSE TALLY
00366         if (dose_ROI_x_max_CONST > -1)
00367           tally_voxel_energy_deposition(&Edep, &voxel_coord, voxels_Edep);
00368 
00369       }    
00370 
00371       // -- Break interaction loop for particles that have been absorved or with energy below the tabulated cutoff: particle is "absorbed" (ie, track discontinued).
00372       if (index<0)
00373         break;  
00374       
00375     }   // [Cycle the X-ray interaction loop]
00376 
00377     if (index>-1)
00378     {
00379       // -- Particle escaped the voxels but was not absorbed, check if it will arrive at the detector and tally its energy:
00380       tally_image(&energy, &position, &direction, &scatter_state, image, &source_data_SHARED, &detector_data_SHARED);
00381     }
00382   }   // [Continue with a new history]
00383 
00384 }   // [All tracks simulated for this kernel call: return to CPU]
00385 
00386 
00387 
00388 
00389 
00390 
00391 ////////////////////////////////////////////////////////////////////////////////
00392 //!  Tally the dose deposited in the voxels.
00393 //!  This function is called whenever a particle suffers a Compton or photoelectric
00394 //!  interaction. It is not necessary to call this function if the dose tally
00395 //!  was disabled in the input file (ie, dose_ROI_x_max_CONST < 0).
00396 //!  Electrons are not transported in MC-GPU and therefore we are approximating
00397 //!  that the dose is equal to the KERMA (energy released by the photons alone).
00398 //!  This approximation is acceptable when there is electronic equilibrium and when
00399 //!  the range of the secondary electrons is shorter than the voxel size. Usually the
00400 //!  doses will be acceptable for photon energies below 1 MeV. The dose estimates may
00401 //!  not be accurate at the interface of low density volumes.
00402 //!
00403 //!  We need to use atomicAdd() in the GPU to prevent that multiple threads update the 
00404 //!  same voxel at the same time, which would result in a lose of information.
00405 //!  This is very improbable when using a large number of voxels but gives troubles 
00406 //!  with a simple geometries with few voxels (in this case the atomicAdd will slow 
00407 //!  down the code because threads will update the voxel dose secuentially).
00408 //!
00409 //!
00410 //!       @param[in] Edep   Energy deposited in the interaction
00411 //!       @param[in] voxel_coord   Voxel coordinates, needed to check if particle located inside the input region of interest (ROI)
00412 //!       @param[out] voxels_Edep   ulonglong2 array containing the 3D voxel dose and dose^2 (ie, uncertainty) as unsigned integers scaled by SCALE_eV.
00413 ////////////////////////////////////////////////////////////////////////////////
00414 #ifdef USING_CUDA
00415 __device__
00416 #endif
00417 inline 
00418 void tally_voxel_energy_deposition(float* Edep, short3* voxel_coord, ulonglong2* voxels_Edep)
00419 {
00420 
00421     // !!DeBuG!! Maybe it would be faster to store a 6 element struct and save temp copy?? struct_short_int_x6_align16  dose_ROI_size = dose_ROI_size_CONST;   // Get ROI coordinates from GPU constant memory and store temporal copy
00422   
00423   if((voxel_coord->x < dose_ROI_x_min_CONST) || (voxel_coord->x > dose_ROI_x_max_CONST) ||
00424      (voxel_coord->y < dose_ROI_y_min_CONST) || (voxel_coord->y > dose_ROI_y_max_CONST) ||
00425      (voxel_coord->z < dose_ROI_z_min_CONST) || (voxel_coord->z > dose_ROI_z_max_CONST))
00426     {
00427       return;   // -- Particle outside the ROI: return without tallying anything.
00428     }
00429 
00430   // -- Particle inside the ROI: tally Edep.
00431   register int DX = 1 + (int)(dose_ROI_x_max_CONST - dose_ROI_x_min_CONST);
00432   register int num_voxel = (int)(voxel_coord->x-dose_ROI_x_min_CONST) + ((int)(voxel_coord->y-dose_ROI_y_min_CONST))*DX + ((int)(voxel_coord->z-dose_ROI_z_min_CONST))*DX*(1 + (int)(dose_ROI_y_max_CONST-dose_ROI_y_min_CONST));
00433   
00434    #ifdef USING_CUDA
00435      atomicAdd(&voxels_Edep[num_voxel].x, __float2ull_rn((*Edep)*SCALE_eV) );    // Energy deposited at the voxel, scaled by the factor SCALE_eV and rounded.
00436      atomicAdd(&voxels_Edep[num_voxel].y, __float2ull_rn((*Edep)*(*Edep)) );     // (not using SCALE_eV for std_dev to prevent overflow)           
00437    #else
00438      voxels_Edep[num_voxel].x += (unsigned long long int)((*Edep)*SCALE_eV + 0.5f);
00439      voxels_Edep[num_voxel].y += (unsigned long long int)((*Edep)*(*Edep) + 0.5f);
00440    #endif
00441           
00442   return;
00443 }
00444 
00445 
00446 ////////////////////////////////////////////////////////////////////////////////
00447 //!  Tally a radiographic projection image.
00448 //!  This function is called whenever a particle escapes the voxelized volume.
00449 //!  The code checks if the particle would arrive at the detector if it kept
00450 //!  moving in a straight line after exiting the voxels (assuming vacuum enclosure).
00451 //!  An ideal image formation model is implemented: each pixel counts the total energy
00452 //!  of the x rays that enter the pixel (100% detection efficiency for any energy).
00453 //!  The image due to primaries and different kinds of scatter is tallied separately.
00454 //!
00455 //!  In the GPU, and atomicAdd() function is used to make sure that multiple threads do
00456 //!  not update the same pixel at the same time, which would result in a lose of information.
00457 //!  Since the atomicAdd function is only available for 'unsigned long long int' data,
00458 //!  the float pixel values are scaled by a factor "SCALE_eV" defined in the header file
00459 //!  (eg, #define SCALE_eV 10000.0f) and stored as unsigned long long integers in main
00460 //!  memory.
00461 //!
00462 //!  WARNING! If the total tallied signal (for all particles) is larger than "1.8e19/SCALE_eV",
00463 //!    there will be a bit overflow and the value will be reset to 0 giving bogus results.
00464 //!
00465 //!  WARNING! The detector plane should be located outside the voxels bounding box. However, since
00466 //!    the particles are moved outside the bbox in the last step, they could cross the detector 
00467 //!    plane anyway. If the particles are less than 2.0 cm behind the detector, they are moved 
00468 //!    back and detected. Therefore the detector can be a few cm inside the bbox and still work.
00469 //!    If the Woodcock mean free path is larger than the distance from the bbox to the detector, 
00470 //!    we may lose some particles behind the detector!
00471 //!
00472 //!
00473 //!       @param[in] energy   X-ray energy
00474 //!       @param[in] position   Particle position
00475 //!       @param[in] direction   Particle direction (cosine vectors)
00476 //!       @param[in] scatter_state  Flag marking primaries, single Compton, single Rayleigh or multiple scattered radiation
00477 //!       @param[out] image   Integer array containing the image, ie, the pixel values (in tenths of meV)
00478 ////////////////////////////////////////////////////////////////////////////////
00479 #ifdef USING_CUDA
00480 __device__
00481 #endif
00482 inline void tally_image(float* energy, float3* position, float3* direction, signed char* scatter_state, unsigned long long int* image, struct source_struct* source_data_SHARED, struct detector_struct* detector_data_SHARED)
00483 {
00484   float dist_detector, rotated_position;
00485 
00486   if (detector_data_SHARED->rotation_flag == 1)    // -->  Initial source direction is not (0,1,0): detector has to be rotated to +Y to find the pixel number
00487   {
00488     
00489     // *** Skip particles not moving towards the detector. 
00490     //       (a) Skip particles that were deflected more than 90 deg from the original source direction (backscatter).
00491     //       (b) Skip particles located more than 10 cm behind the detector 
00492     //       (c) Skip particles for which the direction to the detector is way bigger than SDD (likely to intersect the plane outside the pixel region).
00493                   // !!DeBuG!! NOTE: This may give problems for big detectors very close to the source
00494                   
00495     //      !!DeBuG!! Particles located after the detector will be moved back to the surface of the detector, but 10 cm maximum!!
00496     //                In this way the detector can intersect the voxels bbox or be located right on the surface of the bbox: the particles will be 
00497     //                transported across the detector and until a little after the end of the bbox in the last step, but then moved back.
00498     //                This algorithm will give correct results ONLY when the detector intersects just slightly the air space around the phantom,
00499     //                so that the interactions after the detector are not significant (this happens sometimes using oblique beams).
00500     //                I could remove particles after the detector using "if (dist_detector<0.0f) return;".
00501 
00502     //  (a) Calculate the angle between the particle and the initial direction (dot product): reject particle if cos_angle < cos(89)==0 (angle>89deg):
00503     //      [Extra parenthesis are coded to suggest to the compiler the use of intrinsic multiply-add operations].
00504 
00505     register float cos_angle = direction->x * source_data_SHARED->direction.x +
00506                               (direction->y * source_data_SHARED->direction.y +
00507                               (direction->z * source_data_SHARED->direction.z));    
00508     if (cos_angle < 0.025f)
00509       return;  // Reject particle: Angle larger than 89 deg --> particle moving parallel to the detector or backwards towards the source!
00510 
00511     //   (b) Find the distance from the current particle location (likely just after the surface of the voxel bbox) to the intersection with the detector plane:
00512     dist_detector = ( source_data_SHARED->direction.x * (detector_data_SHARED->center.x - position->x) +
00513                      (source_data_SHARED->direction.y * (detector_data_SHARED->center.y - position->y) +
00514                      (source_data_SHARED->direction.z * (detector_data_SHARED->center.z - position->z))) ) / cos_angle;
00515 
00516                         
00517                      
00518 // !!DeBuG!!  IF's below (used in v1.2) are not needed when checking the x ray angle:
00519 //   if (dist_detector < -10.0f)   // !!DeBuG!! Is 10 cm enough or too much? Should I use 0? or allow any distance?
00520 //      return;  // !!DeBuG!! Reject particles located more than 10 cm behind the detector. 10 cm was selected arbitrarily. Woodcock MFP for x-rays in bone: MFP 200 keV photons in bone ==> 4 cm.
00521 //      
00522 //    if (fabsf(dist_detector)>(2.1f*detector_data_CONST.sdd))          
00523 //      return;  // Reject particle: distance to the detector plane too large, the particle is likely to travel almost parallel to the detector and will not be detected.
00524 
00525             
00526     // *** Translate the particle to the detector plane (we assume the detector is completely absorbent: 100% detection efficiency):
00527     position->x = position->x + dist_detector * direction->x;
00528     position->y = position->y + dist_detector * direction->y;
00529     position->z = position->z + dist_detector * direction->z;
00530 
00531     // *** Rotate the particle position vector to the default reference system where the detector is perpendicular to the +Y axis, then find out if the particle is located inside a pixel:
00532     #ifdef USING_CUDA
00533       rotated_position = detector_data_SHARED->rot_inv[0]*position->x + detector_data_SHARED->rot_inv[1]*position->y + detector_data_SHARED->rot_inv[2]*position->z;  // X coordinate
00534       int pixel_coord_x = __float2int_rd((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.x) * detector_data_SHARED->inv_pixel_size_X);    // Using CUDA intrinsic function to convert float to integer rounding down (towards minus infinite)
00535       if ((pixel_coord_x>-1)&&(pixel_coord_x<detector_data_SHARED->num_pixels.x))
00536       {
00537         rotated_position = detector_data_SHARED->rot_inv[6]*position->x + detector_data_SHARED->rot_inv[7]*position->y + detector_data_SHARED->rot_inv[8]*position->z;  // Z coordinate
00538         int pixel_coord_z = __float2int_rd((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.z) * detector_data_SHARED->inv_pixel_size_Z);
00539         if ((pixel_coord_z>-1)&&(pixel_coord_z<detector_data_SHARED->num_pixels.y))
00540         {
00541           // -- Particle enters the detector! Tally the particle energy in the corresponding pixel (in tenths of meV):
00542           //    Using a CUDA atomic function (not available for global floats yet) to read and increase the pixel value in a single instruction, blocking interferences from other threads.
00543           //    The offset for the primaries or scatter images are calculated considering that:
00544           //      scatter_state=0 for non-scattered, =1 for Compton, =2 for Rayleigh, and =3 for multiple scatter.
00545           atomicAdd(( image +                                                               // Pointer to beginning of image array
00546                     (int)(*scatter_state) * detector_data_SHARED->total_num_pixels +         // Offset to corresponding scatter image
00547                     (pixel_coord_x + pixel_coord_z*(detector_data_SHARED->num_pixels.x)) ),  // Offset to the corresponding pixel
00548                     __float2ull_rn((*energy)*SCALE_eV) );     // Energy arriving at the pixel, scaled by the factor SCALE_eV and rounded.
00549                                                               // The maximum unsigned long long int value is ~1.8e19:
00550         }
00551       }
00552     #else
00553       // CPU version (not using CUDA intrinsics: atomicAdd, fast type casting)
00554       rotated_position = detector_data_SHARED->rot_inv[0]*position->x + detector_data_SHARED->rot_inv[1]*position->y + detector_data_SHARED->rot_inv[2]*position->z;  // X coordinate
00555       
00556       float pixel_coord_x = floor((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.x)*detector_data_SHARED->inv_pixel_size_X);   // Using float+floor instead of INT to avoid truncation errors for positive and negative values
00557       if ( (pixel_coord_x>-0.1f) && (pixel_coord_x<(detector_data_SHARED->num_pixels.x-0.1f)) )    // Rejecting values negative or bigger than the image size
00558       {
00559         rotated_position = detector_data_SHARED->rot_inv[6]*position->x + detector_data_SHARED->rot_inv[7]*position->y + detector_data_SHARED->rot_inv[8]*position->z;  // Z coordinate
00560         float pixel_coord_z = floor((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.z)*detector_data_SHARED->inv_pixel_size_Z);
00561         if ( (pixel_coord_z>-0.1f) && (pixel_coord_z<(detector_data_SHARED->num_pixels.y-0.1f)) )
00562           image[(int)(((float)*scatter_state)*detector_data_SHARED->total_num_pixels + pixel_coord_x + pixel_coord_z*detector_data_SHARED->num_pixels.x  +  0.0001f)]
00563              += (unsigned long long int)((*energy)*SCALE_eV + 0.5f);   // Tally the particle energy in the pixel. This instruction is not thread-safe, but it is ok in sequential CPU code.          
00564       }
00565     #endif
00566   }
00567   else  // (detector_data_SHARED->rotation_flag != 1) -->  Initial source direction is (0,1,0): pixel number and distance can be found easily
00568   {  
00569     if (direction->y < 0.0001f)
00570       return;  // *** Reject particles not moving towards the detector plane at +Y.
00571 
00572     dist_detector = (detector_data_SHARED->center.y - position->y)/(direction->y);  // Distance to the intersection with the detector at +Y.
00573   
00574       // !!DeBuG!! IF below (v1.2) not needed when checking the angle
00575       //     if (dist_detector>(2.1f*detector_data_SHARED->sdd)) return;  
00576      
00577     
00578     #ifdef USING_CUDA
00579     int pixel_coord_x = __float2int_rd((position->x + dist_detector*direction->x - detector_data_SHARED->corner_min_rotated_to_Y.x)*detector_data_SHARED->inv_pixel_size_X);
00580     if ((pixel_coord_x>-1)&&(pixel_coord_x<detector_data_SHARED->num_pixels.x))
00581     {
00582       int pixel_coord_z = __float2int_rd((position->z + dist_detector*direction->z - detector_data_SHARED->corner_min_rotated_to_Y.z)*detector_data_SHARED->inv_pixel_size_Z);
00583       if ((pixel_coord_z>-1)&&(pixel_coord_z<detector_data_SHARED->num_pixels.y))
00584         atomicAdd( ( image +                                                                // Pointer to beginning of image array
00585                      (int)(*scatter_state) * detector_data_SHARED->total_num_pixels +         // Offset to corresponding scatter image
00586                      (pixel_coord_x + pixel_coord_z*(detector_data_SHARED->num_pixels.x)) ),  // Offset to the corresponding pixel
00587                    __float2ull_rn((*energy)*SCALE_eV) );    // Energy arriving at the pixel, scaled by the factor SCALE_eV and rounded.
00588     }
00589     #else
00590 
00591     // --Calculate the pixel the xray enters, truncating towards minus infinite and making sure the conversion to int is safe:
00592     float pixel_coord_x = floor((position->x + dist_detector*direction->x - detector_data_SHARED->corner_min_rotated_to_Y.x)*detector_data_SHARED->inv_pixel_size_X);
00593 
00594     if ( (pixel_coord_x>-0.1f) && (pixel_coord_x<(detector_data_SHARED->num_pixels.x-0.1f)) )
00595     {
00596       float pixel_coord_z = floor((position->z + dist_detector*direction->z - detector_data_SHARED->corner_min_rotated_to_Y.z)*detector_data_SHARED->inv_pixel_size_Z);
00597       if ( (pixel_coord_z>-0.1f) && (pixel_coord_z<(detector_data_SHARED->num_pixels.y-0.1f)) )
00598         image[(int)(((float)*scatter_state)*detector_data_SHARED->total_num_pixels + pixel_coord_x + pixel_coord_z*detector_data_SHARED->num_pixels.x  +  0.0001f)]
00599            += (unsigned long long int)((*energy)*SCALE_eV + 0.5f);    // Truncate the pixel number to INT and round the energy value
00600     }
00601     #endif
00602   }
00603 
00604 }
00605 
00606 
00607 
00608 ////////////////////////////////////////////////////////////////////////////////
00609 //!  Source that creates primary x rays, according to the defined source model.
00610 //!  The particles are automatically moved to the surface of the voxel bounding box,
00611 //!  to start the tracking inside a real material. If the sampled particle do not
00612 //!  enter the voxels, it is init in the focal spot and the main program will check
00613 //!  if it arrives at the detector or not.
00614 //!
00615 //!       @param[in] source_data   Structure describing the source.
00616 //!       @param[in] source_energy_data_CONST   Global variable in constant memory space describing the source energy spectrum.
00617 //!       @param[out] position   Initial particle position (particle transported inside the voxel bbox).
00618 //!       @param[out] direction   Sampled particle direction (cosine vectors).
00619 //!       @param[out] energy   Sampled energy of the new x ray.
00620 //!       @param[in] seed   Current seed of the random number generator, requiered to sample the movement direction.
00621 //!       @param[out] absvox   Set to <0 if primary particle will not cross the voxels, not changed otherwise (>0).
00622 ////////////////////////////////////////////////////////////////////////////////
00623 #ifdef USING_CUDA
00624 __device__
00625 #endif
00626 inline void source(float3* position, float3* direction, float* energy, int2* seed, int* absvox, struct source_struct* source_data_SHARED, struct detector_struct* detector_data_SHARED)
00627 {
00628   // *** Sample the initial x-ray energy following the input energy spectrum using the Walker aliasing algorithm from PENELOPE:
00629       // The following code is equivalent to calling the function "seeki_walker": int sampled_bin = seeki_walker(source_data_CONST.espc_cutoff, source_data_CONST.espc_alias, ranecu(seed), source_data_CONST.num_bins_espc);      
00630   int sampled_bin;
00631   float RN = ranecu(seed) * source_energy_data_CONST.num_bins_espc;    // Find initial interval (array starting at 0):   
00632   #ifdef USING_CUDA
00633     int int_part = __float2int_rd(RN);                          //   -- Integer part (round down)
00634   #else
00635     int int_part = (int)(RN);
00636   #endif
00637   float fraction_part = RN - ((float)int_part);                 //   -- Fractional part
00638   if (fraction_part < source_energy_data_CONST.espc_cutoff[int_part])  // Check if we are in the aliased part
00639     sampled_bin = int_part;                                     // Below the cutoff: return current value
00640   else
00641     sampled_bin = (int)source_energy_data_CONST.espc_alias[int_part];  // Above the cutoff: return alias
00642   
00643   // Linear interpolation of the final energy within the sampled energy bin:
00644   *energy = source_energy_data_CONST.espc[sampled_bin] + ranecu(seed) * (source_energy_data_CONST.espc[sampled_bin+1] - source_energy_data_CONST.espc[sampled_bin]);   
00645       
00646  
00647    // *** Sample the initial direction:
00648    
00649   do   //  Iterate sampling if the sampled direction is not acceptable to get a square field at the given phi (rejection sampling): force square field for any phi!!
00650   {
00651     //     Using the algorithm used in PENMAIN.f, from penelope 2008 (by F. Salvat).
00652     direction->z = source_data_SHARED->cos_theta_low + ranecu(seed)*source_data_SHARED->D_cos_theta;     // direction->z = w = cos(theta_sampled)
00653     register float phi_sampled = source_data_SHARED->phi_low + ranecu(seed)*source_data_SHARED->D_phi;
00654     register float sin_theta_sampled = sqrtf(1.0f - direction->z*direction->z);
00655     float sinphi_sampled, cosphi_sampled;
00656     
00657     #ifdef USING_CUDA
00658       sincos(phi_sampled, &sinphi_sampled,&cosphi_sampled);    // Calculate the SIN and COS at the same time.
00659     #else
00660       sinphi_sampled = sin(phi_sampled);   // Some CPU compilers will be able to use "sincos", but let's be safe.
00661       cosphi_sampled = cos(phi_sampled);
00662     #endif       
00663     
00664     direction->y = sin_theta_sampled * sinphi_sampled;
00665     direction->x = sin_theta_sampled * cosphi_sampled;
00666   }
00667   while( fabsf(direction->z/(direction->y+1.0e-7f)) > source_data_SHARED->max_height_at_y1cm );  // !!DeBuG!! Force square field for any phi by rejection sampling!! Is it necessary to use the "+1.0e-7f" to prevent possible division by zero???
00668     
00669 
00670   if (detector_data_SHARED->rotation_flag == 1)
00671   {
00672     // --Initial beam not pointing to (0,1,0), apply rotation:
00673     register float direction_x_tmp = direction->x;
00674     register float direction_y_tmp = direction->y;
00675     direction->x = source_data_SHARED->rot_fan[0]*direction_x_tmp + source_data_SHARED->rot_fan[1]*direction_y_tmp + source_data_SHARED->rot_fan[2]*direction->z;
00676     direction->y = source_data_SHARED->rot_fan[3]*direction_x_tmp + source_data_SHARED->rot_fan[4]*direction_y_tmp + source_data_SHARED->rot_fan[5]*direction->z;
00677     direction->z = source_data_SHARED->rot_fan[6]*direction_x_tmp + source_data_SHARED->rot_fan[7]*direction_y_tmp + source_data_SHARED->rot_fan[8]*direction->z;
00678   }
00679 
00680   // Initialize x ray position at focal spot before translation into bbox. Particle stays in focal spot if no interaction found:
00681   position->x = source_data_SHARED->position.x;
00682   position->y = source_data_SHARED->position.y;
00683   position->z = source_data_SHARED->position.z;
00684       
00685   move_to_bbox(position, direction, voxel_data_CONST.size_bbox, absvox);  // Move the particle inside the voxel bounding box.
00686 }
00687 
00688 
00689 
00690 ////////////////////////////////////////////////////////////////////////////////
00691 //!  Functions that moves a particle inside the voxelized geometry bounding box.
00692 //!  An EPSILON distance is added to make sure the particles will be clearly inside the bbox, 
00693 //!  not exactly on the surface. 
00694 //!
00695 //!  This algorithm makes the following assumtions:
00696 //!     - The back lower vertex of the voxel bounding box is always located at the origin: (x0,y0,z0)=(0,0,0).
00697 //!     - The initial value of "position" corresponds to the focal spot location.
00698 //!     - When a ray is not pointing towards the bbox plane that it should cross according to the sign of the direction,
00699 //!       I assign a distance to the intersection =0 instead of the real negative distance. The wall that will be 
00700 //!       crossed to enter the bbox is always the furthest and therefore a 0 distance will never be used except
00701 //!       in the case of a ray starting inside the bbox or outside the bbox and not pointing to any of the 3 planes. 
00702 //!       In this situation the ray will be transported a 0 distance, meaning that it will stay at the focal spot.
00703 //!
00704 //!  (Interesting information on ray-box intersection: http://tog.acm.org/resources/GraphicsGems/gems/RayBox.c)
00705 //!
00706 //!       @param[in,out] position Particle position: initially set to the focal spot, returned transported inside the voxel bbox.
00707 //!       @param[out] direction   Sampled particle direction (cosine vectors).
00708 //!       @param[out] intersection_flag   Set to <0 if particle outside bbox and will not cross the voxels, not changed otherwise.
00709 //!       @param[out] size_bbox   Size of the bounding box.
00710 ////////////////////////////////////////////////////////////////////////////////
00711 #ifdef USING_CUDA
00712 __device__
00713 #endif
00714 inline void move_to_bbox(float3* position, float3* direction, float3 size_bbox, int* intersection_flag)
00715 {
00716   float dist_y, dist_x, dist_z;
00717 
00718   // -Distance to the nearest Y plane:
00719   if ((direction->y) > EPS_SOURCE)   // Moving to +Y: check distance to y=0 plane
00720   {
00721     // Check Y=0 (bbox wall):
00722     if (position->y > 0.0f)  // The input position must correspond to the focal spot => position->y == source_data_CONST.position[*num_p].y
00723       dist_y = 0.0f;  // No intersection with this plane: particle inside or past the box  
00724           // The actual distance would be negative but we set it to 0 bc we will not move the particle if no intersection exist.
00725     else
00726       dist_y = EPS_SOURCE + (-position->y)/(direction->y);    // dist_y > 0 for sure in this case
00727   }
00728   else if ((direction->y) < NEG_EPS_SOURCE)
00729   {
00730     // Check Y=voxel_data_CONST.size_bbox.y:
00731     if (position->y < size_bbox.y)
00732       dist_y = 0.0f;  // No intersection with this plane
00733     else
00734       dist_y = EPS_SOURCE + (size_bbox.y - position->y)/(direction->y);    // dist_y > 0 for sure in this case
00735   }
00736   else   // (direction->y)~0
00737     dist_y = NEG_INF;   // Particle moving parallel to the plane: no interaction possible (set impossible negative dist = -INFINITE)
00738 
00739   // -Distance to the nearest X plane:
00740   if ((direction->x) > EPS_SOURCE)
00741   {
00742     // Check X=0:
00743     if (position->x > 0.0f)
00744       dist_x = 0.0f;
00745     else  
00746       dist_x = EPS_SOURCE + (-position->x)/(direction->x);    // dist_x > 0 for sure in this case
00747   }
00748   else if ((direction->x) < NEG_EPS_SOURCE)
00749   {
00750     // Check X=voxel_data_CONST.size_bbox.x:
00751     if (position->x < size_bbox.x)
00752       dist_x = 0.0f;
00753     else  
00754       dist_x = EPS_SOURCE + (size_bbox.x - position->x)/(direction->x);    // dist_x > 0 for sure in this case
00755   }
00756   else
00757     dist_x = NEG_INF;
00758 
00759   // -Distance to the nearest Z plane:
00760   if ((direction->z) > EPS_SOURCE)
00761   {
00762     // Check Z=0:
00763     if (position->z > 0.0f)
00764       dist_z = 0.0f;
00765     else
00766       dist_z = EPS_SOURCE + (-position->z)/(direction->z);    // dist_z > 0 for sure in this case
00767   }
00768   else if ((direction->z) < NEG_EPS_SOURCE)
00769   {
00770     // Check Z=voxel_data_CONST.size_bbox.z:
00771     if (position->z < size_bbox.z)
00772       dist_z = 0.0f;
00773     else
00774       dist_z = EPS_SOURCE + (size_bbox.z - position->z)/(direction->z);    // dist_z > 0 for sure in this case
00775   }
00776   else
00777     dist_z = NEG_INF;
00778 
00779   
00780   // -- Find the longest distance plane, which is the one that has to be crossed to enter the bbox.
00781   //    Storing the maximum distance in variable "dist_z". Distance will be =0 if no intersection exists or 
00782   //    if the x ray is already inside the bbox.
00783   if ( (dist_y>dist_x) && (dist_y>dist_z) )
00784     dist_z = dist_y;      // dist_z == dist_max 
00785   else if (dist_x>dist_z)
00786     dist_z = dist_x;
00787 // else
00788 //   dist_max = dist_z;
00789     
00790   // -- Move particle from the focal spot (current location) to the bbox wall surface (slightly inside):
00791   position->x += dist_z * direction->x;
00792   position->y += dist_z * direction->y;
00793   position->z += dist_z * direction->z;      
00794   
00795   // Check if the new position is outside the bbox. If true, the particle must be moved back to the focal spot location:
00796   if ( (position->x < 0.0f) || (position->x > size_bbox.x) || 
00797        (position->y < 0.0f) || (position->y > size_bbox.y) || 
00798        (position->z < 0.0f) || (position->z > size_bbox.z) )
00799   {
00800     position->x -= dist_z * direction->x;  // Reject new position undoing the previous translation
00801     position->y -= dist_z * direction->y;
00802     position->z -= dist_z * direction->z;
00803     (*intersection_flag) = -111;  // Particle outside the bbox AND not pointing to the bbox: set absvox<0 to skip interaction sampling.
00804   }
00805 }
00806 
00807 
00808 ////////////////////////////////////////////////////////////////////////////////
00809 
00810 
00811 //!  Upper limit of the number of random values sampled in a single track.
00812 #define  LEAP_DISTANCE     256
00813 //!  Multipliers and moduli for the two MLCG in RANECU.
00814 #define  a1_RANECU       40014
00815 #define  m1_RANECU  2147483563
00816 #define  a2_RANECU       40692
00817 #define  m2_RANECU  2147483399
00818 ////////////////////////////////////////////////////////////////////////////////
00819 //! Initialize the pseudo-random number generator (PRNG) RANECU to a position
00820 //! far away from the previous history (leap frog technique).
00821 //!
00822 //! Each calculated seed initiates a consecutive and disjoint sequence of
00823 //! pseudo-random numbers with length LEAP_DISTANCE, that can be used to
00824 //! in a parallel simulation (Sequence Splitting parallelization method).
00825 //! The basic equation behind the algorithm is:
00826 //!    S(i+j) = (a**j * S(i)) MOD m = [(a**j MOD m)*S(i)] MOD m  ,
00827 //! which is described in:
00828 //!   P L'Ecuyer, Commun. ACM 31 (1988) p.742
00829 //!
00830 //! This function has been adapted from "seedsMLCG.f", see:
00831 //!   A Badal and J Sempau, Computer Physics Communications 175 (2006) p. 440-450
00832 //!
00833 //!       @param[in] history   Particle bach number.
00834 //!       @param[in] seed_input   Initial PRNG seed input (used to initiate both MLCGs in RANECU).
00835 //!       @param[out] seed   Initial PRNG seeds for the present history.
00836 //!
00837 ////////////////////////////////////////////////////////////////////////////////
00838 #ifdef USING_CUDA
00839 __device__
00840 #endif
00841 inline void init_PRNG(int history_batch, int histories_per_thread, int seed_input, int2* seed)
00842 {
00843   // -- Move the RANECU generator to a unique position for the current batch of histories:
00844   //    I have to use an "unsigned long long int" value to represent all the simulated histories in all previous batches
00845   //    The maximum unsigned long long int value is ~1.8e19: if history >1.8e16 and LEAP_DISTANCE==1000, 'leap' will overflow.
00846   // **** 1st MLCG:
00847   unsigned long long int leap = ((unsigned long long int)(history_batch+1))*(histories_per_thread*LEAP_DISTANCE);
00848   int y = 1;
00849   int z = a1_RANECU;
00850   // -- Calculate the modulo power '(a^leap)MOD(m)' using a divide-and-conquer algorithm adapted to modulo arithmetic
00851   for(;;)
00852   {
00853     // (A2) Halve n, and store the integer part and the residue
00854     if (0!=(leap&01))  // (bit-wise operation for MOD(leap,2), or leap%2 ==> proceed if leap is an odd number)  Equivalent: t=(short)(leap%2);
00855     {
00856       leap >>= 1;     // Halve n moving the bits 1 position right. Equivalent to:  leap=(leap/2);  
00857       y = abMODm(m1_RANECU,z,y);      // (A3) Multiply y by z:  y = [z*y] MOD m
00858       if (0==leap) break;         // (A4) leap==0? ==> finish
00859     }
00860     else           // (leap is even)
00861     {
00862       leap>>= 1;     // Halve leap moving the bits 1 position right. Equivalent to:  leap=(leap/2);
00863     }
00864     z = abMODm(m1_RANECU,z,z);        // (A5) Square z:  z = [z*z] MOD m
00865   }
00866   // AjMODm1 = y;                 // Exponentiation finished:  AjMODm = expMOD = y = a^j
00867 
00868   // -- Compute and display the seeds S(i+j), from the present seed S(i), using the previously calculated value of (a^j)MOD(m):
00869   //         S(i+j) = [(a**j MOD m)*S(i)] MOD m
00870   //         S_i = abMODm(m,S_i,AjMODm)
00871   seed->x = abMODm(m1_RANECU, seed_input, y);     // Using the input seed as the starting seed
00872 
00873   // **** 2nd MLCG (repeating the previous calculation for the 2nd MLCG parameters):
00874   leap = ((unsigned long long int)(history_batch+1))*(histories_per_thread*LEAP_DISTANCE);
00875   y = 1;
00876   z = a2_RANECU;
00877   for(;;)
00878   {
00879     // (A2) Halve n, and store the integer part and the residue
00880     if (0!=(leap&01))  // (bit-wise operation for MOD(leap,2), or leap%2 ==> proceed if leap is an odd number)  Equivalent: t=(short)(leap%2);
00881     {
00882       leap >>= 1;     // Halve n moving the bits 1 position right. Equivalent to:  leap=(leap/2);
00883       y = abMODm(m2_RANECU,z,y);      // (A3) Multiply y by z:  y = [z*y] MOD m
00884       if (0==leap) break;         // (A4) leap==0? ==> finish
00885     }
00886     else           // (leap is even)
00887     {
00888       leap>>= 1;     // Halve leap moving the bits 1 position right. Equivalent to:  leap=(leap/2);
00889     }
00890     z = abMODm(m2_RANECU,z,z);        // (A5) Square z:  z = [z*z] MOD m
00891   }
00892   // AjMODm2 = y;
00893   seed->y = abMODm(m2_RANECU, seed_input, y);     // Using the input seed as the starting seed
00894 }
00895 
00896 
00897 
00898 /////////////////////////////////////////////////////////////////////
00899 //!  Calculate "(a1*a2) MOD m" with 32-bit integers and avoiding
00900 //!  the possible overflow, using the Russian Peasant approach
00901 //!  modulo m and the approximate factoring method, as described
00902 //!  in:  L'Ecuyer and Cote, ACM Trans. Math. Soft. 17 (1991).
00903 //!
00904 //!  This function has been adapted from "seedsMLCG.f", see: 
00905 //!  Badal and Sempau, Computer Physics Communications 175 (2006)
00906 //!
00907 //!       @param[in] m,a,s  MLCG parameters
00908 //!       @return   (a1*a2) MOD m   
00909 //
00910 //    Input:          0 < a1 < m                                  
00911 //                    0 < a2 < m                                  
00912 //
00913 //    Return value:  (a1*a2) MOD m                                
00914 //
00915 /////////////////////////////////////////////////////////////////////
00916 #ifdef USING_CUDA
00917 __device__ __host__    // Function will be callable from host and also from device
00918 #endif
00919 inline int abMODm(int m, int a, int s)
00920 {
00921   // CAUTION: the input parameters are modified in the function but should not be returned to the calling function! (pass by value!)
00922   int q, k;
00923   int p = -m;            // p is always negative to avoid overflow when adding
00924 
00925   // ** Apply the Russian peasant method until "a =< 32768":
00926   while (a>32768)        // We assume '32' bit integers (4 bytes): 2^(('32'-2)/2) = 32768
00927   {
00928     if (0!=(a&1))        // Store 's' when 'a' is odd     Equivalent code:   if (1==(a%2))
00929     {
00930       p += s;
00931       if (p>0) p -= m;
00932     }
00933     a >>= 1;             // Half a (move bits 1 position right)   Equivalent code: a = a/2;
00934     s = (s-m) + s;       // Double s (MOD m)
00935     if (s<0) s += m;     // (s is always positive)
00936   }
00937 
00938   // ** Employ the approximate factoring method (a is small enough to avoid overflow):
00939   q = (int) m / a;
00940   k = (int) s / q;
00941   s = a*(s-k*q)-k*(m-q*a);
00942   while (s<0)
00943     s += m;
00944 
00945   // ** Compute the final result:
00946   p += s;
00947   if (p<0) p += m;
00948 
00949   return p;
00950 }
00951 
00952 
00953 
00954 ////////////////////////////////////////////////////////////////////////////////
00955 //! Pseudo-random number generator (PRNG) RANECU returning a float value
00956 //! (single precision version).
00957 //!
00958 //!       @param[in,out] seed   PRNG seed (seed kept in the calling function and updated here).
00959 //!       @return   PRN double value in the open interval (0,1)
00960 //!
00961 ////////////////////////////////////////////////////////////////////////////////
00962 #ifdef USING_CUDA
00963 __device__ 
00964 #endif
00965 inline float ranecu(int2* seed)
00966 {
00967   int i1 = (int)(seed->x/53668);
00968   seed->x = 40014*(seed->x-i1*53668)-i1*12211;
00969 
00970   int i2 = (int)(seed->y/52774);
00971   seed->y = 40692*(seed->y-i2*52774)-i2*3791;
00972 
00973   if (seed->x < 0) seed->x += 2147483563;
00974   if (seed->y < 0) seed->y += 2147483399;
00975 
00976   i2 = seed->x-seed->y;
00977   if (i2 < 1) i2 += 2147483562;
00978 
00979 
00980 #ifdef USING_CUDA
00981   return (__int2float_rn(i2)*4.65661305739e-10f);        // 4.65661305739e-10 == 1/2147483563
00982 #else
00983   return ((float)(i2)*4.65661305739e-10f);          
00984 #endif
00985 
00986 }
00987 
00988 
00989 ////////////////////////////////////////////////////////////////////////////////
00990 //! Pseudo-random number generator (PRNG) RANECU returning a double value.
00991 ////////////////////////////////////////////////////////////////////////////////
00992 #ifdef USING_CUDA
00993 __device__ 
00994 #endif
00995 inline double ranecu_double(int2* seed)
00996 {
00997   int i1 = (int)(seed->x/53668);
00998   seed->x = 40014*(seed->x-i1*53668)-i1*12211;
00999 
01000   int i2 = (int)(seed->y/52774);
01001   seed->y = 40692*(seed->y-i2*52774)-i2*3791;
01002 
01003   if (seed->x < 0) seed->x += 2147483563;
01004   if (seed->y < 0) seed->y += 2147483399;
01005 
01006   i2 = seed->x-seed->y;
01007   if (i2 < 1) i2 += 2147483562;
01008 
01009 #ifdef USING_CUDA
01010   return (__int2double_rn(i2)*4.6566130573917692e-10);
01011 #else
01012   return ((double)(i2)*4.6566130573917692e-10);
01013 #endif
01014 
01015 }
01016 
01017 
01018 
01019 ////////////////////////////////////////////////////////////////////////////////
01020 //! Find the voxel that contains the current position.
01021 //! Report the voxel absolute index and the x,y,z indices.
01022 //! The structure containing the voxel number and size is read from CONSTANT memory.
01023 //!
01024 //!       @param[in] position   Particle position
01025 //!       @param[out] voxel_coord   Pointer to three integer values (short3*) that will store the x,y and z voxel indices.
01026 //!       @return   Returns "absvox", the voxel number where the particle is
01027 //!                 located (negative if position outside the voxel bbox).
01028 //!
01029 ////////////////////////////////////////////////////////////////////////////////
01030 #ifdef USING_CUDA
01031 __device__
01032 #endif
01033 inline int locate_voxel(float3* position, short3* voxel_coord)
01034 {
01035 
01036   if ( (position->y < EPS_SOURCE) || (position->y > (voxel_data_CONST.size_bbox.y - EPS_SOURCE)) ||
01037        (position->x < EPS_SOURCE) || (position->x > (voxel_data_CONST.size_bbox.x - EPS_SOURCE)) ||
01038        (position->z < EPS_SOURCE) || (position->z > (voxel_data_CONST.size_bbox.z - EPS_SOURCE)) )
01039   {
01040     // -- Particle escaped the voxelized geometry (using EPS_SOURCE to avoid numerical precision errors):      
01041      return -1;
01042   }
01043  
01044   // -- Particle inside the voxelized geometry, find current voxel:
01045   //    The truncation from float to integer could give troubles for negative coordinates but this will never happen thanks to the IF at the begining of this function.
01046   //    (no need to use the CUDA function to convert float to integer rounding down (towards minus infinite): __float2int_rd)
01047   
01048   register int voxel_coord_x, voxel_coord_y, voxel_coord_z;
01049 #ifdef USING_CUDA
01050   voxel_coord_x = __float2int_rd(position->x * voxel_data_CONST.inv_voxel_size.x);  
01051   voxel_coord_y = __float2int_rd(position->y * voxel_data_CONST.inv_voxel_size.y);
01052   voxel_coord_z = __float2int_rd(position->z * voxel_data_CONST.inv_voxel_size.z);
01053 #else
01054   voxel_coord_x = (int)(position->x * voxel_data_CONST.inv_voxel_size.x);     
01055   voxel_coord_y = (int)(position->y * voxel_data_CONST.inv_voxel_size.y);
01056   voxel_coord_z = (int)(position->z * voxel_data_CONST.inv_voxel_size.z);
01057 #endif
01058 
01059   // Output the voxel coordinates as short int (2 bytes) instead of int (4 bytes) to save registers; avoid type castings in the calculation of the return value.
01060   voxel_coord->x = (short int) voxel_coord_x;
01061   voxel_coord->y = (short int) voxel_coord_y;
01062   voxel_coord->z = (short int) voxel_coord_z;
01063   
01064   return (voxel_coord_x + voxel_coord_y*(voxel_data_CONST.num_voxels.x) + voxel_coord_z*(voxel_data_CONST.num_voxels.x)*(voxel_data_CONST.num_voxels.y));  
01065 }
01066 
01067 
01068 
01069 //////////////////////////////////////////////////////////////////////
01070 //!   Rotates a vector; the rotation is specified by giving
01071 //!   the polar and azimuthal angles in the "self-frame", as
01072 //!   determined by the vector to be rotated.
01073 //!   This function is a literal translation from Fortran to C of
01074 //!   PENELOPE (v. 2006) subroutine "DIRECT".
01075 //!
01076 //!    @param[in,out]  (u,v,w)  input vector (=d) in the lab. frame; returns the rotated vector components in the lab. frame
01077 //!    @param[in]  costh  cos(theta), angle between d before and after turn
01078 //!    @param[in]  phi  azimuthal angle (rad) turned by d in its self-frame
01079 //
01080 //    Output:
01081 //      (u,v,w) -> rotated vector components in the lab. frame
01082 //
01083 //    Comments:
01084 //      -> (u,v,w) should have norm=1 on input; if not, it is
01085 //         renormalized on output, provided norm>0.
01086 //      -> The algorithm is based on considering the turned vector
01087 //         d' expressed in the self-frame S',
01088 //           d' = (sin(th)cos(ph), sin(th)sin(ph), cos(th))
01089 //         and then apply a change of frame from S' to the lab
01090 //         frame. S' is defined as having its z' axis coincident
01091 //         with d, its y' axis perpendicular to z and z' and its
01092 //         x' axis equal to y'*z'. The matrix of the change is then
01093 //                   / uv/rho    -v/rho    u \
01094 //          S ->lab: | vw/rho     u/rho    v |  , rho=(u^2+v^2)^0.5
01095 //                   \ -rho       0        w /
01096 //      -> When rho=0 (w=1 or -1) z and z' are parallel and the y'
01097 //         axis cannot be defined in this way. Instead y' is set to
01098 //         y and therefore either x'=x (if w=1) or x'=-x (w=-1)
01099 //////////////////////////////////////////////////////////////////////
01100 #ifdef USING_CUDA
01101 __device__
01102 #endif
01103 inline void rotate_double(float3* direction, double costh, double phi)   // !!DeBuG!! The direction vector is single precision but the rotation is performed in doule precision for increased accuracy.
01104 {
01105   double DXY, NORM, cosphi, sinphi, SDT;
01106   DXY = direction->x*direction->x + direction->y*direction->y;
01107   
01108 #ifdef USING_CUDA
01109   sincos(phi, &sinphi,&cosphi);   // Calculate the SIN and COS at the same time.
01110 #else
01111   sinphi = sin(phi);   // Some CPU compilers will be able to use "sincos", but let's be safe.
01112   cosphi = cos(phi);
01113 #endif   
01114 
01115   // ****  Ensure normalisation
01116   NORM = DXY + direction->z*direction->z;     // !!DeBuG!! Check if it is really necessary to renormalize in a real simulation!!
01117   if (fabs(NORM-1.0)>1.0e-14)
01118   {
01119     NORM = 1.0/sqrt(NORM);
01120     direction->x = NORM*direction->x;
01121     direction->y = NORM*direction->y;
01122     direction->z = NORM*direction->z;
01123     DXY = direction->x*direction->x + direction->y*direction->y;
01124   }
01125   if (DXY>1.0e-28)
01126   {
01127     SDT = sqrt((1.0-costh*costh)/DXY);
01128     float direction_x_in = direction->x;
01129     direction->x = direction->x*costh + SDT*(direction_x_in*direction->z*cosphi-direction->y*sinphi);
01130     direction->y = direction->y*costh+SDT*(direction->y*direction->z*cosphi+direction_x_in*sinphi);
01131     direction->z = direction->z*costh-DXY*SDT*cosphi;
01132   }
01133   else
01134   {
01135     SDT = sqrt(1.0-costh*costh);
01136     direction->y = SDT*sinphi;
01137     if (direction->z>0.0)
01138     {
01139       direction->x = SDT*cosphi;
01140       direction->z = costh;
01141     }
01142     else
01143     {
01144       direction->x =-SDT*cosphi;
01145       direction->z =-costh;
01146     }
01147   }
01148 }
01149 
01150 
01151 //////////////////////////////////////////////////////////////////////
01152 
01153 
01154 //  ***********************************************************************
01155 //  *   Translation of PENELOPE's "SUBROUTINE GRAa" from FORTRAN77 to C   *
01156 //  ***********************************************************************
01157 //!  Sample a Rayleigh interaction using the sampling algorithm
01158 //!  used in PENELOPE 2006.
01159 //!
01160 //!       @param[in] energy   Particle energy (not modified with Rayleigh)
01161 //!       @param[out] costh_Rayleigh   Cosine of the angular deflection
01162 //!       @param[in] material  Current voxel material
01163 //
01164 //  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
01165 //  C  PENELOPE/PENGEOM (version 2006)                                     C
01166 //  C    Copyright (c) 2001-2006                                           C
01167 //  C    Universitat de Barcelona                                          C
01168 //  C  Permission to use, copy, modify, distribute and sell this software  C
01169 //  C  and its documentation for any purpose is hereby granted without     C
01170 //  C  fee, provided that the above copyright notice appears in all        C
01171 //  C  copies and that both that copyright notice and this permission      C
01172 //  C  notice appear in all supporting documentation. The Universitat de   C
01173 //  C  Barcelona makes no representations about the suitability of this    C
01174 //  C  software for any purpose. It is provided "as is" without express    C
01175 //  C  or implied warranty.                                                C
01176 //  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
01177 //////////////////////////////////////////////////////////////////////
01178 #ifdef USING_CUDA
01179 __device__
01180 #endif
01181 inline void GRAa(float *energy, double *costh_Rayleigh, int *mat, float *pmax_current, int2 *seed, struct rayleigh_struct* cgra)
01182 {
01183 /*  ****  Energy grid and interpolation constants for the current energy. */
01184     double  xmax = ((double)*energy) * 8.065535669099010e-5;       // 8.065535669099010e-5 == 2.0*20.6074/510998.918
01185     double x2max = min_value( (xmax*xmax) , ((double)cgra->xco[(*mat+1)*NP_RAYLEIGH - 1]) );   // Get the last tabulated value of xco for this mat
01186     
01187     if (xmax < 0.01)
01188     {
01189        do
01190        {
01191           *costh_Rayleigh = 1.0 - ranecu_double(seed) * 2.0;
01192        }
01193        while ( ranecu_double(seed) > (((*costh_Rayleigh)*(*costh_Rayleigh)+1.0)*0.5) );
01194        return;
01195     }
01196 
01197     for(;;)    // (Loop will iterate everytime the sampled value is rejected or above maximum)
01198     {
01199       double ru = ranecu_double(seed) * (double)(*pmax_current);    // Pmax for the current energy is entered as a parameter
01200  
01201 /*  ****  Selection of the interval  (binary search within pre-calculated limits). */
01202       int itn = (int)(ru * (NP_RAYLEIGH-1));     // 'itn' will never reach the last interval 'NP_RAYLEIGH-1', but this is how RITA is implemented in PENELOPE
01203       int i__ = (int)cgra->itlco[itn + (*mat)*NP_RAYLEIGH];
01204       int j   = (int)cgra->ituco[itn + (*mat)*NP_RAYLEIGH];
01205       
01206       if ((j - i__) > 1)
01207       {
01208         do
01209         {
01210           register int k = (i__ + j)>>1;     // >>1 == /2 
01211           if (ru > cgra->pco[k -1 + (*mat)*NP_RAYLEIGH])
01212             i__ = k;
01213           else
01214             j = k;
01215         }
01216         while ((j - i__) > 1);
01217       }
01218        
01219 /*  ****  Sampling from the rational inverse cumulative distribution. */
01220       int index = i__ - 1 + (*mat)*NP_RAYLEIGH;
01221 
01222       double rr = ru - cgra->pco[index];
01223       double xx;
01224       if (rr > 1e-16)
01225       {      
01226         double d__ = (double)(cgra->pco[index+1] - cgra->pco[index]);
01227         float aco_index = cgra->aco[index], bco_index = cgra->bco[index], xco_index = cgra->xco[index];   // Avoid multiple accesses to the same global variable
01228 
01229         xx = (double)xco_index + (double)(aco_index + 1.0f + bco_index)* d__* rr / (d__*d__ + (aco_index*d__ + bco_index*rr) * rr) * (double)(cgra->xco[index+1] - xco_index);
01230         
01231       }
01232       else
01233       {
01234         xx = cgra->xco[index];
01235       }
01236       
01237       if (xx < x2max)
01238       {
01239         // Sampled value below maximum possible value:
01240         *costh_Rayleigh = 1.0 - 2.0 * xx / x2max;   // !!DeBuG!! costh_Rayleigh in double precision, but not all intermediate steps are!?
01241         /*  ****  Rejection: */    
01242         if (ranecu_double(seed) < (((*costh_Rayleigh)*(*costh_Rayleigh) + 1.0)*0.5))
01243           break;   // Sample value not rejected! break loop and return.
01244       }
01245     }
01246 } /* graa */
01247 
01248 
01249 
01250 //////////////////////////////////////////////////////////////////////////
01251 
01252 
01253 //  ***********************************************************************
01254 //  *   Translation of PENELOPE's "SUBROUTINE GCOa"  from FORTRAN77 to C  *
01255 //  ********************************************************************* *
01256 //!  Random sampling of incoherent (Compton) scattering of photons, using 
01257 //!  the sampling algorithm from PENELOPE 2006:
01258 //!    Relativistic impulse approximation with analytical one-electron Compton profiles
01259 
01260 // !!DeBuG!!  In penelope, Doppler broadening is not used for E greater than 5 MeV.
01261 //            We don't use it in GPU to reduce the lines of code and prevent using COMMON/compos/ZT(M)
01262 
01263 //!       @param[in,out] energy   incident and final photon energy (eV)
01264 //!       @param[out] costh_Compton   cosine of the polar scattering angle
01265 //!       @param[in] material   Current voxel material
01266 //!       @param[in] seed   RANECU PRNG seed
01267 //
01268 //  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
01269 //  C  PENELOPE/PENGEOM (version 2006)                                     C
01270 //  C    Copyright (c) 2001-2006                                           C
01271 //  C    Universitat de Barcelona                                          C
01272 //  C  Permission to use, copy, modify, distribute and sell this software  C
01273 //  C  and its documentation for any purpose is hereby granted without     C
01274 //  C  fee, provided that the above copyright notice appears in all        C
01275 //  C  copies and that both that copyright notice and this permission      C
01276 //  C  notice appear in all supporting documentation. The Universitat de   C
01277 //  C  Barcelona makes no representations about the suitability of this    C
01278 //  C  software for any purpose. It is provided "as is" without express    C
01279 //  C  or implied warranty.                                                C
01280 //  CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
01281 //
01282 //  ************************************************************************
01283 
01284 #ifdef USING_CUDA
01285 __device__
01286 #endif
01287 inline void GCOa(float *energy, double *costh_Compton, int *mat, int2 *seed, struct compton_struct* cgco_SHARED)
01288 {
01289     float s, a1, s0, af, ek, ek2, ek3, tau, pzomc, taumin;
01290     float rn[MAX_SHELLS];
01291     double cdt1;
01292 
01293      // Some variables used in PENELOPE have been eliminated to save register: float aux, taum2, fpzmax, a, a2, ek1 ,rni, xqc, fpz, pac[MAX_SHELLS];
01294 
01295     int i__;
01296     int my_noscco = cgco_SHARED->noscco[*mat];    // Store the number of oscillators for the input material in a local variable
01297     
01298 #ifndef USING_CUDA
01299     static int warning_flag_1 = -1, warning_flag_2 = -1, warning_flag_3 = -1;    // Write warnings for the CPU code, but only once.  !!DeBuG!!
01300 #endif
01301 
01302     ek = *energy * 1.956951306108245e-6f;    // (1.956951306108245e-6 == 1.0/510998.918)
01303     ek2 = ek * 2.f + 1.f;
01304     ek3 = ek * ek;
01305     // ek1 = ek3 - ek2 - 1.;
01306     taumin = 1.f / ek2;
01307     // taum2 = taumin * taumin;
01308     a1 = logf(ek2);
01309     // a2 = a1 + ek * 2. * (ek + 1.) * taum2;    // a2 was used only once, code moved below
01310 
01311 
01312 /*  ****  Incoherent scattering function for theta=PI. */
01313 
01314     s0 = 0.0f;
01315     for (i__ = 0; i__ < my_noscco; i__++)
01316     {
01317        register float temp = cgco_SHARED->uico[*mat + i__*MAX_MATERIALS];
01318        if (temp < *energy)
01319        {
01320          register float aux = *energy * (*energy - temp) * 2.f;
01321          #ifdef USING_CUDA
01322            pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) * rsqrtf(aux + aux + temp * temp) * 1.956951306108245e-6f;
01323              // 1.956951306108245e-6 = 1.0/510998.918f   // Version using the reciprocal of sqrt in CUDA: faster and more accurate!!
01324          #else
01325            pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) / (sqrtf(aux + aux + temp * temp) * 510998.918f);
01326          #endif
01327          if (pzomc > 0.0f)
01328            temp = (0.707106781186545f+pzomc*1.4142135623731f) * (0.707106781186545f+pzomc*1.4142135623731f);
01329          else
01330            temp = (0.707106781186545f-pzomc*1.4142135623731f) * (0.707106781186545f-pzomc*1.4142135623731f);
01331 
01332          temp = 0.5f * expf(0.5f - temp);    // Calculate EXP outside the IF to avoid branching
01333 
01334          if (pzomc > 0.0f)
01335             temp = 1.0f - temp;
01336                                 
01337          s0 += cgco_SHARED->fco[*mat + i__*MAX_MATERIALS] * temp;
01338        }
01339     }
01340             
01341 /*  ****  Sampling tau. */
01342     do
01343     {
01344       if (ranecu(seed)*/*a2=*/(a1+2.*ek*(ek+1.f)*taumin*taumin) < a1)
01345       { 
01346         tau = powf(taumin, ranecu(seed));    // !!DeBuG!!  "powf()" has a big error (7 ULP), the double version has only 2!! 
01347       }
01348       else
01349       {
01350         tau = sqrtf(1.f + ranecu(seed) * (taumin * taumin - 1.f));
01351       }
01352 
01353       cdt1 = (double)(1.f-tau) / (((double)tau)*((double)*energy)*1.956951306108245e-6);    // !!DeBuG!! The sampled COS will be double precision, but TAU is not!!!
01354 
01355       if (cdt1 > 2.0) cdt1 = 1.99999999;   // !!DeBuG!! Make sure that precision error in POW, SQRT never gives cdt1>2 ==> costh_Compton<-1
01356       
01357   /*  ****  Incoherent scattering function. */
01358       s = 0.0f;
01359       for (i__ = 0; i__ < my_noscco; i__++)
01360       {
01361         register float temp = cgco_SHARED->uico[*mat + i__*MAX_MATERIALS];
01362         if (temp < *energy)
01363         {
01364           register float aux = (*energy) * (*energy - temp) * ((float)cdt1);
01365 
01366           if ((aux>1.0e-12f)||(temp>1.0e-12f))  // !!DeBuG!! Make sure the SQRT argument is never <0, and that we never get 0/0 -> NaN when aux=temp=0 !!
01367           {
01368          #ifdef USING_CUDA
01369            pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) * rsqrtf(aux + aux + temp * temp) * 1.956951306108245e-6f;
01370              // 1.956951306108245e-6 = 1.0/510998.918f   //  Version using the reciprocal of sqrt in CUDA: faster and more accurate!!
01371          #else
01372            pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) / (sqrtf(aux + aux + temp * temp) * 510998.918f);
01373          #endif
01374 
01375           }
01376           else
01377           {
01378             pzomc = 0.002f;    // !!DeBuG!! Using a rough approximation to a sample value of pzomc found using pure double precision: NOT RIGUROUS! But this code is expected to be used very seldom, only in extreme cases.
01379             #ifndef USING_CUDA
01380             if (warning_flag_1<0)
01381             {
01382                warning_flag_1 = +1;  // Disable warning, do not show again
01383                printf("          [... Small numerical precision error detected computing \"pzomc\" in GCOa (this warning will not be repeated).]\n               i__=%d, aux=%.14f, temp=%.14f, pzomc(forced)=%.14f, uico=%.14f, energy=%.7f, cgco_SHARED->fj0=%.14f, mat=%d, cdt1=%.14lf\n", (int)i__, aux, temp, pzomc, cgco_SHARED->uico[*mat+i__*MAX_MATERIALS], *energy, cgco_SHARED->fj0[*mat+i__*MAX_MATERIALS], (int)*mat, cdt1);   // !!DeBuG!!
01384             }
01385             #endif                    
01386           }
01387           
01388           temp = pzomc * 1.4142135623731f;
01389           if (pzomc > 0.0f)
01390             temp = 0.5f - (temp + 0.70710678118654502f) * (temp + 0.70710678118654502f);   // Calculate exponential argument
01391           else
01392             temp = 0.5f - (0.70710678118654502f - temp) * (0.70710678118654502f - temp);
01393 
01394           temp = 0.5f * expf(temp);      // All threads will calculate the expf together
01395           
01396           if (pzomc > 0.0f)
01397             temp = 1.0f - temp;
01398 
01399           s += cgco_SHARED->fco[*mat + i__*MAX_MATERIALS] * temp;
01400           rn[i__] = temp;
01401         }        
01402       }
01403     } while( (ranecu(seed)*s0) > (s*(1.0f+tau*(/*ek1=*/(ek3 - ek2 - 1.0f)+tau*(ek2+tau*ek3)))/(ek3*tau*(tau*tau+1.0f))) );  //  ****  Rejection function
01404 
01405     *costh_Compton = 1.0 - cdt1;
01406         
01407 /*  ****  Target electron shell. */
01408     for (;;)
01409     {
01410       register float temp = s*ranecu(seed);
01411       float pac = 0.0f;
01412 
01413       int ishell = my_noscco - 1;     // First shell will have number 0
01414       for (i__ = 0; i__ < (my_noscco-1); i__++)    // !!DeBuG!! Iterate to (my_noscco-1) only: the last oscillator is excited in case all other fail (no point in double checking) ??
01415       {
01416         pac += cgco_SHARED->fco[*mat + i__*MAX_MATERIALS] * rn[i__];   // !!DeBuG!! pac[] is calculated on the fly to save registers!
01417         if (pac > temp)       //  pac[] is calculated on the fly to save registers!  
01418         {
01419             ishell = i__;
01420             break;
01421         }
01422       }
01423 
01424     /*  ****  Projected momentum of the target electron. */
01425       temp = ranecu(seed) * rn[ishell];
01426 
01427       if (temp < 0.5f)
01428       {
01429         pzomc = (0.70710678118654502f - sqrtf(0.5f - logf(temp + temp))) / (cgco_SHARED->fj0[*mat + ishell * MAX_MATERIALS] * 1.4142135623731f);
01430       }
01431       else
01432       {
01433         pzomc = (sqrtf(0.5f - logf(2.0f - 2.0f*temp)) - 0.70710678118654502f) / (cgco_SHARED->fj0[*mat + ishell * MAX_MATERIALS] * 1.4142135623731f);
01434       }
01435       if (pzomc < -1.0f)
01436       {
01437         continue;      // re-start the loop
01438       }
01439 
01440   /*  ****  F(EP) rejection. */
01441       temp = tau * (tau - (*costh_Compton) * 2.f) + 1.f;       // this variable was originally called "xqc"
01442       
01443         // af = sqrt( max_value(temp,1.0e-30f) ) * (tau * (tau - *costh_Compton) / max_value(temp,1.0e-30f) + 1.f);  //!!DeBuG!! Make sure the SQRT argument is never <0, and that I don't divide by zero!!
01444 
01445       if (temp>1.0e-20f)   // !!DeBuG!! Make sure the SQRT argument is never <0, and that I don't divide by zero!!
01446       {
01447         af = sqrtf(temp) * (tau * (tau - ((float)(*costh_Compton))) / temp + 1.f);
01448       }
01449       else
01450       {
01451         // When using single precision, it is possible (but very uncommon) to get costh_Compton==1 and tau==1; then temp is 0 and 'af' can not be calculated (0/0 -> nan). Analysing the results obtained using double precision, we found that 'af' would be almost 0 in this situation, with an "average" about ~0.002 (this is just a rough estimation, but using af=0 the value would never be rejected below).
01452 
01453         af = 0.00200f;    // !!DeBuG!!
01454                 
01455         #ifndef USING_CUDA
01456         if (warning_flag_2<0)
01457         {
01458             warning_flag_2 = +1;  // Disable warning, do not show again
01459             printf("          [... Small numerical precision error detected computing \"af\" in GCOa (this warning will not be repeated)].\n               xqc=%.14f, af(forced)=%.14f, tau=%.14f, costh_Compton=%.14lf\n", temp, af, tau, *costh_Compton);    // !!DeBuG!!
01460         }
01461         #endif
01462       }
01463 
01464       if (af > 0.0f)
01465       {
01466         temp = af * 0.2f + 1.f;    // this variable was originally called "fpzmax"
01467       }
01468       else
01469       {
01470         temp = 1.f - af * 0.2f;
01471       }
01472       
01473       if ( ranecu(seed)*temp < /*fpz =*/(af * max_value( min_value(pzomc,0.2f) , -0.2f ) + 1.f) )
01474       {
01475         break;
01476       }
01477 
01478     }
01479 
01480 /*  ****  Energy of the scattered photon. */
01481     {
01482       register float t, b1, b2, temp;
01483       t = pzomc * pzomc;
01484       b1 = 1.f - t * tau * tau;
01485       b2 = 1.f - t * tau * ((float)(*costh_Compton));
01486 
01487       temp = sqrtf( fabsf(b2 * b2 - b1 * (1.0f - t)) );
01488       
01489           
01490       if (pzomc < 0.0f)
01491          temp *= -1.0f;
01492 
01493       // !Error! energy may increase (slightly) due to inacurate calculation!  !!DeBuG!!
01494       t = (tau / b1) * (b2 + temp);
01495       if (t > 1.0f)
01496       {
01497         #ifndef USING_CUDA
01498 
01499         #endif      
01500         #ifndef USING_CUDA
01501         if (warning_flag_3<0)
01502         {
01503             warning_flag_3 = +1;  // Disable warning, do not show again
01504             printf("\n          [... a Compton event tried to increase the x ray energy due to precision error. Keeping initial energy. (This warning will not be repeated.)]\n               scaling=%.14f, costh_Compton=%.14lf\n", t, *costh_Compton);   // !!DeBuG!!
01505         }
01506         #endif
01507         
01508         t = 1.0f; // !!DeBuG!! Avoid increasing energy by hand!!! not nice!!
01509       }
01510 
01511       (*energy) *= t;
01512        // (*energy) *= (tau / b1) * (b2 + temp);    //  Original PENELOPE code
01513     }
01514     
01515 }  // [End subroutine GCOa]
01516 
01517 
01518 
01519 ////////////////////////////////////////////////////////////////////////////////
01520 
01521 
01522 ////////////////////////////////////////////////////////////////////////////////
01523 //!  Tally the depose deposited inside each material.
01524 //!  This function is called whenever a particle suffers a Compton or photoelectric
01525 //!  interaction. The energy released in each interaction is added and later in the 
01526 //!  report function the total deposited energy is divided by the total mass of the 
01527 //!  material in the voxelized object to get the dose. This naturally accounts for
01528 //!  multiple densities for voxels with the same material (not all voxels have same mass).
01529 //!  Electrons are not transported in MC-GPU and therefore we are approximating
01530 //!  that the dose is equal to the KERMA (energy released by the photons alone).
01531 //!  This approximation is acceptable when there is electronic equilibrium and 
01532 //!  when the range of the secondary electrons is shorter than the organ size. 
01533 //!
01534 //!  The function uses atomic functions for a thread-safe access to the GPU memory.
01535 //!  We can check if this tally was disabled in the input file checking if the array
01536 //!  materials_dose was allocated in the GPU (disabled if pointer = NULL).
01537 //!
01538 //!
01539 //!       @param[in] Edep   Energy deposited in the interaction
01540 //!       @param[in] material   Current material id number
01541 //!       @param[out] materials_dose   ulonglong2 array storing the mateials dose [in eV/g] and dose^2 (ie, uncertainty).
01542 ////////////////////////////////////////////////////////////////////////////////
01543 #ifdef USING_CUDA
01544 __device__
01545 #endif
01546 inline 
01547 void tally_materials_dose(float* Edep, int* material, ulonglong2* materials_dose)
01548 {
01549       
01550 // !!DeBuG!! The energy can be tallied directly with atomicAdd in global memory or using shared memory first and then global for whole block if too slow. With the initial testing it looks like using global memory is already very fast!
01551 
01552 // !!DeBuG!! WARNING: with many histories and few materials the materials_dose integer variables may overflow!! Using double precision floats would be better. Single precision is not good enough because adding small energies to a large counter would give problems.
01553 
01554 #ifdef USING_CUDA
01555   atomicAdd(&materials_dose[*material].x, __float2ull_rn((*Edep)*SCALE_eV) );  // Energy deposited at the material, scaled by the factor SCALE_eV and rounded.
01556   atomicAdd(&materials_dose[*material].y, __float2ull_rn((*Edep)*(*Edep)) );   // Square of the dose to estimate standard deviation (not using SCALE_eV for std_dev to prevent overflow)
01557 #else
01558   materials_dose[*material].x += (unsigned long long int)((*Edep)*SCALE_eV + 0.5f);
01559   materials_dose[*material].y += (unsigned long long int)((*Edep)*(*Edep) + 0.5f);
01560 #endif     
01561           
01562   return;
01563 }