MC-GPU
|
00001 00002 //////////////////////////////////////////////////////////////////////////////// 00003 // 00004 // **************************** 00005 // *** MC-GPU , version 1.3 *** 00006 // **************************** 00007 // 00008 //! Definition of the CUDA GPU kernel for the simulation of x ray tracks in a voxelized geometry. 00009 //! This kernel has been optimized to yield a good performance in the GPU but can still be 00010 //! compiled in the CPU without problems. All the CUDA especific commands are enclosed in 00011 //! pre-processor directives that are skipped if the parameter "USING_CUDA" is not defined 00012 //! at compilation time. 00013 // 00014 // ** DISCLAIMER ** 00015 // 00016 // This software and documentation (the "Software") were developed at the Food and 00017 // Drug Administration (FDA) by employees of the Federal Government in the course 00018 // of their official duties. Pursuant to Title 17, Section 105 of the United States 00019 // Code, this work is not subject to copyright protection and is in the public 00020 // domain. Permission is hereby granted, free of charge, to any person obtaining a 00021 // copy of the Software, to deal in the Software without restriction, including 00022 // without limitation the rights to use, copy, modify, merge, publish, distribute, 00023 // sublicense, or sell copies of the Software or derivatives, and to permit persons 00024 // to whom the Software is furnished to do so. FDA assumes no responsibility 00025 // whatsoever for use by other parties of the Software, its source code, 00026 // documentation or compiled executables, and makes no guarantees, expressed or 00027 // implied, about its quality, reliability, or any other characteristic. Further, 00028 // use of this code in no way implies endorsement by the FDA or confers any 00029 // advantage in regulatory decisions. Although this software can be redistributed 00030 // and/or modified freely, we ask that any derivative works bear some notice that 00031 // they are derived from it, and any modified versions bear some notice that they 00032 // have been modified. 00033 // 00034 // 00035 //! @file MC-GPU_kernel_v1.3.cu 00036 //! @author Andreu Badal (Andreu.Badal-Soler@fda.hhs.gov) 00037 //! @date 2012/12/12 00038 // -- Original code started on: 2009/04/14 00039 // 00040 //////////////////////////////////////////////////////////////////////////////// 00041 00042 00043 00044 //////////////////////////////////////////////////////////////////////////////// 00045 //! Initialize the image array, ie, set all pixels to zero 00046 //! Essentially, this function has the same effect as the command: 00047 //! "cutilSafeCall(cudaMemcpy(image_device, image, image_bytes, cudaMemcpyHostToDevice))"; 00048 //! 00049 //! CUDA performs some initialization work the first time a GPU kernel is called. 00050 //! Therefore, calling a short kernel before the real particle tracking is performed 00051 //! may improve the accuracy of the timing measurements in the relevant kernel. 00052 //! 00053 //! @param[in,out] image Pointer to the image array. 00054 //! @param[in] pixels_per_image Number of pixels in the image (ie, elements in the array). 00055 //////////////////////////////////////////////////////////////////////////////// 00056 #ifdef USING_CUDA 00057 __global__ 00058 void init_image_array_GPU(unsigned long long int* image, int pixels_per_image) 00059 { 00060 int my_pixel = threadIdx.x + blockIdx.x*blockDim.x; 00061 if (my_pixel < pixels_per_image) 00062 { 00063 // -- Set the current pixel to 0 and return, avoiding overflow when more threads than pixels are used: 00064 image[my_pixel] = (unsigned long long int)(0); // Initialize non-scatter image 00065 my_pixel += pixels_per_image; // (advance to next image) 00066 image[my_pixel] = (unsigned long long int)(0); // Initialize Compton image 00067 my_pixel += pixels_per_image; // (advance to next image) 00068 image[my_pixel] = (unsigned long long int)(0); // Initialize Rayleigh image 00069 my_pixel += pixels_per_image; // (advance to next image) 00070 image[my_pixel] = (unsigned long long int)(0); // Initialize multi-scatter image 00071 } 00072 } 00073 00074 // //////////////////////////////////////////////////////////////////////////////// 00075 // //! Initialize the dose deposition array, ie, set all voxel doses to zero 00076 // //! 00077 // //! @param[in,out] dose Pointer to the dose mean and sigma arrays. 00078 // //! @param[in] num_voxels_dose Number of voxels in the dose ROI (ie, elements in the arrays). 00079 // //////////////////////////////////////////////////////////////////////////////// 00080 // __global__ 00081 // void init_dose_array_GPU(ulonglong2* voxels_Edep, int num_voxels_dose) 00082 // { 00083 // int my_voxel = threadIdx.x + blockIdx.x*blockDim.x; 00084 // register ulonglong2 ulonglong2_zero; 00085 // ulonglong2_zero.x = ulonglong2_zero.y = (unsigned long long int) 0; 00086 // if (my_voxel < num_voxels_dose) 00087 // { 00088 // dose[my_voxel] = ulonglong2_zero; // Set the current voxel to (0,0) and return, avoiding overflow 00089 // } 00090 // } 00091 00092 #endif 00093 00094 00095 //////////////////////////////////////////////////////////////////////////////// 00096 //! Main function to simulate x-ray tracks inside a voxelized geometry. 00097 //! Secondary electrons are not simulated (in photoelectric and Compton 00098 //! events the energy is locally deposited). 00099 //! 00100 //! The following global variables, in the GPU __constant__ memory are used: 00101 //! voxel_data_CONST, 00102 //! source_energy_data_CONST, 00103 //! detector_data_CONST, 00104 //! mfp_table_data_CONST. 00105 //! 00106 //! @param[in] history_batch Particle batch number (only used in the CPU version when CUDA is disabled!, the GPU uses the built-in variable threadIdx) 00107 //! @param[in] num_p Projection number in the CT simulation. This variable defines a specific angle and the corresponding source and detector will be used. 00108 //! @param[in] histories_per_thread Number of histories to simulate for each call to this function (ie, for GPU thread). 00109 //! @param[in] seed_input Random number generator seed (the same seed is used to initialize the two MLCGs of RANECU). 00110 //! @param[in] voxel_mat_dens Pointer to the voxel densities and material vector (the voxelized geometry), stored in GPU glbal memory. 00111 //! @param[in] mfp_Woodcock_table Two parameter table for the linear interpolation of the Woodcock mean free path (MFP) (stored in GPU global memory). 00112 //! @param[in] mfp_table_a First element for the linear interpolation of the interaction mean free paths (stored in GPU global memory). 00113 //! @param[in] mfp_table_b Second element for the linear interpolation of the interaction mean free paths (stored in GPU global memory). 00114 //! @param[in] rayleigh_table Pointer to the table with the data required by the Rayleigh interaction sampling, stored in GPU global memory. 00115 //! @param[in] compton_table Pointer to the table with the data required by the Compton interaction sampling, stored in GPU global memory. 00116 //! @param[in,out] image Pointer to the image vector in the GPU global memory. 00117 //! @param[in,out] dose Pointer to the array containing the 3D voxel dose (and its uncertainty) in the GPU global memory. 00118 //////////////////////////////////////////////////////////////////////////////// 00119 #ifdef USING_CUDA 00120 __global__ void track_particles(int histories_per_thread, 00121 int num_p, // For a CT simulation: allocate space for up to MAX_NUM_PROJECTIONS projections. 00122 int seed_input, 00123 unsigned long long int* image, 00124 ulonglong2* voxels_Edep, 00125 float2* voxel_mat_dens, 00126 float2* mfp_Woodcock_table, 00127 float3* mfp_table_a, 00128 float3* mfp_table_b, 00129 struct rayleigh_struct* rayleigh_table, 00130 struct compton_struct* compton_table, 00131 struct detector_struct* detector_data_array, 00132 struct source_struct* source_data_array, 00133 ulonglong2* materials_dose) 00134 #else 00135 void track_particles(int history_batch, // This variable is not required in the GPU, it uses the thread ID 00136 int histories_per_thread, 00137 int num_p, 00138 int seed_input, 00139 unsigned long long int* image, 00140 ulonglong2* voxels_Edep, 00141 float2* voxel_mat_dens, 00142 float2* mfp_Woodcock_table, 00143 float3* mfp_table_a, 00144 float3* mfp_table_b, 00145 struct rayleigh_struct* rayleigh_table, 00146 struct compton_struct* compton_table, 00147 struct detector_struct* detector_data_array, 00148 struct source_struct* source_data_array, 00149 ulonglong2* materials_dose) 00150 #endif 00151 { 00152 // -- Declare the track state variables: 00153 float3 position, direction; 00154 float energy, step, prob, randno, mfp_density, mfp_Woodcock; 00155 float3 mfp_table_read_a, mfp_table_read_b; 00156 int2 seed; 00157 int index; 00158 int material0, // Current material, starting at 0 for 1st material 00159 material_old; // Flag to mark a material or energy change 00160 signed char scatter_state; // Flag for scatter images: scatter_state=0 for non-scattered, =1 for Compton, =2 for Rayleigh, and =3 for multiple scatter. 00161 00162 // -- Store the Compton table in shared memory from global memory: 00163 // For Compton and Rayleigh the access to memory is not coherent and the caching capability do not speeds up the accesses, they actually slows down the acces to other data. 00164 #ifdef USING_CUDA 00165 __shared__ 00166 #endif 00167 struct compton_struct cgco_SHARED; 00168 #ifdef USING_CUDA 00169 __shared__ 00170 #endif 00171 struct detector_struct detector_data_SHARED; 00172 #ifdef USING_CUDA 00173 __shared__ 00174 #endif 00175 struct source_struct source_data_SHARED; 00176 00177 00178 #ifdef USING_CUDA 00179 if (0==threadIdx.x) // First GPU thread copies the variables to shared memory 00180 { 00181 #endif 00182 00183 // -Copy the current source, detector data from global to shared memory for fast access: 00184 source_data_SHARED = source_data_array[num_p]; 00185 detector_data_SHARED = detector_data_array[num_p]; // Copy the long array to a single instance in shared memory for the current projection 00186 00187 // -Copy the compton data to shared memory: 00188 cgco_SHARED = *compton_table; 00189 00190 #ifdef USING_CUDA 00191 } 00192 __syncthreads(); // Make sure all threads will see the initialized shared variable 00193 #endif 00194 00195 00196 // -- Initialize the RANECU generator in a position far away from the previous history: 00197 #ifdef USING_CUDA 00198 init_PRNG((threadIdx.x + blockIdx.x*blockDim.x), histories_per_thread, seed_input, &seed); // Using a 1D block 00199 #else 00200 init_PRNG(history_batch, histories_per_thread, seed_input, &seed); 00201 #endif 00202 00203 00204 // -- Loop for the "histories_per_thread" particles in the current history_batch: 00205 00206 for( ; histories_per_thread>0; histories_per_thread--) 00207 { 00208 // printf("\n\n********* NEW HISTORY: %d [seeds: %d, %d]\n\n", histories_per_thread, seed.x, seed.y); // fflush(stdout); // !!Verbose!! calling printf from the GPU is possible but if multiple threads call it at the same time some output will be lost. 00209 00210 int absvox = 1; 00211 00212 // -- Call the source function to get a primary x ray: 00213 source(&position, &direction, &energy, &seed, &absvox, &source_data_SHARED, &detector_data_SHARED); 00214 00215 scatter_state = (signed char)0; // Reset previous scatter state: new non-scattered particle loaded 00216 00217 // -- Find the current energy bin by truncation (this could be pre-calculated for a monoenergetic beam): 00218 // The initialization host code made sure that the sampled energy will always be within the tabulated energies (index never negative or too large). 00219 #ifdef USING_CUDA 00220 index = __float2int_rd((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide); // Using CUDA function to convert float to integer rounding down (towards minus infinite) 00221 #else 00222 index = (int)((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide + 0.00001f); // Adding EPSILON to truncate to INT towards minus infinite. There may be a small error for energy<=mfp_table_data_CONST.e0 but this case is irrelevant (particles will always have more energy than e0). 00223 #endif 00224 00225 00226 // -- Get the minimum mfp at the current energy using linear interpolation (Woodcock tracking): 00227 { 00228 float2 mfp_Woodcock_read = mfp_Woodcock_table[index]; // Read the 2 parameters for the linear interpolation in a single read from global memory 00229 mfp_Woodcock = mfp_Woodcock_read.x + energy * mfp_Woodcock_read.y; // Interpolated minimum MFP 00230 } 00231 00232 00233 // -- Reset previous material to force a recalculation of the MFPs (negative materials are not allowed in the voxels): 00234 material_old = -1; 00235 00236 // *** X-ray interaction loop: 00237 for(;;) 00238 { 00239 00240 if (absvox<0) // !!DeBuG!! MC-GPU_v1.3 ==> if I move this "if" above the code runs much slower!? Why??? 00241 break; // -- Primary particle was not pointing to the voxel region! (but may still be detected after moving in vacuum in a straight line). 00242 00243 00244 // *** Virtual interaction loop: // New loop structure in MC-GPU_v1.3: simulate all virtual events before sampling Compton & Rayleigh: // !!DeBuG!! 00245 00246 float2 matdens; 00247 short3 voxel_coord; // Variable used only by DOSE TALLY 00248 00249 do 00250 { 00251 step = -(mfp_Woodcock)*logf(ranecu(&seed)); // Using the minimum MFP in the geometry for the input energy (Woodcock tracking) 00252 00253 position.x += step*direction.x; 00254 position.y += step*direction.y; 00255 position.z += step*direction.z; 00256 00257 // -- Locate the new particle in the voxel geometry: 00258 absvox = locate_voxel(&position, &voxel_coord); // Get the voxel number at the current position and the voxel coordinates (used to check if inside the dose ROI in DOSE TALLY). 00259 if (absvox<0) 00260 break; // -- Particle escaped the voxel region! ("index" is still >0 at this moment) 00261 00262 matdens = voxel_mat_dens[absvox]; // Get the voxel material and density in a single read from global memory 00263 material0 = (int)(matdens.x - 1); // Set the current material by truncation, and set 1st material to value '0'. 00264 00265 // -- Get the data for the linear interpolation of the interaction MFPs, in case the energy or material have changed: 00266 if (material0 != material_old) 00267 { 00268 mfp_table_read_a = mfp_table_a[index*(MAX_MATERIALS)+material0]; 00269 mfp_table_read_b = mfp_table_b[index*(MAX_MATERIALS)+material0]; 00270 material_old = material0; // Store the new material 00271 } 00272 00273 // *** Apply Woodcock tracking: 00274 mfp_density = mfp_Woodcock * matdens.y; 00275 // -- Calculate probability of delta scattering, using the total mean free path for the current material and energy (linear interpolation): 00276 prob = 1.0f - mfp_density * (mfp_table_read_a.x + energy * mfp_table_read_b.x); 00277 randno = ranecu(&seed); // Sample uniform PRN 00278 } 00279 while (randno<prob); // [Iterate if there is a delta scattering event] 00280 00281 if (absvox<0) 00282 break; // -- Particle escaped the voxel region! Break the interaction loop to call tally image. 00283 00284 00285 // The GPU threads will be stopped and waiting here until ALL threads have a REAL event: 00286 00287 // -- Real event takes place! Check the kind of event and sample the effects of the interaction: 00288 00289 prob += mfp_density * (mfp_table_read_a.y + energy * mfp_table_read_b.y); // Interpolate total Compton MFP ('y' component) 00290 if (randno<prob) // [Checking Compton scattering] 00291 { 00292 // *** Compton interaction: 00293 00294 // -- Sample new direction and energy: 00295 double costh_Compton; 00296 randno = energy; // Save temporal copy of the particle energy (variable randno not necessary until next sampling). DOSE TALLY 00297 00298 GCOa(&energy, &costh_Compton, &material0, &seed, &cgco_SHARED); 00299 rotate_double(&direction, costh_Compton, /*phi=2*pi*PRN=*/ 6.28318530717958647693*ranecu_double(&seed)); 00300 00301 randno = energy - randno; // Save temporal copy of the negative of the energy lost in the interaction. DOSE TALLY 00302 00303 // -- Find the new energy interval: 00304 #ifdef USING_CUDA 00305 index = __float2int_rd((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide); // Using CUDA function to convert float to integer rounding down (towards minus infinite) 00306 #else 00307 index = (int)((energy-mfp_table_data_CONST.e0)*mfp_table_data_CONST.ide + 0.00001f); // Adding EPSILON to truncate to INT 00308 #endif 00309 00310 00311 if (index>-1) // 'index' will be negative only when the energy is below the tabulated minimum energy: particle will be then absorbed (rejected) after tallying the dose. 00312 { 00313 // -- Get the Woodcock MFP for the new energy (energy above minimum cutoff): 00314 float2 mfp_Woodcock_read = mfp_Woodcock_table[index]; // Read the 2 parameters for the linear interpolation in a single read from global memory 00315 mfp_Woodcock = mfp_Woodcock_read.x + energy * mfp_Woodcock_read.y; // Interpolated minimum MFP 00316 00317 material_old = -2; // Set an impossible material to force an update of the MFPs data for the nex energy interval 00318 00319 // -- Update scatter state: 00320 if (scatter_state==(signed char)0) 00321 scatter_state = (signed char)1; // Set scatter_state == 1: Compton scattered particle 00322 else 00323 scatter_state = (signed char)3; // Set scatter_state == 3: Multi-scattered particle 00324 } 00325 00326 } 00327 else 00328 { 00329 prob += mfp_density * (mfp_table_read_a.z + energy * mfp_table_read_b.z); // Interpolate total Rayleigh MFP ('z' component) 00330 if (randno<prob) // [Checking Rayleigh scattering] 00331 { 00332 // *** Rayleigh interaction: 00333 00334 // -- Sample angular deflection: 00335 double costh_Rayleigh; 00336 float pmax_current = rayleigh_table->pmax[(index+1)*MAX_MATERIALS+material0]; // Get max (ie, value for next bin?) cumul prob square form factor for Rayleigh sampling 00337 00338 GRAa(&energy, &costh_Rayleigh, &material0, &pmax_current, &seed, rayleigh_table); 00339 rotate_double(&direction, costh_Rayleigh, /*phi=2*pi*PRN=*/ 6.28318530717958647693*ranecu_double(&seed)); 00340 00341 // -- Update scatter state: 00342 if (scatter_state==(signed char)0) 00343 scatter_state = (signed char)2; // Set scatter_state == 1: Rayleigh scattered particle 00344 else 00345 scatter_state = (signed char)3; // Set scatter_state == 3: Multi-scattered particle 00346 00347 } 00348 else 00349 { 00350 // *** Photoelectric interaction (or pair production): mark particle for absorption after dose tally (ie, index<0)! 00351 randno = -energy; // Save temporal copy of the (negative) energy deposited in the interaction (variable randno not necessary anymore). 00352 index = -11; // A negative "index" marks that the particle was absorved and that it will never arrive at the detector. 00353 } 00354 } 00355 00356 // -- Tally the dose deposited in Compton and photoelectric interactions: 00357 if (randno<-0.001f) 00358 { 00359 float Edep = -1.0f*randno; // If any energy was deposited, this variable will temporarily store the negative value of Edep. 00360 00361 // -- Tally the dose deposited in the current material, if enabled (ie, array allocated and not null): 00362 if (materials_dose!=NULL) 00363 tally_materials_dose(&Edep, &material0, materials_dose); // !!tally_materials_dose!! 00364 00365 // -- Tally the energy deposited in the current voxel, if enabled (tally disabled when dose_ROI_x_max_CONST is negative). DOSE TALLY 00366 if (dose_ROI_x_max_CONST > -1) 00367 tally_voxel_energy_deposition(&Edep, &voxel_coord, voxels_Edep); 00368 00369 } 00370 00371 // -- Break interaction loop for particles that have been absorved or with energy below the tabulated cutoff: particle is "absorbed" (ie, track discontinued). 00372 if (index<0) 00373 break; 00374 00375 } // [Cycle the X-ray interaction loop] 00376 00377 if (index>-1) 00378 { 00379 // -- Particle escaped the voxels but was not absorbed, check if it will arrive at the detector and tally its energy: 00380 tally_image(&energy, &position, &direction, &scatter_state, image, &source_data_SHARED, &detector_data_SHARED); 00381 } 00382 } // [Continue with a new history] 00383 00384 } // [All tracks simulated for this kernel call: return to CPU] 00385 00386 00387 00388 00389 00390 00391 //////////////////////////////////////////////////////////////////////////////// 00392 //! Tally the dose deposited in the voxels. 00393 //! This function is called whenever a particle suffers a Compton or photoelectric 00394 //! interaction. It is not necessary to call this function if the dose tally 00395 //! was disabled in the input file (ie, dose_ROI_x_max_CONST < 0). 00396 //! Electrons are not transported in MC-GPU and therefore we are approximating 00397 //! that the dose is equal to the KERMA (energy released by the photons alone). 00398 //! This approximation is acceptable when there is electronic equilibrium and when 00399 //! the range of the secondary electrons is shorter than the voxel size. Usually the 00400 //! doses will be acceptable for photon energies below 1 MeV. The dose estimates may 00401 //! not be accurate at the interface of low density volumes. 00402 //! 00403 //! We need to use atomicAdd() in the GPU to prevent that multiple threads update the 00404 //! same voxel at the same time, which would result in a lose of information. 00405 //! This is very improbable when using a large number of voxels but gives troubles 00406 //! with a simple geometries with few voxels (in this case the atomicAdd will slow 00407 //! down the code because threads will update the voxel dose secuentially). 00408 //! 00409 //! 00410 //! @param[in] Edep Energy deposited in the interaction 00411 //! @param[in] voxel_coord Voxel coordinates, needed to check if particle located inside the input region of interest (ROI) 00412 //! @param[out] voxels_Edep ulonglong2 array containing the 3D voxel dose and dose^2 (ie, uncertainty) as unsigned integers scaled by SCALE_eV. 00413 //////////////////////////////////////////////////////////////////////////////// 00414 #ifdef USING_CUDA 00415 __device__ 00416 #endif 00417 inline 00418 void tally_voxel_energy_deposition(float* Edep, short3* voxel_coord, ulonglong2* voxels_Edep) 00419 { 00420 00421 // !!DeBuG!! Maybe it would be faster to store a 6 element struct and save temp copy?? struct_short_int_x6_align16 dose_ROI_size = dose_ROI_size_CONST; // Get ROI coordinates from GPU constant memory and store temporal copy 00422 00423 if((voxel_coord->x < dose_ROI_x_min_CONST) || (voxel_coord->x > dose_ROI_x_max_CONST) || 00424 (voxel_coord->y < dose_ROI_y_min_CONST) || (voxel_coord->y > dose_ROI_y_max_CONST) || 00425 (voxel_coord->z < dose_ROI_z_min_CONST) || (voxel_coord->z > dose_ROI_z_max_CONST)) 00426 { 00427 return; // -- Particle outside the ROI: return without tallying anything. 00428 } 00429 00430 // -- Particle inside the ROI: tally Edep. 00431 register int DX = 1 + (int)(dose_ROI_x_max_CONST - dose_ROI_x_min_CONST); 00432 register int num_voxel = (int)(voxel_coord->x-dose_ROI_x_min_CONST) + ((int)(voxel_coord->y-dose_ROI_y_min_CONST))*DX + ((int)(voxel_coord->z-dose_ROI_z_min_CONST))*DX*(1 + (int)(dose_ROI_y_max_CONST-dose_ROI_y_min_CONST)); 00433 00434 #ifdef USING_CUDA 00435 atomicAdd(&voxels_Edep[num_voxel].x, __float2ull_rn((*Edep)*SCALE_eV) ); // Energy deposited at the voxel, scaled by the factor SCALE_eV and rounded. 00436 atomicAdd(&voxels_Edep[num_voxel].y, __float2ull_rn((*Edep)*(*Edep)) ); // (not using SCALE_eV for std_dev to prevent overflow) 00437 #else 00438 voxels_Edep[num_voxel].x += (unsigned long long int)((*Edep)*SCALE_eV + 0.5f); 00439 voxels_Edep[num_voxel].y += (unsigned long long int)((*Edep)*(*Edep) + 0.5f); 00440 #endif 00441 00442 return; 00443 } 00444 00445 00446 //////////////////////////////////////////////////////////////////////////////// 00447 //! Tally a radiographic projection image. 00448 //! This function is called whenever a particle escapes the voxelized volume. 00449 //! The code checks if the particle would arrive at the detector if it kept 00450 //! moving in a straight line after exiting the voxels (assuming vacuum enclosure). 00451 //! An ideal image formation model is implemented: each pixel counts the total energy 00452 //! of the x rays that enter the pixel (100% detection efficiency for any energy). 00453 //! The image due to primaries and different kinds of scatter is tallied separately. 00454 //! 00455 //! In the GPU, and atomicAdd() function is used to make sure that multiple threads do 00456 //! not update the same pixel at the same time, which would result in a lose of information. 00457 //! Since the atomicAdd function is only available for 'unsigned long long int' data, 00458 //! the float pixel values are scaled by a factor "SCALE_eV" defined in the header file 00459 //! (eg, #define SCALE_eV 10000.0f) and stored as unsigned long long integers in main 00460 //! memory. 00461 //! 00462 //! WARNING! If the total tallied signal (for all particles) is larger than "1.8e19/SCALE_eV", 00463 //! there will be a bit overflow and the value will be reset to 0 giving bogus results. 00464 //! 00465 //! WARNING! The detector plane should be located outside the voxels bounding box. However, since 00466 //! the particles are moved outside the bbox in the last step, they could cross the detector 00467 //! plane anyway. If the particles are less than 2.0 cm behind the detector, they are moved 00468 //! back and detected. Therefore the detector can be a few cm inside the bbox and still work. 00469 //! If the Woodcock mean free path is larger than the distance from the bbox to the detector, 00470 //! we may lose some particles behind the detector! 00471 //! 00472 //! 00473 //! @param[in] energy X-ray energy 00474 //! @param[in] position Particle position 00475 //! @param[in] direction Particle direction (cosine vectors) 00476 //! @param[in] scatter_state Flag marking primaries, single Compton, single Rayleigh or multiple scattered radiation 00477 //! @param[out] image Integer array containing the image, ie, the pixel values (in tenths of meV) 00478 //////////////////////////////////////////////////////////////////////////////// 00479 #ifdef USING_CUDA 00480 __device__ 00481 #endif 00482 inline void tally_image(float* energy, float3* position, float3* direction, signed char* scatter_state, unsigned long long int* image, struct source_struct* source_data_SHARED, struct detector_struct* detector_data_SHARED) 00483 { 00484 float dist_detector, rotated_position; 00485 00486 if (detector_data_SHARED->rotation_flag == 1) // --> Initial source direction is not (0,1,0): detector has to be rotated to +Y to find the pixel number 00487 { 00488 00489 // *** Skip particles not moving towards the detector. 00490 // (a) Skip particles that were deflected more than 90 deg from the original source direction (backscatter). 00491 // (b) Skip particles located more than 10 cm behind the detector 00492 // (c) Skip particles for which the direction to the detector is way bigger than SDD (likely to intersect the plane outside the pixel region). 00493 // !!DeBuG!! NOTE: This may give problems for big detectors very close to the source 00494 00495 // !!DeBuG!! Particles located after the detector will be moved back to the surface of the detector, but 10 cm maximum!! 00496 // In this way the detector can intersect the voxels bbox or be located right on the surface of the bbox: the particles will be 00497 // transported across the detector and until a little after the end of the bbox in the last step, but then moved back. 00498 // This algorithm will give correct results ONLY when the detector intersects just slightly the air space around the phantom, 00499 // so that the interactions after the detector are not significant (this happens sometimes using oblique beams). 00500 // I could remove particles after the detector using "if (dist_detector<0.0f) return;". 00501 00502 // (a) Calculate the angle between the particle and the initial direction (dot product): reject particle if cos_angle < cos(89)==0 (angle>89deg): 00503 // [Extra parenthesis are coded to suggest to the compiler the use of intrinsic multiply-add operations]. 00504 00505 register float cos_angle = direction->x * source_data_SHARED->direction.x + 00506 (direction->y * source_data_SHARED->direction.y + 00507 (direction->z * source_data_SHARED->direction.z)); 00508 if (cos_angle < 0.025f) 00509 return; // Reject particle: Angle larger than 89 deg --> particle moving parallel to the detector or backwards towards the source! 00510 00511 // (b) Find the distance from the current particle location (likely just after the surface of the voxel bbox) to the intersection with the detector plane: 00512 dist_detector = ( source_data_SHARED->direction.x * (detector_data_SHARED->center.x - position->x) + 00513 (source_data_SHARED->direction.y * (detector_data_SHARED->center.y - position->y) + 00514 (source_data_SHARED->direction.z * (detector_data_SHARED->center.z - position->z))) ) / cos_angle; 00515 00516 00517 00518 // !!DeBuG!! IF's below (used in v1.2) are not needed when checking the x ray angle: 00519 // if (dist_detector < -10.0f) // !!DeBuG!! Is 10 cm enough or too much? Should I use 0? or allow any distance? 00520 // return; // !!DeBuG!! Reject particles located more than 10 cm behind the detector. 10 cm was selected arbitrarily. Woodcock MFP for x-rays in bone: MFP 200 keV photons in bone ==> 4 cm. 00521 // 00522 // if (fabsf(dist_detector)>(2.1f*detector_data_CONST.sdd)) 00523 // return; // Reject particle: distance to the detector plane too large, the particle is likely to travel almost parallel to the detector and will not be detected. 00524 00525 00526 // *** Translate the particle to the detector plane (we assume the detector is completely absorbent: 100% detection efficiency): 00527 position->x = position->x + dist_detector * direction->x; 00528 position->y = position->y + dist_detector * direction->y; 00529 position->z = position->z + dist_detector * direction->z; 00530 00531 // *** Rotate the particle position vector to the default reference system where the detector is perpendicular to the +Y axis, then find out if the particle is located inside a pixel: 00532 #ifdef USING_CUDA 00533 rotated_position = detector_data_SHARED->rot_inv[0]*position->x + detector_data_SHARED->rot_inv[1]*position->y + detector_data_SHARED->rot_inv[2]*position->z; // X coordinate 00534 int pixel_coord_x = __float2int_rd((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.x) * detector_data_SHARED->inv_pixel_size_X); // Using CUDA intrinsic function to convert float to integer rounding down (towards minus infinite) 00535 if ((pixel_coord_x>-1)&&(pixel_coord_x<detector_data_SHARED->num_pixels.x)) 00536 { 00537 rotated_position = detector_data_SHARED->rot_inv[6]*position->x + detector_data_SHARED->rot_inv[7]*position->y + detector_data_SHARED->rot_inv[8]*position->z; // Z coordinate 00538 int pixel_coord_z = __float2int_rd((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.z) * detector_data_SHARED->inv_pixel_size_Z); 00539 if ((pixel_coord_z>-1)&&(pixel_coord_z<detector_data_SHARED->num_pixels.y)) 00540 { 00541 // -- Particle enters the detector! Tally the particle energy in the corresponding pixel (in tenths of meV): 00542 // Using a CUDA atomic function (not available for global floats yet) to read and increase the pixel value in a single instruction, blocking interferences from other threads. 00543 // The offset for the primaries or scatter images are calculated considering that: 00544 // scatter_state=0 for non-scattered, =1 for Compton, =2 for Rayleigh, and =3 for multiple scatter. 00545 atomicAdd(( image + // Pointer to beginning of image array 00546 (int)(*scatter_state) * detector_data_SHARED->total_num_pixels + // Offset to corresponding scatter image 00547 (pixel_coord_x + pixel_coord_z*(detector_data_SHARED->num_pixels.x)) ), // Offset to the corresponding pixel 00548 __float2ull_rn((*energy)*SCALE_eV) ); // Energy arriving at the pixel, scaled by the factor SCALE_eV and rounded. 00549 // The maximum unsigned long long int value is ~1.8e19: 00550 } 00551 } 00552 #else 00553 // CPU version (not using CUDA intrinsics: atomicAdd, fast type casting) 00554 rotated_position = detector_data_SHARED->rot_inv[0]*position->x + detector_data_SHARED->rot_inv[1]*position->y + detector_data_SHARED->rot_inv[2]*position->z; // X coordinate 00555 00556 float pixel_coord_x = floor((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.x)*detector_data_SHARED->inv_pixel_size_X); // Using float+floor instead of INT to avoid truncation errors for positive and negative values 00557 if ( (pixel_coord_x>-0.1f) && (pixel_coord_x<(detector_data_SHARED->num_pixels.x-0.1f)) ) // Rejecting values negative or bigger than the image size 00558 { 00559 rotated_position = detector_data_SHARED->rot_inv[6]*position->x + detector_data_SHARED->rot_inv[7]*position->y + detector_data_SHARED->rot_inv[8]*position->z; // Z coordinate 00560 float pixel_coord_z = floor((rotated_position - detector_data_SHARED->corner_min_rotated_to_Y.z)*detector_data_SHARED->inv_pixel_size_Z); 00561 if ( (pixel_coord_z>-0.1f) && (pixel_coord_z<(detector_data_SHARED->num_pixels.y-0.1f)) ) 00562 image[(int)(((float)*scatter_state)*detector_data_SHARED->total_num_pixels + pixel_coord_x + pixel_coord_z*detector_data_SHARED->num_pixels.x + 0.0001f)] 00563 += (unsigned long long int)((*energy)*SCALE_eV + 0.5f); // Tally the particle energy in the pixel. This instruction is not thread-safe, but it is ok in sequential CPU code. 00564 } 00565 #endif 00566 } 00567 else // (detector_data_SHARED->rotation_flag != 1) --> Initial source direction is (0,1,0): pixel number and distance can be found easily 00568 { 00569 if (direction->y < 0.0001f) 00570 return; // *** Reject particles not moving towards the detector plane at +Y. 00571 00572 dist_detector = (detector_data_SHARED->center.y - position->y)/(direction->y); // Distance to the intersection with the detector at +Y. 00573 00574 // !!DeBuG!! IF below (v1.2) not needed when checking the angle 00575 // if (dist_detector>(2.1f*detector_data_SHARED->sdd)) return; 00576 00577 00578 #ifdef USING_CUDA 00579 int pixel_coord_x = __float2int_rd((position->x + dist_detector*direction->x - detector_data_SHARED->corner_min_rotated_to_Y.x)*detector_data_SHARED->inv_pixel_size_X); 00580 if ((pixel_coord_x>-1)&&(pixel_coord_x<detector_data_SHARED->num_pixels.x)) 00581 { 00582 int pixel_coord_z = __float2int_rd((position->z + dist_detector*direction->z - detector_data_SHARED->corner_min_rotated_to_Y.z)*detector_data_SHARED->inv_pixel_size_Z); 00583 if ((pixel_coord_z>-1)&&(pixel_coord_z<detector_data_SHARED->num_pixels.y)) 00584 atomicAdd( ( image + // Pointer to beginning of image array 00585 (int)(*scatter_state) * detector_data_SHARED->total_num_pixels + // Offset to corresponding scatter image 00586 (pixel_coord_x + pixel_coord_z*(detector_data_SHARED->num_pixels.x)) ), // Offset to the corresponding pixel 00587 __float2ull_rn((*energy)*SCALE_eV) ); // Energy arriving at the pixel, scaled by the factor SCALE_eV and rounded. 00588 } 00589 #else 00590 00591 // --Calculate the pixel the xray enters, truncating towards minus infinite and making sure the conversion to int is safe: 00592 float pixel_coord_x = floor((position->x + dist_detector*direction->x - detector_data_SHARED->corner_min_rotated_to_Y.x)*detector_data_SHARED->inv_pixel_size_X); 00593 00594 if ( (pixel_coord_x>-0.1f) && (pixel_coord_x<(detector_data_SHARED->num_pixels.x-0.1f)) ) 00595 { 00596 float pixel_coord_z = floor((position->z + dist_detector*direction->z - detector_data_SHARED->corner_min_rotated_to_Y.z)*detector_data_SHARED->inv_pixel_size_Z); 00597 if ( (pixel_coord_z>-0.1f) && (pixel_coord_z<(detector_data_SHARED->num_pixels.y-0.1f)) ) 00598 image[(int)(((float)*scatter_state)*detector_data_SHARED->total_num_pixels + pixel_coord_x + pixel_coord_z*detector_data_SHARED->num_pixels.x + 0.0001f)] 00599 += (unsigned long long int)((*energy)*SCALE_eV + 0.5f); // Truncate the pixel number to INT and round the energy value 00600 } 00601 #endif 00602 } 00603 00604 } 00605 00606 00607 00608 //////////////////////////////////////////////////////////////////////////////// 00609 //! Source that creates primary x rays, according to the defined source model. 00610 //! The particles are automatically moved to the surface of the voxel bounding box, 00611 //! to start the tracking inside a real material. If the sampled particle do not 00612 //! enter the voxels, it is init in the focal spot and the main program will check 00613 //! if it arrives at the detector or not. 00614 //! 00615 //! @param[in] source_data Structure describing the source. 00616 //! @param[in] source_energy_data_CONST Global variable in constant memory space describing the source energy spectrum. 00617 //! @param[out] position Initial particle position (particle transported inside the voxel bbox). 00618 //! @param[out] direction Sampled particle direction (cosine vectors). 00619 //! @param[out] energy Sampled energy of the new x ray. 00620 //! @param[in] seed Current seed of the random number generator, requiered to sample the movement direction. 00621 //! @param[out] absvox Set to <0 if primary particle will not cross the voxels, not changed otherwise (>0). 00622 //////////////////////////////////////////////////////////////////////////////// 00623 #ifdef USING_CUDA 00624 __device__ 00625 #endif 00626 inline void source(float3* position, float3* direction, float* energy, int2* seed, int* absvox, struct source_struct* source_data_SHARED, struct detector_struct* detector_data_SHARED) 00627 { 00628 // *** Sample the initial x-ray energy following the input energy spectrum using the Walker aliasing algorithm from PENELOPE: 00629 // The following code is equivalent to calling the function "seeki_walker": int sampled_bin = seeki_walker(source_data_CONST.espc_cutoff, source_data_CONST.espc_alias, ranecu(seed), source_data_CONST.num_bins_espc); 00630 int sampled_bin; 00631 float RN = ranecu(seed) * source_energy_data_CONST.num_bins_espc; // Find initial interval (array starting at 0): 00632 #ifdef USING_CUDA 00633 int int_part = __float2int_rd(RN); // -- Integer part (round down) 00634 #else 00635 int int_part = (int)(RN); 00636 #endif 00637 float fraction_part = RN - ((float)int_part); // -- Fractional part 00638 if (fraction_part < source_energy_data_CONST.espc_cutoff[int_part]) // Check if we are in the aliased part 00639 sampled_bin = int_part; // Below the cutoff: return current value 00640 else 00641 sampled_bin = (int)source_energy_data_CONST.espc_alias[int_part]; // Above the cutoff: return alias 00642 00643 // Linear interpolation of the final energy within the sampled energy bin: 00644 *energy = source_energy_data_CONST.espc[sampled_bin] + ranecu(seed) * (source_energy_data_CONST.espc[sampled_bin+1] - source_energy_data_CONST.espc[sampled_bin]); 00645 00646 00647 // *** Sample the initial direction: 00648 00649 do // Iterate sampling if the sampled direction is not acceptable to get a square field at the given phi (rejection sampling): force square field for any phi!! 00650 { 00651 // Using the algorithm used in PENMAIN.f, from penelope 2008 (by F. Salvat). 00652 direction->z = source_data_SHARED->cos_theta_low + ranecu(seed)*source_data_SHARED->D_cos_theta; // direction->z = w = cos(theta_sampled) 00653 register float phi_sampled = source_data_SHARED->phi_low + ranecu(seed)*source_data_SHARED->D_phi; 00654 register float sin_theta_sampled = sqrtf(1.0f - direction->z*direction->z); 00655 float sinphi_sampled, cosphi_sampled; 00656 00657 #ifdef USING_CUDA 00658 sincos(phi_sampled, &sinphi_sampled,&cosphi_sampled); // Calculate the SIN and COS at the same time. 00659 #else 00660 sinphi_sampled = sin(phi_sampled); // Some CPU compilers will be able to use "sincos", but let's be safe. 00661 cosphi_sampled = cos(phi_sampled); 00662 #endif 00663 00664 direction->y = sin_theta_sampled * sinphi_sampled; 00665 direction->x = sin_theta_sampled * cosphi_sampled; 00666 } 00667 while( fabsf(direction->z/(direction->y+1.0e-7f)) > source_data_SHARED->max_height_at_y1cm ); // !!DeBuG!! Force square field for any phi by rejection sampling!! Is it necessary to use the "+1.0e-7f" to prevent possible division by zero??? 00668 00669 00670 if (detector_data_SHARED->rotation_flag == 1) 00671 { 00672 // --Initial beam not pointing to (0,1,0), apply rotation: 00673 register float direction_x_tmp = direction->x; 00674 register float direction_y_tmp = direction->y; 00675 direction->x = source_data_SHARED->rot_fan[0]*direction_x_tmp + source_data_SHARED->rot_fan[1]*direction_y_tmp + source_data_SHARED->rot_fan[2]*direction->z; 00676 direction->y = source_data_SHARED->rot_fan[3]*direction_x_tmp + source_data_SHARED->rot_fan[4]*direction_y_tmp + source_data_SHARED->rot_fan[5]*direction->z; 00677 direction->z = source_data_SHARED->rot_fan[6]*direction_x_tmp + source_data_SHARED->rot_fan[7]*direction_y_tmp + source_data_SHARED->rot_fan[8]*direction->z; 00678 } 00679 00680 // Initialize x ray position at focal spot before translation into bbox. Particle stays in focal spot if no interaction found: 00681 position->x = source_data_SHARED->position.x; 00682 position->y = source_data_SHARED->position.y; 00683 position->z = source_data_SHARED->position.z; 00684 00685 move_to_bbox(position, direction, voxel_data_CONST.size_bbox, absvox); // Move the particle inside the voxel bounding box. 00686 } 00687 00688 00689 00690 //////////////////////////////////////////////////////////////////////////////// 00691 //! Functions that moves a particle inside the voxelized geometry bounding box. 00692 //! An EPSILON distance is added to make sure the particles will be clearly inside the bbox, 00693 //! not exactly on the surface. 00694 //! 00695 //! This algorithm makes the following assumtions: 00696 //! - The back lower vertex of the voxel bounding box is always located at the origin: (x0,y0,z0)=(0,0,0). 00697 //! - The initial value of "position" corresponds to the focal spot location. 00698 //! - When a ray is not pointing towards the bbox plane that it should cross according to the sign of the direction, 00699 //! I assign a distance to the intersection =0 instead of the real negative distance. The wall that will be 00700 //! crossed to enter the bbox is always the furthest and therefore a 0 distance will never be used except 00701 //! in the case of a ray starting inside the bbox or outside the bbox and not pointing to any of the 3 planes. 00702 //! In this situation the ray will be transported a 0 distance, meaning that it will stay at the focal spot. 00703 //! 00704 //! (Interesting information on ray-box intersection: http://tog.acm.org/resources/GraphicsGems/gems/RayBox.c) 00705 //! 00706 //! @param[in,out] position Particle position: initially set to the focal spot, returned transported inside the voxel bbox. 00707 //! @param[out] direction Sampled particle direction (cosine vectors). 00708 //! @param[out] intersection_flag Set to <0 if particle outside bbox and will not cross the voxels, not changed otherwise. 00709 //! @param[out] size_bbox Size of the bounding box. 00710 //////////////////////////////////////////////////////////////////////////////// 00711 #ifdef USING_CUDA 00712 __device__ 00713 #endif 00714 inline void move_to_bbox(float3* position, float3* direction, float3 size_bbox, int* intersection_flag) 00715 { 00716 float dist_y, dist_x, dist_z; 00717 00718 // -Distance to the nearest Y plane: 00719 if ((direction->y) > EPS_SOURCE) // Moving to +Y: check distance to y=0 plane 00720 { 00721 // Check Y=0 (bbox wall): 00722 if (position->y > 0.0f) // The input position must correspond to the focal spot => position->y == source_data_CONST.position[*num_p].y 00723 dist_y = 0.0f; // No intersection with this plane: particle inside or past the box 00724 // The actual distance would be negative but we set it to 0 bc we will not move the particle if no intersection exist. 00725 else 00726 dist_y = EPS_SOURCE + (-position->y)/(direction->y); // dist_y > 0 for sure in this case 00727 } 00728 else if ((direction->y) < NEG_EPS_SOURCE) 00729 { 00730 // Check Y=voxel_data_CONST.size_bbox.y: 00731 if (position->y < size_bbox.y) 00732 dist_y = 0.0f; // No intersection with this plane 00733 else 00734 dist_y = EPS_SOURCE + (size_bbox.y - position->y)/(direction->y); // dist_y > 0 for sure in this case 00735 } 00736 else // (direction->y)~0 00737 dist_y = NEG_INF; // Particle moving parallel to the plane: no interaction possible (set impossible negative dist = -INFINITE) 00738 00739 // -Distance to the nearest X plane: 00740 if ((direction->x) > EPS_SOURCE) 00741 { 00742 // Check X=0: 00743 if (position->x > 0.0f) 00744 dist_x = 0.0f; 00745 else 00746 dist_x = EPS_SOURCE + (-position->x)/(direction->x); // dist_x > 0 for sure in this case 00747 } 00748 else if ((direction->x) < NEG_EPS_SOURCE) 00749 { 00750 // Check X=voxel_data_CONST.size_bbox.x: 00751 if (position->x < size_bbox.x) 00752 dist_x = 0.0f; 00753 else 00754 dist_x = EPS_SOURCE + (size_bbox.x - position->x)/(direction->x); // dist_x > 0 for sure in this case 00755 } 00756 else 00757 dist_x = NEG_INF; 00758 00759 // -Distance to the nearest Z plane: 00760 if ((direction->z) > EPS_SOURCE) 00761 { 00762 // Check Z=0: 00763 if (position->z > 0.0f) 00764 dist_z = 0.0f; 00765 else 00766 dist_z = EPS_SOURCE + (-position->z)/(direction->z); // dist_z > 0 for sure in this case 00767 } 00768 else if ((direction->z) < NEG_EPS_SOURCE) 00769 { 00770 // Check Z=voxel_data_CONST.size_bbox.z: 00771 if (position->z < size_bbox.z) 00772 dist_z = 0.0f; 00773 else 00774 dist_z = EPS_SOURCE + (size_bbox.z - position->z)/(direction->z); // dist_z > 0 for sure in this case 00775 } 00776 else 00777 dist_z = NEG_INF; 00778 00779 00780 // -- Find the longest distance plane, which is the one that has to be crossed to enter the bbox. 00781 // Storing the maximum distance in variable "dist_z". Distance will be =0 if no intersection exists or 00782 // if the x ray is already inside the bbox. 00783 if ( (dist_y>dist_x) && (dist_y>dist_z) ) 00784 dist_z = dist_y; // dist_z == dist_max 00785 else if (dist_x>dist_z) 00786 dist_z = dist_x; 00787 // else 00788 // dist_max = dist_z; 00789 00790 // -- Move particle from the focal spot (current location) to the bbox wall surface (slightly inside): 00791 position->x += dist_z * direction->x; 00792 position->y += dist_z * direction->y; 00793 position->z += dist_z * direction->z; 00794 00795 // Check if the new position is outside the bbox. If true, the particle must be moved back to the focal spot location: 00796 if ( (position->x < 0.0f) || (position->x > size_bbox.x) || 00797 (position->y < 0.0f) || (position->y > size_bbox.y) || 00798 (position->z < 0.0f) || (position->z > size_bbox.z) ) 00799 { 00800 position->x -= dist_z * direction->x; // Reject new position undoing the previous translation 00801 position->y -= dist_z * direction->y; 00802 position->z -= dist_z * direction->z; 00803 (*intersection_flag) = -111; // Particle outside the bbox AND not pointing to the bbox: set absvox<0 to skip interaction sampling. 00804 } 00805 } 00806 00807 00808 //////////////////////////////////////////////////////////////////////////////// 00809 00810 00811 //! Upper limit of the number of random values sampled in a single track. 00812 #define LEAP_DISTANCE 256 00813 //! Multipliers and moduli for the two MLCG in RANECU. 00814 #define a1_RANECU 40014 00815 #define m1_RANECU 2147483563 00816 #define a2_RANECU 40692 00817 #define m2_RANECU 2147483399 00818 //////////////////////////////////////////////////////////////////////////////// 00819 //! Initialize the pseudo-random number generator (PRNG) RANECU to a position 00820 //! far away from the previous history (leap frog technique). 00821 //! 00822 //! Each calculated seed initiates a consecutive and disjoint sequence of 00823 //! pseudo-random numbers with length LEAP_DISTANCE, that can be used to 00824 //! in a parallel simulation (Sequence Splitting parallelization method). 00825 //! The basic equation behind the algorithm is: 00826 //! S(i+j) = (a**j * S(i)) MOD m = [(a**j MOD m)*S(i)] MOD m , 00827 //! which is described in: 00828 //! P L'Ecuyer, Commun. ACM 31 (1988) p.742 00829 //! 00830 //! This function has been adapted from "seedsMLCG.f", see: 00831 //! A Badal and J Sempau, Computer Physics Communications 175 (2006) p. 440-450 00832 //! 00833 //! @param[in] history Particle bach number. 00834 //! @param[in] seed_input Initial PRNG seed input (used to initiate both MLCGs in RANECU). 00835 //! @param[out] seed Initial PRNG seeds for the present history. 00836 //! 00837 //////////////////////////////////////////////////////////////////////////////// 00838 #ifdef USING_CUDA 00839 __device__ 00840 #endif 00841 inline void init_PRNG(int history_batch, int histories_per_thread, int seed_input, int2* seed) 00842 { 00843 // -- Move the RANECU generator to a unique position for the current batch of histories: 00844 // I have to use an "unsigned long long int" value to represent all the simulated histories in all previous batches 00845 // The maximum unsigned long long int value is ~1.8e19: if history >1.8e16 and LEAP_DISTANCE==1000, 'leap' will overflow. 00846 // **** 1st MLCG: 00847 unsigned long long int leap = ((unsigned long long int)(history_batch+1))*(histories_per_thread*LEAP_DISTANCE); 00848 int y = 1; 00849 int z = a1_RANECU; 00850 // -- Calculate the modulo power '(a^leap)MOD(m)' using a divide-and-conquer algorithm adapted to modulo arithmetic 00851 for(;;) 00852 { 00853 // (A2) Halve n, and store the integer part and the residue 00854 if (0!=(leap&01)) // (bit-wise operation for MOD(leap,2), or leap%2 ==> proceed if leap is an odd number) Equivalent: t=(short)(leap%2); 00855 { 00856 leap >>= 1; // Halve n moving the bits 1 position right. Equivalent to: leap=(leap/2); 00857 y = abMODm(m1_RANECU,z,y); // (A3) Multiply y by z: y = [z*y] MOD m 00858 if (0==leap) break; // (A4) leap==0? ==> finish 00859 } 00860 else // (leap is even) 00861 { 00862 leap>>= 1; // Halve leap moving the bits 1 position right. Equivalent to: leap=(leap/2); 00863 } 00864 z = abMODm(m1_RANECU,z,z); // (A5) Square z: z = [z*z] MOD m 00865 } 00866 // AjMODm1 = y; // Exponentiation finished: AjMODm = expMOD = y = a^j 00867 00868 // -- Compute and display the seeds S(i+j), from the present seed S(i), using the previously calculated value of (a^j)MOD(m): 00869 // S(i+j) = [(a**j MOD m)*S(i)] MOD m 00870 // S_i = abMODm(m,S_i,AjMODm) 00871 seed->x = abMODm(m1_RANECU, seed_input, y); // Using the input seed as the starting seed 00872 00873 // **** 2nd MLCG (repeating the previous calculation for the 2nd MLCG parameters): 00874 leap = ((unsigned long long int)(history_batch+1))*(histories_per_thread*LEAP_DISTANCE); 00875 y = 1; 00876 z = a2_RANECU; 00877 for(;;) 00878 { 00879 // (A2) Halve n, and store the integer part and the residue 00880 if (0!=(leap&01)) // (bit-wise operation for MOD(leap,2), or leap%2 ==> proceed if leap is an odd number) Equivalent: t=(short)(leap%2); 00881 { 00882 leap >>= 1; // Halve n moving the bits 1 position right. Equivalent to: leap=(leap/2); 00883 y = abMODm(m2_RANECU,z,y); // (A3) Multiply y by z: y = [z*y] MOD m 00884 if (0==leap) break; // (A4) leap==0? ==> finish 00885 } 00886 else // (leap is even) 00887 { 00888 leap>>= 1; // Halve leap moving the bits 1 position right. Equivalent to: leap=(leap/2); 00889 } 00890 z = abMODm(m2_RANECU,z,z); // (A5) Square z: z = [z*z] MOD m 00891 } 00892 // AjMODm2 = y; 00893 seed->y = abMODm(m2_RANECU, seed_input, y); // Using the input seed as the starting seed 00894 } 00895 00896 00897 00898 ///////////////////////////////////////////////////////////////////// 00899 //! Calculate "(a1*a2) MOD m" with 32-bit integers and avoiding 00900 //! the possible overflow, using the Russian Peasant approach 00901 //! modulo m and the approximate factoring method, as described 00902 //! in: L'Ecuyer and Cote, ACM Trans. Math. Soft. 17 (1991). 00903 //! 00904 //! This function has been adapted from "seedsMLCG.f", see: 00905 //! Badal and Sempau, Computer Physics Communications 175 (2006) 00906 //! 00907 //! @param[in] m,a,s MLCG parameters 00908 //! @return (a1*a2) MOD m 00909 // 00910 // Input: 0 < a1 < m 00911 // 0 < a2 < m 00912 // 00913 // Return value: (a1*a2) MOD m 00914 // 00915 ///////////////////////////////////////////////////////////////////// 00916 #ifdef USING_CUDA 00917 __device__ __host__ // Function will be callable from host and also from device 00918 #endif 00919 inline int abMODm(int m, int a, int s) 00920 { 00921 // CAUTION: the input parameters are modified in the function but should not be returned to the calling function! (pass by value!) 00922 int q, k; 00923 int p = -m; // p is always negative to avoid overflow when adding 00924 00925 // ** Apply the Russian peasant method until "a =< 32768": 00926 while (a>32768) // We assume '32' bit integers (4 bytes): 2^(('32'-2)/2) = 32768 00927 { 00928 if (0!=(a&1)) // Store 's' when 'a' is odd Equivalent code: if (1==(a%2)) 00929 { 00930 p += s; 00931 if (p>0) p -= m; 00932 } 00933 a >>= 1; // Half a (move bits 1 position right) Equivalent code: a = a/2; 00934 s = (s-m) + s; // Double s (MOD m) 00935 if (s<0) s += m; // (s is always positive) 00936 } 00937 00938 // ** Employ the approximate factoring method (a is small enough to avoid overflow): 00939 q = (int) m / a; 00940 k = (int) s / q; 00941 s = a*(s-k*q)-k*(m-q*a); 00942 while (s<0) 00943 s += m; 00944 00945 // ** Compute the final result: 00946 p += s; 00947 if (p<0) p += m; 00948 00949 return p; 00950 } 00951 00952 00953 00954 //////////////////////////////////////////////////////////////////////////////// 00955 //! Pseudo-random number generator (PRNG) RANECU returning a float value 00956 //! (single precision version). 00957 //! 00958 //! @param[in,out] seed PRNG seed (seed kept in the calling function and updated here). 00959 //! @return PRN double value in the open interval (0,1) 00960 //! 00961 //////////////////////////////////////////////////////////////////////////////// 00962 #ifdef USING_CUDA 00963 __device__ 00964 #endif 00965 inline float ranecu(int2* seed) 00966 { 00967 int i1 = (int)(seed->x/53668); 00968 seed->x = 40014*(seed->x-i1*53668)-i1*12211; 00969 00970 int i2 = (int)(seed->y/52774); 00971 seed->y = 40692*(seed->y-i2*52774)-i2*3791; 00972 00973 if (seed->x < 0) seed->x += 2147483563; 00974 if (seed->y < 0) seed->y += 2147483399; 00975 00976 i2 = seed->x-seed->y; 00977 if (i2 < 1) i2 += 2147483562; 00978 00979 00980 #ifdef USING_CUDA 00981 return (__int2float_rn(i2)*4.65661305739e-10f); // 4.65661305739e-10 == 1/2147483563 00982 #else 00983 return ((float)(i2)*4.65661305739e-10f); 00984 #endif 00985 00986 } 00987 00988 00989 //////////////////////////////////////////////////////////////////////////////// 00990 //! Pseudo-random number generator (PRNG) RANECU returning a double value. 00991 //////////////////////////////////////////////////////////////////////////////// 00992 #ifdef USING_CUDA 00993 __device__ 00994 #endif 00995 inline double ranecu_double(int2* seed) 00996 { 00997 int i1 = (int)(seed->x/53668); 00998 seed->x = 40014*(seed->x-i1*53668)-i1*12211; 00999 01000 int i2 = (int)(seed->y/52774); 01001 seed->y = 40692*(seed->y-i2*52774)-i2*3791; 01002 01003 if (seed->x < 0) seed->x += 2147483563; 01004 if (seed->y < 0) seed->y += 2147483399; 01005 01006 i2 = seed->x-seed->y; 01007 if (i2 < 1) i2 += 2147483562; 01008 01009 #ifdef USING_CUDA 01010 return (__int2double_rn(i2)*4.6566130573917692e-10); 01011 #else 01012 return ((double)(i2)*4.6566130573917692e-10); 01013 #endif 01014 01015 } 01016 01017 01018 01019 //////////////////////////////////////////////////////////////////////////////// 01020 //! Find the voxel that contains the current position. 01021 //! Report the voxel absolute index and the x,y,z indices. 01022 //! The structure containing the voxel number and size is read from CONSTANT memory. 01023 //! 01024 //! @param[in] position Particle position 01025 //! @param[out] voxel_coord Pointer to three integer values (short3*) that will store the x,y and z voxel indices. 01026 //! @return Returns "absvox", the voxel number where the particle is 01027 //! located (negative if position outside the voxel bbox). 01028 //! 01029 //////////////////////////////////////////////////////////////////////////////// 01030 #ifdef USING_CUDA 01031 __device__ 01032 #endif 01033 inline int locate_voxel(float3* position, short3* voxel_coord) 01034 { 01035 01036 if ( (position->y < EPS_SOURCE) || (position->y > (voxel_data_CONST.size_bbox.y - EPS_SOURCE)) || 01037 (position->x < EPS_SOURCE) || (position->x > (voxel_data_CONST.size_bbox.x - EPS_SOURCE)) || 01038 (position->z < EPS_SOURCE) || (position->z > (voxel_data_CONST.size_bbox.z - EPS_SOURCE)) ) 01039 { 01040 // -- Particle escaped the voxelized geometry (using EPS_SOURCE to avoid numerical precision errors): 01041 return -1; 01042 } 01043 01044 // -- Particle inside the voxelized geometry, find current voxel: 01045 // The truncation from float to integer could give troubles for negative coordinates but this will never happen thanks to the IF at the begining of this function. 01046 // (no need to use the CUDA function to convert float to integer rounding down (towards minus infinite): __float2int_rd) 01047 01048 register int voxel_coord_x, voxel_coord_y, voxel_coord_z; 01049 #ifdef USING_CUDA 01050 voxel_coord_x = __float2int_rd(position->x * voxel_data_CONST.inv_voxel_size.x); 01051 voxel_coord_y = __float2int_rd(position->y * voxel_data_CONST.inv_voxel_size.y); 01052 voxel_coord_z = __float2int_rd(position->z * voxel_data_CONST.inv_voxel_size.z); 01053 #else 01054 voxel_coord_x = (int)(position->x * voxel_data_CONST.inv_voxel_size.x); 01055 voxel_coord_y = (int)(position->y * voxel_data_CONST.inv_voxel_size.y); 01056 voxel_coord_z = (int)(position->z * voxel_data_CONST.inv_voxel_size.z); 01057 #endif 01058 01059 // Output the voxel coordinates as short int (2 bytes) instead of int (4 bytes) to save registers; avoid type castings in the calculation of the return value. 01060 voxel_coord->x = (short int) voxel_coord_x; 01061 voxel_coord->y = (short int) voxel_coord_y; 01062 voxel_coord->z = (short int) voxel_coord_z; 01063 01064 return (voxel_coord_x + voxel_coord_y*(voxel_data_CONST.num_voxels.x) + voxel_coord_z*(voxel_data_CONST.num_voxels.x)*(voxel_data_CONST.num_voxels.y)); 01065 } 01066 01067 01068 01069 ////////////////////////////////////////////////////////////////////// 01070 //! Rotates a vector; the rotation is specified by giving 01071 //! the polar and azimuthal angles in the "self-frame", as 01072 //! determined by the vector to be rotated. 01073 //! This function is a literal translation from Fortran to C of 01074 //! PENELOPE (v. 2006) subroutine "DIRECT". 01075 //! 01076 //! @param[in,out] (u,v,w) input vector (=d) in the lab. frame; returns the rotated vector components in the lab. frame 01077 //! @param[in] costh cos(theta), angle between d before and after turn 01078 //! @param[in] phi azimuthal angle (rad) turned by d in its self-frame 01079 // 01080 // Output: 01081 // (u,v,w) -> rotated vector components in the lab. frame 01082 // 01083 // Comments: 01084 // -> (u,v,w) should have norm=1 on input; if not, it is 01085 // renormalized on output, provided norm>0. 01086 // -> The algorithm is based on considering the turned vector 01087 // d' expressed in the self-frame S', 01088 // d' = (sin(th)cos(ph), sin(th)sin(ph), cos(th)) 01089 // and then apply a change of frame from S' to the lab 01090 // frame. S' is defined as having its z' axis coincident 01091 // with d, its y' axis perpendicular to z and z' and its 01092 // x' axis equal to y'*z'. The matrix of the change is then 01093 // / uv/rho -v/rho u \ 01094 // S ->lab: | vw/rho u/rho v | , rho=(u^2+v^2)^0.5 01095 // \ -rho 0 w / 01096 // -> When rho=0 (w=1 or -1) z and z' are parallel and the y' 01097 // axis cannot be defined in this way. Instead y' is set to 01098 // y and therefore either x'=x (if w=1) or x'=-x (w=-1) 01099 ////////////////////////////////////////////////////////////////////// 01100 #ifdef USING_CUDA 01101 __device__ 01102 #endif 01103 inline void rotate_double(float3* direction, double costh, double phi) // !!DeBuG!! The direction vector is single precision but the rotation is performed in doule precision for increased accuracy. 01104 { 01105 double DXY, NORM, cosphi, sinphi, SDT; 01106 DXY = direction->x*direction->x + direction->y*direction->y; 01107 01108 #ifdef USING_CUDA 01109 sincos(phi, &sinphi,&cosphi); // Calculate the SIN and COS at the same time. 01110 #else 01111 sinphi = sin(phi); // Some CPU compilers will be able to use "sincos", but let's be safe. 01112 cosphi = cos(phi); 01113 #endif 01114 01115 // **** Ensure normalisation 01116 NORM = DXY + direction->z*direction->z; // !!DeBuG!! Check if it is really necessary to renormalize in a real simulation!! 01117 if (fabs(NORM-1.0)>1.0e-14) 01118 { 01119 NORM = 1.0/sqrt(NORM); 01120 direction->x = NORM*direction->x; 01121 direction->y = NORM*direction->y; 01122 direction->z = NORM*direction->z; 01123 DXY = direction->x*direction->x + direction->y*direction->y; 01124 } 01125 if (DXY>1.0e-28) 01126 { 01127 SDT = sqrt((1.0-costh*costh)/DXY); 01128 float direction_x_in = direction->x; 01129 direction->x = direction->x*costh + SDT*(direction_x_in*direction->z*cosphi-direction->y*sinphi); 01130 direction->y = direction->y*costh+SDT*(direction->y*direction->z*cosphi+direction_x_in*sinphi); 01131 direction->z = direction->z*costh-DXY*SDT*cosphi; 01132 } 01133 else 01134 { 01135 SDT = sqrt(1.0-costh*costh); 01136 direction->y = SDT*sinphi; 01137 if (direction->z>0.0) 01138 { 01139 direction->x = SDT*cosphi; 01140 direction->z = costh; 01141 } 01142 else 01143 { 01144 direction->x =-SDT*cosphi; 01145 direction->z =-costh; 01146 } 01147 } 01148 } 01149 01150 01151 ////////////////////////////////////////////////////////////////////// 01152 01153 01154 // *********************************************************************** 01155 // * Translation of PENELOPE's "SUBROUTINE GRAa" from FORTRAN77 to C * 01156 // *********************************************************************** 01157 //! Sample a Rayleigh interaction using the sampling algorithm 01158 //! used in PENELOPE 2006. 01159 //! 01160 //! @param[in] energy Particle energy (not modified with Rayleigh) 01161 //! @param[out] costh_Rayleigh Cosine of the angular deflection 01162 //! @param[in] material Current voxel material 01163 // 01164 // CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 01165 // C PENELOPE/PENGEOM (version 2006) C 01166 // C Copyright (c) 2001-2006 C 01167 // C Universitat de Barcelona C 01168 // C Permission to use, copy, modify, distribute and sell this software C 01169 // C and its documentation for any purpose is hereby granted without C 01170 // C fee, provided that the above copyright notice appears in all C 01171 // C copies and that both that copyright notice and this permission C 01172 // C notice appear in all supporting documentation. The Universitat de C 01173 // C Barcelona makes no representations about the suitability of this C 01174 // C software for any purpose. It is provided "as is" without express C 01175 // C or implied warranty. C 01176 // CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 01177 ////////////////////////////////////////////////////////////////////// 01178 #ifdef USING_CUDA 01179 __device__ 01180 #endif 01181 inline void GRAa(float *energy, double *costh_Rayleigh, int *mat, float *pmax_current, int2 *seed, struct rayleigh_struct* cgra) 01182 { 01183 /* **** Energy grid and interpolation constants for the current energy. */ 01184 double xmax = ((double)*energy) * 8.065535669099010e-5; // 8.065535669099010e-5 == 2.0*20.6074/510998.918 01185 double x2max = min_value( (xmax*xmax) , ((double)cgra->xco[(*mat+1)*NP_RAYLEIGH - 1]) ); // Get the last tabulated value of xco for this mat 01186 01187 if (xmax < 0.01) 01188 { 01189 do 01190 { 01191 *costh_Rayleigh = 1.0 - ranecu_double(seed) * 2.0; 01192 } 01193 while ( ranecu_double(seed) > (((*costh_Rayleigh)*(*costh_Rayleigh)+1.0)*0.5) ); 01194 return; 01195 } 01196 01197 for(;;) // (Loop will iterate everytime the sampled value is rejected or above maximum) 01198 { 01199 double ru = ranecu_double(seed) * (double)(*pmax_current); // Pmax for the current energy is entered as a parameter 01200 01201 /* **** Selection of the interval (binary search within pre-calculated limits). */ 01202 int itn = (int)(ru * (NP_RAYLEIGH-1)); // 'itn' will never reach the last interval 'NP_RAYLEIGH-1', but this is how RITA is implemented in PENELOPE 01203 int i__ = (int)cgra->itlco[itn + (*mat)*NP_RAYLEIGH]; 01204 int j = (int)cgra->ituco[itn + (*mat)*NP_RAYLEIGH]; 01205 01206 if ((j - i__) > 1) 01207 { 01208 do 01209 { 01210 register int k = (i__ + j)>>1; // >>1 == /2 01211 if (ru > cgra->pco[k -1 + (*mat)*NP_RAYLEIGH]) 01212 i__ = k; 01213 else 01214 j = k; 01215 } 01216 while ((j - i__) > 1); 01217 } 01218 01219 /* **** Sampling from the rational inverse cumulative distribution. */ 01220 int index = i__ - 1 + (*mat)*NP_RAYLEIGH; 01221 01222 double rr = ru - cgra->pco[index]; 01223 double xx; 01224 if (rr > 1e-16) 01225 { 01226 double d__ = (double)(cgra->pco[index+1] - cgra->pco[index]); 01227 float aco_index = cgra->aco[index], bco_index = cgra->bco[index], xco_index = cgra->xco[index]; // Avoid multiple accesses to the same global variable 01228 01229 xx = (double)xco_index + (double)(aco_index + 1.0f + bco_index)* d__* rr / (d__*d__ + (aco_index*d__ + bco_index*rr) * rr) * (double)(cgra->xco[index+1] - xco_index); 01230 01231 } 01232 else 01233 { 01234 xx = cgra->xco[index]; 01235 } 01236 01237 if (xx < x2max) 01238 { 01239 // Sampled value below maximum possible value: 01240 *costh_Rayleigh = 1.0 - 2.0 * xx / x2max; // !!DeBuG!! costh_Rayleigh in double precision, but not all intermediate steps are!? 01241 /* **** Rejection: */ 01242 if (ranecu_double(seed) < (((*costh_Rayleigh)*(*costh_Rayleigh) + 1.0)*0.5)) 01243 break; // Sample value not rejected! break loop and return. 01244 } 01245 } 01246 } /* graa */ 01247 01248 01249 01250 ////////////////////////////////////////////////////////////////////////// 01251 01252 01253 // *********************************************************************** 01254 // * Translation of PENELOPE's "SUBROUTINE GCOa" from FORTRAN77 to C * 01255 // ********************************************************************* * 01256 //! Random sampling of incoherent (Compton) scattering of photons, using 01257 //! the sampling algorithm from PENELOPE 2006: 01258 //! Relativistic impulse approximation with analytical one-electron Compton profiles 01259 01260 // !!DeBuG!! In penelope, Doppler broadening is not used for E greater than 5 MeV. 01261 // We don't use it in GPU to reduce the lines of code and prevent using COMMON/compos/ZT(M) 01262 01263 //! @param[in,out] energy incident and final photon energy (eV) 01264 //! @param[out] costh_Compton cosine of the polar scattering angle 01265 //! @param[in] material Current voxel material 01266 //! @param[in] seed RANECU PRNG seed 01267 // 01268 // CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 01269 // C PENELOPE/PENGEOM (version 2006) C 01270 // C Copyright (c) 2001-2006 C 01271 // C Universitat de Barcelona C 01272 // C Permission to use, copy, modify, distribute and sell this software C 01273 // C and its documentation for any purpose is hereby granted without C 01274 // C fee, provided that the above copyright notice appears in all C 01275 // C copies and that both that copyright notice and this permission C 01276 // C notice appear in all supporting documentation. The Universitat de C 01277 // C Barcelona makes no representations about the suitability of this C 01278 // C software for any purpose. It is provided "as is" without express C 01279 // C or implied warranty. C 01280 // CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 01281 // 01282 // ************************************************************************ 01283 01284 #ifdef USING_CUDA 01285 __device__ 01286 #endif 01287 inline void GCOa(float *energy, double *costh_Compton, int *mat, int2 *seed, struct compton_struct* cgco_SHARED) 01288 { 01289 float s, a1, s0, af, ek, ek2, ek3, tau, pzomc, taumin; 01290 float rn[MAX_SHELLS]; 01291 double cdt1; 01292 01293 // Some variables used in PENELOPE have been eliminated to save register: float aux, taum2, fpzmax, a, a2, ek1 ,rni, xqc, fpz, pac[MAX_SHELLS]; 01294 01295 int i__; 01296 int my_noscco = cgco_SHARED->noscco[*mat]; // Store the number of oscillators for the input material in a local variable 01297 01298 #ifndef USING_CUDA 01299 static int warning_flag_1 = -1, warning_flag_2 = -1, warning_flag_3 = -1; // Write warnings for the CPU code, but only once. !!DeBuG!! 01300 #endif 01301 01302 ek = *energy * 1.956951306108245e-6f; // (1.956951306108245e-6 == 1.0/510998.918) 01303 ek2 = ek * 2.f + 1.f; 01304 ek3 = ek * ek; 01305 // ek1 = ek3 - ek2 - 1.; 01306 taumin = 1.f / ek2; 01307 // taum2 = taumin * taumin; 01308 a1 = logf(ek2); 01309 // a2 = a1 + ek * 2. * (ek + 1.) * taum2; // a2 was used only once, code moved below 01310 01311 01312 /* **** Incoherent scattering function for theta=PI. */ 01313 01314 s0 = 0.0f; 01315 for (i__ = 0; i__ < my_noscco; i__++) 01316 { 01317 register float temp = cgco_SHARED->uico[*mat + i__*MAX_MATERIALS]; 01318 if (temp < *energy) 01319 { 01320 register float aux = *energy * (*energy - temp) * 2.f; 01321 #ifdef USING_CUDA 01322 pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) * rsqrtf(aux + aux + temp * temp) * 1.956951306108245e-6f; 01323 // 1.956951306108245e-6 = 1.0/510998.918f // Version using the reciprocal of sqrt in CUDA: faster and more accurate!! 01324 #else 01325 pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) / (sqrtf(aux + aux + temp * temp) * 510998.918f); 01326 #endif 01327 if (pzomc > 0.0f) 01328 temp = (0.707106781186545f+pzomc*1.4142135623731f) * (0.707106781186545f+pzomc*1.4142135623731f); 01329 else 01330 temp = (0.707106781186545f-pzomc*1.4142135623731f) * (0.707106781186545f-pzomc*1.4142135623731f); 01331 01332 temp = 0.5f * expf(0.5f - temp); // Calculate EXP outside the IF to avoid branching 01333 01334 if (pzomc > 0.0f) 01335 temp = 1.0f - temp; 01336 01337 s0 += cgco_SHARED->fco[*mat + i__*MAX_MATERIALS] * temp; 01338 } 01339 } 01340 01341 /* **** Sampling tau. */ 01342 do 01343 { 01344 if (ranecu(seed)*/*a2=*/(a1+2.*ek*(ek+1.f)*taumin*taumin) < a1) 01345 { 01346 tau = powf(taumin, ranecu(seed)); // !!DeBuG!! "powf()" has a big error (7 ULP), the double version has only 2!! 01347 } 01348 else 01349 { 01350 tau = sqrtf(1.f + ranecu(seed) * (taumin * taumin - 1.f)); 01351 } 01352 01353 cdt1 = (double)(1.f-tau) / (((double)tau)*((double)*energy)*1.956951306108245e-6); // !!DeBuG!! The sampled COS will be double precision, but TAU is not!!! 01354 01355 if (cdt1 > 2.0) cdt1 = 1.99999999; // !!DeBuG!! Make sure that precision error in POW, SQRT never gives cdt1>2 ==> costh_Compton<-1 01356 01357 /* **** Incoherent scattering function. */ 01358 s = 0.0f; 01359 for (i__ = 0; i__ < my_noscco; i__++) 01360 { 01361 register float temp = cgco_SHARED->uico[*mat + i__*MAX_MATERIALS]; 01362 if (temp < *energy) 01363 { 01364 register float aux = (*energy) * (*energy - temp) * ((float)cdt1); 01365 01366 if ((aux>1.0e-12f)||(temp>1.0e-12f)) // !!DeBuG!! Make sure the SQRT argument is never <0, and that we never get 0/0 -> NaN when aux=temp=0 !! 01367 { 01368 #ifdef USING_CUDA 01369 pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) * rsqrtf(aux + aux + temp * temp) * 1.956951306108245e-6f; 01370 // 1.956951306108245e-6 = 1.0/510998.918f // Version using the reciprocal of sqrt in CUDA: faster and more accurate!! 01371 #else 01372 pzomc = cgco_SHARED->fj0[*mat + i__*MAX_MATERIALS] * (aux - temp * 510998.918f) / (sqrtf(aux + aux + temp * temp) * 510998.918f); 01373 #endif 01374 01375 } 01376 else 01377 { 01378 pzomc = 0.002f; // !!DeBuG!! Using a rough approximation to a sample value of pzomc found using pure double precision: NOT RIGUROUS! But this code is expected to be used very seldom, only in extreme cases. 01379 #ifndef USING_CUDA 01380 if (warning_flag_1<0) 01381 { 01382 warning_flag_1 = +1; // Disable warning, do not show again 01383 printf(" [... Small numerical precision error detected computing \"pzomc\" in GCOa (this warning will not be repeated).]\n i__=%d, aux=%.14f, temp=%.14f, pzomc(forced)=%.14f, uico=%.14f, energy=%.7f, cgco_SHARED->fj0=%.14f, mat=%d, cdt1=%.14lf\n", (int)i__, aux, temp, pzomc, cgco_SHARED->uico[*mat+i__*MAX_MATERIALS], *energy, cgco_SHARED->fj0[*mat+i__*MAX_MATERIALS], (int)*mat, cdt1); // !!DeBuG!! 01384 } 01385 #endif 01386 } 01387 01388 temp = pzomc * 1.4142135623731f; 01389 if (pzomc > 0.0f) 01390 temp = 0.5f - (temp + 0.70710678118654502f) * (temp + 0.70710678118654502f); // Calculate exponential argument 01391 else 01392 temp = 0.5f - (0.70710678118654502f - temp) * (0.70710678118654502f - temp); 01393 01394 temp = 0.5f * expf(temp); // All threads will calculate the expf together 01395 01396 if (pzomc > 0.0f) 01397 temp = 1.0f - temp; 01398 01399 s += cgco_SHARED->fco[*mat + i__*MAX_MATERIALS] * temp; 01400 rn[i__] = temp; 01401 } 01402 } 01403 } while( (ranecu(seed)*s0) > (s*(1.0f+tau*(/*ek1=*/(ek3 - ek2 - 1.0f)+tau*(ek2+tau*ek3)))/(ek3*tau*(tau*tau+1.0f))) ); // **** Rejection function 01404 01405 *costh_Compton = 1.0 - cdt1; 01406 01407 /* **** Target electron shell. */ 01408 for (;;) 01409 { 01410 register float temp = s*ranecu(seed); 01411 float pac = 0.0f; 01412 01413 int ishell = my_noscco - 1; // First shell will have number 0 01414 for (i__ = 0; i__ < (my_noscco-1); i__++) // !!DeBuG!! Iterate to (my_noscco-1) only: the last oscillator is excited in case all other fail (no point in double checking) ?? 01415 { 01416 pac += cgco_SHARED->fco[*mat + i__*MAX_MATERIALS] * rn[i__]; // !!DeBuG!! pac[] is calculated on the fly to save registers! 01417 if (pac > temp) // pac[] is calculated on the fly to save registers! 01418 { 01419 ishell = i__; 01420 break; 01421 } 01422 } 01423 01424 /* **** Projected momentum of the target electron. */ 01425 temp = ranecu(seed) * rn[ishell]; 01426 01427 if (temp < 0.5f) 01428 { 01429 pzomc = (0.70710678118654502f - sqrtf(0.5f - logf(temp + temp))) / (cgco_SHARED->fj0[*mat + ishell * MAX_MATERIALS] * 1.4142135623731f); 01430 } 01431 else 01432 { 01433 pzomc = (sqrtf(0.5f - logf(2.0f - 2.0f*temp)) - 0.70710678118654502f) / (cgco_SHARED->fj0[*mat + ishell * MAX_MATERIALS] * 1.4142135623731f); 01434 } 01435 if (pzomc < -1.0f) 01436 { 01437 continue; // re-start the loop 01438 } 01439 01440 /* **** F(EP) rejection. */ 01441 temp = tau * (tau - (*costh_Compton) * 2.f) + 1.f; // this variable was originally called "xqc" 01442 01443 // af = sqrt( max_value(temp,1.0e-30f) ) * (tau * (tau - *costh_Compton) / max_value(temp,1.0e-30f) + 1.f); //!!DeBuG!! Make sure the SQRT argument is never <0, and that I don't divide by zero!! 01444 01445 if (temp>1.0e-20f) // !!DeBuG!! Make sure the SQRT argument is never <0, and that I don't divide by zero!! 01446 { 01447 af = sqrtf(temp) * (tau * (tau - ((float)(*costh_Compton))) / temp + 1.f); 01448 } 01449 else 01450 { 01451 // When using single precision, it is possible (but very uncommon) to get costh_Compton==1 and tau==1; then temp is 0 and 'af' can not be calculated (0/0 -> nan). Analysing the results obtained using double precision, we found that 'af' would be almost 0 in this situation, with an "average" about ~0.002 (this is just a rough estimation, but using af=0 the value would never be rejected below). 01452 01453 af = 0.00200f; // !!DeBuG!! 01454 01455 #ifndef USING_CUDA 01456 if (warning_flag_2<0) 01457 { 01458 warning_flag_2 = +1; // Disable warning, do not show again 01459 printf(" [... Small numerical precision error detected computing \"af\" in GCOa (this warning will not be repeated)].\n xqc=%.14f, af(forced)=%.14f, tau=%.14f, costh_Compton=%.14lf\n", temp, af, tau, *costh_Compton); // !!DeBuG!! 01460 } 01461 #endif 01462 } 01463 01464 if (af > 0.0f) 01465 { 01466 temp = af * 0.2f + 1.f; // this variable was originally called "fpzmax" 01467 } 01468 else 01469 { 01470 temp = 1.f - af * 0.2f; 01471 } 01472 01473 if ( ranecu(seed)*temp < /*fpz =*/(af * max_value( min_value(pzomc,0.2f) , -0.2f ) + 1.f) ) 01474 { 01475 break; 01476 } 01477 01478 } 01479 01480 /* **** Energy of the scattered photon. */ 01481 { 01482 register float t, b1, b2, temp; 01483 t = pzomc * pzomc; 01484 b1 = 1.f - t * tau * tau; 01485 b2 = 1.f - t * tau * ((float)(*costh_Compton)); 01486 01487 temp = sqrtf( fabsf(b2 * b2 - b1 * (1.0f - t)) ); 01488 01489 01490 if (pzomc < 0.0f) 01491 temp *= -1.0f; 01492 01493 // !Error! energy may increase (slightly) due to inacurate calculation! !!DeBuG!! 01494 t = (tau / b1) * (b2 + temp); 01495 if (t > 1.0f) 01496 { 01497 #ifndef USING_CUDA 01498 01499 #endif 01500 #ifndef USING_CUDA 01501 if (warning_flag_3<0) 01502 { 01503 warning_flag_3 = +1; // Disable warning, do not show again 01504 printf("\n [... a Compton event tried to increase the x ray energy due to precision error. Keeping initial energy. (This warning will not be repeated.)]\n scaling=%.14f, costh_Compton=%.14lf\n", t, *costh_Compton); // !!DeBuG!! 01505 } 01506 #endif 01507 01508 t = 1.0f; // !!DeBuG!! Avoid increasing energy by hand!!! not nice!! 01509 } 01510 01511 (*energy) *= t; 01512 // (*energy) *= (tau / b1) * (b2 + temp); // Original PENELOPE code 01513 } 01514 01515 } // [End subroutine GCOa] 01516 01517 01518 01519 //////////////////////////////////////////////////////////////////////////////// 01520 01521 01522 //////////////////////////////////////////////////////////////////////////////// 01523 //! Tally the depose deposited inside each material. 01524 //! This function is called whenever a particle suffers a Compton or photoelectric 01525 //! interaction. The energy released in each interaction is added and later in the 01526 //! report function the total deposited energy is divided by the total mass of the 01527 //! material in the voxelized object to get the dose. This naturally accounts for 01528 //! multiple densities for voxels with the same material (not all voxels have same mass). 01529 //! Electrons are not transported in MC-GPU and therefore we are approximating 01530 //! that the dose is equal to the KERMA (energy released by the photons alone). 01531 //! This approximation is acceptable when there is electronic equilibrium and 01532 //! when the range of the secondary electrons is shorter than the organ size. 01533 //! 01534 //! The function uses atomic functions for a thread-safe access to the GPU memory. 01535 //! We can check if this tally was disabled in the input file checking if the array 01536 //! materials_dose was allocated in the GPU (disabled if pointer = NULL). 01537 //! 01538 //! 01539 //! @param[in] Edep Energy deposited in the interaction 01540 //! @param[in] material Current material id number 01541 //! @param[out] materials_dose ulonglong2 array storing the mateials dose [in eV/g] and dose^2 (ie, uncertainty). 01542 //////////////////////////////////////////////////////////////////////////////// 01543 #ifdef USING_CUDA 01544 __device__ 01545 #endif 01546 inline 01547 void tally_materials_dose(float* Edep, int* material, ulonglong2* materials_dose) 01548 { 01549 01550 // !!DeBuG!! The energy can be tallied directly with atomicAdd in global memory or using shared memory first and then global for whole block if too slow. With the initial testing it looks like using global memory is already very fast! 01551 01552 // !!DeBuG!! WARNING: with many histories and few materials the materials_dose integer variables may overflow!! Using double precision floats would be better. Single precision is not good enough because adding small energies to a large counter would give problems. 01553 01554 #ifdef USING_CUDA 01555 atomicAdd(&materials_dose[*material].x, __float2ull_rn((*Edep)*SCALE_eV) ); // Energy deposited at the material, scaled by the factor SCALE_eV and rounded. 01556 atomicAdd(&materials_dose[*material].y, __float2ull_rn((*Edep)*(*Edep)) ); // Square of the dose to estimate standard deviation (not using SCALE_eV for std_dev to prevent overflow) 01557 #else 01558 materials_dose[*material].x += (unsigned long long int)((*Edep)*SCALE_eV + 0.5f); 01559 materials_dose[*material].y += (unsigned long long int)((*Edep)*(*Edep) + 0.5f); 01560 #endif 01561 01562 return; 01563 }