MC-GPU
MC-GPU_v1.3.cu
Go to the documentation of this file.
00001 
00002 //   ** CHANGE LIST **  See below \section sec_changes_13 List of changes in code version 1.3
00003 
00004 // -- Code upgraded from CUDA 4 to CUDA 5, after cutil_inline.h has been eliminated from the SDK:
00005 //           CUDA 5.0: Using "getLastCudaError" instead of "cutilCheckMsg"
00006 //           CUDA 5.0: Using "checkCudaErrors" instead of "cutilSafeCall" or "cutilCheckError"
00007 //           CUDA 5.0: Using "gpuGetMaxGflopsDeviceId instead of "cutGetMaxGflopsDeviceId"
00008 //           CUDA 5.0: Substitute all the CUDA timer functions (cutResetTimer, cutStartTimer, cutStopTimer, cutGetTimerValue) for standard C clock() calls.
00009 
00010 
00011 ////////////////////////////////////////////////////////////////////////////////////////
00012 //
00013 //               ****************************
00014 //               *** MC-GPU , version 1.3 ***
00015 //               ****************************
00016 //
00017 /**
00018  *      \mainpage MC-GPU v1.3
00019  * 
00020  * \code   
00021  * 
00022  *               Andreu Badal, PhD (Andreu.Badal-Soler{at}fda.hhs.gov)
00023  * 
00024  *                  Division of Imaging and Applied Mathematics
00025  *                  Office of Science and Engineering Laboratories
00026  *                  Center for Devices and Radiological Health
00027  *                  U.S. Food and Drug Administration 
00028  * 
00029  *               Code release date: 2012/12/12
00030  * 
00031  * 
00032  * 
00033  * \endcode
00034  * 
00035  *
00036  * 
00037  *   \b MC-GPU [1-4] is a Monte Carlo simulation code that can generate synthetic radiographic
00038  *   images and computed tomography (CT) scans of realistic models of the human anatomy using the
00039  *   computational power of commodity Graphics Processing Unit (GPU) cards.
00040  *   The code implements a massively multi-threaded Monte Carlo simulation algorithm
00041  *   for the transport of x rays in a voxelized geometry. The x ray interaction models and material
00042  *   properties have been adapted from \b PENELOPE \b 2006 [5].
00043  * 
00044  * 
00045  *   \b MC-GPU was developed using the \b CUDA programming model from \b NVIDIA [6] to achieve 
00046  *   maximum performance on NVIDIA GPUs. The code can also be compiled with a standard C compiler
00047  *   to be executed in a regular CPU. 
00048  *   In a typical medical imaging simulation, the use of GPU computing with MC-GPU has been shown 
00049  *   to provide a speed up of between 20 and 40 times, compared to the execution on a single CPU core.
00050  * 
00051  *   The MC-GPU code has been described in different scientific publications [1-4].
00052  *   The main reference of this work, which the users should cite, is the following [1]:
00053  * \code
00054  *  Andreu Badal and Aldo Badano, "Accelerating Monte Carlo simulations of 
00055  *  photon transport in a voxelized geometry using a massively parallel 
00056  *  Graphics Processing Unit", Medical Physics 36, pp. 4878–4880 (2009)
00057  * \endcode
00058  *   The main developer of MC-GPU is \b Andreu \b Badal, working at the U.S. \b Food \b and    
00059  *   \b Drug \b Administration (Center for Devices and Radiological Health, Office of Science
00060  *   and Engineering Laboratories, Division of Imaging and Applied Mathematics). 
00061  *   The source code of MC-GPU is free and open software in the public domain, as explained
00062  *   in the Disclaimer section below. 
00063  *   The source code of MC-GPU and its auxiliary files are distributed from the website: http://code.google.com/. 
00064  *
00065  *  
00066  *   This documentation has been automatically generated by \b Doxygen parsing the comments in
00067  *   the MC-GPU source code.
00068  *   This code is still in development, please report to the author any issue/bug
00069  *   that you may encounter. Feel free to suggest improvements to the code too!
00070  * 
00071  * 
00072  *  
00073  *    \section  sec_changes  List of modifications in different versions of the code 
00074  * 
00075  *      \subsection  sec_changes_v13  Version 1.3 (release date: 2012/12/12)
00076  * 
00077  *   - Code upgraded to CUDA 5.0 (not compatible with previous versions of CUDA!).
00078  *   - Removed limit on the amount of projection images that can be simulated per CT scan (source and 
00079  *     detector parameters now stored in global memory and transferring to shared memory at run time 
00080  *     to avoid using the limited constant memory).
00081  *   - New material dose tally implemented to estimate the dose deposited in each material independently
00082  *     of the voxel dose tally (the voxel dose tally measures the dose in each material adding the energy 
00083  *     deposited in each voxel of that material within the defined voxelized region-of-interest).
00084  *   - Interaction loop re-organized to maximize performance (virtual interactions simulated before real ones). 
00085  *   - Improvements and small corrections in the source sampling and tally routines.
00086  *   - Allow input of material and voxel geometry files compressed with gzip (zlib library now required for compilation).
00087  * 
00088  * 
00089  * 
00090  *      \subsection  sec_changes_v12  Version 1.2 (release date: 2011/10/25)
00091  * 
00092  *   - Implemented a voxel dose tally.
00093  *   - Polyenergetic source model.
00094  *   - MPI support for simulating individual projections.
00095  *   - Simulation by time limit.
00096  *   - Improved flexibility of the CT trajectories, helical scans.
00097  * 
00098  *  
00099  *
00100  *    \section  sec_disc  Disclaimer
00101  *
00102  *   This software and documentation (the "Software") were developed at the Food and
00103  *   Drug Administration (FDA) by employees of the Federal Government in the course
00104  *   of their official duties. Pursuant to Title 17, Section 105 of the United States
00105  *   Code, this work is not subject to copyright protection and is in the public
00106  *   domain. Permission is hereby granted, free of charge, to any person obtaining a
00107  *   copy of the Software, to deal in the Software without restriction, including
00108  *   without limitation the rights to use, copy, modify, merge, publish, distribute,
00109  *   sublicense, or sell copies of the Software or derivatives, and to permit persons
00110  *   to whom the Software is furnished to do so. FDA assumes no responsibility
00111  *   whatsoever for use by other parties of the Software, its source code,
00112  *   documentation or compiled executables, and makes no guarantees, expressed or
00113  *   implied, about its quality, reliability, or any other characteristic. Further,
00114  *   use of this code in no way implies endorsement by the FDA or confers any
00115  *   advantage in regulatory decisions.  Although this software can be redistributed
00116  *   and/or modified freely, we ask that any derivative works bear some notice that
00117  *   they are derived from it, and any modified versions bear some notice that they
00118  *   have been modified.
00119  * 
00120  * 
00121  * 
00122  *     \section sec_Intro Code features
00123  *  
00124  *   In this section we provide a brief description of the features of the MC-GPU code. A
00125  *   more complete description of the code can be found in our published articles.
00126  *   important information regarding the operation of the code is provided as comments in the  
00127  *   input files of the sample simulations provided with the MC-GPU package.
00128  *   Detailed information on each function of the code can be found in the complete Doxygen 
00129  *   documentation of the source code
00130  * 
00131  *   The basic operation of the code consists in adapting the simulation input file 
00132  *   to describe the location and characteristics of the x ray source, define the CT trajectory
00133  *   (if any), list the materials to be used in the simulation, define the geometry of
00134  *   the x ray detector and, finally, specify the voxelized object file to be  
00135  *   used as the simulation material universe. 
00136  *   In the first line of the input file, the user can fix the total number of x rays that have 
00137  *   to be simulated (> 1e5 histories) or the total simulation time (maximum 1e5 seconds). 
00138  * 
00139  * 
00140  *   The coordinate system of the simulated world is determined by the input voxelized geometry.
00141  *   The origin of coordinates is assumed to be located at the lower-back corner of the voxelized 
00142  *   volume, and the axis are located on the vertices of the voxelized volume. 
00143  *   This means that the lower-back corner of the first voxel is on the origin and the 
00144  *   following voxels are located along the positive X, Y and Z axis (first quadrant).
00145  * 
00146  * 
00147  *   To simulate the atomic interactions, MC-GPU uses a database of material properties based on the 
00148  *   database from PENELOPE. A PENELOPE 2006 material file can be converted into an MC-GPU material
00149  *   file using the auxiliary utility "MC-GPU_create_material_data.f" provided with the MC-GPU 
00150  *   package. Pre-defined material files for a set of materials typically used in medical imaging  
00151  *   simulations are already provided in the folder "MC-GPU_material_files".
00152  * 
00153  * 
00154  *   The code includes two tally options: an \b image \b tally that creates projection x-ray images,
00155  *   and a radiation \b dose \b tally that estimates the dose deposited inside the patient model.
00156  *   MC-GPU does not currently simulate the transport of electrons and therefore the dose  
00157  *   deposition tally (KERMA tally rigorously) will not be accurate for high energies or near 
00158  *   material interfaces and small voxels.
00159  *   In the image tally the images are formed by counting the energy that enters a user-defined 2D 
00160  *   grid of pixels, which is a simple approximation to a noise-free flat-panel detector with
00161  *   100% detection efficiency. The pixel values have units of eV/cm^2.
00162  *   Four different images are reported at the end of the simulation, corresponding
00163  *   to the signal produced by x rays that did not interact between the source and the detector 
00164  *   (non-scattered), x rays that suffered a single Compton (inelastic) interaction, a single 
00165  *   Rayleigh (elastic) interaction, and multi-scattered x rays.
00166  *   The dose tally counts the energy deposited by each x ray track inside each voxel of the 
00167  *   geometry, within a user-defined volumetric region-of-interest (ROI). The average dose deposited 
00168  *   inside each voxel and in each material (and the associated statistical uncertainties) are reported 
00169  *   at the end of the simulation.
00170  * 
00171  * 
00172  *   MC-GPU can simulate a single projection image or a full CT scan.
00173  *   The CT is simulated generating many projection images around the static 
00174  *   voxelized geometry. Currently, the code is limited to perform a simple
00175  *   CT trajectory rotating around the Z axis. The user can specify the angular shift and longitudinal
00176  *   translation (pitch) of the source between each projection and also the distance between the 
00177  *   source and the axis of rotation (the axis is assumed to be parallel to the Z axis). 
00178  *   By now, the code does not simulate some relevant components of a CT scanner such as the 
00179  *   anti-scatter grid, a bow-tie filter or a curved detector (flat-panel detector only).
00180  * 
00181  * 
00182  *   The x ray source is defined as a point source emitting x rays with an energy randomly sampled 
00183  *   from the user-provided energy spectrum. The polyenergetic spectrum is efficiently sampled 
00184  *   using the Walker aliasing algorithm. The emitted cone beam is computationally
00185  *   collimated to produce a rectangular field on the detector plane, within the azimuthal and 
00186  *   polar angles specified by the user.
00187  *   The detector plane is automatically located at the specified distance right in front of the
00188  *   source focal spot, with the collimated cone beam pointing towards the geometric center of the detector.
00189  * 
00190  * 
00191  *   In order to optimize the particle tracking algorithm (ray-tracing) and minimize 
00192  *   the accesses to the slow GPU main memory, the photon trajectories across the voxels
00193  *   are computed using the Woodcock tracking algorithm.
00194  *   With this technique the photons perceive the geometry as a uniform medium
00195  *   composed of the material of the most attenuating voxel.
00196  *   In this way, the voxel boundaries do not have to be explicitly calculated and
00197  *   multiple voxels can be crossed in a single step.
00198  *   To keep the simulation unbiased, some of the interactions are considered
00199  *   "virtual" (i.e., do not change the photon energy or direction of movement),
00200  *   depending on the x ray energy and the actual material at the interaction site.
00201  *   In typical medical imaging simulations where the most attenuating material is cortical bone,
00202  *   the Woodcock tracking algorithm gives an speed up of almost one order of magnitude compared
00203  *   to computing voxel boundaries all the time. However, if the geometry includes a high
00204  *   density voxel, such as a metallic implant, the performance of the code can be severely 
00205  *   reduced because a large fraction of the sampled interactions will be virtual.
00206  *  
00207  * 
00208  *   The random number generator used in PENELOPE [5], RANECU, is also used in the GPU
00209  *   program. To ensure that the simulated tracks are not correlated, each thread initializes
00210  *   the generator to a unique position in the random sequence, far enough from the
00211  *   other threads, using the algorithm implemented in the seedsMLCG code [7].
00212  * 
00213  * 
00214  *   In a typical simulation, several thousand threads are launched simultaneously in
00215  *   the GPU, each one of them simulating a batch of several x ray tracks.
00216  *   If the code is compiled with MPI support (see below), multiple GPUs can be used in parallel. 
00217  *   The code will perform a short speed test to estimate the relative speed of each GPU used 
00218  *   in the simulation and then distribute the number of particles among the available GPUs correspondingly.
00219  *   If the user specified a time limit in the simulation, all the GPUs will simulate in parallel 
00220  *   for the allowed time. Since the code is already optimized to scale well in  
00221  *   thousands of GPU threads, it scales almost linearly with the number of GPUs in most 
00222  *   situations, with only a few seconds of overhead in the initialization of the multiple GPUs 
00223  *   and in the reduction of the final results.
00224  * 
00225  *  
00226  *  
00227  * 
00228  *    \section sec_output Code output
00229  *
00230  *   At the end of the simulation the code reports the tallied 3D dose distribution and the 
00231  *   final simulated images in RAW binary form, as 32-bits float values. The image data is provided 
00232  *   as a collection of five consecutive images corresponding to: total image (scatter+primaries), 
00233  *   primary particles, Compton, Rayleigh and multi-scatter. 
00234  *   The dose data is reported as two RAW files with the mean dose and twice the standard deviation
00235  *   of the dose in each voxel of the geometry respectively, within the input ROI.
00236  *   The average dose deposited in each material of the geometry is also reported to the standard output. 
00237  *   Organ doses can be obtained by post-processing the output dose file, knowing which voxel 
00238  *   corresponds to each organ.
00239  *   The pixel and voxel dose data values are stored with the X coordinate incrementing first, the Y 
00240  *   coordinate incrementing second, and the Z coordinate incrementing last.
00241  *   
00242  *   The program also reports the simulated images and the dose at the Z plane at the level of the x ray 
00243  *   source as ASCII text files. The ASCII output can be readily visualized with the GNUPLOT scripts 
00244  *   distributed with MC-GPU. The header section at the beginning of these text files provides the 
00245  *   information required to easily read the RAW binary files with IMAGEJ, OCTAVE or other programs. 
00246  *  
00247  * 
00248  * 
00249  *    \section sec_compilation Code compilation and execution
00250  *
00251  *   MC-GPU has been developed and tested only in the Linux operating system.
00252  *   A Makefile script is provided to compile the MC-GPU code in Linux.
00253  *   The CUDA libraries and the GNU GCC compiler must be previously installed.
00254  *   The Makefile may have to be edited to modify the library path.
00255  *   The code requires the "zlib.h" library to be able to open gzipped input files.
00256  *
00257  *
00258  *   MC-GPU uses CUDA to access NVIDIA GPUs but all the actual computations are coded
00259  *   in standard C and the CUDA-specific commands are enclosed within preprocessor
00260  *   "if" statements. Defining the pre-processor variable "USING_CUDA" (i.e., 
00261  *   compiling with "-DUSING_CUDA") the particle transport routines are compiled to simulate
00262  *   many x ray histories in parallel in an NVIDIA GPU using CUDA. Otherwise, the code is 
00263  *   sequentially executed in the CPU.
00264  *   The same coding approach has been used to allow the use of multiple GPUs.
00265  *   Defining the pre-processor variable "USING_MPI" (i.e., compiling with
00266  *   "-DUSING_MPI"), Message Passing Interface (MPI) library calls are used to share information 
00267  *   between multiple CPU threads in different computers. 
00268  *   Each MPI thread gets a unique id in the CPU and addresses a unique GPU.
00269  *   At the end of the simulation the images and doses tallied by the different GPUs are 
00270  *   reduced to form single output file equivalent to a sequential simulation of the same 
00271  *   number of particles.
00272  * 
00273  *   The code can be easily compiled executing the command "make" or running the provided 
00274  *   "./make.sh" script. 
00275  *   Optionally, the code can be executed from the command line with a command like this 
00276  *   (example using CUDA and MPI, openMPI library in this case):
00277  * \code 
00278  * nvcc -DUSING_CUDA -DUSING_MPI MC-GPU_v1.3.cu -o MC-GPU_v1.3.x -O3
00279  *  -use_fast_math -L/usr/lib/ -I. -I/usr/local/cuda/include 
00280  *  -I/usr/local/cuda/samples/common/inc -I/usr/local/cuda/samples/shared/inc/ 
00281  *  -I/usr/include/openmpi  -lmpi -lz --ptxas-options=-v 
00282  *  -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_30,code=sm_30
00283  * \endcode
00284  * 
00285  *   The same source code can also be compiled for a regular CPU using:
00286  * \code  
00287  * gcc -x c -O3 MC-GPU_v1.3.cu -o MC-GPU_v1.3_CPU.x -I./ -lm -lz
00288  * \endcode 
00289  *   
00290  *   To run a simulation (and keep the information reported to the standard
00291  *   output in an external file) the compiled code can be executed as:
00292  * \code 
00293  * ./MC-GPU_v1.3.x MC-GPU_v1.3.in | tee MC-GPU_v1.3.out 
00294  * \endcode 
00295  * 
00296  *   All simulation can be executed in the same way using the code compiled for the CPU 
00297  *   or the GPU (however, the number of histories should be reduced for the CPU to finish 
00298  *   the simulation in a reasonable time).
00299  *   To run the simulation in parallel with MPI in multiple GPUs (or CPU cores) in the
00300  *   current computer the user can execute:
00301  * \code 
00302  * mpirun -n 4 ./MC-GPU_v1.3.x MC-GPU_v1.3.in
00303  * \endcode  
00304  * 
00305  *   To use GPUs in different computers, the user must make sure all computers can access the simulation
00306  *   files and that the libraries are correctly set up in all nodes. 
00307  *   To execute a simulation (with verbose MPI information being reported):
00308  * \code  
00309  * mpirun --tag-output -v -x LD_LIBRARY_PATH -hostfile myhostfile.txt -n 8 
00310  * /fullPath/MC-GPU_v1.3.x /fullPath/MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
00311  * \endcode  
00312 
00313  * 
00314  *   The text file 'hostfile' lists the IP addresses and number of computing slots (GPUs) of the
00315  *   computers collaborating in the simulation. This file is not necessary when using multiple
00316  *   GPUs in a single workstation. When using multiple computers, the simulation files should 
00317  *   be located in a shared drive to make sure every node can access the input data.  
00318  *   The different workstations must have different host names in order to be differentiated by
00319  *   the MPI threads. The multiple threads communicate to each other to make sure they don't  
00320  *   use the same GPU in the same workstation. 
00321  * 
00322  *  
00323  *
00324  *    \section sec_issues Known issues
00325  * 
00326  *   In extremely long simulations, it is theoretically possible to cause an overflow of the counters 
00327  *   estimating the mean and standard deviation of the material or voxel doses. If this happen, the 
00328  *   results will be incorrect and even negative or nan values can be reported.
00329  * 
00330  *
00331  * 
00332  * 
00333  *    \section sec_ref References
00334  * 
00335  * -# A. Badal and A. Badano, Accelerating Monte Carlo simulations of photon transport in a voxelized geometry using a massively parallel Graphics Processing Unit, Med. Phys. 36, p. 4878-4880 (2009)
00336  * -# A. Badal and A. Badano, Monte Carlo Simulation of X-Ray Imaging Using a Graphics Processing Unit, IEEE NSC-MIC, Conference Record , HP3–1, p. 4081-4084 (2009)
00337  * -# A. Badal, I. Kyprianou, D. Sharma and A. Badano, Fast cardiac CT simulation using a Graphics Processing Unit-accelerated Monte Carlo code, Proc. SPIE Medical Imaging Conference 7622, p. 762231 (2010)
00338  * -# A. Badal and A. Badano, Fast Simulation of Radiographic Images Using a Monte Carlo X-Ray Transport Algorithm Implemented in CUDA, Chapter 50 of GPU Computing Gems (Emerald Edition), p. 813-830, editor Wen-mei W. Hwu, publisher Morgan Kaufmann (Elsevier), Burlington MA, 2010
00339  * -# F. Salvat, J. M. Fernandez-Varea and J. Sempau, PENELOPE – A code system for Monte Carlo simulation of electron and photon transport, NEA-OECD, Issy-les-Moulineaux, available at www.nea.fr/html/dbprog/peneloperef.html (2006)
00340  * -# NVIDIA Corporation, NVIDIA CUDA(TM) Programming Guide, Technical Report available at www.nvidia.com/cuda (2011) 
00341  * -# A. Badal and J. Sempau, A package of Linux scripts for the parallelization of Monte Carlo simulations, Comput. Phys. Commun. 175 (6), p. 440-450 (2006) 
00342  * 
00343  * 
00344  * 
00345  *                      @file    MC-GPU_v1.3.cu
00346  *                      @author  Andreu Badal (Andreu.Badal-Soler@fda.hhs.gov)
00347  *                      @date    2012/12/12
00348  *                        -- MC-GPU v.1.3:  2012/12/12
00349  *                        -- MC-GPU v.1.2:  2011/10/25
00350  *                        -- MC-GPU v.1.1:  2010/06/25
00351  *                        -- MC-GPU v.1.0:  2009/03/17
00352  */ 
00353 ////////////////////////////////////////////////////////////////////////////////////////
00354 
00355 
00356 
00357 // *** Include header file with the structures and functions declarations
00358 #include <MC-GPU_v1.3.h>
00359 
00360 // *** Include the computing kernel:
00361 #include <MC-GPU_kernel_v1.3.cu>
00362 
00363 
00364 ////////////////////////////////////////////////////////////////////////////////
00365 //!  Main program of MC-GPU: initialize the simulation enviroment, launch the GPU 
00366 //!  kernels that perform the x ray transport and report the final results.
00367 //!  This function reads the description of the simulation from an external file
00368 //!  given in the command line. This input file defines the number of particles to
00369 //!  simulate, the characteristics of the x-ray source and the detector, the number
00370 //!  and spacing of the projections (if simulating a CT), the location of the
00371 //!  material files containing the interaction mean free paths, and the location
00372 //!  of the voxelized geometry file.
00373 //!
00374 //!                            @author  Andreu Badal
00375 //!
00376 ////////////////////////////////////////////////////////////////////////////////
00377 int main(int argc, char **argv)
00378 {
00379 
00380   // -- Start time counter:
00381   time_t current_time = time(NULL);             // Get current time (in seconds)  
00382   clock_t clock_start, clock_end, clock_start_beginning;  // (requires standard header <time.h>)
00383   clock_start = clock();                        // Get current clock counter
00384   clock_start_beginning = clock_start;
00385   
00386 #ifdef USING_MPI
00387 // -- Using MPI to access multiple GPUs to simulate the x-ray projection image:
00388   int myID = -88, numprocs = -99, return_reduce = -1;
00389   MPI_Init(&argc, &argv);                       // Init MPI and get the current thread ID 
00390   MPI_Comm_rank(MPI_COMM_WORLD, &myID);
00391   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
00392   
00393   char MPI_processor_name[81];             
00394   int resultlen = -1;
00395   MPI_Get_processor_name(MPI_processor_name, &resultlen);
00396     
00397   char* char_time = ctime(&current_time); char_time[19] = '\0';   // The time is located betwen the characters 11 and 19.
00398   printf("          >> MPI run (myId=%d, numprocs=%d) on processor \"%s\" (time: %s) <<\n", myID, numprocs, MPI_processor_name, &char_time[11]);
00399   fflush(stdout);   // Clear the screen output buffer
00400   MPI_Barrier(MPI_COMM_WORLD);   // Synchronize MPI threads  
00401   
00402   MASTER_THREAD printf("              -- Time spent initializing the MPI world (MPI_Barrier): %.3f s\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
00403   
00404   
00405 #else  
00406   int myID = 0, numprocs = 1;   // Only one CPU thread used when MPI is not activated (multiple projections will be simulated sequentially).
00407 #endif
00408 
00409   MASTER_THREAD 
00410   { 
00411       printf("\n\n     *****************************************************************************\n");
00412       printf(    "     ***         MC-GPU, version 1.3 (http://code.google.com/p/mcgpu/)         ***\n");
00413       printf(    "     ***                                                                       ***\n");
00414       printf(    "     ***  A. Badal and A. Badano, \"Accelerating Monte Carlo simulations of     *** \n");
00415       printf(    "     ***  photon transport in a voxelized geometry using a massively parallel  *** \n");
00416       printf(    "     ***  Graphics Processing Unit\", Medical Physics 36, pp. 4878–4880 (2009)  ***\n");
00417       printf(    "     ***                                                                       ***\n");
00418       printf(    "     ***                     Andreu Badal (Andreu.Badal-Soler@fda.hhs.gov)     ***\n");
00419       printf(    "     *****************************************************************************\n\n");
00420 
00421       printf("****** Code execution started on: %s\n\n", ctime(&current_time));  
00422       fflush(stdout);
00423   }
00424     
00425   
00426 #ifdef USING_CUDA
00427   // The "MASTER_THREAD" macro prints the messages just once when using MPI threads (it has no effect if MPI is not used):  MASTER_THREAD == "if(0==myID)"
00428   MASTER_THREAD printf  ("\n             *** CUDA SIMULATION IN THE GPU ***\n");
00429 #else
00430   MASTER_THREAD printf  ("\n             *** SIMULATION IN THE CPU ***\n");
00431 #endif
00432 
00433   MASTER_THREAD printf("\n    -- INITIALIZATION phase:\n");
00434   MASTER_THREAD fflush(stdout);   // Clear the screen output buffer for the master thread
00435   
00436   
00437 ///////////////////////////////////////////////////////////////////////////////////////////////////
00438   
00439   
00440   // *** Declare the arrays and structures that will contain the simulation data:
00441 
00442   struct voxel_struct voxel_data;          // Define the geometric constants of the voxel file
00443   struct detector_struct detector_data[MAX_NUM_PROJECTIONS];  // Define an x ray detector (for each projection)
00444   struct source_struct source_data[MAX_NUM_PROJECTIONS];      // Define the particles source (for each projection)
00445   struct source_energy_struct source_energy_data;    // Define the source energy spectrum
00446   struct linear_interp mfp_table_data;     // Constant data for the linear interpolation
00447   struct compton_struct  compton_table;    // Structure containing Compton sampling data (to be copied to CONSTANT memory)
00448   struct rayleigh_struct rayleigh_table;   // Structure containing Rayleigh sampling data (to be copied to CONSTANT memory)
00449   
00450   float2 *voxel_mat_dens = NULL;           // Poiter where voxels array will be allocated
00451   unsigned int voxel_mat_dens_bytes = 0;   // Size (in bytes) of the voxels array (using unsigned int to allocate up to 4.2GBytes)
00452   float density_max[MAX_MATERIALS];
00453   float density_nominal[MAX_MATERIALS];
00454   unsigned long long int *image = NULL;    // Poiter where image array will be allocated
00455   int image_bytes = -1;                    // Size of the image array
00456   int mfp_table_bytes = -1, mfp_Woodcock_table_bytes = -1;   // Size of the table arrays
00457   float2 *mfp_Woodcock_table = NULL;                // Linear interpolation data for the Woodcock mean free path [cm]
00458   float3 *mfp_table_a = NULL, *mfp_table_b = NULL;  // Linear interpolation data for 3 different interactions:
00459                                               //  (1) inverse total mean free path (divided by density, cm^2/g)
00460                                               //  (2) inverse Compton mean free path (divided by density, cm^2/g)
00461                                               //  (3) inverse Rayleigh mean free path (divided by density, cm^2/g)
00462   short int dose_ROI_x_min, dose_ROI_x_max, dose_ROI_y_min, dose_ROI_y_max, dose_ROI_z_min, dose_ROI_z_max;  // Coordinates of the dose region of interest (ROI)
00463   ulonglong2 *voxels_Edep = NULL;           // Poiter where the voxel energy deposition array will be allocated
00464   int voxels_Edep_bytes = 0;                      // Size of the voxel Edep array
00465   
00466   ulonglong2 materials_dose[MAX_MATERIALS];    // Array for tally_materials_dose.     !!tally_materials_dose!!
00467   int kk;
00468   for(kk=0;kk<MAX_MATERIALS;kk++) 
00469   {  
00470     materials_dose[kk].x = 0;       // Initializing data                  !!tally_materials_dose!!
00471     materials_dose[kk].y = 0;
00472     density_nominal[kk]  =-1.0f;
00473   }
00474 
00475   clock_t clock_kernel;     // Using only cpu timers after CUDA 5.0
00476 
00477   double time_elapsed_MC_loop = 0.0, time_total_MC_simulation = 0.0, time_total_MC_init_report = 0.0;
00478   
00479 
00480   unsigned long long int total_histories;
00481   int histories_per_thread, seed_input, num_threads_per_block, gpu_id, num_projections;
00482   int flag_material_dose = -2;
00483   double D_angle=-1.0, angularROI_0=0.0, angularROI_1=360.0, initial_angle=0.0, SRotAxisD=-1.0, vertical_translation_per_projection=0.0;
00484   char file_name_voxels[250], file_name_materials[MAX_MATERIALS][250], file_name_output[250], file_dose_output[250], file_name_espc[250];
00485 
00486   // *** Read the input file given in the command line and return the significant data:
00487   read_input(argc, argv, myID, &total_histories, &seed_input, &gpu_id, &num_threads_per_block, &histories_per_thread, detector_data, &image, &image_bytes, source_data, &source_energy_data, file_name_voxels, file_name_materials, file_name_output, file_name_espc, &num_projections, &D_angle, &angularROI_0, &angularROI_1, &initial_angle, &voxels_Edep, &voxels_Edep_bytes, file_dose_output, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max, &SRotAxisD, &vertical_translation_per_projection, &flag_material_dose);
00488 
00489 
00490 
00491   // *** Read the energy spectrum and initialize its sampling with the Walker aliasing method:
00492   MASTER_THREAD printf("    -- Reading the energy spectrum and initializing the Walker aliasing sampling algorithm.\n");
00493   float mean_energy_spectrum = 0.0f;  
00494   init_energy_spectrum(file_name_espc, &source_energy_data, &mean_energy_spectrum);
00495   
00496   
00497   // *** Output some of the data read to make sure everything was correctly read:
00498   MASTER_THREAD
00499   {
00500         if (total_histories < (unsigned long long int)(100000))
00501           printf("                       simulation time = %lld s\n", total_histories);
00502         else            
00503           printf("              x-ray tracks to simulate = %lld\n", total_histories);
00504         printf("                   initial random seed = %d\n", seed_input);        
00505         printf("      azimuthal (phi), polar apertures = %.6f , %.6f degrees\n", ((double)source_data[0].D_phi)*RAD2DEG, 2.0*(90.0 - acos(((double)source_data[0].cos_theta_low))*RAD2DEG) );
00506         printf("                   focal spot position = (%f, %f, %f)\n", source_data[0].position.x, source_data[0].position.y, source_data[0].position.z);
00507         printf("                      source direction = (%f, %f, %f)\n", source_data[0].direction.x, source_data[0].direction.y, source_data[0].direction.z);
00508         printf("                  initial angle from X = %lf\n", initial_angle*RAD2DEG);
00509         printf("              source-detector distance = %f cm\n", detector_data[0].sdd);        
00510         printf("                       detector center = (%f, %f, %f)\n", (source_data[0].position.x + source_data[0].direction.x * detector_data[0].sdd),  // Center of the detector straight ahead of the focal spot.
00511                                                                           (source_data[0].position.y + source_data[0].direction.y * detector_data[0].sdd),
00512                                                                           (source_data[0].position.z + source_data[0].direction.z * detector_data[0].sdd));
00513         printf("           detector low corner (at +Y) = (%f, %f, %f)\n", detector_data[0].corner_min_rotated_to_Y.x, detector_data[0].corner_min_rotated_to_Y.y, detector_data[0].corner_min_rotated_to_Y.z);        
00514         printf("                number of pixels image = %dx%d = %d\n", detector_data[0].num_pixels.x, detector_data[0].num_pixels.y, detector_data[0].total_num_pixels);
00515         printf("                            pixel size = %.3fx%.3f cm\n", 1.0f/detector_data[0].inv_pixel_size_X, 1.0f/detector_data[0].inv_pixel_size_Z);
00516         printf("                 number of projections = %d\n", num_projections);
00517         if (num_projections!=1)
00518         {
00519           printf("         source-rotation axis-distance = %lf cm\n", SRotAxisD);          
00520           printf("             angle between projections = %lf\n", D_angle*RAD2DEG);
00521           printf("            angular region of interest = [%lf,%lf] degrees\n", angularROI_0*RAD2DEG, angularROI_1*RAD2DEG);
00522           printf("   vertical translation per projection = %lf cm\n", vertical_translation_per_projection);
00523         }
00524         printf("                      Input voxel file = %s\n", file_name_voxels);
00525         printf("                     Output image file = %s\n", file_name_output);
00526 
00527         if (dose_ROI_x_max>-1)
00528         {
00529           printf("                      Output dose file = %s\n", file_dose_output);
00530           printf("         Input region of interest dose = X[%d,%d], Y[%d,%d], Z[%d,%d]\n", dose_ROI_x_min+1, dose_ROI_x_max+1, dose_ROI_y_min+1, dose_ROI_y_max+1, dose_ROI_z_min+1, dose_ROI_z_max+1);   // Show ROI with index=1 for the first voxel instead of 0.
00531         }
00532         
00533         printf("\n                  Energy spectrum file = %s\n", file_name_espc);      
00534         printf(  "            number of energy bins read = %d\n", source_energy_data.num_bins_espc);
00535         printf(  "             minimum, maximum energies = %.3f, %.3f keV\n", 0.001f*source_energy_data.espc[0], 0.001f*source_energy_data.espc[source_energy_data.num_bins_espc]);
00536         printf(  "                  mean energy spectrum = %.3f keV\n\n", 0.001f*mean_energy_spectrum);
00537         
00538         fflush(stdout);       
00539   }
00540   
00541  
00542 
00543   // *** Set the detectors and sources for the CT trajectory (if needed, ie, for more than one projection):
00544   if (num_projections != 1)
00545   {
00546     set_CT_trajectory(myID, num_projections, D_angle, angularROI_0, angularROI_1, SRotAxisD, source_data, detector_data, vertical_translation_per_projection);
00547   }
00548   
00549   fflush(stdout);
00550         
00551 
00552   // *** Read the voxel data and allocate the density map matrix. Return the maximum density:
00553   load_voxels(myID, file_name_voxels, density_max, &voxel_data, &voxel_mat_dens, &voxel_mat_dens_bytes, &dose_ROI_x_max, &dose_ROI_y_max, &dose_ROI_z_max);
00554   MASTER_THREAD printf("       Total CPU memory allocated for voxels vector and data structures = %f Mbytes\n", (voxel_mat_dens_bytes+image_bytes+sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp)+2*mfp_table_bytes+sizeof(struct rayleigh_struct)+sizeof(struct compton_struct))/(1024.f*1024.f));
00555   MASTER_THREAD fflush(stdout);
00556 
00557   // *** Read the material mean free paths and set the interaction table in a "linear_interp" structure:
00558   load_material(myID, file_name_materials, density_max, density_nominal, &mfp_table_data, &mfp_Woodcock_table, &mfp_Woodcock_table_bytes, &mfp_table_a, &mfp_table_b, &mfp_table_bytes, &rayleigh_table, &compton_table);
00559 
00560   // -- Check that the input material tables and the x-ray source are consistent:
00561   if ( (source_energy_data.espc[0] < mfp_table_data.e0) || (source_energy_data.espc[source_energy_data.num_bins_espc] > (mfp_table_data.e0 + (mfp_table_data.num_values-1)/mfp_table_data.ide)) )
00562   {
00563     MASTER_THREAD 
00564     {
00565       printf("\n\n\n !!ERROR!! The input x-ray source energy spectrum minimum (%.3f eV) and maximum (%.3f eV) energy values\n", source_energy_data.espc[0], source_energy_data.espc[source_energy_data.num_bins_espc]);
00566       printf(  "           are outside the tabulated energy interval for the material properties tables (from %.3f to %.3f eV)!!\n", mfp_table_data.e0, (mfp_table_data.e0+(mfp_table_data.num_values-1)/mfp_table_data.ide));
00567       printf(  "           Please, modify the input energy spectra to fit the tabulated limits or create new tables.\n\n");
00568     }
00569     #ifdef USING_MPI
00570       MPI_Finalize();
00571     #endif
00572     exit(-1);
00573   }
00574 
00575   // -- Pre-compute the total mass of each material present in the voxel phantom (to be used in "report_materials_dose"):
00576   double voxel_volume = 1.0 / ( ((double)voxel_data.inv_voxel_size.x) * ((double)voxel_data.inv_voxel_size.y) * ((double)voxel_data.inv_voxel_size.z) );
00577   double mass_materials[MAX_MATERIALS];
00578   for(kk=0; kk<MAX_MATERIALS; kk++)
00579     mass_materials[kk] = 0.0;
00580   for(kk=0; kk<(voxel_data.num_voxels.x*voxel_data.num_voxels.y*voxel_data.num_voxels.z); kk++)  // For each voxel in the geometry
00581     mass_materials[((int)voxel_mat_dens[kk].x)-1] += ((double)voxel_mat_dens[kk].y)*voxel_volume;        // Add material mass = density*volume
00582 
00583 
00584 
00585   // *** Initialize the GPU using the NVIDIA CUDA libraries, if USING_CUDA parameter defined at compile time:
00586 #ifdef USING_CUDA    
00587   // -- Declare the pointers to the device global memory, when using the GPU:
00588   float2 *voxel_mat_dens_device     = NULL,
00589          *mfp_Woodcock_table_device = NULL;
00590   float3 *mfp_table_a_device        = NULL,
00591          *mfp_table_b_device        = NULL;
00592   unsigned long long int *image_device          = NULL;
00593   struct rayleigh_struct *rayleigh_table_device = NULL;
00594   struct compton_struct  *compton_table_device  = NULL;
00595   ulonglong2 *voxels_Edep_device                = NULL;
00596   struct detector_struct *detector_data_device  = NULL;
00597   struct source_struct   *source_data_device    = NULL;  
00598   ulonglong2 *materials_dose_device = NULL;     // !!tally_materials_dose!!
00599 
00600   // -- Sets the CUDA enabled GPU that will be used in the simulation, and allocate and copies the simulation data in the GPU global and constant memories.
00601   init_CUDA_device(&gpu_id, myID, numprocs, &voxel_data, source_data, &source_energy_data, detector_data, &mfp_table_data,  /*Variables GPU constant memory*/
00602         voxel_mat_dens, &voxel_mat_dens_device, voxel_mat_dens_bytes,                          /*Variables GPU global memory*/
00603         image, &image_device, image_bytes,
00604         mfp_Woodcock_table, &mfp_Woodcock_table_device, mfp_Woodcock_table_bytes,
00605         mfp_table_a, mfp_table_b, &mfp_table_a_device, &mfp_table_b_device, mfp_table_bytes,
00606         &rayleigh_table, &rayleigh_table_device,
00607         &compton_table, &compton_table_device, &detector_data_device, &source_data_device,
00608         voxels_Edep, &voxels_Edep_device, voxels_Edep_bytes, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max,
00609         materials_dose, &materials_dose_device, flag_material_dose, num_projections);
00610 
00611   // -- Constant data already moved to the GPU: clean up unnecessary RAM memory
00612   free(mfp_Woodcock_table);
00613   free(mfp_table_a);
00614   free(mfp_table_b);
00615   if (0!=myID)    // Keep the geometry data for the MPI root because the voxel densities are still needed to compute the final doses
00616     free(voxel_mat_dens);
00617     
00618 
00619 #endif
00620   
00621   MASTER_THREAD
00622   {
00623     current_time=time(NULL);
00624     printf("\n    -- INITIALIZATION finished: elapsed time = %.3f s. \n\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
00625   }
00626   
00627 
00628 #ifdef USING_MPI
00629   fflush(stdout);
00630   MPI_Barrier(MPI_COMM_WORLD);   // Synchronize MPI threads before starting the MC phase.
00631 #endif
00632 
00633   
00634 ///////////////////////////////////////////////////////////////////////////////////////////////////
00635   
00636   
00637   
00638   MASTER_THREAD
00639   {
00640     current_time=time(NULL);
00641     printf("\n\n    -- MONTE CARLO LOOP phase. Time: %s\n\n", ctime(&current_time)); 
00642     fflush(stdout);    
00643   }
00644 
00645   
00646   // -- A number of histories smaller than 24 hours in sec (3600*24=86400) means that the user wants to simulate for the input number of seconds in each GPU, not a fix number of histories:
00647   unsigned long long int total_histories_INPUT = total_histories;    // Save the original input values to be re-used for multiple projections
00648   int seed_input_INPUT = seed_input, doing_speed_test = -1;  
00649   int simulating_by_time = 0;  // 0==false
00650   if (total_histories < (unsigned long long int)(95000))
00651     simulating_by_time = 1;    // 1=true
00652       
00653 
00654 
00655   int num_blocks_speed_test = 0;
00656   unsigned long long int histories_speed_test = (unsigned long long int)0, total_histories_speed_test = (unsigned long long int)0;
00657   float node_speed = -1.0f, total_speed = 1.0f;
00658   double current_angle;
00659   int num_p;  // == current projection number
00660   
00661    // *** CT simulation: find the current projection angle and start Monte Carlo simulation: 
00662    
00663   for (num_p=0; num_p<num_projections; num_p++)
00664   {
00665     
00666     // -- Check if this projection is inside the input angular region of interest (the angle can be negative, or larger than 360 in helical scans):
00667     current_angle = initial_angle + num_p * D_angle;       
00668     
00669     if ((current_angle < angularROI_0) || (current_angle > angularROI_1))
00670     {
00671       MASTER_THREAD printf("         << Skipping projection #%d of %d >> Angle %f degrees: outside angular region of interest.\n", num_p+1, num_projections, current_angle*RAD2DEG);
00672       continue;   // Cycle loop: do not simulate this projection!
00673     }
00674       
00675     if (num_projections!=1)
00676       MASTER_THREAD printf("\n\n\n   << Simulating Projection %d of %d >> Angle: %lf degrees.\n\n\n", num_p+1, num_projections, current_angle*RAD2DEG);          
00677 
00678     
00679     clock_start = clock();   // Start the CPU clock
00680     
00681 #ifdef USING_CUDA
00682       
00683     // *** Simulate in the GPUs the input amount of time or amount of particles:
00684     
00685     // -- Estimate GPU speed to use a total simulation time or multiple GPUs:    
00686     
00687     if ( simulating_by_time==0 &&   // Simulating a fixed number of particles, not a fixed time (so performing the speed test only once)
00688          node_speed>0.0f &&         // Speed test already performed for a previous projection in this simulation (node_speed and total_speed variables set)
00689          numprocs>1)                // Using multiple GPUs (ie, multiple MPI threads)
00690     { 
00691       // -- Simulating successive projections after the first one with a fix number of particles, with multiple MPI threads: re-use the speed test results from the first projection image:
00692       total_histories = (unsigned long long int)(0.5 + ((double)total_histories_INPUT) * (((double)node_speed)/total_speed));  
00693       doing_speed_test = 0;   // No speed test for this projection.
00694     }
00695     else if ( simulating_by_time==1 || numprocs>1)
00696     {
00697       // -- Simulating with a time limit OR multiple MPI threads for the first time (num_p==0): run a speed test to calculate the speed of the current GPU and distribute the number of particles to the multiple GPUs or estimate the total number of particles required to run the input amount of time:      
00698       //    Note that this ELSE IF block will be skipped if we are using a single MPI thread and a fix number of particles.
00699 
00700       doing_speed_test = 1;   // Remember that we are performing the speed test to make sure we add the test histories to the total before the tally reports.
00701 
00702       if (node_speed<0.0f)    // Speed test not performed before (first projection being simulated): set num_blocks_speed_test and histories_speed_test.
00703       {
00704         num_blocks_speed_test = guestimate_GPU_performance(gpu_id);  // Guestimating a good number of blocks to estimate the speed of different generations of GPUs. Slower GPUs will simulate less particles and hopefully the fastest GPUs will not have to wait much.
00705         
00706               // !!DeBuG!! Error in code version 1.2 has been corrected here. Old code:   histories_speed_test = (unsigned long long int)(num_blocks_speed_test*num_threads_per_block)*(unsigned long long int)(histories_per_thread);
00707         
00708       }
00709       
00710       histories_speed_test = (unsigned long long int)(num_blocks_speed_test*num_threads_per_block)*(unsigned long long int)(histories_per_thread);
00711 
00712 
00713       // Re-load the input total number of histories and the random seed:
00714       total_histories = total_histories_INPUT;
00715       seed_input = seed_input_INPUT;                
00716       
00717       dim3  blocks_speed_test(num_blocks_speed_test, 1);
00718       dim3 threads_speed_test(num_threads_per_block, 1);
00719 
00720       
00721       // -- Init the current random number generator seed to avoid overlapping sequences with other MPI threads:      
00722       if (simulating_by_time == 1) 
00723         // Simulating by time: set an arbitrary huge number of particles to skip.
00724         update_seed_PRNG( (myID + num_p*numprocs), (unsigned long long int)(123456789012), &seed_input);     // Set the random number seed far from any other MPI thread (myID) and away from the seeds used in the previous projections (num_p*numprocs).
00725       else  
00726         // Simulating by histories
00727         update_seed_PRNG( (myID + num_p*numprocs), total_histories, &seed_input);   //  Using different random seeds for each projection            
00728       
00729       #ifdef USING_MPI
00730         printf("        ==> CUDA (MPI process #%d in \"%s\"): estimate GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total (random seed: %d).\n", myID, MPI_processor_name, num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test, seed_input);
00731       #else
00732         printf("        ==> CUDA: Estimating the GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total.\n", num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test);
00733       #endif  
00734       fflush(stdout); 
00735       
00736       clock_kernel = clock();
00737       
00738       // -- Launch Monte Carlo simulation kernel for the speed test:
00739       track_particles<<<blocks_speed_test,threads_speed_test>>>(histories_per_thread, num_p, seed_input, image_device, voxels_Edep_device, voxel_mat_dens_device, mfp_Woodcock_table_device, mfp_table_a_device, mfp_table_b_device, rayleigh_table_device, compton_table_device, detector_data_device, source_data_device, materials_dose_device);
00740       
00741       
00742       #ifdef USING_MPI    
00743         // Find out the total number of histories simulated in the speed test by all the GPUs. Note that this MPI call will be executed in parallel with the GPU kernel because it is located before the cudaThreadSynchronize command!
00744       
00745         return_reduce = MPI_Allreduce(&histories_speed_test, &total_histories_speed_test, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);  
00746         if (MPI_SUCCESS != return_reduce)
00747           printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the total number of histories in the speed test test??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
00748         else
00749       #else
00750         total_histories_speed_test = histories_speed_test;
00751       #endif
00752             
00753       cudaThreadSynchronize();    // Force the runtime to wait until GPU kernel has completed
00754       getLastCudaError("\n\n !!Kernel execution failed while simulating particle tracks!! ");   // Check if the CUDA function returned any error
00755 
00756       float speed_test_time = float(clock()-clock_kernel)/CLOCKS_PER_SEC;
00757 
00758       node_speed = (float) (((double)histories_speed_test)/speed_test_time); 
00759       
00760       #ifdef USING_MPI  
00761         printf("                 (MPI process #%d): Estimated GPU speed = %lld hist / %.4f s = %.3f hist/s\n", myID, histories_speed_test, speed_test_time, node_speed);      
00762       #else
00763         printf("                  Estimated GPU speed = %lld hist / %.3f s = %.3f hist/s\n", histories_speed_test, speed_test_time, node_speed);        
00764       #endif
00765 
00766       
00767       // -- Init random number generator seed to avoid repeating the random numbers used in the speed test:
00768       update_seed_PRNG(1, histories_speed_test, &seed_input);
00769       
00770       if (simulating_by_time==1)
00771       {
00772         // -- Set number of histories for each GPU when simulating by time:
00773         if (total_histories > speed_test_time)
00774           total_histories = (total_histories - speed_test_time)*node_speed;    // Calculate the total number of remaining histories by "GPU speed" * "remaining time"
00775         else
00776           total_histories = 1;       // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories < or == 0).
00777       }
00778       else
00779       {
00780         
00781         #ifdef USING_MPI 
00782           // -- Simulating a fix number of histories divided between all GPUs (execution time variable):                     
00783           //    Compute the fraction of the total speed that accounts for the current MPI thread:
00784           return_reduce = MPI_Allreduce(&node_speed, &total_speed, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);  // Sum all the times and send result to all processes
00785           
00786           if (MPI_SUCCESS != return_reduce)
00787             printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the speed test results??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
00788           else
00789             MASTER_THREAD 
00790             {
00791               printf("       -- Total speed for all GPUs (MPI_Allreduce) = %.3f hist/s; total histories simulated in the speed test (MPI_Allreduce) = %lld.\n", total_speed, total_histories_speed_test);
00792               printf("          The master thread will simulate %.2f%% of the x rays in the simulation.\n",  100.0f*node_speed/total_speed);
00793             }
00794         #else
00795           total_speed = node_speed;
00796         #endif
00797 
00798         // - Divide the remaining histories among the MPI threads (GPUs) according to their fraction of the total speed (rounding up).
00799         if (total_histories_speed_test < total_histories)
00800           total_histories = (unsigned long long int)(0.5 + ((double)(total_histories-total_histories_speed_test)) * ((double)(node_speed/total_speed)));
00801         else
00802           total_histories = numprocs;       // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories < or == 0).
00803       }     
00804      
00805     }   // [Done with case of simulating projections by time or first projection by number of particles]    
00806     
00807     // else  ==>  if using only 1 GPU and a fixed number of histories the whole speed test is skipped. The random seed will be different for each projection because it is updated after calling the kernel below.
00808   
00809   
00810     // fflush(stdout); 
00811     // MPI_Barrier(MPI_COMM_WORLD);   // Synchronize MPI threads here if we want to have a better organized output text at the expense of losing some performance 
00812 
00813 
00814   
00815     // *** Perform the MC simulation itself (the speed test would be skipped for a single CPU thread using a fix number of histories):
00816   
00817     // -- Compute the number of CUDA blocks to simulate, rounding up and making sure it is below the limit of 65535 blocks.
00818     //    The total number of particles simulated will be increased to the nearest multiple "histories_per_thread".
00819     int total_threads = (int)(((double)total_histories)/((double)histories_per_thread) + 0.9990);     // Divide the histories among GPU threads, rounding up
00820     int total_threads_blocks = (int)(((double)total_threads)/((double)num_threads_per_block) + 0.9990);   // Divide the GPU threads among CUDA blocks, rounding up
00821     if (total_threads_blocks>65535)
00822     {     
00823       #ifdef USING_MPI       
00824         printf("          WARNING (MPI process #%d): %d hist per thread would produce %d CUDA blocks (>65535 maximum).", myID, histories_per_thread, total_threads_blocks);
00825       #else
00826         printf("\n          WARNING: %d hist per thread would produce %d CUDA blocks, more than the maximum value of 65535.", histories_per_thread, total_threads_blocks);
00827       #endif
00828       total_threads_blocks = 65000;    // Increase the histories per thread to have exactly 65000 blocks.
00829       histories_per_thread = (int) ( ((double)total_histories)/((double)(total_threads_blocks*num_threads_per_block)) + 0.9990 );        
00830       printf(" Increasing to %d hist to run exactly %d blocks in the GPU.\n", histories_per_thread, total_threads_blocks);
00831     }
00832     else if (total_threads_blocks<1)
00833     {
00834       total_threads_blocks = 1;        // Make sure we have at least 1 block to run
00835     }      
00836     
00837     total_histories = ((unsigned long long int)(total_threads_blocks*num_threads_per_block))*histories_per_thread;   // Total histories will be equal or higher than the input value due to the rounding up in the division of the histories
00838     float total_histories_current_kernel_float = (float)total_histories;   // Keep a float approx of the num histories for the timing below
00839     
00840     #ifdef USING_MPI  
00841       MASTER_THREAD printf("\n\n");
00842       printf("        ==> CUDA (MPI process #%d in \"%s\"): Executing %d blocks of %d threads, with %d histories in each thread: %lld histories in total (random seed: %d).\n", myID, MPI_processor_name, total_threads_blocks, num_threads_per_block, histories_per_thread, total_histories, seed_input);
00843     #else
00844       printf("\n        ==> CUDA: Executing %d blocks of %d threads, with %d histories in each thread: %lld histories in total (random seed: %d).\n", total_threads_blocks, num_threads_per_block, histories_per_thread, total_histories, seed_input);     
00845     #endif
00846     fflush(stdout); 
00847     
00848     // -- Setup the execution parameters (Max number threads per block: 512, Max sizes each dimension of grid: 65535x65535x1)
00849 
00850     dim3 blocks(total_threads_blocks, 1);
00851     dim3 threads(num_threads_per_block, 1); 
00852     
00853     clock_kernel = clock();
00854 
00855     
00856     // *** Execute the x-ray transport kernel in the GPU ***
00857     track_particles<<<blocks,threads>>>(histories_per_thread, num_p, seed_input, image_device, voxels_Edep_device, voxel_mat_dens_device, mfp_Woodcock_table_device, mfp_table_a_device, mfp_table_b_device, rayleigh_table_device, compton_table_device, detector_data_device, source_data_device, materials_dose_device);
00858     
00859     
00860     if (1==doing_speed_test)
00861       total_histories += histories_speed_test;     // Speed test was done: compute the total number of histories including the particles simulated in the speed test 
00862       
00863     // -- Move the pseudo-random number generator seed ahead to skip all the random numbers generated in the current projection by this and the other
00864     //    "numprocs" MPI threads. Each projection will use independent seeds! (this code runs in parallel with the asynchronous GPU kernel):
00865     update_seed_PRNG(numprocs, total_histories, &seed_input);   // Do not repeat seed for each projection. Note that this function only updates 1 seed, the other is not computed.
00866               
00867 
00868     #ifdef USING_MPI 
00869       if (numprocs>1)  // Using more than 1 MPI thread:
00870       {
00871         // -- Compute the total number of histories simulated with all MPI thread, including the speed test (histories_speed_test==0 if speed test was skipped).
00872         //    These MPI messajes are sent concurrently with the GPU kernel computation for maximum efficiency.
00873         unsigned long long int current_GPU_histories = total_histories;  
00874         return_reduce = MPI_Reduce(&current_GPU_histories, &total_histories, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);  // Sum all the simulated particles and send to thread 0
00875                 
00876         MASTER_THREAD 
00877         {
00878           if (MPI_SUCCESS != return_reduce)
00879             printf("\n\n !!ERROR!! Error getting the total number of particles simulated in all the GPUs (MPI_Reduce). return_reduce = %d.\n\n\n", return_reduce);
00880           
00881           if (1==simulating_by_time || 1==doing_speed_test)
00882           {
00883             printf("\n       -- Total number of histories being simulated in all the GPUs for the current projection (including speed test)= %.3lld.\n\n", total_histories);
00884             fflush(stdout);
00885           }
00886         }
00887       }
00888     #endif
00889     
00890     cudaThreadSynchronize();    // Force the runtime to wait until the GPU kernel is completed
00891     getLastCudaError("\n\n !!Kernel execution failed while simulating particle tracks!! ");  // Check if kernel execution generated any error
00892 
00893     float real_GPU_speed = total_histories_current_kernel_float/(float(clock()-clock_kernel)/CLOCKS_PER_SEC);  // GPU speed for all the image simulation, not just the speed test.
00894     
00895 //     #ifdef USING_MPI   
00896 //       printf("        ==> CUDA (MPI process #%d in \"%s\"): GPU kernel execution time: %.4f s (%.3f hist/s)\n", myID, MPI_processor_name, time_kernel, total_histories_current_kernel_float/time_kernel);
00897 //     #else     
00898 //       printf("        ==> CUDA: Kernel execution time: %.4f s\n", time_kernel);
00899 //     #endif  
00900       
00901       
00902     // -- Copy the simulated image from the GPU memory to the CPU:           
00903     checkCudaErrors(cudaMemcpy(image, image_device, image_bytes, cudaMemcpyDeviceToHost) );  // Copy final results to host
00904 
00905          
00906 ///////////////////////////////////////////////////////////////////////////////////////////////////
00907   
00908     
00909 #else
00910     
00911     // *** Executing the kernel in the CPU:
00912     
00913     //     If using more than one MPI thread, the number of particles is equally dividied among the threads.  
00914     //    !!DeBuG!! --> NOT USING SPEED TEST IN THE CPU!! Not possible to limit the execution by time in the CPU.
00915     
00916     int total_threads = (int)(((double)total_histories)/((double)histories_per_thread*numprocs) + 0.9990);     // Divide the histories among MPI threads, rounding up
00917     unsigned long long int total_histories_per_thread = ((unsigned long long int)(total_threads))*histories_per_thread;
00918     total_histories = total_histories_per_thread*numprocs;    // Total histories will be equal or higher than the input value due to the rounding up in the division of the histories       
00919     
00920     
00921     if (numprocs>1) 
00922     {
00923       #ifdef USING_MPI       
00924         update_seed_PRNG(myID, total_histories, &seed_input);   // Compute the initial random seed for each MPI threads, avoiding overlapping of the random sequences
00925       
00926         printf("       Executing %d history batches in the CPU, with %d histories in each batch  (thread %d of %d at \'%s\'): %lld histories (random seed=%d).\n", total_threads, histories_per_thread, myID+1, numprocs, MPI_processor_name, total_histories_per_thread, seed_input);
00927         MASTER_THREAD printf("       Simulating %lld histories in total for the %d MPI threads.\n\n", total_histories, numprocs);
00928       #endif
00929     }
00930     else
00931     {
00932       printf("       Executing %d history batches in the CPU, with %d histories in each batch: %lld histories in total.\n\n", total_threads, histories_per_thread, total_histories);
00933     }    
00934     fflush(stdout); 
00935     
00936 
00937     // -- Copy local structures to global struct variables accessible from "track_particles" (__constant__ variables in the GPU):
00938     source_energy_data_CONST = source_energy_data;
00939     voxel_data_CONST = voxel_data;
00940     mfp_table_data_CONST = mfp_table_data;
00941     dose_ROI_x_min_CONST = dose_ROI_x_min;
00942     dose_ROI_x_max_CONST = dose_ROI_x_max;
00943     dose_ROI_y_min_CONST = dose_ROI_y_min;
00944     dose_ROI_y_max_CONST = dose_ROI_y_max;
00945     dose_ROI_z_min_CONST = dose_ROI_z_min;
00946     dose_ROI_z_max_CONST = dose_ROI_z_max;
00947 
00948     
00949     int CPU_batch;
00950     for(CPU_batch=0; CPU_batch<total_threads; CPU_batch++)
00951     {
00952       // -- Simulate a particle track initializing the PRNG with the particle number 'n':
00953       track_particles(CPU_batch, histories_per_thread, num_p, seed_input, image, voxels_Edep, voxel_mat_dens, mfp_Woodcock_table, mfp_table_a, mfp_table_b, &rayleigh_table, &compton_table, detector_data, source_data, materials_dose);
00954     }
00955 
00956     
00957 #endif    
00958 
00959 
00960     // Get current time and calculate execution time in the MC loop:
00961     time_elapsed_MC_loop = ((double)(clock()-clock_start))/CLOCKS_PER_SEC;       
00962     time_total_MC_simulation += time_elapsed_MC_loop;   // Count total time (in seconds).
00963         //  printf("\n    -- MONTE CARLO LOOP finished: time tallied in MAIN program: %.3f s\n\n", time_elapsed_MC_loop);
00964         
00965 
00966 
00967 ///////////////////////////////////////////////////////////////////////////////////////////////////
00968      
00969 
00970     // *** Move the images simulated in the GPU (or multiple CPU cores) to the host memory space:
00971     
00972 #ifdef USING_MPI 
00973     if (numprocs>1)  // Using more than 1 MPI thread
00974     {
00975       // -- Add the images simulated in all the MPI threads:      
00976       MASTER_THREAD printf("\n        >>  Synchronize the MPI threads and accumulate the simulated images (MPI_Reduce).\n\n");                    
00977       
00978       // Allocate the memory for the final image in the master thread:
00979       unsigned long long int *image_MPI = NULL;
00980       MASTER_THREAD image_MPI = (unsigned long long int*) malloc(image_bytes);
00981       MASTER_THREAD if (image_MPI==NULL)
00982       {
00983         printf("\n\n   !!malloc ERROR!! Problem allocating the total MPI image. Out of memory??\n\n");  
00984         exit(-4);
00985       }
00986 
00987 
00988       // !!DeBuG!! To know how much time the threads lose waiting for other threads in the MPI_Reduce, I have to use an explicit barrier here. It may be more efficient to let the threads advance to the MPI_Reduce directly.
00989       clock_start = clock();      
00990       MPI_Barrier(MPI_COMM_WORLD);   // Synchronize MPI threads            
00991       
00992       current_time=time(NULL);      
00993       char_time = ctime(&current_time); char_time[19] = '\0';   // The time is located between the characters 11 and 19.
00994       
00995       
00996     #ifdef USING_CUDA
00997       if (1==doing_speed_test)    // This message will be shown only for the first projection simulated in the GPU.
00998         printf("        ==> CUDA (MPI process #%d in \"%s\"): GPU speed = %.4f hist/s. Time spent at MPI_Barrier waiting to add the partial images: %.6f s (time: %8s)\n", myID, MPI_processor_name, real_GPU_speed, ((double)(clock()-clock_start))/CLOCKS_PER_SEC, &char_time[11]);
00999     #else        
01000       if (-1==doing_speed_test)
01001       {
01002         printf("        ==> CUDA (MPI process #%d in \"%s\"): Time spent at MPI_Barrier waiting to add the partial images: %.6f s (time: %8s)\n", myID, MPI_processor_name, ((double)(clock()-clock_start))/CLOCKS_PER_SEC, &char_time[11]);
01003         doing_speed_test = 0;
01004       }
01005     #endif  
01006       
01007       
01008       fflush(stdout);      
01009       
01010       MASTER_THREAD clock_start = clock();
01011                       
01012       // -- Sum the pixel values from the different simulated images and send to thread 0.
01013       //    MPI_Reduce will act as a synchronization barrier for all the MPI threads.
01014       int num_pixels_image = image_bytes/((int)sizeof(unsigned long long int));   // Number of elements allocated in the "image" array.         
01015       return_reduce = MPI_Reduce(image, image_MPI, num_pixels_image, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD); 
01016       
01017       if (MPI_SUCCESS != return_reduce)
01018       {
01019         printf("\n\n !!ERROR!! Possible error reducing (MPI_SUM) the image results??? Returned value MPI_Reduce = %d\n\n\n", return_reduce);
01020       }
01021               
01022       // -- Exchange the image simulated in thread 0 for the final image from all threads, in the master thread:
01023       MASTER_THREAD 
01024       {
01025         free(image);
01026         image = image_MPI;    // point the image pointer to the new image in host memory
01027         image_MPI = NULL;                
01028 
01029         printf("\n       -- Time reducing the images simulated by all the MPI threads (MPI_Reduce) according to the master thread = %.6f s.\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC); 
01030       }
01031     }
01032 #endif
01033                 
01034 
01035     // *** Report the final results:
01036     char file_name_output_num_p[253];
01037     if (1==num_projections)
01038       strcpy(file_name_output_num_p, file_name_output);   // Use the input name for single projection
01039     else
01040       sprintf(file_name_output_num_p, "%s_%04d", file_name_output, num_p);   // Create the output file name with the input name + projection number (4 digits, padding with 0)
01041 
01042     MASTER_THREAD report_image(file_name_output_num_p, detector_data, source_data, mean_energy_spectrum, image, time_elapsed_MC_loop, total_histories, num_p, num_projections, D_angle, initial_angle, myID, numprocs);
01043 
01044     // *** Clear the image after reporting, unless this is the last projection to simulate:
01045     if (num_p<(num_projections-1))
01046     {
01047       int pixels_per_image = detector_data[0].num_pixels.x * detector_data[0].num_pixels.y;
01048       #ifdef USING_CUDA
01049         MASTER_THREAD printf("       ==> CUDA: Launching kernel to reset the device image to 0: number of blocks = %d, threads per block = 128\n", (int)(ceil(pixels_per_image/128.0f)+0.01f) );
01050         init_image_array_GPU<<<(int)(ceil(pixels_per_image/128.0f)+0.01f),128>>>(image_device, pixels_per_image);
01051         cudaThreadSynchronize();
01052         getLastCudaError("\n\n !!Kernel execution failed initializing the image array!! ");  // Check if kernel execution generated any error:
01053       #else        
01054         memset(image, 0, image_bytes);     //   Init memory space to 0.  (see http://www.lainoox.com/c-memset-examples/)
01055       #endif
01056     }
01057     
01058   }  // [Projection loop end: iterate for next CT projection angle]
01059 
01060 
01061 ///////////////////////////////////////////////////////////////////////////////////////////////////
01062           
01063      
01064   // *** Simulation finished! Report dose and timings and clean up.
01065 
01066 #ifdef USING_CUDA
01067   if (dose_ROI_x_max > -1)
01068   {   
01069     MASTER_THREAD clock_kernel = clock();    
01070 
01071     checkCudaErrors( cudaMemcpy( voxels_Edep, voxels_Edep_device, voxels_Edep_bytes, cudaMemcpyDeviceToHost) );  // Copy final dose results to host (for every MPI threads)
01072 
01073     MASTER_THREAD printf("       ==> CUDA: Time copying dose results from device to host: %.6f s\n", float(clock()-clock_kernel)/CLOCKS_PER_SEC);
01074   }
01075   
01076   if (flag_material_dose==1)
01077     checkCudaErrors( cudaMemcpy( materials_dose, materials_dose_device, MAX_MATERIALS*sizeof(ulonglong2), cudaMemcpyDeviceToHost) );  // Copy materials dose results to host, if tally enabled in input file.   !!tally_materials_dose!!
01078 
01079   // -- Clean up GPU device memory:
01080   clock_kernel = clock();    
01081 
01082   cudaFree(voxel_mat_dens_device);
01083   cudaFree(image_device);
01084   cudaFree(mfp_Woodcock_table_device);
01085   cudaFree(mfp_table_a_device);
01086   cudaFree(mfp_table_b_device);
01087   cudaFree(voxels_Edep_device);
01088   checkCudaErrors( cudaThreadExit() );
01089 
01090   MASTER_THREAD printf("       ==> CUDA: Time freeing the device memory and ending the GPU threads: %.6f s\n", float(clock()-clock_kernel)/CLOCKS_PER_SEC);
01091 
01092 #endif
01093 
01094 
01095 #ifdef USING_MPI
01096   current_time=time(NULL);     // Get current time (in seconds)
01097   char_time = ctime(&current_time); char_time[19] = '\0';   // The time is located betwen the characters 11 and 19.  
01098   printf("        >> MPI thread %d in \"%s\" done! (local time: %s)\n", myID, MPI_processor_name, &char_time[11]);
01099   fflush(stdout);   // Clear the screen output buffer
01100 #endif
01101 
01102 
01103   
01104   // *** Report the total dose for all the projections, if the tally is not disabled (must be done after MPI_Barrier to have all the MPI threads synchronized):
01105   MASTER_THREAD clock_start = clock(); 
01106   
01107   if (dose_ROI_x_max > -1)
01108   {   
01109     
01110 #ifdef USING_MPI
01111     if (numprocs>1)
01112     {
01113       // -- Use MPI_Reduce to accumulate the dose from all projections:      
01114       //    Allocate memory in the root node to combine the dose results with MPI_REDUCE:
01115       int num_voxels_ROI = voxels_Edep_bytes/((int)sizeof(ulonglong2));   // Number of elements allocated in the "dose" array.
01116       ulonglong2 *voxels_Edep_total = (ulonglong2*) malloc(voxels_Edep_bytes);
01117       if (voxels_Edep_total==NULL)
01118       {
01119         printf("\n\n   !!malloc ERROR!! Not enough memory to allocate %d voxels by the MPI root node for the total deposited dose (and uncertainty) array (%f Mbytes)!!\n\n", num_voxels_ROI, voxels_Edep_bytes/(1024.f*1024.f));
01120         exit(-2);
01121       }
01122       else
01123       {
01124         MASTER_THREAD
01125         {
01126           printf("\n        >> Array for the total deposited dose correctly allocated by the MPI root node (%f Mbytes).\n", voxels_Edep_bytes/(1024.f*1024.f));
01127           printf(  "           Waiting at MPI_Barrier for thread synchronization.\n");
01128         }
01129       }      
01130       
01131       
01132       MASTER_THREAD printf("\n        >> Calling MPI_Reduce to accumulate the dose from all projections...\n\n");    
01133       
01134       return_reduce = MPI_Reduce(voxels_Edep, voxels_Edep_total, 2*num_voxels_ROI, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);   // Sum all the doses in "voxels_Edep_total" at thread 0.
01135             // !!DeBuG!! I am sending a "ulonglong2" array as if it was composed of 2 "ulonglong" variables per element. There could be problems if the alignment in the structure includes some extra padding space (but it seems ok for a 64-bit computer).
01136       if (MPI_SUCCESS != return_reduce)
01137       {
01138         printf("\n\n !!ERROR!! Possible error reducing (MPI_SUM) the dose results??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
01139       }
01140 
01141       // -- Exchange the dose simulated in thread 0 for the final dose from all threads  
01142       MASTER_THREAD
01143       {
01144         free(voxels_Edep);
01145         voxels_Edep = voxels_Edep_total;    // point the voxels_Edep pointer to the final voxels_Edep array in host memory
01146         voxels_Edep_total = NULL;           // This pointer is not needed by now
01147       }
01148     }
01149 #endif
01150         
01151     // -- Report the total dose for all the projections:
01152     MASTER_THREAD report_voxels_dose(file_dose_output, num_projections, &voxel_data, voxel_mat_dens, voxels_Edep, time_total_MC_simulation, total_histories, dose_ROI_x_min, dose_ROI_x_max, dose_ROI_y_min, dose_ROI_y_max, dose_ROI_z_min, dose_ROI_z_max, source_data);        
01153   }
01154   
01155   
01156   // -- Report "tally_materials_dose" with data from all MPI threads, if tally enabled:
01157   if (flag_material_dose==1)
01158   {
01159   #ifdef USING_MPI
01160     ulonglong2 materials_dose_total[MAX_MATERIALS];
01161     return_reduce = MPI_Reduce(materials_dose, materials_dose_total, 2*MAX_MATERIALS, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);   // !!tally_materials_dose!!
01162   #else
01163     ulonglong2 *materials_dose_total = materials_dose;  // Create a dummy pointer to the materials_dose data 
01164   #endif
01165     
01166     MASTER_THREAD report_materials_dose(num_projections, total_histories, density_nominal, materials_dose_total, mass_materials);    // Report the material dose  !!tally_materials_dose!!
01167   }
01168   
01169   MASTER_THREAD clock_end = clock();
01170   MASTER_THREAD printf("\n\n       ==> CUDA: Time reporting the dose data: %.6f s\n", ((double)(clock_end-clock_start))/CLOCKS_PER_SEC);
01171   
01172 
01173   // *** Clean up RAM memory. If CUDA was used, the geometry and table data were already cleaned for MPI threads other than root after copying data to the GPU:
01174   free(voxels_Edep);
01175   free(image);
01176 #ifdef USING_CUDA
01177   MASTER_THREAD free(voxel_mat_dens);
01178 #else
01179   free(voxel_mat_dens);
01180   free(mfp_Woodcock_table);
01181   free(mfp_table_a);
01182   free(mfp_table_b);
01183 #endif
01184    
01185 
01186 #ifdef USING_MPI
01187   MPI_Finalize();   // Finalize MPI library: no more MPI calls allowed below.
01188 #endif
01189   
01190     
01191   MASTER_THREAD 
01192   {
01193     printf("\n\n\n    -- SIMULATION FINISHED!\n");
01194     
01195     time_total_MC_init_report = ((double)(clock()-clock_start_beginning))/CLOCKS_PER_SEC;
01196 
01197     // -- Report total performance:
01198     printf("\n\n       ****** TOTAL SIMULATION PERFORMANCE (including initialization and reporting) ******\n\n");  
01199     printf(    "          >>> Execution time including initialization, transport and report: %.3f s.\n", time_total_MC_init_report);
01200     printf(    "          >>> Time spent in the Monte Carlo transport only: %.3f s.\n", time_total_MC_simulation);
01201     printf(    "          >>> Time spent in initialization, reporting and clean up: %.3f s.\n\n", (time_total_MC_init_report-time_total_MC_simulation));
01202     printf(    "          >>> Total number of simulated x rays:  %lld\n", total_histories*((unsigned long long int)num_projections));      
01203     if (time_total_MC_init_report>0.000001)
01204       printf(  "          >>> Total speed (using %d thread, including initialization time) [x-rays/s]:  %.2f\n\n", numprocs, (double)(total_histories*((unsigned long long int)num_projections))/time_total_MC_init_report);
01205   
01206     current_time=time(NULL);     // Get current time (in seconds)
01207     
01208     printf("\n****** Code execution finished on: %s\n\n", ctime(&current_time));
01209   }
01210   
01211 #ifdef USING_CUDA
01212   cudaDeviceReset();  // Destroy the CUDA context before ending program (flush visual debugger data).
01213 #endif
01214 
01215   return 0;
01216 }
01217 
01218 
01219 
01220 
01221 
01222 ////////////////////////////////////////////////////////////////////////////////
01223 //! Read the input file given in the command line and return the significant data.
01224 //! Example input file:
01225 //!
01226 //!    1000000          [Total number of histories to simulate]
01227 //!    geometry.vox     [Voxelized geometry file name]
01228 //!    material.mat     [Material data file name]
01229 //!
01230 //!       @param[in] argc   Command line parameters
01231 //!       @param[in] argv   Command line parameters: name of input file
01232 //!       @param[out] total_histories  Total number of particles to simulate
01233 //!       @param[out] seed_input   Input random number generator seed
01234 //!       @param[out] num_threads_per_block   Number of CUDA threads for each GPU block
01235 //!       @param[out] detector_data
01236 //!       @param[out] image
01237 //!       @param[out] source_data
01238 //!       @param[out] file_name_voxels
01239 //!       @param[out] file_name_materials
01240 //!       @param[out] file_name_output
01241 ////////////////////////////////////////////////////////////////////////////////
01242 void read_input(int argc, char** argv, int myID, unsigned long long int* total_histories, int* seed_input, int* gpu_id, int* num_threads_per_block, int* histories_per_thread, struct detector_struct* detector_data, unsigned long long int** image_ptr, int* image_bytes, struct source_struct* source_data, struct source_energy_struct* source_energy_data, char* file_name_voxels, char file_name_materials[MAX_MATERIALS][250] , char* file_name_output, char* file_name_espc, int* num_projections, double* D_angle, double* angularROI_0, double* angularROI_1, double* initial_angle, ulonglong2** voxels_Edep_ptr, int* voxels_Edep_bytes, char* file_dose_output, short int* dose_ROI_x_min, short int* dose_ROI_x_max, short int* dose_ROI_y_min, short int* dose_ROI_y_max, short int* dose_ROI_z_min, short int* dose_ROI_z_max, double* SRotAxisD, double* vertical_translation_per_projection, int* flag_material_dose)
01243 {
01244   FILE* file_ptr = NULL;
01245   char new_line[250];
01246   char *new_line_ptr = NULL;
01247   double dummy_double;
01248 
01249   // -- Read the input file name from command line, if given (otherwise keep default value):
01250   if (2==argc)
01251   {
01252     file_ptr = fopen(argv[1], "r");
01253     if (NULL==file_ptr)
01254     {
01255       printf("\n\n   !!read_input ERROR!! Input file not found or not readable. Input file name: \'%s\'\n\n", argv[1]);      
01256         //  Not finalizing MPI here because we want the execution to fail if there is a problem with any MPI thread!!! MPI_Finalize();   // Finalize MPI library: no more MPI calls allowed below.
01257       exit(-1);
01258     }
01259   }
01260   else if (argc>2)
01261   {
01262     
01263     MASTER_THREAD printf("\n\n   !!read_input ERROR!! Too many input parameter (argc=%d)!! Provide only the input file name.\n\n", argc);    
01264     // Finalizing MPI because all threads will detect the same problem and fail together.
01265     #ifdef USING_MPI
01266       MPI_Finalize();
01267     #endif
01268     exit(-1);
01269   }
01270   else
01271   {
01272     MASTER_THREAD printf("\n\n   !!read_input ERROR!! Input file name not given as an execution parameter!! Try again...\n\n");
01273     #ifdef USING_MPI
01274       MPI_Finalize();
01275     #endif
01276     exit(-1);
01277   }
01278 
01279   MASTER_THREAD printf("\n    -- Reading the input file \'%s\':\n", argv[1]);
01280 
01281 
01282   // -- Init. [SECTION SIMULATION CONFIG v.2009-05-12]:
01283   do
01284   {
01285     new_line_ptr = fgets(new_line, 250, file_ptr);    // Read full line (max. 250 characters).
01286     if (new_line_ptr==NULL)
01287     {
01288       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION SIMULATION CONFIG v.2009-05-12\'!!\n");
01289       exit(-2);
01290     }
01291   }
01292   while(strstr(new_line,"SECTION SIMULATION CONFIG v.2009-05-12")==NULL);   // Skip comments and empty lines until the section begins
01293   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01294     sscanf(new_line, "%lf", &dummy_double);
01295     *total_histories = (unsigned long long int) (dummy_double+0.0001);  // Maximum unsigned long long value: 18446744073709551615
01296   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01297     sscanf(new_line, "%d", seed_input);   // Set the RANECU PRNG seed (the same seed will be used to init the 2 MLCGs in RANECU)
01298   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01299     sscanf(new_line, "%d", gpu_id);       // GPU NUMBER WHERE SIMULATION WILL RUN
01300   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01301     sscanf(new_line, "%d", num_threads_per_block);  // GPU THREADS PER CUDA BLOCK
01302     
01303 #ifdef USING_CUDA
01304   if ((*num_threads_per_block%32)!=0)
01305   {
01306     MASTER_THREAD printf("\n\n   !!read_input ERROR!! The input number of GPU threads per CUDA block must be a multiple of 32 (warp size). Input value: %d !!\n\n", *num_threads_per_block);
01307     #ifdef USING_MPI
01308       MPI_Finalize();
01309     #endif
01310     exit(-2);
01311   }
01312 #endif
01313   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01314     sscanf(new_line, "%d", histories_per_thread);   // HISTORIES PER GPU THREAD
01315 
01316     
01317   // -- Init. [SECTION SOURCE v.2009-05-12]:
01318   do
01319   {
01320     new_line_ptr = fgets(new_line, 250, file_ptr);
01321     if (new_line_ptr==NULL)
01322     {
01323       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION SOURCE v.2011-07-12\'!!\n");
01324       exit(-2);
01325     }
01326   }
01327   while(strstr(new_line,"SECTION SOURCE v.2011-07-12")==NULL);   // Skip comments and empty lines until the section begins
01328 
01329 
01330 
01331     
01332   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);  // X-RAY ENERGY SPECTRUM FILE
01333     trim_name(new_line, file_name_espc);
01334     
01335   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01336     sscanf(new_line, "%f %f %f", &source_data[0].position.x, &source_data[0].position.y, &source_data[0].position.z);   // SOURCE POSITION: X Y Z [cm]
01337   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01338     sscanf(new_line, "%f %f %f", &source_data[0].direction.x, &source_data[0].direction.y, &source_data[0].direction.z);   // SOURCE DIRECTION COSINES: U V W
01339     // -- Normalize the input beam direction to 1:
01340     dummy_double = 1.0/sqrt((double)(source_data[0].direction.x*source_data[0].direction.x + source_data[0].direction.y*source_data[0].direction.y + source_data[0].direction.z*source_data[0].direction.z));
01341     source_data[0].direction.x = (float)(((double)source_data[0].direction.x)*dummy_double);
01342     source_data[0].direction.y = (float)(((double)source_data[0].direction.y)*dummy_double);
01343     source_data[0].direction.z = (float)(((double)source_data[0].direction.z)*dummy_double);
01344   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01345   
01346 
01347   // Read input fan beam polar (theta) and azimuthal (phi) aperture angles (deg):
01348   double phi_aperture, theta_aperture;
01349   sscanf(new_line, "%lf %lf", &phi_aperture, &theta_aperture);
01350 
01351   if (theta_aperture > 180.0)
01352   {
01353     MASTER_THREAD printf("\n\n   !!read_input ERROR!! Input polar aperture must be in [0,180] deg.!\n");
01354     MASTER_THREAD printf("                       theta_aperture = %lf, phi_aperture = %lf\n", theta_aperture, phi_aperture);
01355     #ifdef USING_MPI
01356       MPI_Finalize();
01357     #endif
01358     exit(-2);
01359   }
01360   if (phi_aperture > 360.0)
01361   {
01362     MASTER_THREAD printf("\n\n   !!read_input ERROR!! Input azimuthal aperture must be in [0,360] deg.!\n");
01363     MASTER_THREAD printf("                       theta_aperture = %lf, phi_aperture = %lf\n", theta_aperture, phi_aperture);
01364     #ifdef USING_MPI
01365       MPI_Finalize();
01366     #endif    
01367     exit(-2);
01368   }
01369   
01370   // Entering a negative theta_aperture or phi_aperture, the emitted fan beam will cover exactly the detector: see below.
01371     
01372   // *** RECTANGULAR BEAM INITIALIZATION: aperture initially centered at (0,1,0), ie, THETA_0=90, PHI_0=90
01373   //     Using the algorithm used in PENMAIN.f, from penelope 2008 (by F. Salvat).
01374   source_data[0].cos_theta_low = (float)( cos((90.0 - 0.5*theta_aperture)*DEG2RAD) );
01375   source_data[0].D_cos_theta   = (float)( -2.0*source_data[0].cos_theta_low );      // Theta aperture is symetric above and below 90 deg
01376   source_data[0].phi_low       = (float)( (90.0 - 0.5*phi_aperture)*DEG2RAD );
01377   source_data[0].D_phi         = (float)( phi_aperture*DEG2RAD );
01378   source_data[0].max_height_at_y1cm = (float) ( tan(0.5*theta_aperture*DEG2RAD) );
01379 
01380   
01381   // If a pencil beam is input, convert the 0 angle to a very small square beam to avoid precission errors:
01382   if (abs(theta_aperture) < 1.0e-7)
01383   {
01384     theta_aperture = +1.00e-7;
01385     source_data[0].cos_theta_low = 0.0f;  // = cos(90-0)
01386     source_data[0].D_cos_theta   = 0.0f;
01387     source_data[0].max_height_at_y1cm = 0.0f;
01388   }
01389   if (abs(phi_aperture) < 1.0e-7)
01390   {  
01391     phi_aperture = +1.00e-7;
01392     source_data[0].phi_low       = (float)( 90.0*DEG2RAD );
01393     source_data[0].D_phi         = 0.0f;    
01394   }  
01395   
01396   
01397   // -- Init. [SECTION IMAGE DETECTOR v.2009-12-02]:
01398   do
01399   {
01400     new_line_ptr = fgets(new_line, 250, file_ptr);
01401     if (new_line_ptr==NULL)
01402     {
01403       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION IMAGE DETECTOR v.2009-12-02\'!!\n");
01404       exit(-2);
01405     }
01406   }
01407   while(strstr(new_line,"SECTION IMAGE DETECTOR v.2009-12-02")==NULL);   // Skip comments and empty lines until the section begins
01408   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01409     trim_name(new_line, file_name_output);   // OUTPUT IMAGE FILE NAME (no spaces)
01410   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01411     float dummy_num_pixels_x, dummy_num_pixels_y;  // Read input pixel number as float and truncated to integer
01412     sscanf(new_line, "%f %f", &dummy_num_pixels_x, &dummy_num_pixels_y);   // NUMBER OF PIXELS IN THE IMAGE: Nx Nz
01413     detector_data[0].num_pixels.x = (int)(dummy_num_pixels_x+0.001f);
01414     detector_data[0].num_pixels.y = (int)(dummy_num_pixels_y+0.001f);
01415     detector_data[0].total_num_pixels = detector_data[0].num_pixels.x * detector_data[0].num_pixels.y;
01416 
01417     if (detector_data[0].total_num_pixels < 1 || detector_data[0].total_num_pixels > 99999999 )
01418     {
01419       MASTER_THREAD printf("\n\n   !!read_input ERROR!! The input number of pixels is incorrect. Input: X_pix = %d, Y_pix = %d, total_num_pix = %d!!\n\n", detector_data[0].num_pixels.x, detector_data[0].num_pixels.y, detector_data[0].total_num_pixels);
01420       #ifdef USING_MPI
01421         MPI_Finalize();
01422       #endif      
01423       exit(-2); 
01424     }
01425   
01426   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01427   sscanf(new_line, "%f %f", &detector_data[0].width_X, &detector_data[0].height_Z);   // IMAGE SIZE (width, height): Dx Dz [cm]
01428     detector_data[0].inv_pixel_size_X = detector_data[0].num_pixels.x / detector_data[0].width_X;
01429     detector_data[0].inv_pixel_size_Z = detector_data[0].num_pixels.y / detector_data[0].height_Z;
01430 
01431   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01432     sscanf(new_line, "%f", &detector_data[0].sdd);            // SOURCE-TO-DETECTOR DISTANCE [cm] (detector set in front of the source, normal to the input direction)
01433 
01434     float3 detector_center;   // Center of the detector straight ahead of the focal spot.
01435     detector_center.x = source_data[0].position.x + source_data[0].direction.x * detector_data[0].sdd;
01436     detector_center.y = source_data[0].position.y + source_data[0].direction.y * detector_data[0].sdd;
01437     detector_center.z = source_data[0].position.z + source_data[0].direction.z * detector_data[0].sdd;
01438         
01439     if ((detector_data[0].sdd)<1.0e-6)
01440     {
01441       MASTER_THREAD printf("\n\n   !!read_input ERROR!! The source-to-detector distance must be positive. Input: sdd=%f!!\n\n", detector_data[0].sdd);
01442       #ifdef USING_MPI
01443         MPI_Finalize();
01444       #endif      
01445       exit(-2);
01446     }
01447 
01448   if ( (theta_aperture < -1.0e-7) || (phi_aperture < -1.0e-7) )   // If we enter a negative angle, the fan beam will cover exactly the detector surface.
01449   {
01450     theta_aperture= 2.0 * atan(0.5*detector_data[0].height_Z/(detector_data[0].sdd)) * RAD2DEG;   // Optimum angles
01451     phi_aperture  = 2.0 * atan(0.5*detector_data[0].width_X/(detector_data[0].sdd)) * RAD2DEG;
01452 
01453     source_data[0].cos_theta_low = (float)( cos((90.0 - 0.5*theta_aperture)*DEG2RAD) );
01454     source_data[0].D_cos_theta   = (float)( -2.0*source_data[0].cos_theta_low );      // Theta aperture is symetric above and below 90 deg
01455     source_data[0].phi_low       = (float)( (90.0 - 0.5*phi_aperture)*DEG2RAD );
01456     source_data[0].D_phi         = (float)( phi_aperture*DEG2RAD );
01457     source_data[0].max_height_at_y1cm = (float) ( tan(0.5*theta_aperture*DEG2RAD) ); 
01458   }
01459   
01460 
01461   // -- Init. [SECTION CT SCAN v.2011-10-25]:
01462   do
01463   {
01464     new_line_ptr = fgets(new_line, 250, file_ptr);
01465     if (new_line_ptr==NULL)
01466     {
01467       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION CT SCAN TRAJECTORY v.2011-10-25\'!!\n");
01468       exit(-2);
01469     }
01470   }
01471   while(strstr(new_line,"SECTION CT SCAN TRAJECTORY v.2011-10-25")==NULL);  // Skip comments and empty lines until the section begins
01472   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01473   sscanf(new_line, "%d", num_projections);     // NUMBER OF PROJECTIONS (beam must be perpendicular to Z axis, set to 1 for a single projection)
01474   if (0 == (*num_projections))
01475     *num_projections = 1;      // Zero projections has the same effect as 1 projection (ie, no CT scan rotation). Negative values are allowed and then the source rotates in opposite direction (negative angles).
01476   if ( (fabs(*num_projections) > 1) && (fabs(source_data[0].direction.z)>0.00001f) )
01477   {
01478     MASTER_THREAD printf("\n\n   !!read_input ERROR!! Sorry, but currently we can only simulate CT scans when the source direction is perpendicular to the Z axis (ie, w=0).\n\n\n");   // The reconstructed planes are always parallel to the XY plane.\n");
01479     #ifdef USING_MPI
01480       MPI_Finalize();
01481     #endif    
01482     exit(-2);
01483   }
01484   if ( fabs(*num_projections) > MAX_NUM_PROJECTIONS )
01485   {
01486     MASTER_THREAD printf("\n\n   !!read_input ERROR!! The input number of projections is too large. Increase parameter MAX_NUM_PROJECTIONS=%d in the header file and recompile.\n", MAX_NUM_PROJECTIONS);
01487     MASTER_THREAD printf(    "                        There is no limit in the number of projections to be simulated because the source, detector data for each projection is stored in global memory and transfered to shared memory for each projection.\n\n");
01488     #ifdef USING_MPI
01489       MPI_Finalize();
01490     #endif    
01491     exit(-2);
01492   }
01493   
01494 
01495   if (*num_projections!=1)
01496   {
01497     // -- Skip rest of the section if simulating a single projection:
01498     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01499     sscanf(new_line, "%lf", D_angle);   // ANGLE BETWEEN PROJECTIONS [degrees] (360/num_projections for full CT)
01500 //     printf("       [Input] %s",new_line);
01501     *D_angle = (*D_angle)*DEG2RAD;      // store the angle in radians
01502     
01503 
01504     // Calculate initial source angle:
01505     *initial_angle = acos((double)(source_data[0].direction.x));
01506     if (source_data[0].direction.y<0)
01507       *initial_angle = -(*initial_angle);     // Correct for the fact that positive and negative angles have the same ACOS
01508     if (*initial_angle<0.0)
01509       *initial_angle = (*initial_angle) + 2.0*PI;   // Make sure the angle is not negative, between [0,360) degrees.
01510     *initial_angle = (*initial_angle) - PI;   // Correct the fact that the source is opposite to the detector (180 degrees difference).
01511     if (*initial_angle<0.0)
01512       *initial_angle = (*initial_angle) + 2.0*PI;   // Make sure the initial angle is not negative, between [0,360) degrees.
01513 
01514   
01515     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01516     sscanf(new_line, "%lf %lf", angularROI_0, angularROI_1);   // ANGLES OF INTEREST (projections outside this interval will be skipped)
01517     
01518     // if (*angularROI_0<-0.001 || *angularROI_1>360.001)
01519     // {
01520     //   MASTER_THREAD printf("\n\n   !!read_input ERROR!! The angles in the angular region of interest must be in the interval [0,360]. Input: %f, %f.\n\n\n", *angularROI_0, *angularROI_1);   // // The reconstructed planes are always parallel to the XY plane.\n");
01521     //   #ifdef USING_MPI
01522     //     MPI_Finalize();
01523     //   #endif
01524     //   exit(-2);
01525     // }
01526     
01527     *angularROI_0 = (*angularROI_0 - 0.00001)*DEG2RAD;   // Store the angles of interest in radians, increasing a little the interval to avoid floating point precision problems
01528     *angularROI_1 = (*angularROI_1 + 0.00001)*DEG2RAD;
01529 
01530 
01531     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01532     sscanf(new_line, "%lf", SRotAxisD);   // SOURCE-TO-ROTATION AXIS DISTANCE (rotation axis parallel to Z, located between the source and the detector)
01533     if (*SRotAxisD<0.0 || *SRotAxisD>detector_data[0].sdd)
01534     {
01535       MASTER_THREAD printf("\n\n   !!read_input ERROR!! Invalid source-to-rotation axis distance! Input: %f (sdd=%f).\n\n\n", *SRotAxisD, detector_data[0].sdd);
01536       #ifdef USING_MPI
01537         MPI_Finalize();
01538       #endif      
01539       exit(-2);
01540     }
01541     
01542     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01543     sscanf(new_line, "%lf", vertical_translation_per_projection);    // VERTICAL TRANSLATION BETWEEN PROJECTIONS (HELICAL SCAN)        
01544     
01545   }
01546   
01547   
01548 
01549   // -- Init. [SECTION DOSE DEPOSITION v.2012-12-12] (MC-GPU v.1.3):
01550   //    Electrons are not transported and therefore we are approximating that the dose is equal to the KERMA (energy released by the photons alone).
01551   //    This approximation is acceptable when there is electronic equilibrium and when the range of the secondary electrons is shorter than the voxel size.
01552   //    Usually the doses will be acceptable for photon energies below 1 MeV. The dose estimates may not be accurate at the interface of low density volumes.
01553   do
01554   {
01555     new_line_ptr = fgets(new_line, 250, file_ptr);
01556     if (new_line_ptr==NULL)
01557     {
01558       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION DOSE DEPOSITION v.2012-12-12\'!!\n");
01559       exit(-2);
01560     }
01561     
01562     if (strstr(new_line,"SECTION DOSE DEPOSITION v.2011-02-18")!=NULL)  // Detect previous version of input file
01563     {
01564       MASTER_THREAD printf("\n\n   !!read_input ERROR!! Please update the input file to the new version of MC-GPU (v1.3)!!\n\n    You simply have to change the input file text line:\n         [SECTION DOSE DEPOSITION v.2011-02-18]\n\n    for these two lines:\n         [SECTION DOSE DEPOSITION v.2012-12-12]\n         NO                              # TALLY MATERIAL DOSE? [YES/NO]\n\n");
01565       exit(-2);
01566     }
01567     
01568   }
01569   while(strstr(new_line,"SECTION DOSE DEPOSITION v.2012-12-12")==NULL);  // Skip comments and empty lines until the section begins
01570     
01571 
01572   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);   // TALLY MATERIAL DOSE? [YES/NO]  --> turn on/off the material dose tallied adding the Edep in each material, independently of the voxels.
01573   if (0==strncmp("YE",new_line,2) || 0==strncmp("Ye",new_line,2) || 0==strncmp("ye",new_line,2))
01574   {
01575     *flag_material_dose = 1;
01576     MASTER_THREAD printf("       Material dose deposition tally ENABLED.\n");
01577   }
01578   else if (0==strncmp("NO",new_line,2) || 0==strncmp("No",new_line,2) || 0==strncmp("no",new_line,2))
01579   {
01580     *flag_material_dose = 0;  // -- NO: disabling tally
01581     MASTER_THREAD printf("       Material dose deposition tally DISABLED.\n");    
01582   }
01583   else
01584   {
01585     MASTER_THREAD printf("\n\n   !!read_input ERROR!! Answer YES or NO in the first two line of \'SECTION DOSE DEPOSITION\' to enable or disable the material dose and 3D voxel dose tallies.\n                        Input text: %s\n\n",new_line);
01586     #ifdef USING_MPI
01587       MPI_Finalize();
01588     #endif
01589     exit(-2);
01590   }        
01591 
01592   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);   // TALLY 3D VOXEL DOSE? [YES/NO] 
01593 
01594   if (0==strncmp("YE",new_line,2) || 0==strncmp("Ye",new_line,2) || 0==strncmp("ye",new_line,2))
01595   {
01596     // -- YES: using the tally
01597     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr); trim_name(new_line, file_dose_output);   // OUTPUT DOSE FILE NAME (no spaces)
01598     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr); sscanf(new_line, "%hd %hd", dose_ROI_x_min, dose_ROI_x_max);   // # VOXELS TO TALLY DOSE: X-index min max (first voxel has index 1)
01599     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr); sscanf(new_line, "%hd %hd", dose_ROI_y_min, dose_ROI_y_max);   // # VOXELS TO TALLY DOSE: Y-index min max
01600     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr); sscanf(new_line, "%hd %hd", dose_ROI_z_min, dose_ROI_z_max);   // # VOXELS TO TALLY DOSE: Z-index min max
01601 
01602     *dose_ROI_x_min -= 1; *dose_ROI_x_max -= 1;  // -Re-scale input coordinates to have index=0 for the first voxel instead of 1.
01603     *dose_ROI_y_min -= 1; *dose_ROI_y_max -= 1;
01604     *dose_ROI_z_min -= 1; *dose_ROI_z_max -= 1;
01605 
01606     MASTER_THREAD printf("       3D voxel dose deposition tally ENABLED.\n");
01607     if ( ((*dose_ROI_x_min)>(*dose_ROI_x_max)) || ((*dose_ROI_y_min)>(*dose_ROI_y_max)) || ((*dose_ROI_z_min)>(*dose_ROI_z_max)) ||
01608           (*dose_ROI_x_min)<0 || (*dose_ROI_y_min)<0 || (*dose_ROI_z_min)<0 )
01609     {
01610       MASTER_THREAD printf("\n\n   !!read_input ERROR!! The input region-of-interst in \'SECTION DOSE DEPOSITION\' is not valid: the minimum voxel index may not be zero or larger than the maximum index.\n");
01611       MASTER_THREAD printf(  "                          Input data = X[%d,%d], Y[%d,%d], Z[%d,%d]\n\n", *dose_ROI_x_min+1, *dose_ROI_x_max+1, *dose_ROI_y_min+1, *dose_ROI_y_max+1, *dose_ROI_z_min+1, *dose_ROI_z_max+1);  // Show ROI with index=1 for the first voxel instead of 0.
01612       #ifdef USING_MPI
01613         MPI_Finalize();
01614       #endif      
01615       exit(-2);
01616     }
01617     if ( ((*dose_ROI_x_min)==(*dose_ROI_x_max)) && ((*dose_ROI_y_min)==(*dose_ROI_y_max)) && ((*dose_ROI_z_min)==(*dose_ROI_z_max)) ) 
01618     {
01619       MASTER_THREAD printf("\n\n   !!read_input!! According to the input region-of-interest in \'SECTION DOSE DEPOSITION\', only the dose in the voxel (%d,%d,%d) will be tallied.\n\n",*dose_ROI_x_min,*dose_ROI_y_min,*dose_ROI_z_min);
01620     }
01621     
01622   }
01623   else if (0==strncmp("NO",new_line,2) || 0==strncmp("No",new_line,2) || 0==strncmp("no",new_line,2))
01624   {
01625     // -- NO: disabling tally
01626     MASTER_THREAD printf("       3D voxel dose deposition tally DISABLED.\n");
01627     *dose_ROI_x_min = (short int) 32500; *dose_ROI_x_max = (short int) -32500;   // Set absurd values for the ROI to make sure we never get any dose tallied
01628     *dose_ROI_y_min = (short int) 32500; *dose_ROI_y_max = (short int) -32500;   // (the maximum values for short int variables are +-32768)
01629     *dose_ROI_z_min = (short int) 32500; *dose_ROI_z_max = (short int) -32500;
01630   }
01631   else
01632   {
01633       MASTER_THREAD printf("\n\n   !!read_input ERROR!! Answer YES or NO in the first two line of \'SECTION DOSE DEPOSITION\' to enable or disable the material dose and 3D voxel dose tallies.\n                        Input text: %s\n\n",new_line);
01634       #ifdef USING_MPI
01635         MPI_Finalize();
01636       #endif
01637       exit(-2);
01638   }
01639   MASTER_THREAD printf("\n");
01640 
01641 
01642 
01643   // -- Init. [SECTION VOXELIZED GEOMETRY FILE v.2009-11-30]:
01644   do
01645   {
01646     new_line_ptr = fgets(new_line, 250, file_ptr);
01647     if (new_line_ptr==NULL)
01648     {
01649       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION VOXELIZED GEOMETRY FILE v.2009-11-30\'!!\n");
01650       exit(-2);
01651     }
01652   }
01653   while(strstr(new_line,"SECTION VOXELIZED GEOMETRY FILE v.2009-11-30")==NULL);   // Skip comments and empty lines until the section begins
01654   new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01655   trim_name(new_line, file_name_voxels);   // VOXEL GEOMETRY FILE (penEasy 2008 format)
01656 
01657   do
01658   {
01659     new_line_ptr = fgets(new_line, 250, file_ptr);
01660     if (new_line_ptr==NULL)
01661     {
01662       printf("\n\n   !!read_input ERROR!! Input file is not readable or does not contain the string \'SECTION MATERIAL FILE LIST\'!!\n");
01663       exit(-2);
01664     }
01665   }
01666   while(strstr(new_line,"SECTION MATERIAL")==NULL);   // Skip comments and empty lines until the section begins
01667 
01668   int i;
01669   for (i=0; i<MAX_MATERIALS; i++)
01670   {
01671     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);
01672     if (new_line_ptr==NULL)
01673       file_name_materials[i][0]='\n';   // The input file is allowed to finish without defining all the materials
01674     else
01675       trim_name(new_line, file_name_materials[i]);
01676   }
01677   // [Finish reading input file]
01678 
01679   /////////////////////////////////////////////////////////////////////////////
01680 
01681   // *** Set the rotation that will bring particles from the detector plane to +Y=(0,+1,0) through a rotation around X and around Z (counter-clock):
01682   double rotX, rotZ, cos_rX, cos_rZ, sin_rX, sin_rZ;
01683       // rotX = 1.5*PI - acos(source_data[0].direction.z);  // Rotate to +Y = (0,+1,0) --> rotX_0 = 3/2*PI == -PI/2
01684     rotX = acos(source_data[0].direction.z) - 0.5*PI;  // Rotate to +Y = (0,+1,0) --> rotX_0 =  -PI/2
01685       // rotX = 0.5*PI - acos(source_data[0].direction.z);  // Rotate to +Y = (0,+1,0) --> rotX_0 =  PI/2
01686   if ( (source_data[0].direction.x*source_data[0].direction.x + source_data[0].direction.y*source_data[0].direction.y) > 1.0e-8 )   // == u^2+v^2 > 0
01687   {
01688       // rotZ = 0.5*PI - acos(source_data[0].direction.x/sqrt(source_data[0].direction.x*source_data[0].direction.x + source_data[0].direction.y*source_data[0].direction.y));
01689     if (source_data[0].direction.y >= 0.0f)
01690       rotZ = 0.5*PI - acos(source_data[0].direction.x/sqrt(source_data[0].direction.x*source_data[0].direction.x + source_data[0].direction.y*source_data[0].direction.y));
01691     else
01692       rotZ = 0.5*PI - (-acos(source_data[0].direction.x/sqrt(source_data[0].direction.x*source_data[0].direction.x + source_data[0].direction.y*source_data[0].direction.y)));
01693   }
01694   else
01695     rotZ = 0.0;   // Vector pointing to +Z, do not rotate around Z then.
01696  
01697   // -- Set the rotation matrix RzRx (called inverse because moves from the correct position to the reference at +Y):
01698   cos_rX = cos(rotX);
01699   cos_rZ = cos(rotZ);
01700   sin_rX = sin(rotX);
01701   sin_rZ = sin(rotZ);
01702 
01703   // Rotation matrix RxRz:
01704   detector_data[0].rot_inv[0] =  cos_rZ;
01705   detector_data[0].rot_inv[1] = -sin_rZ;
01706   detector_data[0].rot_inv[2] =  0.0f;
01707   detector_data[0].rot_inv[3] =  cos_rX*sin_rZ;
01708   detector_data[0].rot_inv[4] =  cos_rX*cos_rZ;
01709   detector_data[0].rot_inv[5] = -sin_rX;
01710   detector_data[0].rot_inv[6] =  sin_rX*sin_rZ;
01711   detector_data[0].rot_inv[7] =  sin_rX*cos_rZ;
01712   detector_data[0].rot_inv[8] =  cos_rX;
01713 
01714 
01715 
01716   if ((source_data[0].direction.y > 0.99999f) && (*num_projections==1))
01717   {
01718     // Simulating a single projection and initial beam pointing to +Y: no rotation needed!!
01719     detector_data[0].rotation_flag = 0;
01720     detector_data[0].corner_min_rotated_to_Y.x = detector_center.x;
01721     detector_data[0].corner_min_rotated_to_Y.y = detector_center.y;
01722     detector_data[0].corner_min_rotated_to_Y.z = detector_center.z;
01723 
01724     MASTER_THREAD printf("       Source pointing to (0,1,0): detector not rotated, initial location in voxels found faster.\n");  // maximizing code efficiency -> the simulation will be faster than for other angles (but not much).");
01725 
01726   }
01727   else
01728   { // Rotation needed to set the detector perpendicular to +Y:
01729     detector_data[0].rotation_flag = 1;
01730     // -- Rotate the detector center to +Y:
01731     detector_data[0].corner_min_rotated_to_Y.x = detector_center.x*detector_data->rot_inv[0] + detector_center.y*detector_data[0].rot_inv[1] + detector_center.z*detector_data[0].rot_inv[2];
01732     detector_data[0].corner_min_rotated_to_Y.y = detector_center.x*detector_data[0].rot_inv[3] + detector_center.y*detector_data[0].rot_inv[4] + detector_center.z*detector_data[0].rot_inv[5];
01733     detector_data[0].corner_min_rotated_to_Y.z = detector_center.x*detector_data[0].rot_inv[6] + detector_center.y*detector_data[0].rot_inv[7] + detector_center.z*detector_data[0].rot_inv[8];
01734 
01735     MASTER_THREAD printf("       Rotations from the input direction to +Y [deg]: rotZ = %f , rotX = %f\n", rotZ*RAD2DEG, rotX*RAD2DEG);
01736     
01737   }
01738   // -- Set the lower corner (minimum) coordinates at the normalized orientation: +Y. The detector has thickness 0.
01739   detector_data[0].corner_min_rotated_to_Y.x = detector_data[0].corner_min_rotated_to_Y.x - 0.5*detector_data[0].width_X;
01740 //  detector_data[0].corner_min_rotated_to_Y.y = detector_data[0].corner_min_rotated_to_Y.y;
01741   detector_data[0].corner_min_rotated_to_Y.z = detector_data[0].corner_min_rotated_to_Y.z - 0.5*detector_data[0].height_Z;
01742   
01743   detector_data[0].center.x = source_data[0].position.x + source_data[0].direction.x * detector_data[0].sdd;
01744   detector_data[0].center.y = source_data[0].position.y + source_data[0].direction.y * detector_data[0].sdd;
01745   detector_data[0].center.z = source_data[0].position.z + source_data[0].direction.z * detector_data[0].sdd;
01746 
01747   
01748   
01749   /////////////////////////////////////////////////////////////////////////////
01750 
01751   // *** Init the fan beam source model:
01752 
01753   if (1 == detector_data[0].rotation_flag)
01754   {
01755     // Initial beam NOT pointing to +Y: rotation is needed to move the sampled vector from (0,1,0) to the given direction!!
01756     rotX = 0.5*PI - acos(source_data[0].direction.z);         // ! Rotation about X: acos(wsrc)==theta, theta=90 for alpha=0, ie, +Y.
01757     rotZ = atan2(source_data[0].direction.y, source_data[0].direction.x) - 0.5*PI;   // ! Rotation about Z:  initial phi = 90 (+Y).  [ATAN2(v,u) = TAN(v/u), with the angle in the correct quadrant.
01758     cos_rX = cos(rotX);
01759     cos_rZ = cos(rotZ);
01760     sin_rX = sin(rotX);
01761     sin_rZ = sin(rotZ);
01762     // --Rotation around X (alpha) and then around Z (phi): Rz*Rx (oposite of detector rotation)
01763     source_data[0].rot_fan[0] =  cos_rZ;
01764     source_data[0].rot_fan[1] = -cos_rX*sin_rZ;
01765     source_data[0].rot_fan[2] =  sin_rX*sin_rZ;
01766     source_data[0].rot_fan[3] =  sin_rZ;
01767     source_data[0].rot_fan[4] =  cos_rX*cos_rZ;
01768     source_data[0].rot_fan[5] = -sin_rX*cos_rZ;
01769     source_data[0].rot_fan[6] =  0.0f;
01770     source_data[0].rot_fan[7] =  sin_rX;
01771     source_data[0].rot_fan[8] =  cos_rX;
01772     
01773     MASTER_THREAD printf("       Rotations from +Y to the input direction for the fan beam source model [deg]: rotZ = %f , rotX = %f\n", rotZ*RAD2DEG, rotX*RAD2DEG);
01774   }      
01775 
01776 
01777   /////////////////////////////////////////////////////////////////////////////
01778 
01779 
01780   // *** Allocate array for the 4 detected images (non-scattered, Compton, Rayleigh, multiple-scatter):
01781   int pixels_per_image = detector_data[0].num_pixels.x * detector_data[0].num_pixels.y;
01782   *image_bytes = 4 * pixels_per_image * sizeof(unsigned long long int);
01783   (*image_ptr) = (unsigned long long int*) malloc(*image_bytes);
01784   if (*image_ptr==NULL)
01785   {
01786     printf("\n\n   !!malloc ERROR!! Not enough memory to allocate %d pixels for the 4 scatter images (%f Mbytes)!!\n\n", pixels_per_image, (*image_bytes)/(1024.f*1024.f));
01787     exit(-2);
01788   }
01789   else
01790   {
01791     MASTER_THREAD printf("       Array for 4 scatter images correctly allocated (%d pixels, %f Mbytes)\n", pixels_per_image, (*image_bytes)/(1024.f*1024.f));
01792   }
01793 
01794 #ifndef USING_CUDA
01795   // *** Initialize the images to 0 in the CPU. The CUDA code will init it to 0 in the GPU global memory later, using kernel "init_image_array_GPU".
01796   memset(*image_ptr, 0, (*image_bytes));     // Init memory space to 0.   
01797 #endif
01798 
01799 
01800   // *** Allocate dose and dose^2 array if tally active:
01801   int num_voxels_ROI = ((int)(*dose_ROI_x_max - *dose_ROI_x_min + 1)) * ((int)(*dose_ROI_y_max - *dose_ROI_y_min + 1)) * ((int)(*dose_ROI_z_max - *dose_ROI_z_min + 1));
01802   if ((*dose_ROI_x_max)>-1)
01803   {    
01804     *voxels_Edep_bytes = num_voxels_ROI * sizeof(ulonglong2);
01805     (*voxels_Edep_ptr) = (ulonglong2*) malloc(*voxels_Edep_bytes);
01806     if (*voxels_Edep_ptr==NULL)
01807     {
01808       printf("\n\n   !!malloc ERROR!! Not enough memory to allocate %d voxels for the deposited dose (and uncertainty) array (%f Mbytes)!!\n\n", num_voxels_ROI, (*voxels_Edep_bytes)/(1024.f*1024.f));
01809       exit(-2);
01810     }
01811     else
01812     {
01813       MASTER_THREAD printf("       Array for the deposited dose ROI (and uncertainty) correctly allocated (%d voxels, %f Mbytes)\n", num_voxels_ROI, (*voxels_Edep_bytes)/(1024.f*1024.f));
01814     }
01815   }
01816   else
01817   {
01818     (*voxels_Edep_bytes) = 0;
01819   }
01820   
01821   // *** Initialize the voxel dose to 0 in the CPU. Not necessary for the CUDA code if dose matrix init. in the GPU global memory using a GPU kernel, but needed if using cudaMemcpy.  
01822   if ((*dose_ROI_x_max)>-1)
01823   {    
01824     memset(*voxels_Edep_ptr, 0, (*voxels_Edep_bytes));     // Init memory space to 0.
01825   }
01826 
01827   return;
01828 }
01829 
01830 
01831 
01832 ////////////////////////////////////////////////////////////////////////////////
01833 //! Extract a file name from an input text line, trimming the initial blanks,
01834 //! trailing comment (#) and stopping at the first blank (the file name should
01835 //! not contain blanks).
01836 //!
01837 //!       @param[in] input_line   Input sentence with blanks and a trailing comment
01838 //!       @param[out] file_name   Trimmed file name
01839 ////////////////////////////////////////////////////////////////////////////////
01840 void trim_name(char* input_line, char* file_name)
01841 {
01842   int a=0, b=0;
01843   
01844   // Discard initial blanks:
01845   while(' '==input_line[a])
01846   {
01847     a++;
01848   }
01849 
01850   // Read file name until a blank or a comment symbol (#) is found:
01851   while ((' '!=input_line[a])&&('#'!=input_line[a]))
01852   {
01853     file_name[b] = input_line[a];
01854     b++;
01855     a++;
01856   }
01857   
01858   file_name[b] = '\0';    // Terminate output string
01859 }
01860 
01861 ////////////////////////////////////////////////////////////////////////////////
01862 //! Read a line of text and trim initial blancks and trailing comments (#).
01863 //!
01864 //!       @param[in] num   Characters to read
01865 //!       @param[in] file_ptr   Pointer to the input file stream
01866 //!       @param[out] trimmed_line   Trimmed line from input file, skipping empty lines and comments
01867 ////////////////////////////////////////////////////////////////////////////////
01868 char* fgets_trimmed(char* trimmed_line, int num, FILE* file_ptr)
01869 {
01870   char  new_line[250];
01871   char *new_line_ptr = NULL;
01872   int a=0, b=0;
01873   trimmed_line[0] = '\0';   //  Init with a mark that means no file input
01874   
01875   do
01876   {
01877     a=0; b=0;
01878     new_line_ptr = fgets(new_line, num, file_ptr);   // Read new line
01879     if (new_line_ptr != NULL)
01880     {
01881       // Discard initial blanks:
01882       while(' '==new_line[a])
01883       {
01884         a++;
01885       }
01886       // Read file until a comment symbol (#) or end-of-line are found:
01887       while (('\n'!=new_line[a])&&('#'!=new_line[a]))
01888       {
01889         trimmed_line[b] = new_line[a];
01890         b++;
01891         a++;
01892       }
01893     }
01894   } while(new_line_ptr!=NULL &&  '\0'==trimmed_line[0]);   // Keep reading lines until end-of-file or a line that is not empty or only comment is found
01895   
01896   trimmed_line[b] = '\0';    // Terminate output string
01897   return new_line_ptr;
01898 }
01899 
01900 
01901 
01902 ////////////////////////////////////////////////////////////////////////////////
01903 //! Read the voxel data and allocate the material and density matrix.
01904 //! Also find and report the maximum density defined in the geometry.
01905 //!
01906 // -- Sample voxel geometry file:
01907 //
01908 //   #  (comment lines...)
01909 //   #
01910 //   #   Voxel order: X runs first, then Y, then Z.
01911 //   #
01912 //   [SECTION VOXELS HEADER v.2008-04-13]
01913 //   411  190  113      No. OF VOXELS IN X,Y,Z
01914 //   5.000e-02  5.000e-02  5.000e-02    VOXEL SIZE (cm) ALONG X,Y,Z
01915 //   1                  COLUMN NUMBER WHERE MATERIAL ID IS LOCATED
01916 //   2                  COLUMN NUMBER WHERE THE MASS DENSITY IS LOCATED
01917 //   1                  BLANK LINES AT END OF X,Y-CYCLES (1=YES,0=NO)
01918 //   [END OF VXH SECTION]
01919 //   1 0.00120479
01920 //   1 0.00120479
01921 //   ...
01922 //
01923 //!       @param[in] file_name_voxels  Name of the voxelized geometry file.
01924 //!       @param[out] density_max  Array with the maximum density for each material in the voxels.
01925 //!       @param[out] voxel_data   Pointer to a structure containing the voxel number and size.
01926 //!       @param[out] voxel_mat_dens_ptr   Pointer to the vector with the voxel materials and densities.
01927 //!       @param[in] dose_ROI_x/y/z_max   Size of the dose ROI: can not be larger than the total number of voxels in the geometry.
01928 ////////////////////////////////////////////////////////////////////////////////
01929 void load_voxels(int myID, char* file_name_voxels, float* density_max, struct voxel_struct* voxel_data, float2** voxel_mat_dens_ptr, unsigned int* voxel_mat_dens_bytes, short int* dose_ROI_x_max, short int* dose_ROI_y_max, short int* dose_ROI_z_max)
01930 {
01931   char new_line[250];
01932   char *new_line_ptr = NULL;  
01933       
01934   MASTER_THREAD if (strstr(file_name_voxels,".zip")!=NULL)
01935     printf("\n\n    -- WARNING load_voxels! The input voxel file name has the extension \'.zip\'. Only \'.gz\' compression is allowed!!\n\n");     // !!zlib!!
01936     
01937   gzFile file_ptr = gzopen(file_name_voxels, "rb");  // Open the file with zlib: the file can be compressed with gzip or uncompressed.   !!zlib!!  
01938   
01939   if (file_ptr==NULL)
01940   {
01941     printf("\n\n   !! fopen ERROR load_voxels!! File %s does not exist!!\n", file_name_voxels);
01942     exit(-2);
01943   }
01944   MASTER_THREAD 
01945   {
01946     printf("\n    -- Reading voxel file \'%s\':\n",file_name_voxels);
01947     if (strstr(file_name_voxels,".gz")==NULL)
01948       printf("         (note that MC-GPU can also read voxel and material files compressed with gzip)\n");     // !!zlib!!  
01949     fflush(stdout);
01950   }
01951   do
01952   {    
01953     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
01954     
01955     if (new_line_ptr==NULL)
01956     {
01957       MASTER_THREAD printf("\n\n   !!Reading ERROR load_voxels!! File is not readable or does not contain the string \'[SECTION VOXELS HEADER\'!!\n");
01958       exit(-2);
01959     }
01960   }
01961   while(strstr(new_line,"[SECTION VOXELS")==NULL);   // Skip comments and empty lines until the header begins
01962 
01963   float3 voxel_size;
01964   new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!   // Read full line (max. 250 characters).
01965   sscanf(new_line, "%d %d %d",&voxel_data->num_voxels.x, &voxel_data->num_voxels.y, &voxel_data->num_voxels.z);
01966   new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
01967   sscanf(new_line, "%f %f %f", &voxel_size.x, &voxel_size.y, &voxel_size.z);
01968   do
01969   {
01970     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
01971     if (new_line_ptr==NULL)
01972     {
01973       MASTER_THREAD printf("\n\n   !!Reading ERROR load_voxels!! File is not readable or does not contain the string \'[END OF VXH SECTION]\'!!\n");
01974       exit(-2);
01975     }
01976   }
01977   while(strstr(new_line,"[END OF VXH SECTION")==NULL);   // Skip rest of the header
01978 
01979   // -- Store the size of the voxel bounding box (used in the source function):
01980   voxel_data->size_bbox.x = voxel_data->num_voxels.x * voxel_size.x;
01981   voxel_data->size_bbox.y = voxel_data->num_voxels.y * voxel_size.y;
01982   voxel_data->size_bbox.z = voxel_data->num_voxels.z * voxel_size.z;
01983   
01984   MASTER_THREAD printf("       Number of voxels in the input geometry file: %d x %d x %d =  %d\n", voxel_data->num_voxels.x, voxel_data->num_voxels.y, voxel_data->num_voxels.z, (voxel_data->num_voxels.x*voxel_data->num_voxels.y*voxel_data->num_voxels.z));
01985   MASTER_THREAD printf("       Size of the input voxels: %f x %f x %f cm  (voxel volume=%f cm^3)\n", voxel_size.x, voxel_size.y, voxel_size.z, voxel_size.x*voxel_size.y*voxel_size.z);
01986   MASTER_THREAD printf("       Voxel bounding box size: %f x %f x %f cm\n", voxel_data->size_bbox.x, voxel_data->size_bbox.y,  voxel_data->size_bbox.z);
01987   // printf("       The geometry must be given in two columns, with the voxel density in the second column.\n");
01988   // printf("       The  X,Y-cycles may, or may not, be separated by blank lines.\n");
01989 
01990   // -- Make sure the input number of voxels in the vox file is compatible with the input dose ROI (ROI assumes first voxel is index 0):
01991   if ( (*dose_ROI_x_max+1)>(voxel_data->num_voxels.x) || (*dose_ROI_y_max+1)>(voxel_data->num_voxels.y) || (*dose_ROI_z_max+1)>(voxel_data->num_voxels.z) )
01992   {
01993     MASTER_THREAD printf("\n       The input region of interest for the dose deposition is larger than the size of the voxelized geometry:\n");
01994     *dose_ROI_x_max = min_value(voxel_data->num_voxels.x-1, *dose_ROI_x_max);
01995     *dose_ROI_y_max = min_value(voxel_data->num_voxels.y-1, *dose_ROI_y_max);
01996     *dose_ROI_z_max = min_value(voxel_data->num_voxels.z-1, *dose_ROI_z_max);
01997     MASTER_THREAD printf(  "       updating the ROI max limits to fit the geometry -> dose_ROI_max=(%d, %d, %d)\n", *dose_ROI_x_max+1, *dose_ROI_y_max+1, *dose_ROI_z_max+1);         // Allowing the input of an ROI larger than the voxel volume: in this case some of the allocated memory will be wasted but the program will run ok.
01998   }
01999   
02000   if ( (*dose_ROI_x_max+1)==(voxel_data->num_voxels.x) && (*dose_ROI_y_max+1)==(voxel_data->num_voxels.y) && (*dose_ROI_z_max+1)==(voxel_data->num_voxels.z) )
02001     MASTER_THREAD printf("       The voxel dose tally ROI covers the entire voxelized phantom: the dose to every voxel will be tallied.\n");
02002   else
02003     MASTER_THREAD printf("       The voxel dose tally ROI covers only a fraction of the voxelized phantom: the dose to voxels outside the ROI will not be tallied.\n");
02004  
02005   // -- Store the inverse of the pixel sides (in cm) to speed up the particle location in voxels.
02006   voxel_data->inv_voxel_size.x = 1.0f/(voxel_size.x);
02007   voxel_data->inv_voxel_size.y = 1.0f/(voxel_size.y);
02008   voxel_data->inv_voxel_size.z = 1.0f/(voxel_size.z);
02009   
02010   // -- Allocate the voxel matrix and store array size:
02011   *voxel_mat_dens_bytes = sizeof(float2)*(voxel_data->num_voxels.x)*(voxel_data->num_voxels.y)*(voxel_data->num_voxels.z);
02012   *voxel_mat_dens_ptr    = (float2*) malloc(*voxel_mat_dens_bytes);
02013   if (*voxel_mat_dens_ptr==NULL)
02014   {
02015     printf("\n\n   !!malloc ERROR load_voxels!! Not enough memory to allocate %d voxels (%f Mbytes)!!\n\n", (voxel_data->num_voxels.x*voxel_data->num_voxels.y*voxel_data->num_voxels.z), (*voxel_mat_dens_bytes)/(1024.f*1024.f));
02016     exit(-2);
02017   }
02018   MASTER_THREAD printf("\n    -- Initializing the voxel material and density vector (%f Mbytes)...\n", (*voxel_mat_dens_bytes)/(1024.f*1024.f));
02019   MASTER_THREAD fflush(stdout);
02020   
02021   // -- Read the voxel densities:
02022   //   MASTER_THREAD printf("       Reading the voxel densities... ");
02023   int i, j, k, read_lines=0, dummy_material, read_items = -99;
02024   float dummy_density;
02025   float2 *voxels_ptr = *voxel_mat_dens_ptr;
02026 
02027   for (k=0; k<MAX_MATERIALS; k++)
02028     density_max[k] = -999.0f;   // Init array with an impossible low density value
02029   
02030 
02031   for(k=0; k<(voxel_data->num_voxels.z); k++)
02032   {
02033     for(j=0; j<(voxel_data->num_voxels.y); j++)
02034     {
02035       for(i=0; i<(voxel_data->num_voxels.x); i++)
02036       {
02037         
02038         do
02039         {
02040           new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02041         } 
02042         while (('\n'==new_line[0])||('\n'==new_line[1])||('#'==new_line[0])||('#'==new_line[1]));   // Skip empty lines and comments.
02043         read_items = sscanf(new_line, "%d %f", &dummy_material, &dummy_density);    // Read the next 2 numbers
02044   
02045         if (read_items!=2)
02046           printf("\n   !!WARNING load_voxels!! Expecting to read 2 items (material and density). read_items=%d, read_lines=%d \n", read_items, read_lines);
02047   
02048         if (dummy_material>MAX_MATERIALS)
02049         {
02050           printf("\n\n   !!ERROR load_voxels!! Voxel material number too high!! #mat=%d, MAX_MATERIALS=%d, voxel number=%d\n\n", dummy_material, MAX_MATERIALS, read_lines+1);
02051           exit(-2);
02052         }
02053         if (dummy_material<1)
02054         {
02055           printf("\n\n   !!ERROR load_voxels!! Voxel material number can not be zero or negative!! #mat=%d, voxel number=%dd\n\n", dummy_material, read_lines+1);
02056           exit(-2);
02057         }
02058         
02059         if (dummy_density < 1.0e-9f)
02060         {
02061           printf("\n\n   !!ERROR load_voxels!! Voxel density can not be 0 or negative: #mat=%d, density=%f, voxel number=%d\n\n", dummy_material, dummy_density, read_lines+1);
02062           exit(-2);
02063         }        
02064         
02065         if (dummy_density > density_max[dummy_material-1])
02066           density_max[dummy_material-1] = dummy_density;  // Store maximum density for each material
02067 
02068         (*voxels_ptr).x = (float)(dummy_material)+0.0001f;  // Assign material value as float (the integer value will be recovered by truncation)
02069         (*voxels_ptr).y = dummy_density;      // Assign density value
02070         voxels_ptr++;                         // Move to next voxel
02071 
02072         read_lines++;
02073       }
02074     }
02075   }
02076   MASTER_THREAD printf("       Total number of voxels read: %d\n",read_lines);
02077   gzclose(file_ptr);     // Close input file    !!zlib!!
02078 }
02079 
02080 
02081 ////////////////////////////////////////////////////////////////////////////////
02082 //! Read the material input files and set the mean free paths and the "linear_interp" structures.
02083 //! Find the material nominal density. Set the Woodcock trick data.
02084 //
02085 // -- Sample material data file (data obtained from the PENELOPE 2006 database and models):
02086 //
02087 //    [MATERIAL NAME]
02088 //     Water
02089 //    [NOMINAL DENSITY (g/cm^3)]
02090 //     1.000
02091 //    [NUMBER OF DATA VALUES]
02092 //     4096
02093 //    [MEAN FREE PATHS :: Energy (eV) || Rayleigh | Compton | Photoelectric | Pair-production | TOTAL (cm)]
02094 //     1.00000E+03  7.27451E-01  9.43363E+01  2.45451E-04  1.00000E+35  2.45367E-04
02095 //     5.00000E+03  1.80004E+00  8.35996E+00  2.38881E-02  1.00000E+35  2.35089E-02
02096 //     1.00000E+04  4.34941E+00  6.26746E+00  2.02568E-01  1.00000E+35  1.87755E-01
02097 //     ...
02098 //     #[RAYLEIGH INTERACTIONS (RITA sampling  of atomic form factor from EPDL database)]
02099 //     ...
02100 //     #[COMPTON INTERACTIONS (relativistic impulse model with approximated one-electron analytical profiles)]
02101 //     ...
02102 //
02103 //!       @param[in] file_name_materials    Array with the names of the material files.
02104 //!       @param[in] density_max   maximum density in the geometry (needed to set Woodcock trick)
02105 //!       @param[out] density_nominal   Array with the nominal density of the materials read
02106 //!       @param[out] mfp_table_data   Constant values for the linear interpolation
02107 //!       @param[out] mfp_table_a_ptr   First element for the linear interpolation.
02108 //!       @param[out] mfp_table_b_ptr   Second element for the linear interpolation.
02109 ////////////////////////////////////////////////////////////////////////////////
02110 void load_material(int myID, char file_name_materials[MAX_MATERIALS][250], float* density_max, float* density_nominal, struct linear_interp* mfp_table_data, float2** mfp_Woodcock_table_ptr, int* mfp_Woodcock_table_bytes, float3** mfp_table_a_ptr, float3** mfp_table_b_ptr, int* mfp_table_bytes, struct rayleigh_struct *rayleigh_table_ptr, struct compton_struct *compton_table_ptr)
02111 {
02112   char new_line[250];
02113   char *new_line_ptr = NULL;
02114   int mat, i, bin, input_num_values = 0, input_rayleigh_values = 0, input_num_shells = 0;
02115   double delta_e=-99999.0;
02116 
02117   // -- Init the number of shells to 0 for all materials
02118   for (mat=0; mat<MAX_MATERIALS; mat++)
02119     compton_table_ptr->noscco[mat] = 0;
02120     
02121 
02122   // --Read the material data files:
02123   MASTER_THREAD printf("\n    -- Reading the material data files (MAX_MATERIALS=%d):\n", MAX_MATERIALS);
02124   for (mat=0; mat<MAX_MATERIALS; mat++)
02125   {
02126     if ((file_name_materials[mat][0]=='\0') || (file_name_materials[mat][0]=='\n'))  //  Empty file name
02127        continue;   // Re-start loop for next material
02128 
02129     MASTER_THREAD printf("         Mat %d: File \'%s\'\n", mat+1, file_name_materials[mat]);
02130 //     printf("    -- Reading material file #%d: \'%s\'\n", mat, file_name_materials[mat]);
02131 
02132     gzFile file_ptr = gzopen(file_name_materials[mat], "rb");    // !!zlib!!  
02133     if (file_ptr==NULL)
02134     {
02135       printf("\n\n   !!fopen ERROR!! File %d \'%s\' does not exist!!\n", mat, file_name_materials[mat]);
02136       exit(-2);
02137     }
02138     do
02139     {
02140       new_line_ptr = gzgets(file_ptr, new_line, 250);   // Read full line (max. 250 characters).   //  !!zlib!!
02141       if (new_line_ptr==NULL)
02142       {
02143         printf("\n\n   !!Reading ERROR!! File is not readable or does not contain the string \'[NOMINAL DENSITY\'!!\n");
02144         exit(-2);
02145       }
02146     }
02147     while(strstr(new_line,"[NOMINAL DENSITY")==NULL);   // Skip rest of the header
02148 
02149     // Read the material nominal density:
02150     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02151     sscanf(new_line, "# %f", &density_nominal[mat]);
02152     
02153     if (density_max[mat]>0)    //  Material found in the voxels
02154     {
02155       MASTER_THREAD printf("                Nominal density = %f g/cm^3; Max density in voxels = %f g/cm^3\n", density_nominal[mat], density_max[mat]);
02156     }
02157     else                       //  Material NOT found in the voxels
02158     {
02159       MASTER_THREAD printf("                This material is not used in any voxel.\n");
02160       
02161       // Do not lose time reading the data for materials not found in the voxels, except for the first one (needed to determine the size of the input data).      
02162       if (0 == mat)
02163         density_max[mat] = 0.01f*density_nominal[mat];   // Assign a small but positive density; this material will not be used anyway.
02164       else
02165         continue;     //  Move on to next material          
02166     }
02167       
02168 
02169     // --For the first material, set the number of energy values and allocate table arrays:
02170     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02171     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02172     sscanf(new_line, "# %d", &input_num_values);
02173     if (0==mat)
02174     {
02175       mfp_table_data->num_values = input_num_values;
02176       MASTER_THREAD printf("                Number of energy values in the mean free path database: %d.\n", input_num_values);
02177 
02178       // Allocate memory for the linear interpolation arrays:
02179       *mfp_Woodcock_table_bytes = sizeof(float2)*input_num_values;
02180       *mfp_Woodcock_table_ptr   = (float2*) malloc(*mfp_Woodcock_table_bytes);  // Allocate space for the 2 parameter table
02181       *mfp_table_bytes = sizeof(float3)*input_num_values*MAX_MATERIALS;
02182       *mfp_table_a_ptr = (float3*) malloc(*mfp_table_bytes);  // Allocate space for the 4 MFP tables
02183       *mfp_table_b_ptr = (float3*) malloc(*mfp_table_bytes);
02184       *mfp_table_bytes = sizeof(float3)*input_num_values*MAX_MATERIALS;
02185 
02186       if (input_num_values>MAX_ENERGYBINS_RAYLEIGH)
02187       {
02188         printf("\n\n   !!load_material ERROR!! Too many energy bins (Input bins=%d): increase parameter MAX_ENERGYBINS_RAYLEIGH=%d!!\n\n", input_num_values, MAX_ENERGYBINS_RAYLEIGH);
02189         exit(-2);
02190       }
02191       
02192       if ((NULL==*mfp_Woodcock_table_ptr)||(NULL==*mfp_table_a_ptr)||(NULL==*mfp_table_b_ptr))
02193       {
02194         printf("\n\n   !!malloc ERROR!! Not enough memory to allocate the linear interpolation data: %d bytes!!\n\n", (*mfp_Woodcock_table_bytes+2*(*mfp_table_bytes)));
02195         exit(-2);
02196       }
02197       else
02198       {
02199         MASTER_THREAD printf("                Linear interpolation data correctly allocated (%f Mbytes)\n", (*mfp_Woodcock_table_bytes+2*(*mfp_table_bytes))/(1024.f*1024.f));
02200       }
02201       for (i=0; i<input_num_values; i++)
02202       {
02203         (*mfp_Woodcock_table_ptr)[i].x = 99999999.99f;    // Init this array with a huge MFP, the minimum values are calculated below
02204       }
02205     }
02206     else   // Materials after first
02207     {
02208       if (input_num_values != mfp_table_data->num_values)
02209       {
02210         printf("\n\n   !!load_material ERROR!! Incorrect number of energy values given in material \'%s\': input=%d, expected=%d\n",file_name_materials[mat], input_num_values, mfp_table_data->num_values);
02211         exit(-2);
02212       }
02213     }
02214 
02215     // -- Read the mean free paths (and Rayleigh cumulative prob):
02216     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02217     new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02218     double d_energy, d_rayleigh, d_compton, d_photelectric, d_total_mfp, d_pmax, e_last=-1.0;
02219     
02220     for (i=0; i<input_num_values; i++)
02221     {
02222 
02223       new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02224       sscanf(new_line,"  %le  %le  %le  %le  %le  %le", &d_energy, &d_rayleigh, &d_compton, &d_photelectric, &d_total_mfp, &d_pmax);
02225 
02226       // Find and store the minimum total MFP at the current energy, for every material's maximum density:
02227       float temp_mfp = d_total_mfp*(density_nominal[mat])/(density_max[mat]);
02228       if (temp_mfp < (*mfp_Woodcock_table_ptr)[i].x)
02229         (*mfp_Woodcock_table_ptr)[i].x = temp_mfp;       // Store minimum total mfp [cm]
02230 
02231       // Store the inverse MFP data points with [num_values rows]*[MAX_MATERIALS columns]
02232       // Scaling the table to the nominal density so that I can re-scale in the kernel to the actual local density:
02233       (*mfp_table_a_ptr)[i*(MAX_MATERIALS)+mat].x = 1.0/(d_total_mfp*density_nominal[mat]);   // inverse TOTAL mfp * nominal density
02234       (*mfp_table_a_ptr)[i*(MAX_MATERIALS)+mat].y = 1.0/(d_compton  *density_nominal[mat]);   // inverse Compton mfp * nominal density
02235       (*mfp_table_a_ptr)[i*(MAX_MATERIALS)+mat].z = 1.0/(d_rayleigh *density_nominal[mat]);   // inverse Rayleigh mfp * nominal density
02236 
02237       rayleigh_table_ptr->pmax[i*(MAX_MATERIALS)+mat] = d_pmax;    // Store the maximum cumulative probability of atomic form factor F^2 for
02238 
02239       if (0==i && 0==mat)
02240       {
02241         mfp_table_data->e0  = d_energy;   // Store the first energy of the first material
02242       }
02243 
02244       if (0==i)
02245       {
02246         if (fabs(d_energy-mfp_table_data->e0)>1.0e-9)
02247         {
02248           printf("\n\n   !!load_material ERROR!! Incorrect first energy value given in material \'%s\': input=%f, expected=%f\n", file_name_materials[mat], d_energy, mfp_table_data->e0);
02249           exit(-2);
02250         }
02251       }
02252       else if (1==i)
02253       {
02254         delta_e = d_energy-e_last;
02255       }
02256       else if (i>1)
02257       {
02258         if (((fabs((d_energy-e_last)-delta_e))/delta_e)>0.001)  // Tolerate up to a 0.1% relative variation in the delta e (for each bin) to account for possible precission errors reading the energy values
02259         {
02260           printf("  !!ERROR reading material data!! The energy step between mean free path values is not constant!!\n      (maybe not enough decimals given for the energy values)\n      #value = %d, First delta: %f , New delta: %f, Energy: %f ; Rel.Dif=%f\n", i, delta_e, (d_energy-e_last), d_energy,((fabs((d_energy-e_last)-delta_e))/delta_e));
02261           exit(-2);
02262         }
02263       }
02264       e_last = d_energy;
02265     }
02266     
02267     if (0==mat) MASTER_THREAD printf("                Lowest energy first bin = %f eV, last bin = %f eV; bin width = %f eV\n", (mfp_table_data->e0), e_last, delta_e);
02268 
02269     // -- Store the inverse of delta energy:
02270     mfp_table_data->ide = 1.0f/delta_e;
02271 
02272     // -- Store MFP data slope 'b' (.y for Woodcock):
02273     for (i=0; i<(input_num_values-1); i++)
02274     {
02275       bin = i*MAX_MATERIALS+mat;                   // Set current bin, skipping MAX_MATERIALS columns
02276       (*mfp_table_b_ptr)[bin].x = ((*mfp_table_a_ptr)[bin+MAX_MATERIALS].x - (*mfp_table_a_ptr)[bin].x) / delta_e;
02277       (*mfp_table_b_ptr)[bin].y = ((*mfp_table_a_ptr)[bin+MAX_MATERIALS].y - (*mfp_table_a_ptr)[bin].y) / delta_e;
02278       (*mfp_table_b_ptr)[bin].z = ((*mfp_table_a_ptr)[bin+MAX_MATERIALS].z - (*mfp_table_a_ptr)[bin].z) / delta_e;
02279     }
02280     // After maximum energy (last bin), assume constant slope:
02281     (*mfp_table_b_ptr)[(input_num_values-1)*MAX_MATERIALS+mat] = (*mfp_table_b_ptr)[(input_num_values-2)*MAX_MATERIALS+mat];
02282 
02283     // -- Rescale the 'a' parameter (.x for Woodcock) as if the bin started at energy = 0: we will not have to rescale to the bin minimum energy every time
02284     for (i=0; i<input_num_values; i++)
02285     {
02286       d_energy = mfp_table_data->e0 + i*delta_e;   // Set current bin lowest energy value
02287       bin = i*MAX_MATERIALS+mat;                   // Set current bin, skipping MAX_MATERIALS columns
02288       (*mfp_table_a_ptr)[bin].x = (*mfp_table_a_ptr)[bin].x - d_energy*(*mfp_table_b_ptr)[bin].x;
02289       (*mfp_table_a_ptr)[bin].y = (*mfp_table_a_ptr)[bin].y - d_energy*(*mfp_table_b_ptr)[bin].y;
02290       (*mfp_table_a_ptr)[bin].z = (*mfp_table_a_ptr)[bin].z - d_energy*(*mfp_table_b_ptr)[bin].z;
02291     }
02292 
02293     // -- Reading data for RAYLEIGH INTERACTIONS (RITA sampling  of atomic form factor from EPDL database):
02294     do
02295     {
02296       new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02297       if (gzeof(file_ptr)!=0)                           //  !!zlib!!
02298       {
02299         printf("\n\n   !!End-of-file ERROR!! Rayleigh data not found: \"#[DATA VALUES...\" in file \'%s\'. Last line read: %s\n\n", file_name_materials[mat], new_line);
02300         exit(-2);
02301       }
02302     }
02303     while(strstr(new_line,"[DATA VALUES")==NULL);   // Skip all lines until this text is found
02304       
02305     new_line_ptr = gzgets(file_ptr, new_line, 250);   // Read the number of data points in Rayleigh     //  !!zlib!! 
02306     sscanf(new_line, "# %d", &input_rayleigh_values);
02307         
02308     if (input_rayleigh_values != NP_RAYLEIGH)
02309     {
02310       printf("\n\n   !!ERROR!! The number of values for Rayleigh sampling is different than the allocated space: input=%d, NP_RAYLEIGH=%d. File=\'%s\'\n", input_rayleigh_values, NP_RAYLEIGH, file_name_materials[mat]);
02311       exit(-2);
02312     }
02313     new_line_ptr = gzgets(file_ptr, new_line, 250);    // Comment line:  #[SAMPLING DATA FROM COMMON/CGRA/: X, P, A, B, ITL, ITU]     //  !!zlib!!
02314     for (i=0; i<input_rayleigh_values; i++)
02315     {
02316       int itlco_tmp, ituco_tmp;
02317       bin = NP_RAYLEIGH*mat + i;
02318 
02319       new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02320       sscanf(new_line,"  %e  %e  %e  %e  %d  %d", &(rayleigh_table_ptr->xco[bin]), &(rayleigh_table_ptr->pco[bin]),
02321                                                   &(rayleigh_table_ptr->aco[bin]), &(rayleigh_table_ptr->bco[bin]),
02322                                                   &itlco_tmp, &ituco_tmp);
02323 
02324       rayleigh_table_ptr->itlco[bin] = (unsigned char) itlco_tmp;
02325       rayleigh_table_ptr->ituco[bin] = (unsigned char) ituco_tmp;
02326                                                   
02327     }
02328     //  printf("    -- Rayleigh sampling data read. Input values = %d\n",input_rayleigh_values);
02329 
02330     // -- Reading COMPTON INTERACTIONS data (relativistic impulse model with approximated one-electron analytical profiles):
02331     do
02332     {
02333       new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02334       if (gzeof(file_ptr)!=0)                           //  !!zlib!!
02335       {
02336         printf("\n\n   !!End-of-file ERROR!! Compton data not found: \"[NUMBER OF SHELLS]\" in file \'%s\'. Last line read: %s\n\n", file_name_materials[mat], new_line);
02337         exit(-2);
02338       }
02339     }
02340     while(strstr(new_line,"[NUMBER OF SHELLS")==NULL);   // Skip all lines until this text is found
02341     new_line_ptr = gzgets(file_ptr, new_line, 250);
02342     sscanf(new_line, "# %d", &input_num_shells);      // Read the NUMBER OF SHELLS
02343     if (input_num_shells>MAX_SHELLS)
02344     {
02345       printf("\n\n   !!ERROR!! Too many shells for Compton interactions in file \'%s\': input=%d, MAX_SHELLS=%d\n", file_name_materials[mat], input_num_shells, MAX_SHELLS);
02346       exit(-2);
02347     }
02348     compton_table_ptr->noscco[mat] = input_num_shells;   // Store number of shells for this material in structure
02349     new_line_ptr = gzgets(file_ptr, new_line, 250);      // Comment line:  #[SHELL INFORMATION FROM COMMON/CGCO/: FCO, UICO, FJ0, KZCO, KSCO]
02350     int kzco_dummy, ksco_dummy;
02351     for (i=0; i<input_num_shells; i++)
02352     {
02353 
02354       bin = mat + i*MAX_MATERIALS;
02355 
02356       new_line_ptr = gzgets(file_ptr, new_line, 250);   //  !!zlib!!
02357       sscanf(new_line," %e  %e  %e  %d  %d", &(compton_table_ptr->fco[bin]), &(compton_table_ptr->uico[bin]),
02358                                               &(compton_table_ptr->fj0[bin]), &kzco_dummy, &ksco_dummy);
02359     }
02360   
02361     gzclose(file_ptr);    // Material data read. Close the current material input file.           //  !!zlib!!
02362     
02363   }  // ["for" loop: continue with next material]
02364 
02365 
02366   // -- Store Woodcock MFP slope in component '.y':
02367   for (i=0; i<(mfp_table_data->num_values-1); i++)
02368     (*mfp_Woodcock_table_ptr)[i].y = ((*mfp_Woodcock_table_ptr)[i+1].x - (*mfp_Woodcock_table_ptr)[i].x)/delta_e;
02369 
02370   // -- Rescale the first parameter in component .x for Woodcock
02371   for (i=0; i<mfp_table_data->num_values; i++)
02372   {
02373     (*mfp_Woodcock_table_ptr)[i].x = (*mfp_Woodcock_table_ptr)[i].x - (mfp_table_data->e0 + i*delta_e)*(*mfp_Woodcock_table_ptr)[i].y;
02374   }
02375   
02376 }
02377 ////////////////////////////////////////////////////////////////////////////////
02378 
02379 
02380 
02381 #ifdef USING_CUDA
02382 ////////////////////////////////////////////////////////////////////////////////
02383 //!  Select and initialize the CUDA-enabled GPU that will be used in the simulation.
02384 //!  Allocates and copies the simulation data in the GPU global and constant memories.
02385 //!
02386 ////////////////////////////////////////////////////////////////////////////////
02387 void init_CUDA_device( int* gpu_id, int myID, int numprocs,
02388       /*Variables to GPU constant memory:*/ struct voxel_struct* voxel_data, struct source_struct* source_data, struct source_energy_struct* source_energy_data, struct detector_struct* detector_data, struct linear_interp* mfp_table_data,
02389       /*Variables to GPU global memory:*/ float2* voxel_mat_dens, float2** voxel_mat_dens_device, unsigned int voxel_mat_dens_bytes,
02390         unsigned long long int* image, unsigned long long int** image_device, int image_bytes,
02391         float2* mfp_Woodcock_table, float2** mfp_Woodcock_table_device, int mfp_Woodcock_table_bytes,
02392         float3* mfp_table_a, float3* mfp_table_b, float3** mfp_table_a_device, float3** mfp_table_b_device, int mfp_table_bytes,
02393         struct rayleigh_struct* rayleigh_table, struct rayleigh_struct** rayleigh_table_device,
02394         struct compton_struct* compton_table, struct compton_struct** compton_table_device, 
02395         struct detector_struct** detector_data_device, struct source_struct** source_data_device,
02396         ulonglong2* voxels_Edep, ulonglong2** voxels_Edep_device, int voxels_Edep_bytes, short int* dose_ROI_x_min, short int* dose_ROI_x_max, short int* dose_ROI_y_min, short int* dose_ROI_y_max, short int* dose_ROI_z_min, short int* dose_ROI_z_max,
02397         ulonglong2* materials_dose, ulonglong2** materials_dose_device, int flag_material_dose, int num_projections)
02398 {
02399   int deviceCount;
02400   checkCudaErrors(cudaGetDeviceCount(&deviceCount));
02401   if (0==deviceCount)
02402   {
02403     printf("\n  !!ERROR!! No CUDA enabled GPU detected by thread #%d!!\n\n", myID);
02404     exit(-1);
02405   }  
02406   
02407   
02408 #ifdef USING_MPI
02409 
02410   // *** Select the appropriate GPUs in the different workstations in the MPI hostfile:
02411   //     The idea is that each threads will wait for the previous thread to send a messages with its processor name and GPU id, 
02412   //     then it will assign the current GPU, and finally it will notify the following thread:
02413       
02414   if (numprocs>1)
02415   {      
02416     const int NODE_NAME_LENGTH = 31;
02417     char processor_name[NODE_NAME_LENGTH+1], previous_processor_name[NODE_NAME_LENGTH+1];
02418     int resultlen = -1;
02419     
02420     MPI_Get_processor_name(processor_name, &resultlen);
02421     
02422     MPI_Status status;
02423     
02424     int gpu_id_to_avoid = *gpu_id;
02425 
02426     clock_t clock_start;
02427     if (myID == (numprocs-1))
02428       clock_start = clock();
02429         
02430 
02431     // Unless we are the first thread, wait for a message from the previous thread:
02432     // The MPI_Recv command will block the execution of the code until the previous threads have communicated and shared the appropriate information.
02433     if (0!=myID)
02434     {     
02435       MPI_Recv(previous_processor_name, NODE_NAME_LENGTH, MPI_CHAR, myID-1, 111, MPI_COMM_WORLD, &status);   // Receive the processor name and gpu_id from the previous thread
02436           // printf("\n -> MPI_Recv thread %d: gpu_id=%d, %s\n", myID, (int)previous_processor_name[NODE_NAME_LENGTH-1], previous_processor_name); fflush(stdout);  //!!Verbose!! 
02437     }
02438     
02439 
02440     // Compare the 30 first characters of the 2 names to see if we changed the node, except for the first thread that allways gets GPU 0:
02441     if ((0==myID) || (0!=strncmp(processor_name, previous_processor_name, NODE_NAME_LENGTH-1)))
02442     { 
02443       *gpu_id = 0;    // Thread in a new node: assign to GPU 0:
02444     }
02445     else
02446     {
02447       // Current thread in the same node as the previous one: assign next GPU (previous GPU id given in element NODE_NAME_LENGTH-1 of the array)
02448       *gpu_id = (int)previous_processor_name[NODE_NAME_LENGTH-1] + 1;
02449     }
02450 
02451     // Set the following GPU if this is the one to be skipped (given in the input file):
02452     if (*gpu_id == gpu_id_to_avoid)
02453     {
02454       *gpu_id = *gpu_id + 1;  
02455       printf("             Skipping GPU %d in thread %d (%s), as selected in the input file: gpu_id=%d\n", gpu_id_to_avoid, myID, processor_name, *gpu_id); fflush(stdout);
02456     }
02457     
02458     
02459     // Send the processor and GPU id to the following thread, unless we are the last thread:
02460     if (myID != (numprocs-1))
02461     { 
02462       processor_name[NODE_NAME_LENGTH-1] = (char)(*gpu_id);  // Store GPU number in the last element of the array
02463       
02464           // printf(" <- MPI_Send thread %d: gpu_id=%d, %s\n", myID, (int)processor_name[NODE_NAME_LENGTH-1], processor_name); fflush(stdout);  //!!Verbose!!
02465       MPI_Send(processor_name, NODE_NAME_LENGTH, MPI_CHAR, myID+1, 111, MPI_COMM_WORLD);  // Send processor name and gpu_id to the following thread (tag is the current thread id)
02466     }
02467     else
02468     {
02469       printf("           -- Time spent communicating between threads to determine the GPU id to use in each thread: %.6f s\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC); fflush(stdout);
02470     }
02471     
02472   }
02473   
02474 #endif  
02475   
02476   if (*gpu_id>=deviceCount)
02477   {
02478     printf("\n\n  !!WARNING!! The selected GPU number is too high, this device number does not exist!! GPU_id (starting at 0)=%d, deviceCount=%d\n", (*gpu_id), deviceCount); fflush(stdout);
02479     if (numprocs==1)
02480     {
02481       *gpu_id = gpuGetMaxGflopsDeviceId();
02482       printf("            Selecting the fastest GPU available using gpuGetMaxGflopsDeviceId(): GPU_id = %d\n\n", (*gpu_id)); fflush(stdout);
02483     }    
02484     else
02485     {
02486       exit(-1);    
02487     }
02488   }     
02489 
02490   checkCudaErrors(cudaSetDevice(*gpu_id));   // Set the GPU device. (optionally use: cutGetMaxGflopsDeviceId())
02491    
02492   
02493   cudaDeviceProp deviceProp;
02494   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *gpu_id));
02495   if (deviceProp.major>99 || deviceProp.minor>99)
02496   {
02497     printf("\n\n\n  !!ERROR!! The selected GPU device does not support CUDA!! GPU_id=%d, deviceCount=%d, compute capability=%d.%d\n\n\n", (*gpu_id), deviceCount, deviceProp.major,deviceProp.minor);
02498     exit(-1);
02499   }
02500 
02501   if (deviceProp.major>1)
02502   {
02503     
02504 #ifdef LARGE_CACHE  
02505     // -- Compute capability > 1: set a large L1 cache for the global memory, reducing the size of the shared memory:
02506     //       cudaFuncCachePreferShared: shared memory is 48 KB
02507     //       cudaFuncCachePreferL1: shared memory is 16 KB
02508     //       cudaFuncCachePreferNone: no preference
02509     printf("\n       ==> CUDA: LARGE_CACHE defined --> setting a large global memory cache (L1) and a small shared memory (cudaFuncCachePreferL1).\n");
02510     cudaFuncSetCacheConfig(track_particles, cudaFuncCachePreferL1);            // -- Set a large cache instead of a large shared memory.
02511         // #else
02512         // -- Using default:
02513         // printf("\n       ==> CUDA: LARGE_CACHE not defined --> setting a large shared memory and a small global memory cache (cudaFuncCachePreferShared).\n");
02514         //    cudaFuncSetCacheConfig(track_particles, cudaFuncCachePreferShared);            // !!DeBuG!! Setting size of shared memory/global cache
02515 #endif
02516 
02517   }
02518 
02519   register int GPU_cores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount;    // CUDA SDK function to get the number of GPU cores
02520 
02521   // -- Reading the device properties:
02522   
02523 #ifdef USING_MPI   
02524   printf("\n       ==> CUDA (MPI process #%d): %d CUDA enabled GPU detected! Using device #%d: \"%s\"\n", myID, deviceCount, (*gpu_id), deviceProp.name);    
02525 #else  
02526   printf("\n       ==> CUDA: %d CUDA enabled GPU detected! Using device #%d: \"%s\"\n", deviceCount, (*gpu_id), deviceProp.name);    
02527 #endif
02528   printf("                 Compute capability: %d.%d, Number multiprocessors: %d, Number cores: %d\n", deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount, GPU_cores);
02529   printf("                 Clock rate: %.2f GHz, Global memory: %.3f Mbyte, Constant memory: %.2f kbyte\n", deviceProp.clockRate*1.0e-6f, deviceProp.totalGlobalMem/(1024.f*1024.f), deviceProp.totalConstMem/1024.f);
02530   printf("                 Shared memory per block: %.2f kbyte, Registers per block: %.2f kbyte\n", deviceProp.sharedMemPerBlock/1024.f, deviceProp.regsPerBlock/1024.f);
02531   int driverVersion = 0, runtimeVersion = 0;  
02532   cudaDriverGetVersion(&driverVersion);
02533   cudaRuntimeGetVersion(&runtimeVersion);
02534   printf("                 CUDA Driver Version: %d.%d, Runtime Version: %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
02535 
02536   if (0!=deviceProp.kernelExecTimeoutEnabled)
02537   {
02538     printf("\n\n\n   !!WARNING!! The selected GPU is connected to a display and therefore CUDA driver will limit the kernel run time to 5 seconds and the simulation will likely fail!!\n");
02539     printf( "              You can fix this by executing the simulation in a different GPU (select number in the input file) or by turning off the window manager and using the text-only Linux shell.\n\n\n");
02540     // exit(-1);
02541   }    
02542 
02543   clock_t clock_init = clock();    
02544 
02545   // -- Allocate the constant variables in the device:
02546   checkCudaErrors(cudaMemcpyToSymbol(voxel_data_CONST,     voxel_data,     sizeof(struct voxel_struct)));
02547   checkCudaErrors(cudaMemcpyToSymbol(source_energy_data_CONST, source_energy_data, sizeof(struct source_energy_struct)));
02548   
02549 // Source, detector data now copied to global memory and transfered to shared memory in the kernel. OLD CODE:  checkCudaErrors(cudaMemcpyToSymbol(detector_data_CONST,  detector_data,  sizeof(struct detector_struct)));
02550   
02551   checkCudaErrors(cudaMemcpyToSymbol(mfp_table_data_CONST, mfp_table_data, sizeof(struct linear_interp)));
02552 
02553   checkCudaErrors(cudaMemcpyToSymbol(dose_ROI_x_min_CONST, dose_ROI_x_min, sizeof(short int)));
02554   checkCudaErrors(cudaMemcpyToSymbol(dose_ROI_x_max_CONST, dose_ROI_x_max, sizeof(short int)));
02555   checkCudaErrors(cudaMemcpyToSymbol(dose_ROI_y_min_CONST, dose_ROI_y_min, sizeof(short int)));
02556   checkCudaErrors(cudaMemcpyToSymbol(dose_ROI_y_max_CONST, dose_ROI_y_max, sizeof(short int)));
02557   checkCudaErrors(cudaMemcpyToSymbol(dose_ROI_z_min_CONST, dose_ROI_z_min, sizeof(short int)));
02558   checkCudaErrors(cudaMemcpyToSymbol(dose_ROI_z_max_CONST, dose_ROI_z_max, sizeof(short int)));
02559   
02560 
02561 
02562   double total_mem = sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp) + 6*sizeof(short int);
02563   MASTER_THREAD printf("       ==> CUDA: Constant data successfully copied to the device. CONSTANT memory used: %lf kbytes (%.1lf%%)\n", total_mem/1024.0, 100.0*total_mem/deviceProp.totalConstMem);
02564   
02565 
02566   // -- Allocate the device global memory:
02567 
02568   if (*dose_ROI_x_max > -1)  // Allocate dose array only if the tally is not disabled
02569   {
02570     checkCudaErrors(cudaMalloc((void**) voxels_Edep_device, voxels_Edep_bytes));
02571     if (*voxels_Edep_device==NULL)
02572     {
02573       printf("\n cudaMalloc ERROR!! Error allocating the dose array on the device global memory!! (%lf Mbytes)\n", voxels_Edep_bytes/(1024.0*1024.0));
02574       exit(-1);
02575     }
02576   }
02577   
02578   checkCudaErrors(cudaMalloc((void**) voxel_mat_dens_device, voxel_mat_dens_bytes));
02579   checkCudaErrors(cudaMalloc((void**) image_device,          image_bytes));
02580   checkCudaErrors(cudaMalloc((void**) mfp_Woodcock_table_device, mfp_Woodcock_table_bytes));
02581   checkCudaErrors(cudaMalloc((void**) mfp_table_a_device,    mfp_table_bytes));
02582   checkCudaErrors(cudaMalloc((void**) mfp_table_b_device,    mfp_table_bytes));
02583   checkCudaErrors(cudaMalloc((void**) rayleigh_table_device, sizeof(struct rayleigh_struct)));
02584   checkCudaErrors(cudaMalloc((void**) compton_table_device,  sizeof(struct compton_struct))); 
02585   checkCudaErrors(cudaMalloc((void**) detector_data_device,  num_projections*sizeof(struct detector_struct)));
02586   checkCudaErrors(cudaMalloc((void**) source_data_device,    num_projections*sizeof(struct source_struct)));    // The array of detectors, sources has "MAX_NUM_PROJECTIONS" elements but I am allocating only the used "num_projections" elements to the GPU
02587   
02588   if (flag_material_dose==1)
02589     checkCudaErrors(cudaMalloc((void**) materials_dose_device, MAX_MATERIALS*sizeof(ulonglong2)));    // !!tally_materials_dose!!
02590   
02591   total_mem = voxels_Edep_bytes + voxel_mat_dens_bytes + image_bytes + mfp_Woodcock_table_bytes + 2*mfp_table_bytes + sizeof(struct compton_struct) + sizeof(struct rayleigh_struct) + num_projections*(sizeof(struct detector_struct) + sizeof(struct source_struct));
02592   if (*voxel_mat_dens_device==NULL || *image_device==NULL || *mfp_Woodcock_table_device==NULL || *mfp_table_a_device==NULL ||
02593       *mfp_table_a_device==NULL || *rayleigh_table_device==NULL || *compton_table_device==NULL || *detector_data_device==NULL || *source_data_device==NULL)
02594   {
02595     printf("\n cudaMalloc ERROR!! Device global memory not correctly allocated!! (%lf Mbytes)\n", total_mem/(1024.0*1024.0));
02596     exit(-1);
02597   }
02598   else
02599   {
02600     MASTER_THREAD printf("       ==> CUDA: Device global memory correctly allocated. GLOBAL memory used: %lf Mbytes (%.1lf%%)\n", total_mem/(1024.0*1024.0), 100.0*total_mem/deviceProp.totalGlobalMem);
02601   }
02602 
02603   // --Copy the host memory to the device:
02604   checkCudaErrors(cudaMemcpy(*voxel_mat_dens_device, voxel_mat_dens, voxel_mat_dens_bytes,                          cudaMemcpyHostToDevice));
02605   checkCudaErrors(cudaMemcpy(*mfp_Woodcock_table_device, mfp_Woodcock_table, mfp_Woodcock_table_bytes,              cudaMemcpyHostToDevice));
02606   checkCudaErrors(cudaMemcpy(*mfp_table_a_device,    mfp_table_a,    mfp_table_bytes,                               cudaMemcpyHostToDevice));
02607   checkCudaErrors(cudaMemcpy(*mfp_table_b_device,    mfp_table_b,    mfp_table_bytes,                               cudaMemcpyHostToDevice));
02608   checkCudaErrors(cudaMemcpy(*rayleigh_table_device, rayleigh_table, sizeof(struct rayleigh_struct),                cudaMemcpyHostToDevice));
02609   checkCudaErrors(cudaMemcpy(*compton_table_device,  compton_table,  sizeof(struct compton_struct),                 cudaMemcpyHostToDevice));  
02610   checkCudaErrors(cudaMemcpy(*detector_data_device,  detector_data,  num_projections*sizeof(struct detector_struct),cudaMemcpyHostToDevice));
02611   checkCudaErrors(cudaMemcpy(*source_data_device,    source_data,    num_projections*sizeof(struct source_struct),  cudaMemcpyHostToDevice));  
02612   
02613 
02614   //   --Init the image array to 0 using a GPU kernel instead of cudaMemcpy:
02615   //     Simple version: checkCudaErrors( cudaMemcpy( image_device, image, image_bytes, cudaMemcpyHostToDevice) );
02616 
02617   int pixels_per_image = detector_data[0].num_pixels.x * detector_data[0].num_pixels.y;
02618   MASTER_THREAD printf("       ==> CUDA: Launching kernel to initialize the device image to 0: number of blocks = %d, threads per block = 128\n", (int)(ceil(pixels_per_image/128.0f)+0.01f) );
02619 
02620   init_image_array_GPU<<<(int)(ceil(pixels_per_image/128.0f)+0.01f),128>>>(*image_device, pixels_per_image);
02621     cudaThreadSynchronize();      // Force the runtime to wait until all device tasks have completed
02622     getLastCudaError("\n\n !!Kernel execution failed initializing the image array!! ");  // Check if kernel execution generated any error:
02623 
02624 
02625   //   --Init the dose array to 0 using a GPU kernel, if the tally is not disabled:
02626   if (*dose_ROI_x_max > -1)
02627   {      
02628     
02629     MASTER_THREAD printf("       ==> CUDA: Initialize the device dose deposition to 0 using cudaMemcpy.\n");
02630     checkCudaErrors(cudaMemcpy(*voxels_Edep_device, voxels_Edep, voxels_Edep_bytes, cudaMemcpyHostToDevice) );
02631    
02632 /*  // -- OPTIONAL CODE: Launch kernel to initialize the device dose deposition to 0 (MAY FAIL IF DOSE MATRIX IS TOO BIG!)    !!DeBuG!!
02633     int num_voxels_dose = voxels_Edep_bytes/sizeof(ulonglong2);   // Calculate the number of voxels in the dose array
02634     int num_blocks, num_threads_block = 0;  
02635     // Select the number of threads per block making sure we don't try to launch more blocks than CUDA's maximum value:
02636     do
02637     {
02638       num_threads_block += 64;
02639       num_blocks = (int)(ceil(((double)num_voxels_dose)/((double)num_threads_block))+0.001);
02640     }
02641     while (num_blocks > 65500);    
02642     MASTER_THREAD printf("       ==> CUDA: Launching kernel to initialize the device dose deposition to 0: number of blocks = %d, threads per block = %d\n", num_blocks, num_threads_block);  
02643     init_dose_array_GPU<<<num_blocks,num_threads_block>>>(*voxels_Edep_device, num_voxels_dose);    
02644       cudaThreadSynchronize();
02645       getLastCudaError("\n\n !!Kernel execution failed initializing the dose array!! ");  // Check if kernel execution generated any error:
02646 */
02647 
02648   }
02649   
02650   // Init materials_dose array in GPU with 0 (same as host):
02651   if (flag_material_dose==1)
02652     checkCudaErrors(cudaMemcpy(*materials_dose_device, materials_dose, MAX_MATERIALS*sizeof(ulonglong2), cudaMemcpyHostToDevice));   // !!tally_materials_dose!!
02653   
02654   MASTER_THREAD printf("                 Time spent allocating and copying memory to the device: %.6f s\n", float(clock()-clock_init)/CLOCKS_PER_SEC);    
02655 
02656 }
02657 
02658 
02659 ////////////////////////////////////////////////////////////////////////////////
02660 
02661 ////////////////////////////////////////////////////////////////////////////////
02662 //! Guestimate a good number of blocks to estimate the speed of different generations 
02663 //! of GPUs. Slower GPUs will simulate less particles and hopefully the fastest GPUs 
02664 //! will not have to wait much. If the speed is not accurately estimated in the speed test
02665 //! some GPUs will simulate longer than others and valuable simulation time will be wasted 
02666 //! in the idle GPUs.
02667 //!
02668 //! In this function the "optimum" number of blocks for the speed test is heuristically 
02669 //! computed as the product of three GPU characteristics:
02670 //!   [2.0] * [number of GPU cores] * [core frequency] * [major CUDA compute capability] + [100]
02671 //!
02672 //! The factor 2.0 is arbitrary and can be modified depending on the case (for short 
02673 //! simulations this value may have to be reduced or the speed test will take longer 
02674 //! than the whole simulation). The constant 100 blocks are added to try to get enough 
02675 //! blocks for a reliable timing of slow GPUs.
02676 //!
02677 //! For example, an NVIDIA GeForce 290 will get:
02678 //!   2.0 * 240 (cores) * 1.24 (GHz) * 1 (major compute capability) + 100 =  695.2 ~  695 blocks
02679 //! An NVIDIA GeForce 580 will get:
02680 //!   2.0 * 512 (cores) * 1.54 (GHz) * 2 (major compute capability) + 100 = 3253.9 ~ 3254 blocks 
02681 //! In total the 580 gets 5.7 times more blocks than the 290.
02682 //!
02683 //!       @param[in] gpu_id   GPU number
02684 //!       @param[out] num_blocks   Returns a number of blocks related to the expected GPU speed
02685 ////////////////////////////////////////////////////////////////////////////////
02686 int guestimate_GPU_performance(int gpu_id)
02687 {          
02688   cudaDeviceProp deviceProp;
02689   cudaGetDeviceProperties(&deviceProp, gpu_id);
02690   float num_cores       = (float) _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount;
02691   float comp_capability = (float) deviceProp.major;
02692   float frequency       = deviceProp.clockRate*1.0e-6f;
02693   
02694   return (int)(2.0f*num_cores*frequency*comp_capability + 100.0f + 0.50f);
02695 }
02696   
02697   
02698 #endif
02699 ////////////////////////////////////////////////////////////////////////////////
02700 
02701 
02702 
02703 
02704 ////////////////////////////////////////////////////////////////////////////////
02705 //! Report the tallied image in ASCII and binary form (32-bit floats).
02706 //! Separate images for primary and scatter radiation are generated.
02707 //! 
02708 //!
02709 //!       @param[in] file_name_output   File where tallied image is reported
02710 //!       @param[in] detector_data   Detector description read from the input file (pointer to detector_struct)
02711 //!       @param[in] image  Tallied image (in meV per pixel)
02712 //!       @param[in] time_elapsed   Time elapsed during the main loop execution (in seconds)
02713 //!       @param[in] total_histories   Total number of x-rays simulated
02714 ////////////////////////////////////////////////////////////////////////////////
02715 int report_image(char* file_name_output, struct detector_struct* detector_data, struct source_struct* source_data, float mean_energy_spectrum, unsigned long long int* image, double time_elapsed, unsigned long long int total_histories, int current_projection, int num_projections, double D_angle, double initial_angle, int myID, int numprocs)
02716 {
02717   
02718   //  -Find current angle
02719   double current_angle = initial_angle+current_projection*D_angle;
02720 
02721   // -- Report data:
02722   printf("\n\n          *** IMAGE TALLY PERFORMANCE REPORT ***\n");
02723   
02724   if(num_projections!=1)   // Output the projection angle when simulating a CT:
02725   {
02726     printf("              CT projection %d of %d: angle from X axis = %lf \n", current_projection+1, num_projections, current_angle*RAD2DEG);
02727   }
02728   
02729   printf("              Simulated x rays:    %lld\n", total_histories);
02730   printf("              Simulation time [s]: %.2f\n", time_elapsed);
02731   if (time_elapsed>0.000001)
02732     printf("              Speed [x-rays/s]:    %.2f\n\n", ((double)total_histories)/time_elapsed);
02733 
02734   FILE* file_ptr = fopen(file_name_output, "w");
02735   
02736   if (file_ptr==NULL)
02737   {
02738     printf("\n\n   !!fopen ERROR report_image!! File %s can not be opened!!\n", file_name_output);
02739     exit(-3);
02740   }
02741   
02742   fprintf(file_ptr, "# \n");
02743   fprintf(file_ptr, "#     *****************************************************************************\n");
02744   fprintf(file_ptr, "#     ***         MC-GPU, version 1.3 (http://code.google.com/p/mcgpu/)         ***\n");
02745   fprintf(file_ptr, "#     ***                                                                       ***\n");
02746   fprintf(file_ptr, "#     ***                     Andreu Badal (Andreu.Badal-Soler@fda.hhs.gov)     ***\n");
02747   fprintf(file_ptr, "#     *****************************************************************************\n");
02748   fprintf(file_ptr, "# \n");  
02749 #ifdef USING_CUDA
02750   fprintf(file_ptr, "#  *** SIMULATION IN THE GPU USING CUDA ***\n");
02751 #else
02752   fprintf(file_ptr, "#  *** SIMULATION IN THE CPU ***\n");
02753 #endif  
02754   fprintf(file_ptr, "#\n");
02755   fprintf(file_ptr, "#  Image created counting the energy arriving at each pixel: ideal energy integrating detector.\n");
02756   fprintf(file_ptr, "#  Pixel value units: eV/cm^2 per history (energy fluence).\n");
02757 
02758 
02759   if(num_projections!=1)   // Output the projection angle when simulating a CT:
02760   {
02761     fprintf(file_ptr, "#  CT projection %d of %d: angle from X axis = %lf \n", current_projection+1, num_projections, current_angle*RAD2DEG);
02762   }  
02763 
02764   fprintf(file_ptr, "#  Focal spot position = (%.8f,%.8f,%.8f), cone beam direction = (%.8f,%.8f,%.8f)\n", source_data[current_projection].position.x, source_data[current_projection].position.y, source_data[current_projection].position.z, source_data[current_projection].direction.x, source_data[current_projection].direction.y, source_data[current_projection].direction.z);
02765 
02766   fprintf(file_ptr, "#  Pixel size:  %lf x %lf = %lf cm^2\n", 1.0/(double)(detector_data[0].inv_pixel_size_X), 1.0/(double)(detector_data[0].inv_pixel_size_Z), 1.0/(double)(detector_data[0].inv_pixel_size_X*detector_data[0].inv_pixel_size_Z));
02767   
02768   fprintf(file_ptr, "#  Number of pixels in X and Z:  %d  %d\n", detector_data[0].num_pixels.x, detector_data[0].num_pixels.y);
02769   fprintf(file_ptr, "#  (X rows given first, a blank line separates the different Z values)\n");
02770   fprintf(file_ptr, "# \n");
02771   fprintf(file_ptr, "#  [NON-SCATTERED] [COMPTON] [RAYLEIGH] [MULTIPLE-SCATTING]\n");
02772   fprintf(file_ptr, "# ==========================================================\n");
02773 
02774   const double SCALE = 1.0/SCALE_eV;    // conversion to eV using the inverse of the constant used in the "tally_image" kernel function (defined in the header file)
02775   const double NORM = SCALE * detector_data[0].inv_pixel_size_X * detector_data[0].inv_pixel_size_Z / ((double)total_histories);  // ==> [eV/cm^2 per history]
02776   double energy_noScatter, energy_compton, energy_rayleigh, energy_multiscatter;
02777   double energy_integral = 0.0;   // Integrate (add) the energy in the image pixels [meV]
02778   double maximum_energy_pixel = -100.0;  // Find maximum pixel signal
02779   int maximum_energy_pixel_x=0, maximum_energy_pixel_y=0, maximum_energy_pixel_number=0;   
02780 
02781   int pixels_per_image = (detector_data[0].num_pixels.x*detector_data[0].num_pixels.y), pixel=0;
02782   int i, j;
02783   for(j=0; j<detector_data[0].num_pixels.y; j++)
02784   {
02785     for(i=0; i<detector_data[0].num_pixels.x; i++)
02786     {
02787       energy_noScatter    = (double)(image[pixel]);
02788       energy_compton      = (double)(image[pixel +   pixels_per_image]);
02789       energy_rayleigh     = (double)(image[pixel + 2*pixels_per_image]);
02790       energy_multiscatter = (double)(image[pixel + 3*pixels_per_image]);
02791 
02792       // -- Write the results in an external file; the image corresponding to all particles not written: it has to be infered adding all images
02793       fprintf(file_ptr, "%.8lf %.8lf %.8lf %.8lf\n", NORM*energy_noScatter, NORM*energy_compton, NORM*energy_rayleigh, NORM*energy_multiscatter);
02794       
02795       register double total_energy_pixel = energy_noScatter + energy_compton + energy_rayleigh + energy_multiscatter;   // Find and report the pixel with maximum signal
02796       if (total_energy_pixel>maximum_energy_pixel)
02797       {
02798         maximum_energy_pixel = total_energy_pixel;
02799         maximum_energy_pixel_x = i;
02800         maximum_energy_pixel_y = j;
02801         maximum_energy_pixel_number = pixel;
02802       }            
02803       energy_integral += total_energy_pixel;   // Count total energy in the whole image      
02804       
02805       pixel++;   // Move to next pixel
02806     }
02807     fprintf(file_ptr, "\n");     // Separate rows with an empty line for visualization with gnuplot.
02808   }
02809   
02810   fprintf(file_ptr, "#   *** Simulation REPORT: ***\n");
02811   fprintf(file_ptr, "#       Fraction of energy detected (over the mean energy of the spectrum): %.3lf%%\n", 100.0*SCALE*(energy_integral/(double)(total_histories))/(double)(mean_energy_spectrum));
02812   fprintf(file_ptr, "#       Maximum energy detected in pixel %i: (x,y)=(%i,%i) -> pixel value = %lf eV/cm^2\n", maximum_energy_pixel_number, maximum_energy_pixel_x, maximum_energy_pixel_y, NORM*maximum_energy_pixel);
02813   fprintf(file_ptr, "#       Simulated x rays:    %lld\n", total_histories);
02814   fprintf(file_ptr, "#       Simulation time [s]: %.2f\n", time_elapsed);
02815   if (time_elapsed>0.000001)
02816     fprintf(file_ptr, "#       Speed [x-rays/sec]:  %.2f\n\n", ((double)total_histories)/time_elapsed);
02817    
02818   fclose(file_ptr);  // Close output file and flush stream
02819 
02820   printf("              Fraction of initial energy arriving at the detector (over the mean energy of the spectrum):  %.3lf%%\n", 100.0*SCALE*(energy_integral/(double)(total_histories))/(double)(mean_energy_spectrum));
02821   printf("              Maximum energy detected in pixel %i: (x,y)=(%i,%i). Maximum pixel value = %lf eV/cm^2\n\n", maximum_energy_pixel_number, maximum_energy_pixel_x, maximum_energy_pixel_y, NORM*maximum_energy_pixel);  
02822   fflush(stdout);
02823   
02824   
02825   // -- Binary output:   
02826   float energy_float;
02827   char file_binary[250];
02828   strncpy (file_binary, file_name_output, 250);
02829   strcat(file_binary,".raw");                       // !!BINARY!! 
02830   FILE* file_binary_ptr = fopen(file_binary, "w");  // !!BINARY!!
02831   if (file_binary_ptr==NULL)
02832   {
02833     printf("\n\n   !!fopen ERROR report_image!! Binary file %s can not be opened for writing!!\n", file_binary);
02834     exit(-3);
02835   }
02836   
02837   for(i=0; i<pixels_per_image; i++)
02838   {
02839     energy_float = (float)( NORM * (double)(image[i] + image[i + pixels_per_image] + image[i + 2*pixels_per_image] + image[i + 3*pixels_per_image]) );  // Total image (scatter + primary)
02840     fwrite(&energy_float, sizeof(float), 1, file_binary_ptr);   // Write pixel data in a binary file that can be easyly open in imageJ. !!BINARY!!
02841   }
02842   for(i=0; i<pixels_per_image; i++)
02843   {
02844     energy_float = (float)( NORM * (double)(image[i]) );  // Non-scattered image
02845     fwrite(&energy_float, sizeof(float), 1, file_binary_ptr);
02846   }
02847   for(i=0; i<pixels_per_image; i++)
02848   {
02849     energy_float = (float)( NORM * (double)(image[i + pixels_per_image]) );  // Compton image
02850     fwrite(&energy_float, sizeof(float), 1, file_binary_ptr);
02851   }
02852   for(i=0; i<pixels_per_image; i++)
02853   {
02854     energy_float = (float)( NORM * (double)(image[i + 2*pixels_per_image]) );  // Rayleigh image
02855     fwrite(&energy_float, sizeof(float), 1, file_binary_ptr);
02856   }
02857   for(i=0; i<pixels_per_image; i++)
02858   {
02859     energy_float = (float)( NORM * (double)(image[i + 3*pixels_per_image]) );  // Multiple-scatter image
02860     fwrite(&energy_float, sizeof(float), 1, file_binary_ptr);
02861   }       
02862   
02863   fclose(file_binary_ptr);    
02864   
02865     
02866   return 0;     // Report could return not 0 to continue the simulation...
02867 }
02868 ///////////////////////////////////////////////////////////////////////////////
02869 
02870 
02871 
02872 
02873 ///////////////////////////////////////////////////////////////////////////////
02874 //! Report the total tallied 3D voxel dose deposition for all projections.
02875 //! The voxel doses in the input ROI and their respective uncertainties are reported 
02876 //! in binary form (32-bit floats) in two separate .raw files.
02877 //! The dose in a single plane at the level of the focal spot is also reported in  
02878 //! ASCII format for simple visualization with GNUPLOT.
02879 //! The total dose deposited in each different material is reported to the standard output.
02880 //! The material dose is calculated adding the energy deposited in the individual voxels 
02881 //! within the dose ROI, and dividing by the total mass of the material in the ROI.
02882 //!
02883 //!       @param[in] file_dose_output   File where tallied image is reported
02884 //!       @param[in] detector_data   Detector description read from the input file (pointer to detector_struct)
02885 //!       @param[in] image  Tallied image (in meV per pixel)
02886 //!       @param[in] time_elapsed   Time elapsed during the main loop execution (in seconds)
02887 //!       @param[in] total_histories   Total number of x-rays simulated
02888 //!       @param[in] source_data   Data required to compute the voxel plane to report in ASCII format: Z at the level of the source, 1st projection
02889 ////////////////////////////////////////////////////////////////////////////////
02890 int report_voxels_dose(char* file_dose_output, int num_projections, struct voxel_struct* voxel_data, float2* voxel_mat_dens, ulonglong2* voxels_Edep, double time_total_MC_init_report, unsigned long long int total_histories, short int dose_ROI_x_min, short int dose_ROI_x_max, short int dose_ROI_y_min, short int dose_ROI_y_max, short int dose_ROI_z_min, short int dose_ROI_z_max, struct source_struct* source_data)
02891 {
02892 
02893   printf("\n\n          *** VOXEL ROI DOSE TALLY REPORT ***\n\n");
02894     
02895   FILE* file_ptr = fopen(file_dose_output, "w");
02896   if (file_ptr==NULL)
02897   {
02898     printf("\n\n   !!fopen ERROR report_voxels_dose!! File %s can not be opened!!\n", file_dose_output);
02899     exit(-3);
02900   }    
02901     
02902   // -- Binary output:                                         // !!BINARY!!  
02903   char file_binary_mean[250], file_binary_sigma[250];
02904   strncpy (file_binary_mean, file_dose_output, 250);
02905   strcat(file_binary_mean,".raw");                     
02906   strncpy (file_binary_sigma, file_dose_output, 250);
02907   strcat(file_binary_sigma,"_2sigma.raw");    
02908   FILE* file_binary_mean_ptr  = fopen(file_binary_mean, "w");  // !!BINARY!!
02909   FILE* file_binary_sigma_ptr = fopen(file_binary_sigma, "w");       // !!BINARY!!
02910   if (file_binary_mean_ptr==NULL)
02911   {
02912     printf("\n\n   !!fopen ERROR report_voxels_dose!! Binary file %s can not be opened!!\n", file_dose_output);
02913     exit(-3);
02914   }
02915   
02916   int DX = dose_ROI_x_max - dose_ROI_x_min + 1,
02917       DY = dose_ROI_y_max - dose_ROI_y_min + 1,
02918       DZ = dose_ROI_z_max - dose_ROI_z_min + 1;           
02919       
02920   // -- Calculate the dose plane that will be output as ASCII text:
02921   int z_plane_dose = (int)(source_data[0].position.z * voxel_data->inv_voxel_size.z + 0.00001f);  // Select voxel plane at the level of the source, 1st projections
02922   if ( (z_plane_dose<dose_ROI_z_min) || (z_plane_dose>dose_ROI_z_max) )
02923     z_plane_dose = (dose_ROI_z_max+dose_ROI_z_min)/2;
02924   
02925   int z_plane_dose_ROI = z_plane_dose - dose_ROI_z_min;
02926 
02927   printf("              Reporting the 3D voxel dose distribution as binary floats in the .raw file, and the 2D dose for Z plane %d as ASCII text.\n", z_plane_dose);
02928 //   printf("              Also reporting the dose to each material inside the input ROI adding the energy deposited in each individual voxel\n");
02929 //   printf("              (these material dose results will be equal to the materials dose tally below if the ROI covers all the voxels).\n");
02930   
02931   fprintf(file_ptr, "# \n");
02932   fprintf(file_ptr, "#     *****************************************************************************\n");
02933   fprintf(file_ptr, "#     ***         MC-GPU, version 1.3 (http://code.google.com/p/mcgpu/)         ***\n");
02934   fprintf(file_ptr, "#     ***                                                                       ***\n");
02935   fprintf(file_ptr, "#     ***                     Andreu Badal (Andreu.Badal-Soler@fda.hhs.gov)     ***\n");
02936   fprintf(file_ptr, "#     *****************************************************************************\n");
02937   fprintf(file_ptr, "# \n");  
02938 #ifdef USING_CUDA
02939   fprintf(file_ptr, "#  *** SIMULATION IN THE GPU USING CUDA ***\n");
02940 #else
02941   fprintf(file_ptr, "#  *** SIMULATION IN THE CPU ***\n");
02942 #endif
02943   fprintf(file_ptr, "#\n");
02944   
02945   
02946   // Report only one dose plane in ASCII, all the other data in binary only:
02947 
02948   fprintf(file_ptr, "#\n");
02949   fprintf(file_ptr, "#  3D dose deposition map (and dose uncertainty) created tallying the energy deposited by photons inside each voxel of the input geometry.\n");
02950   fprintf(file_ptr, "#  Electrons were not transported and therefore we are approximating that the dose is equal to the KERMA (energy released by the photons alone).\n");
02951   fprintf(file_ptr, "#  This approximation is acceptable when there is electronic equilibrium and when the range of the secondary electrons is shorter than the voxel size.\n");
02952   fprintf(file_ptr, "#  Usually the doses will be acceptable for photon energies below 1 MeV. The dose estimates may not be accurate at the interface of low density volumes.\n");
02953   fprintf(file_ptr, "#\n");
02954   fprintf(file_ptr, "#  The 3D dose deposition is reported in binary form in the .raw files (data given as 32-bit floats). \n");
02955   fprintf(file_ptr, "#  To reduce the memory use and the reporting time this text output reports only the 2D dose at the Z plane at the level\n"); 
02956   fprintf(file_ptr, "#  of the source focal spot: z_coord = %d (z_coord in ROI = %d)\n", z_plane_dose, z_plane_dose_ROI);
02957   fprintf(file_ptr, "#\n");  
02958   fprintf(file_ptr, "#  The total dose deposited in each different material is reported to the standard output.\n");
02959   fprintf(file_ptr, "#  The dose is calculated adding the energy deposited in the individual voxels within the dose ROI and dividing by the total mass of the material in the ROI.\n");
02960   fprintf(file_ptr, "#\n");  
02961   fprintf(file_ptr, "#\n");
02962   fprintf(file_ptr, "#  Voxel size:  %lf x %lf x %lf = %lf cm^3\n", 1.0/(double)(voxel_data->inv_voxel_size.x), 1.0/(double)(voxel_data->inv_voxel_size.y), 1.0/(double)(voxel_data->inv_voxel_size.z), 1.0/(double)(voxel_data->inv_voxel_size.x*voxel_data->inv_voxel_size.y*voxel_data->inv_voxel_size.z));
02963   fprintf(file_ptr, "#  Number of voxels in the reported region of interest (ROI) X, Y and Z:\n");
02964   fprintf(file_ptr, "#      %d  %d  %d\n", DX, DY, DZ);
02965   fprintf(file_ptr, "#  Coordinates of the ROI inside the voxel volume = X[%d,%d], Y[%d,%d], Z[%d,%d]\n", dose_ROI_x_min+1, dose_ROI_x_max+1, dose_ROI_y_min+1, dose_ROI_y_max+1, dose_ROI_z_min+1, dose_ROI_z_max+1);  // Show ROI with index=1 for the first voxel instead of 0.
02966   fprintf(file_ptr, "#\n");
02967   fprintf(file_ptr, "#  Voxel dose units: eV/g per history\n");
02968   fprintf(file_ptr, "#  X rows given first, then Y, then Z. One blank line separates the different Y, and two blanks the Z values (GNUPLOT format).\n");
02969   fprintf(file_ptr, "#  The dose distribution is also reported with binary FLOAT values (.raw file) for easy visualization in ImageJ.\n");
02970   fprintf(file_ptr, "# \n");
02971   fprintf(file_ptr, "#    [DOSE]   [2*standard_deviation]\n");
02972   fprintf(file_ptr, "# =====================================\n");
02973   fflush(file_ptr);
02974   
02975   double voxel_dose, max_voxel_dose = -1.0, max_voxel_dose_std_dev = -1.0;
02976   
02977   int max_dose_voxel_geometry=0, max_voxel_dose_x=-1, max_voxel_dose_y=-1, max_voxel_dose_z=-1;
02978   unsigned long long int total_energy_deposited = 0;
02979   double inv_SCALE_eV = 1.0 / SCALE_eV,      // conversion to eV using the inverse of the constant used in the tally function (defined in the header file).         
02980                 inv_N = 1.0 / (double)(total_histories*((unsigned long long int)num_projections));
02981                                 
02982   register int i, j, k, voxel=0;
02983     
02984   double mat_Edep[MAX_MATERIALS], mat_Edep2[MAX_MATERIALS], mat_mass_ROI[MAX_MATERIALS];    // Arrays with the total energy, energy squared and mass of each material inside the ROI (mass and dose outside the ROI was not tallied).
02985   unsigned int mat_voxels[MAX_MATERIALS];
02986   for(i=0; i<MAX_MATERIALS; i++)
02987   {
02988      mat_Edep[i]  = 0.0;
02989      mat_Edep2[i] = 0.0;
02990      mat_mass_ROI[i]  = 0.0;
02991      mat_voxels[i]= 0;
02992   }
02993   
02994   double voxel_volume = 1.0 / ( ((double)voxel_data->inv_voxel_size.x) * ((double)voxel_data->inv_voxel_size.y) * ((double)voxel_data->inv_voxel_size.z) );
02995     
02996   for(k=0; k<DZ; k++)
02997   {
02998     for(j=0; j<DY; j++)
02999     {
03000       for(i=0; i<DX; i++)
03001       {
03002         register int voxel_geometry = (i+dose_ROI_x_min) + (j+dose_ROI_y_min)*voxel_data->num_voxels.x + (k+dose_ROI_z_min)*voxel_data->num_voxels.x*voxel_data->num_voxels.y;
03003         register double inv_voxel_mass = 1.0 / (voxel_mat_dens[voxel_geometry].y*voxel_volume);
03004 
03005         register int mat_number = (int)(voxel_mat_dens[voxel_geometry].x) - 1 ;  // Material number, starting at 0.
03006         mat_mass_ROI[mat_number]  += voxel_mat_dens[voxel_geometry].y*voxel_volume;   // Estimate mass and energy deposited in this material
03007         mat_Edep[mat_number]  += (double)voxels_Edep[voxel].x;        // Using doubles to avoid overflow
03008         mat_Edep2[mat_number] += (double)voxels_Edep[voxel].y;
03009         mat_voxels[mat_number]++;                                                // Count voxels made of this material
03010         
03011                 
03012               // Optional code to eliminate dose deposited in air (first material).  Sometimes useful for visualization (dose to air irrelevant, noisy)
03013               //   if (voxel_mat_dens[voxel_geometry].x < 1.1f)
03014               //   {
03015               //     voxels_Edep[voxel].x = 0.0f;
03016               //     voxels_Edep[voxel].y = 0.0f;
03017               //   }
03018                 
03019         // -- Convert total energy deposited to dose [eV/gram] per history:                        
03020         voxel_dose = ((double)voxels_Edep[voxel].x) * inv_N * inv_voxel_mass * inv_SCALE_eV;    // [dose == Edep * voxel_volume / voxel_density / N_hist]                      
03021         total_energy_deposited += voxels_Edep[voxel].x;
03022 
03023         register double voxel_std_dev = (((double)voxels_Edep[voxel].y) * inv_N * inv_SCALE_eV * inv_voxel_mass - voxel_dose*voxel_dose) * inv_N;   // [sigma = (<Edep^2> - <Edep>^2) / N_hist]
03024         if (voxel_std_dev>0.0)
03025           voxel_std_dev = sqrt(voxel_std_dev);
03026         
03027         if (voxel_dose > max_voxel_dose)
03028         {
03029           // Find the voxel that has the maximum dose:
03030           max_voxel_dose          = voxel_dose;
03031           max_voxel_dose_std_dev  = voxel_std_dev;
03032           max_voxel_dose_x        = i+dose_ROI_x_min;
03033           max_voxel_dose_y        = j+dose_ROI_y_min;
03034           max_voxel_dose_z        = k+dose_ROI_z_min;
03035           max_dose_voxel_geometry = voxel_geometry;          
03036         }
03037         
03038         // Report only one dose plane in ASCII:
03039         if (k == z_plane_dose_ROI) 
03040           fprintf(file_ptr, "%.6lf %.6lf\n", voxel_dose, 2.0*voxel_std_dev);        
03041         
03042         float voxel_dose_float  = (float)voxel_dose;         // After dividing by the number of histories I can report FLOAT bc the number of significant digits will be low.  
03043         float voxel_sigma_float = 2.0f * (float)(voxel_std_dev);
03044         
03045         fwrite(&voxel_dose_float,  sizeof(float), 1, file_binary_mean_ptr);    // Write dose data in a binary file that can be easyly open in imageJ.   !!BINARY!!
03046         fwrite(&voxel_sigma_float, sizeof(float), 1, file_binary_sigma_ptr);
03047         
03048         voxel++;
03049       }
03050       if (k == z_plane_dose_ROI) 
03051         fprintf(file_ptr, "\n");     // Separate Ys with an empty line for visualization with gnuplot.
03052     }
03053     if (k == z_plane_dose_ROI) 
03054       fprintf(file_ptr, "\n");     // Separate Zs.
03055   }
03056 
03057   
03058   fprintf(file_ptr, "#   ****** DOSE REPORT: TOTAL SIMULATION PERFORMANCE FOR ALL PROJECTIONS ******\n");
03059   fprintf(file_ptr, "#       Total number of simulated x rays: %lld\n", total_histories*((unsigned long long int)num_projections));
03060   fprintf(file_ptr, "#       Simulated x rays per projection:  %lld\n", total_histories);
03061   fprintf(file_ptr, "#       Total simulation time [s]:  %.2f\n", time_total_MC_init_report);
03062   if (time_total_MC_init_report>0.000001)
03063     fprintf(file_ptr, "#       Total speed [x-rays/s]:  %.2f\n", (double)(total_histories*((unsigned long long int)num_projections))/time_total_MC_init_report);
03064 
03065   
03066   fprintf(file_ptr, "\n#       Total energy absorved inside the dose ROI: %.5lf keV/hist\n\n", 0.001*((double)total_energy_deposited)*inv_N*inv_SCALE_eV);
03067   
03068   // Output data to standard input:
03069   printf("\n              Total energy absorved inside the dose deposition ROI: %.5lf keV/hist\n", 0.001*((double)total_energy_deposited)*inv_N*inv_SCALE_eV);
03070   register double voxel_mass_max_dose = voxel_volume*voxel_mat_dens[max_dose_voxel_geometry].y; 
03071   printf(  "              Maximum voxel dose (+-2 sigma): %lf +- %lf eV/g per history (E_dep_voxel=%lf eV/hist)\n", max_voxel_dose, max_voxel_dose_std_dev, (max_voxel_dose*voxel_mass_max_dose));
03072   printf(  "              for the voxel: material=%d, density=%.8f g/cm^3, voxel_mass=%.8lf g, voxel coord in geometry=(%d,%d,%d)\n\n", (int)voxel_mat_dens[max_dose_voxel_geometry].x, voxel_mat_dens[max_dose_voxel_geometry].y, voxel_mass_max_dose, max_voxel_dose_x, max_voxel_dose_y, max_voxel_dose_z);
03073   
03074   
03075   // -- Report dose deposited in each material:  
03076   printf("              Dose deposited in the different materials inside the input ROI computed post-processing the 3D voxel dose results:\n\n");
03077   printf("    [MATERIAL]  [DOSE_ROI, eV/g/hist]  [2*std_dev]  [Rel error 2*std_dev, %%]  [E_dep [eV/hist]  [MASS_ROI, g]  [NUM_VOXELS_ROI]\n");
03078   printf("   =============================================================================================================================\n");
03079   
03080   for(i=0; i<MAX_MATERIALS; i++)
03081   {
03082     if(mat_voxels[i]>0)   // Report only for materials found at least in 1 voxel of the input geometry (prevent dividing by 0 mass).
03083     {
03084       
03085       double Edep = mat_Edep[i] * inv_N * inv_SCALE_eV;    // [dose == Edep/Mass/N_hist]
03086       // !!DeBuG!! BUG in version 1.2: I have to divide by mass after computing the mean and sigma!!!
03087       // !!DeBuG!! WRONG code:  double material_dose = mat_Edep[i] * inv_N  * inv_SCALE_eV / mat_mass_ROI[i];    // [dose == Edep/Mass/N_hist]
03088       // !!DeBuG!! WRONG code:  double material_std_dev = (mat_Edep2[i] * inv_N  * inv_SCALE_eV / mat_mass_ROI[i] - material_dose*material_dose) * inv_N;   // [sigma^2 = (<Edep^2> - <Edep>^2) / N_hist]      
03089       
03090       double material_std_dev = (mat_Edep2[i] * inv_N - Edep*Edep) * inv_N;   // [sigma^2 = (<Edep^2> - <Edep>^2) / N_hist]   (mat_Edep2 not scaled by SCALE_eV in kernel to prevent overflow)
03091       if (material_std_dev>0.0)
03092         material_std_dev = sqrt(material_std_dev);
03093       
03094       double material_dose = Edep / mat_mass_ROI[i];
03095       material_std_dev = material_std_dev / mat_mass_ROI[i];
03096       
03097       double rel_diff = 0.0;
03098       if (material_dose>0.0)
03099         rel_diff = material_std_dev/material_dose;
03100     
03101       printf("\t%d\t%.5lf\t\t%.5lf\t\t%.2lf\t\t%.2lf\t\t%.5lf\t%u\n", (i+1), material_dose, 2.0*material_std_dev, (2.0*100.0*rel_diff), Edep, mat_mass_ROI[i], mat_voxels[i]);            
03102  
03103     }    
03104   }       
03105   printf("\n");
03106         
03107   
03108   fflush(stdout);          
03109   fclose(file_ptr);  // Close output file and flush stream
03110   fclose(file_binary_mean_ptr);
03111   fclose(file_binary_sigma_ptr);
03112 
03113   return 0;   // Report could return not 0 to continue the simulation...
03114 }
03115 ///////////////////////////////////////////////////////////////////////////////
03116 
03117 
03118 
03119 ///////////////////////////////////////////////////////////////////////////////
03120 //! Report the tallied dose to each material number, accounting for different 
03121 //! densities in different regions with the same material number. 
03122 //!
03123 //!       @param[in] num_projections   Number of projections simulated
03124 //!       @param[in] total_histories   Total number of x-rays simulated per projection
03125 //!       @param[out] density_nominal   Array with the nominal densities of materials given in the input file; -1 for materials not defined. Used to report only defined materials.
03126 //!       @param[in] materials_dose   Tallied dose and dose^2 arrays
03127 ////////////////////////////////////////////////////////////////////////////////
03128 int report_materials_dose(int num_projections, unsigned long long int total_histories, float *density_nominal, ulonglong2 *materials_dose, double *mass_materials)  // !!tally_materials_dose!!
03129 {
03130 
03131   printf("\n\n          *** MATERIALS TOTAL DOSE TALLY REPORT ***\n\n");  
03132   printf("              Dose deposited in each material defined in the input file (tallied directly per material, not per voxel):\n");
03133   printf("              The results of this tally should be equal to the voxel tally doses for an ROI covering all voxels.\n\n");
03134   printf("    [MAT]  [DOSE, eV/g/hist]  [2*std_dev]  [Rel_error 2*std_dev, %%]  [E_dep [eV/hist]  [MASS_TOTAL, g]\n");
03135   printf("   ====================================================================================================\n");
03136   
03137   double dose, Edep, std_dev, rel_diff, inv_N = 1.0 / (double)(total_histories*((unsigned long long int)num_projections));
03138   int i, flag=0, max_mat=0;
03139   for(i=0; i<MAX_MATERIALS; i++)
03140   {
03141     if (density_nominal[i]<0.0f)
03142       break;  // Skip report for materials not defined in the input file
03143     
03144     Edep    = ((double)materials_dose[i].x) / SCALE_eV * inv_N;
03145     
03146     std_dev = sqrt( (((double)materials_dose[i].y)*inv_N - Edep*Edep) * inv_N );   // [sigma^2 = (<Edep^2> - <Edep>^2) / N_hist]   (not scaling "materials_dose[i].y" by SCALE_eV in kernel to prevent overflow).
03147     
03148     if (Edep>0.0)
03149       rel_diff = std_dev/Edep;
03150     else
03151       rel_diff = 0.0;
03152 
03153     dose    = Edep / mass_materials[i];
03154     std_dev = std_dev / mass_materials[i];
03155     
03156     printf("\t%d\t%.5lf\t\t%.5lf\t\t%.2lf\t\t%.2lf\t\t%.5lf\n", (i+1), dose, 2.0*std_dev, 2.0*100.0*rel_diff, Edep, mass_materials[i]);
03157     
03158        // printf("\t%d\t%.5lf\t\t%.5lf\t\t%.2lf\t\t%.5lf\t\t%.5lf\t\t\t%llu,\t\t%llu\n", (i+1), dose, 2.0*std_dev, 2.0*100.0*rel_diff, Edep, mass_materials[i],  materials_dose[i].x, materials_dose[i].y);  //!!DeBuG!! VERBOSE output: counters not divided by num histories
03159     
03160     
03161     if (materials_dose[i].x>1e16 || dose!=abs(dose) || std_dev!=abs(std_dev))  // !!DeBuG!!  Try to detect a possible overflow in any material: large counter or negative, nan value
03162     {
03163       flag = 1;
03164       if (materials_dose[i].x>materials_dose[max_mat].x) 
03165         max_mat = i;
03166     }
03167   }
03168   
03169   if (flag!=0)    // !!DeBuG!! Try to detect a possible overflow: large counter or negative, nan value. The value of SCALE_eV can be reduced to prevent this overflow in some cases.
03170   {
03171     printf("\n     WARNING: it is possible that the unsigned long long int counter used to tally the standard deviation overflowed (>2^64).\n");  // !!DeBuG!! 
03172     printf("              The standard deviation may be incorrectly measured, but it will surely be very small (<< 1%%).\n");
03173     printf("              Max counter (mat=%d): E_dep = %llu , E_dep^2 = %llu\n\n", max_mat+1, materials_dose[max_mat].x, materials_dose[max_mat].y);
03174   }
03175   fflush(stdout);  
03176   return 0;
03177 }
03178 
03179 
03180 ///////////////////////////////////////////////////////////////////////////////
03181 
03182 
03183 
03184 ///////////////////////////////////////////////////////////////////////////////
03185 //!  Sets the CT trajectory: store in memory the source and detector rotations
03186 //!  that are needed to calculate the multiple projections.
03187 //!  The first projection (0) was previously initialized in function "read_input".
03188 //!  
03189 //!
03190 //!  ASSUMPTIONS: the CT scan plane must be perpendicular to the Z axis, ie,
03191 //!               the initial direction of the particles must have w=0!
03192 //!
03193 ///////////////////////////////////////////////////////////////////////////////
03194 void set_CT_trajectory(int myID, int num_projections, double D_angle, double angularROI_0, double angularROI_1, double SRotAxisD, struct source_struct* source_data, struct detector_struct* detector_data, double vertical_translation_per_projection)
03195 {
03196   MASTER_THREAD printf("\n    -- Setting the sources and detectors for the %d CT projections (MAX_NUM_PROJECTIONS=%d):\n", num_projections, MAX_NUM_PROJECTIONS);
03197   double cos_rX, cos_rZ, sin_rX, sin_rZ, current_angle;
03198 
03199   // --Set center of rotation at the input distance between source and detector:
03200   float3 center_rotation;
03201   center_rotation.x =  source_data[0].position.x + source_data[0].direction.x * SRotAxisD;
03202   center_rotation.y =  source_data[0].position.y + source_data[0].direction.y * SRotAxisD;
03203   center_rotation.z =  source_data[0].position.z;      //  + source_data[0].direction.z * SRotAxisD;   // w=0 all the time!!
03204 
03205   // --Angular span between projections:
03206 
03207   //  -Set initial angle for the source (180 degress less than the detector pointed by the direction vector; the zero angle is the X axis, increasing to +Y axis).
03208   current_angle = acos((double)source_data[0].direction.x);
03209   if (source_data[0].direction.y<0)
03210     current_angle = -current_angle;     // Correct for the fact that positive and negative angles have the same ACOS
03211   if (current_angle<0.0)
03212     current_angle += 2.0*PI;   // Make sure the angle is not negative, between [0,360) degrees.
03213   current_angle = current_angle - PI;   // Correct the fact that the source is opposite to the detector (180 degrees difference).
03214   if (current_angle<0.0)
03215     current_angle += 2.0*PI;   // Make sure the angle is not negative, between [0,360) degrees..
03216 
03217   MASTER_THREAD printf("         << Projection #1 >> initial_angle=%.8f , D_angle=%.8f\n", current_angle*RAD2DEG, D_angle*RAD2DEG);
03218   MASTER_THREAD printf("                             Source direction=(%.8f,%.8f,%.8f), position=(%.8f,%.8f,%.8f)\n", source_data[0].direction.x,source_data[0].direction.y,source_data[0].direction.z, source_data[0].position.x,source_data[0].position.y,source_data[0].position.z);
03219 
03220   int i;
03221   for (i=1; i<num_projections; i++)   // The first projection (i=0) was initialized in function "read_input".
03222   {
03223     // --Init constant parameters to the values in projection 0:
03224     source_data[i].cos_theta_low = source_data[0].cos_theta_low;
03225     source_data[i].phi_low = source_data[0].phi_low;
03226     source_data[i].D_cos_theta = source_data[0].D_cos_theta;
03227     source_data[i].D_phi = source_data[0].D_phi;
03228     source_data[i].max_height_at_y1cm = source_data[0].max_height_at_y1cm;    
03229     detector_data[i].sdd = detector_data[0].sdd;
03230     detector_data[i].width_X = detector_data[0].width_X;
03231     detector_data[i].height_Z = detector_data[0].height_Z;
03232     detector_data[i].inv_pixel_size_X = detector_data[0].inv_pixel_size_X;
03233     detector_data[i].inv_pixel_size_Z = detector_data[0].inv_pixel_size_Z;
03234     detector_data[i].num_pixels = detector_data[0].num_pixels;
03235     detector_data[i].total_num_pixels = detector_data[0].total_num_pixels;
03236     detector_data[i].rotation_flag = detector_data[0].rotation_flag;
03237         
03238         
03239     // --Set the new source location and direction, for the current CT projection:
03240     current_angle += D_angle;
03241     if (current_angle>=(2.0*PI-0.0001))
03242       current_angle -= 2.0*PI;   // Make sure the angle is not above or equal to 360 degrees.
03243 
03244     source_data[i].position.x = center_rotation.x + SRotAxisD*cos(current_angle);
03245     source_data[i].position.y = center_rotation.y + SRotAxisD*sin(current_angle);
03246     source_data[i].position.z = source_data[i-1].position.z + vertical_translation_per_projection;   //  The Z position can increase between projections for a helical scan. But rotation still around Z always: (w=0)!!
03247 
03248     source_data[i].direction.x = center_rotation.x - source_data[i].position.x;
03249     source_data[i].direction.y = center_rotation.y - source_data[i].position.y;
03250     source_data[i].direction.z = 0.0f;    //  center_rotation.z - source_data[0].position.z;   !! w=0 all the time!!  
03251 
03252     double norm = 1.0/sqrt((double)source_data[i].direction.x*(double)source_data[i].direction.x + (double)source_data[i].direction.y*(double)source_data[i].direction.y /* + source_data[i].direction.z*source_data[i].direction.z*/);
03253     source_data[i].direction.x = (float)(((double)source_data[i].direction.x)*norm);
03254     source_data[i].direction.y = (float)(((double)source_data[i].direction.y)*norm);
03255       // source_data[i].direction.z = (float)(((double)source_data[i].direction.z)*norm);
03256 
03257     // --Set the new detector in front of the new source:
03258     detector_data[i].center.x = source_data[i].position.x + source_data[i].direction.x * detector_data[i].sdd;   // Set the center of the detector straight ahead of the focal spot.
03259     detector_data[i].center.y = source_data[i].position.y + source_data[i].direction.y * detector_data[i].sdd;
03260     detector_data[i].center.z = source_data[i].position.z;    //  + source_data[i].direction.z * detector_data[i].sdd;   !! w=0 all the time!!
03261 
03262     double rotX, rotZ;
03263 
03264       //  detector_data[0].rotation_flag = 1;   //  Already set in read_input!
03265 
03266     // -- Rotate the detector center to +Y:
03267     //    Set the rotation that will bring particles from the detector plane to +Y=(0,+1,0) through a rotation around X and around Z (counter-clock):
03268     rotX = 0.0;   // !! w=0 all the time!!  CORRECT CALCULATION:  acos(source_data[0].direction.z) - 0.5*PI;  // Rotate to +Y = (0,+1,0) --> rotX_0 =  -PI/2
03269 
03270     if ( (source_data[i].direction.x*source_data[i].direction.x + source_data[i].direction.y*source_data[i].direction.y) > 1.0e-8 )   // == u^2+v^2 > 0
03271       if (source_data[i].direction.y >= 0.0f)
03272         rotZ = 0.5*PI - acos(source_data[i].direction.x/sqrt(source_data[i].direction.x*source_data[i].direction.x + source_data[i].direction.y*source_data[i].direction.y));
03273       else
03274         rotZ = 0.5*PI - (-acos(source_data[i].direction.x/sqrt(source_data[i].direction.x*source_data[i].direction.x + source_data[i].direction.y*source_data[i].direction.y)));
03275     else
03276       rotZ = 0.0;   // Vector pointing to +Z, do not rotate around Z then.
03277 
03278     MASTER_THREAD printf("         << Projection #%d >> current_angle=%.8f degrees (rotation around Z axis = %.8f)\n", (i+1), current_angle*RAD2DEG, rotZ*RAD2DEG);
03279     MASTER_THREAD printf("                             Source direction = (%.8f,%.8f,%.8f) , position = (%.8f,%.8f,%.8f)\n", source_data[i].direction.x,source_data[i].direction.y,source_data[i].direction.z, source_data[i].position.x,source_data[i].position.y,source_data[i].position.z);
03280 
03281     cos_rX = cos(rotX);
03282     cos_rZ = cos(rotZ);
03283     sin_rX = sin(rotX);
03284     sin_rZ = sin(rotZ);
03285     detector_data[i].rot_inv[0] =  cos_rZ;    // Rotation matrix RxRz:
03286     detector_data[i].rot_inv[1] = -sin_rZ;
03287     detector_data[i].rot_inv[2] =  0.0f;
03288     detector_data[i].rot_inv[3] =  cos_rX*sin_rZ;
03289     detector_data[i].rot_inv[4] =  cos_rX*cos_rZ;
03290     detector_data[i].rot_inv[5] = -sin_rX;
03291     detector_data[i].rot_inv[6] =  sin_rX*sin_rZ;
03292     detector_data[i].rot_inv[7] =  sin_rX*cos_rZ;
03293     detector_data[i].rot_inv[8] =  cos_rX;
03294 
03295 
03296     detector_data[i].corner_min_rotated_to_Y.x = detector_data[i].center.x*detector_data[i].rot_inv[0] + detector_data[i].center.y*detector_data[i].rot_inv[1] + detector_data[i].center.z*detector_data[i].rot_inv[2];
03297     detector_data[i].corner_min_rotated_to_Y.y = detector_data[i].center.x*detector_data[i].rot_inv[3] + detector_data[i].center.y*detector_data[i].rot_inv[4] + detector_data[i].center.z*detector_data[i].rot_inv[5];
03298     detector_data[i].corner_min_rotated_to_Y.z = detector_data[i].center.x*detector_data[i].rot_inv[6] + detector_data[i].center.y*detector_data[i].rot_inv[7] + detector_data[i].center.z*detector_data[i].rot_inv[8];
03299 
03300     // -- Set the lower corner (minimum) coordinates at the normalized orientation: +Y. The detector has thickness 0.
03301     detector_data[i].corner_min_rotated_to_Y.x = detector_data[i].corner_min_rotated_to_Y.x - 0.5*detector_data[i].width_X;
03302 //  detector_data[i].corner_min_rotated_to_Y.y = detector_data[i].corner_min_rotated_to_Y.y;
03303     detector_data[i].corner_min_rotated_to_Y.z = detector_data[i].corner_min_rotated_to_Y.z - 0.5*detector_data[i].height_Z;
03304 
03305     // *** Init the fan beam source model:
03306 
03307       rotZ = -rotZ;   // The source rotation is the inverse of the detector.
03308       cos_rX = cos(rotX);
03309       cos_rZ = cos(rotZ);
03310       sin_rX = sin(rotX);
03311       sin_rZ = sin(rotZ);
03312       // --Rotation around X (alpha) and then around Z (phi): Rz*Rx (oposite of detector rotation)
03313       source_data[i].rot_fan[0] =  cos_rZ;
03314       source_data[i].rot_fan[1] = -cos_rX*sin_rZ;
03315       source_data[i].rot_fan[2] =  sin_rX*sin_rZ;
03316       source_data[i].rot_fan[3] =  sin_rZ;
03317       source_data[i].rot_fan[4] =  cos_rX*cos_rZ;
03318       source_data[i].rot_fan[5] = -sin_rX*cos_rZ;
03319       source_data[i].rot_fan[6] =  0.0f;
03320       source_data[i].rot_fan[7] =  sin_rX;
03321       source_data[i].rot_fan[8] =  cos_rX;
03322 
03323         // printf("\n    -- Source location and direction for the following CT projection:\n");   // !!Verbose!! 
03324         // printf("                 angle between projections = %lf degrees\n", D_angle*RAD2DEG);
03325         // printf("                             current angle = %lf degrees\n", current_angle*RAD2DEG);
03326         // printf("                   new focal spot position = (%f, %f, %f)\n", source_data[i].position.x, source_data[i].position.y, source_data[i].position.z);
03327         // printf("                      new source direction = (%f, %f, %f)\n", source_data[i].direction.x, source_data[i].direction.y, source_data[i].direction.z);
03328         // printf("                       new detector center = (%f, %f, %f)\n", detector_center.x, detector_center.y, detector_center.z);
03329         // printf("           new detector low corner (at +Y) = (%f, %f, %f)\n", detector_data->corner_min_rotated_to_Y[i].x, detector_data->corner_min_rotated_to_Y[i].y, detector_data->corner_min_rotated_to_Y[i].z);
03330         // printf("                        center of rotation = (%f, %f, %f)\n", center_rotation.x, center_rotation.y, center_rotation.z);
03331         // printf("         detector width (X) and height (Z) = %f , %f cm\n", detector_data->width_X, detector_data->height_Z);
03332         // printf("            rotations to +Y around Z and X = %f , %f degrees\n", rotZ*RAD2DEG, rotX*RAD2DEG);
03333   }
03334 }
03335 
03336 
03337 ///////////////////////////////////////////////////////////////////////////////
03338 
03339 
03340 ////////////////////////////////////////////////////////////////////////////////
03341 //! Initialize the first seed of the pseudo-random number generator (PRNG) 
03342 //! RANECU to a position far away from the previous history (leap frog technique).
03343 //! This function is equivalent to "init_PRNG" but only updates one of the seeds.
03344 //!
03345 //! Note that if we use the same seed number to initialize the 2 MLCGs of the PRNG
03346 //! we can only warranty that the first MLCG will be uncorrelated for each value
03347 //! generated by "update_seed_PRNG". There is a tiny chance that the final PRNs will
03348 //! be correlated because the leap frog on the first MLCG will probably go over the
03349 //! repetition cycle of the MLCG, which is much smaller than the full RANECU. But any
03350 //! correlataion is extremely unlikely. Function "init_PRNG" doesn't have this issue.
03351 //!
03352 //!       @param[in] batch_number   Elements to skip (eg, MPI thread_number).
03353 //!       @param[in] total_histories   Histories to skip.
03354 //!       @param[in,out] seed   Initial PRNG seeds; returns the updated seed.
03355 ////////////////////////////////////////////////////////////////////////////////
03356 inline void update_seed_PRNG(int batch_number, unsigned long long int total_histories, int* seed)
03357 {
03358   if (0==batch_number)
03359     return;
03360     
03361   unsigned long long int leap = total_histories * (batch_number * LEAP_DISTANCE);
03362   int y = 1;
03363   int z = a1_RANECU;
03364   // -- Calculate the modulo power '(a^leap)MOD(m)' using a divide-and-conquer algorithm adapted to modulo arithmetic
03365   for(;;)
03366   {
03367     // (A2) Halve n, and store the integer part and the residue
03368     if (0!=(leap&01))  // (bit-wise operation for MOD(leap,2), or leap%2 ==> proceed if leap is an odd number)  Equivalent: t=(short)(leap%2);
03369     {
03370       leap >>= 1;     // Halve n moving the bits 1 position right. Equivalent to:  leap=(leap/2);  
03371       y = abMODm(m1_RANECU,z,y);      // (A3) Multiply y by z:  y = [z*y] MOD m
03372       if (0==leap) break;         // (A4) leap==0? ==> finish
03373     }
03374     else           // (leap is even)
03375     {
03376       leap>>= 1;     // Halve leap moving the bits 1 position right. Equivalent to:  leap=(leap/2);
03377     }
03378     z = abMODm(m1_RANECU,z,z);        // (A5) Square z:  z = [z*z] MOD m
03379   }
03380   // AjMODm1 = y;                 // Exponentiation finished:  AjMODm = expMOD = y = a^j
03381   // -- Compute and display the seeds S(i+j), from the present seed S(i), using the previously calculated value of (a^j)MOD(m):
03382   //         S(i+j) = [(a**j MOD m)*S(i)] MOD m
03383   //         S_i = abMODm(m,S_i,AjMODm)
03384   *seed = abMODm(m1_RANECU, *seed, y);
03385 }
03386 
03387 
03388 ///////////////////////////////////////////////////////////////////////////////
03389 
03390 
03391 ////////////////////////////////////////////////////////////////////////////////
03392 //! Read the energy spectrum file and initialize the Walker aliasing sampling.
03393 //!
03394 //!       @param[in] file_name_espc   File containing the energy spectrum (lower energy value in each bin and its emission probability).
03395 //!       @param[in,out] source_energy_data   Energy spectrum and other source data. The Walker alias and cutoffs are initialized in this function.
03396 //!       @param[out] mean_energy_spectrum   Mean energy in the input x-ray energy spectrum.
03397 ////////////////////////////////////////////////////////////////////////////////
03398 void init_energy_spectrum(char* file_name_espc, struct source_energy_struct* source_energy_data, float *mean_energy_spectrum)
03399 {
03400   char *new_line_ptr = NULL, new_line[250];    
03401   float lower_energy_bin, prob;
03402   float prob_espc_bin[MAX_ENERGY_BINS];    // The input probabilities of each energy bin will be discarded after Walker is initialized
03403 
03404   // -- Read spectrum from file:
03405   FILE* file_ptr = fopen(file_name_espc, "r");
03406   if (NULL==file_ptr)
03407   {
03408     printf("\n\n   !!init_energy_spectrum ERROR!! Error trying to read the energy spectrum input file \"%s\".\n\n", file_name_espc);
03409     exit(-1);
03410   }
03411   
03412   int current_bin = -1;
03413   do 
03414   {
03415     current_bin++;  // Update bin counter
03416     
03417     if (current_bin >= MAX_ENERGY_BINS)
03418     {
03419       printf("\n !!init_energy_spectrum ERROR!!: too many energy bins in the input spectrum. Increase the value of MAX_ENERGY_BINS=%d.\n", MAX_ENERGY_BINS);
03420       printf(  "            A negative probability marks the end of the spectrum.\n\n");
03421       exit(-1);
03422     }
03423 
03424     new_line_ptr = fgets_trimmed(new_line, 250, file_ptr);   // Read the following line of text skipping comments and extra spaces
03425     
03426     if (new_line_ptr==NULL)
03427     {
03428       printf("\n\n   !!init_energy_spectrum ERROR!! The input file for the x ray spectrum (%s) is not readable or incomplete (a negative probability marks the end of the spectrum).\n", file_name_espc);
03429       exit(-1);
03430     }
03431     
03432     prob = -123456789.0f;  
03433     
03434     sscanf(new_line, "%f %f", &lower_energy_bin, &prob);     // Extract the lowest energy in the bin and the corresponding emission probability from the line read 
03435             
03436     prob_espc_bin[current_bin]     = prob;
03437     source_energy_data->espc[current_bin] = lower_energy_bin;           
03438     
03439     if (prob == -123456789.0f)
03440     {
03441       printf("\n !!init_energy_spectrum ERROR!!: invalid energy bin number %d?\n\n", current_bin);
03442       exit(-1);
03443     }
03444     else if (lower_energy_bin < source_energy_data->espc[max_value(current_bin-1,0)])    // (Avoid a negative index using the macro "max_value" defined in the header file)
03445     {
03446       printf("\n !!init_energy_spectrum ERROR!!: input energy bins with decreasing energy? espc(%d)=%f, espc(%d)=%f\n\n", current_bin-1, source_energy_data->espc[max_value(current_bin-1,0)], current_bin, lower_energy_bin);
03447       exit(-1);
03448     }
03449     
03450   } 
03451   while (prob > -1.0e-11f);     // A negative probability marks the end of the spectrum
03452 
03453 
03454   // Store the number of bins read from the input energy spectrum file:
03455   source_energy_data->num_bins_espc = current_bin;
03456 
03457 
03458   // Init the remaining bins (which will not be used) with the last energy read (will be assumed as the highest energy in the last bin) and 0 probability of emission.
03459   register int i;
03460   for (i=current_bin; i<MAX_ENERGY_BINS; i++)
03461   {
03462     source_energy_data->espc[i] = lower_energy_bin;
03463     prob_espc_bin[i]     = 0.0f;
03464   }
03465 
03466 
03467   // Compute the mean energy in the spectrum, taking into account the energy and prob of each bin:
03468   float all_energy = 0.0f;
03469   float all_prob = 0.0f;
03470   for(i=0; i<source_energy_data->num_bins_espc; i++)
03471   {
03472     all_energy += 0.5f*(source_energy_data->espc[i]+source_energy_data->espc[i+1])*prob_espc_bin[i];
03473     all_prob   += prob_espc_bin[i];
03474   }  
03475   *mean_energy_spectrum = all_energy/all_prob;
03476   
03477           
03478 // -- Init the Walker aliasing sampling method (as it is done in PENELOPE):
03479   IRND0(prob_espc_bin, source_energy_data->espc_cutoff, source_energy_data->espc_alias, source_energy_data->num_bins_espc);   //!!Walker!! Calling PENELOPE's function to init the Walker method
03480        
03481 // !!Verbose!! Test sampling
03482 // Sampling the x ray energy using the Walker aliasing algorithm from PENELOPE:
03483 // int sampled_bin = seeki_walker(source_energy_data->espc_cutoff, source_energy_data->espc_alias, 0.5, source_energy_data->num_bins_espc);
03484 // float e = source_energy_data->espc[sampled_bin] + ranecu(seed) * (source_energy_data->espc[sampled_bin+1] - source_energy_data->espc[sampled_bin]);    // Linear interpolation of the final energy within the sampled energy bin
03485 // printf("\n\n !!Walker!! Energy center bin %d = %f keV\n", sampled_bin, 0.001f*e);
03486   
03487 }       
03488 
03489 
03490 
03491       
03492 //********************************************************************
03493 //!    Finds the interval (x(i),x(i+1)] containing the input value    
03494 //!    using Walker's aliasing method.                                
03495 //!                                                                   
03496 //!    Input:                                                         
03497 //!      cutoff(1..n) -> interval cutoff values for the Walker method 
03498 //!      cutoff(1..n) -> alias for the upper part of each interval    
03499 //!      randno       -> point to be located                          
03500 //!      n            -> no. of data points                           
03501 //!    Output:                                                        
03502 //!      index i of the semiopen interval where randno lies           
03503 //!    Comments:                                                      
03504 //!      -> The cutoff and alias values have to be previously         
03505 //!         initialised calling the penelope subroutine IRND0.        
03506 //!                                                                   
03507 //!                                                                   
03508 //!    Algorithm implementation based on the PENELOPE code developed   
03509 //!    by Francesc Salvat at the University of Barcelona. For more     
03510 //!    info: www.oecd-nea.org/science/pubs/2009/nea6416-penelope.pdf  
03511 //!                                                                   
03512 //CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
03513 //C  PENELOPE/PENGEOM (version 2006)                                     C
03514 //C  Copyright (c) 2001-2006                                             C
03515 //C  Universitat de Barcelona                                            C
03516 //C                                                                      C
03517 //C  Permission to use, copy, modify, distribute and sell this software  C
03518 //C  and its documentation for any purpose is hereby granted without     C
03519 //C  fee, provided that the above copyright notice appears in all        C
03520 //C  copies and that both that copyright notice and this permission      C
03521 //C  notice appear in all supporting documentation. The Universitat de   C
03522 //C  Barcelona makes no representations about the suitability of this    C
03523 //C  software for any purpose. It is provided "as is" without express    C
03524 //C  or implied warranty.                                                C
03525 //CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
03526 inline int seeki_walker(float *cutoff, short int *alias, float randno, int n)
03527 {
03528    float RN = randno * n;                         // Find initial interval (array starting at 0):   
03529    int int_part = (int)(RN);                      //   -- Integer part
03530    float fraction_part = RN - ((float)int_part);  //   -- Fractional part
03531 
03532    if (fraction_part < cutoff[int_part])          // Check if we are in the aliased part
03533       return int_part;                            // Below the cutoff: return current value
03534    else
03535       return (int)alias[int_part];                // Above the cutoff: return alias
03536 }     
03537 
03538 
03539 
03540 
03541 //****************************************************************** *
03542 //*                    SUBROUTINE IRND0                              *
03543 //********************************************************************
03544 //*                                                                   
03545 //!  Initialisation of Walker's aliasing algorithm for random         
03546 //!  sampling from discrete probability distributions.                
03547 //!                                                                   
03548 //! Input arguments:                                                  
03549 //!   N ........ number of different values of the random variable.   
03550 //!   W(1:N) ... corresponding point probabilities (not necessarily   
03551 //!              normalised to unity).                                
03552 //! Output arguments:                                                 
03553 //!   F(1:N) ... cutoff values.                                       
03554 //!   K(1:N) ... alias values.                                        
03555 //!                                                                   
03556 //!                                                                   
03557 //!  This subroutine is part of the PENELOPE 2006 code developed      
03558 //!  by Francesc Salvat at the University of Barcelona. For more       
03559 //!  info: www.oecd-nea.org/science/pubs/2009/nea6416-penelope.pdf    
03560 //*                                                                   
03561 //CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
03562 //C  PENELOPE/PENGEOM (version 2006)                                     C
03563 //C  Copyright (c) 2001-2006                                             C
03564 //C  Universitat de Barcelona                                            C
03565 //C                                                                      C
03566 //C  Permission to use, copy, modify, distribute and sell this software  C
03567 //C  and its documentation for any purpose is hereby granted without     C
03568 //C  fee, provided that the above copyright notice appears in all        C
03569 //C  copies and that both that copyright notice and this permission      C
03570 //C  notice appear in all supporting documentation. The Universitat de   C
03571 //C  Barcelona makes no representations about the suitability of this    C
03572 //C  software for any purpose. It is provided "as is" without express    C
03573 //C  or implied warranty.                                                C
03574 //CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
03575 void IRND0(float *W, float *F, short int *K, int N)
03576 {
03577    register int I;
03578   
03579    //  ****  Renormalisation.
03580    double WS=0.0;
03581    for (I=0; I<N; I++)
03582    {   
03583       if(W[I] < 0.0f) 
03584       {
03585          printf("\n\n !!ERROR!! IRND0: Walker sampling initialization. Negative point probability? W(%d)=%f\n\n", I, W[I]);
03586          exit(-1);
03587       }
03588       WS = WS + W[I];
03589    }
03590    WS = ((double)N) / WS; 
03591   
03592    for (I=0; I<N; I++)
03593    {
03594       K[I] = I;
03595       F[I] = W[I] * WS;
03596    }
03597     
03598    if (N==1) 
03599       return;
03600      
03601    //  ****  Cutoff and alias values.
03602    float HLOW, HIGH;
03603    int   ILOW, IHIGH, J;
03604    for (I=0; I<N-1; I++)
03605    {
03606       HLOW = 1.0f;
03607       HIGH = 1.0f;
03608       ILOW = -1;
03609       IHIGH= -1;
03610       for (J=0; J<N; J++)
03611       {
03612          if(K[J]==J)
03613          {
03614             if(F[J]<HLOW)
03615             {
03616                HLOW = F[J];
03617                ILOW = J;
03618             }
03619             else if(F[J]>HIGH)
03620             {
03621                HIGH  = F[J];
03622                IHIGH = J;
03623             }
03624          }
03625       }
03626       
03627       if((ILOW==-1) || (IHIGH==-1)) 
03628         return;
03629 
03630       K[ILOW] = IHIGH;
03631       F[IHIGH]= HIGH + HLOW - 1.0f;
03632    }
03633    return;
03634 }
03635 
03636 
03637 
03638 ///////////////////////////////////////////////////////////////////////////////