Commits

ivarun committed 43552a5

Making generator host-code independent:
Remove all C/C++ code, initialization now done entirely in kernel code.
Remove ranluxcl_warmup, warmup is done in initialization function by default.
Some restructuring of ranluxcltest (both using the new initialization method and other changes).
Update comments.

Comments (0)

Files changed (4)

 #ifndef RANLUXCL_CL
 #define RANLUXCL_CL
 
-/**** RANLUXCL v1.2.0 ****************************************
+/**** RANLUXCL v1.2.0 MODIFIED *****************************************************
 
-***** DESCRIPTION ******************************************************************
+Implements the RANLUX generator of Martin Luescher, based on the Fortran 77
+implementation by Fred James. This OpenCL code is a complete implementation which 
+should perfectly replicate the numbers generated by the original Fortran 77
+implementation.
 
-Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77 
-implementation by Fred James. This OpenCL code along with the C++ function 
-ranluxcl_initialization in ranluxcl.h is a complete implementation which should 
-perfectly replicate the numbers generated by the original fortran77 implementation.
+***** QUICK USAGE DESCRIPTION ******************************************************
 
-***** USAGE ************************************************************************
+1. Create an OpenCL buffer with room for (<numWorkitems> * 7) float4 variables. I.e.
+in C/C++: size_t buffSize = numWorkitems * 7 * sizeof(cl_float4).
 
-To use, call ranluxcl_initialization in host code. It returns a pointer to a
-cl_float4 array, and also returns the size of said array in bytes. This
-array should be transferred to an OpenCL buffer. See comments in 
-ranluxcl.hpp for more details about the initialization.
+2. Pass the buffer and an integer seed <ins> (<ins> larger than or equal to zero) to a
+kernel that launches ranluxcl_initialization. Stick to relatively small <ins>. For
+more on restrictions on <ins> see the section on the initialization function
+below. An example initialization kernel would be:
+    #include "ranluxcl.cl"
+    __kernel void Kernel_Ranluxcl_Init(private int ins, global float4 *ranluxcltab){
+    	ranluxcl_initialization(ins, ranluxcltab);
+    }
 
-Simple sample kernel that writes a pseudorandom float4 to global memory:
+3. Now the generator is ready for use. Remember to download the seeds first, and
+upload them again when done. Example kernel that generates a float4 where each
+component is uniformly distributed between 0 and 1, end points not included:
+    #include "ranluxcl.cl"
+	__kernel void Kernel_Example(__global float4 *ranluxcltab){
+		//ranluxclstate is a struct of 7 float4 variables
+		//storing the state of the generator.
+		ranluxcl_state_t ranluxclstate;
 
-#include "ranluxcl.cl"
-__kernel void Kernel_PRN(__global float4* RANLUXCLTab, __global float4* PRNs)
-{
-	//Downloading RANLUXCLTab. The state of RANLUXCL is stored in ranluxclstate.
-	ranluxcl_state_t ranluxclstate;
-	ranluxcl_download_seed(&ranluxclstate, RANLUXCLTab);
-	
-	float4 randomnr = ranluxcl(&ranluxclstate);
+		//Download state into ranluxclstate struct.
+		ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
 
-	PRNs[get_global_id(0)] = randomnr;
+		//Generate a float4 with each component on (0,1),
+		//end points not included. We can call ranluxcl as many
+		//times as we like until we upload the state again.
+		float4 randomnr = ranluxcl(&ranluxclstate);
 
-	//Uploading RANLUXCLTab
-	ranluxcl_upload_seed(&ranluxclstate, RANLUXCLTab);
-}
+		//Upload state again so that we don't get the same
+		//numbers over again the next time we use ranluxcl.
+		ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
+	}
 
-The ranluxcl_download_seed and ranluxcl_upload_seed functions assume that the 
-RANLUXCLTab in global memory has enough values for all the work-items, i.e. that
-the numWorkitems variable passed to ranluxcl_initialization in host code corresponds
-to the number of work-items in the current NDRange. If this is not the case the
-behaviour is, as they say, "undefined".
+***** MACROS ***********************************************************************
 
-There are a total of four functions meant to be called by other OpenCL code:
+The following macros can optionally be defined:
 
-ranluxcl_download_seed(ranluxcl_state_t *ranluxclstate, __global float4 *RANLUXCLTab)
+RANLUXCL_LUX:
+Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it
+sets the p-value of the generator (generally not needed). If this macro is not set
+then lux=4 is the default (highest quality). For many applications the high quality
+of lux=4 may not be needed. Indeed if two values (each value having 24 random bits) 
+are glued together to form a 48-bit value the generator passes all tests in the TestU01
+suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random 
+Number Generators" by PIERRE L'ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is 
+RANLUX with two values glued together to create 48-bit numbers, and we see that it
+passes all tests already at luxury value 2.
+
+RANLUXCL_MAXWORKITEMS:
+If several OpenCL NDRanges will be running in parallel and the parallel sequences
+should be different then this macro should have a value equal to or larger than the
+largest number of work-items in any of the parallel runs. The default is to use the
+current global size, so if all NDRanges are of the same size this need not be
+defined. Each parallel instance must also have different seeds <ins>. For example if
+we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use
+different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to
+be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not
+be necessary.
+
+RANLUXCL_NO_WARMUP:
+Turns off the warmup functionality in ranluxcl_initialization. This macro should
+generally not be used, since the generators will initially be correlated if it is
+defined. The only advantage is that the numbers generated will exactly correspond
+to those of the original Fortran 77 implementation.
+
+***** FUNCTIONS: INITIALIZATION ****************************************************
+
+The initialization function is defined as:
+void ranluxcl_initialization(int ins, __global float4 *ranluxcltab)
+
+Run once at the very beginning. ranluxcltab should be a buffer with space for 7
+float4 variables per work-item in the NDRange. <ins> is the seed to the generator.
+For a given <ins> each work-item in the NDRange will generate a different sequence.
+If more than one NDRange is used in parallel then <ins> must be different for each
+NDRange to avoid identical sequences.
+
+See also the RANLUXCL_MAXWORKITEMS macro above for parallel sequences. As long as either
+the RANLUXCL_MAXWORKITEMS macro is set correctly, or all NDRanges are launched with
+the same number of work-items, it is sufficient to increment <ins> by one to ensure all
+work-items will generate different sequences. 
+
+An underestimate of the highest permissible seed <ins> is given by the smallest of:
+(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
+Please make sure that <ins> is never higher than this since it could cause undetected
+problems. For example with 10240 work-items the highest permissible <ins> is about
+100 000.
+
+***** FUNCTIONS: SEED UPLOAD/DOWNLOAD **********************************************
+
+The following two functions should be launched at the beginning and end of a kernel
+that uses ranluxcl to generate numbers, respectively:
+
+void ranluxcl_download_seed(ranluxcl_state_t *ranluxclstate, __global float4 *ranluxcltab)
 Run at the beginning of a kernel to download ranluxcl state data
 
-ranluxcl_upload_seed(ranluxcl_state_t *ranluxclstate, __global float4 *RANLUXCLTab)
+void ranluxcl_upload_seed(ranluxcl_state_t *ranluxclstate, __global float4 *ranluxcltab)
 Run at the end of a kernel to upload state data
 
-ranluxcl(ranluxcl_state_t *ranluxclstate)
-Run to generate a float4
+***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ************************************
 
-ranluxcl_synchronize(ranluxcl_state_t *ranluxclstate)
+float4 ranluxcl(ranluxcl_state_t *ranluxclstate)
+Run to generate a pseudo-random float4 where each component is a number between 0 and 1,
+end points not included (meaning the number will never be exactly 0 or 1).
+
+void ranluxcl_synchronize(ranluxcl_state_t *ranluxclstate)
 Run to synchronize execution in case different work-items have made a different
 number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution.
 ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not
 needed if all work-items always call ranluxcl the same number of times.
 
-ranluxcl_warmup(ranluxcl_state_t *ranluxclstate)
-Run once before any values are generated using ranluxcl(). This will ensure that
-the parallel generators are not correlated. While you could just run this in any
-kernel where ranluxcl is used, that would be wastefull. It is only necessary to
-call ranluxcl_warmup once for each work-item to make sure there are no
-correlations from the initialization procedure. For instance the following kernel
-could be launched right after ranluxcl_initialization was called in host code:
-
-#include "ranluxcl.cl"
-__kernel void Kernel_RANLUXCL_Warmup(__global float4* RANLUXCLTab)
-{
-	//Downloading RANLUXCLTab. The state of RANLUXCL is stored in ranluxclstate.
-	ranluxcl_state_t ranluxclstate;
-	ranluxcl_download_seed(&ranluxclstate, RANLUXCLTab);
-
-	ranluxcl_warmup(&ranluxclstate);
-
-	//Uploading RANLUXCLTab
-	ranluxcl_upload_seed(&ranluxclstate, RANLUXCLTab);
-}
-
 ***** PERFORMANCE ******************************************************************
 
 For luxury setting 3, performance on AMD cypress should be ~7*10^9 pseudorandom 
 values per second, when not downloading values to host memory (i.e. the values are 
 just generated, but not used for anything in particular).
 
-***** IMPLEMENTATION DETAILS *******************************************************
+***** DESCRIPTION OF THE IMPLEMENTATION ********************************************
 
-There are two slightly different approaches combined in this file, namely planar 
-and planar shift. If RANLUXCL_NSKIP is set and is a multiple of 24 the planar scheme 
-is recovered through preprocessor directives (i.e. the unneeded parts of the 
-planar shift scheme are not included). If RANLUXCL_NSKIP is not set, or it is not a
-multiple of 24 the planar shift scheme is used.
+This code closely follows the original Fortran 77 code (see credit section). Here
+the differences (and similarities) between RANLUXCL (this implementation) and the
+original RANLUX are discussed.
 
-"Planar" refers to the fact that the algorithm is unrolled, i.e. all indexing 
-into the seeds is explicit. For this reason it becomes most convenient to always 
-discard some multiple of 24 values, i.e. the p-value should be some multiple of 24. 
-The idea for this approach comes from:
+The Fortran 77 implementation uses a simple LCG to initialize the generator, and
+so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
+seed, the first work-item behaves like the original RANLUX with seed equal to 1, the
+second work-item as if with seed equal to 2 and so on. If <ins> = 1 then the first
+work-item behaves like the original RANLUX with seed equal to <numWorkitems> + 1,
+and so on for higher <ins> so that we never have overlapping sequences. This is
+why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with
+a different number of work-items.
 
+RANLUX is based on chaos theory, and what we are actually doing when selecting
+a luxury value is setting how many values to skip over (causing decorrelation).
+The number of values to skip is controlled by the so-called p-value of the
+generator. After generating 24 values we skip p - 24 values until again generating
+24 values.
+
+This implementation is somewhat modified from the original fortran implementation 
+by F. James. Because of the way the OpenCL code is optimized with 4-component 
+32-bit float vectors, it is most convenient to always throw away some multiple 
+of 24 values (i.e. p is always a multiple of 24).
+
+However, there might be some resonances if we always throw away a multiple of
+the seeds table size. Therefore the implementation is slightly more intricate
+where p can be a multiple of 4 instead, at a cost to performance (only about 10%
+lower than the cleaner 24 values approach on AMD Cypress). These two approaches
+are termed planar and planar shift respectively. The idea for the planar approach
+comes from the following paper:
 Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on 
 Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
 
-In the planar shift scheme it is however possible to have p a multiple of 4 instead, 
-which allows us to avoid any resonances that might be present when p is a 
-multiple of the seeds table size. It also allows us to choose a p-value 
-corresponding to what Martin Luescher chose for his v3 version of RANLUX for our 
-luxury setting 4.
+Below the p-values for the original reference implementation are listed along with 
+those of the planar shift implementation. Suggested values for the planar approach 
+are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the
+planar shift values are used. To use the pure planar approach (for some extra
+performance with likely undetectable quality decrease), set lux equal to the specific 
+p-value.
+        
+Luxury setting (RANLUXCL_LUX):                   0   1   2   3   4
+Original fortran77 implementation by F. James:  24  48  97  223 389
+Planar (suggested):                             24  48  120 240 408
+Planar shift:                                   24  48  100 224 404
 
-The planar shift algorithm is recommended (which is what is used by the default 
-luxury values 0 through 4 in ranluxcl_initialization), but there is a chance that 
-the simpler planar approach will be significantly faster on some architectures 
-since it includes fewer if tests and drops a switch. On AMD Cypress the 
-performance difference is about 10%. To use the planar approach RANLUXCL_NSKIP 
-must be defined in the kernel code to be the same as the nskip variable returned 
-by ranluxcl_initialization. If nskip is a multiple of 24 the planar scheme is
-then recovered.
+Note that levels 0 and 1 are the same as in the original implementation for both
+planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen 
+for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if 
+it is considered important to only use "official" values, luxury settings 0, 1 or 
+4 of planar shift should be used. It is however unlikely that the other values are 
+bad, they just haven't been as extensively used and tested by others.
 
-Note that the in24 variable is only used in planar shift, while stepnr takes the 
-place of in24 in the planar case.
+Variable names are generally the same as in the fortran77 implementation, however 
+because of the way the generator is implemented, the i24 and j24 variables are 
+no longer needed.
 
 ***** CREDIT ***********************************************************************
 
 	float4 s13to16;
 	float4 s17to20;
 	float4 s21to24;
-	float4 carryin24stepnrnskip;
+	float4 carryin24stepnr; //Fourth component unused
 } ranluxcl_state_t;
 
 #define RANLUXCL_TWOM24 0.000000059604644775f
 #define RANLUXCL_TWOM12 0.000244140625f
 
-//Check that nskip is a permissible value if it's defined
-#ifdef RANLUXCL_NSKIP
+#ifdef RANLUXCL_LUX
+#if RANLUXCL_LUX < 0
+#error ranluxcl: lux must be zero or positive.
+#endif
+#else
+#define RANLUXCL_LUX 4 //Default to high quality
+#endif //RANLUXCL_LUX
+
+//Here the luxury values are defined
+#if RANLUXCL_LUX == 0
+#define RANLUXCL_NSKIP 0
+#elif RANLUXCL_LUX == 1
+#define RANLUXCL_NSKIP 24
+#elif RANLUXCL_LUX == 2
+#define RANLUXCL_NSKIP 76
+#elif RANLUXCL_LUX == 3
+#define RANLUXCL_NSKIP 200
+#elif RANLUXCL_LUX == 4
+#define RANLUXCL_NSKIP 380
+#else
+#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
+#endif //RANLUXCL_LUX == 0
+
+//Check that nskip is a permissible value
 #if RANLUXCL_NSKIP % 4 != 0 
 #error nskip must be divisible by 4!
 #endif
 #define RANLUXCL_NOSKIP
 #endif
 
-#endif //defined RANLUXCL_NSKIP
-
 //Single-value global size and id
 #define RANLUXCL_NUMWORKITEMS (get_global_size(0) * get_global_size(1) * get_global_size(2))
 #define RANLUXCL_MYID (get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(0) * get_global_size(1))
 
-void ranluxcl_download_seed(ranluxcl_state_t *ranluxclstate, __global float4 *RANLUXCLTab)
+void ranluxcl_download_seed(ranluxcl_state_t *ranluxclstate, __global float4 *ranluxcltab)
 {
-	(*ranluxclstate).s01to04               = RANLUXCLTab[RANLUXCL_MYID + 0 * RANLUXCL_NUMWORKITEMS];
-	(*ranluxclstate).s05to08               = RANLUXCLTab[RANLUXCL_MYID + 1 * RANLUXCL_NUMWORKITEMS];
-	(*ranluxclstate).s09to12               = RANLUXCLTab[RANLUXCL_MYID + 2 * RANLUXCL_NUMWORKITEMS];
-	(*ranluxclstate).s13to16               = RANLUXCLTab[RANLUXCL_MYID + 3 * RANLUXCL_NUMWORKITEMS];
-	(*ranluxclstate).s17to20               = RANLUXCLTab[RANLUXCL_MYID + 4 * RANLUXCL_NUMWORKITEMS];
-	(*ranluxclstate).s21to24               = RANLUXCLTab[RANLUXCL_MYID + 5 * RANLUXCL_NUMWORKITEMS];
-	(*ranluxclstate).carryin24stepnrnskip  = RANLUXCLTab[RANLUXCL_MYID + 6 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).s01to04         = ranluxcltab[RANLUXCL_MYID + 0 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).s05to08         = ranluxcltab[RANLUXCL_MYID + 1 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).s09to12         = ranluxcltab[RANLUXCL_MYID + 2 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).s13to16         = ranluxcltab[RANLUXCL_MYID + 3 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).s17to20         = ranluxcltab[RANLUXCL_MYID + 4 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).s21to24         = ranluxcltab[RANLUXCL_MYID + 5 * RANLUXCL_NUMWORKITEMS];
+	(*ranluxclstate).carryin24stepnr = ranluxcltab[RANLUXCL_MYID + 6 * RANLUXCL_NUMWORKITEMS];
 }
 
-void ranluxcl_upload_seed(ranluxcl_state_t *ranluxclstate, __global float4 *RANLUXCLTab)
+void ranluxcl_upload_seed(ranluxcl_state_t *ranluxclstate, __global float4 *ranluxcltab)
 {
-	RANLUXCLTab[RANLUXCL_MYID + 0 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s01to04;         
-	RANLUXCLTab[RANLUXCL_MYID + 1 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s05to08;         
-	RANLUXCLTab[RANLUXCL_MYID + 2 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s09to12;         
-	RANLUXCLTab[RANLUXCL_MYID + 3 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s13to16;         
-	RANLUXCLTab[RANLUXCL_MYID + 4 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s17to20;         
-	RANLUXCLTab[RANLUXCL_MYID + 5 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s21to24;         
-	RANLUXCLTab[RANLUXCL_MYID + 6 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).carryin24stepnrnskip;
+	ranluxcltab[RANLUXCL_MYID + 0 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s01to04;         
+	ranluxcltab[RANLUXCL_MYID + 1 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s05to08;         
+	ranluxcltab[RANLUXCL_MYID + 2 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s09to12;         
+	ranluxcltab[RANLUXCL_MYID + 3 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s13to16;         
+	ranluxcltab[RANLUXCL_MYID + 4 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s17to20;         
+	ranluxcltab[RANLUXCL_MYID + 5 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).s21to24;         
+	ranluxcltab[RANLUXCL_MYID + 6 * RANLUXCL_NUMWORKITEMS] = (*ranluxclstate).carryin24stepnr;
 }
 
-#undef RANLUXCL_NUMWORKITEMS
-#undef RANLUXCL_MYID
-
-float ranluxcl_onestep_1(float4* vec1, float4* vec2, float4* carryin24stepnrnskip){
+float ranluxcl_onestep_1(float4* vec1, float4* vec2, float4* carryin24stepnr){
 	float uni, out;
-	uni = (*vec1).y - (*vec2).w - (*carryin24stepnrnskip).x;
+	uni = (*vec1).y - (*vec2).w - (*carryin24stepnr).x;
 	if(uni < 0.0f){
 		uni += 1.0f;
-		(*carryin24stepnrnskip).x = RANLUXCL_TWOM24;
-	} else (*carryin24stepnrnskip).x = 0.0f;
+		(*carryin24stepnr).x = RANLUXCL_TWOM24;
+	} else (*carryin24stepnr).x = 0.0f;
 	out = ((*vec2).w = uni);
 
 	if(uni < RANLUXCL_TWOM12) out += RANLUXCL_TWOM24 * (*vec1).y;
 	return out;
 }
 
-float ranluxcl_onestep_2(float4* vec1, float4* vec2, float4* carryin24stepnrnskip){
+float ranluxcl_onestep_2(float4* vec1, float4* vec2, float4* carryin24stepnr){
 	float uni, out;
-	uni = (*vec1).x - (*vec2).z - (*carryin24stepnrnskip).x;
+	uni = (*vec1).x - (*vec2).z - (*carryin24stepnr).x;
 	if(uni < 0.0f){
 		uni += 1.0f;
-		(*carryin24stepnrnskip).x = RANLUXCL_TWOM24;
-	} else (*carryin24stepnrnskip).x = 0.0f;
+		(*carryin24stepnr).x = RANLUXCL_TWOM24;
+	} else (*carryin24stepnr).x = 0.0f;
 	out = ((*vec2).z = uni);
 
 	if(uni < RANLUXCL_TWOM12) out += RANLUXCL_TWOM24 * (*vec1).x;
 	return out;
 }
 
-float ranluxcl_onestep_3(float4* vec1, float4* vec2, float4* carryin24stepnrnskip){
+float ranluxcl_onestep_3(float4* vec1, float4* vec2, float4* carryin24stepnr){
 	float uni, out;
-	uni = (*vec1).w - (*vec2).y - (*carryin24stepnrnskip).x;
+	uni = (*vec1).w - (*vec2).y - (*carryin24stepnr).x;
 	if(uni < 0.0f){
 		uni += 1.0f;
-		(*carryin24stepnrnskip).x = RANLUXCL_TWOM24;
-	} else (*carryin24stepnrnskip).x = 0.0f;
+		(*carryin24stepnr).x = RANLUXCL_TWOM24;
+	} else (*carryin24stepnr).x = 0.0f;
 	out = ((*vec2).y = uni);
 
 	if(uni < RANLUXCL_TWOM12) out += RANLUXCL_TWOM24 * (*vec1).w;
 	return out;
 }
 
-float ranluxcl_onestep_4(float4* vec1, float4* vec2, float4* carryin24stepnrnskip){
+float ranluxcl_onestep_4(float4* vec1, float4* vec2, float4* carryin24stepnr){
 	float uni, out;
-	uni = (*vec1).z - (*vec2).x - (*carryin24stepnrnskip).x;
+	uni = (*vec1).z - (*vec2).x - (*carryin24stepnr).x;
 	if(uni < 0.0f){
 		uni += 1.0f;
-		(*carryin24stepnrnskip).x = RANLUXCL_TWOM24;
-	} else (*carryin24stepnrnskip).x = 0.0f;
+		(*carryin24stepnr).x = RANLUXCL_TWOM24;
+	} else (*carryin24stepnr).x = 0.0f;
 	out = ((*vec2).x = uni);
 
 	if(uni < RANLUXCL_TWOM12) out += RANLUXCL_TWOM24 * (*vec1).z;
 
 	float4 out;
 
-	if((*ranluxclstate).carryin24stepnrnskip.z == 0.0f){
-		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		(*ranluxclstate).carryin24stepnrnskip.z += 4.0f;
+	if((*ranluxclstate).carryin24stepnr.z == 0.0f){
+		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		(*ranluxclstate).carryin24stepnr.z += 4.0f;
 	}
 
-	else if((*ranluxclstate).carryin24stepnrnskip.z == 4.0f){
-		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		(*ranluxclstate).carryin24stepnrnskip.z += 4.0f;
+	else if((*ranluxclstate).carryin24stepnr.z == 4.0f){
+		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		(*ranluxclstate).carryin24stepnr.z += 4.0f;
 	}
 
-	else if((*ranluxclstate).carryin24stepnrnskip.z == 8.0f){
-		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		(*ranluxclstate).carryin24stepnrnskip.z += 4.0f;
+	else if((*ranluxclstate).carryin24stepnr.z == 8.0f){
+		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		(*ranluxclstate).carryin24stepnr.z += 4.0f;
 	}
 
-	else if((*ranluxclstate).carryin24stepnrnskip.z == 12.0f){
-		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		(*ranluxclstate).carryin24stepnrnskip.z += 4.0f;
+	else if((*ranluxclstate).carryin24stepnr.z == 12.0f){
+		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		(*ranluxclstate).carryin24stepnr.z += 4.0f;
 	}
 
-	else if((*ranluxclstate).carryin24stepnrnskip.z == 16.0f){
-		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		(*ranluxclstate).carryin24stepnrnskip.z += 4.0f;
+	else if((*ranluxclstate).carryin24stepnr.z == 16.0f){
+		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		(*ranluxclstate).carryin24stepnr.z += 4.0f;
 	}
 
-	else if((*ranluxclstate).carryin24stepnrnskip.z == 20.0f){
-		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		(*ranluxclstate).carryin24stepnrnskip.z = 0.0f;
+	else if((*ranluxclstate).carryin24stepnr.z == 20.0f){
+		out.x = ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		out.y = ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		out.z = ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		out.w = ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		(*ranluxclstate).carryin24stepnr.z = 0.0f;
 
 //The below preprocessor directives are here to recover the simpler planar scheme when nskip is a multiple of 24.
 //For the most general planar shift approach, just ignore all #if's below.
 #ifndef RANLUXCL_PLANAR
 	}
 
-	(*&((*ranluxclstate).carryin24stepnrnskip)).y += 4.0f;
-	if((*&((*ranluxclstate).carryin24stepnrnskip)).y == 24.0f){
-		(*&((*ranluxclstate).carryin24stepnrnskip)).y = 0.0f;
+	(*&((*ranluxclstate).carryin24stepnr)).y += 4.0f;
+	if((*&((*ranluxclstate).carryin24stepnr)).y == 24.0f){
+		(*&((*ranluxclstate).carryin24stepnr)).y = 0.0f;
 #endif //RANLUXCL_PLANAR
 
-		int initialskips = (int)((*ranluxclstate).carryin24stepnrnskip.z) ? (24 - (int)((*ranluxclstate).carryin24stepnrnskip.z)) : 0;
-		int bulkskips = (((int)((*&((*ranluxclstate).carryin24stepnrnskip)).w) - initialskips)/24) * 24;
-		int remainingskips = (int)((*&((*ranluxclstate).carryin24stepnrnskip)).w) - initialskips - bulkskips;
+		int initialskips = (int)((*ranluxclstate).carryin24stepnr.z) ? (24 - (int)((*ranluxclstate).carryin24stepnr.z)) : 0;
+		int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24;
+		int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips;
 
 //We know there won't be any initial skips in the planar scheme
 #ifndef RANLUXCL_PLANAR
 		//Do initial skips (lack of breaks in switch is intentional).
 		switch(initialskips){
 			case(20):
-				ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
 			case(16):
-				ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
 			case(12):
-				ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
 			case(8):
-				ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
 			case(4):
-				ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
 		}
 #endif //RANLUXCL_PLANAR
 
 //Also check if we will ever need to skip at all
 #ifndef RANLUXCL_NOSKIP
 		for(int i=0; i<bulkskips/24; i++){
-			ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
+			ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
 		}
 #endif //RANLUXCL_NOSKIP
 
 #ifndef RANLUXCL_PLANAR
 		//Do remaining skips
 		if(remainingskips){
-			ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-			ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
+			ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+			ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
 			
 			if(remainingskips > 4){
-				ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
 			}
 
 			if(remainingskips > 8){
-				ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
 			}
 
 			if(remainingskips > 12){
-				ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
 			}
 
 			if(remainingskips > 16){
-				ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-				ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
+				ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+				ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
 			}
 		}
 #endif //RANLUXCL_PLANAR
 
 		//Initial skips brought stepnr down to 0. The bulk skips did only full cycles.
 		//Therefore stepnr is now equal to remainingskips.
-		(*ranluxclstate).carryin24stepnrnskip.z = (float)remainingskips;
+		(*ranluxclstate).carryin24stepnr.z = (float)remainingskips;
 	}
 
 	return out;
 	//execution across all work-items.
 
 	//Do necessary number of calls to ranluxcl so that stepnr == 0 at the end.
-	if((*ranluxclstate).carryin24stepnrnskip.z == 4.0f)
+	if((*ranluxclstate).carryin24stepnr.z == 4.0f)
 		ranluxcl(ranluxclstate);
-	if((*ranluxclstate).carryin24stepnrnskip.z == 8.0f)
+	if((*ranluxclstate).carryin24stepnr.z == 8.0f)
 		ranluxcl(ranluxclstate);
-	if((*ranluxclstate).carryin24stepnrnskip.z == 12.0f)
+	if((*ranluxclstate).carryin24stepnr.z == 12.0f)
 		ranluxcl(ranluxclstate);
-	if((*ranluxclstate).carryin24stepnrnskip.z == 16.0f)
+	if((*ranluxclstate).carryin24stepnr.z == 16.0f)
 		ranluxcl(ranluxclstate);
-	if((*ranluxclstate).carryin24stepnrnskip.z == 20.0f)
+	if((*ranluxclstate).carryin24stepnr.z == 20.0f)
 		ranluxcl(ranluxclstate);
 }
 
 void ranluxcl_warmup(ranluxcl_state_t *ranluxclstate){
 	//This function "warms up" the generator: it steps the state pointed to by
 	//ranluxclstate forward and keeps none of the generated values, so that
 	//the starting values are completely decorrelated. Nothing is returned;
 	//the state members are handed by pointer to the ranluxcl_onestep_* helpers.
-	//It should be called once after ranluxcl_initialization in host code, after
-	//the first call the sequences will stay uncorrelated.
 
 	//16 is a "magic number". It is the number of times we must generate
 	//a batch of 24 numbers to ensure complete decorrelation.
 	//Each loop iteration below is one such batch: six groups of four
 	//ranluxcl_onestep_1..4 calls, i.e. 24 steps.
 	for(int i=0; i<16; i++){
-		ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
-		ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnrnskip));
+		ranluxcl_onestep_1(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_2(&((*ranluxclstate).s09to12), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_3(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_4(&((*ranluxclstate).s05to08), &((*ranluxclstate).s21to24), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_1(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_2(&((*ranluxclstate).s05to08), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_3(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_4(&((*ranluxclstate).s01to04), &((*ranluxclstate).s17to20), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_1(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_2(&((*ranluxclstate).s01to04), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_3(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_4(&((*ranluxclstate).s21to24), &((*ranluxclstate).s13to16), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_1(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_2(&((*ranluxclstate).s21to24), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_3(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_4(&((*ranluxclstate).s17to20), &((*ranluxclstate).s09to12), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_1(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_2(&((*ranluxclstate).s17to20), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_3(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_4(&((*ranluxclstate).s13to16), &((*ranluxclstate).s05to08), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_1(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_2(&((*ranluxclstate).s13to16), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_3(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
+		ranluxcl_onestep_4(&((*ranluxclstate).s09to12), &((*ranluxclstate).s01to04), &((*ranluxclstate).carryin24stepnr));
 	}
 
 }
 
+void ranluxcl_initialization(int ins, global float4 *ranluxcltab)
+{
+	//Builds the initial generator state in the local variable rst and stores
+	//it with ranluxcl_upload_seed.
+	//  ins         - integer seed; negative values are clamped to 0.
+	//  ranluxcltab - global float4 buffer handed to ranluxcl_upload_seed.
+	//The 24 seed values are produced by a linear congruential generator,
+	//js = 40014*js mod IC, evaluated through the k = js/53668 split
+	//(53668*40014 + 12211 == IC) so the product never leaves the int range.
+	//Unless RANLUXCL_NO_WARMUP is defined, the state is then advanced through
+	//16 batches of 24 steps before being stored.
+#define IC 2147483563
+#define MAXLEV 4
+
+	int scaledins, k, itwo24 = 16777216, maxWorkitems;
+
+	ranluxcl_state_t rst;
+
+	//Make sure ins isn't negative.
+	if(ins < 0)
+		ins = 0;
+
+	#ifdef RANLUXCL_MAXWORKITEMS
+	maxWorkitems = RANLUXCL_MAXWORKITEMS;
+	#else
+	maxWorkitems = RANLUXCL_NUMWORKITEMS;
+	#endif
+
+
+	//ins is scaled so that if the user makes another call somewhere else
+	//with ins + 1 there should be no overlap. Also adding one
+	//allows us to use ins = 0.
+	//NOTE(review): this is plain int arithmetic, so a large ins can overflow;
+	//keep ins relatively small.
+	scaledins = ins * maxWorkitems + 1;
+
+	//RANLUXCL_MYID is defined outside this function; presumably it is the
+	//index of the calling work-item - TODO confirm.
+	int js = scaledins + RANLUXCL_MYID;
+
+	//Make sure js is not too small (should really be an error)
+	if(js < 1)
+		js = 1;
+
+	//Each line advances the LCG once and stores its low 24 bits (js % 2^24)
+	//scaled by RANLUXCL_TWOM24 as the next of the 24 seeds.
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s01to04.x=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s01to04.y=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s01to04.z=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s01to04.w=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s05to08.x=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s05to08.y=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s05to08.z=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s05to08.w=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s09to12.x=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s09to12.y=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s09to12.z=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s09to12.w=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s13to16.x=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s13to16.y=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s13to16.z=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s13to16.w=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s17to20.x=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s17to20.y=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s17to20.z=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s17to20.w=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s21to24.x=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s21to24.y=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s21to24.z=(js%itwo24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s21to24.w=(js%itwo24)*RANLUXCL_TWOM24;
+
+	//The carry starts at 0, or at RANLUXCL_TWOM24 when the last seed is exactly 0.
+	rst.carryin24stepnr.x = 0.0f; //carry
+	if(rst.s21to24.w == 0.0f)
+		rst.carryin24stepnr.x = RANLUXCL_TWOM24;
+
+	rst.carryin24stepnr.y = 0.0f; //in24
+	rst.carryin24stepnr.z = 0.0f; //stepnr
+
+	#ifndef RANLUXCL_NO_WARMUP
+	//Warming up the generator, ensuring there are no initial correlations.
+	//16 is a "magic number". It is the number of times we must generate
+	//a batch of 24 numbers to ensure complete decorrelation.
+	//The loop works on the local state rst: this function has no
+	//ranluxclstate pointer, the state being initialized lives in rst.
+	for(int i=0; i<16; i++){
+		ranluxcl_onestep_1(&(rst.s09to12), &(rst.s21to24), &(rst.carryin24stepnr));
+		ranluxcl_onestep_2(&(rst.s09to12), &(rst.s21to24), &(rst.carryin24stepnr));
+		ranluxcl_onestep_3(&(rst.s05to08), &(rst.s21to24), &(rst.carryin24stepnr));
+		ranluxcl_onestep_4(&(rst.s05to08), &(rst.s21to24), &(rst.carryin24stepnr));
+		ranluxcl_onestep_1(&(rst.s05to08), &(rst.s17to20), &(rst.carryin24stepnr));
+		ranluxcl_onestep_2(&(rst.s05to08), &(rst.s17to20), &(rst.carryin24stepnr));
+		ranluxcl_onestep_3(&(rst.s01to04), &(rst.s17to20), &(rst.carryin24stepnr));
+		ranluxcl_onestep_4(&(rst.s01to04), &(rst.s17to20), &(rst.carryin24stepnr));
+		ranluxcl_onestep_1(&(rst.s01to04), &(rst.s13to16), &(rst.carryin24stepnr));
+		ranluxcl_onestep_2(&(rst.s01to04), &(rst.s13to16), &(rst.carryin24stepnr));
+		ranluxcl_onestep_3(&(rst.s21to24), &(rst.s13to16), &(rst.carryin24stepnr));
+		ranluxcl_onestep_4(&(rst.s21to24), &(rst.s13to16), &(rst.carryin24stepnr));
+		ranluxcl_onestep_1(&(rst.s21to24), &(rst.s09to12), &(rst.carryin24stepnr));
+		ranluxcl_onestep_2(&(rst.s21to24), &(rst.s09to12), &(rst.carryin24stepnr));
+		ranluxcl_onestep_3(&(rst.s17to20), &(rst.s09to12), &(rst.carryin24stepnr));
+		ranluxcl_onestep_4(&(rst.s17to20), &(rst.s09to12), &(rst.carryin24stepnr));
+		ranluxcl_onestep_1(&(rst.s17to20), &(rst.s05to08), &(rst.carryin24stepnr));
+		ranluxcl_onestep_2(&(rst.s17to20), &(rst.s05to08), &(rst.carryin24stepnr));
+		ranluxcl_onestep_3(&(rst.s13to16), &(rst.s05to08), &(rst.carryin24stepnr));
+		ranluxcl_onestep_4(&(rst.s13to16), &(rst.s05to08), &(rst.carryin24stepnr));
+		ranluxcl_onestep_1(&(rst.s13to16), &(rst.s01to04), &(rst.carryin24stepnr));
+		ranluxcl_onestep_2(&(rst.s13to16), &(rst.s01to04), &(rst.carryin24stepnr));
+		ranluxcl_onestep_3(&(rst.s09to12), &(rst.s01to04), &(rst.carryin24stepnr));
+		ranluxcl_onestep_4(&(rst.s09to12), &(rst.s01to04), &(rst.carryin24stepnr));
+	}
+	#endif //RANLUXCL_NO_WARMUP
+
+	//Upload the state
+	ranluxcl_upload_seed(&rst, ranluxcltab);
+}
+
 #undef RANLUXCL_TWOM24
 #undef RANLUXCL_TWOM12
+#undef RANLUXCL_NUMWORKITEMS
+#undef RANLUXCL_MYID
 #endif //RANLUXCL_CL

ranluxcl.h

-#pragma once
-#ifndef RANLUXCL_HPP
-#define RANLUXCL_HPP
-
-/**** RANLUXCL v1.2.0 ****************************************
-
-***** GENERAL USAGE ************************************************************
-
-ranluxcl_initialization returns a pointer to an array of cl_float4, with size 
-(i.e. number of bytes) as returned in the RANLUXCLTabSize variable. This is the
-seeds table that must be transferred to the OpenCL device.
-
-lux is the "luxury value" of the generator, and should be 0-4, where 0 is fastest
-and 4 produces the best numbers. It can also be >=24, in which case it directly
-sets the p-value of RANLUXCL (see details below).
-
-ins is the seed which initializes RANLUXCL. It should be >= 0. If several devices
-are used in parallel initializing each device's RANLUXTab with different ins
-will ensure that all work-items are generating uncorrelated sequences. Just be
-sure that maxWorkitems (described below) is set correctly.
-
-numWorkitems is the number of generators to initialize, usually corresponding
-to the number of work-items in the NDRange RANLUXCL will be used with.
-
-maxWorkitems should reflect the maximum number of work-items that will be used
-on any parallel instance of RANLUXCL. So for instance if we are launching 5120
-work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be
-generated by calling ranluxcl_initialization with numWorkitems = 5120 while 
-GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must 
-be at least 10240 for both GPU1 and GPU2, and it must be set to the same value
-for both.
-
-nskip will return an integer which can (optionally) be defined in the kernel code
-as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting
-0 and 1, or when the p-value is manually set to a multiple of 24 (more on this in
-the implementation details below).
-
-RANLUXCLTabSize returns the number of bytes allocated for the entire float4 seeds 
-array, for instance for use when writing the seeds table to the device.
-
-Please note that you MUST "warm up" the generators before using them for any
-important computations. This is achieved by calling the ranluxcl_warmup
-function in a kernel. See top comments in ranluxcl.cl for more information.
-
-***** IMPLEMENTATION DETAILS *******************************************************
-
-A linear congruential generator initializes the seeds array. The seeds can be initialized 
-in other ways as well, but this is the approach taken in the original code, which
-has been extensively used and tested by others.
-
-If lux is 0, 1, 2, 3 or 4, it is taken to be the luxury value, where 4 produces 
-the best numbers. If lux is 24 or above, it is taken as the "p-value", where the 
-number of values to throw away (nskip) is p - 24. For this OpenCL implementation, 
-p must be a multiple of either 24 or 4, for the planar and planar shift variants 
-respectively. If p is a multiple of 24 and RANLUXCL_NSKIP is set equal to nskip
-in the kernel code, the planar scheme is recovered, which might be slightly faster.
-
-This implementation is somewhat modified from the original fortran implementation 
-by F. James. Because of the way the OpenCL code is optimized with 4-component 
-32-bit float vectors, it is most convenient to always throw away some multiple 
-of 24 values (i.e. p is always a multiple of 24). However, there might be some 
-resonances if we always throw away a multiple of the seeds table size. Therefore 
-the implementation is slightly more intricate where p can be a multiple of 4 
-instead, at a cost to performance (only about 10% lower than the cleaner 24 values 
-approach on AMD Cypress). These two approaches are termed planar and planar 
-shift respectively.
-
-Below the p-values for the original reference implementation are listed along with 
-those of the planar shift implementation. Suggested values for the planar approach 
-are also presented. When this function is called with lux 0-4, the planar shift 
-values are used. To use the pure planar approach, set lux equal to the specific 
-p-value.
-	
-Luxury setting (lux):                            0   1   2   3   4
-Original fortran77 implementation by F. James:  24  48  97  223 389
-Planar (suggested):                             24  48  120 240 408
-Planar shift:                                   24  48  100 224 404
-
-Note that levels 0 and 1 are the same as in the original implementation for both
-planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen 
-for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if 
-it is considered important to only use "official" values, luxury settings 0, 1 or 
-4 of planar shift should be used. It is however unlikely that the other values are 
-bad, they just haven't been as extensively used and tested by others.
-
-Variable names are generally the same as in the fortran77 implementation, however 
-because of the way the generator is implemented, the i24 and j24 variables are 
-no longer needed.
-
-***** CREDIT ***********************************************************************
-
-I have been told by Fred James (the coder) that the original Fortran 77 
-implementation (which is the subject of the second paper below) is free to use and 
-share. Therefore I am using the permissive MIT license (below). But most importantly 
-please always remember to give credit to the two articles by Martin Luscher and 
-Fred James, describing the generator and the fortran 77 implementation on which this 
-implementation is based, respectively:
-
-Martin Lüscher, A portable high-quality random number generator for lattice 
-field theory simulations, Computer Physics Communications 79 (1994) 100-110
-
-F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom 
-number generator of Lüscher, Computer Physics Communications 79 (1994) 111-114
-
-***** LICENSE **********************************************************************
-
-Copyright (c) 2011 Ivar Ursin Nikolaisen
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this 
-software and associated documentation files (the "Software"), to deal in the Software 
-without restriction, including without limitation the rights to use, copy, modify, 
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 
-permit persons to whom the Software is furnished to do so, subject to the following 
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies 
-or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 
-PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
-CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
-OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-***************************************************************************************/
-
-//Use C++ headers
-#ifdef __cplusplus
-//This file only relies on the cl_float4, cl_int and cl_float datatypes from the OpenCL header.
-#include <CL/cl.hpp>
-#include <cstdlib>
-#include <cstdarg>
-#include <cstdio>
-
-//Else assume C
-#else
-#include <CL/cl.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <stdio.h>
-#endif //__cplusplus
-
-#define numFloat4PerWorkitem 7
-
-cl_float4 *ranluxcl_initialization(cl_int lux, //Luxury level 0-4, or a p-value in [24, 2000] with (p - 24) divisible by 4
-	                               cl_int ins, //Seed, must be >= 0
-	                               size_t numWorkitems, //Number of generators (table columns) to initialize
-	                               size_t maxWorkitems, //Scales ins so that different ins values give disjoint seed ranges; must be >= numWorkitems
-	                               cl_int *nskip, //Out: number of values to throw away (p - 24)
-	                               size_t *RANLUXCLTabSize) //Out: size in bytes of the returned table
-{
-#ifdef __cplusplus
-	using namespace std;
-#endif //__cplusplus
-
-	const cl_int maxlev = 4, icons = 2147483563, itwo24 = 1<<24, pMustBeDivisibleBy = 4;
-	const cl_int ndskip[5]={0,24,76,200,380}; //nskip for luxury levels 0-4
-	const cl_float twom24 = (cl_float)0.000000059604644775; //2^-24
-	cl_int scaledins, i;
-	*RANLUXCLTabSize = sizeof(cl_float4) * numFloat4PerWorkitem * numWorkitems;
-
-	cl_float4 *RANLUXCLTab = (cl_float4*)malloc(*RANLUXCLTabSize);
-	if(RANLUXCLTab == NULL){
-		fprintf(stderr, "Error in ranluxcl_initialization: failed to allocate memory for RANLUXCLTab. Exiting.\n");
-		exit(1);
-	}
-	
-	//Interpret the lux parameter: as a luxury setting if 0, 1, 2, 3 or 4, or as a p-value if
-	//between 24 and 2000 (2000 and up is not likely to be intended by the programmer).
-	if(lux < 0){
-		fprintf(stderr, "Error in ranluxcl_initialization: illegal luxury level: %d, luxury cannot be negative. Exiting\n", lux);
-		exit(1);
-	}
-	else if(lux <= maxlev)
-		*nskip = ndskip[lux];
-	else if(lux >= 24 && lux <= 2000 && !((lux - 24) % pMustBeDivisibleBy))
-		*nskip = lux - 24;
-	else{
-		fprintf(stderr, "Error in ranluxcl_initialization: illegal luxury level: %d\n", lux);
-		fprintf(stderr, "lux must be either 0-4, or lux >= 24. If lux >= 24 it is interpreted as the\n");
-		fprintf(stderr, "p-value, which must be divisible by %d. Exiting.\n", pMustBeDivisibleBy);
-		exit(1);
-	}
-
-	//Check that ins is non-negative (zero is allowed)
-	if(ins < 0){
-		fprintf(stderr, "Error in ranluxcl_initialization: ins is negative. Exiting.\n");
-		exit(1);
-	}
-
-	//ins is scaled so that if the user makes another call somewhere else with ins + 1 there should be no overlap. Also adding one
-	//allows us to use ins = 0.
-	scaledins = ins * (cl_int)maxWorkitems + 1; //NOTE(review): this cl_int product can overflow before the range check below sees it
-
-	//Check that we won't have seeds too high for the LCG used to initialize
-	//cl_ulong highestRequestedSeed = (cl_ulong)(ins + 1) * (cl_ulong)maxWorkitems;
-	cl_ulong highestRequestedSeed = (cl_ulong)scaledins + (cl_ulong)numWorkitems;
-	if(highestRequestedSeed >= (cl_ulong)icons){
-		fprintf(stderr, "Error in ranluxcl_initialization: combination of ins and maxWorkitems is too high. Exiting.\n");
-		exit(1);
-	}
-
-	//Check that numWorkitems isn't larger than maxWorkitems (could mean the user is getting identical sequences without knowing it!)
-	if(numWorkitems > maxWorkitems){
-		fprintf(stderr, "Error in ranluxcl_initialization: numWorkitems is larger than maxWorkitems. Exiting.\n");
-		exit(1);
-	}
-
-	size_t workitem;
-	for(workitem = 0; workitem < numWorkitems; workitem++){
-		cl_int jseed = scaledins + (cl_int)workitem; //Each work-item starts the LCG from its own consecutive seed
-
-		cl_int k;
-		cl_int iseeds[24];
-		cl_float seeds[numFloat4PerWorkitem * 4];
-
-		for(i=0; i<24; i++){ //One LCG step per seed value
-			k = jseed / 53668;
-			jseed = 40014 * (jseed-k*53668) - k * 12211; //Step split into quotient/remainder parts (presumably Schrage's method, to avoid overflow)
-			if (jseed < 0)
-				jseed = jseed + icons;
- 			iseeds[i] = jseed % itwo24; //Keep the value modulo 2^24
-		}
-	
-		for(i=0; i<24; i++)
-			seeds[i] = iseeds[i] * twom24; //Scale the 24-bit integer to a float in [0, 1)
-
-		seeds[24] = 0.0; //carry
-
-		if(seeds[23] == 0.0)
-			seeds[24] = twom24;
-
-		seeds[25] = 0; //in24
-		seeds[26] = 0; //stepnr
-		seeds[27] = (cl_float)(*nskip); //nskip
-
-		//Transfer seeds to RANLUXCLTab. The table is strided: float4 number i of a work-item is stored at index workitem + numWorkitems * i.
-		for(i = 0; i < numFloat4PerWorkitem; i++){
-			RANLUXCLTab[workitem + numWorkitems * i].s[0] = seeds[4 * i + 0];
-			RANLUXCLTab[workitem + numWorkitems * i].s[1] = seeds[4 * i + 1];
-			RANLUXCLTab[workitem + numWorkitems * i].s[2] = seeds[4 * i + 2];
-			RANLUXCLTab[workitem + numWorkitems * i].s[3] = seeds[4 * i + 3];
-		}
-	}
-
-	return RANLUXCLTab; //Allocated with malloc above and not freed in this function
-}
-
-#undef numFloat4PerWorkitem
-#endif //RANLUXCL_HPP
 #include <cstdlib>
 #include <cmath>
 
-//Custom headers
-#include "ranluxcl.h"
+int globalLuxury; //Just relevant for checking correctness of implementation in this specific program
 
-int pvalue; //Just relevant for checking correctness of implementation in this specific program
+template <class T> //T must be streamable into a std::ostringstream with operator<<
+inline std::string to_string(T x){ //Returns the text obtained by streaming x
+	std::ostringstream o;
+	o << std::setprecision(16) << x; //Precision 16 applies to floating-point output; integers print unchanged
+	return o.str();
+}
 
 //iTimer is a simple class to make timing events easy. It should provide microsecond accuracy
 //on both Windows and *nix systems.
 	          << "ranluxcltest.exe 4 1 1 1\n"
 	          << "\nCorrectness of final numbers will be checked if lux is 0-4.\n";
 }
-void Compile_OpenCL_Code(std::string BuildOptions,
-	                     std::string FileName,
-	                     std::string& BuildLog,
-	                     cl::Context context,
-	                     cl::Program &program,
-	                     std::vector <cl::Device>
-	                     devices,
-	                     cl_int deviceNr){
+
+void Compile_OpenCL_Code(
+	std::string BuildOptions,
+	std::string FileName,
+	cl::Context &context,
+	cl::Program &program,
+	cl_uint verbosity)
+{
 	//This function builds the OpenCL kernels and functions using the provided build options.
-	cl_int err;
+	//If verbosity is >0 the build log is printed.
+	cl_int err, err2;
+	cl::STRING_CLASS buildLog;
 
 	//Getting source code
 	std::ifstream file(FileName.c_str());
 	std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
 	cl::Program::Sources source(1, std::make_pair(prog.c_str(), prog.length()+1));
 	program = cl::Program(context, source);
+	file.close();
+
+	//Get vector of devices in context
+	VECTOR_CLASS <cl::Device> deviceList;
+	err = context.getInfo(CL_CONTEXT_DEVICES, &deviceList);
+	checkErr(err, "cl::Context::getInfo()");
 
 	//Building source code
-	err = program.build(devices, BuildOptions.c_str());
-	checkErr(file.is_open() ? CL_SUCCESS : -1, "Program::build()");
-	program.getBuildInfo(devices[deviceNr], CL_PROGRAM_BUILD_LOG, &BuildLog);
+	err2 = program.build(deviceList, BuildOptions.c_str());
+		
+	//Fetch build log for first device
+	err = program.getBuildInfo(deviceList[0], CL_PROGRAM_BUILD_LOG, &buildLog);
+	checkErr(err, "getBuildInfo");
+
+	if(verbosity)
+		std::cout << "Build log for " << FileName << ":\n" << buildLog << "\n";
+
+	checkErr(err2, "cl::Program::build()");
 }
+
 void OpenCL_Initializations(cl::Context &context, 
 	                        std::vector <cl::Device> &devices,
 	                        cl::CommandQueue &queue,
 		NumValsToCheck = numRefFileValsPerpval;
 
 	//The values of each of the five p-values are stored one after the other. This sets the offset we need to index from.
-	if(pvalue == 24) offset = 0;
-	else if(pvalue == 48)  offset = 1;
-	else if(pvalue == 100) offset = 2;
-	else if(pvalue == 224) offset = 3;
-	else if(pvalue == 404) offset = 4;
+	if(globalLuxury == 0) offset = 0;
+	else if(globalLuxury == 1) offset = 1;
+	else if(globalLuxury == 2) offset = 2;
+	else if(globalLuxury == 3) offset = 3;
+	else if(globalLuxury == 4) offset = 4;
 	else{
-		std::cout << "p-value not checkable, correctness check will not be performed\n\n";
+		std::cout << "luxury value not checkable, correctness check will not be performed\n\n";
 		doCorrectnessCheck = 0;
 	}
 
 		          << "The averages of the reference and generated numbers (should be equal): " << avgRef << " and " << avgGen << "\n\n";
 	}
 }
-void MC_Cycles(size_t WorkgroupSize,
-               size_t NumWorkitems,
-               size_t RANLUXCLTabSize,
-               cl_int KernelCycles,
-               cl_int NumIterations,
-               cl::Context context, 
-               cl::CommandQueue queue, 
-               cl::Program program,
-               cl_float4 *RANLUXCLTab,
-               cl_float *PRNs,
-               double &genTime)
-{
-	cl_int err;
-	
-	//Setting kernel
-	cl::Kernel Kernel_PRN = cl::Kernel(program, "Kernel_PRN", &err); checkErr(err, "Kernel::Kernel()");
 
-	//Creating OpenCL buffers
-	cl::Buffer Buffer_RANLUXCLTab(context, CL_MEM_READ_WRITE, RANLUXCLTabSize, NULL, &err);
-	checkErr(err, "Buffer:RANLUXCLTab");
-	cl::Buffer Buffer_PRNs(context, CL_MEM_READ_WRITE, NumWorkitems * sizeof(cl_float), NULL, &err);
-	checkErr(err, "Buffer:PRNs");
-
-	//Writing data from arrays to OpenCL buffers
-	err = queue.enqueueWriteBuffer(Buffer_RANLUXCLTab, 
-	                               CL_TRUE, 
-	                               0, 
-	                               RANLUXCLTabSize, 
-	                               RANLUXCLTab); 
-	checkErr(err, "Write RANLUXCL Buffer");
-
-	//Setting arguments to kernels
-	err = Kernel_PRN.setArg(0, KernelCycles); checkErr(err, "Kernel::setArg()");
-	err = Kernel_PRN.setArg(1, Buffer_RANLUXCLTab); checkErr(err, "Kernel::setArg()");
-	err = Kernel_PRN.setArg(2, Buffer_PRNs); checkErr(err, "Kernel::setArg()");
-
-	iTimer genTimer;
-
-	for(int Iteration=0; Iteration<NumIterations; Iteration++){
-		err = queue.enqueueNDRangeKernel(Kernel_PRN, 
-		                                 cl::NullRange, 
-		                                 cl::NDRange(NumWorkitems), 
-		                                 cl::NDRange(WorkgroupSize));
-		checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
-	}
-
-	//Just call finish after all commands have been entered into queue
-	err = queue.finish(); checkErr(err, "queue.finish()");
-
-	genTimer.End();
-	genTime = genTimer.Time();
-
-	
-	//Reading final numbers generated
-	err = queue.enqueueReadBuffer(Buffer_PRNs, 
-	                              CL_TRUE, 
-	                              0, 
-	                              NumWorkitems * sizeof(cl_float), 
-	                              PRNs);
-	checkErr(err, "Read PRNs Buffer");
-
-	//Reading RANLUXCLTabs back into RAM (needed if we were to launch another run later, which we won't do in this program)
-	err = queue.enqueueReadBuffer(Buffer_RANLUXCLTab, CL_TRUE, 0, RANLUXCLTabSize, RANLUXCLTab);
-	checkErr(err, "Read RANLUXCL Buffer");
-
-	//Check correctness
-	CheckCorrectness(NumWorkitems, KernelCycles, NumIterations, PRNs);
-}
 int main(int argc, char *argv[])
 {
 	iTimer totalTimer;
 	double genTime;
+	cl_int err;
 	cl_ulong TotalNumbersGenerated;
 
 	//OpenCL variables
 	if(argc > 5) deviceNr = atoi(argv[5]);
 	if(argc > 6) platformNr = atoi(argv[6]);
 
+	globalLuxury = lux;
+
 	//Different settings when using CPU
 	if(!UseGPU){
 		WorkgroupSize = WorkgroupSizeCPU;
 
 	TotalNumbersGenerated = cl_ulong(KernelCycles) * cl_ulong(NumWorkitems) * cl_ulong(NumIterations);
 
+	//Size of ranluxcltab buffer is seven float4 per work-item.
+	size_t RANLUXCLTabSize = NumWorkitems * 7 * sizeof(cl_float4);
+
 	if(UseGPU) std::cout << "Using GPU\n";
 	else std::cout << "Using CPU\n";
 
-	//Creating and initializing state array for PRNGs
-	cl_int nskip;
-	size_t RANLUXCLTabSize, maxWorkitems = NumWorkitems;
-	cl_float4 *RANLUXCLTab = ranluxcl_initialization(lux, 0, NumWorkitems, maxWorkitems, &nskip, &RANLUXCLTabSize);
-
-	//Set the global pvalue, which is only used for the correctness check of this program.
-	pvalue = nskip+24;
-	std::cout << "p-value = " << pvalue << "\n\n";
-
 	cl_float* PRNs = new cl_float [NumWorkitems];
 
 	OpenCL_Initializations(context, devices, queue, PrintOpenCLInfo, 0, UseGPU, deviceNr, platformNr);
 	std::string BuildOptions;
 	std::string BuildLog;
 	std::string FileName = "ranluxcltest_kernels.cl";
+
 	BuildOptions += "-I . "; //Search for include files in current directory
-	Compile_OpenCL_Code(BuildOptions, FileName, BuildLog, context, program, devices, deviceNr);
-	std::cout << "Done\n";
-	if(ShowBuildLog == 1) std::cout << "\nBuild log: \n" << BuildLog << "\n\n";
 
-	//Running calculations
-	MC_Cycles(WorkgroupSize,
-	          NumWorkitems,
-	          RANLUXCLTabSize,
-	          KernelCycles,
-	          NumIterations,
-	          context, 
-	          queue, 
-	          program,
-	          RANLUXCLTab, 
-	          PRNs,
-	          genTime);
+	//Used to get exact same sequence as original implementation (for correctness check). Should generally
+	//NOT be set in other programs, to ensure parallel generators aren't in initially correlated states.
+	BuildOptions += " -D RANLUXCL_NO_WARMUP ";
+
+	//Set luxury value. If this is not defined the highest (4) is used by default.
+	BuildOptions += " -D RANLUXCL_LUX=" + to_string(lux);
+
+	Compile_OpenCL_Code(BuildOptions, FileName, context, program, 1);
+
+	//Setting kernels
+	cl::Kernel Kernel_RanluxclInit = cl::Kernel(program, "Kernel_Ranluxcl_Init", &err); checkErr(err, "cl::Kernel Kernel_RanluxclInit");
+	cl::Kernel Kernel_PRN = cl::Kernel(program, "Kernel_PRN", &err); checkErr(err, "cl::Kernel kernel_RanluxclInit");
+
+	//Creating OpenCL buffers
+	cl::Buffer Buffer_RANLUXCLTab(context, CL_MEM_READ_WRITE, RANLUXCLTabSize, NULL, &err);
+	checkErr(err, "Buffer:RANLUXCLTab");
+	cl::Buffer Buffer_PRNs(context, CL_MEM_READ_WRITE, NumWorkitems * sizeof(cl_float), NULL, &err);
+	checkErr(err, "Buffer:PRNs");
+
+	//Setting arguments to kernels
+	err = Kernel_PRN.setArg(0, KernelCycles); checkErr(err, "Kernel::setArg()");
+	err = Kernel_PRN.setArg(1, Buffer_RANLUXCLTab); checkErr(err, "Kernel::setArg()");
+	err = Kernel_PRN.setArg(2, Buffer_PRNs); checkErr(err, "Kernel::setArg()");
+
+	err = Kernel_RanluxclInit.setArg(0, 0); checkErr(err, "Kernel::setArg()");
+	err = Kernel_RanluxclInit.setArg(1, Buffer_RANLUXCLTab); checkErr(err, "Kernel::setArg()");
+
+	//Initialize the generator
+	err = queue.enqueueNDRangeKernel(Kernel_RanluxclInit, cl::NullRange, cl::NDRange(NumWorkitems), cl::NDRange(WorkgroupSize));
+	checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
+
+	iTimer genTimer;
+
+	for(int Iteration=0; Iteration<NumIterations; Iteration++){
+		err = queue.enqueueNDRangeKernel(Kernel_PRN, 
+		                                 cl::NullRange, 
+		                                 cl::NDRange(NumWorkitems), 
+		                                 cl::NDRange(WorkgroupSize));
+		checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
+	}
+
+	//Just call finish after all commands have been entered into queue
+	err = queue.finish(); checkErr(err, "queue.finish()");
+
+	genTimer.End();
+	genTime = genTimer.Time();
+	
+	//Reading final numbers generated
+	err = queue.enqueueReadBuffer(Buffer_PRNs, 
+	                              CL_TRUE, 
+	                              0, 
+	                              NumWorkitems * sizeof(cl_float), 
+	                              PRNs);
+	checkErr(err, "Read PRNs Buffer");
+
+	//Check correctness
+	CheckCorrectness(NumWorkitems, KernelCycles, NumIterations, PRNs);
 
 	totalTimer.End();
 

ranluxcltest_kernels.cl

 //pasted here at the top instead.
 #include "ranluxcl.cl"
 
+__kernel void Kernel_Ranluxcl_Init( //Thin kernel wrapper that forwards its arguments to ranluxcl_initialization
+	private int ins, //Seed, passed through unchanged
+	global float4 *ranluxcltab) //Generator state table in global memory, passed through unchanged
+{
+	ranluxcl_initialization(ins, ranluxcltab);
+}
+
 __kernel void Kernel_PRN(private int KernelCycles,
-                         global float4* RANLUXCLTab,
+                         global float4* ranluxcltab,
                          global float* PRNs)
 {
-	//Downloading RANLUXCLTab. The state of RANLUXCL is stored in ranluxclstate.
+	//Downloading ranluxcltab. The state of RANLUXCL is stored in ranluxclstate.
 	ranluxcl_state_t ranluxclstate;
-	ranluxcl_download_seed(&ranluxclstate, RANLUXCLTab);
+	ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
 
 	float4 randomnr;
 
 	//Uploading only last number generated.
 	PRNs[get_global_id(0)] = randomnr.w;
 
-	//Uploading RANLUXCLTab
-	ranluxcl_upload_seed(&ranluxclstate, RANLUXCLTab);
+	//Uploading ranluxcltab
+	ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
 }