Commits

Rio Yokota committed 1dbe25b

Added GPU kernels from hacked Bonsai.

Files changed (79)

+CUDA_INSTALL_PATH  = /usr/local/cuda
+
+CXX = g++ -fopenmp -O3
+NVCC = nvcc --compiler-bindir=/usr/bin/g++-4.4 -arch sm_21 -use_fast_math -Iinclude -Xcompiler "-fopenmp -O3"
+
+ORG = approximate.o \
+	main.o \
+	scanKernels.o \
+	tree.o
+TORG = sortKernels.th_o
+
+.SUFFIXES: .o .cpp .cu
+
+all: $(ORG) $(TORG)
+	$(CXX) $^ -lcudart -L$(CUDA_INSTALL_PATH)/lib64
+	./a.out
+
+%.o: %.cu
+	$(NVCC) -c $< -o $@
+
+%.th_o: %.cu
+	$(NVCC) -c $< -o $@
+
+$(ORG): include/*.h
+
+clean:
+	find . -name "*.o" -o -name "*.out*" | xargs rm -rf
+cleanall:
+	make clean
+	rm *.th_o
+commit:
+	@make -C .. commit
+save:
+	@make -C .. save
+revert:
+	@make -C .. revert

gpu/approximate.cu

+#include "octree.h"
+#define laneId (threadIdx.x & (WARP_SIZE - 1))
+#define warpId (threadIdx.x >> WARP_SIZE2)
+#define IF(x) (-(int)(x))
+#define ABS(x) ((int(x) < 0 ) ? -(x) : (x))
+
+__device__ __forceinline__ int inclusiveScanInt(int* prefix, int value) 
+{
+  prefix[laneId] = value;
+  for (int i = 0; i < WARP_SIZE2; i++) {
+    const int offset = 1 << i;
+    const int laneOffset = ABS(laneId-offset);
+    prefix[laneId] += prefix[laneOffset] & IF(laneId >= offset);
+  }
+  return prefix[WARP_SIZE-1];
+}
+
+__device__ __forceinline__ int lanemask_lt()
+{
+  int mask;
+  asm("mov.u32 %0, %lanemask_lt;" : "=r" (mask));
+  return mask;
+}
+
+__device__ int exclusiveScanBit(const bool flag)
+{
+  const uint flags = __ballot(flag);
+  return __popc(flags & lanemask_lt());
+}
+
+__device__ int reduceBit(const bool flag)
+{
+  const uint flags = __ballot(flag);
+  return __popc(flags);
+}
+
+__device__ __forceinline__ int lanemask_le()
+{
+  int mask;
+  asm("mov.u32 %0, %lanemask_le;" : "=r" (mask));
+  return mask;
+}
+
+__device__ __forceinline__ int inclusive_segscan_warp(
+    int *shmem, const int packed_value, int &dist_block, int &nseg)
+{
+  const int  flag = packed_value < 0;
+  const int  mask = IF(flag);
+  const int value = (mask & (-1-packed_value)) + (~mask & 1);
+  const int flags = __ballot(flag);
+
+  nseg += __popc(flags) ;
+  dist_block = __clz(__brev(flags));
+
+  const int distance = min(__clz(flags & lanemask_le()) + laneId - 31, laneId);
+  shmem[laneId] = value;
+  for( int i=0; i<WARP_SIZE2; i++ ) {
+    const int offset = 1 << i;
+    const int laneOffset = ABS(laneId-offset);
+    shmem[laneId] += shmem[laneOffset] & IF(offset <= distance);
+  }
+  return shmem[WARP_SIZE - 1];
+}
+
+__device__ __forceinline__ int inclusive_segscan_array(int *shmem_in, const int N)
+{
+  int dist, nseg = 0;
+  int y = inclusive_segscan_warp(shmem_in, shmem_in[laneId], dist, nseg);
+  for( int p=WARP_SIZE; p<N; p+=WARP_SIZE ) {
+    int *shmem = shmem_in + p;
+    int y1 = inclusive_segscan_warp(shmem, shmem[laneId], dist, nseg);
+    shmem[laneId] += y & IF(laneId < dist);
+    y = y1;
+  }
+  return nseg;
+}
+
+__device__ __forceinline__ int ACCESS(const int i) {
+  return (i & (LMEM_STACK_SIZE - 1)) * blockDim.x + threadIdx.x;
+}
+
+texture<uint, 1, cudaReadModeElementType> texNodeChild;
+texture<float, 1, cudaReadModeElementType> texOpening;
+texture<float4, 1, cudaReadModeElementType> texMultipole;
+texture<float4, 1, cudaReadModeElementType> texBody;
+
+__device__ __forceinline__ void P2P(
+    float4 &acc,  const float4 pos,
+    const float4 posj) {
+  const float3 dr = make_float3(posj.x - pos.x, posj.y - pos.y, posj.z - pos.z);
+  const float r2     = dr.x*dr.x + dr.y*dr.y + dr.z*dr.z + EPS2;
+  const float rinv   = rsqrtf(r2);
+  const float rinv2  = rinv*rinv;
+  const float mrinv  = posj.w * rinv;
+  const float mrinv3 = mrinv * rinv2;
+  acc.w -= mrinv;
+  acc.x += mrinv3 * dr.x;
+  acc.y += mrinv3 * dr.y;
+  acc.z += mrinv3 * dr.z;
+}
+
+__device__ bool applyMAC(
+    const float4 sourceCenter, 
+    const float4 groupCenter, 
+    const float4 groupSize) {
+  float3 dr = make_float3(fabsf(groupCenter.x - sourceCenter.x) - (groupSize.x),
+                          fabsf(groupCenter.y - sourceCenter.y) - (groupSize.y),
+                          fabsf(groupCenter.z - sourceCenter.z) - (groupSize.z));
+  dr.x += fabsf(dr.x); dr.x *= 0.5f;
+  dr.y += fabsf(dr.y); dr.y *= 0.5f;
+  dr.z += fabsf(dr.z); dr.z *= 0.5f;
+  const float ds2 = dr.x*dr.x + dr.y*dr.y + dr.z*dr.z;
+  return ds2 <= fabsf(sourceCenter.w);
+}
+
+__device__ void traverse(
+    float4 &pos_i,
+    float4 &acc_i,
+    float4 targetCenter,
+    float4 targetSize,
+    uint2 rootRange,
+    int *shmem,
+    int *lmem) {
+  const int stackSize = LMEM_STACK_SIZE << NTHREAD2;
+  int *approxNodes = lmem + stackSize + 2 * WARP_SIZE * warpId;
+  int *numDirect = shmem;
+  int *stackShrd = numDirect + WARP_SIZE;
+  int *directNodes = stackShrd + WARP_SIZE;
+  float4 *pos_j = (float4*)&directNodes[3*WARP_SIZE];
+  int *prefix = (int*)&pos_j[WARP_SIZE];
+
+  // stack
+  int *stackGlob = lmem;
+  // begin tree-walk
+  int warpOffsetApprox = 0;
+  int warpOffsetDirect = 0;
+  for( int root=rootRange.x; root<rootRange.y; root+=WARP_SIZE ) {
+    int numNodes = min(rootRange.y-root, WARP_SIZE);
+    int beginStack = 0;
+    int endStack = 1;
+    stackGlob[threadIdx.x] = root + laneId;
+    // walk each level
+    while( numNodes > 0 ) {
+      int numNodesNew = 0;
+      int warpOffsetSplit = 0;
+      int numStack = endStack;
+      // walk a level
+      for( int iStack=beginStack; iStack<endStack; iStack++ ) {
+        bool valid = laneId < numNodes;
+        int node = stackGlob[ACCESS(iStack)] & IF(valid);
+        numNodes -= WARP_SIZE;
+        float opening = tex1Dfetch(texOpening, node);
+        uint sourceData = tex1Dfetch(texNodeChild, node);
+        float4 sourceCenter = tex1Dfetch(texMultipole, node);
+        sourceCenter.w = opening;
+        bool split = applyMAC(sourceCenter, targetCenter, targetSize);
+        bool leaf = opening <= 0;
+        bool flag = split && !leaf && valid;
+        int child = sourceData & 0x0FFFFFFF;
+        int numChild = ((sourceData & 0xF0000000) >> 28) & IF(flag);
+        int sumChild = inclusiveScanInt(prefix, numChild);
+        int laneOffset = prefix[laneId];
+        laneOffset += warpOffsetSplit - numChild;
+        for( int i=0; i<numChild; i++ )
+          stackShrd[laneOffset+i] = child+i;
+        warpOffsetSplit += sumChild;
+        while( warpOffsetSplit >= WARP_SIZE ) {
+          warpOffsetSplit -= WARP_SIZE;
+          stackGlob[ACCESS(numStack)] = stackShrd[warpOffsetSplit+laneId];
+          numStack++;
+          numNodesNew += WARP_SIZE;
+          if( (numStack - iStack) > LMEM_STACK_SIZE ) return;
+        }
+#if 1   // APPROX
+        flag = !split && valid;
+        laneOffset = exclusiveScanBit(flag);
+        if( flag ) approxNodes[warpOffsetApprox+laneOffset] = node;
+        warpOffsetApprox += reduceBit(flag);
+        if( warpOffsetApprox >= WARP_SIZE ) {
+          warpOffsetApprox -= WARP_SIZE;
+          node = approxNodes[warpOffsetApprox+laneId];
+          pos_j[laneId] = tex1Dfetch(texMultipole, node);
+          for( int i=0; i<WARP_SIZE; i++ )
+            P2P(acc_i, pos_i, pos_j[i]);
+        }
+#endif
+#if 1   // DIRECT
+        flag = split && leaf && valid;
+        const int jbody = sourceData & BODYMASK;
+        int numBodies = (((sourceData & INVBMASK) >> LEAFBIT)+1) & IF(flag);
+        directNodes[laneId] = numDirect[laneId];
+
+        int sumBodies = inclusiveScanInt(prefix, numBodies);
+        laneOffset = prefix[laneId];
+        if( flag ) prefix[exclusiveScanBit(flag)] = laneId;
+        numDirect[laneId] = laneOffset;
+        laneOffset -= numBodies;
+        int numFinished = 0;
+        while( sumBodies > 0 ) {
+          numBodies = min(sumBodies, 3*WARP_SIZE-warpOffsetDirect);
+          for( int i=warpOffsetDirect; i<warpOffsetDirect+numBodies; i+=WARP_SIZE )
+            directNodes[i+laneId] = 0;
+          if( flag && (numDirect[laneId] <= numBodies) && (laneOffset >= 0) )
+            directNodes[warpOffsetDirect+laneOffset] = -1-jbody;
+          numFinished += inclusive_segscan_array(&directNodes[warpOffsetDirect], numBodies);
+          numBodies = numDirect[prefix[numFinished-1]];
+          sumBodies -= numBodies;
+          numDirect[laneId] -= numBodies;
+          laneOffset -= numBodies;
+          warpOffsetDirect += numBodies;
+          while( warpOffsetDirect >= WARP_SIZE ) {
+            warpOffsetDirect -= WARP_SIZE;
+            pos_j[laneId] = tex1Dfetch(texBody,directNodes[warpOffsetDirect+laneId]);
+            for( int i=0; i<WARP_SIZE; i++ )
+              P2P(acc_i, pos_i, pos_j[i]);
+          }
+        }
+        numDirect[laneId] = directNodes[laneId];
+#endif
+      }
+
+      if( warpOffsetSplit > 0 ) { 
+        stackGlob[ACCESS(numStack)] = stackShrd[laneId];
+        numStack++; 
+        numNodesNew += warpOffsetSplit;
+      }
+      numNodes = numNodesNew;
+      beginStack = endStack;
+      endStack = numStack;
+    }
+  }
+
+  if( warpOffsetApprox > 0 ) {
+    if( laneId < warpOffsetApprox )  {
+      const int node = approxNodes[laneId];
+      pos_j[laneId] = tex1Dfetch(texMultipole, node);
+    } else {
+      pos_j[laneId] = make_float4(1.0e10f, 1.0e10f, 1.0e10f, 0.0f);
+    }
+    for( int i=0; i<WARP_SIZE; i++ )
+      P2P(acc_i, pos_i, pos_j[i]);
+  }
+
+  if( warpOffsetDirect > 0 ) {
+    if( laneId < warpOffsetDirect ) {
+      const float4 posj = tex1Dfetch(texBody,numDirect[laneId]);
+      pos_j[laneId] = posj;
+    } else {
+      pos_j[laneId] = make_float4(1.0e10f, 1.0e10f, 1.0e10f, 0.0f);
+    }
+    for( int i=0; i<WARP_SIZE; i++ ) 
+      P2P(acc_i, pos_i, pos_j[i]);
+  }
+}
+
+extern "C" __global__ void
+  traverseKernel(
+      const int numGroups,
+      uint2 *levelRange,
+      float4 *acc,
+      float4 *groupSizeInfo,
+      float4 *groupCenterInfo,
+      int    *MEM_BUF,
+      uint   *workToDo) {
+  __shared__ int wid[4];
+  __shared__ int shmem_pool[10*NTHREAD];
+  int *shmem = shmem_pool+10*WARP_SIZE*warpId;
+  int *lmem = &MEM_BUF[blockIdx.x*(LMEM_STACK_SIZE*NTHREAD+2*NTHREAD)];
+  while(true) {
+    if( laneId == 0 )
+      wid[warpId] = atomicAdd(workToDo,1);
+    if( wid[warpId] >= numGroups ) return;
+    float4 groupSize = groupSizeInfo[wid[warpId]];
+    const int groupData = __float_as_int(groupSize.w);
+    const uint begin = groupData & CRITMASK;
+    const uint numGroup = ((groupData & INVCMASK) >> CRITBIT) + 1;
+    float4 groupCenter = groupCenterInfo[wid[warpId]];
+    uint body_i = begin + laneId % numGroup;
+    float4 pos_i = tex1Dfetch(texBody,body_i);
+    float4 acc_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+    traverse(pos_i, acc_i, groupCenter, groupSize, levelRange[2], shmem, lmem);
+    if( laneId < numGroup )
+      acc[body_i] = acc_i;
+  }
+}
+
+extern "C" __global__ void directKernel(float4 *bodyPos, float4 *bodyAcc, const int N) {
+  uint idx = min(blockIdx.x * blockDim.x + threadIdx.x, N-1);
+  float4 pos_i = bodyPos[idx];
+  float4 acc_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+  __shared__ float4 shmem[NTHREAD];
+  float4 *pos_j = shmem + WARP_SIZE * warpId;
+  const int numWarp = ALIGN(N, WARP_SIZE);
+  for( int jwarp=0; jwarp<numWarp; jwarp++ ) {
+    int jGlob = jwarp*WARP_SIZE+laneId;
+    pos_j[laneId] = bodyPos[min(jGlob,N-1)];
+    pos_j[laneId].w *= jGlob < N;
+    for( int i=0; i<WARP_SIZE; i++ )
+      P2P(acc_i, pos_i, pos_j[i]);
+  }
+  bodyAcc[idx] = acc_i;
+}
+
+void octree::traverse() {
+  nodeChild.tex("texNodeChild");
+  openingAngle.tex("texOpening");
+  multipole.tex("texMultipole");
+  bodyPos.tex("texBody");
+  workToDo.zeros();
+  traverseKernel<<<NBLOCK,NTHREAD,0,execStream>>>(
+    numGroups,
+    levelRange.devc(),
+    bodyAcc.devc(),
+    groupSizeInfo.devc(),
+    groupCenterInfo.devc(),
+    (int*)generalBuffer1.devc(),
+    workToDo.devc()
+  );
+}
+
+void octree::iterate() {
+  CU_SAFE_CALL(cudaStreamCreate(&execStream));
+  double t1 = get_time();
+  getBoundaries();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("BOUND : %lf\n",get_time() - t1);;
+  t1 = get_time();
+  getKeys();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("INDEX : %lf\n",get_time() - t1);;
+  t1 = get_time();
+  sortKeys();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("KEYS  : %lf\n",get_time() - t1);;
+  t1 = get_time();
+  sortBodies();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("BODIES: %lf\n",get_time() - t1);;
+  t1 = get_time();
+  buildTree();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("BUILD : %lf\n",get_time() - t1);;
+  t1 = get_time();
+  allocateTreePropMemory();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("ALLOC : %lf\n",get_time() - t1);;
+  t1 = get_time();
+  linkTree();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("LINK  : %lf\n",get_time() - t1);;
+  t1 = get_time();
+  upward();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("UPWARD: %lf\n",get_time() - t1);;
+  t1 = get_time();
+  traverse();
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  printf("FMM   : %lf\n",get_time() - t1);;
+}
+
+void octree::direct() {
+  int blocks = ALIGN(numBodies/100, NTHREAD);
+  directKernel<<<blocks,NTHREAD,0,execStream>>>(bodyPos.devc(),bodyAcc2.devc(),numBodies);
+  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
+  CU_SAFE_CALL(cudaStreamDestroy(execStream));
+}
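
The P2P routine above is a plain softened monopole interaction: each source j contributes m_j*dr/(r^2 + EPS2)^(3/2) to the acceleration and -m_j/sqrt(r^2 + EPS2) to the potential kept in acc.w. As a reference, a minimal host-side version of the same interaction, handy for spot-checking directKernel on a few bodies, might look like the sketch below (the Body struct and referenceDirect are illustrative names, not part of this commit).

#include <cmath>
#include <vector>

struct Body { float x, y, z, w; };   // w carries the mass, as in texBody

// Accumulate acceleration (ax, ay, az) and potential (pot) on body i
// from every body j, using the same Plummer softening as P2P.
void referenceDirect(const std::vector<Body>& b, int i, float eps2,
                     float& ax, float& ay, float& az, float& pot) {
  ax = ay = az = pot = 0.0f;
  for (const Body& bj : b) {
    const float dx = bj.x - b[i].x;
    const float dy = bj.y - b[i].y;
    const float dz = bj.z - b[i].z;
    const float r2 = dx * dx + dy * dy + dz * dz + eps2;   // softened distance^2
    const float rinv = 1.0f / std::sqrt(r2);
    const float mrinv = bj.w * rinv;
    const float mrinv3 = mrinv * rinv * rinv;
    pot -= mrinv;                    // matches acc.w -= mrinv
    ax += mrinv3 * dx;
    ay += mrinv3 * dy;
    az += mrinv3 * dz;
  }
}

Like directKernel, this includes the softened self-interaction (j == i), which adds to the potential but contributes zero force.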

gpu/include/b40c/radix_sort/downsweep/6bit_prmt/cta.cuh

+/******************************************************************************
+ * 
+ * Copyright 2010-2011 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ ******************************************************************************/
+
+/******************************************************************************
+ * Abstract CTA-processing functionality for partitioning downsweep
+ * scan kernels
+ ******************************************************************************/
+
+#pragma once
+
+#include <b40c/util/basic_utils.cuh>
+#include <b40c/util/device_intrinsics.cuh>
+#include <b40c/util/io/load_tile.cuh>
+#include <b40c/util/io/scatter_tile.cuh>
+
+namespace b40c {
+namespace partition {
+namespace downsweep {
+
+
+/**
+ * Partitioning downsweep scan CTA
+ *
+ * Abstract class
+ */
+template <
+	typename KernelPolicy,
+	typename DerivedCta,									// Derived CTA class
+	template <typename Policy> class Tile>			// Derived Tile class to use
+struct Cta
+{
+	//---------------------------------------------------------------------
+	// Typedefs and Constants
+	//---------------------------------------------------------------------
+
+	typedef typename KernelPolicy::KeyType 					KeyType;
+	typedef typename KernelPolicy::ValueType 				ValueType;
+	typedef typename KernelPolicy::SizeT 					SizeT;
+	typedef typename KernelPolicy::SmemStorage				SmemStorage;
+	typedef typename KernelPolicy::ByteGrid::LanePartial	LanePartial;
+
+	// Operational details type for the byte grid
+	typedef util::SrtsDetails<typename KernelPolicy::ByteGrid> 		ByteGridDetails;
+
+	typedef DerivedCta Dispatch;
+
+	//---------------------------------------------------------------------
+	// Members
+	//---------------------------------------------------------------------
+
+	// Shared storage for this CTA
+	typename KernelPolicy::SmemStorage 	&smem_storage;
+
+	// Input and output device pointers
+	KeyType								*d_in_keys;
+	KeyType								*d_out_keys;
+
+	ValueType							*d_in_values;
+	ValueType							*d_out_values;
+
+	// Operational details for scan grids
+	ByteGridDetails 					byte_grid_details;
+
+	SizeT								my_bin_carry;
+
+	KeyType 							*offset;
+	KeyType 							*next_offset;
+
+	//---------------------------------------------------------------------
+	// Methods
+	//---------------------------------------------------------------------
+
+	/**
+	 * Constructor
+	 */
+	__device__ __forceinline__ Cta(
+		SmemStorage 	&smem_storage,
+		KeyType 		*d_in_keys,
+		KeyType 		*d_out_keys,
+		ValueType 		*d_in_values,
+		ValueType 		*d_out_values,
+		SizeT 			*d_spine) :
+			smem_storage(smem_storage),
+			d_in_keys(d_in_keys),
+			d_out_keys(d_out_keys),
+			d_in_values(d_in_values),
+			d_out_values(d_out_values),
+			byte_grid_details(smem_storage.byte_raking_lanes),
+			offset(smem_storage.key_exchange + threadIdx.x + (threadIdx.x >> 5)),
+			next_offset(smem_storage.key_exchange + (threadIdx.x + 1) + ((threadIdx.x + 1) >> 5))
+	{
+
+		if (threadIdx.x < KernelPolicy::BINS) {
+
+			// Read bin_carry in parallel
+			int spine_bin_offset = (gridDim.x * threadIdx.x) + blockIdx.x;
+
+			my_bin_carry = tex1Dfetch(spine::SpineTex<SizeT>::ref, spine_bin_offset);
+
+			int2 item;
+			item.x = -1;
+			item.y = KernelPolicy::BINS;
+			smem_storage.bin_in_prefixes[threadIdx.x] = item;
+		}
+
+		if (threadIdx.x < CUB_WARP_THREADS(KernelPolicy::CUDA_ARCH)) {
+			smem_storage.warpscan[0][threadIdx.x] = 0;
+			smem_storage.warpscan[1][threadIdx.x] = 0;
+		}
+	}
+
+
+	/**
+	 * Process tile
+	 */
+	__device__ __forceinline__ void ProcessTile(
+		SizeT cta_offset,
+		const SizeT &guarded_elements = KernelPolicy::TILE_ELEMENTS)
+	{
+		Tile<KernelPolicy> tile;
+
+		tile.Partition(
+			cta_offset,
+			guarded_elements,
+			(Dispatch *) this);
+	}
+
+
+	/**
+	 * Process work range of tiles
+	 */
+	__device__ __forceinline__ void ProcessWorkRange(
+		util::CtaWorkLimits<SizeT> &work_limits)
+	{
+		// Make sure we get a local copy of the cta's offset (work_limits may be in smem)
+		SizeT pack_offset = smem_storage.packed_offset;
+
+		// Process full tiles of tile_elements
+		while (pack_offset < smem_storage.packed_offset_limit) {
+
+			ProcessTile(pack_offset);
+			pack_offset += (KernelPolicy::TILE_ELEMENTS / KernelPolicy::PACK_SIZE);
+		}
+
+/*
+		// Clean up last partial tile with guarded-io
+		if (work_limits.guarded_elements) {
+			ProcessTile(
+				pack_offset,
+				work_limits.guarded_elements);
+		}
+*/
+	}
+};
+
+
+} // namespace downsweep
+} // namespace partition
+} // namespace b40c
+
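
The offset and next_offset pointers set up in the constructor add (threadIdx.x >> 5) to the linear thread index, i.e. one pad slot after every 32 entries of key_exchange (whose declaration in the matching kernel policy reserves TILE_ELEMENTS / 32 extra slots). This reads like the usual shared-memory padding trick for keeping strided key-exchange accesses off a single bank; a tiny sketch of the index transform (hypothetical helper, not from this commit):

#include <cstdio>

// Pad the linear index: one extra slot after every 32 entries (i -> i + i/32).
static int paddedIndex(int i) { return i + (i >> 5); }

int main() {
  // Indices a multiple of 32 apart no longer map to the same padded column.
  printf("31 -> %d\n", paddedIndex(31));   // 31
  printf("32 -> %d\n", paddedIndex(32));   // 33
  printf("64 -> %d\n", paddedIndex(64));   // 66
  return 0;
}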

gpu/include/b40c/radix_sort/downsweep/6bit_prmt/kernel_policy.cuh

+/******************************************************************************
+ * 
+ * Copyright 2010-2011 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ ******************************************************************************/
+
+/******************************************************************************
+ * Configuration policy for partitioning downsweep scan kernels
+ ******************************************************************************/
+
+#pragma once
+
+#include <b40c/util/cuda_properties.cuh>
+#include <b40c/util/basic_utils.cuh>
+#include <b40c/util/srts_grid.cuh>
+
+namespace b40c {
+namespace partition {
+namespace downsweep {
+
+
+/**
+ * A detailed partitioning downsweep kernel configuration policy type that specializes
+ * kernel code for a specific pass.  It encapsulates tuning configuration policy
+ * details derived from TuningPolicy
+ */
+template <typename TuningPolicy>
+struct KernelPolicy : TuningPolicy
+{
+	typedef typename TuningPolicy::SizeT 		SizeT;
+	typedef typename TuningPolicy::KeyType 		KeyType;
+	typedef typename TuningPolicy::ValueType 	ValueType;
+
+	enum {
+
+		BINS 							= 1 << TuningPolicy::LOG_BINS,
+		THREADS							= 1 << TuningPolicy::LOG_THREADS,
+
+		LOG_WARPS						= TuningPolicy::LOG_THREADS - CUB_LOG_WARP_THREADS(TuningPolicy::CUDA_ARCH),
+		WARPS							= 1 << LOG_WARPS,
+
+		LOAD_VEC_SIZE					= 1 << TuningPolicy::LOG_LOAD_VEC_SIZE,
+		LOADS_PER_TILE					= 1 << TuningPolicy::LOG_LOADS_PER_TILE,
+
+		LOG_TILE_ELEMENTS_PER_THREAD	= TuningPolicy::LOG_LOAD_VEC_SIZE + TuningPolicy::LOG_LOADS_PER_TILE,
+		TILE_ELEMENTS_PER_THREAD		= 1 << LOG_TILE_ELEMENTS_PER_THREAD,
+
+		LOG_TILE_ELEMENTS				= TuningPolicy::LOG_THREADS + LOG_TILE_ELEMENTS_PER_THREAD,
+		TILE_ELEMENTS					= 1 << LOG_TILE_ELEMENTS,
+	
+		LOG_SCAN_BINS					= (TuningPolicy::LOG_BINS > 3) ? 3 : TuningPolicy::LOG_BINS,
+		SCAN_BINS						= 1 << LOG_SCAN_BINS,
+
+		LOG_SCAN_LANES_PER_TILE			= CUB_MAX((LOG_SCAN_BINS - 2), 0),		// Always at least one lane per load
+		SCAN_LANES_PER_TILE				= 1 << LOG_SCAN_LANES_PER_TILE,
+
+		LOG_DEPOSITS_PER_LANE 			= TuningPolicy::LOG_THREADS + TuningPolicy::LOG_LOADS_PER_TILE,
+	};
+
+
+	// Smem SRTS grid type for reducing and scanning a tile of
+	// (bins/4) lanes of composite 8-bit bin counters
+	typedef util::SrtsGrid<
+		TuningPolicy::CUDA_ARCH,
+		int,											// Partial type
+		LOG_DEPOSITS_PER_LANE,							// Deposits per lane
+		LOG_SCAN_LANES_PER_TILE,						// Lanes (the number of composite digits)
+		TuningPolicy::LOG_RAKING_THREADS,				// Raking threads
+		false>											// Any prefix dependences between lanes are explicitly managed
+			ByteGrid;
+
+	
+	/**
+	 * Shared storage for partitioning downsweep
+	 */
+	struct SmemStorage
+	{
+		SizeT							packed_offset;
+		SizeT							packed_offset_limit;
+
+		bool 							non_trivial_pass;
+		util::CtaWorkLimits<SizeT> 		work_limits;
+
+		SizeT							bin_carry[BINS];
+		int2							bin_in_prefixes[BINS + 1];
+
+		// Storage for scanning local ranks
+		volatile int 					warpscan[2][CUB_WARP_THREADS(CUDA_ARCH) * 3 / 2];
+
+		union {
+			struct {
+				int 					byte_raking_lanes[ByteGrid::RAKING_ELEMENTS];
+				int						short_prefixes[2][ByteGrid::RAKING_THREADS];
+			};
+
+			int							bin_ex_prefixes[BINS + 1];
+
+			KeyType 					key_exchange[TILE_ELEMENTS + (TILE_ELEMENTS / 32)];			// Last index is for invalid elements to be culled (if any)
+		};
+	};
+
+	enum {
+		THREAD_OCCUPANCY					= CUB_SM_THREADS(CUDA_ARCH) >> TuningPolicy::LOG_THREADS,
+		SMEM_OCCUPANCY						= CUB_SMEM_BYTES(CUDA_ARCH) / sizeof(SmemStorage),
+		MAX_CTA_OCCUPANCY					= CUB_MIN(CUB_SM_CTAS(CUDA_ARCH), CUB_MIN(THREAD_OCCUPANCY, SMEM_OCCUPANCY)),
+
+		VALID								= (MAX_CTA_OCCUPANCY > 0),
+	};
+
+
+	__device__ __forceinline__ static void PreprocessKey(KeyType &key) {}
+
+	__device__ __forceinline__ static void PostprocessKey(KeyType &key) {}
+};
+	
+
+
+} // namespace downsweep
+} // namespace partition
+} // namespace b40c
+
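
All of the enum constants above are derived from log2 quantities supplied by the TuningPolicy. As a worked example, assuming a tuning of LOG_BINS = 4, LOG_THREADS = 7, LOG_LOAD_VEC_SIZE = 3 and LOG_LOADS_PER_TILE = 1 (assumed values for illustration only; the actual tuning policy is defined elsewhere), the same arithmetic gives:

#include <cstdio>

// Assumed tuning values, for illustration only.
const int LOG_BINS           = 4;
const int LOG_THREADS        = 7;
const int LOG_LOAD_VEC_SIZE  = 3;
const int LOG_LOADS_PER_TILE = 1;

// Derived exactly as in KernelPolicy above.
const int BINS              = 1 << LOG_BINS;                                   // 16
const int THREADS           = 1 << LOG_THREADS;                                // 128
const int LOG_TILE_ELEMENTS = LOG_THREADS + LOG_LOAD_VEC_SIZE + LOG_LOADS_PER_TILE;
const int TILE_ELEMENTS     = 1 << LOG_TILE_ELEMENTS;                          // 2048
const int LOG_SCAN_BINS     = (LOG_BINS > 3) ? 3 : LOG_BINS;                   // 3
const int LOG_SCAN_LANES    = (LOG_SCAN_BINS - 2 > 0) ? LOG_SCAN_BINS - 2 : 0; // 1
const int SCAN_LANES        = 1 << LOG_SCAN_LANES;                             // 2 lanes of byte counters

int main() {
  printf("BINS=%d THREADS=%d TILE_ELEMENTS=%d SCAN_LANES=%d\n",
         BINS, THREADS, TILE_ELEMENTS, SCAN_LANES);
  return 0;
}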

gpu/include/b40c/radix_sort/downsweep/6bit_prmt/tile.cuh

+/******************************************************************************
+ * 
+ * Copyright 2010-2011 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ ******************************************************************************/
+
+/******************************************************************************
+ * Abstract tile-processing functionality for partitioning downsweep scan
+ * kernels
+ ******************************************************************************/
+
+#pragma once
+
+#include <b40c/util/cuda_properties.cuh>
+#include <b40c/util/basic_utils.cuh>
+#include <b40c/util/io/modified_load.cuh>
+#include <b40c/util/io/modified_store.cuh>
+#include <b40c/util/io/load_tile.cuh>
+#include <b40c/util/io/scatter_tile.cuh>
+#include <b40c/util/reduction/serial_reduce.cuh>
+#include <b40c/util/scan/serial_scan.cuh>
+#include <b40c/util/scan/warp_scan.cuh>
+#include <b40c/util/device_intrinsics.cuh>
+#include <b40c/util/soa_tuple.cuh>
+#include <b40c/util/scan/soa/cooperative_soa_scan.cuh>
+
+namespace b40c {
+namespace partition {
+namespace downsweep {
+
+
+/**
+ * Templated texture reference for keys
+ */
+template <typename KeyType>
+struct KeysTex
+{
+	static texture<KeyType, cudaTextureType1D, cudaReadModeElementType> ref;
+};
+template <typename KeyType>
+texture<KeyType, cudaTextureType1D, cudaReadModeElementType> KeysTex<KeyType>::ref;
+
+
+
+/**
+ * Tile
+ *
+ * Abstract class
+ */
+template <
+	typename KernelPolicy,
+	typename DerivedTile>
+struct Tile
+{
+	//---------------------------------------------------------------------
+	// Typedefs and Constants
+	//---------------------------------------------------------------------
+
+	typedef typename KernelPolicy::KeyType 					KeyType;
+	typedef typename KernelPolicy::ValueType 				ValueType;
+	typedef typename KernelPolicy::SizeT 					SizeT;
+
+	typedef DerivedTile Dispatch;
+
+	enum {
+		LOAD_VEC_SIZE 				= KernelPolicy::LOAD_VEC_SIZE,
+		LOADS_PER_TILE 				= KernelPolicy::LOADS_PER_TILE,
+		TILE_ELEMENTS_PER_THREAD 	= KernelPolicy::TILE_ELEMENTS_PER_THREAD,
+
+		LOG_SCAN_LANES				= KernelPolicy::LOG_SCAN_LANES_PER_TILE,
+		SCAN_LANES					= KernelPolicy::SCAN_LANES_PER_TILE,
+
+		LOG_PACKS_PER_LOAD			= KernelPolicy::LOG_LOAD_VEC_SIZE - KernelPolicy::LOG_PACK_SIZE,
+		PACKS_PER_LOAD				= 1 << LOG_PACKS_PER_LOAD,
+
+		LANE_ROWS_PER_LOAD 			= KernelPolicy::ByteGrid::ROWS_PER_LANE / KernelPolicy::LOADS_PER_TILE,
+		LANE_STRIDE_PER_LOAD 		= KernelPolicy::ByteGrid::PADDED_PARTIALS_PER_ROW * LANE_ROWS_PER_LOAD,
+
+		INVALID_BIN					= -1,
+
+		LOG_RAKING_THREADS 			= KernelPolicy::ByteGrid::LOG_RAKING_THREADS,
+		RAKING_THREADS 				= 1 << LOG_RAKING_THREADS,
+
+		LOG_WARPSCAN_THREADS		= CUB_LOG_WARP_THREADS(CUDA_ARCH),
+		WARPSCAN_THREADS 			= 1 << LOG_WARPSCAN_THREADS,
+
+	};
+
+	//---------------------------------------------------------------------
+	// Members
+	//---------------------------------------------------------------------
+
+
+	// The keys (and values) this thread will read for this tile
+	KeyType 	keys[LOADS_PER_TILE][LOAD_VEC_SIZE];
+	ValueType 	values[TILE_ELEMENTS_PER_THREAD];
+
+	// For each load:
+	// 		counts_nibbles contains the bin counts within nibbles ordered right to left
+	// 		bins_nibbles contains the bin for each key within nibbles ordered right to left
+	// 		load_prefix_bytes contains the exclusive scan for each key within nibbles ordered right to left
+
+	int 		bins_nibbles[(LOAD_VEC_SIZE + 7) / 8][LOADS_PER_TILE];
+
+	int 		counts_nibbles[SCAN_LANES / 2][LOADS_PER_TILE];
+	int			counts_bytes[SCAN_LANES][LOADS_PER_TILE];
+
+	int 		load_prefix_bytes[(LOAD_VEC_SIZE + 3) / 4][LOADS_PER_TILE];
+
+	int 		warpscan_shorts[LOADS_PER_TILE][4];
+
+	int 		local_ranks[LOADS_PER_TILE][LOAD_VEC_SIZE];		// The local rank of each key
+	SizeT 		scatter_offsets[LOADS_PER_TILE][LOAD_VEC_SIZE];	// The global rank of each key
+
+	int 		bins[TILE_ELEMENTS_PER_THREAD];
+
+
+	//---------------------------------------------------------------------
+	// Tile Methods
+	//---------------------------------------------------------------------
+
+	/**
+	 * ExtractRanks
+	 */
+	template <int LOAD, int VEC, int REM = (VEC & 7)>
+	struct ExtractRanks
+	{
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void Invoke(Cta *cta, Tile *tile, const bool shift_bytes) {}
+	};
+
+
+	/**
+	 * ExtractRanks (VEC % 8 == 0)
+	 */
+	template <int LOAD, int VEC>
+	struct ExtractRanks<LOAD, VEC, 0>
+	{
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void Invoke(Cta *cta, Tile *tile, const bool shift_bytes)
+		{
+/*
+			printf("\tTid(%d) Vec(%d) bins_nibbles(%08x)\n",
+				threadIdx.x, VEC, tile->bins_nibbles[VEC / 8][LOAD]);
+*/
+			// Decode prefix bytes for first four keys
+			tile->load_prefix_bytes[VEC / 4][LOAD] += util::PRMT(
+				tile->counts_bytes[0][LOAD],
+				tile->counts_bytes[1][LOAD],
+				tile->bins_nibbles[VEC / 8][LOAD]);
+
+			// Decode scan low and high packed words for first four keys
+			int warpscan_prefix[2];
+			warpscan_prefix[0] = util::PRMT(
+				tile->warpscan_shorts[LOAD][0],
+				tile->warpscan_shorts[LOAD][1],
+				tile->bins_nibbles[VEC / 8][LOAD]);
+
+			warpscan_prefix[1] = util::PRMT(
+				tile->warpscan_shorts[LOAD][2],
+				tile->warpscan_shorts[LOAD][3],
+				tile->bins_nibbles[VEC / 8][LOAD]);
+
+			// Low
+			int packed_scatter =
+				util::PRMT(								// Warpscan component (de-interleaved)
+					warpscan_prefix[0],
+					warpscan_prefix[1],
+					0x5140) +
+				util::PRMT(								// Raking scan component (lower bytes from each half)
+					tile->load_prefix_bytes[VEC / 4][LOAD],
+					0,
+					0x4140);
+
+			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
+			packed_scatter <<= 2;
+
+			tile->local_ranks[LOAD][VEC + 0] = packed_scatter & 0x0000ffff;
+			tile->local_ranks[LOAD][VEC + 1] = packed_scatter >> 16;
+
+			// High
+			packed_scatter =
+				util::PRMT(								// Warpscan component (de-interleaved)
+					warpscan_prefix[0],
+					warpscan_prefix[1],
+					0x7362) +
+				util::PRMT(								// Raking scan component (upper bytes from each half)
+					tile->load_prefix_bytes[VEC / 4][LOAD],
+					0,
+					0x4342);
+
+			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
+			packed_scatter <<= 2;
+
+			tile->local_ranks[LOAD][VEC + 2] = packed_scatter & 0x0000ffff;
+			tile->local_ranks[LOAD][VEC + 3] = packed_scatter >> 16;
+
+		}
+	};
+
+
+	/**
+	 * ExtractRanks (VEC % 8 == 4)
+	 */
+	template <int LOAD, int VEC>
+	struct ExtractRanks<LOAD, VEC, 4>
+	{
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void Invoke(Cta *cta, Tile *tile, const bool shift_bytes)
+		{
+			int upper_bins_nibbles = tile->bins_nibbles[VEC / 8][LOAD] >> 16;
+
+			// Decode prefix bytes for second four keys
+			tile->load_prefix_bytes[VEC / 4][LOAD] += util::PRMT(
+				tile->counts_bytes[0][LOAD],
+				tile->counts_bytes[1][LOAD],
+				upper_bins_nibbles);
+
+			// Decode scan low and high packed words for second four keys
+			int warpscan_prefix[2];
+			warpscan_prefix[0] = util::PRMT(
+				tile->warpscan_shorts[LOAD][0],
+				tile->warpscan_shorts[LOAD][1],
+				upper_bins_nibbles);
+
+			warpscan_prefix[1] = util::PRMT(
+				tile->warpscan_shorts[LOAD][2],
+				tile->warpscan_shorts[LOAD][3],
+				upper_bins_nibbles);
+
+			// Low
+			int packed_scatter =
+				util::PRMT(								// Warpscan component (de-interleaved)
+					warpscan_prefix[0],
+					warpscan_prefix[1],
+					0x5140) +
+				util::PRMT(								// Raking scan component (lower bytes from each half)
+					tile->load_prefix_bytes[VEC / 4][LOAD],
+					0,
+					0x4140);
+
+			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
+			packed_scatter <<= 2;
+
+			tile->local_ranks[LOAD][VEC + 0] = packed_scatter & 0x0000ffff;
+			tile->local_ranks[LOAD][VEC + 1] = packed_scatter >> 16;
+
+			// High
+			packed_scatter =
+				util::PRMT(								// Warpscan component (de-interleaved)
+					warpscan_prefix[0],
+					warpscan_prefix[1],
+					0x7362) +
+				util::PRMT(								// Raking scan component (upper bytes from each half)
+					tile->load_prefix_bytes[VEC / 4][LOAD],
+					0,
+					0x4342);
+
+			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
+			packed_scatter <<= 2;
+
+			tile->local_ranks[LOAD][VEC + 2] = packed_scatter & 0x0000ffff;
+			tile->local_ranks[LOAD][VEC + 3] = packed_scatter >> 16;
+		}
+	};
+
+
+
+	//---------------------------------------------------------------------
+	// IterateTileElements Structures
+	//---------------------------------------------------------------------
+
+	/**
+	 * Iterate next vector element
+	 */
+	template <int LOAD, int VEC, int dummy = 0>
+	struct IterateTileElements
+	{
+		// DecodeKeys
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void DecodeKeys(
+			Cta *cta,
+			Tile *tile,
+			const int CURRENT_BIT)
+		{
+			// Decode the bin for this key
+			int bin = util::BFE(
+				tile->keys[LOAD][VEC],
+				CURRENT_BIT,
+				KernelPolicy::LOG_SCAN_BINS);
+
+			const int BITS_PER_NIBBLE = 4;
+			int shift = bin * BITS_PER_NIBBLE;
+
+			// Initialize exclusive scan bytes
+			if (VEC == 0) {
+
+				tile->load_prefix_bytes[VEC / 4][LOAD] = 0;
+
+			} else {
+				int prev_counts_nibbles = tile->counts_nibbles[0][LOAD] >> shift;
+
+				if ((VEC & 3) == 0) {
+
+					tile->load_prefix_bytes[VEC / 4][LOAD] = prev_counts_nibbles & 0xf;
+
+				} else if ((VEC & 7) < 4) {
+
+					util::BFI(
+						tile->load_prefix_bytes[VEC / 4][LOAD],
+						tile->load_prefix_bytes[VEC / 4][LOAD],
+						prev_counts_nibbles,
+						8 * (VEC & 7),
+						BITS_PER_NIBBLE);
+
+				} else {
+
+					util::BFI(
+						tile->load_prefix_bytes[VEC / 4][LOAD],
+						tile->load_prefix_bytes[VEC / 4][LOAD],
+						prev_counts_nibbles,
+						8 * ((VEC & 7) - 4),
+						BITS_PER_NIBBLE);
+				}
+			}
+
+			// Initialize counts nibbles
+			if (VEC == 0) {
+				tile->counts_nibbles[0][LOAD] = 1 << shift;
+
+			} else if (VEC == LOAD_VEC_SIZE - 1) {
+
+				// last vector element
+				if ((VEC & 15) == 15) {
+
+					// Protect overflow: expand nibbles into bytes and then add
+					util::NibblesToBytes(
+						tile->counts_bytes[0][LOAD],
+						tile->counts_bytes[1][LOAD],
+						tile->counts_nibbles[0][LOAD]);
+
+					shift = shift * 2;
+					util::SHL_ADD(
+						tile->counts_bytes[0][LOAD],
+						1,
+						shift,
+						tile->counts_bytes[0][LOAD]);
+
+					util::SHL_ADD(
+						tile->counts_bytes[1][LOAD],
+						1,
+						shift - 32,
+						tile->counts_bytes[1][LOAD]);
+
+				} else {
+
+					// Add nibble then expand into bytes
+					util::SHL_ADD(
+						tile->counts_nibbles[0][LOAD],
+						1,
+						shift,
+						tile->counts_nibbles[0][LOAD]);
+
+					util::NibblesToBytes(
+						tile->counts_bytes[0][LOAD],
+						tile->counts_bytes[1][LOAD],
+						tile->counts_nibbles[0][LOAD]);
+				}
+
+			} else {
+				util::SHL_ADD(
+					tile->counts_nibbles[0][LOAD],
+					1,
+					shift,
+					tile->counts_nibbles[0][LOAD]);
+			}
+
+			// Initialize bins nibbles
+			if ((VEC & 7) == 0) {
+				tile->bins_nibbles[VEC / 8][LOAD] = bin;
+
+			} else {
+				util::BFI(
+					tile->bins_nibbles[VEC / 8][LOAD],
+					tile->bins_nibbles[VEC / 8][LOAD],
+					bin,
+					4 * (VEC & 7),
+					4);
+			}
+
+			// Next vector element
+			IterateTileElements<LOAD, VEC + 1>::DecodeKeys(cta, tile, CURRENT_BIT);
+		}
+
+		// ComputeRanks
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void ComputeRanks(Cta *cta, Tile *tile, const bool shift_bytes)
+		{
+			if (VEC == 0) {
+
+				const int LANE_OFFSET = LOAD * LANE_STRIDE_PER_LOAD;
+
+				// Extract prefix bytes from bytes raking grid
+				tile->counts_bytes[0][LOAD] = cta->byte_grid_details.lane_partial[0][LANE_OFFSET];
+				tile->counts_bytes[1][LOAD] = cta->byte_grid_details.lane_partial[1][LANE_OFFSET];
+
+				// Extract warpscan shorts
+				const int LOAD_RAKING_TID_OFFSET = (KernelPolicy::THREADS * LOAD) >> KernelPolicy::ByteGrid::LOG_PARTIALS_PER_SEG;
+
+				int base_raking_tid = threadIdx.x >> KernelPolicy::ByteGrid::LOG_PARTIALS_PER_SEG;
+
+				tile->warpscan_shorts[LOAD][0] = cta->smem_storage.short_prefixes[0][base_raking_tid + LOAD_RAKING_TID_OFFSET];
+				tile->warpscan_shorts[LOAD][1] = cta->smem_storage.short_prefixes[1][base_raking_tid + LOAD_RAKING_TID_OFFSET];
+				tile->warpscan_shorts[LOAD][2] = cta->smem_storage.short_prefixes[0][base_raking_tid + LOAD_RAKING_TID_OFFSET + (RAKING_THREADS / 2)];
+				tile->warpscan_shorts[LOAD][3] = cta->smem_storage.short_prefixes[1][base_raking_tid + LOAD_RAKING_TID_OFFSET + (RAKING_THREADS / 2)];
+			}
+
+			ExtractRanks<LOAD, VEC>::Invoke(cta, tile, shift_bytes);
+/*
+			printf("tid(%d) vec(%d) key(%d) scatter(%d)\n",
+				threadIdx.x,
+				VEC,
+				tile->keys[LOAD][VEC],
+				tile->local_ranks[LOAD][VEC] / 4);
+*/
+			// Next vector element
+			IterateTileElements<LOAD, VEC + 1>::ComputeRanks(cta, tile, shift_bytes);
+		}
+	};
+
+
+
+	/**
+	 * IterateTileElements next load
+	 */
+	template <int LOAD, int dummy>
+	struct IterateTileElements<LOAD, LOAD_VEC_SIZE, dummy>
+	{
+		// DecodeKeys
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void DecodeKeys(
+			Cta *cta,
+			Tile *tile,
+			const int CURRENT_BIT)
+		{
+			const int LANE_OFFSET = LOAD * LANE_STRIDE_PER_LOAD;
+
+			// Place keys into raking grid
+			cta->byte_grid_details.lane_partial[0][LANE_OFFSET] = tile->counts_bytes[0][LOAD];
+			cta->byte_grid_details.lane_partial[1][LANE_OFFSET] = tile->counts_bytes[1][LOAD];
+/*
+			printf("Tid %u load %u:\t,"
+				"load_prefix_bytes[0](%08x), "
+				"load_prefix_bytes[1](%08x), "
+				"counts_bytes[0](%08x), "
+				"counts_bytes[1](%08x), "
+				"\n",
+				threadIdx.x, LOAD,
+				tile->load_prefix_bytes[0][LOAD],
+				tile->load_prefix_bytes[1][LOAD],
+				tile->counts_bytes[0][LOAD],
+				tile->counts_bytes[1][LOAD]);
+*/
+			// First vector element, next load
+			IterateTileElements<LOAD + 1, 0>::DecodeKeys(cta, tile, CURRENT_BIT);
+		}
+
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void ComputeRanks(Cta *cta, Tile *tile, const bool shift_bytes)
+		{
+			// First vector element, next load
+			IterateTileElements<LOAD + 1, 0>::ComputeRanks(cta, tile, shift_bytes);
+		}
+
+	};
+
+	/**
+	 * Terminate iteration
+	 */
+	template <int dummy>
+	struct IterateTileElements<LOADS_PER_TILE, 0, dummy>
+	{
+		// DecodeKeys
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void DecodeKeys(Cta *cta, Tile *tile, const int CURRENT_BIT) {}
+
+		// ComputeRanks
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void ComputeRanks(Cta *cta, Tile *tile, const bool shift_bytes) {}
+	};
+
+
+
+	//---------------------------------------------------------------------
+	// Tile Internal Methods
+	//---------------------------------------------------------------------
+
+
+	/**
+	 * Scan Tile
+	 */
+	template <typename Cta>
+	__device__ __forceinline__ void ScanTile(Cta *cta, const int CURRENT_BIT, const bool shift_bytes)
+	{
+
+		// Decode bins and place keys into grid
+		IterateTileElements<0, 0>::DecodeKeys(cta, this, CURRENT_BIT);
+
+		__syncthreads();
+
+		int tid = threadIdx.x & 31;
+		int warp = threadIdx.x >> 5;
+		volatile int *p = cta->smem_storage.short_prefixes[warp];
+		volatile int *p2 = &cta->smem_storage.short_prefixes[warp][tid * 2];
+		volatile int *warpscan = cta->smem_storage.warpscan[warp];
+
+		// Use our raking threads to, in aggregate, scan the composite counter lanes
+		if (threadIdx.x < RAKING_THREADS) {
+
+/*
+			if (threadIdx.x == 0) {
+				printf("ByteGrid:\n");
+				KernelPolicy::ByteGrid::Print();
+				printf("\n");
+			}
+*/
+			// Upsweep rake
+			int partial_bytes = util::scan::SerialScan<KernelPolicy::ByteGrid::PARTIALS_PER_SEG>::Invoke(
+				cta->byte_grid_details.raking_segment,
+				0);
+/*
+			printf("\t\t\tRaking tid %d with partial_bytes (%08x)\n",
+				threadIdx.x, partial_bytes);
+*/
+			// Extract bytes into shorts (first warp has 0-3, second has 4-7)
+			p[tid] = util::PRMT(partial_bytes, 0, 0x4240);
+			p[tid + CUB_WARP_THREADS(CUDA_ARCH)] = util::PRMT(partial_bytes, 0, 0x4341);
+
+			int partial0 = *p2;
+			int partial1 = *(p2 + 1);
+
+			int partial = partial0 + partial1;
+
+			warpscan[16 + tid] = partial;
+
+			warpscan[16 + tid] = partial =
+				partial + warpscan[16 + tid - 1];
+			warpscan[16 + tid] = partial =
+				partial + warpscan[16 + tid - 2];
+			warpscan[16 + tid] = partial =
+				partial + warpscan[16 + tid - 4];
+			warpscan[16 + tid] = partial =
+				partial + warpscan[16 + tid - 8];
+			warpscan[16 + tid] = partial =
+				partial + warpscan[16 + tid - 16];
+
+			// Restricted barrier
+			util::BAR(RAKING_THREADS);
+
+			// Grab first warp total
+			int total = cta->smem_storage.warpscan[0][16 + CUB_WARP_THREADS(CUDA_ARCH) - 1];
+			if (threadIdx.x >= (RAKING_THREADS / 2)) {
+
+				// Second warp adds halves from first warp total into partial
+				int flip = util::PRMT(total, total, 0x1032);
+				total += flip;
+				partial += total;
+
+				// Second warp replaces with second warp total
+				total = cta->smem_storage.warpscan[1][16 + CUB_WARP_THREADS(CUDA_ARCH) - 1];
+			}
+
+			// Add lower into upper
+			partial = util::SHL_ADD_C(total, 16, partial);
+
+			int exclusive1 = partial - partial1;
+			int exclusive0 = exclusive1 - partial0;
+
+/*
+			printf("\tRaking tid %d with inclusive_partial((%u,%u),(%u,%u)) and exclusive_partial((%u,%u),(%u,%u))\n",
+				threadIdx.x,
+				inclusive0 >> 16, inclusive0 & 0x0000ffff,
+				inclusive1 >> 16, inclusive1 & 0x0000ffff,
+				exclusive0 >> 16, exclusive0 & 0x0000ffff,
+				exclusive1 >> 16, exclusive1 & 0x0000ffff);
+*/
+			// Trade
+			// (0,2) .. (1,3)
+			*p2 = exclusive0;
+			*(p2 + 1) = exclusive1;
+
+			// Interleave:
+			// (0L, 1L, 2L, 3L)
+			// (0H, 1H, 2H, 3H)
+			int a = p[tid];								// 0,2
+			int b = p[tid + 32];						// 1,3
+
+			p[tid] =
+				util::PRMT(a, b, 0x6240);
+			p[tid + 32] =
+				util::PRMT(a, b, 0x7351);
+		}
+
+		__syncthreads();
+
+		// Extract the local ranks of each key
+		IterateTileElements<0, 0>::ComputeRanks(cta, this, shift_bytes);
+	}
+
+
+
+	//---------------------------------------------------------------------
+	// IterateElements Structures
+	//---------------------------------------------------------------------
+
+	/**
+	 * Iterate next tile element
+	 */
+	template <int ELEMENT, int dummy = 0>
+	struct IterateElements
+	{
+		// GatherDecodeKeys
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void GatherDecodeKeys(Cta *cta, Tile *tile)
+		{
+			const int LOAD_OFFSET = (ELEMENT * KernelPolicy::THREADS) + ((ELEMENT * KernelPolicy::THREADS) >> 5);
+
+			KeyType *linear_keys = (KeyType *) tile->keys;
+
+			linear_keys[ELEMENT] = cta->offset[LOAD_OFFSET];
+			KeyType next_key = cta->next_offset[LOAD_OFFSET];
+
+			tile->bins[ELEMENT] = util::BFE(
+				linear_keys[ELEMENT],
+				KernelPolicy::CURRENT_BIT,
+				KernelPolicy::LOG_BINS);
+
+			int2 item;	// (inclusive for bins[element], next bin)
+			item.x = threadIdx.x + (ELEMENT * KernelPolicy::THREADS);
+			item.y = ((ELEMENT == TILE_ELEMENTS_PER_THREAD - 1) && (threadIdx.x == KernelPolicy::THREADS - 1)) ?
+				KernelPolicy::BINS :						// invalid bin
+				util::BFE(
+					next_key,
+					KernelPolicy::CURRENT_BIT,
+					KernelPolicy::LOG_BINS);
+
+			if (tile->bins[ELEMENT] != item.y) {
+				cta->smem_storage.bin_in_prefixes[tile->bins[ELEMENT]] = item;
+			}
+
+			IterateElements<ELEMENT + 1>::GatherDecodeKeys(cta, tile);
+		}
+
+		// ScatterKeysToGlobal
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void ScatterKeysToGlobal(
+			Cta *cta,
+			Tile *tile,
+			const SizeT &guarded_elements)
+		{
+			KeyType *linear_keys = (KeyType *) tile->keys;
+
+			int bin_carry = cta->smem_storage.bin_carry[tile->bins[ELEMENT]];
+			int tile_element = threadIdx.x + (ELEMENT * KernelPolicy::THREADS);
+/*
+			printf("\tTid %d scattering key[%d](%d) with bin_carry(%d) to offset %d\n",
+				threadIdx.x,
+				ELEMENT,
+				linear_keys[ELEMENT],
+				bin_carry,
+				threadIdx.x + (KernelPolicy::THREADS * ELEMENT) + bin_carry);
+*/
+			if ((guarded_elements >= KernelPolicy::TILE_ELEMENTS) || (tile_element < guarded_elements)) {
+
+				util::io::ModifiedStore<KernelPolicy::WRITE_MODIFIER>::St(
+					linear_keys[ELEMENT],
+					cta->d_out_keys + threadIdx.x + (KernelPolicy::THREADS * ELEMENT) + bin_carry);
+			}
+
+			IterateElements<ELEMENT + 1>::ScatterKeysToGlobal(cta, tile, guarded_elements);
+		}
+	};
+
+
+	/**
+	 * Terminate iteration
+	 */
+	template <int dummy>
+	struct IterateElements<TILE_ELEMENTS_PER_THREAD, dummy>
+	{
+		// GatherDecodeKeys
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void GatherDecodeKeys(
+			Cta *cta, Tile *tile) {}
+
+		// ScatterKeysToGlobal
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void ScatterKeysToGlobal(
+			Cta *cta, Tile *tile, const SizeT &guarded_elements) {}
+	};
+
+
+
+	//---------------------------------------------------------------------
+	// Partition/scattering specializations
+	//---------------------------------------------------------------------
+
+
+	/**
+	 * Specialized for two-phase scatter, keys-only
+	 */
+	template <ScatterStrategy SCATTER_STRATEGY>
+	struct PartitionTile
+	{
+		template <typename Cta, typename Tile>
+		static __device__ __forceinline__ void Invoke(
+			SizeT pack_offset,
+			const SizeT &guarded_elements,
+			Cta *cta,
+			Tile *tile)
+		{
+			// Load keys
+//			tile->LoadKeys(cta, cta_offset, guarded_elements);
+
+			typedef typename util::VecType<KeyType, KernelPolicy::PACK_SIZE>::Type VectorType;
+			VectorType (*vectors)[PACKS_PER_LOAD] = (VectorType (*)[PACKS_PER_LOAD]) tile->keys;
+
+			#pragma unroll
+			for (int LOAD = 0; LOAD < KernelPolicy::LOADS_PER_TILE; LOAD++) {
+
+				#pragma unroll
+				for (int PACK = 0; PACK < PACKS_PER_LOAD; PACK++) {
+
+					vectors[LOAD][PACK] = tex1Dfetch(
+						KeysTex<VectorType>::ref,
+						pack_offset + (threadIdx.x * PACKS_PER_LOAD) + (LOAD * KernelPolicy::THREADS * PACKS_PER_LOAD) + PACK);
+				}
+			}
+
+			// Scan tile
+			tile->ScanTile(cta, KernelPolicy::CURRENT_BIT, true);
+
+			__syncthreads();
+
+			// Scatter keys to smem by local rank
+			#pragma unroll
+			for (int LOAD = 0; LOAD < KernelPolicy::LOADS_PER_TILE; LOAD++) {
+
+				#pragma unroll
+				for (int VEC = 0; VEC < LOAD_VEC_SIZE; VEC++) {
+
+					char * ptr = (char *) cta->smem_storage.key_exchange;
+					KeyType * ptr_key = (KeyType *)(ptr + tile->local_ranks[LOAD][VEC]);
+
+					*ptr_key = tile->keys[LOAD][VEC];
+				}
+			}
+/*
+			__syncthreads();
+
+			// Gather keys from smem (strided)
+			#pragma unroll
+			for (int LOAD = 0; LOAD < KernelPolicy::LOADS_PER_TILE; LOAD++) {
+
+				#pragma unroll
+				for (int VEC = 0; VEC < LOAD_VEC_SIZE; VEC++) {
+
+					const int LOAD_IDX = LOAD * LOAD_VEC_SIZE * KernelPolicy::THREADS;
+					const int LOAD_OFFSET = LOAD_IDX + (LOAD_IDX >> CUB_MAX(5, KernelPolicy::LOG_LOAD_VEC_SIZE));
+
+					tile->keys[LOAD][VEC] = cta->smem_storage.key_exchange[
+						(threadIdx.x * LOAD_VEC_SIZE) +
+						((threadIdx.x * LOAD_VEC_SIZE) >> CUB_MAX(5, KernelPolicy::LOG_LOAD_VEC_SIZE)) +
+						LOAD_OFFSET +
+						VEC];
+				}
+			}
+
+			__syncthreads();
+
+			// Scan tile
+			tile->ScanTile(cta, KernelPolicy::CURRENT_BIT + KernelPolicy::LOG_SCAN_BINS, true);
+
+			__syncthreads();
+
+			// Scatter keys to smem by local rank
+			#pragma unroll
+			for (int LOAD = 0; LOAD < KernelPolicy::LOADS_PER_TILE; LOAD++) {
+
+				#pragma unroll
+				for (int VEC = 0; VEC < LOAD_VEC_SIZE; VEC++) {
+
+					char * ptr = (char *) cta->smem_storage.key_exchange;
+					KeyType * ptr_key = (KeyType *)(ptr + tile->local_ranks[LOAD][VEC]);
+
+					*ptr_key = tile->keys[LOAD][VEC];
+				}
+			}
+*/
+			__syncthreads();
+
+			// Gather keys linearly from smem (also saves off bin in/exclusives)
+			IterateElements<0>::GatherDecodeKeys(cta, tile);
+
+			__syncthreads();
+
+			if (threadIdx.x < KernelPolicy::BINS) {
+
+				// Put exclusive count into corresponding bin
+				int2 item = cta->smem_storage.bin_in_prefixes[threadIdx.x];
+				int bin_inclusive = item.x + 1;
+				cta->smem_storage.bin_ex_prefixes[item.y] = bin_inclusive;
+
+				// Restricted barrier
+				util::BAR(KernelPolicy::BINS);
+
+				int bin_exclusive = cta->smem_storage.bin_ex_prefixes[threadIdx.x];
+
+				cta->my_bin_carry -= bin_exclusive;
+				cta->smem_storage.bin_carry[threadIdx.x] = cta->my_bin_carry;
+				cta->my_bin_carry += bin_inclusive;
+
+				item.x = -1;
+				item.y = KernelPolicy::BINS;
+				cta->smem_storage.bin_in_prefixes[threadIdx.x] = item;
+/*
+				printf("bin %d bin_inclusive %d bin_exclusive %d my_bin_carry %d\n",
+					threadIdx.x, bin_inclusive, bin_exclusive, cta->my_bin_carry);
+*/
+			}
+
+			__syncthreads();
+
+			// Scatter keys to global bin partitions
+			IterateElements<0>::ScatterKeysToGlobal(cta, tile, guarded_elements);
+
+		}
+	};
+
+
+
+
+
+	//---------------------------------------------------------------------
+	// Interface
+	//---------------------------------------------------------------------
+
+	/**
+	 * Loads, decodes, and scatters a tile into global partitions
+	 */
+	template <typename Cta>
+	__device__ __forceinline__ void Partition(
+		SizeT pack_offset,
+		const SizeT &guarded_elements,
+		Cta *cta)
+	{
+		PartitionTile<KernelPolicy::SCATTER_STRATEGY>::Invoke(
+			pack_offset,
+			guarded_elements,
+			cta,
+			(Dispatch *) this);
+
+	}
+
+};
+
+
+} // namespace downsweep
+} // namespace partition
+} // namespace b40c
+
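
ExtractRanks and ScanTile above lean heavily on util::PRMT, a thin wrapper over the PTX prmt.b32 byte-permute instruction, with selectors such as 0x4240, 0x5140 and 0x7362 used to pull the packed byte and short counters apart. As a rough guide to those selector constants, a host-side emulation of the instruction's default mode might look like the following (an illustration of the semantics only, not code from this commit or from b40c):

#include <cstdint>
#include <cstdio>

// Emulates prmt.b32 d, a, b, c (default mode): the bytes of {b, a} are
// numbered 0..7 with 'a' supplying bytes 0..3; each selector nibble picks
// one source byte, and a set msb replicates that byte's sign bit instead.
uint32_t prmt(uint32_t a, uint32_t b, uint32_t selector) {
  const uint64_t src = (static_cast<uint64_t>(b) << 32) | a;
  uint32_t result = 0;
  for (int i = 0; i < 4; ++i) {
    const uint32_t sel  = (selector >> (4 * i)) & 0xF;
    uint8_t        byte = (src >> (8 * (sel & 0x7))) & 0xFF;
    if (sel & 0x8) byte = (byte & 0x80) ? 0xFF : 0x00;  // sign replication
    result |= static_cast<uint32_t>(byte) << (8 * i);
  }
  return result;
}

int main() {
  // Selector 0x4240 spreads bytes 0 and 2 of 'a' into the two 16-bit halves,
  // padding each half with byte 4 (zero here, since b == 0).
  printf("%08x\n", prmt(0x44332211u, 0x00000000u, 0x4240));  // prints 00330011
  return 0;
}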

gpu/include/b40c/radix_sort/downsweep/cta.cuh

+/******************************************************************************
+ * 
+ * Copyright 2010-2012 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ ******************************************************************************/
+
+/******************************************************************************
+ * CTA-processing functionality for radix sort downsweep scan kernels
+ ******************************************************************************/
+
+#pragma once
+
+#include <b40c/util/basic_utils.cuh>
+#include <b40c/util/cta_work_distribution.cuh>
+#include <b40c/util/tex_vector.cuh>
+#include <b40c/util/reduction/serial_reduce.cuh>
+#include <b40c/util/scan/serial_scan.cuh>
+#include <b40c/util/io/load_tile.cuh>
+#include <b40c/util/io/scatter_tile.cuh>
+
+#include <b40c/radix_sort/sort_utils.cuh>
+
+#include <b40c/radix_sort/downsweep/kernel_policy.cuh>
+#include <b40c/radix_sort/downsweep/tex_ref.cuh>
+
+#include <b40c/radix_sort/spine/tex_ref.cuh>
+
+namespace b40c {
+namespace radix_sort {
+namespace downsweep {
+
+
+/**
+ * Partitioning downsweep scan CTA
+ */
+template <
+	typename KernelPolicy,
+	typename SizeT,
+	typename KeyType,
+	typename ValueType>
+struct Cta
+{
+	//---------------------------------------------------------------------
+	// Type definitions and Constants
+	//---------------------------------------------------------------------
+
+	// Integer type for digit counters (to be packed in the RakingPartial type defined below)
+	typedef unsigned short Counter;
+
+	// Integer type for raking partials (packed counters).
+	typedef typename util::If<
+		(KernelPolicy::SMEM_8BYTE_BANKS),
+		unsigned long long,
+		unsigned int>::Type RakingPartial;
+
+	enum {
+		CURRENT_BIT 				= KernelPolicy::CURRENT_BIT,
+		CURRENT_PASS 				= KernelPolicy::CURRENT_PASS,
+		FLOP_TURN					= KernelPolicy::CURRENT_PASS & 0x1,					// (FLOP_TURN) ? (d_keys1 --> d_keys0) : (d_keys0 --> d_keys1)
+		KEYS_ONLY 					= util::Equals<ValueType, util::NullType>::VALUE,
+		BANK_PADDING 				= 1,												// Whether or not to insert padding for exchanging keys
+
+		RADIX_BITS					= KernelPolicy::RADIX_BITS,
+		RADIX_DIGITS 				= 1 << RADIX_BITS,
+
+		LOG_THREADS 				= KernelPolicy::LOG_THREADS,
+		THREADS						= 1 << LOG_THREADS,
+
+		LOG_WARP_THREADS 			= CUB_LOG_WARP_THREADS(__CUB_CUDA_ARCH__),
+		WARP_THREADS				= 1 << LOG_WARP_THREADS,
+
+		LOG_WARPS					= LOG_THREADS - LOG_WARP_THREADS,
+		WARPS						= 1 << LOG_WARPS,
+
+		LOG_THREAD_ELEMENTS 		= KernelPolicy::LOG_THREAD_ELEMENTS,
+		THREAD_ELEMENTS				= 1 << LOG_THREAD_ELEMENTS,
+
+		LOG_TILE_ELEMENTS			= LOG_THREADS + LOG_THREAD_ELEMENTS,
+		TILE_ELEMENTS				= 1 << LOG_TILE_ELEMENTS,
+
+		PACKED_COUNTERS				= sizeof(RakingPartial) / sizeof(Counter),
+		LOG_PACKED_COUNTERS			= util::Log2<PACKED_COUNTERS>::VALUE,
+
+		LOG_SCAN_LANES				= CUB_MAX((RADIX_BITS - LOG_PACKED_COUNTERS), 0),				// Always at least one lane
+		SCAN_LANES					= 1 << LOG_SCAN_LANES,
+
+		LOG_SCAN_ELEMENTS			= LOG_SCAN_LANES + LOG_THREADS,
+		SCAN_ELEMENTS				= 1 << LOG_SCAN_ELEMENTS,
+
+		LOG_BASE_RAKING_SEG			= LOG_SCAN_ELEMENTS - LOG_THREADS,
+		PADDED_RAKING_SEG			= (1 << LOG_BASE_RAKING_SEG) + 1,
+
+		LOG_MEM_BANKS				= CUB_LOG_MEM_BANKS(__CUB_CUDA_ARCH__),
+		MEM_BANKS					= 1 << LOG_MEM_BANKS,
+
+		DIGITS_PER_SCATTER_PASS 	= THREADS / MEM_BANKS,
+		SCATTER_PASSES 				= RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
+
+		LOG_STORE_TXN_THREADS 		= LOG_MEM_BANKS,
+		STORE_TXN_THREADS 			= 1 << LOG_STORE_TXN_THREADS,
+
+		BYTES_PER_COUNTER			= sizeof(Counter),
+		LOG_BYTES_PER_COUNTER		= util::Log2<BYTES_PER_COUNTER>::VALUE,
+
+		ELEMENTS_PER_TEX			= Textures<
+										KeyType,
+										ValueType,
+										THREAD_ELEMENTS>::ELEMENTS_PER_TEX,
+
+		THREAD_TEX_LOADS	 		= THREAD_ELEMENTS / ELEMENTS_PER_TEX,
+
+		TILE_TEX_LOADS				= THREADS * THREAD_TEX_LOADS,
+	};
+
+	static const util::io::ld::CacheModifier 	READ_MODIFIER 		= KernelPolicy::READ_MODIFIER;
+	static const util::io::st::CacheModifier 	WRITE_MODIFIER 		= KernelPolicy::WRITE_MODIFIER;
+	static const ScatterStrategy 				SCATTER_STRATEGY 	= KernelPolicy::SCATTER_STRATEGY;
+
+	// Key texture type
+	typedef typename Textures<
+		KeyType,
+		ValueType,
+		THREAD_ELEMENTS>::KeyTexType KeyTexType;
+
+	// Value texture type
+	typedef typename Textures<
+		KeyType,
+		ValueType,
+		THREAD_ELEMENTS>::ValueTexType ValueTexType;
+
+
+	/**
+	 * Shared memory storage layout
+	 */
+	struct SmemStorage
+	{
+		SizeT							tex_offset;
+		SizeT							tex_offset_limit;
+
+		bool 							non_trivial_pass;
+		util::CtaWorkLimits<SizeT> 		work_limits;
+
+		SizeT 							base_digit_offset[RADIX_DIGITS];
+
+		// Storage for scanning local ranks
+		volatile RakingPartial			warpscan[WARPS][WARP_THREADS * 3 / 2];
+
+		struct {
+			int4						align_padding;
+			union {
+				Counter					packed_counters[SCAN_LANES + 1][THREADS][PACKED_COUNTERS];
+				RakingPartial			raking_grid[THREADS][PADDED_RAKING_SEG];
+				KeyType 				key_exchange[TILE_ELEMENTS + (TILE_ELEMENTS >> LOG_MEM_BANKS)];
+				ValueType 				value_exchange[TILE_ELEMENTS + (TILE_ELEMENTS >> LOG_MEM_BANKS)];
+			};
+		};
+	};
+
+
+	/**
+	 * Tile state
+	 */
+	struct Tile
+	{
+		KeyType 			keys[THREAD_ELEMENTS];
+		ValueType 			values[THREAD_ELEMENTS];
+		Counter				thread_prefixes[THREAD_ELEMENTS];
+		int 				ranks[THREAD_ELEMENTS];
+
+		unsigned int 		counter_offsets[THREAD_ELEMENTS];
+
+		SizeT				bin_offsets[THREAD_ELEMENTS];
+	};
+
+
+	//---------------------------------------------------------------------
+	// Members
+	//---------------------------------------------------------------------
+
+	// Shared storage for this CTA
+	SmemStorage 						&smem_storage;
+
+	KeyType								*d_keys0;
+	KeyType								*d_keys1;
+
+	ValueType							*d_values0;
+	ValueType							*d_values1;
+
+	RakingPartial						*raking_segment;
+	Counter								*bin_counter;
+
+	SizeT								my_base_digit_offset;
+
+	int 								warp_id;
+	volatile RakingPartial				*warpscan;
+
+
+	//---------------------------------------------------------------------
+	// Helper structure for tile elements iteration
+	//---------------------------------------------------------------------
+
+	/**
+	 * Iterate next vector element
+	 */
+	template <int VEC, int DUMMY = 0>
+	struct IterateTileElements
+	{
+		// DecodeKeys
+		static __device__ __forceinline__ void DecodeKeys(Cta &cta,	Tile &tile)
+		{
+			// Compute byte offset of smem counter.  Add in thread column.
+			tile.counter_offsets[VEC] = (threadIdx.x << (LOG_PACKED_COUNTERS + LOG_BYTES_PER_COUNTER));
+
+			// Add in sub-counter offset
+			tile.counter_offsets[VEC] = Extract<
+				KeyType,
+				CURRENT_BIT + LOG_SCAN_LANES,
+				LOG_PACKED_COUNTERS,
+				LOG_BYTES_PER_COUNTER>::SuperBFE(
+					tile.keys[VEC],
+					tile.counter_offsets[VEC]);
+
+			// Add in row offset
+			tile.counter_offsets[VEC] = Extract<
+				KeyType,
+				CURRENT_BIT,
+				LOG_SCAN_LANES,
+				LOG_THREADS + LOG_PACKED_COUNTERS + LOG_BYTES_PER_COUNTER>::SuperBFE(
+					tile.keys[VEC],
+					tile.counter_offsets[VEC]);
+
+			Counter* counter = (Counter*)
+				(((unsigned char *) cta.smem_storage.packed_counters) + tile.counter_offsets[VEC]);
+
+			// Load thread-exclusive prefix
+			tile.thread_prefixes[VEC] = *counter;
+
+			// Store inclusive prefix
+			*counter = tile.thread_prefixes[VEC] + 1;
+
+			// Next vector element
+			IterateTileElements<VEC + 1>::DecodeKeys(cta, tile);
+		}
+
+
+		// ComputeLocalRanks
+		static __device__ __forceinline__ void ComputeLocalRanks(Cta &cta, Tile &tile)
+		{
+			Counter* counter = (Counter*)
+				(((unsigned char *) cta.smem_storage.packed_counters) + tile.counter_offsets[VEC]);
+
+			// Add in CTA exclusive prefix
+			tile.ranks[VEC] = tile.thread_prefixes[VEC] + *counter;
+
+			// Next vector element