More efficient __shfl_*() intrinsics for HIP on AMD GPUs (CDNA1 and CDNA2 at least)

by Nathanaël Schaeffer

/* As of January 2023, ROCm 5.2 compiles the __shfl_*() HIP intrinsics to the ds_bpermute_b32 instruction.
 * This is sub-optimal in many cases. The DPP modifier is the most efficient way to exchange data between
 * lanes, but it covers only a limited number of cases (arbitrary exchange between 4 adjacent lanes, plus a few others).
 * The ds_swizzle_b32 instruction comes next: it is a little more efficient than ds_bpermute_b32 because it does
 * not need extra instructions to compute the source lanes. Some cases can only be handled by ds_bpermute_b32.
 * The following functions select the most efficient way to exchange data. Note that the shfl_down_ function below
 * is not 100% compatible with nvidia's __shfl_down: the boundaries (when accessing lanes outside the group)
 * are undefined here, while nvidia's __shfl_down specifies that those lanes keep their own value. This undefined
 * behaviour is most likely fine in most cases (e.g. reductions, where the out-of-bounds lanes are discarded anyway).
 * As the intrinsics used here exist only for 32-bit data, two of them must be combined for 64-bit data, as is done
 * in the shfl_xor_() overload for double below.
 * For reference on ds_bpermute_b32, ds_swizzle_b32 and the DPP modifier, see:
 * 		https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations/
 * 		https://developer.amd.com/wp-content/resources/CDNA2_Shader_ISA_4February2022.pdf
 */

// better shfl_xor operating on 32bit registers only
template <unsigned XOR_MASK>
inline __device__ int shfl_xor_b32(int v)
{
	if (XOR_MASK==0) return v;
	else if (XOR_MASK<4) {
		return __builtin_amdgcn_mov_dpp(v, (0^XOR_MASK) | ((1^XOR_MASK)<<2) | ((2^XOR_MASK)<<4) | ((3^XOR_MASK)<<6),
			0xF, 0xF, 1);
	} else if (XOR_MASK==0x8) {
		return __builtin_amdgcn_mov_dpp(v, 0x128, 0xF, 0xF, 1);		// row rotate right by 8 threads within row (group of 16)
	} else if (XOR_MASK==0xF) {
		return __builtin_amdgcn_mov_dpp(v, 0x140, 0xF, 0xF, 1);		// reverse within row (group of 16)
	} else if (XOR_MASK==0x7) {
		return __builtin_amdgcn_mov_dpp(v, 0x141, 0xF, 0xF, 1);		// reverse within half-row (group of 8)
	} else if (XOR_MASK<32) {
		// ds_swizzle_b32: xor_mask is encoded into instruction, saves instructions compared to next case
		return __builtin_amdgcn_ds_swizzle(v, (XOR_MASK << 10) | 31);
	} else
		return __builtin_amdgcn_ds_bpermute((threadIdx.x ^ XOR_MASK)*4, v);
	//	return __shfl_xor(v,XOR_MASK);		// emits ds_bpermute_b32, with many instructions to compute the source lanes.
}
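
// Example sketch: a full-wavefront sum reduction built on shfl_xor_b32 (the helper name
// wave_reduce_sum is illustrative, not part of the code above). Since the XOR mask is a
// template parameter, the butterfly is written out instead of looping over a runtime mask.
inline __device__ int wave_reduce_sum(int x)
{
	x += shfl_xor_b32<1>(x);
	x += shfl_xor_b32<2>(x);
	x += shfl_xor_b32<4>(x);
	x += shfl_xor_b32<8>(x);
	x += shfl_xor_b32<16>(x);
	x += shfl_xor_b32<32>(x);
	return x;	// every lane of the 64-lane wavefront now holds the total sum
}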

// better broadcast operating on 32bit registers only. NGROUP must be a power of 2.
template <unsigned LANE_ID, unsigned NGROUP=64>
inline __device__ int broadcast_b32(int v)
{
	static_assert(LANE_ID < NGROUP, "LANE_ID must be less than NGROUP.");
	if (NGROUP==1) return v;
	else if (NGROUP<=4) {		// NGROUP==2 or 4
		return __builtin_amdgcn_mov_dpp(v, (LANE_ID) | ((LANE_ID)<<2) | ((LANE_ID+4-NGROUP)<<4) | ((LANE_ID+4-NGROUP)<<6),
			0xF, 0xF, 1);
#ifdef __gfx90a__
	} else if (NGROUP==16) {
		return __builtin_amdgcn_mov_dpp(v, 0x150 + LANE_ID, 0xF, 0xF, 1);		// broadcast within row (group of 16), only for MI200
#endif
	} else if (NGROUP<=32) {
		// ds_swizzle_b32: broadcast lane encoded into instruction, saves instructions compared to next case
		return __builtin_amdgcn_ds_swizzle(v, (LANE_ID << 5) | (32-NGROUP));
	} else if (NGROUP==64) {
		// TODO: compare efficiency of this "readlane" vs the default "bpermute"
		return __builtin_amdgcn_readlane(v, LANE_ID);	// return scalar register available to all lanes.
	} else
		return __shfl(v,LANE_ID, NGROUP);		// emits ds_bpermute_b32, which is fine for a broadcast
}
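
// Example sketch: every lane of a 16-lane group reads the value held by the first lane of
// its group (the 16-lane grouping and the helper name group16_leader are illustrative only).
inline __device__ int group16_leader(int x)
{
	return broadcast_b32<0, 16>(x);		// lanes 0..15 get lane 0's value, lanes 16..31 get lane 16's, etc.
}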

// better shfl_down operating on 32bit registers only. NGROUP must be a power of 2.
// WARNING: lanes that read outside the group get an undefined value (NOT like nvidia cuda __shfl_down, which specifies that those lanes keep their own value)
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ int shfl_down_b32(int v)
{
	static_assert(NSHIFT < NGROUP, "NSHIFT must be less than NGROUP.");
	if ((NGROUP==1) || (NSHIFT==0)) return v;
	else if (NGROUP<=4) {
		return __builtin_amdgcn_mov_dpp(v, NSHIFT | (((NSHIFT<3) ? 1+NSHIFT : 1) <<2) | (3<<4) | (3<<6),
			0xF, 0xF, 1);
	} else if (NGROUP<=16) {	// shift crosses group boundary for NGROUP==8
		return __builtin_amdgcn_mov_dpp(v, 0x100 | NSHIFT, 0xF, 0xF, 0);
	} else if ((NGROUP<=64) && (NSHIFT==1)) {
		return __builtin_amdgcn_mov_dpp(v, 0x130, 0xF, 0xF, 0);		// shift crosses group boundary for NGROUP<64
	} else if (NGROUP==32) {
		// ds_swizzle_b32 in rotate mode: upper lanes are filled with lower lanes
		return __builtin_amdgcn_ds_swizzle(v, 0xC000 | (NSHIFT << 5));
	} else
		return __builtin_amdgcn_ds_bpermute((threadIdx.x + NSHIFT)*4, v);	// rotate: fill upper lanes with lower ones
	//return __shfl_down(v,NSHIFT,NGROUP);		// emits ds_bpermute_b32, with many instructions to compute the lanes exactly as cuda __shfl_down() does.
}
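
// Example sketch: tree reduction over groups of 16 lanes built on shfl_down_b32 (the helper
// name group16_sum is illustrative). Because out-of-group sources are undefined here, only
// lane 0 of each group holds the full sum at the end; combine with broadcast_b32<0,16> if
// all lanes need it. This is exactly the kind of use where the undefined boundary behaviour
// does not matter.
inline __device__ int group16_sum(int x)
{
	x += shfl_down_b32<8, 16>(x);
	x += shfl_down_b32<4, 16>(x);
	x += shfl_down_b32<2, 16>(x);
	x += shfl_down_b32<1, 16>(x);
	return x;	// valid in lane 0 of each 16-lane group only
}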

// better shfl_up operating on 32bit registers only. NGROUP must be a power of 2.
// WARNING: lanes that read outside the group get an undefined value (NOT like nvidia cuda __shfl_up, which specifies that those lanes keep their own value)
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ int shfl_up_b32(int v)
{
	static_assert(NSHIFT < NGROUP, "NSHIFT must be less than NGROUP.");
	if ((NGROUP==1) || (NSHIFT==0)) return v;
	else if (NGROUP<=4) {
		return __builtin_amdgcn_mov_dpp(v, 0 | 0 | (((NSHIFT<3) ? 2-NSHIFT : 2) <<4) | ((3-NSHIFT)<<6),
			0xF, 0xF, 1);
	} else if (NGROUP<=16) {	// shift crosses group boundary for NGROUP==8
		return __builtin_amdgcn_mov_dpp(v, 0x110 | NSHIFT, 0xF, 0xF, 0);
	} else if ((NGROUP<=64) && (NSHIFT==1)) {
		return __builtin_amdgcn_mov_dpp(v, 0x138, 0xF, 0xF, 0);		// shift crosses group boundary for NGROUP<64
	} else if (NGROUP==32) {
		// ds_swizzle_b32 in rotate mode: lower lanes are filled with upper lanes
		return __builtin_amdgcn_ds_swizzle(v, 0xC400 | (NSHIFT << 5));
	} else
		return __builtin_amdgcn_ds_bpermute((threadIdx.x - NSHIFT)*4, v);	// rotate: fill lower lanes with upper ones
	//return __shfl_up(v,NSHIFT,NGROUP);		// emits ds_bpermute_b32, with many instructions to compute the lanes exactly as cuda __shfl_up() does.
}
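
// Example sketch: inclusive prefix sum over the 64-lane wavefront built on shfl_up_b32 (the
// helper name wave_scan_sum is illustrative). Since out-of-group sources are undefined here
// (unlike cuda __shfl_up, where such lanes keep their own value), each step must mask out
// lanes whose source would fall below lane 0.
inline __device__ int wave_scan_sum(int x)
{
	const unsigned lane = threadIdx.x & 63;
	int t;
	t = shfl_up_b32<1>(x);	if (lane >= 1)  x += t;
	t = shfl_up_b32<2>(x);	if (lane >= 2)  x += t;
	t = shfl_up_b32<4>(x);	if (lane >= 4)  x += t;
	t = shfl_up_b32<8>(x);	if (lane >= 8)  x += t;
	t = shfl_up_b32<16>(x);	if (lane >= 16) x += t;
	t = shfl_up_b32<32>(x);	if (lane >= 32) x += t;
	return x;	// lane i now holds the sum of lanes 0..i
}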


template <unsigned XOR_MASK>
inline __device__ double shfl_xor_(double v) {
	union {double d; int i[2];};		// allow access to the 2 words forming the double separately
	d = v;
	i[0] = shfl_xor_b32<XOR_MASK>(i[0]);		// shuffle low word
	i[1] = shfl_xor_b32<XOR_MASK>(i[1]);		// shuffle high word
	return d;
}

template <unsigned XOR_MASK>
inline __device__ float shfl_xor_(float v) {
	return __int_as_float( shfl_xor_b32<XOR_MASK>( __float_as_int(v) ) );
}

template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ float shfl_down_(float v) {
	return __int_as_float( shfl_down_b32<NSHIFT, NGROUP>( __float_as_int(v) ) );
}
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ int shfl_down_(int v) {
	return shfl_down_b32<NSHIFT, NGROUP>( v );
}
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ double shfl_down_(double v)
{
	union {double d; int i[2];};		// allow access to the 2 words forming the double separately
	d = v;
	i[0] = shfl_down_b32<NSHIFT, NGROUP>(i[0]);		// shuffle first half
	i[1] = shfl_down_b32<NSHIFT, NGROUP>(i[1]);		// and second half
	return d;
}
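
// Example sketch: the same 16-lane tree reduction as above, but on doubles via the
// shfl_down_ overload (each step is two 32-bit shuffles). The name is illustrative.
inline __device__ double group16_sum(double x)
{
	x += shfl_down_<8, 16>(x);
	x += shfl_down_<4, 16>(x);
	x += shfl_down_<2, 16>(x);
	x += shfl_down_<1, 16>(x);
	return x;	// valid in lane 0 of each 16-lane group only
}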

template <unsigned LANE_ID, unsigned NGROUP=64, class T>
inline __device__ T bcast_(T v) {
	const int NINT = (sizeof(T)+3)/4;
	union {T d; int i[NINT];};		// allow access to the 32-bit words forming T separately
	d = v;
	for (int k=0; k<NINT; k++)
		i[k] = broadcast_b32<LANE_ID, NGROUP>(i[k]);		// broadcast each 32-bit word
	return d;
}
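
// Example sketch: broadcast a double from lane 5 of each 32-lane group using bcast_
// (two 32-bit broadcasts under the hood). The lane, group size and name are illustrative.
inline __device__ double bcast_example(double x)
{
	return bcast_<5, 32>(x);
}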

#undef shfl_xor
#define shfl_xor(v,xor_mask) shfl_xor_<xor_mask>(v)
#undef shfl
#define shfl(v,lane) broadcast_b32<lane>(v)
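
// Usage note: with the macros above, existing code such as
//     d += shfl_xor(d, 8);	// d is a float or double: expands to shfl_xor_<8>(d)
//     n = shfl(n, 0);		// n is an int: expands to broadcast_b32<0>(n)
// keeps compiling unchanged, provided the mask / lane argument is a compile-time
// constant (it becomes a template parameter here).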
