More efficient __shfl_*() intrinsics for HIP on AMD GPUs (CDNA1 and CDNA2 at least)
by Nathanaël Schaeffer
/* As of January 2023, ROCm 5.2 compiles the __shfl_*() HIP intrinsics to the ds_bpermute_b32 instruction.
 * This is sub-optimal in many cases. The DPP modifier is the most efficient way to exchange data between
 * lanes, but it covers only a limited number of cases (arbitrary exchange between 4 adjacent lanes, plus a few others).
 * ds_swizzle_b32 comes next: it is a little more efficient than ds_bpermute_b32 because it needs no extra
 * instructions to compute the source lanes. Some cases can only be handled by ds_bpermute_b32.
 * The following functions select the most efficient way to exchange data. Note that the shfl_down_ function below
 * is not 100% compatible with nvidia's __shfl_down: here the boundaries (lanes that would read outside the group)
 * get undefined values, while nvidia's __shfl_down specifies that those lanes are left unchanged. This undefined
 * behaviour is OK in most cases, e.g. reductions, where the results of the affected lanes are never consumed.
 * As the instructions used here operate on 32-bit data only, two of them must be issued for 64-bit data, as is
 * done in the shfl_xor_() function below.
 * For reference on ds_bpermute_b32, ds_swizzle_b32 and the DPP modifier, see:
 * https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations/
 * https://developer.amd.com/wp-content/resources/CDNA2_Shader_ISA_4February2022.pdf
 */
// better shfl_xor operating on 32bit registers only
template <unsigned XOR_MASK>
inline __device__ int shfl_xor_b32(int v)
{
if (XOR_MASK==0) return v;
else if (XOR_MASK<4) {
return __builtin_amdgcn_mov_dpp(v, (0^XOR_MASK) | ((1^XOR_MASK)<<2) | ((2^XOR_MASK)<<4) | ((3^XOR_MASK)<<6),
0xF, 0xF, 1);
} else if (XOR_MASK==0x8) {
return __builtin_amdgcn_mov_dpp(v, 0x128, 0xF, 0xF, 1); // row rotate right by 8 threads within row (group of 16)
} else if (XOR_MASK==0xF) {
return __builtin_amdgcn_mov_dpp(v, 0x140, 0xF, 0xF, 1); // reverse within row (group of 16)
} else if (XOR_MASK==0x7) {
return __builtin_amdgcn_mov_dpp(v, 0x141, 0xF, 0xF, 1); // reverse within half-row (group of 8)
} else if (XOR_MASK<32) {
// ds_swizzle_b32: xor_mask is encoded into instruction, saves instructions compared to next case
return __builtin_amdgcn_ds_swizzle(v, (XOR_MASK << 10) | 31);
} else
return __builtin_amdgcn_ds_bpermute((threadIdx.x ^ XOR_MASK)*4, v);
	// return __shfl_xor(v,XOR_MASK);	// would emit ds_bpermute_b32, plus many instructions to compute the source lanes.
}
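// Example (a minimal sketch, not part of the original snippet; the function name is
// illustrative): a full-wavefront butterfly sum built on shfl_xor_b32. XOR exchanges
// are symmetric, so every lane ends up with the sum of all 64 lanes and no boundary
// issue can arise. The comments note which instruction each step compiles to.
inline __device__ int wave_sum_xor_example(int v)
{
	v += shfl_xor_b32<32>(v);	// ds_bpermute_b32
	v += shfl_xor_b32<16>(v);	// ds_swizzle_b32
	v += shfl_xor_b32<8>(v);	// DPP row rotate
	v += shfl_xor_b32<4>(v);	// ds_swizzle_b32
	v += shfl_xor_b32<2>(v);	// DPP quad_perm
	v += shfl_xor_b32<1>(v);	// DPP quad_perm
	return v;	// every lane holds the total
}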
// better broadcast operating on 32bit registers only. NGROUP must be a power of 2.
template <unsigned LANE_ID, unsigned NGROUP=64>
inline __device__ int broadcast_b32(int v)
{
static_assert(LANE_ID < NGROUP, "LANE_ID must be less than NGROUP.");
if (NGROUP==1) return v;
else if (NGROUP<=4) { // NGROUP==2 or 4
return __builtin_amdgcn_mov_dpp(v, (LANE_ID) | ((LANE_ID)<<2) | ((LANE_ID+4-NGROUP)<<4) | ((LANE_ID+4-NGROUP)<<6),
0xF, 0xF, 1);
#ifdef __gfx90a__
} else if (NGROUP==16) {
return __builtin_amdgcn_mov_dpp(v, 0x150 + LANE_ID, 0xF, 0xF, 1); // broadcast within row (group of 16), only for MI200
#endif
} else if (NGROUP<=32) {
// ds_swizzle_b32: broadcast lane encoded into instruction, saves instructions compared to next case
return __builtin_amdgcn_ds_swizzle(v, (LANE_ID << 5) | (32-NGROUP));
} else if (NGROUP==64) {
// TODO: compare efficiency of this "readlane" vs the default "bpermute"
return __builtin_amdgcn_readlane(v, LANE_ID); // return scalar register available to all lanes.
} else
		return __shfl(v,LANE_ID, NGROUP);	// emits ds_bpermute_b32, which is fine for a broadcast
}
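// Example (illustrative sketch, not from the original snippet; the function name is an
// assumption): make the value held by lane 0 of each group of 16 visible to the whole
// group. On gfx90a this is a single DPP row broadcast, elsewhere a single ds_swizzle_b32.
inline __device__ int group16_leader_example(int v)
{
	return broadcast_b32<0, 16>(v);	// all 16 lanes of a group read the group's lane 0
}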
// better shfl_down operating on 32bit registers only. NGROUP must be a power of 2.
// WARNING: lanes that would read outside their group get undefined values (NOT like nvidia cuda __shfl_down, which specifies that those lanes are left unchanged)
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ int shfl_down_b32(int v)
{
static_assert(NSHIFT < NGROUP, "NSHIFT must be less than NGROUP.");
if ((NGROUP==1) || (NSHIFT==0)) return v;
else if (NGROUP<=4) {
return __builtin_amdgcn_mov_dpp(v, NSHIFT | (((NSHIFT<3) ? 1+NSHIFT : 1) <<2) | (3<<4) | (3<<6),
0xF, 0xF, 1);
} else if (NGROUP<=16) { // shift crosses group boundary for NGROUP==8
return __builtin_amdgcn_mov_dpp(v, 0x100 | NSHIFT, 0xF, 0xF, 0);
} else if ((NGROUP<=64) && (NSHIFT==1)) {
return __builtin_amdgcn_mov_dpp(v, 0x130, 0xF, 0xF, 0); // shift crosses group boundary for NGROUP<64
} else if (NGROUP==32) {
// ds_swizzle_b32 in rotate mode: upper lanes are filled with lower lanes
return __builtin_amdgcn_ds_swizzle(v, 0xC000 | (NSHIFT << 5));
} else
return __builtin_amdgcn_ds_bpermute((threadIdx.x + NSHIFT)*4, v); // rotate: fill upper lanes with lower ones
	//return __shfl_down(v,NSHIFT,NGROUP);	// would emit ds_bpermute_b32, with many extra instructions to compute the lanes exactly as cuda __shfl_down() does.
}
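// Example (a minimal sketch, not part of the original snippet): a wavefront sum where
// only lane 0 consumes the result. At each step the upper lanes read wrapped-around
// garbage, but those lanes never feed into lane 0, which is exactly the situation
// where the relaxed boundary behaviour of shfl_down_b32 is harmless.
inline __device__ int wave_sum_down_example(int v)
{
	v += shfl_down_b32<32>(v);	// lanes 0..31 now hold valid partial sums
	v += shfl_down_b32<16>(v);	// lanes 0..15
	v += shfl_down_b32<8>(v);	// lanes 0..7
	v += shfl_down_b32<4>(v);
	v += shfl_down_b32<2>(v);
	v += shfl_down_b32<1>(v);
	return v;	// valid in lane 0 only
}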
// better shfl_up operating on 32bit registers only. NGROUP must be a power of 2.
// WARNING: lanes that would read outside their group get undefined values (NOT like nvidia cuda __shfl_up, which specifies that those lanes are left unchanged)
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ int shfl_up_b32(int v)
{
static_assert(NSHIFT < NGROUP, "NSHIFT must be less than NGROUP.");
if ((NGROUP==1) || (NSHIFT==0)) return v;
else if (NGROUP<=4) {
return __builtin_amdgcn_mov_dpp(v, 0 | 0 | (((NSHIFT<3) ? 2-NSHIFT : 2) <<4) | ((3-NSHIFT)<<6),
0xF, 0xF, 1);
} else if (NGROUP<=16) { // shift crosses group boundary for NGROUP==8
return __builtin_amdgcn_mov_dpp(v, 0x110 | NSHIFT, 0xF, 0xF, 0);
} else if ((NGROUP<=64) && (NSHIFT==1)) {
return __builtin_amdgcn_mov_dpp(v, 0x138, 0xF, 0xF, 0); // shift crosses group boundary for NGROUP<64
} else if (NGROUP==32) {
		// ds_swizzle_b32 in rotate mode: lower lanes are filled with upper lanes
return __builtin_amdgcn_ds_swizzle(v, 0xC400 | (NSHIFT << 5));
} else
		return __builtin_amdgcn_ds_bpermute((threadIdx.x - NSHIFT)*4, v);	// rotate: fill lower lanes with upper ones
	//return __shfl_up(v,NSHIFT,NGROUP);	// would emit ds_bpermute_b32, with many extra instructions to compute the lanes exactly as cuda __shfl_up() does.
}
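// Example (illustrative sketch, not from the original snippet): an inclusive plus-scan
// over the wavefront. Since lanes reading outside the group get undefined values here
// (no clamping as with cuda's __shfl_up), the shifted value must be discarded by a
// predicate on the lane id. A 1D block (threadIdx.x) is assumed, as elsewhere in this file.
inline __device__ int wave_scan_example(int v)
{
	const unsigned lane = threadIdx.x & 63;
	int t;
	t = shfl_up_b32<1>(v);	if (lane >= 1)  v += t;
	t = shfl_up_b32<2>(v);	if (lane >= 2)  v += t;
	t = shfl_up_b32<4>(v);	if (lane >= 4)  v += t;
	t = shfl_up_b32<8>(v);	if (lane >= 8)  v += t;
	t = shfl_up_b32<16>(v);	if (lane >= 16) v += t;
	t = shfl_up_b32<32>(v);	if (lane >= 32) v += t;
	return v;	// lane i holds the sum of lanes 0..i
}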
template <unsigned XOR_MASK>
inline __device__ double shfl_xor_(double v) {
union {double d; int i[2];}; // allow access to the 2 words forming the double separately
d = v;
	i[0] = shfl_xor_b32<XOR_MASK>(i[0]);	// shuffle low word
	i[1] = shfl_xor_b32<XOR_MASK>(i[1]);	// shuffle high word
return d;
}
template <unsigned XOR_MASK>
inline __device__ float shfl_xor_(float v) {
return __int_as_float( shfl_xor_b32<XOR_MASK>( __float_as_int(v) ) );
}
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ float shfl_down_(float v) {
return __int_as_float( shfl_down_b32<NSHIFT, NGROUP>( __float_as_int(v) ) );
}
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ int shfl_down_(int v) {
return shfl_down_b32<NSHIFT, NGROUP>( v );
}
template <unsigned NSHIFT, unsigned NGROUP=64>
inline __device__ double shfl_down_(double v)
{
union {double d; int i[2];}; // allow access to the 2 words forming the double separately
d = v;
	i[0] = shfl_down_b32<NSHIFT, NGROUP>(i[0]);	// shuffle first half
	i[1] = shfl_down_b32<NSHIFT, NGROUP>(i[1]);	// and second half
return d;
}
template <unsigned LANE_ID, unsigned NGROUP=64, class T>
inline __device__ T bcast_(T v) {
const int NINT = (sizeof(T)+3)/4;
	union {T d; int i[NINT];};	// allow access to the 32-bit words forming T separately
d = v;
for (int k=0; k<NINT; k++)
		i[k] = broadcast_b32<LANE_ID, NGROUP>(i[k]);	// broadcast each word
return d;
}
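// Example usage (hypothetical kernel, for illustration only): the typed wrappers make
// 64-bit shuffles look like the 32-bit ones; each double shuffle below lowers to two
// 32-bit exchanges.
__global__ void example_kernel(double* out, const double* in)
{
	double x = in[threadIdx.x];
	x += shfl_xor_<1>(x);	// pairwise sum of adjacent lanes: one DPP quad_perm per 32-bit word
	out[threadIdx.x] = bcast_<0>(x);	// every lane stores the value held by lane 0
}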
#undef shfl_xor
#define shfl_xor(v,xor_mask) shfl_xor_<xor_mask>(v)
#undef shfl
#define shfl(v,lane) broadcast_b32<lane>(v)	// 32-bit types only; use bcast_<lane>(v) for wider types