1. Eric Roshan Eisner
  2. sha3

Commits

Eric Roshan Eisner  committed d0ef610 Draft

block: manually inline rotate

Letting the gc compiler inline rotl lost the rotate optimization, so each rotation is now written out by hand at its call site.

benchmark          old MB/s   new MB/s   speedup
BenchmarkHash1K    48.51      80.32      1.66x
BenchmarkHash8K    48.72      80.74      1.66x

  • Participants
  • Parent commits ac5a05a
  • Branches default

Comments (0)

Files changed (2)

File block.go

View file
  • Ignore whitespace
 	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
 }
 
-func rotl(x uint64, r uint) uint64 {
-	return (x << r) | (x >> (64 - r))
-}
-
 func block(st *[25]uint64) {
 	var (
 		t, bc0, bc1, bc2, bc3, bc4 uint64
 		bc2 = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]
 		bc3 = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]
 		bc4 = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]
-		t = bc4 ^ rotl(bc1, 1)
+		t = bc4 ^ (bc1<<1 | bc1>>63)
 		st[0] ^= t
 		st[5] ^= t
 		st[10] ^= t
 		st[15] ^= t
 		st[20] ^= t
-		t = bc0 ^ rotl(bc2, 1)
+		t = bc0 ^ (bc2<<1 | bc2>>63)
 		st[1] ^= t
 		st[6] ^= t
 		st[11] ^= t
 		st[16] ^= t
 		st[21] ^= t
-		t = bc1 ^ rotl(bc3, 1)
+		t = bc1 ^ (bc3<<1 | bc3>>63)
 		st[2] ^= t
 		st[7] ^= t
 		st[12] ^= t
 		st[17] ^= t
 		st[22] ^= t
-		t = bc2 ^ rotl(bc4, 1)
+		t = bc2 ^ (bc4<<1 | bc4>>63)
 		st[3] ^= t
 		st[8] ^= t
 		st[13] ^= t
 		st[18] ^= t
 		st[23] ^= t
-		t = bc3 ^ rotl(bc0, 1)
+		t = bc3 ^ (bc0<<1 | bc0>>63)
 		st[4] ^= t
 		st[9] ^= t
 		st[14] ^= t
 		t = st[1]
 
 		bc0 = st[10]
-		st[10] = rotl(t, 1)
+		st[10] = t<<1 | t>>(64-1)
 		t = bc0
 		bc0 = st[7]
-		st[7] = rotl(t, 3)
+		st[7] = t<<3 | t>>(64-3)
 		t = bc0
 		bc0 = st[11]
-		st[11] = rotl(t, 6)
+		st[11] = t<<6 | t>>(64-6)
 		t = bc0
 		bc0 = st[17]
-		st[17] = rotl(t, 10)
+		st[17] = t<<10 | t>>(64-10)
 		t = bc0
 		bc0 = st[18]
-		st[18] = rotl(t, 15)
+		st[18] = t<<15 | t>>(64-15)
 		t = bc0
 		bc0 = st[3]
-		st[3] = rotl(t, 21)
+		st[3] = t<<21 | t>>(64-21)
 		t = bc0
 		bc0 = st[5]
-		st[5] = rotl(t, 28)
+		st[5] = t<<28 | t>>(64-28)
 		t = bc0
 		bc0 = st[16]
-		st[16] = rotl(t, 36)
+		st[16] = t<<36 | t>>(64-36)
 		t = bc0
 		bc0 = st[8]
-		st[8] = rotl(t, 45)
+		st[8] = t<<45 | t>>(64-45)
 		t = bc0
 		bc0 = st[21]
-		st[21] = rotl(t, 55)
+		st[21] = t<<55 | t>>(64-55)
 		t = bc0
 		bc0 = st[24]
-		st[24] = rotl(t, 2)
+		st[24] = t<<2 | t>>(64-2)
 		t = bc0
 		bc0 = st[4]
-		st[4] = rotl(t, 14)
+		st[4] = t<<14 | t>>(64-14)
 		t = bc0
 		bc0 = st[15]
-		st[15] = rotl(t, 27)
+		st[15] = t<<27 | t>>(64-27)
 		t = bc0
 		bc0 = st[23]
-		st[23] = rotl(t, 41)
+		st[23] = t<<41 | t>>(64-41)
 		t = bc0
 		bc0 = st[19]
-		st[19] = rotl(t, 56)
+		st[19] = t<<56 | t>>(64-56)
 		t = bc0
 		bc0 = st[13]
-		st[13] = rotl(t, 8)
+		st[13] = t<<8 | t>>(64-8)
 		t = bc0
 		bc0 = st[12]
-		st[12] = rotl(t, 25)
+		st[12] = t<<25 | t>>(64-25)
 		t = bc0
 		bc0 = st[2]
-		st[2] = rotl(t, 43)
+		st[2] = t<<43 | t>>(64-43)
 		t = bc0
 		bc0 = st[20]
-		st[20] = rotl(t, 62)
+		st[20] = t<<62 | t>>(64-62)
 		t = bc0
 		bc0 = st[14]
-		st[14] = rotl(t, 18)
+		st[14] = t<<18 | t>>(64-18)
 		t = bc0
 		bc0 = st[22]
-		st[22] = rotl(t, 39)
+		st[22] = t<<39 | t>>(64-39)
 		t = bc0
 		bc0 = st[9]
-		st[9] = rotl(t, 61)
+		st[9] = t<<61 | t>>(64-61)
 		t = bc0
 		bc0 = st[6]
-		st[6] = rotl(t, 20)
+		st[6] = t<<20 | t>>(64-20)
 		t = bc0
 		bc0 = st[1]
-		st[1] = rotl(t, 44)
+		st[1] = t<<44 | t>>(64-44)
 		t = bc0
 
 		//  Chi

File gen.go

View file
  • Ignore whitespace
 	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
 }
 
-func rotl(x uint64, r uint) uint64 {
-	return (x << r) | (x >> (64 - r))
-}
-
 func block(st *[25]uint64) {
 	var (
 		t, bc0,bc1,bc2,bc3,bc4 uint64
 			i, i, i+5, i+10, i+15, i+20)
 	}
 	for i := 0; i < 5; i++ {
-		fmt.Fprintf(w, "t = bc%d ^ rotl(bc%d, 1)\n", (i+4)%5, (i+1)%5)
+		fmt.Fprintf(w, "t = bc%d ^ (bc%d << 1 | bc%d >> 63)\n", (i+4)%5, (i+1)%5, (i+1)%5)
 		for j := 0; j < 25; j += 5 {
 			fmt.Fprintf(w, "st[%d] ^= t\n", i+j)
 		}
 	for i := 0; i < 24; i++ {
 		fmt.Fprintf(w, `
 		bc0 = st[%d]
-		st[%d] = rotl(t, %d)
-		t = bc0`, piln[i], piln[i], rotc[i])
+		st[%d] = t << %d | t >> (64-%d)
+		t = bc0`, piln[i], piln[i], rotc[i], rotc[i])
 	}
 }