Commits

Eric Roshan Eisner committed a4f4413 Draft

completely unroll the round of block.go

benchmark old MB/s new MB/s speedup
BenchmarkHash1K 12.24 53.87 4.40x
BenchmarkHash8K 12.26 53.99 4.40x

Comments (0)

Files changed (2)

 
 package sha3
 
-// Constants
-
 var rndc = [24]uint64{
 	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
 	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
 	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
 }
 
-var rotc = [24]uint{
-	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
-	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
-}
-
-var piln = [24]int{
-	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
-	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
-}
-
 func rotl(x uint64, r uint) uint64 {
 	return (x << r) | (x >> (64 - r))
 }
 
 func block(st *[25]uint64) {
 	var (
-		t  uint64
-		bc [5]uint64
+		t, bc0, bc1, bc2, bc3, bc4 uint64
 	)
-
 	for round := 0; round < 24; round++ {
 		// Theta
-		for i := 0; i < 5; i++ {
-			bc[i] = st[i] ^ st[i+5] ^ st[i+10] ^ st[i+15] ^ st[i+20]
-		}
-
-		for i := 0; i < 5; i++ {
-			t = bc[(i+4)%5] ^ rotl(bc[(i+1)%5], 1)
-			for j := 0; j < 25; j += 5 {
-				st[j+i] ^= t
-			}
-		}
+		bc0 = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]
+		bc1 = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]
+		bc2 = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]
+		bc3 = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]
+		bc4 = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]
+		t = bc4 ^ rotl(bc1, 1)
+		st[0] ^= t
+		st[5] ^= t
+		st[10] ^= t
+		st[15] ^= t
+		st[20] ^= t
+		t = bc0 ^ rotl(bc2, 1)
+		st[1] ^= t
+		st[6] ^= t
+		st[11] ^= t
+		st[16] ^= t
+		st[21] ^= t
+		t = bc1 ^ rotl(bc3, 1)
+		st[2] ^= t
+		st[7] ^= t
+		st[12] ^= t
+		st[17] ^= t
+		st[22] ^= t
+		t = bc2 ^ rotl(bc4, 1)
+		st[3] ^= t
+		st[8] ^= t
+		st[13] ^= t
+		st[18] ^= t
+		st[23] ^= t
+		t = bc3 ^ rotl(bc0, 1)
+		st[4] ^= t
+		st[9] ^= t
+		st[14] ^= t
+		st[19] ^= t
+		st[24] ^= t
 
 		// Rho Pi
 		t = st[1]
-		for i := 0; i < 24; i++ {
-			j := piln[i]
-			bc[0] = st[j]
-			st[j] = rotl(t, rotc[i])
-			t = bc[0]
-		}
+
+		bc0 = st[10]
+		st[10] = rotl(t, 1)
+		t = bc0
+		bc0 = st[7]
+		st[7] = rotl(t, 3)
+		t = bc0
+		bc0 = st[11]
+		st[11] = rotl(t, 6)
+		t = bc0
+		bc0 = st[17]
+		st[17] = rotl(t, 10)
+		t = bc0
+		bc0 = st[18]
+		st[18] = rotl(t, 15)
+		t = bc0
+		bc0 = st[3]
+		st[3] = rotl(t, 21)
+		t = bc0
+		bc0 = st[5]
+		st[5] = rotl(t, 28)
+		t = bc0
+		bc0 = st[16]
+		st[16] = rotl(t, 36)
+		t = bc0
+		bc0 = st[8]
+		st[8] = rotl(t, 45)
+		t = bc0
+		bc0 = st[21]
+		st[21] = rotl(t, 55)
+		t = bc0
+		bc0 = st[24]
+		st[24] = rotl(t, 2)
+		t = bc0
+		bc0 = st[4]
+		st[4] = rotl(t, 14)
+		t = bc0
+		bc0 = st[15]
+		st[15] = rotl(t, 27)
+		t = bc0
+		bc0 = st[23]
+		st[23] = rotl(t, 41)
+		t = bc0
+		bc0 = st[19]
+		st[19] = rotl(t, 56)
+		t = bc0
+		bc0 = st[13]
+		st[13] = rotl(t, 8)
+		t = bc0
+		bc0 = st[12]
+		st[12] = rotl(t, 25)
+		t = bc0
+		bc0 = st[2]
+		st[2] = rotl(t, 43)
+		t = bc0
+		bc0 = st[20]
+		st[20] = rotl(t, 62)
+		t = bc0
+		bc0 = st[14]
+		st[14] = rotl(t, 18)
+		t = bc0
+		bc0 = st[22]
+		st[22] = rotl(t, 39)
+		t = bc0
+		bc0 = st[9]
+		st[9] = rotl(t, 61)
+		t = bc0
+		bc0 = st[6]
+		st[6] = rotl(t, 20)
+		t = bc0
+		bc0 = st[1]
+		st[1] = rotl(t, 44)
+		t = bc0
 
 		//  Chi
-		for j := 0; j < 25; j += 5 {
-			for i := 0; i < 5; i++ {
-				bc[i] = st[j+i]
-			}
-			for i := 0; i < 5; i++ {
-				st[j+i] ^= bc[(i+2)%5] &^ bc[(i+1)%5]
-			}
-		}
+		bc0 = st[0]
+		bc1 = st[1]
+		bc2 = st[2]
+		bc3 = st[3]
+		bc4 = st[4]
+		st[0] ^= bc2 &^ bc1
+		st[1] ^= bc3 &^ bc2
+		st[2] ^= bc4 &^ bc3
+		st[3] ^= bc0 &^ bc4
+		st[4] ^= bc1 &^ bc0
+		bc0 = st[5]
+		bc1 = st[6]
+		bc2 = st[7]
+		bc3 = st[8]
+		bc4 = st[9]
+		st[5] ^= bc2 &^ bc1
+		st[6] ^= bc3 &^ bc2
+		st[7] ^= bc4 &^ bc3
+		st[8] ^= bc0 &^ bc4
+		st[9] ^= bc1 &^ bc0
+		bc0 = st[10]
+		bc1 = st[11]
+		bc2 = st[12]
+		bc3 = st[13]
+		bc4 = st[14]
+		st[10] ^= bc2 &^ bc1
+		st[11] ^= bc3 &^ bc2
+		st[12] ^= bc4 &^ bc3
+		st[13] ^= bc0 &^ bc4
+		st[14] ^= bc1 &^ bc0
+		bc0 = st[15]
+		bc1 = st[16]
+		bc2 = st[17]
+		bc3 = st[18]
+		bc4 = st[19]
+		st[15] ^= bc2 &^ bc1
+		st[16] ^= bc3 &^ bc2
+		st[17] ^= bc4 &^ bc3
+		st[18] ^= bc0 &^ bc4
+		st[19] ^= bc1 &^ bc0
+		bc0 = st[20]
+		bc1 = st[21]
+		bc2 = st[22]
+		bc3 = st[23]
+		bc4 = st[24]
+		st[20] ^= bc2 &^ bc1
+		st[21] ^= bc3 &^ bc2
+		st[22] ^= bc4 &^ bc3
+		st[23] ^= bc0 &^ bc4
+		st[24] ^= bc1 &^ bc0
 
 		//  Iota
 		st[0] ^= rndc[round]
+// Copyright 2012 Eric Roshan-Eisner. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Script to unroll the rounds of the core computation of Keccak.
+// Generate like so: go run gen.go | gofmt > bench.go
+
+// +build ignore
+
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+)
+
+var rotc = [24]uint{
+	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
+	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
+}
+
+var piln = [24]int{
+	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
+	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
+}
+
+func main() {
+	unrollblock(os.Stdout)
+}
+
+func unrollblock(w io.Writer) {
+	fmt.Fprintln(w, `// Copyright 2012 Eric Roshan-Eisner. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+var rndc = [24]uint64{
+	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
+}
+
+func rotl(x uint64, r uint) uint64 {
+	return (x << r) | (x >> (64 - r))
+}
+
+func block(st *[25]uint64) {
+	var (
+		t, bc0,bc1,bc2,bc3,bc4 uint64
+	)
+	for round := 0; round < 24; round++ {`)
+	theta(w)
+	rhopi(w)
+	chi(w)
+	fmt.Fprintln(w, `
+		//  Iota
+		st[0] ^= rndc[round]
+	}
+}`)
+}
+
+func theta(w io.Writer) {
+	fmt.Fprintln(w, "// Theta")
+	for i := 0; i < 5; i++ {
+		fmt.Fprintf(w, "bc%d = st[%d]^st[%d]^st[%d]^st[%d]^st[%d]\n",
+			i, i, i+5, i+10, i+15, i+20)
+	}
+	for i := 0; i < 5; i++ {
+		fmt.Fprintf(w, "t = bc%d ^ rotl(bc%d, 1)\n", (i+4)%5, (i+1)%5)
+		for j := 0; j < 25; j += 5 {
+			fmt.Fprintf(w, "st[%d] ^= t\n", i+j)
+		}
+	}
+}
+
+func rhopi(w io.Writer) {
+	fmt.Fprintln(w, `
+	// Rho Pi
+	t = st[1]`)
+	for i := 0; i < 24; i++ {
+		fmt.Fprintf(w, `
+		bc0 = st[%d]
+		st[%d] = rotl(t, %d)
+		t = bc0`, piln[i], piln[i], rotc[i])
+	}
+}
+
+func chi(w io.Writer) {
+	fmt.Fprintln(w, "\n\n//  Chi")
+	for j := 0; j < 25; j += 5 {
+		for i := 0; i < 5; i++ {
+			fmt.Fprintf(w, "bc%d = st[%d]\n", i, i+j)
+		}
+		for i := 0; i < 5; i++ {
+			fmt.Fprintf(w, "st[%d] ^= bc%d &^ bc%d\n", i+j, (i+2)%5, (i+1)%5)
+		}
+	}
+}