1. Takafumi Arakaki
  2. benchmark_rnn

Commits

Takafumi Arakaki  committed a60010c
  • Participants
  • Branches default

Comments (0)

Files changed (9)

File Makefile

View file
+# Every benchmark binary is named <source>-<compiler>-<optimisation level>.
+# The nesting order below yields, for each source in turn:
+#   <source>-gcc-O2 <source>-gcc-O3 <source>-icc-O2 <source>-icc-O3
+ALL = $(foreach src,rnn_ca1d rnn_ca2d rnn_opt0 rnn_opt1 rnn_opt2 rnn_opt3 rnn_opt4,\
+	$(foreach cc,gcc icc,$(foreach opt,O2 O3,$(src)-$(cc)-$(opt))))
+
+# Disabled alternative: a reduced, gcc-only target list.
+#ALL = rnn_ca1d-gcc-O2 rnn_ca1d-gcc-O3 rnn_ca2d-gcc-O2 rnn_ca2d-gcc-O3
+
+# None of these names is a file; always run their recipes.
+.PHONY: clean all runtest
+
+# Build every binary listed in ALL.
+all: ${ALL}
+
+# Remove the built binaries.  -f keeps `make clean` from failing when some
+# (or all) of them have not been built yet.
+clean:
+	rm -f ${ALL}
+
+# Build rules.  A binary named <source>-<compiler>-<level> is compiled from
+# <source>.c; the four pattern rules below cover every such binary.
+#
+# -lm is given AFTER the source file.  The linker processes its inputs in
+# command-line order, so with linkers that default to --as-needed a library
+# named before the code that uses it is dropped and tanh() stays undefined.
+# The icc rules never passed -lm and still do not.
+
+# Extra per-source flags; empty unless set for a target below.
+GCC_STD =
+ICC_RESTRICT =
+
+# rnn_opt3.c and rnn_opt4.c use the `restrict` qualifier.
+rnn_opt3-gcc-O2 rnn_opt3-gcc-O3 rnn_opt4-gcc-O2 rnn_opt4-gcc-O3: GCC_STD = -std=c99
+rnn_opt3-icc-O2 rnn_opt3-icc-O3 rnn_opt4-icc-O2 rnn_opt4-icc-O3: ICC_RESTRICT = -restrict
+
+%-gcc-O2: %.c
+	gcc -O2 ${GCC_STD} $< -o $@ -lm
+
+# The -O3 build additionally asks gcc to report on vectorisation.
+%-gcc-O3: %.c
+	gcc -O3 ${GCC_STD} $< -o $@ -ftree-vectorizer-verbose=1 -lm
+
+%-icc-O2: %.c
+	icc -vec-report1 -O2 ${ICC_RESTRICT} $< -o $@
+
+%-icc-O3: %.c
+	icc -vec-report1 -O3 ${ICC_RESTRICT} $< -o $@
+
+# Arguments handed to every benchmark binary, in this order.
+num_c = 30
+num_s = 1000
+repeat_times = 300
+#repeat_times=1
+
+# For each binary: print its command line to standard error (without a
+# trailing newline), then run it under `time`.
+runtest: ${ALL}
+	for bin in $^; do \
+		printf "%s %d %d %d " ./$$bin ${num_c} ${num_s} ${repeat_times} 1>&2; \
+		time ./$$bin ${num_c} ${num_s} ${repeat_times}; \
+	done

File Result

View file
+./rnn_ca1d-gcc-O2 30 1000 300 1.64user 0.00system 0:01.67elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_ca1d-gcc-O3 30 1000 300 1.57user 0.00system 0:01.57elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_ca1d-icc-O2 30 1000 300 1.04user 0.00system 0:01.06elapsed 97%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+284minor)pagefaults 0swaps
+./rnn_ca1d-icc-O3 30 1000 300 1.12user 0.00system 0:01.13elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+286minor)pagefaults 0swaps
+./rnn_ca2d-gcc-O2 30 1000 300 1.50user 0.00system 0:01.51elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+275minor)pagefaults 0swaps
+./rnn_ca2d-gcc-O3 30 1000 300 2.69user 0.00system 0:02.71elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+275minor)pagefaults 0swaps
+./rnn_ca2d-icc-O2 30 1000 300 1.11user 0.00system 0:01.12elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+288minor)pagefaults 0swaps
+./rnn_ca2d-icc-O3 30 1000 300 2.21user 0.00system 0:02.25elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+289minor)pagefaults 0swaps
+./rnn_opt0-gcc-O2 30 1000 300 1.65user 0.00system 0:01.68elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_opt0-gcc-O3 30 1000 300 1.65user 0.00system 0:01.67elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+272minor)pagefaults 0swaps
+./rnn_opt0-icc-O2 30 1000 300 0.84user 0.00system 0:00.84elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+285minor)pagefaults 0swaps
+./rnn_opt0-icc-O3 30 1000 300 0.95user 0.00system 0:00.98elapsed 96%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+285minor)pagefaults 0swaps
+./rnn_opt1-gcc-O2 30 1000 300 1.64user 0.00system 0:01.68elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+270minor)pagefaults 0swaps
+./rnn_opt1-gcc-O3 30 1000 300 1.32user 0.00system 0:01.33elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_opt1-icc-O2 30 1000 300 0.99user 0.00system 0:01.02elapsed 97%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+284minor)pagefaults 0swaps
+./rnn_opt1-icc-O3 30 1000 300 0.76user 0.00system 0:00.77elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+283minor)pagefaults 0swaps
+./rnn_opt2-gcc-O2 30 1000 300 1.50user 0.00system 0:01.54elapsed 96%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_opt2-gcc-O3 30 1000 300 1.60user 0.00system 0:01.63elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+272minor)pagefaults 0swaps
+./rnn_opt2-icc-O2 30 1000 300 0.92user 0.00system 0:00.92elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+285minor)pagefaults 0swaps
+./rnn_opt2-icc-O3 30 1000 300 0.93user 0.00system 0:00.97elapsed 96%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+286minor)pagefaults 0swaps
+./rnn_opt3-gcc-O2 30 1000 300 1.86user 0.00system 0:01.88elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_opt3-gcc-O3 30 1000 300 1.88user 0.00system 0:01.90elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+271minor)pagefaults 0swaps
+./rnn_opt3-icc-O2 30 1000 300 1.22user 0.00system 0:01.23elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+285minor)pagefaults 0swaps
+./rnn_opt3-icc-O3 30 1000 300 1.08user 0.00system 0:01.12elapsed 96%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+285minor)pagefaults 0swaps
+./rnn_opt4-gcc-O2 30 1000 300 1.52user 0.00system 0:01.53elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+275minor)pagefaults 0swaps
+./rnn_opt4-gcc-O3 30 1000 300 1.41user 0.00system 0:01.42elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+274minor)pagefaults 0swaps
+./rnn_opt4-icc-O2 30 1000 300 0.61user 0.00system 0:00.61elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+289minor)pagefaults 0swaps
+./rnn_opt4-icc-O3 30 1000 300 0.59user 0.00system 0:00.59elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
+0inputs+0outputs (0major+288minor)pagefaults 0swaps
+---
+./rnn_ca1d-gcc-O2 1.64user
+./rnn_ca1d-gcc-O3 1.57user
+./rnn_ca1d-icc-O2 1.04user
+./rnn_ca1d-icc-O3 1.12user
+./rnn_ca2d-gcc-O2 1.50user
+./rnn_ca2d-gcc-O3 2.69user
+./rnn_ca2d-icc-O2 1.11user
+./rnn_ca2d-icc-O3 2.21user
+./rnn_opt0-gcc-O2 1.65user
+./rnn_opt0-gcc-O3 1.65user
+./rnn_opt0-icc-O2 0.84user
+./rnn_opt0-icc-O3 0.95user
+./rnn_opt1-gcc-O2 1.64user
+./rnn_opt1-gcc-O3 1.32user
+./rnn_opt1-icc-O2 0.99user
+./rnn_opt1-icc-O3 0.76user
+./rnn_opt2-gcc-O2 1.50user
+./rnn_opt2-gcc-O3 1.60user
+./rnn_opt2-icc-O2 0.92user
+./rnn_opt2-icc-O3 0.93user
+./rnn_opt3-gcc-O2 1.86user
+./rnn_opt3-gcc-O3 1.88user
+./rnn_opt3-icc-O2 1.22user
+./rnn_opt3-icc-O3 1.08user
+./rnn_opt4-gcc-O2 1.52user
+./rnn_opt4-gcc-O3 1.41user
+./rnn_opt4-icc-O2 0.61user
+./rnn_opt4-icc-O3 0.59user

File rnn_ca1d.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt() */
+  int num_c, num_s;  /* num_c: row length used by the two-index macros below; num_s: row count looped over in RNN_fptt() */
+  double *wcc, *bc, *ec, *uc, *xc;  /* flat arrays supplied by the caller; main() sizes them nc*nc, nc, nc, ns*nc, ns*nc */
+} RNN;
+
+#define Wcc(i,j) self->wcc[ self->num_c*(i) + (j) ]  /* element (i,j), row-major with num_c columns; every macro here needs a variable named self in scope */
+#define Bc(i)    self->bc[(i)]
+#define Ec(i)    self->ec[(i)]
+#define Uc(i,j)  self->uc[ self->num_c*(i) + (j) ]  /* row i, column j */
+#define Xc(i,j)  self->xc[ self->num_c*(i) + (j) ]  /* row i, column j */
+
+void RNN_fptt(RNN *self)  /* forward pass: reads row 0 of uc, writes row 0 of xc and rows 1..num_s-1 of uc and xc */
+{
+  int s0, c0, c1;  /* s0: row of uc/xc; c0: element being written; c1: summation index */
+
+  s0 = 0;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    Xc(s0,c0) = tanh(Uc(s0,c0));  /* row 0: uc is only read */
+  }
+  for (s0 = 1; s0 < self->num_s; ++s0){  /* each row is computed from the row before it */
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      Uc(s0,c0) = (1 - Ec(c0)) * Uc(s0-1,c0) + Ec(c0) * Bc(c0);  /* (1-ec)-weighted previous value plus ec-weighted bc */
+      for (c1 = 0; c1 < self->num_c; ++c1){
+        Uc(s0,c0) += Ec(c0) * Wcc(c0,c1) * Xc(s0-1,c1);  /* accumulates straight into uc; Ec(c0) multiplies every term */
+      }
+      Xc(s0,c0) = tanh(Uc(s0,c0));
+    }
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * Rows 1 and up of uc and xc are not touched.
+ */
+void RNN_init(RNN *self)
+{
+  int unit = 0;
+
+  while (unit < self->num_c){
+    int src = 0;
+
+    Uc(0,unit) = 0;
+    Xc(0,unit) = 0;
+    Bc(unit) = 0.1;
+    Ec(unit) = 0.1;
+    while (src < self->num_c){
+      Wcc(unit,src) = 0.1;
+      ++src;
+    }
+    ++unit;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places the network arrays on the stack, initialises them once with
+ * RNN_init() and then calls RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int nc, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc*nc], bc[nc], ec[nc], uc[ns*nc], xc[ns*nc];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = wcc;
+    rnn.uc  = uc;
+    rnn.xc  = xc;
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}

File rnn_ca2d.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt() */
+  int num_c, num_s;  /* loop bounds: num_c for the second index, num_s for the first index of uc/xc */
+  double **wcc, *bc, *ec, **uc, **xc;  /* wcc, uc, xc: tables of row pointers; main() points them at the rows of 2-D arrays */
+} RNN;
+
+#define Wcc(i,j) self->wcc[i][j]  /* row i via the pointer table, then column j; every macro here needs a variable named self in scope */
+#define Bc(i)    self->bc[i]
+#define Ec(i)    self->ec[i]
+#define Uc(i,j)  self->uc[i][j]
+#define Xc(i,j)  self->xc[i][j]
+
+void RNN_fptt(RNN *self)  /* forward pass: reads row 0 of uc, writes row 0 of xc and rows 1..num_s-1 of uc and xc */
+{
+  int s0, c0, c1;  /* s0: row of uc/xc; c0: element being written; c1: summation index */
+
+  s0 = 0;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    Xc(s0,c0) = tanh(Uc(s0,c0));  /* row 0: uc is only read */
+  }
+  for (s0 = 1; s0 < self->num_s; ++s0){  /* each row is computed from the row before it */
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      Uc(s0,c0) = (1 - Ec(c0)) * Uc(s0-1,c0) + Ec(c0) * Bc(c0);  /* (1-ec)-weighted previous value plus ec-weighted bc */
+      for (c1 = 0; c1 < self->num_c; ++c1){
+        Uc(s0,c0) += Ec(c0) * Wcc(c0,c1) * Xc(s0-1,c1);  /* accumulates straight into uc; Ec(c0) multiplies every term */
+      }
+      Xc(s0,c0) = tanh(Uc(s0,c0));
+    }
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * Rows 1 and up of uc and xc are not touched.
+ */
+void RNN_init(RNN *self)
+{
+  int unit = 0;
+
+  while (unit < self->num_c){
+    int src = 0;
+
+    Uc(0,unit) = 0;
+    Xc(0,unit) = 0;
+    Bc(unit) = 0.1;
+    Ec(unit) = 0.1;
+    while (src < self->num_c){
+      Wcc(unit,src) = 0.1;
+      ++src;
+    }
+    ++unit;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places 2-D arrays on the stack, builds the row-pointer tables the RNN
+ * struct expects, initialises once with RNN_init() and then calls
+ * RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int c, nc, s, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc][nc], bc[nc], ec[nc], uc[ns][nc], xc[ns][nc];
+    double *pwcc[nc], *puc[ns], *pxc[ns];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = pwcc;
+    rnn.uc  = puc;
+    rnn.xc  = pxc;
+    /* one table entry per row of the corresponding 2-D array */
+    for (c = 0; c < nc; ++c) rnn.wcc[c] = wcc[c];
+    for (s = 0; s < ns; ++s) rnn.uc[s] = uc[s];
+    for (s = 0; s < ns; ++s) rnn.xc[s] = xc[s];
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}

File rnn_opt0.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt(), including their working cursors */
+  int num_c, num_s;  /* num_c: row length of wcc/uc/xc; num_s: row count looped over in RNN_fptt() */
+  double *bc, *ec;  /* indexed 0..num_c-1 */
+  double *wcc, *wcc_c;  /* wcc: start of the flat array; wcc_c: cursor advanced one row (num_c) at a time */
+  double *uc, *uc_c, *uc_p, *xc, *xc_c, *xc_p;  /* uc/xc: array starts; *_c: row being written; *_p: row before it (set in RNN_fptt()) */
+} RNN;
+
+void RNN_fptt(RNN *self)  /* forward pass using row cursors instead of computed 2-D indices */
+{
+  int s0, c0, c1;  /* s0: row counter; c0: element being written; c1: summation index */
+  double wccxc_bc;  /* accumulator: bc[c0] plus the wcc-row / previous-xc-row dot product */
+
+  self->uc_c = self->uc;  /* cursors start at row 0 */
+  self->xc_c = self->xc;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    self->xc_c[c0] = tanh(self->uc_c[c0]);  /* row 0: uc is only read */
+  }
+  self->uc_p = self->uc_c;  /* row 0 becomes the previous row ... */
+  self->xc_p = self->xc_c;
+  self->uc_c += self->num_c;  /* ... and the current cursors move to row 1 */
+  self->xc_c += self->num_c;
+  for (s0 = 1; s0 < self->num_s; ++s0){
+    self->wcc_c = self->wcc;  /* rewind to the first row of wcc for every step */
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      self->uc_c[c0] = (1 - self->ec[c0]) * self->uc_p[c0];
+      wccxc_bc = self->bc[c0];
+      for (c1 = 0; c1 < self->num_c; ++c1){
+	wccxc_bc += self->wcc_c[c1] * self->xc_p[c1];
+      }
+      self->uc_c[c0] += self->ec[c0] * wccxc_bc;  /* ec[c0] is applied once to the whole sum */
+      self->xc_c[c0] = tanh(self->uc_c[c0]);
+      self->wcc_c += self->num_c;  /* next row of wcc */
+    }
+    self->uc_p = self->uc_c;  /* shift both row cursors forward by one row */
+    self->xc_p = self->xc_c;
+    self->uc_c += self->num_c;
+    self->xc_c += self->num_c;
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * On return uc_c and xc_c point at the start of uc and xc, and wcc_c has
+ * been advanced by num_c once per row written.
+ */
+void RNN_init(RNN *self)
+{
+  int unit, src;
+
+  self->wcc_c = self->wcc;
+  self->xc_c = self->xc;
+  self->uc_c = self->uc;
+  for (unit = 0; unit < self->num_c; unit++){
+    self->bc[unit] = 0.1;
+    self->ec[unit] = 0.1;
+    self->xc_c[unit] = 0;
+    self->uc_c[unit] = 0;
+    for (src = 0; src < self->num_c; src++){
+      self->wcc_c[src] = 0.1;
+    }
+    /* step to the next row of wcc */
+    self->wcc_c += self->num_c;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places the network arrays on the stack, initialises them once with
+ * RNN_init() and then calls RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int nc, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc*nc], bc[nc], ec[nc], uc[ns*nc], xc[ns*nc];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = wcc;
+    rnn.uc  = uc;
+    rnn.xc  = xc;
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}

File rnn_opt1.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt(), including their working cursors */
+  int num_c, num_s;  /* num_c: row length of wcc/uc/xc; num_s: row count looped over in RNN_fptt() */
+  double *bc, *bc_c, *ec, *ec_c;  /* bc/ec: array starts; bc_c/ec_c: element cursors rewound at every step of RNN_fptt() */
+  double *wcc, *wcc_c;  /* wcc: start of the flat array; wcc_c: cursor into it */
+  double *uc, *uc_c, *uc_p, *xc, *xc_c, *xc_p0, *xc_p;  /* uc/xc: array starts; *_c: element being written; uc_p: element one row back; xc_p0: start of the previous xc row; xc_p: walker over that row */
+} RNN;
+
+void RNN_fptt(RNN *self)  /* forward pass in which every array access goes through an incrementing element cursor */
+{
+  int s0, c0, c1;  /* pure loop counters; no array is indexed with them */
+  double wccxc_bc;  /* accumulator: *bc_c plus the wcc-row / previous-xc-row dot product */
+
+  self->uc_c = self->uc;
+  self->xc_c = self->xc;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    *self->xc_c = tanh(*self->uc_c);  /* row 0: uc is only read */
+    self->xc_c++;
+    self->uc_c++;
+  }  /* uc_c and xc_c now sit on the first element of row 1 */
+  self->uc_p  = self->uc;  /* one row behind uc_c from here on */
+  self->xc_p0 = self->xc;  /* start of the previous xc row */
+  for (s0 = 1; s0 < self->num_s; ++s0){
+    self->ec_c = self->ec;  /* rewind the per-element and weight cursors for each step */
+    self->bc_c = self->bc;
+    self->wcc_c = self->wcc;
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      *self->uc_c = (1 - *self->ec_c) * *self->uc_p;
+      wccxc_bc = *self->bc_c;
+      self->xc_p = self->xc_p0;  /* restart at the beginning of the previous xc row */
+      for (c1 = 0; c1 < self->num_c; ++c1){
+	wccxc_bc += *self->wcc_c * *self->xc_p;
+	self->wcc_c++;  /* not rewound per c0, so it runs on into the next row of wcc */
+	self->xc_p++;
+      }
+      *self->uc_c += *self->ec_c * wccxc_bc;  /* the ec factor is applied once to the whole sum */
+      *self->xc_c = tanh(*self->uc_c);
+      self->ec_c++;
+      self->bc_c++;
+      self->xc_c++;
+      self->uc_c++;
+      self->uc_p++;
+    }
+    self->xc_p0 = self->xc_p;  /* xc_p stopped num_c past xc_p0, i.e. at the start of the row just written */
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * On return uc_c and xc_c point at the start of uc and xc, and wcc_c has
+ * been advanced by num_c once per row written.
+ */
+void RNN_init(RNN *self)
+{
+  int unit, src;
+
+  self->wcc_c = self->wcc;
+  self->xc_c = self->xc;
+  self->uc_c = self->uc;
+  for (unit = 0; unit < self->num_c; unit++){
+    self->bc[unit] = 0.1;
+    self->ec[unit] = 0.1;
+    self->xc_c[unit] = 0;
+    self->uc_c[unit] = 0;
+    for (src = 0; src < self->num_c; src++){
+      self->wcc_c[src] = 0.1;
+    }
+    /* step to the next row of wcc */
+    self->wcc_c += self->num_c;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places the network arrays on the stack, initialises them once with
+ * RNN_init() and then calls RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int nc, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc*nc], bc[nc], ec[nc], uc[ns*nc], xc[ns*nc];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = wcc;
+    rnn.uc  = uc;
+    rnn.xc  = xc;
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}

File rnn_opt2.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt() */
+  int num_c, num_s;  /* num_c: row length used by the two-index macros below; num_s: row count looped over in RNN_fptt() */
+  double *wcc, *bc, *ec, *uc, *xc;  /* flat arrays supplied by the caller; main() sizes them nc*nc, nc, nc, ns*nc, ns*nc */
+} RNN;
+
+#define Wcc(i,j) self->wcc[ self->num_c*(i) + (j) ]  /* element (i,j), row-major with num_c columns; every macro here needs a variable named self in scope */
+#define Bc(i)    self->bc[(i)]
+#define Ec(i)    self->ec[(i)]
+#define Uc(i,j)  self->uc[ self->num_c*(i) + (j) ]  /* row i, column j */
+#define Xc(i,j)  self->xc[ self->num_c*(i) + (j) ]  /* row i, column j */
+
+void RNN_fptt(RNN *self)  /* forward pass: reads row 0 of uc, writes row 0 of xc and rows 1..num_s-1 of uc and xc */
+{
+  int s0, c0, c1;  /* s0: row of uc/xc; c0: element being written; c1: summation index */
+  double wccxc_bc;  /* local accumulator: Bc(c0) plus the sum of Wcc(c0,c1)*Xc(s0-1,c1) */
+
+  s0 = 0;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    Xc(s0,c0) = tanh(Uc(s0,c0));  /* row 0: uc is only read */
+  }
+  for (s0 = 1; s0 < self->num_s; ++s0){  /* each row is computed from the row before it */
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      Uc(s0,c0) = (1 - Ec(c0)) * Uc(s0-1,c0);
+      wccxc_bc = Bc(c0);
+      for (c1 = 0; c1 < self->num_c; ++c1){
+        wccxc_bc += Wcc(c0,c1) * Xc(s0-1,c1);  /* sum is kept in the local, not in uc */
+      }
+      Uc(s0,c0) += Ec(c0) * wccxc_bc;  /* Ec(c0) is applied once to the whole sum */
+      Xc(s0,c0) = tanh(Uc(s0,c0));
+    }
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * Rows 1 and up of uc and xc are not touched.
+ */
+void RNN_init(RNN *self)
+{
+  int unit = 0;
+
+  while (unit < self->num_c){
+    int src = 0;
+
+    Uc(0,unit) = 0;
+    Xc(0,unit) = 0;
+    Bc(unit) = 0.1;
+    Ec(unit) = 0.1;
+    while (src < self->num_c){
+      Wcc(unit,src) = 0.1;
+      ++src;
+    }
+    ++unit;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places the network arrays on the stack, initialises them once with
+ * RNN_init() and then calls RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int nc, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc*nc], bc[nc], ec[nc], uc[ns*nc], xc[ns*nc];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = wcc;
+    rnn.uc  = uc;
+    rnn.xc  = xc;
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}

File rnn_opt3.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt() */
+  int num_c, num_s;  /* num_c: row length used by the two-index macros below; num_s: row count looped over in RNN_fptt() */
+  double *wcc, *bc, *ec, *uc, *xc;  /* flat arrays supplied by the caller; main() sizes them nc*nc, nc, nc, ns*nc, ns*nc */
+} RNN;
+
+#define Wcc(i,j) self->wcc[ self->num_c*(i) + (j) ]  /* element (i,j), row-major with num_c columns; &Wcc(i,0) is the start of row i; every macro here needs a variable named self in scope */
+#define Bc(i)    self->bc[(i)]
+#define Ec(i)    self->ec[(i)]
+#define Uc(i,j)  self->uc[ self->num_c*(i) + (j) ]  /* row i, column j */
+#define Xc(i,j)  self->xc[ self->num_c*(i) + (j) ]  /* row i, column j */
+
+static inline double
+ddot(double * restrict array1, double * restrict array2, int num)  /* returns the sum of array1[i]*array2[i] for i in [0,num); 0 when num <= 0 */
+{
+  int i;
+  double val=0;
+  for (i = 0; i < num; ++i) val += array1[i] * array2[i];  /* both arrays are only read; terms are added in index order */
+  return val;
+}
+
+void RNN_fptt(RNN *self)  /* forward pass: reads row 0 of uc, writes row 0 of xc and rows 1..num_s-1 of uc and xc */
+{
+  int s0, c0;  /* s0: row of uc/xc; c0: element being written */
+  
+  s0 = 0;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    Xc(s0,c0) = tanh(Uc(s0,c0));  /* row 0: uc is only read */
+  }
+  for (s0 = 1; s0 < self->num_s; ++s0){  /* each row is computed from the row before it */
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      Uc(s0,c0) = (1 - Ec(c0)) * Uc(s0-1,c0)
+	+ Ec(c0) * ( Bc(c0) + ddot(&Wcc(c0,0), &Xc(s0-1,0), self->num_c) );  /* ddot runs over row c0 of wcc and the previous row of xc */
+      Xc(s0,c0) = tanh(Uc(s0,c0));
+    }
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * Rows 1 and up of uc and xc are not touched.
+ */
+void RNN_init(RNN *self)
+{
+  int unit = 0;
+
+  while (unit < self->num_c){
+    int src = 0;
+
+    Uc(0,unit) = 0;
+    Xc(0,unit) = 0;
+    Bc(unit) = 0.1;
+    Ec(unit) = 0.1;
+    while (src < self->num_c){
+      Wcc(unit,src) = 0.1;
+      ++src;
+    }
+    ++unit;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places the network arrays on the stack, initialises them once with
+ * RNN_init() and then calls RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int nc, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc*nc], bc[nc], ec[nc], uc[ns*nc], xc[ns*nc];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = wcc;
+    rnn.uc  = uc;
+    rnn.xc  = xc;
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}

File rnn_opt4.c

View file
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct __RNN__{  /* all state used by RNN_init() and RNN_fptt() */
+  int num_c, num_s;  /* loop bounds: num_c for the second index, num_s for the first index of uc/xc */
+  double **wcc, *bc, *ec, **uc, **xc;  /* wcc, uc, xc: tables of row pointers; main() points them at the rows of 2-D arrays */
+} RNN;
+
+#define Wcc(i,j) self->wcc[i][j]  /* row i via the pointer table, then column j; every macro here needs a variable named self in scope */
+#define Bc(i)    self->bc[i]
+#define Ec(i)    self->ec[i]
+#define Uc(i,j)  self->uc[i][j]
+#define Xc(i,j)  self->xc[i][j]
+#define VWcc(i)  self->wcc[i]  /* pointer to the whole row i of wcc */
+#define VXc(i)   self->xc[i]  /* pointer to the whole row i of xc */
+
+static inline double
+ddot(double * restrict array1, double * restrict array2, int num)  /* returns the sum of array1[i]*array2[i] for i in [0,num); 0 when num <= 0 */
+{
+  int i;
+  double val=0;
+  for (i = 0; i < num; ++i) val += array1[i] * array2[i];  /* both arrays are only read; terms are added in index order */
+  return val;
+}
+
+void RNN_fptt(RNN *self)  /* forward pass: reads row 0 of uc, writes row 0 of xc and rows 1..num_s-1 of uc and xc */
+{
+  int s0, c0;  /* s0: row of uc/xc; c0: element being written */
+
+  s0 = 0;
+  for (c0 = 0; c0 < self->num_c; ++c0){
+    Xc(s0,c0) = tanh(Uc(s0,c0));  /* row 0: uc is only read */
+  }
+  for (s0 = 1; s0 < self->num_s; ++s0){  /* each row is computed from the row before it */
+    for (c0 = 0; c0 < self->num_c; ++c0){
+      Uc(s0,c0) = (1 - Ec(c0)) * Uc(s0-1,c0)
+	+ Ec(c0) * ( Bc(c0) + ddot(VWcc(c0), VXc(s0-1), self->num_c) );  /* ddot runs over row c0 of wcc and the previous row of xc, passed as row pointers */
+      Xc(s0,c0) = tanh(Uc(s0,c0));
+    }
+  }
+}
+
+/*
+ * Fill wcc, ec and bc with 0.1 and zero row 0 of uc and xc.
+ * Rows 1 and up of uc and xc are not touched.
+ */
+void RNN_init(RNN *self)
+{
+  int unit = 0;
+
+  while (unit < self->num_c){
+    int src = 0;
+
+    Uc(0,unit) = 0;
+    Xc(0,unit) = 0;
+    Bc(unit) = 0.1;
+    Ec(unit) = 0.1;
+    while (src < self->num_c){
+      Wcc(unit,src) = 0.1;
+      ++src;
+    }
+    ++unit;
+  }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> num_c num_s repeat_times
+ *
+ * Places 2-D arrays on the stack, builds the row-pointer tables the RNN
+ * struct expects, initialises once with RNN_init() and then calls
+ * RNN_fptt() repeat_times times.
+ *
+ * Returns EXIT_FAILURE without running anything when fewer than three
+ * arguments are given (argv[1..3] would otherwise be read past the end of
+ * the argument list) or when num_c or num_s is not positive (a
+ * variable-length array must have a size greater than zero).
+ * Returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+  int c, nc, s, ns, i, repeat_times;
+
+  if (argc < 4) return EXIT_FAILURE;
+
+  nc = atoi(argv[1]);
+  ns = atoi(argv[2]);
+  repeat_times = atoi(argv[3]);
+  if (nc <= 0 || ns <= 0) return EXIT_FAILURE;
+
+  {
+    RNN rnn;
+    /* sized at run time, so declared only after the checks above */
+    double wcc[nc][nc], bc[nc], ec[nc], uc[ns][nc], xc[ns][nc];
+    double *pwcc[nc], *puc[ns], *pxc[ns];
+
+    rnn.num_s = ns;
+    rnn.num_c = nc;
+    rnn.ec  = ec;
+    rnn.bc  = bc;
+    rnn.wcc = pwcc;
+    rnn.uc  = puc;
+    rnn.xc  = pxc;
+    /* one table entry per row of the corresponding 2-D array */
+    for (c = 0; c < nc; ++c) rnn.wcc[c] = wcc[c];
+    for (s = 0; s < ns; ++s) rnn.uc[s] = uc[s];
+    for (s = 0; s < ns; ++s) rnn.xc[s] = xc[s];
+
+    RNN_init(&rnn);
+    for (i = 0; i < repeat_times; ++i){
+      RNN_fptt(&rnn);
+    }
+  }
+
+  return 0;
+}