Slap a bunch of `#pragma omp parallel for` directives on the code; no guarantees whatsoever.

       const rci_t blockend = MIN(giantstep+blocksize, a_nr);
+#pragma omp parallel for schedule(static,512) private(x,t)
       for(rci_t j = giantstep; j < blockend; j++) {
         const word a = mzd_read_bits(A, j, kk*i, kk);
         x[ 0] = L[ 0][ (a >> 0*k) & bm ];


+#pragma omp parallel for schedule(static,512)
   for(rci_t r = startrow; r < stoprow; ++r) {
     word *m0 = M->rows[r] + block0;
     rci_t const x0 = E0[_mzd_read_bits_int_raw(m0, spot0,           0, spill0, k0)];
