Commits

CarloWood  committed 3249faf

Add new elements to mzd_t and keep them consistent.

After this patch it is guaranteed that excess bits and padding
are zero (I think they already were, but now I checked it, see
next commit).

rowstride is the offset between two rows within a block.
blockrows_log and blockrows_mask are respectively the log2
of the number of rows in blocks before the last block,
and the number of rows minus one. Note that number of rows
is exactly a power of 2: 1 << blockrows_log, so that given
some row, the first word in that row is given by:
blocks[row >> blockrows_log] + (row & blockrows_mask) * rowstride.

high_bitmask and low_bitmask are precomputed bitmasks, masking
valid bits in the case of excess bits and/or offset bits respectively.
If width == 1, then both are equal and mask all valid bits.

Matrices with a width less than mzd_paddingwidth (currently 3),
are no longer 128-bit aligned. This makes small matrices (width = 1),
twice as compact while they already weren't processed with SSE2
instruction anyway (in fact, it now might be possible to start
to use SSE2 to do two rows at once).

Finally, the flags element contains (currently) five bits that
should allow to speed up certain algorithms. The basic idea is
that if flags is zero then we can use the fastest possible algorithm.
The flags contain information about offset, excess bits, it
being a windowed matrix, and the existance of more than one block.

  • Participants
  • Parent commits 843602f

Comments (0)

Files changed (4)

File src/packedmatrix.c

 
 #define SAFECHAR (m4ri_radix + m4ri_radix / 4 + 1)
 
-/*
- * Return r such that x elements fit into r blocks of length y.
- */
-#define DIV_CEIL(x,y) (((x) % (y)) ? (x) / (y) + 1 : (x) / (y))
-
 mzd_t *mzd_init(rci_t r, rci_t c) {
 
   mzd_t *A = (mzd_t *)m4ri_mmc_malloc(sizeof(mzd_t));
 
-  A->width = DIV_CEIL(c, m4ri_radix);
-
-#ifdef HAVE_SSE2
-  int incw = 0;
-  /* make sure each row is 16-byte aligned */
-  if ((A->width & 1)) {
-    A->width++;
-    incw = 1;
+  A->nrows = r;
+  A->ncols = c;
+  A->width = (c + m4ri_radix - 1) / m4ri_radix;
+  A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1;
+  if (A->width == 1) {
+    A->high_bitmask = A->low_bitmask = __M4RI_MIDDLE_BITMASK(c, 0);
+  } else {
+    A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix);
+    A->low_bitmask = m4ri_ffff;
   }
-#endif
-
-  A->ncols = c;
-  A->nrows = r;
+  A->flags = (A->high_bitmask != m4ri_ffff) ? mzd_flag_nonzero_excess : 0;
   A->offset = 0;
 
   A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*)); // We're overcomitting here.
 
-  if(r && c) {
-    /* we allow more than one malloc call so he have to be a bit clever
-       here */
-    
-    int const bytes_per_row = A->width * sizeof(word);
-    int const max_rows_per_block = __M4RI_MAX_MZD_BLOCKSIZE / bytes_per_row;
-    assert(max_rows_per_block);
-    int rest = r % max_rows_per_block;
-    
-    int const nblocks = (rest == 0) ? r / max_rows_per_block : r / max_rows_per_block + 1;
-    A->blocks = (mmb_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mmb_t));
-    for(int i = 0; i < nblocks - 1; ++i) {
-      A->blocks[i].size = __M4RI_MAX_MZD_BLOCKSIZE;
-      A->blocks[i].data = m4ri_mmc_calloc(1, __M4RI_MAX_MZD_BLOCKSIZE);
-      for(rci_t j = 0; j < max_rows_per_block; ++j)
-      {
-	int offset = A->width * j;				// Offset to start of row j within block.
-	word *block_start = (word*)A->blocks[i].data;		// Start of block i.
-	rci_t r = j + i * max_rows_per_block;
-        A->rows[r] = block_start + offset;			// FIXME: rowstride candidate
-      }
+  if (r && c) {
+    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
+    A->blockrows_log = 0;
+    while(blockrows >>= 1)
+      A->blockrows_log++;
+    blockrows = 1 << A->blockrows_log;
+    A->blockrows_mask = blockrows - 1;
+    int const nblocks = (r + blockrows - 1) / blockrows;
+    A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0;
+    A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t));
+
+    size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride;
+    for(int i = nblocks - 1; i >= 0; --i) {
+      A->blocks[i].size = block_words * sizeof(word);
+      A->blocks[i].begin = m4ri_mmc_calloc(1, A->blocks[i].size);
+      A->blocks[i].end = A->blocks[i].begin + block_words;
+      block_words = blockrows * A->rowstride;
     }
-    if(rest == 0)
-      rest = max_rows_per_block;
 
-    A->blocks[nblocks-1].size = rest * bytes_per_row;
-    A->blocks[nblocks-1].data = m4ri_mmc_calloc(rest, bytes_per_row);
-    for(rci_t j = 0; j < rest; ++j) {
-      A->rows[j + max_rows_per_block * (nblocks - 1)] = (word*)(A->blocks[nblocks - 1].data) + A->width * j;
-    }
+    for(rci_t i = 0; i < A->nrows; ++i) {
+      A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & A->blockrows_mask) * A->rowstride;
 #ifdef M4RI_WRAPWORD
-    for (rci_t i = 0; i < A->nrows; ++i)
       word::init_array(A->rows[i], A->width);
 #endif
+    }
+
   } else {
     A->blocks = NULL;
   }
 
-#ifdef HAVE_SSE2
-  if (incw) {
-    A->width--;
-  }
-#endif
-
   return A;
 }
 
   nrows = MIN(highr - lowr, m->nrows - lowr);
   ncols = highc - lowc;
   
+  window->nrows = nrows;
   window->ncols = ncols;
-  window->nrows = nrows;
-
+  window->rowstride = m->rowstride;
   window->offset = (lowc + m->offset) % m4ri_radix;
-  wi_t const offset = (lowc + m->offset) / m4ri_radix;
-  
-  window->width = (ncols + window->offset) / m4ri_radix;
-  if ((ncols + window->offset) % m4ri_radix)
-    window->width++;
+  window->width = (ncols + window->offset + m4ri_radix - 1) / m4ri_radix;
+  if (window->width == 1) {
+    window->high_bitmask = window->low_bitmask = __M4RI_MIDDLE_BITMASK(ncols, window->offset);
+  } else {
+    window->high_bitmask = __M4RI_LEFT_BITMASK((ncols + window->offset) % m4ri_radix);
+    window->low_bitmask = __M4RI_RIGHT_BITMASK(m4ri_radix - window->offset);
+  }
+  window->flags = m->flags & mzd_flag_multiple_blocks;
+  window->flags |= (window->offset == 0) ? mzd_flag_windowed_zerooffset : mzd_flag_nonzero_offset;
+  window->flags |= ((ncols + window->offset) % m4ri_radix == 0) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
+  window->blockrows_log = m->blockrows_log;
+  window->blockrows_mask = m->blockrows_mask;
   window->blocks = NULL;
 
   if(nrows)
     window->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*));
   else
     window->rows = NULL;
-
+  wi_t const offset = (lowc + m->offset) / m4ri_radix;
   for(rci_t i = 0; i < nrows; ++i) {
     window->rows[i] = m->rows[lowr + i] + offset;
   }
   if(A->blocks) {
     int i;
     for(i = 0; A->blocks[i].size; ++i) {
-      m4ri_mmc_free(A->blocks[i].data, A->blocks[i].size);
+      m4ri_mmc_free(A->blocks[i].begin, A->blocks[i].size);
     }
-    m4ri_mmc_free(A->blocks, (i + 1) * sizeof(mmb_t));
+    m4ri_mmc_free(A->blocks, (i + 1) * sizeof(mzd_block_t));
   }
   m4ri_mmc_free(A, sizeof(mzd_t));
 }
       N = mzd_init(P->nrows, P->ncols+ P->offset);
       N->ncols -= P->offset;
       N->offset = P->offset;
-      N->width=P->width;
+      N->width = P->width;
+      N->flags |= mzd_flag_nonzero_offset;
+      N->low_bitmask &= m4ri_ffff << N->offset;
+      if (N->width == 1)
+	N->high_bitmask = N->low_bitmask;
+      N->flags |= ((N->high_bitmask & ((word)1 << (m4ri_radix - 1)))) ? mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess;
     } else {
       if (N->nrows < P->nrows || N->ncols < P->ncols)
 	m4ri_die("mzd_copy: Target matrix is too small.");

File src/packedmatrix.h

 
 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048)
 
+typedef struct {
+  size_t size;
+  word* begin;
+  word* end;
+} mzd_block_t;
+
 /**
  * \brief Dense matrices over GF(2). 
  * 
 
 typedef struct mzd_t {
   /**
-   * Contains pointers to the actual blocks of memory containing the
-   * values packed into words of size m4ri_radix.
-   */
-
-  struct _mm_block *blocks;
-
-  /**
    * Number of rows.
    */
 
   rci_t ncols;
 
   /**
-   * width = ceil(ncols/m4ri_radix)
+   * Number of words with valid bits.
+   *
+   * width = ceil((ncols + offset) / m4ri_radix)
    */
 
   wi_t width; 
 
   /**
+   * Offset in words between rows.
+   *
+   * rowstride = (width < mzd_paddingwidth || (width & 1) == 0) ? width : width + 1;
+   * where width is the width of the underlaying non-windowed matrix.
+   */
+
+  wi_t rowstride;
+
+  /**
+   * Booleans to speed up things.
+   *
+   * The bits have the following meaning:
+   *
+   * 0: Has non-zero offset (and thus is windowed).
+   * 1: Has non-zero excess.
+   * 2: Is windowed, but has zero offset.
+   * 3: Is windowed, but has zero excess.
+   * 4: Spans more than 1 block.
+   */
+
+  int flags;
+
+  /**
    * column offset of the first column.
    */
 
   int offset;
-  
+
+  /**
+   * blockrows_log = log2(blockrows);
+   * where blockrows is the number of rows in one block, which is a power of 2.
+   */
+
+  int blockrows_log;
+
+  /**
+   * blockrows_mask = blockrows - 1;
+   * where blockrows is the number of rows in one block, which is a power of 2.
+   */
+
+  int blockrows_mask;
+
+  /**
+   * Mask for valid bits in the word with the highest index (width - 1).
+   */
+
+  word high_bitmask;
+
+  /**
+   * Mask for valid bits in the word with the lowest index (0).
+   */
+
+  word low_bitmask;
+
+  /**
+   * Contains pointers to the actual blocks of memory containing the
+   * values packed into words of size m4ri_radix.
+   */
+
+  mzd_block_t *blocks;
+
   /**
    * Address of first word in each row, so the first word of row i is
    * is m->rows[i]
 } mzd_t;
 
 /**
+ * \brief The minimum width where padding occurs.
+ */
+static wi_t const mzd_paddingwidth = 3;
+
+static int const mzd_flag_nonzero_offset = 0x1;
+static int const mzd_flag_nonzero_excess = 0x2;
+static int const mzd_flag_windowed_zerooffset = 0x4;
+static int const mzd_flag_windowed_zeroexcess = 0x8;
+static int const mzd_flag_multiple_blocks = 0x10;
+
+/**
+ * \brief Test if a matrix is windowed.
+ *
+ * \return a non-zero value if the matrix is windowed, otherwise return zero.
+ */
+static inline int mzd_is_windowed(mzd_t const *M) {
+  return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
+}
+
+/**
  * \brief Create a new matrix of dimension r x c.
  *
  * Use mzd_free to kill it.

File src/pls_mmpf.c

   /* we're essentially constructing a submatrix but cheaply */
   wi_t const width = A->width;
   rci_t const ncols = A->ncols;
+  int const flags = A->flags;
+  word low_bitmask = A->low_bitmask;
+  word high_bitmask = A->high_bitmask;
 
   if (A->width > splitblock) {
     A->width = splitblock;
     A->ncols = splitblock * m4ri_radix;
+    assert(A->offset == 0);
+    A->flags &= mzd_flag_multiple_blocks;
+    A->flags |= (mzd_flag_windowed_zerooffset | mzd_flag_windowed_zeroexcess);
+    A->high_bitmask = A->low_bitmask = m4ri_ffff;
   }
 
   int curr_pos;
   /* reset to original size */
   A->ncols = ncols;
   A->width = width;
+  A->flags = flags;
+  A->low_bitmask = low_bitmask;
+  A->high_bitmask = high_bitmask;
 
   __M4RI_DD_MZD(A);
   __M4RI_DD_MZP(P);
   wi_t const wide = T->width - blockoffset;
   wi_t const count = (wide + 7) / 8;
   int const entry_point = wide % 8;
+  wi_t const next_row_offset = blockoffset + T->rowstride - T->width;
 
   word *ti, *ti1, *m;
 
   ti1 = T->rows[0] + blockoffset;
-  ti = ti1 + T->width;
-#ifdef HAVE_SSE2
-  unsigned long incw = 0;
-  if (T->width & 1) incw = 1;
-  ti += incw;
-#endif
+  ti = ti1 + T->rowstride;
 
   Le[0] = 0;
   Lm[0] = 0;
     case 1:      *(ti++) = *(m++) ^ *(ti1++);
       } while (--n > 0);
     }
-#ifdef HAVE_SSE2
-    ti += incw; ti1 += incw;
-#endif
-    ti += blockoffset;
-    ti1 += blockoffset;
+    ti += next_row_offset;
+    ti1 += next_row_offset;
 
     /* U is a basis but not the canonical basis, so we need to read what
        element we just created from T */

File src/strassen.c

   mzd_t *tmp = mzd_init (B->nrows, (rci_t)m4ri_radix);
   int const offset = C->offset;
   rci_t const cncols = C->ncols;
+  int const flags = C->flags;
+  word const bitmask = C->low_bitmask;
   C->offset = 0;
   C->ncols = m4ri_radix;
+  C->flags &= mzd_flag_multiple_blocks;
+  C->flags |= (mzd_flag_windowed_zerooffset | mzd_flag_windowed_zeroexcess);
+  assert(C->width == 1);
+  C->low_bitmask = C->high_bitmask = m4ri_ffff;
   word const mask = __M4RI_MIDDLE_BITMASK(B->ncols, B->offset);
   for (rci_t i = 0; i < B->nrows; ++i)
     tmp->rows[i][0] = B->rows[i][0] & mask;
   _mzd_addmul_even (C, A, tmp, cutoff);
   C->offset = offset;
   C->ncols = cncols;
+  C->flags = flags;
+  C->low_bitmask = C->high_bitmask = bitmask;
   mzd_free (tmp);
   return C;
 }