Peter Shinners avatar Peter Shinners committed 8901e94

rework algorithm to work in smaller chunks and use smaller data types than size_t for indices

Comments (0)

Files changed (5)

 #nrnr.a: nrnr.c
 
 test: nrnr.c test.c nrnr.h Makefile
-	$(CC) $(CFLAGS) -O3 -o test nrnr.c test.c
+	$(CC) $(CFLAGS) -g -O3 -o test nrnr.c test.c
 
 
 // start is inclusive, end is exclusive
 struct range {
-    size_t start;
-    size_t end;
+    index_t start;
+    index_t end;
 };
 
 
 
-static void SortIndices(size_t* array, int numPoints, const float* points, size_t axis, size_t dimensions);
-static struct range BisectIndices(const struct NrNrHood* hood, float minVal, float maxVal, size_t axis);
+static void SortIndices(index_t* array, index_t numPoints, const float* points, size_t axis, size_t dimensions);
+static struct range BisectIndices(const index_t* array, const float* points, index_t numPoints, size_t dimensions, float minVal, float maxVal, size_t axis);
 
 
 #include <sys/time.h>
     // All data is allocated in a single block. Allocate and then
     // aim some pointers into the space.
 
-    size_t allocSize = sizeof(struct NrNrHood) + sizeof(size_t) * numPoints * 6;
+    size_t allocSize = sizeof(struct NrNrHood) + sizeof(index_t) * numPoints * 6;
     struct NrNrHood* hood = malloc(allocSize);
     hood->numPoints = numPoints;
     hood->dimensions = dimensions;
     hood->points = points;
-    hood->indices = (size_t*)(hood + 1);
+    hood->chunks = (numPoints + NRNRCHUNK - 1) / NRNRCHUNK;
+    hood->indices = (index_t*)(hood + 1);
     hood->nextIndices = hood->indices + numPoints * 3;
+//printf("numchunks: %zu\n", hood->chunks);
 
-    // Initialize indices with list of all points
-
-    for (size_t i = 0; i < numPoints; ++i) {
-        hood->indices[i] = i;
-    }
-    for (size_t d = 1; d < dimensions; ++d) {
-        size_t* initSrc = hood-> indices + (d - 1) * numPoints;
-        size_t* initDst = hood-> indices + d * numPoints;
-        memcpy(initDst, initSrc, sizeof(size_t) * numPoints);
+    // Allocate temporary buffer for storing the inverse sorted array
+    // Find inverse mappifng to sort index (point to index)
+    // This goes into a temporary set of inverse indices
+    size_t inverseSize = NRNRCHUNK;
+    if (inverseSize > numPoints) {
+        inverseSize = numPoints;
     }
+    index_t* inverse = malloc(sizeof(index_t) * inverseSize);
+
+    // Break the data into small chunks
+    index_t* chunkIndices = hood->indices;
+    index_t* chunkNextIndices = hood->nextIndices;
+    const float* chunkPoints = points;
+    size_t indexStep = hood->dimensions * NRNRCHUNK;
+    for (size_t c = 0; c < hood->chunks; ++c,
+                chunkIndices += indexStep, chunkNextIndices += indexStep, chunkPoints += indexStep) {
+        index_t chunkSize = numPoints - (c * NRNRCHUNK);
+        if (chunkSize > NRNRCHUNK) {
+            chunkSize = NRNRCHUNK;
+        }
+//printf("  create chunk %zu  size=%zu\n", c, chunkSize);
 
-    // Sort indices for each dimension
+        // Initialize indices with list of all points
+        for (index_t i = 0; i < chunkSize; ++i) {
+            chunkIndices[i] = i;
+        }
+//printf("presorted 0:"); for (size_t z=0; z < chunkSize; ++z) printf(" %zu", chunkIndices[z]); printf("\n");
+
+        // Copy ordered point numbers from dimension 0 into all others
+        for (size_t d = 1; d < dimensions; ++d) {
+            index_t* initSrc = chunkIndices + (d - 1) * chunkSize;
+            index_t* initDst = chunkIndices + d * chunkSize;
+            memcpy(initDst, initSrc, sizeof(index_t) * chunkSize);
+//printf("presorted %zu:", d); for (size_t z=0; z < chunkSize; ++z) printf(" %zu", initDst[z]); printf("\n");
+        }
 
-    for (size_t d = 0; d < dimensions; ++d) {
-        size_t* array = hood->indices + d * numPoints;
-        SortIndices(array, numPoints, points, d, dimensions);
-    }
+        // Sort indices for each dimension
+        for (size_t d = 0; d < dimensions; ++d) {
+            index_t* array = chunkIndices + d * chunkSize;
+            SortIndices(array, chunkSize, chunkPoints, d, dimensions);
+//printf("   sorted %zu:", d); for (size_t z=0; z < chunkSize; ++z) printf("  %zu(%.1f)", array[z], chunkPoints[array[z] * dimensions + d]); printf("\n");
+        }
 
-    // Find inverse mappifng to sort index (point to index)
-    // This goes into a temporary set of inverse indices
+        // Build next arrays that map a point from one dimension to the next
+        // Allows jumping from dim0 index N to dim1, and then to dim2, ...
+        for (size_t d = 0; d < dimensions; ++d) {
 
-    size_t* inverse = malloc(sizeof(size_t) * numPoints);
+            // Build an index of the 'inverse' mapping from point number to sorted index
+            index_t* ind = chunkIndices + chunkSize * d;
 
-    // Build next arrays that map a point from one dimension to the next
-    // Allows jumping from dim0 index N to dim1, and then to dim2, ...
+            for (index_t i = 0; i < chunkSize; ++i) {
+                inverse[ind[i]] = i;
+            }
+//printf("     inverse %zu:", d); for (size_t z=0; z < chunkSize; ++z) printf(" %zu", inverse[z]); printf("\n");
 
-    for (size_t d = 0; d < dimensions; ++d) {
-        // Build an index of the 'inverse' mapping from point number to sorted index
-        size_t* ind = hood->indices + numPoints * d;
-        for (size_t i = 0; i < numPoints; ++i) {
-            inverse[ind[i]] = i;
-        }
 
-        // Fill out the next array, move 'ind' to the array coming from
-        size_t* next = hood->nextIndices + numPoints * d;
-        if (d) {
-            ind = hood->indices + numPoints * (d - 1);
-        } else {
-            ind = hood->indices + numPoints * (dimensions - 1);
-        }
-        for (size_t i = 0; i < numPoints; ++i) {
-            next[i] = inverse[ind[i]];
+            // Fill out the next array, move 'ind' to the array coming from
+            index_t* next = chunkNextIndices + chunkSize * d;
+            if (d) {
+                ind = chunkIndices + chunkSize * (d - 1);
+            } else {
+                ind = chunkIndices + chunkSize * (dimensions - 1);
+            }
+            for (index_t i = 0; i < chunkSize; ++i) {
+                next[i] = inverse[ind[i]];
+            }
+
+//printf("        next %zu:", d); for (size_t z=0; z < chunkSize; ++z) printf(" %zu", next[z]); printf("\n");
         }
+
     }
 
     free(inverse);
-
     return hood;
 }
 
 
 
 size_t NrNrSearchBlock(const struct NrNrHood* hood, size_t** results, const float* minimum, const float* maximum) {
-    // Find ranges of indices for each dimension
     struct range* ranges = alloca(sizeof(ranges) * hood->dimensions);
 
-    for (size_t d = 0; d < hood->dimensions; ++d) {
-        size_t* array = hood->indices + hood->numPoints * d;
-        ranges[d] = BisectIndices(hood, minimum[d], maximum[d], d);
-    }
+    size_t* workAxis = alloca(sizeof(size_t) * hood->dimensions);;
+    index_t** workNexts = alloca(sizeof(index_t*) * hood->dimensions);
+    workAxis[0] = hood->dimensions; // unused (will crash if used)
+    workNexts[0] = NULL; // unused (will crash if used)
 
-    // Find dimension with the least number of matches
+    size_t* common = (size_t*)malloc(sizeof(size_t) * hood->numPoints);
+    size_t* found = common;
 
-    size_t minAxis = 0;
-    size_t minSize = ranges[0].end - ranges[0].start;
-    for (size_t d = 0; d < hood->dimensions; ++d) {
-        size_t size = ranges[d].end - ranges[d].start;
-        if (size < minSize) {
-            minSize = size;
-            minAxis = d;
+    // Break the data into small chunks
+    index_t* chunkIndices = hood->indices;
+    index_t* chunkNextIndices = hood->nextIndices;
+    const float* chunkPoints = hood->points;
+    size_t chunkOffset = 0;
+    size_t indexStep = hood->dimensions * NRNRCHUNK;
+    for (size_t c = 0; c < hood->chunks; ++c,
+                chunkIndices += indexStep, chunkNextIndices += indexStep, chunkPoints += indexStep, chunkOffset += NRNRCHUNK) {
+        index_t chunkSize = hood->numPoints - (c * NRNRCHUNK);
+        if (chunkSize > NRNRCHUNK) {
+            chunkSize = NRNRCHUNK;
         }
-    }
-    if (minSize == 0) {
-        *results = NULL;
-        return 0;
-    }
+//printf("Search Chunk %zu  size=%zu\n", c, chunkSize);
 
-    // Find values common to each dimension
+        // Find ranges of indices for each dimension
 
-    size_t* common = (size_t*)malloc(sizeof(size_t) * (minSize + hood->dimensions) + sizeof(size_t*) * hood->dimensions);
-    size_t* found = common;
+        for (size_t d = 0; d < hood->dimensions; ++d) {
+            index_t* array = chunkIndices + chunkSize * d;
+            ranges[d] = BisectIndices(array, chunkPoints, chunkSize, hood->dimensions, minimum[d], maximum[d], d);
+//printf("  bisect %zu: %zu-%zu, ", d, ranges[d].start, ranges[d].end);
+//for (size_t z=ranges[d].start; z<ranges[d].end; ++z) printf("  %zu(%.1f)", array[z] + chunkOffset, chunkPoints[array[z] * hood->dimensions + d]); printf("\n");
+            if (ranges[d].start == ranges[d].end) {
+                continue;
+            }
+        }
 
-    // To eliminate as much code as possible fro the search loop, precompute several needed values
+        // Find dimension with the least number of matches
 
-    size_t* workAxis = common + minSize;
-    size_t** workNexts = (size_t**)workAxis + hood->dimensions;
-    workNexts[0] = 0; // unused
-    workAxis[0] = minAxis; // unused
-    for (size_t d = 1; d < hood->dimensions; ++d) {
-        workAxis[d] = minAxis + d;
-        if (workAxis[d] >= hood->dimensions) {
-            workAxis[d] -= hood->dimensions;
+        size_t minAxis = 0;
+        index_t minSize = ranges[0].end - ranges[0].start;
+        for (size_t d = 0; d < hood->dimensions; ++d) {
+            index_t size = ranges[d].end - ranges[d].start;
+            if (size < minSize) {
+                minSize = size;
+                minAxis = d;
+            }
         }
-        workNexts[d] = hood->nextIndices + hood->numPoints * workAxis[d];
-    }
 
-    // The main search loop. All searching time is spent here.
-    size_t* minIndices = hood->indices + hood->numPoints * minAxis;
+        // To eliminate as much code as possible fro the search loop, precompute several needed values
 
-    float start = GetTime();
-    for (size_t minStep = ranges[minAxis].start; minStep < ranges[minAxis].end; ++minStep) {
-        size_t workStep = minStep;
-        int match = 1;
         for (size_t d = 1; d < hood->dimensions; ++d) {
-            size_t pt = workNexts[d][workStep];
-            if (!(match = (pt >= ranges[workAxis[d]].start && pt <= ranges[workAxis[d]].end)))
-                break;
-            workStep = pt;
+            workAxis[d] = minAxis + d;
+            if (workAxis[d] >= hood->dimensions) {
+                workAxis[d] -= hood->dimensions;
+            }
+            workNexts[d] = chunkNextIndices + chunkSize * workAxis[d];
         }
-        if (match) {
-            *found++ = minIndices[minStep];
+
+        // The main search loop. All searching time is spent here.
+        index_t* minIndices = chunkIndices + chunkSize * minAxis;
+
+        for (index_t minStep = ranges[minAxis].start; minStep < ranges[minAxis].end; ++minStep) {
+            index_t workStep = minStep;
+            int match = 1;
+            for (size_t d = 1; d < hood->dimensions; ++d) {
+                index_t pt = workNexts[d][workStep];
+                if (!(match = (pt >= ranges[workAxis[d]].start && pt <= ranges[workAxis[d]].end)))
+                    break;
+                workStep = pt;
+            }
+            if (match) {
+                *found++ = minIndices[minStep] + chunkOffset;
+            }
         }
     }
-    printf("  SCAN: %.3f sec\n", GetTime() - start);
 
     // Handle return values
-
     size_t totalFound = found - common;
     if (!totalFound) {
         free(common);
         *results = NULL;
         return 0;
     }
-    if (totalFound < minSize) {
+    // realloc common to the smaller, found size
+    if (totalFound < hood->numPoints) {
         common = realloc(common, sizeof(size_t) * totalFound);
     }
 
-    // realloc common to the smaller, found size
     *results = common;
     return totalFound;
 }
 
 
 
-void SortIndices(size_t* array, int numPoints, const float* points, size_t axis, size_t dimensions) {
+void SortIndices(index_t* array, index_t numPoints, const float* points, size_t axis, size_t dimensions) {
     if (numPoints < 2) {
         return;
     }
 
     // Insertion sort for small ranges
     if (numPoints < 16) {
-        for (size_t i = 1; i < numPoints; ++i) {
-            size_t point = array[i];
+        for (index_t i = 1; i < numPoints; ++i) {
+            index_t point = array[i];
             float position = points[point * dimensions + axis];
-            size_t j;
+            index_t j;
             for (j = i; j > 0 && points[array[j - 1] * dimensions + axis] > position; --j) {
                 array[j] = array[j - 1];
             }
 
     // Quicksort (recursive) for most spans
     float position = points[array[numPoints / 2] * dimensions + axis];
-    size_t *left = array;
-    size_t *right = array + numPoints - 1;
+    index_t *left = array;
+    index_t *right = array + numPoints - 1;
     while (left <= right) {
         while (points[(*left) * dimensions + axis] < position)
             left++;
         while (points[(*right) * dimensions + axis] > position)
             right--;
         if (left <= right) {
-            size_t t = *left;
+            index_t t = *left;
             *left++ = *right;
             *right-- = t;
         }
 
 
 
-struct range BisectIndices(const struct NrNrHood* hood, float minVal, float maxVal, size_t axis) {
+struct range BisectIndices(const index_t* array, const float* points, index_t numPoints, size_t dimensions, float minVal, float maxVal, size_t axis) {
     struct range range;
-    size_t *fullStart, *fullEnd;
-    size_t *start;
-    size_t *end;
+    const index_t *fullStart, *fullEnd;
+    const index_t *start;
+    const index_t *end;
 
-    fullStart = start = hood->indices + hood->numPoints * axis;
-    fullEnd = end = fullStart + hood->numPoints;
+    fullStart = start = array;
+    fullEnd = end = fullStart + numPoints;
+#if 0
+    float startValue = points[array[0] * dimensions + axis];
+    if (startValue > maxVal) {
+        range.start = range.end = numPoints;
+        return range;
+    }
+#endif
 
-    float fullEndValue = hood->points[*fullEnd * hood->dimensions + axis];
+    float finalValue = points[array[numPoints - 1] * dimensions + axis];
+#if 1
+    if (finalValue < minVal) {
+        range.start = range.end = 0;
+        return range;
+    }
+#endif
 
-    // TODO, early out when maxVal < start point or minVal > end point
     while (start < end) {
-        size_t dist = end - start;
-        size_t* half = start + (dist / 2);
-        float halfValue = hood->points[*half * hood->dimensions + axis];
+        index_t dist = end - start;
+        const index_t* half = start + (dist / 2);
+        float halfValue = points[*half * dimensions + axis];
         if (minVal <= halfValue) {
             end = half;
 #if 0
-            if (fullEndValue < halfValue) {
+            if (finalValue < halfValue) {
                 fullEnd = half;
-                fullEndValue = hood->points[*fullEnd * hood->dimensions + axis];
+                finalValue = points[*fullEnd * dimensions + axis];
             }
 #endif
         } else {
     start = end;
     end = fullEnd;
     while (start < end) {
-        size_t dist = end - start;
-        size_t* half = start + (dist / 2);
-        float halfValue = hood->points[*half * hood->dimensions + axis];
+        index_t dist = end - start;
+        const index_t* half = start + (dist / 2);
+        float halfValue = points[*half * dimensions + axis];
         if (maxVal >= halfValue) {
             start = half + 1;
         } else {
 
 
 #include <stdlib.h>
+#include <inttypes.h>
 
 
+#define NRNRCHUNK 20000000
+
+
+typedef size_t index_t;
+
 struct NrNrHood {
     size_t numPoints;
     size_t dimensions;
+    size_t chunks;
     const float* points;
-    size_t* indices;
-    size_t* nextIndices;
+    index_t* indices;
+    index_t* nextIndices;
 };
 
 
     Search block 639983: 0.09 sec
     Cleanup: 0.010 sec
     2657216maxresident
+
+
+
+
+Switched the code to work in 'chunks'. This breaks the array of point numbers
+into small sequential ranges. The algorithm wins with these smaller sections.
+The other win is that all the array offsets now only index the chunk. By making
+the chunk size smaller we can fit the indices into uint8 or a uint16. Cuts a
+good 40% off of runtime and really drops memory. We could use still significantly
+less memory by using some form of stack to store the results in, currently makes
+a size_t array of numPoints size, which means a good 40MB.
+
+Notice the sizes of results is varying here, so there is some problem which could
+strike these results as irrelevent. I am highly dubious since these are getting
+about half of the results as the unchunked.
+
+Wow, look at the time needed to build the hood. Super fast, and that is (?) valid time.
+(Hulk Hogan eat your heart out!)
+
+    -- with 10k chunks in size_t index
+    Create neighborhood: 3.332 sec
+    Search complete 640842: 0.066 sec
+    Cleanup: 0.008 sec
+    2366256maxresident
+
+    -- with 10k chunks in uint16 index
+    Create neighborhood: 2.942 sec
+    Search complete 592354: 0.054 sec
+    Cleanup: 0.010 sec
+    926848maxresident
+
+    -- with 255 chunks in uint8 index
+    Create neighborhood: 0.887 sec
+    Search complete 332549: 0.053 sec
+    Cleanup: 0.008 sec
+    715744maxresident
+
+
+
+Memory needed to create the 10mill array of points (baseline). So hood and results
+are amount of memory over this.
+    470832maxresident
 
 
 float GetTime(void);
-void CheckSorting(struct NrNrHood* hood);
-void CheckNextIndices(struct NrNrHood* hood);
 void CheckSearch(struct NrNrHood* hood, size_t numResults, const size_t* results, const float* minimum, const float* maximum);
 
 
 
     // Create random points
 
-    srand(12345);
+    srand(1234);
     size_t numPoints = 10000000;
     start = GetTime();
     float* points = (float*)malloc(sizeof(float) * 3 * numPoints);
 		p[1] = ((float)rand() / RAND_MAX) * 200.0 - 100.0;
 		p[2] = ((float)rand() / RAND_MAX) * 200.0 - 100.0;
 		p[3] = ((float)rand() / RAND_MAX) * 200.0 - 100.0;
-        //printf("  pt %zu: %.2f %.2f %.2f\n",i, p[0], p[1], p[2]);
+        //printf("  pt %zu: %.1f %.1f %.1f\n",i, p[0], p[1], p[2]);
     }
 	//printf("Create random points (%zu): %.3f sec\n", numPoints, GetTime() - start);
 
     struct NrNrHood* hood = NrNrCreateNeighborhood(numPoints, 3, points);
 	printf("Create neighborhood: %.3f sec\n", GetTime() - start);
 
-#if DOCHECKS
-    printf("Checking sorting\n");
-    CheckSorting(hood);
-    printf("Checking nexts\n");
-    CheckNextIndices(hood);
-#endif
-
     // Search block
 
     start = GetTime();
     float minPos[3] = {-40.0f,-20.0f,-60.0f};
     float maxPos[3] = { 40.0,  60.0f, 20.0f};
-//    printf("Searching: %.1f - %.1f,  %.1f - %.1f, %.1f - %.1f\n", minPos[0], maxPos[0], minPos[1], maxPos[1], minPos[2],maxPos[2]);
+    //printf("Searching: %.1f - %.1f,  %.1f - %.1f, %.1f - %.1f\n", minPos[0], maxPos[0], minPos[1], maxPos[1], minPos[2],maxPos[2]);
 
     size_t* results;
     size_t count = NrNrSearchBlock(hood, &results, minPos, maxPos);
-    printf("Search block %zu: %.3f sec\n", count, GetTime() - start);
+    printf("Search complete %zu: %.3f sec\n", count, GetTime() - start);
+
+#if 0
+for(size_t z = 0; z < count; ++z) {
+    float* pos = points + results[z] * 3;
+    printf("  found %zu: %.1f %.1f %.1f\n", results[z], pos[0], pos[1], pos[2]);
+}
+#endif
+
 #if DOCHECKS
     CheckSearch(hood, count, results, minPos, maxPos);
 #endif
 
 
 
-
+#if 0
 void CheckSorting(struct NrNrHood* hood) {
     // Check value ordering
     for (size_t d = 0; d < hood->dimensions; ++d) {
         }
     }
 }
+#endif
 
 
 void CheckSearch(struct NrNrHood* hood, size_t numResults, const size_t* results, const float* minimum, const float* maximum) {
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.