Commits

Peter Shinners committed 2cfda6f

Minimize code and branches in the search loop for 15% speedup.

Comments (0)

Files changed (3)

 
 
 
-void SortIndices(size_t* array, int numPoints, const float* points, size_t axis, size_t dimensions);
-struct range BisectIndices(const struct NrNrHood* hood, float minVal, float maxVal, size_t axis);
+static void SortIndices(size_t* array, int numPoints, const float* points, size_t axis, size_t dimensions);
+static struct range BisectIndices(const struct NrNrHood* hood, float minVal, float maxVal, size_t axis);
+
+
+#include <sys/time.h>
+#include <time.h>
+static float GetTime(void)
+{
+	static struct timeval timeval, first_timeval;
+	gettimeofday(&timeval, 0);
+	if(first_timeval.tv_sec == 0) {
+		first_timeval = timeval;
+		return 0;
+	}
+	unsigned int s = (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000.0;
+    return s / 1000.0f;
+}
 
 
 
     // Find inverse mappifng to sort index (point to index)
     // This goes into a temporary set of inverse indices
 
-    size_t* inverse = malloc(sizeof(size_t) * numPoints * dimensions);
+    size_t* inverse = malloc(sizeof(size_t) * numPoints);
+
+    // Build next arrays that map a point from one dimension to the next
+    // Allows jumping from dim0 index N to dim1, and then to dim2, ...
 
     for (size_t d = 0; d < dimensions; ++d) {
+        // Build an index of the 'inverse' mapping from point number to sorted index
         size_t* ind = hood->indices + numPoints * d;
-        size_t* inv = inverse + numPoints * d;
         for (size_t i = 0; i < numPoints; ++i) {
-            inv[ind[i]] = i;
+            inverse[ind[i]] = i;
         }
-    }
 
-    // Build next arrays that map a point from one dimension to the next
-    // Allows jumping from dim0 index N to dim1, and then to dim2, ...
-
-    for (size_t d = 0; d < dimensions; ++d) {
+        // Fill out the next array, move 'ind' to the array coming from
         size_t* next = hood->nextIndices + numPoints * d;
-        size_t* inv = inverse + numPoints * d;
-        size_t* ind;
         if (d) {
             ind = hood->indices + numPoints * (d - 1);
         } else {
             ind = hood->indices + numPoints * (dimensions - 1);
         }
         for (size_t i = 0; i < numPoints; ++i) {
-            next[i] = inv[ind[i]];
+            next[i] = inverse[ind[i]];
         }
     }
 
 
     // Find values common to each dimension
 
-    size_t* common = (size_t*)malloc(sizeof(size_t) * minSize);
+    size_t* common = (size_t*)malloc(sizeof(size_t) * (minSize + hood->dimensions + hood->dimensions));
     size_t* found = common;
 
+    // To eliminate as much code as possible fro the search loop, precompute several needed values
+
+    size_t* workAxis = common + minSize;
+    size_t** workIndices = (size_t**)workAxis + hood->dimensions;
+    workIndices[0] = 0; // unused
+    workAxis[0] = minAxis; // unused
+    for (size_t d = 1; d < hood->dimensions; ++d) {
+        workAxis[d] = minAxis + d;
+        if (workAxis[d] >= hood->dimensions) {
+            workAxis[d] -= hood->dimensions;
+        }
+        workIndices[d] = hood->nextIndices + hood->numPoints * workAxis[d];
+    }
+
+    // The main search loop. All searching time is spent here.
+
+    float start = GetTime();
     for (size_t minStep = ranges[minAxis].start; minStep < ranges[minAxis].end; ++minStep) {
-        size_t d;
         size_t workStep = minStep;
-        for (d = 1; d < hood->dimensions; ++d) {
-            size_t workAxis = minAxis + d;
-            if (workAxis >= hood->dimensions) {
-                workAxis -= hood->dimensions;
-            }
-            size_t pt = hood->nextIndices[hood->numPoints * workAxis + workStep];
-            if (pt < ranges[workAxis].start || pt > ranges[workAxis].end) {
-                break;
-            }
+        int match = 1;
+        for (size_t d = 1; match && d < hood->dimensions; ++d) {
+            size_t pt = workIndices[d][workStep];
+            match = (pt >= ranges[workAxis[d]].start && pt <= ranges[workAxis[d]].end);
             workStep = pt;
         }
-        if (d == hood->dimensions) {
+        if (match) {
             *found = hood->indices[hood->numPoints * minAxis + minStep];
             ++found;
         }
     }
+    printf("  SCAN: %.3f sec\n", GetTime() - start);
 
     // Handle return values
 
     fullStart = start = hood->indices + hood->numPoints * axis;
     fullEnd = end = fullStart + hood->numPoints;
 
+    float fullEndValue = hood->points[*fullEnd * hood->dimensions + axis];
+
     // TODO, early out when maxVal < start point or minVal > end point
     while (start < end) {
         size_t dist = end - start;
         float halfValue = hood->points[*half * hood->dimensions + axis];
         if (minVal <= halfValue) {
             end = half;
+#if 0
+            if (fullEndValue < halfValue) {
+                fullEnd = half;
+                fullEndValue = hood->points[*fullEnd * hood->dimensions + axis];
+            }
+#endif
         } else {
             start = half + 1;
         }
 nrnr
 Create neighborhood: 15.615 sec
-50r = Search block 5120872: 0.296 sec
+80r = Search block 5120872: 0.296 sec
 40r = Search block 639449: 0.114 sec
 20r = Search block 79971: 0.047 sec
 10r = Search block 20031: 0.021 sec
 40r 2d: .057 sec
 40r 1d: .023 sec
 
-
-
 kdtree
 inserting 10000000 random vectors... 29.000 sec
 80r = range query returned 2680049 items in 1.02700 sec
 cleaned up in 2.32100 sec
 3169120maxresident
 
-
-
 nanoflann
 Create Kdtree: 12.735sec
-80r = Search: 2680049, 0.511999sec
-40r = Search: 334901, 0.0760002sec
-20r = Search: 41902, 0.00699997sec
-10r = Search: 5175, 0.000999451sec
+80r = Search: 2680049, 0.512sec
+40r = Search: 334901, 0.076sec  ! crazy, similar speed between 20 and 40 unit radius !
+20r = Search: 41902, 0.07sec
+10r = Search: 5175, 0.001sec
 Cleanup: 0.0279999sec
 1180688maxresident
 
 
+
+On the creation of the neighborhood. Reduce the 'inverse' to a single array instead
+of one per dimension. This shortens create time from 16.5s to 12.0s. Reduces memory
+by 76mb
+
+Create neighborhood: 12.069 sec
+Search block 639983: 0.116 sec
+Cleanup: 0.011 sec
+2658256maxresident
+
+
+
+Area of a unit sphere is ~.52. Validated by the number of items found
+in the rectangular search compared to the spherical search of kdtree and nanoflann
+
+
 		p[3] = ((float)rand() / RAND_MAX) * 200.0 - 100.0;
         //printf("  pt %zu: %.2f %.2f %.2f\n",i, p[0], p[1], p[2]);
     }
-	printf("Create random points (%zu): %.3f sec\n", numPoints, GetTime() - start);
+	//printf("Create random points (%zu): %.3f sec\n", numPoints, GetTime() - start);
 
     // Create neighborhood