Commits

Peter Shinners committed 30b174b

got the problems worked out. settling on uint16 indices with 32k chunks as the current fastest.

Comments (0)

Files changed (4)

     size_t indexStep = hood->dimensions * NRNRCHUNK;
     for (size_t c = 0; c < hood->chunks; ++c,
                 chunkIndices += indexStep, chunkNextIndices += indexStep, chunkPoints += indexStep) {
-        index_t chunkSize = numPoints - (c * NRNRCHUNK);
+        size_t chunkSize = numPoints - (c * NRNRCHUNK);
         if (chunkSize > NRNRCHUNK) {
             chunkSize = NRNRCHUNK;
         }
+//printf("CHUNK %zu   chunkSize=%zu\n", c, chunkSize);
 
         // Initialize indices with list of all points
         for (index_t i = 0; i < chunkSize; ++i) {
             chunkIndices[i] = i;
         }
-//printf("presorted 0:"); for (size_t z=0; z < chunkSize; ++z) printf(" %zu", chunkIndices[z]); printf("\n");
+//printf("presorted 0:"); for (size_t z=0; z < chunkSize; ++z) printf(" %d", (int)chunkIndices[z]); printf("\n");
 
         // Copy ordered point numbers from dimension 0 into all others
         for (size_t d = 1; d < dimensions; ++d) {
             index_t* array = chunkIndices + d * chunkSize;
             SortIndices(array, chunkSize, chunkPoints, d, dimensions);
 //printf("   sorted %zu:", d); for (size_t z=0; z < chunkSize; ++z) printf("  %zu(%.1f)", array[z], chunkPoints[array[z] * dimensions + d]); printf("\n");
+//printf("   sorted %d:", (int)d); for (size_t z=0; z < chunkSize; ++z) printf(" %d", (int)array[z]); printf("\n");
         }
 
         // Build next arrays that map a point from one dimension to the next
     }
 
     free(inverse);
-
     return hood;
 }
 
     index_t* chunkNextIndices = hood->nextIndices;
     const float* chunkPoints = hood->points;
     size_t chunkOffset = 0;
+//printf("Total chunks: %zu\n", hood->chunks);
     size_t indexStep = hood->dimensions * NRNRCHUNK;
     for (size_t c = 0; c < hood->chunks; ++c,
                 chunkIndices += indexStep, chunkNextIndices += indexStep, chunkPoints += indexStep, chunkOffset += NRNRCHUNK) {
-        index_t chunkSize = hood->numPoints - (c * NRNRCHUNK);
+        size_t chunkSize = hood->numPoints - (c * NRNRCHUNK);
+        size_t chunkFound = 0;
         if (chunkSize > NRNRCHUNK) {
             chunkSize = NRNRCHUNK;
         }
-//printf("Search Chunk %zu  size=%zu\n", c, chunkSize);
+//printf("Search Chunk %zu  size=%d\n", c, (int)chunkSize);
 
         // Find ranges of indices for each dimension
 
+        int emptyBisect = 0;
         for (size_t d = 0; d < hood->dimensions; ++d) {
             index_t* array = chunkIndices + chunkSize * d;
             ranges[d] = BisectIndices(array, chunkPoints, chunkSize, hood->dimensions, minimum[d], maximum[d], d);
-//printf("  bisect %zu: %zu-%zu, ", d, ranges[d].start, ranges[d].end);
-//for (size_t z=ranges[d].start; z<ranges[d].end; ++z) printf("  %zu(%.1f)", array[z] + chunkOffset, chunkPoints[array[z] * hood->dimensions + d]); printf("\n");
+//printf("  bisect %zu: %d-%d, ", d, (int)ranges[d].start, (int)ranges[d].end);
+//for (size_t z=ranges[d].start; z<ranges[d].end; ++z) printf("  %zu(%.1f)", array[z] /*+ chunkOffset*/, chunkPoints[array[z] * hood->dimensions + d]); printf("\n");
+//for (size_t z=ranges[d].start; z<ranges[d].end; ++z) printf(" %d", (int)array[z] /*+ chunkOffset*/); printf("\n");
             if (ranges[d].start == ranges[d].end) {
-                continue;
+                emptyBisect = 1;
+//printf("   chunk found %zu: 0 (empty bisect %zu)\n", c, d);
+                break;
             }
         }
+        if (emptyBisect) {
+            continue;
+        }
 
         // Find dimension with the least number of matches
 
             }
         }
 
-        // To eliminate as much code as possible fro the search loop, precompute several needed values
+        // To eliminate as much code as possible for the search loop,
+        // precompute several needed values
 
         for (size_t d = 1; d < hood->dimensions; ++d) {
             workAxis[d] = minAxis + d;
         for (index_t minStep = ranges[minAxis].start; minStep < ranges[minAxis].end; ++minStep) {
             index_t workStep = minStep;
             int match = 1;
+//printf("     axis=%zu  index=%d\n", minAxis, (int)minStep);
             for (size_t d = 1; d < hood->dimensions; ++d) {
                 index_t pt = workNexts[d][workStep];
-                if (!(match = (pt >= ranges[workAxis[d]].start && pt <= ranges[workAxis[d]].end)))
+//printf("         axis=%zu  index=%d   (%d-%d)\n", workAxis[d], (int)pt, (int)ranges[workAxis[d]].start, (int)ranges[workAxis[d]].end);
+                if (!(match = (pt >= ranges[workAxis[d]].start && pt < ranges[workAxis[d]].end)))
                     break;
                 workStep = pt;
             }
             if (match) {
                 *found++ = minIndices[minStep] + chunkOffset;
+                ++chunkFound;
             }
         }
+//printf("   chunk found %zu: %zu (min axis was %d)\n", c, chunkFound, (int)minSize);
     }
 
     // Handle return values
 #include <inttypes.h>
 
 
-#define NRNRCHUNK 6
+//#define NRNRCHUNK UINT16_MAX
+#define NRNRCHUNK 32000
 
 
-typedef size_t index_t;
+typedef uint16_t index_t;
 
 struct NrNrHood {
     size_t numPoints;
 Memory needed to create the 10mill array of points (baseline). So hood and results
 are amount of memory over this.
     470832maxresident
+
+
+
+Fixed the incorrectness issues. Create time for uint8 is not <1sec. But I think
+it can still be reduced. Want to profile differnt sections first.
+
+    -- with 255 chunks in uint8
+    Create neighborhood: 1.978 sec
+    Search complete 638857: 0.080 sec
+    Cleanup: 0.009 sec
+    725264maxresident
+
+    -- with 65k chunks in uint16
+    Create neighborhood: 4.037 sec
+    Search complete 638857: 0.056 sec
+    Cleanup: 0.015 sec
+    960112maxresident
+
+    -- with 100k chunks in uint32
+    Create neighborhood: 4.370 sec
+    Search complete 638857: 0.062 sec
+    Cleanup: 0.022 sec
+    1428320maxresident
+
+
+The init could still be threaded to cut it down more.
+Actually, with the chunking the search and create could be easily threaded by chunk.
+
+With the uint16 indices (and 32k chunks)
+10r = Search complete 10034: 0.010 sec
+20r = Search complete 79836: 0.025 sec
+40r = Search complete 638857: 0.055 sec
+80r = Search complete 5118520: 0.100 sec
+
+
+Next up, speed up the create. Is it the filling of the initial numbered array? If so that may
+be sped up using memcpy to duplicate each chunk?
+
+
+If I break the data into chunks AFTER sorting, could that help things? PERHAPS!
+Currently all chunks are finding 20k points on the min axis and resulting in about 4k.
+That is from the 65k chunks of uint16.
+
+Perhaps optimizing the uint8 case by finding quicky early outs for scanning sections?
+
+
+Moving uint16 from 65k chunks to 32k gave speedup to larger search radius, but had no
+effect on the 10r and 20r searches.
 #include <time.h>
 
 
-#define DOCHECKS 1
+#define DOCHECKS 0
 
 
 float GetTime(void);
     // Create random points
 
     srand(1234);
-    size_t numPoints = 180;
+    size_t numPoints = 10000000;
     start = GetTime();
     float* points = (float*)malloc(sizeof(float) * 3 * numPoints);
 	for(size_t i=0; i<numPoints; i++) {
 
     // Search block
     start = GetTime();
+#if 0  // 10r
+    float minPos[3] = {-10.0f,10.0f,-30.0f};
+    float maxPos[3] = { 10.0,  30.0f, -10.0f};
+#elif 0 // 20r
+    float minPos[3] = {-20.0f, 0.0f, -40.0f};
+    float maxPos[3] = { 20.0,  40.0f, 00.0f};
+#elif 1 // 40r
     float minPos[3] = {-40.0f,-20.0f,-60.0f};
     float maxPos[3] = { 40.0,  60.0f, 20.0f};
-    printf("Searching: %.1f - %.1f,  %.1f - %.1f, %.1f - %.1f\n", minPos[0], maxPos[0], minPos[1], maxPos[1], minPos[2],maxPos[2]);
+#elif 1 // 80r
+    float minPos[3] = {-80.0f,-60.0f,-100.0f};
+    float maxPos[3] = { 80.0,  100.0f, 60.0f};
+#endif
+
+    //printf("Searching: %.1f - %.1f,  %.1f - %.1f, %.1f - %.1f\n", minPos[0], maxPos[0], minPos[1], maxPos[1], minPos[2],maxPos[2]);
 
     size_t* results;
     size_t count = NrNrSearchBlock(hood, &results, minPos, maxPos);
     printf("Search complete %zu: %.3f sec\n", count, GetTime() - start);
 
-#if 1
+#if 0
 for(size_t z = 0; z < count; ++z) {
     float* pos = points + results[z] * 3;
     printf("  found %zu: %.1f %.1f %.1f\n", results[z], pos[0], pos[1], pos[2]);
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.