Commits

Evan Gates  committed e218777

add rand_lines.c: print n random d delimited lines from file

  • Participants
  • Parent commits 7118956

Comments (0)

Files changed (1)

File rand_lines.c

+// rand_lines
+// Usage: rand_lines <delim> <nlines> <file>
+//
+// given file with "delim" delimited lines, print "nlines" random lines in
+// random order to stdout
+
+#define _POSIX_C_SOURCE 200809L
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#define RAND_PATH "/dev/urandom"
+#define BAIL() do { status = 1; goto cleanup; } while (0)
+
+typedef struct {
+    char  *beg;
+    size_t len;
+} Line;
+
+char *prog;
+
+// Retry reads and writes on EINTR
+ssize_t rw(ssize_t (*act)(), int fd, void *buf, size_t n)
+{
+    ssize_t r;
+    do r = act(fd, buf, n);
+    while (r < 0 && errno == EINTR);
+    return r;
+}
+
+// Continually read or write until entire n bytes are read or written
+int rw_all(ssize_t (*act)(), int fd, void *buf, size_t n)
+{
+    char *b = buf;
+    for (ssize_t r; n; b += r, n -= r)
+        if ((r = rw(act, fd, b, n)) < 0)
+            return -1;
+    return 0;
+}
+
+// Random number generation from
+// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
+uint32_t s[4]; // Seed
+double drand32(void)
+{
+    uint64_t t;
+    uint32_t r;
+    s[0] = 314527869 * s[0] + 1234567;
+    s[1] ^= s[1] << 5; s[1] ^= s[1] >> 7; s[1] ^= s[1] << 22;
+    t = 4294584393ULL * s[2] + s[3]; s[3] = t >> 32; s[2] = t;
+    r = s[0] + s[1] + s[2];
+    return (double)r / (1ULL << 32);
+}
+
+int srand32(void)
+{
+    int status = 0, e = 0, fd;
+    if (0 > (fd = open(RAND_PATH, O_RDONLY))) BAIL();
+    if (0 > rw_all(read, fd, s, sizeof(s)))   BAIL();
+
+cleanup:
+    e = errno;
+    if (fd > 0) close(fd); // ignore errors
+    errno = e;
+    return -status;
+}
+
+// Starting at *file, read n lines delimited by delim or up to *len bytes
+// and store Lines in array at lines. If last line has no delimter, add it.
+// Update *file and *len accordingly.
+int read_lines(char delim, size_t n, Line *lines, char **file, off_t *len)
+{
+    size_t i = n;
+    while (i-- && *len > 0) {
+        char *end = memchr(*file, delim, *len);
+        if (!end) { // reached end of file, no delimiter
+            end = *file + *len;
+            *end = delim;
+        }
+        ++end;
+        lines[i] = (Line){ *file, end - *file };
+        *file = end;
+        *len -= lines[i].len;
+    }
+    return n - ++i;
+}
+
+// Write n Lines from lines to stdout
+int write_lines(Line *lines, size_t n)
+{
+    while (n--)
+        if (rw_all(write, 1, lines[n].beg, lines[n].len) < 0)
+            return -1;
+    return 0;
+}
+
+void shuffle_lines(Line *lines, size_t n)
+{
+    for (size_t i = n - 1; i; --i) {
+        size_t r = drand32() * (i + 1);
+        Line   t = lines[i];
+        lines[i] = lines[r];
+        lines[r] = t;
+    }
+}
+
+// Starting at file, read n lines delimited by delim.
+// For each subsequent line read, accept line with
+// probability n / nl and replace random existing line.
+int rand_lines(char delim, int n, char *file, off_t len)
+{
+    Line  *lines;
+    Line  new;
+    char *beg = file;
+    int   status = 0;
+
+    if (!(lines = malloc(n * sizeof(Line))))
+        BAIL();
+    if (read_lines(delim, n, lines, &beg, &len) != n) {
+        fprintf(stderr, "%s: more lines requested than in file\n", prog);
+        errno = 0;
+        BAIL();
+    }
+
+    for (size_t nl = n + 1; read_lines(delim, 1, &new, &beg, &len); ++nl)
+        if (drand32() * nl < (double)n)
+            lines[(size_t)(drand32() * n)] = new;
+
+    shuffle_lines(lines, n);
+
+    if (write_lines(lines, n) < 0)
+        BAIL();
+
+cleanup:
+    if (lines) free(lines);
+    return -status;
+}
+
+int main(int argc, char **argv)
+{
+    char  delim, *file = NULL;
+    int   n, fd = -1;
+    off_t fsize;
+    int   status = 0;
+
+    prog = *argv;
+
+    if (argc != 4)                  { fprintf(stderr, "Usage: %s <delim> <nlines> <file>\n", prog); return 1; }
+    if (   1 > (n = atoi(argv[2]))) { fprintf(stderr, "%s: nlines must be greater than 1\n", prog); return 1; }
+    delim = *argv[1];
+
+    if (         0 >  srand32())                                                                   BAIL();
+    if (         0 >  (fd    = open(argv[3], O_RDONLY)))                                           BAIL();
+    if (         0 >  (fsize = lseek(fd, 0, SEEK_END)))                                            BAIL();
+    if (MAP_FAILED == (file  = mmap(NULL, fsize + 1, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0))) BAIL();
+    if (         0 >  rand_lines(delim, n, file, fsize))                                           BAIL();
+
+cleanup:
+    if (status && errno)                       perror(prog);
+    if (file && munmap(file, fsize + 1) < 0) { perror(prog); status = 1; }
+    if (fd > 0 && close(fd))                 { perror(prog); status = 1; }
+    return status ? EXIT_FAILURE : EXIT_SUCCESS;
+}