Commits

Wez Furlong  committed 3ccb060

Add log-file option to capture the actions of the monitor.

Improve robustness around spawning and monitoring the tracer process by
making the list manipulations signal safe (we block SIGCHLD while
manipulating the list) and by adding a separate tracer-interval
parameter to establish a deadline for extant tracers to complete their
work.

Resolve an issue where the monitor may consider a tracer as not being in
the running state and terminate the to-be-traced child out from under
the tracer.

refs #8886 (spent 1)

  • Participants
  • Parent commits c6269f0
  • Tags 1.1.53

Comments (0)

Files changed (4)

   { "arg0", "GIMLI_ARG0", OPT_STRING, &arg0 },
   { "run-once", "GIMLI_RUN_ONCE", OPT_INTEGER, &run_only_once },
   { "immortal", "GIMLI_IMMORTAL", OPT_INTEGER, &immortal_child },
+  { "log-file", "GIMLI_LOG_FILE", OPT_STRING, &log_file },
+  { "trace-interval", "GIMLI_TRACER_INTERVAL",
+    OPT_INTEGER, &trace_interval },
   { NULL }
 };
 
 /*
- * Copyright (c) 2008-2009 Message Systems, Inc. All rights reserved
+ * Copyright (c) 2008-2011 Message Systems, Inc. All rights reserved
  * For licensing information, see:
  * https://bitbucket.org/wez/gimli/src/tip/LICENSE
  */
 };
 
 extern int debug, quiet, detach, watchdog_interval, watchdog_start_interval,
-  watchdog_stop_interval, do_setsid, respawn_frequency;
+  watchdog_stop_interval, do_setsid, respawn_frequency, trace_interval;
 extern int run_only_once;
 extern int immortal_child;
 extern int run_as_uid, run_as_gid;
 extern char *glider_path, *trace_dir, *gimli_progname, *pidfile, *arg0;
+extern char *log_file;
 extern int gimli_nthreads;
 extern struct gimli_thread_state *gimli_threads;
 extern struct gimli_object_file *gimli_files;
 extern struct gimli_object_mapping *gimli_mappings;
 
+extern void logprint(const char *fmt, ...);
+
 struct gimli_object_mapping *gimli_add_mapping(
   const char *objname, void *base, unsigned long len,
   unsigned long offset);
 /*
- * Copyright (c) 2008-2009 Message Systems, Inc. All rights reserved
+ * Copyright (c) 2008-2011 Message Systems, Inc. All rights reserved
  * For licensing information, see:
  * https://bitbucket.org/wez/gimli/src/tip/LICENSE
  */
 int watchdog_interval = 60;
 int watchdog_start_interval = 200;
 int watchdog_stop_interval = 60;
+int trace_interval = 60;
 int respawn_frequency = 15;
 int run_as_uid = -1;
 int run_as_gid = -1;
 char **child_argv;
 char *arg0 = NULL;
 char *pidfile = NULL;
+char *log_file = NULL;
 char *glider_path = GIMLI_GLIDER_PATH;
 char *trace_dir = "/tmp";
 char *child_image;
 void wait_for_exit(struct kid_proc *p, int timeout);
 void wait_for_child(struct kid_proc *p);
 
+void logprint(const char *fmt, ...)
+{
+  va_list ap;
+  char buf[8192];
+  struct tm tmbuf;
+  struct tm *tmnow;
+  time_t now;
+  int i;
+
+  time(&now);
+  tmnow = localtime_r(&now, &tmbuf);
+  i = strftime(buf, sizeof(buf)-1, "[%a %d %b %Y %H:%M:%S] Monitor: ", tmnow);
+  buf[i] = '\0';
+
+  va_start(ap, fmt);
+  vsnprintf(buf + i, sizeof(buf) - i - 1, fmt, ap);
+  va_end(ap);
+
+  fwrite(buf, 1, strlen(buf), stderr);
+  fflush(stderr);
+}
+
 static void catch_sigchld(int sig_num)
 {
   pid_t dead_pid;
 {
   struct kid_proc *p;
 
-  fprintf(stderr, "monitor: caught signal %s, restarting child\n",
+  logprint("caught signal %s, restarting child\n",
     strsignal(sig_num));
 
   for (p = procs; p; p = p->next) {
 {
   struct kid_proc *p;
 
-  fprintf(stderr, "monitor: caught signal %s, terminating.\n",
+  logprint("caught signal %s, terminating.\n",
     strsignal(sig_num));
   should_exit = 1;
   respawn = 0;
 {
   if (hb_file[0]) {
     if (debug) {
-      fprintf(stderr, "unlinking %s\n", hb_file);
+      logprint("unlinking %s\n", hb_file);
     }
     unlink(hb_file);
   }
   snprintf(hb_file, sizeof(hb_file)-1, "/tmp/gimlihbXXXXXX");
   fd = mkstemp(hb_file);
   if (fd == -1) {
-    fprintf(stderr, 
-      "monitor: failed to open heartbeat file: %s\n", hb_file);
+    logprint(
+      "failed to open heartbeat file: %s\n", hb_file);
     hb_file[0] = '\0';
     return 0;
   }
   if (debug) {
-    fprintf(stderr, "monitor: opened hearbeat file: %s\n", hb_file);
+    logprint("opened hearbeat file: %s\n", hb_file);
   }
 
   /* make sure the file is sized big enough for the heartbeat, otherwise
   addr = mmap(NULL, sizeof(*heartbeat),
                 PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
   if (debug) {
-    fprintf(stderr, "monitor: mmap fd=%d -> addr %p (%s)\n",
+    logprint("mmap fd=%d -> addr %p (%s)\n",
       fd, addr, strerror(errno));
   }
   unlink(hb_file);
   /* close(fd); */
 
   if (addr == MAP_FAILED) {
-    perror("monitor: failed to map heartbeat memory");
+    perror("failed to map heartbeat memory");
     return 0;
   }
 
   return 1;
 }
 
+static void mask_signal(int how, int signo)
+{
+  sigset_t s;
+
+  sigemptyset(&s);
+  sigaddset(&s, signo);
+
+  sigprocmask(how, &s, NULL);
+}
+
+static void link_child(struct kid_proc *p)
+{
+  mask_signal(SIG_BLOCK, SIGCHLD);
+
+  p->next = procs;
+  if (procs) {
+    procs->prev = p;
+  }
+  procs = p;
+
+  mask_signal(SIG_UNBLOCK, SIGCHLD);
+}
+
+static void unlink_child(struct kid_proc *p)
+{
+  mask_signal(SIG_BLOCK, SIGCHLD);
+
+  if (procs == p) {
+    procs = p->next;
+  }
+  if (p->next) {
+    p->next->prev = p->prev;
+  }
+  if (p->prev) {
+    p->prev->next = p->next;
+  }
+
+  mask_signal(SIG_UNBLOCK, SIGCHLD);
+}
+
 static struct kid_proc *spawn_child(void)
 {
   struct kid_proc *p;
-  
+
   p = calloc(1, sizeof(*p));
   if (!p) {
-    fprintf(stderr, "calloc(): %s\n", strerror(errno));
+    logprint("calloc(): %s\n", strerror(errno));
     return NULL;
   }
 
   /* link this in first, so that we don't race if the child dies
    * immediately */
   p->running = 1;
-  p->next = procs;
-  if (procs) {
-    procs->prev = p;
-  }
-  procs = p;
+  link_child(p);
 
   p->pid = fork();
   if (p->pid == 0) {
   sleep(4);
   if (p->pid == -1 || !p->running) {
     if (p->pid == -1) {
-      fprintf(stderr, "fork() failed: %s\n", strerror(errno));
+      logprint("fork() failed: %s\n", strerror(errno));
     } else {
-      fprintf(stderr, "child died immediately on startup\n");
+      logprint("child died immediately on startup\n");
     }
     /* unlink */
-    procs = p->next;
+    unlink_child(p);
     free(p);
     return NULL;
   }
 
   trc = calloc(1, sizeof(*trc));
   trc->tracer_for = p->pid;
+  trc->running = 1;
 
   snprintf(childname, sizeof(childname)-1, "%s", child_argv[0]);
 
     trace_dir, basename(childname), p->pid);
   tracefd = open(tracefile, O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0600);
   if (tracefd == -1) {
-    fprintf(stderr, "Unable to open trace file %s: %s\n",
+    logprint("Unable to open trace file %s: %s\n",
       tracefile, strerror(errno));
   } else {
     char buf[2048];
     time_t now;
     int i;
 
-    fprintf(stderr, "Tracing to file: %s\n", tracefile);
+    logprint("Tracing to file: %s\n", tracefile);
 
     time(&now);
 
       );
     write(tracefd, buf, strlen(buf));
 
+    link_child(trc);
+
     trc->pid = fork();
     if (trc->pid == 0) {
       setup_signal_handlers(1);
       dup2(tracefd, 1);
       dup2(tracefd, 2);
       close(tracefd);
-      _exit(execlp(cmdbuf, cmdbuf, pidbuf, (char*)NULL));
+      execlp(cmdbuf, cmdbuf, pidbuf, (char*)NULL);
+      logprint("execlp: %s %s failed: %s\n", cmdbuf, pidbuf, strerror(errno));
+      _exit(1);
     }
     if (trc->pid == -1) {
       int err = errno;
-      fprintf(stderr, "fork() failed while tracing child %d: %s\n",
+      logprint("fork() failed while tracing child %d: %s\n",
           p->pid, strerror(err));
       snprintf(buf, sizeof(buf)-1,
           "fork() failed while launching tracer: %s\n",
           strerror(err));
       write(tracefd, buf, strlen(buf));
+      unlink_child(trc);
       free(trc);
       trc = NULL;
     } else {
-      trc->next = procs;
-      if (procs) {
-        procs->prev = trc;
-      }
-      procs = trc;
       /* force a context switch to allow enough time for the child to run
        * so that we can wait for it */
       sleep(2);
-      wait_for_child(trc);
+      if (debug) {
+        logprint("waiting for tracer to exit (pid=%d)\n", trc->pid);
+      }
+      wait_for_exit(trc, trace_interval);
+      if (trc->running) {
+        logprint("tracer is taking too long, terminating it\n");
+        while (trc->running) {
+          kill(trc->pid, SIGKILL);
+          wait_for_exit(trc, 2);
+        }
+      }
     }
 
     close(tracefd);
   }
 
   /* process is done; unlink from the list */
-  if (procs == p) {
-    procs = p->next;
-  }
-  if (p->next) {
-    p->next->prev = p->prev;
-  }
-  if (p->prev) {
-    p->prev->next = p->next;
-  }
+  unlink_child(p);
 }
 
 int main(int argc, char *argv[])
   struct kid_proc *p;
 
   if (argc < 2) {
-    fprintf(stderr, "not enough arguments\n");
+    logprint("not enough arguments\n");
     return 1;
   }
   argv = gimli_init_proctitle(argc, argv);
   }
 
   if (debug) {
-    fprintf(stderr, "Child to monitor: (argc=%d) ", child_argc);
+    logprint("Child to monitor: (argc=%d) ", child_argc);
     for (i = 0; i < child_argc; i++) {
-      if (i) fprintf(stderr, " ");
-      fprintf(stderr, "%s", child_argv[i]);
+      if (i) logprint(" ");
+      logprint("%s", child_argv[i]);
     }
-    fprintf(stderr, "\n");
+    logprint("\n");
   }
 
   if (!prep()) {
 
   if (detach) {
     if (debug) {
-      fprintf(stderr, "detaching to spawn %s\n", child_argv[0]);
+      logprint("detaching to spawn %s\n", child_argv[0]);
     }
     if (fork()) {
       exit(0);
     }
   } else {
     if (debug) {
-      fprintf(stderr, "starting new session for %s\n", child_argv[0]);
+      logprint("starting new session for %s\n", child_argv[0]);
     }
     setsid();
   }
 
     fd = open(pidfile, O_RDWR|O_CREAT, 0644);
     if (fd == -1) {
-      fprintf(stderr, "Failed to open pidfile %s for write: %s\n",
+      logprint("Failed to open pidfile %s for write: %s\n",
         pidfile, strerror(errno));
       exit(1);
     }
       len = read(fd, pidstr, sizeof(pidstr)-1);
       pidstr[len] = '\0';
 
-      fprintf(stderr, "Failed to lock pidfile %s: process %s owns it: %s\n",
+      logprint("Failed to lock pidfile %s: process %s owns it: %s\n",
         pidfile, pidstr, strerror(errno));
       exit(1);
     }
   /* drop privs if appropriate */
   if (run_as_gid != -1) {
     if (setgid(run_as_gid)) {
-      fprintf(stderr, "Failed to setgid(%d): %s\n",
+      logprint("Failed to setgid(%d): %s\n",
         run_as_gid, strerror(errno));
       exit(1);
     }
   }
   if (run_as_uid != -1) {
     if (setuid(run_as_uid)) {
-      fprintf(stderr, "Failed to setuid(%d): %s\n",
+      logprint("Failed to setuid(%d): %s\n",
         run_as_uid, strerror(errno));
       exit(1);
     }
   }
 
+  if (log_file) {
+    int logfd = open(log_file, O_WRONLY|O_APPEND|O_CREAT, 0600);
+
+    if (logfd == -1) {
+      logprint("unable to open logfile %s: %s\n",
+          log_file, strerror(errno));
+    } else {
+      dup2(logfd, STDOUT_FILENO);
+      dup2(logfd, STDERR_FILENO);
+      close(logfd);
+    }
+  }
+
   if (detach) {
     int devnull = open("/dev/null", O_RDWR);
 
     if (devnull >= 0) {
       dup2(devnull, STDIN_FILENO);
-      if (quiet) {
+      if (quiet && !log_file) {
         dup2(devnull, STDOUT_FILENO);
         dup2(devnull, STDERR_FILENO);
       }
       /* not an abnormal termination */
       int ret = WIFEXITED(p->exit_status) ?
         WEXITSTATUS(p->exit_status) : 0;
-      fprintf(stderr, "child exited with return %d\n", ret);
+      logprint("child exited with return %d\n", ret);
       exit(ret);
     }
     if (run_only_once) {
       }
     }
     if (!running) break;
-    fprintf(stderr, "waiting for %d processes to terminate\n", running);
+    logprint("waiting for %d processes to terminate\n", running);
     wait_for_child(procs);
   }
 
   len = strlen(title);
   memset(title + len, '\0', title_size - len);
   if (debug) {
-    fprintf(stderr, "%s\n", title);
+    logprint("%s\n", title);
   }
 }