Source

CLM_CN_global_to_AQMEII-NA / netCDF.stats.to.stdout.r

Full commit
# R code to write simple stats (min, mean, median, max) for an input file.
# Run this like
# $ Rscript ./netCDF.stats.to.stdout.r netcdf.fp=./GEIA_N2O_oceanic.nc var.name=emi_n2o
# or
# > source('./netCDF.stats.to.stdout.r')
# > netCDF.stats.to.stdout(...)

# ----------------------------------------------------------------------
# constants
# ----------------------------------------------------------------------

this.fn <- 'netCDF.stats.to.stdout.r'      # TODO: get like $0

# ----------------------------------------------------------------------
# functions
# ----------------------------------------------------------------------

# syntactic sugar
q1 <- function(vec) { quantile(vec, 0.25) } # first quartile
q3 <- function(vec) { quantile(vec, 0.75) } # third quartile

# the main event
netCDF.stats.to.stdout <- function(
  netcdf.fp, # /path/to/netcdf/file, can be relative or FQ
  data.var.name,   # name of data variable (datavar) of interest
  stats.precision=3 # sigdigs to use for min, median, max of obs  
) {

  # TODO: test arguments!

#  # start debug
#  cat(sprintf(
#    '%s: netcdf.fp==%s, data.var.name==%s\n',
#    this.fn, netcdf.fp, data.var.name))
#  system(sprintf('ls -alth %s', netcdf.fp))
#  system(sprintf('ncdump -h %s', netcdf.fp))
#  #   end debug

  # create strings for use in output below (far below)
  # double-sprintf-ing to set precision by constant: cool or brittle?
  stat.str.template <- sprintf('%%.%ig', stats.precision)
  # use these in function=subtitle.stats as sprintf inputs
  max.str.template <- sprintf('max=%s', stat.str.template)
  mea.str.template <- sprintf('mean=%s', stat.str.template)
  med.str.template <- sprintf('med=%s', stat.str.template)  # median==2nd quartile
  min.str.template <- sprintf('min=%s', stat.str.template)
  q1.str.template <- sprintf('q1=%s', stat.str.template)    # 1st quartile
  q3.str.template <- sprintf('q3=%s', stat.str.template)    # 3rd quartile

  # needed to parse netCDF
  library(ncdf4)

  # open netCDF file, uncautiously
  # NOTE: you must assign when you nc_open!
  netcdf.file <- ncdf4::nc_open(
    filename=netcdf.fp,
    write=FALSE,    # will only read below
    readunlim=TRUE) # it's a small file

  # uncautiously get the data out of the datavar
  data.var.data <- ncvar_get(
    nc=netcdf.file,
    varid=data.var.name)

#  # start debug
#  cat(sprintf('%s: data.var.name==%s has size==\n',
#    this.fn, data.var.name))
#  print(dim(data.var.data))
#  #   end debug

  if (is.numeric(data.var.data) && sum(!is.na(data.var.data))) {
#    unsparse.data <- data.var.data[!is.na(data.var.data)]
    # collapse its structure
    unsparse.data <- c(data.var.data[!is.na(data.var.data)])
    obs <- length(unsparse.data)
    if (obs > 0) {
      cells.str <- sprintf('cells=%i', length(data.var.data))
      obs.str <- sprintf('obs=%i', obs)
      min.str <- sprintf(min.str.template, min(unsparse.data))
      q1.str <- sprintf(q1.str.template, q1(unsparse.data))
      mea.str <- sprintf(mea.str.template, mean(unsparse.data))
      med.str <- sprintf(med.str.template, median(unsparse.data))
      q3.str <- sprintf(q3.str.template, q3(unsparse.data))
      max.str <- sprintf(max.str.template, max(unsparse.data))

      # at last: output!
      cat(sprintf('For %s var=%s\n', netcdf.fp, data.var.name))
      cat(sprintf('\t%s\n', cells.str))
      cat(sprintf('\t%s\n', obs.str))
      # 6-number summary
      cat(sprintf('\t%s\n', min.str))
      cat(sprintf('\t%s\n', q1.str))
      cat(sprintf('\t%s\n', med.str))
      cat(sprintf('\t%s\n', mea.str))
      cat(sprintf('\t%s\n', q3.str))
      cat(sprintf('\t%s\n', max.str))

    } else {
      cat(sprintf('%s: %s var=%s has no non-NA data',
        this.fn, data.var.name, netcdf.fp))
    }
  } else {
    cat(sprintf('%s: %s var=%s has no numeric non-NA data',
      this.fn, data.var.name, netcdf.fp))
  }

  # teardown
  nc_close(netcdf.file)

} # end function netCDF.stats.to.stdout

netCDF.stats.to.stdout.by.timestep <- function(
  netcdf.fp, # /path/to/netcdf/file, can be relative or FQ
  data.var.name,   # name of data variable (datavar) of interest
  time.dim.name='time', # of time dimension in datavar args
  stats.precision=3 # sigdigs to use for min, median, max of obs
) {

  # TODO: test arguments!

#  # start debug
#  cat(sprintf(
#    '%s: netcdf.fp==%s, data.var.name==%s, time.dim.name=%s\n',
#    this.fn, netcdf.fp, data.var.name, time.dim.name))
#  system(sprintf('ls -alth %s', netcdf.fp))
#  system(sprintf('ncdump -h %s', netcdf.fp))
#  #   end debug

  # create strings for use in output below (far below)
  # double-sprintf-ing to set precision by constant: cool or brittle?
  stat.str.template <- sprintf('%%.%ig', stats.precision)
  # use these in function=subtitle.stats as sprintf inputs
  max.str.template <- sprintf('max=%s', stat.str.template)
  mea.str.template <- sprintf('mean=%s', stat.str.template)
  med.str.template <- sprintf('med=%s', stat.str.template)  # median==2nd quartile
  min.str.template <- sprintf('min=%s', stat.str.template)
  q1.str.template <- sprintf('q1=%s', stat.str.template)    # 1st quartile
  q3.str.template <- sprintf('q3=%s', stat.str.template)    # 3rd quartile

  # needed to parse netCDF
  library(ncdf4)

  # open netCDF file, uncautiously
  # NOTE: you must assign when you nc_open!
  netcdf.file <- ncdf4::nc_open(
    filename=netcdf.fp,
    write=FALSE,    # will only read below
    readunlim=TRUE) # it's a small file? TODO: read by timestep!

  # Find the index of the datavar of interest
  # (in the list of all non-coordinate variables).
  netcdf.file.vars.n <- netcdf.file$nvars
  data.var.index <- -1 # an invalid index
  for (i in 1:netcdf.file.vars.n) {
    netcdf.file.var <- netcdf.file$var[[i]]
    if (netcdf.file.var$name == data.var.name) {
      data.var.index <- i
    }
  }
  if (data.var.index == -1) {
    stop(sprintf('%s: ERROR: failed to find data.var.index\n', this.fn))
  }

#  # start debug
#  cat(sprintf('%s: data.var.name==%s has var posn=%i and size==\n',
#    this.fn, data.var.name, data.var.index))
#  print(dim(data.var.data))
#  #   end debug

  # can only compute n.timesteps once we have the datavar's dims ...
  data.var <- netcdf.file$var[[data.var.index]]
  data.var.dims <- data.var$size
  data.var.dims.n <- data.var$ndims
  # ... and particularly the position of dimension$name==time.dim.name
  time.dim.index <- -1 # an invalid index
  for (i in 1:data.var.dims.n) {
    data.var.dim <- data.var$dim[[i]]
    if (data.var.dim$name == time.dim.name) {
      time.dim.index <- i
    }
  }
  if (time.dim.index == -1) {
    stop(sprintf('%s: ERROR: failed to find time.dim.index\n', this.fn))
  } # else
  n.timesteps <- data.var.dims[time.dim.index]

#  # start debug
#  cat(sprintf('%s: time.dim.name==%s has dim posn=%i and size=%i\n',
#    this.fn, time.dim.name, time.dim.index, n.timesteps))
#  #   end debug

# TODO: ncdf4 bug?
#  # compute read vectors for use in `ncvar_get`: read all timestep values
#  vec.start.template <- rep(1, data.var.dims.n)   # ASSERT: constant
##  vec.count.template <- rep(-1, data.var.dims.n)  # ASSERT: constant
#  # see bug below, try instead
#  vec.count.template <- data.var.dims             # ASSERT: constant

  # iterate over the timesteps
  for (i in 1:n.timesteps) {

#    vec.start <- vec.start.template
#    vec.start[time.dim.index] <- i # only get the i'th timestep
#    vec.count <- vec.count.template
#    vec.count[time.dim.index] <- i # only get the i'th timestep
#
#    # start debug
#    cat(sprintf('%s: vec.start==\n', this.fn))
#    print(vec.start)
#    cat(sprintf('%s: vec.count==\n', this.fn))
#    print(vec.count)
#    #   end debug

    data.var.data <- ncdf4::ncvar_get(
      nc=netcdf.file,
      varid=data.var.name
# TODO: {debug, bug report} why this fails
#      varid=data.var.name,
#      start=vec.start,
#      count=vec.count
    )
# each loop call increases time dimension! e.g.,
# > netCDF.stats.to.stdout.r: dim(data.var.data)==
# > [1] 144  96
# ...
# > netCDF.stats.to.stdout.r: dim(data.var.data)==
# > [1] 144  96   2
# ...
# > netCDF.stats.to.stdout.r: dim(data.var.data)==
# > [1] 144  96   3
    # workaround: read unlimited (above), then just take the slice of interest
    data.var.timestep <- data.var.data[,,i]
                                  
#    # start debug
#    cat(sprintf('%s: dim(data.var.timestep)==\n', this.fn))
#    print(dim(data.var.timestep))
#    cat(sprintf('%s: summary(data.var.timestep)==\n', this.fn))
#    print(summary(c(data.var.timestep))) # collapse its structure
#    #   end debug

    if (is.numeric(data.var.timestep) && sum(!is.na(data.var.timestep))) {
#      unsparse.data <- data.var.timestep[!is.na(data.var.timestep)]
      # collapse its structure
      unsparse.data <- c(data.var.timestep[!is.na(data.var.timestep)])
      obs <- length(unsparse.data)
      if (obs > 0) {
        cells.str <- sprintf('cells=%i', length(data.var.timestep))
        obs.str <- sprintf('obs=%i', obs)
        min.str <- sprintf(min.str.template, min(unsparse.data))
        q1.str <- sprintf(q1.str.template, q1(unsparse.data))
        mea.str <- sprintf(mea.str.template, mean(unsparse.data))
        med.str <- sprintf(med.str.template, median(unsparse.data))
        q3.str <- sprintf(q3.str.template, q3(unsparse.data))
        max.str <- sprintf(max.str.template, max(unsparse.data))

        # at last: output!
        cat(sprintf('For %s data.var=%s, timestep=%i of %i\n',
          netcdf.fp, data.var.name, i, n.timesteps))
        cat(sprintf('\t%s\n', cells.str))
        cat(sprintf('\t%s\n', obs.str))
        # 6-number summary
        cat(sprintf('\t%s\n', min.str))
        cat(sprintf('\t%s\n', q1.str))
        cat(sprintf('\t%s\n', med.str))
        cat(sprintf('\t%s\n', mea.str))
        cat(sprintf('\t%s\n', q3.str))
        cat(sprintf('\t%s\n', max.str))

#        rm( # debugging why output is same for each timestep
#          cells.str,
#          obs.str,
#          min.str,
#          q1.str,
#          med.str,
#          mea.str,
#          q3.str,
#          max.str)

      } else {
        cat(sprintf('%s: %s data.var=%s has no non-NA data',
          this.fn, data.var.name, netcdf.fp))
      }
    } else {
      cat(sprintf('%s: %s data.var=%s has no numeric non-NA data',
        this.fn, data.var.name, netcdf.fp))
    }

    rm( # debugging why output is same for each timestep
#      data.var.data,
      data.var.timestep,
      unsparse.data)

  } # end for (i in 1:n.timesteps)

  # teardown
  ncdf4::nc_close(netcdf.file)
  rm(this.fn) # hopefully prevents overwriting `this.fn` in prior namespaces
} # end function netCDF.stats.to.stdout.by.timestep

# ----------------------------------------------------------------------
# code
# ----------------------------------------------------------------------

# if this is called as a script, provide a main(): see
# https://stat.ethz.ch/pipermail/r-help/2012-September/323551.html
# https://stat.ethz.ch/pipermail/r-help/2012-September/323559.html
if (!interactive()) {

# start debug
#  cat(sprintf('%s: interactive()==TRUE\n', this.fn))
#   end debug
  
  # TODO: fix `strsplit` regexp below to make this unnecessary
  library(stringr)

  # pass named arguments: var above separated by '='

  args <- commandArgs(TRUE)
  # args is now a list of character vectors
  # First check to see if any arguments were passed, then evaluate each argument:
  # assign val (RHS) to key (LHS) for arguments of the (required) form 'key=val'
  if (length(args)==0) {
    cat(sprintf('%s: no arguments supplied, exiting\n', this.fn))
#    q(status=1) # KLUDGE:
# Currently this is not seeing arguments when called from Rscript,
# so this exit also exits the caller :-(    
  } else {
  # simple positional args work
  # TODO: also support positional usage
  #  netcdf.fp <- args[1]
  #  var.name <- args[2]
    # TODO: test arg length: 2 is required!

# start debug
#    cat(sprintf('%s: got length(args)==%i\n', this.fn, length(args)))
#   end debug

    for (i in 1:length(args)) {
#       eval(parse(text=args[[i]]))
      # `eval(parse())` is unsafe and requires awkward quoting:
      # e.g., of the following (bash) commandlines

      # - Rscript ./netCDF.stats.to.stdout.r netcdf.fp="GEIA_N2O_oceanic.nc" var.name="emi_n2o"
      #   fails

      # + Rscript ./netCDF.stats.to.stdout.r 'netcdf.fp="GEIA_N2O_oceanic.nc"' 'var.name="emi_n2o"'
      #   succeeds

      # so instead
      # TODO: use package `optparse` or `getopt`
      args.keyval.list <-
        strsplit(as.character(parse(text=args[[i]])),
          split='[[:blank:]]*<-|=[[:blank:]]*', fixed=FALSE)
  #                            split='[ \t]*<-|=[ \t]*', fixed=FALSE)
      args.keyval.vec <- unlist(args.keyval.list, recursive=FALSE, use.names=FALSE)
      # TODO: test vector elements!
      # Neither wants to remove all whitespace from around arguments :-( so
      args.key <- str_trim(args.keyval.vec[1], side="both")
      args.val <- str_trim(args.keyval.vec[2], side="both")

# start debug
#       cat(sprintf('%s: got\n', this.fn))
#       cat('\targs.keyval.list==\n')
#       print(args.keyval.list)
#       cat('\targs.keyval.vec==\n')
#       print(args.keyval.vec)
#       cat(sprintf('\targs.key==%s\n', args.key))
#       cat(sprintf('\targs.val==%s\n', args.val))
#   end debug

      # A real case statement would be nice to have
      if        (args.key == 'netcdf.fp') {
        netcdf.fp <- args.val
      } else if (args.key == 'var.name') {
        var.name <- args.val
      } else {
        stop(sprintf("unknown argument='%s'", args.key))
        # TODO: show usage
        q(status=1) # exit with error
      }
    } # end for loop over arguments

    # payload!
    netCDF.stats.to.stdout(netcdf.fp, var.name)

  } # end if testing number of arguments
} # end if (!interactive())