Commits

stevejb committed 83f33b1

Adding the FF iss

  • Participants
  • Parent commits 12e5078

Comments (0)

Files changed (1)

File data/iss_calc_moments_by_FF_Industry.R

+## Stephen J. Barr
+##
+## calculating the moments
+
+
+## library(data.table)
+## load("dt80.14g.Robj")
+## dt = dt80.14g
+## icols = c("PPEGT", "CHE", "AT", "CAPXV", "SPPE", "IB",
+##   "DP", "SSTK", "PRSTKC", "DLTT", "DLC", "CHE", "OIBDP")
+library(data.table)
+library(zoo)
+## datapath = '/home/stevejb/myhg/is-solver/data/f3b4f643073a3050.txt'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/95a38ae0ef448323.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/67957f88fe114272.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/83baebcc8faf28cd.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/c046030d7f1cc0a1.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/4403cf840b6e3d9d.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/930752a42a8f42d5.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/86a8e3999f92db68.csv'
+## datapath.csv = '/home/stevejb/myhg/is-solver/data/8f2d0a8f1ce37c54.csv'
+datapath.csv = '/home/stevejb/myhg/is-solver/data/8ebd149d0451278c.csv'
+data = read.csv(datapath.csv, header=TRUE)
+
+
+
+##################################################
+## trying the stupid gvkey experiment
+exec_gvs = read.csv("execucomp/exec_gvkeys.csv")
+colnames(exec_gvs) = c("gvidx", "gvkey")
+USE_EXECUCOMP = FALSE
+if(USE_EXECUCOMP) {
+  data.gvs = merge(data, exec_gvs, by="gvkey")
+  data = data.gvs
+}
+
+##
+########################################
+
+
+
+data = transform(data, sic2digit = floor(SIC/100) * 100)
+data = transform(data, sic1digit = floor(SIC/1000) * 1000)
+idxs = 1:nrow(data)
+data = cbind(data, idxs)
+datacols = colnames(data)
+for(i in 1:length(colnames(data))) {
+  dctmp = datacols[i]
+  nasum = print(paste(dctmp, sum(is.na(data[,i]))))
+}
+PSTKfull = data$PSTK
+PSTKna = is.na(data$PSTK)
+PSTKRVna = is.na(data$PSTKRV)
+PSTKLna = is.na(data$PSTKL)
+repl1 = which(PSTKna & !(PSTKRVna))
+PSTKfull[repl1] = data$PSTKRV[repl1]
+repl2 = which(is.na(PSTKfull) & (!(PSTKLna)))
+PSTKfull[repl2] = data$PSTKL[repl2]
+data$PSTKfull = PSTKfull
+
+data.orig = data
+
+
+writeRawCSV = function(x, fnameInput) {
+  write.table(x, row.names=FALSE, col.names=FALSE, sep=",", file=fnameInput)
+}
+
+## We believe this is now accurate
+filterfirms <- function(data) {
+##     - Delete where total assets, gross capital stock, or sales <= 0
+##     - Delete where total assets < 2MM
+##     - Delete where gross capital stocks < 1MM
+
+  ## ## Drop firms outside of DDW date range - 1988 to 2001 
+  data <- subset(data, fyear >= 1988 & fyear <= 2001)
+
+  ## Delete where total assets, gross capital stock, or sales <= 0
+  data = subset(data, AT > 0)      ## total assets
+  data = subset(data, PPEGT > 0)   ## gross capital stock
+  data = subset(data, SSTK >= 0)    ## sales
+
+
+  ## Delete where total assets < 2MM
+  data = subset(data, AT >= 100)
+
+  ## Delete where gross capital stocks < 1MM
+  data = subset(data, PPEGT >= 100)
+
+  ## drop regulated and financial firms
+  data <- subset(data, SIC < 4900 | SIC > 4999)
+  data <- subset(data, SIC < 6000 | SIC > 6999)  
+  return(data)
+}
+
+
+filterfirmsNoSize <- function(data) {
+##     - Delete where total assets, gross capital stock, or sales <= 0
+##     - Delete where total assets < 2MM
+##     - Delete where gross capital stocks < 1MM
+
+  ## ## Drop firms outside of DDW date range - 1988 to 2001 
+  data <- subset(data, fyear >= 1988 & fyear <= 2001)
+
+  ## Delete where total assets, gross capital stock, or sales <= 0
+  data = subset(data, AT > 0)      ## total assets
+  data = subset(data, PPEGT > 0)   ## gross capital stock
+  data = subset(data, SSTK >= 0)    ## sales
+
+  ## drop regulated and financial firms
+  data <- subset(data, SIC < 4900 | SIC > 4999)
+  data <- subset(data, SIC < 6000 | SIC > 6999)  
+  return(data)
+}
+
+
+
+
+########################################
+## Windsorize the numerical ones
+
+w1 <- function (x, fraction=.02)
+{
+   if(length(fraction) != 1 || fraction < 0 ||
+         fraction > 0.5) {
+      stop("bad value for 'fraction'")
+   }
+   lim <- quantile(x, probs=c(fraction, 1-fraction))
+   x[ x < lim[1] ] <- lim[1]
+   x[ x > lim[2] ] <- lim[2]
+   return(x)
+}
+
+
+windsor_demo <- function() {
+  x = c(rnorm(1000), rnorm(5, mean = 100), rnorm(4, mean=-150))
+  x.windsor = w1(x)
+  par(mfrow=c(2,1))
+  plot(density(x))
+  plot(density(x.windsor))
+}
+
+
+windsor_multicols <-  function(frac, data, numericalCols) {
+  
+  for(n in 1:length(numericalCols)) {
+    nctmp = numericalCols[n]
+    datatmp = w1(data[,nctmp], fraction =  frac)
+    data[,nctmp] = datatmp
+  }
+  return(data)
+}
+
+
+## mvt_trim - a function to jointly trim multiple columns,
+## specified by column name, to a data frame
+mvt_trim = function(mydf, trimlevel, trimcolnames) {
+  
+  idxs = vector()
+
+  for(coltmp in trimcolnames) {
+    print(paste("trimming column", coltmp))
+    bounds = quantile(mydf[,coltmp], probs=c(trimlevel, (1-trimlevel)))
+    idxs = union(idxs, which(mydf[,coltmp] < bounds[1]))
+    idxs = union(idxs, which(mydf[,coltmp] > bounds[2])) 
+    print(paste("Nlower:", sum(mydf[,coltmp] < bounds[1])))
+    print(paste("Nlower:", sum(mydf[,coltmp] > bounds[2])))
+    
+  }
+
+  mydf.trimmed = mydf[-idxs,]
+  return(mydf.trimmed)
+}
+
+trim = function(x, trimlevel = 0.02) {
+  bounds = quantile(x, probs=c(trimlevel, (1-trimlevel)))
+  y = x[ x > bounds[1] & x < bounds[2]]
+  return(y)
+}
+
+
+## DON'T USE
+## makequantities <- function(data) {
+##   data$k             = data$PPEGT
+##   data$cash          = data$CHE/data$k
+##   data$book.assets   = data$AT/data$k
+##   data$inv           = (data$CAPXV - data$SPPE)/data$k
+##   data$cf            = (data$IB + data$DP)/data$k
+##   data$equity        = (data$SSTK - data$PRSTKC)/data$k
+##   data$netdebt       = (data$DLTT + data$DLC - data$CHE)/data$k
+##   data$debt.unscaled = pmax( (data$DLTT + data$DLC - data$CHE), 0)
+##   data$leverage      = data$debt.unscaled / data$k
+##   data$leverage2      = data$debt.netdebt / data$k
+
+##   return(data)
+## }
+
+
+makequantitiesDDW <- function(data) {
+  data$k             = data$PPEGT
+  if("ACQ" %in% colnames(data)) {
+    data$inv = (data$CAPX + data$AQC) / data$PPEGT
+  }
+  data$invAltA = (data$CAPX - data$SPPE) / data$PPEGT
+  data$opinc = (data$OIBDP)/data$AT
+  data$leverage = (data$DLTT + data$DLC)/data$AT
+  data$equity = data$SSTK / data$AT
+  data$cf = data$CHE / data$AT
+  data$market2book = (data$DLTT + data$DLC + data$PRSTKC +
+                      (data$CSHO * data$prcc_f))/data$AT
+
+  data$bookequity = data$SEQ + data$TXDITC - data$PSTKfull
+  data$bookdebt = data$AT - data$bookequity
+  data$tobinsq = ( (data$CSHO * data$prcc_f ) + data$bookdebt - data$ACT)/data$PPEGT
+  return(data)
+}
+
+
+##     - Delete singletons
+removeSingleObs <- function(data) {
+  dt = data.table(data)
+  setkey(dt, "cusip")
+  sublengths <- dt[,list(tslen=length(fyear)),by=cusip, mult="last"]
+  setkey(sublengths, "cusip")
+  dt <- dt[sublengths]
+  dt.nosingles = subset(dt, tslen > 1)
+  return(data.frame(dt.nosingles))
+}
+
+
+
+##     - Filter out ones where
+## assets != liabilities
+filterACC <- function(data) {
+  d1 = abs(data$AT - (data$LT + data$SEQ))
+  keepidxs = which(d1 < 0.05)
+  return(data[keepidxs,])
+}
+
+
+
+
+##     - Find largest consecutive set
+largestConsecutive <- function(data) {
+  cusips = unique(data$cusip)
+  cusipslist = as.list(cusips)
+  
+}
+
+
+
+ddwmoments <- function(data) {
+  print(paste("Var inv", var(data$invAltA)))
+  print(paste("Var lev", var(data$leverage)))
+  print(paste("Mean lev", mean(data$leverage)))
+  print(paste("Mean equity", mean(data$equity)))
+  print(paste("Tobin's Q - FIXED", 3.6707))
+  
+}
+
+
+
+
+
+########################################
+## some nice variables to have defined
+
+keepcols = c("gvkey"   , "datadate" ,"fyear"    ,  "indfmt",   "consol" ,  "popsrc"   ,
+"datafmt" , "cusip"    ,"CURCD"    ,  "ACT"   ,   "AT"     ,  "CAPX"     ,
+ "CHE"    ,  "CSHO"    , "DLC"     ,  "DLTT"  ,   "LT"     ,  "OIBDP"    ,
+ "PPEGT"  ,  "PRSTKC"  ,   "SEQ"   ,  
+ "SPPE"   ,  "SSTK"    , "TXDITC"  ,  "prcc_f" , 
+ "SIC"    ,  "idxs"    , "PSTKfull", "sic2digit", "sic1digit")
+
+data = data[,keepcols]          ## about 171K rows
+data = na.omit(data)            ## about 70K rows
+
+if(USE_EXECUCOMP) {
+  data = filterfirmsNoSize(data)
+} else {
+  data = filterfirms(data)       ## about 17K rows
+}
+data = removeSingleObs(data)    ## about 11K rows
+data = filterACC(data)          ## about 8K rows
+
+data = data.frame(data)
+data.rawq = makequantitiesDDW(data)
+
+
+library(moments)
+###  investment
+invA = data.rawq$invAltA
+mean(w1(invA, fraction=0.01))
+var(w1(invA, fraction=0.01))
+moment(w1(invA, fraction=0.01), order=3, central=TRUE)
+
+### operating income
+opinc = w1(data.rawq$opinc, fraction=0.01)
+opinc2 = w1(data$EBITDA/data$AT, fraction=0.01)
+opinc3 = w1(data$OIBDP/data$AT, fraction=0.01)
+mean(opinc2)
+var(opinc2)
+mean(opinc3)
+var(opinc3)
+
+### leverage
+lev = data.rawq$leverage
+mean(w1(lev, fraction=0.01))
+var(w1(lev, fraction=0.01))
+
+
+### equity issuance
+equity = data.rawq$equity
+#equity = equity[equity < 0.5]
+mean(w1(equity, fraction=0.01))
+var(w1(equity, fraction=0.01))
+
+### cash balances
+cash = data.rawq$cf
+mean(w1(cash, fraction=0.01))
+
+
+### for the serial correlation and lag stuff
+
+
+
+
+########################################
+## WHOLESALE WINDSORIZING
+
+pcts = c(0.99, 0.97, 0.95)
+fracs = as.list(1 - pcts)
+
+
+## columns to Windsorize
+numericalCols = c(
+"ACT",  "AT"    , "CAPX" , 
+"CHE"   , "CSHO",  "DLC"   , "DLTT" ,
+"LT", "OIBDP" , "PPEGT" , "PRSTKC",  "SSTK" ,
+"SEQ"   , "SPPE"  , "CSHO",
+  "prcc_f", "TXDITC", "PSTKfull"
+  )
+
+
+
+## Windsor only
+data.w1 = windsor_multicols(0.01, data, numericalCols)
+data.q = makequantitiesDDW(data.w1)
+
+## Trim plus windsor
+data.tr = mvt_trim(data, 0.01, numericalCols)
+data.tr = windsor_multicols(0.01, data.tr, numericalCols)
+data.tr.q = makequantitiesDDW(data.tr)
+
+
+
+
+
+
+
+findyearseries <- function(year) {
+
+  x = rle(diff(year))
+  lcum = cumsum(x$lengths)
+  z = cbind(x$lengths, x$values, lcum)
+  colnames(z) = c("lengths", "values", "lcum")
+  z = data.frame(z)
+
+  zsub = subset(z, values==1)
+  maxrow = which.max(zsub$lengths)
+  maxzrow = as.numeric(rownames(zsub[maxrow,]))
+
+  if(length(maxzrow) == 0) {
+    return(NULL)
+  }
+
+  maxidx = 1
+  yearendidx = 1
+  if(maxzrow > 1) {
+    maxidx = z[(maxzrow-1), ]$lcum + 1
+  }
+
+  yearendidx = z[maxzrow,]$lcum
+  yearstart = year[maxidx]
+  yearend = year[yearendidx+1]
+  slen = yearend - yearstart + 1
+  print(paste(yearstart, "to", yearend,
+              ", spanning", slen))
+
+  yobj = list(yearmin = yearstart,
+    yearmax = yearend,
+    slen = slen)
+  return(yobj)
+}
+
+## year
+year1 = c(1,2,3,4,5,6,7,10,11,15,16,17,18) + 1990
+year2 = c(1,2,5,6,7,8,9,13,14,17) + 2000
+year3 = c(1,2,3,15,16,17,21,23,27:40) + 1980
+year4 = c(1,2,3, 5:10, 15:20) + 1970
+
+
+
+cusiplist = as.list(unique(data.rawq$cusip))
+
+subsetByCusip <- function(ccode, data) {
+  ccode = as.character(ccode)
+##  print("Initial subset")
+  dtmp = subset(data, cusip == ccode)
+  if(nrow(dtmp) < 4) {
+    return(NULL)
+  }
+
+  ## print("Find year series")
+  yout = findyearseries(unique(sort(dtmp$fyear)))
+  if(is.null(yout)) {
+    return(NULL)
+  }
+  yearmin = yout$yearmin
+  yearmax = yout$yearmax
+
+  if(yout$slen < 4) {
+    return(NULL)
+  }
+
+  ## print("Enough data to return")
+  dshort = subset(dtmp, fyear >= yearmin & fyear <= yearmax)
+  dshort = dshort[order(dshort$fyear),]
+  return(dshort)
+  
+}
+
+indiv4 = lapply(cusiplist, subsetByCusip, data.q)
+library(multicore)
+indiv4.mc = mclapply(cusiplist, subsetByCusip, data.q)
+all.equal(indiv4, indiv4.mc)
+indiv4 = indiv4.mc
+
+i4 = list()
+icount = 1
+for(i in 1:length(indiv4)) {
+  if(!(is.null(indiv4[[i]]))) {
+    i4[[icount]] = indiv4[[i]]
+    icount = icount + 1
+  }
+}
+
+trows = sum(unlist(lapply(i4, nrow)))
+
+
+
+
+
+makeAllMoments <- function(data, fname = NULL) {
+  invA = data$invAltA
+  mean_inv = mean(invA)
+  var_inv =  var(invA)
+  inv_3rd_central = moment(invA, order=3, central=TRUE)
+  inv_skewness = moment(invA, order=3, central=FALSE)
+  inv_kurtosis = moment(invA, order=4, central=FALSE)
+
+  ## operating income
+  opinc = data$opinc
+  mean_opr_income = mean(opinc)
+  var_opr_income = var(opinc)
+  opr_inc_3rd_central = moment(opinc, order=3, central=TRUE)
+  opr_inc_skewness = moment(opinc, order=3, central=FALSE)
+  opr_inc_kurtosis = moment(opinc, order=4, central=FALSE)
+
+  ## leverage
+  lev = data$leverage
+  mean_lev = mean(lev)
+  var_lev = var(lev)
+  lev_3rd_central = moment(lev, order=3, central=TRUE)
+  lev_skewness =  moment(lev, order=3, central=FALSE)
+  lev_kurtosis =  moment(lev, order=4, central=FALSE)
+
+  ## equity issuance
+  equity = data$equity
+  ##equity = equity[equity < 0.5]
+  mean_equity_issuance = mean(equity)
+  var_equity_issuance = var(equity)
+  eq_issuance_3rd_central = moment(equity,  order=3, central=TRUE)
+  eq_issuance_skewness =  moment(equity,  order=3, central=FALSE)
+  eq_issuance_kurtosis =  moment(equity,  order=4, central=FALSE)
+
+  ## cash balances
+  cash = data$cf
+  mean_cash_bal = mean(cash)
+  var_cash_bal = var(cash)
+  cash_bal_3rd_central = moment(cash, order=3, central=TRUE)
+  cash_bal_skewness = moment(cash, order=3, central=FALSE)
+  cash_bal_kurtosis = moment(cash, order=4, central=FALSE)
+
+
+  ########################################
+
+  cusiplist = as.list(unique(data$cusip))
+  indiv4 = lapply(cusiplist, subsetByCusip, data)
+  ## library(multicore)
+  ## indiv4.mc = mclapply(cusiplist, subsetByCusip, data.q)
+  ## all.equal(indiv4, indiv4.mc)
+  
+  i4 = list()
+  icount = 1
+  for(i in 1:length(indiv4)) {
+    if(!(is.null(indiv4[[i]]))) {
+      i4[[icount]] = indiv4[[i]]
+      icount = icount + 1
+    }
+  }
+
+  trows = sum(unlist(lapply(i4, nrow)))
+
+  ## make lags of operating income wrt profit,
+  ## make lags of op income vs leverage
+
+  lag1list = list()
+  lag1count = 1
+  lag2list = list()
+  lag2count = 1
+  lag3list = list()
+  lag3count = 1
+
+  ## at this point I can assumpe that the rows are sorted by
+  ## year ascending
+  for(i in 1:length(i4)) {
+    dtmp = i4[[i]]
+    
+    for(t in 2:nrow(dtmp)) {
+      ltmp = data.frame(
+        opincL1 = dtmp[t-1,]$opinc,
+        opinc = dtmp[t,]$opinc,
+        levL1 = dtmp[t-1,]$leverage,
+        lev = dtmp[t,]$leverage,  
+        invAltAL1 = dtmp[t-1,]$invAltA,
+        invAltA = dtmp[t,]$invAltA,
+        ppegt = dtmp[t,]$PPEGT,
+        ppegtL1 = dtmp[t-1,]$PPEGT,
+        mktbk = dtmp[t,]$market2book,
+        mktbkL1 = dtmp[t-1,]$market2book,
+        cashflow = dtmp[t,]$cf
+        )
+      lag1list[[lag1count]] = ltmp
+      lag1count = lag1count + 1
+    }
+
+    for(t in 3:nrow(dtmp)) {
+      ltmp = data.frame(
+        opincL2 = dtmp[t-2,]$opinc,
+        opinc = dtmp[t,]$opinc,
+        levL2 = dtmp[t-2,]$leverage,
+        lev = dtmp[t,]$leverage,  
+        invAltAL2 = dtmp[t-2,]$invAltA,
+        invAltA = dtmp[t,]$invAltA,
+        ppegt = dtmp[t,]$PPEGT,
+        ppegtL2 = dtmp[t-2,]$PPEGT
+        )
+      lag2list[[lag2count]] = ltmp
+      lag2count = lag2count + 1
+    }
+
+    for(t in 4:nrow(dtmp)) {
+      ltmp = data.frame(
+        opincL3 = dtmp[t-3,]$opinc,
+        opinc = dtmp[t,]$opinc,
+        levL3 = dtmp[t-3,]$leverage,
+        lev = dtmp[t,]$leverage,  
+        invAltAL3 = dtmp[t-3,]$invAltA,
+        invAltA = dtmp[t,]$invAltA,
+        ppegt = dtmp[t,]$PPEGT,
+        ppegtL3 = dtmp[t-3,]$PPEGT
+        )
+      lag3list[[lag3count]] = ltmp
+      lag3count = lag3count + 1
+    }
+
+  }
+
+
+
+  l3df = do.call(rbind, lag3list)
+  l2df = do.call(rbind, lag2list)
+  l1df = do.call(rbind, lag1list)
+
+########################################
+
+  ## innovation to income
+  opinc.autocor = lm(opinc ~ opincL1, data=l1df)
+  ser_cor = coef(opinc.autocor)[2]
+  innov = residuals(opinc.autocor)
+  var_opr_income_innov = var(innov)
+  innov_inc_3rd_central = moment(innov, order=3, central=TRUE)
+  innov_inc_skewness = moment(innov, order=3, central=FALSE)
+  innov_inc_kurtosis = moment(innov, order=4, central=FALSE)
+  
+
+
+
+  ## investment auto covariances
+  cov_inv_auto1 = cov(l1df$invAltA, l1df$invAltAL1)
+  cov_inv_auto2 = cov(l2df$invAltA, l2df$invAltAL2)
+  cov_inv_auto3 = cov(l3df$invAltA, l3df$invAltAL3)
+
+  ## operating income auto covarainces
+  cov_opinc_auto1 = cov(l1df$opinc, l1df$opincL1)
+  cov_opinc_auto2 = cov(l2df$opinc, l2df$opincL2)
+  cov_opinc_auto3 = cov(l3df$opinc, l3df$opincL3)
+
+  ## leverage auto covarainces
+  cov_lev_auto1 = cov(l1df$lev, l1df$levL1)
+  cov_lev_auto2 = cov(l2df$lev, l2df$levL2)
+  cov_lev_auto3 = cov(l3df$lev, l3df$levL3)
+
+  ## operating income auto covarainces
+  cov_lev_opincL1 = cov(l1df$lev, l1df$opincL1)
+  cov_lev_opincL2 = cov(l2df$lev, l2df$opincL2)
+  cov_lev_opincL3 = cov(l3df$lev, l3df$opincL3)
+
+
+  ## cov inv opinc
+  cov_inv_opinc = cov(data$leverage, data$opinc)
+
+  ## klag3
+  cov_kl3_opinc3 = cov( (l3df$ppegt/l3df$ppegtL3), l3df$opincL3)
+
+  ## Tobin's Q
+  mean_tobins_q = mean(data$tobinsq)
+  var_tobins_q = var(data$tobinsq)
+  tobinq_q_3rd_central = moment(data$tobinsq, order=3, central=TRUE)
+  tobinq_q_skewness = moment(data$tobinsq, order=3, central=FALSE)
+  tobinq_q_kurtosis = moment(data$tobinsq, order=4, central=FALSE)
+
+
+  ## SOA stuff
+  ## lev ~ lag(lev) + value/k + (p'-p)/k
+  ## 
+  beta_soa = coef(lm(lev ~ levL1 + mktbk + cashflow, data=l1df))
+  soa_beta_icept = beta_soa[1]
+  soa_beta_lev1 = beta_soa[2]
+  soa_beta_val = beta_soa[3]
+  soa_beta_cf = beta_soa[4]
+  
+
+  mf2 = data.frame(mean_inv,
+    var_inv,
+    mean_lev,
+    var_lev,
+    mean_equity_issuance,
+    var_equity_issuance,
+    mean_tobins_q,
+    var_tobins_q,
+    mean_cash_bal,
+    var_cash_bal,
+    mean_opr_income,
+    var_opr_income,
+    ser_cor,                 ## ser cor
+    var_opr_income_innov,    ## var opr income innov
+    inv_3rd_central,
+    inv_skewness,
+    inv_kurtosis,
+    lev_3rd_central,
+    lev_skewness,
+    lev_kurtosis,
+    innov_inc_3rd_central,
+    innov_inc_skewness,
+    innov_inc_kurtosis,
+    opr_inc_3rd_central,
+    opr_inc_skewness,
+    opr_inc_kurtosis,
+    cash_bal_3rd_central,
+    cash_bal_skewness,
+    cash_bal_kurtosis,
+    tobinq_q_3rd_central,
+    tobinq_q_skewness,
+    tobinq_q_kurtosis,
+    eq_issuance_3rd_central,
+    eq_issuance_skewness,
+    eq_issuance_kurtosis,
+    cov_inv_auto1,
+    cov_inv_auto2,
+    cov_inv_auto3,
+    cov_opinc_auto1,
+    cov_opinc_auto2,
+    cov_opinc_auto3,
+    cov_inv_opinc,
+    cov_kl3_opinc3,
+    cov_lev_auto1,
+    cov_lev_auto2,
+    cov_lev_auto3,
+    cov_lev_opincL1,
+    cov_lev_opincL2,
+    cov_lev_opincL3,
+    soa_beta_icept,
+    soa_beta_lev1,
+    soa_beta_val,
+    soa_beta_cf
+    )
+
+  outmat  = t(mf2)
+  colnames(outmat) = "moment"
+  if(is.null(fname)) {
+    fname = "data_moments_v1.csv"
+  }
+  write.csv(outmat, file=fname)
+}
+
+
+IF.mean = function(datavec) {
+  IF = -1 * datavec + mean(datavec)
+  return(IF)
+}
+
+IF.variance = function(datavec) {
+  IF.var = (datavec - mean(datavec))^2 - var(datavec)
+  return(IF.var)
+}
+
+IF.covariance = function(dataX, dataY) {
+  IF.cov = ( (dataX - mean(dataX)) * (dataY - mean(dataY))) - cov(dataX,dataY)
+  return(IF.cov)
+}
+
+IF.3rdcentral = function(datavec) {
+  IF.3c = (datavec - mean(datavec))^3 - moment(datavec, order=3, central=TRUE)
+  return(IF.3c)
+}
+
+
+makeSubsetLags <- function(dtmp) {
+
+  dflatz = zoo(dtmp)  
+
+  ## lag operating incomes
+  dflatz$opincL1 = lag(dflatz$opinc, -1, na.pad=TRUE)
+  dflatz$opincL2 = lag(dflatz$opinc, -2, na.pad=TRUE)
+  dflatz$opincL3 = lag(dflatz$opinc, -3, na.pad=TRUE)
+
+  ## lag invAltA
+  dflatz$invAltAL1 = lag(dflatz$invAltA, -1, na.pad=TRUE)
+  dflatz$invAltAL2 = lag(dflatz$invAltA, -2, na.pad=TRUE)
+  dflatz$invAltAL3 = lag(dflatz$invAltA, -3, na.pad=TRUE)
+
+  ## lag leverage
+  dflatz$leverageL1 = lag(dflatz$leverage, -1, na.pad=TRUE)
+  dflatz$leverageL2 = lag(dflatz$leverage, -2, na.pad=TRUE)
+  dflatz$leverageL3 = lag(dflatz$leverage, -3, na.pad=TRUE)
+
+
+  ## lag ppegt
+  dflatz$ppegt = dflatz$PPEGT
+  dflatz$ppegtL1 = lag(dflatz$ppegt, -1, na.pad=TRUE)
+  dflatz$ppegtL2 = lag(dflatz$ppegt, -2, na.pad=TRUE)
+  dflatz$ppegtL3 = lag(dflatz$ppegt, -3, na.pad=TRUE)
+
+  dfz.nona = na.omit(as.data.frame(dflatz))
+  return(dfz.nona)
+}
+
+
+nconv = function(x) { as.numeric(as.character(x))}
+makeInfluenceFunctions <- function(data, fname = NULL) {
+  ########################################
+
+  cusiplist = as.list(unique(data$cusip))
+  indiv4 = lapply(cusiplist, subsetByCusip, data)
+  ## library(multicore)
+  ## indiv4.mc = mclapply(cusiplist, subsetByCusip, data.q)
+  ## all.equal(indiv4, indiv4.mc)
+  
+  i4 = list()
+  icount = 1
+  for(i in 1:length(indiv4)) {
+    if(!(is.null(indiv4[[i]]))) {
+      i4[[icount]] = indiv4[[i]]
+      icount = icount + 1
+    }
+  }
+
+  trows = sum(unlist(lapply(i4, nrow)))
+
+  ## dflat = do.call("rbind", i4)
+  subsetLags = lapply(i4, makeSubsetLags)
+  data_with_lags = do.call("rbind", subsetLags)
+  ##  subsetLags_mc = mclapply(i4, makeSubsetLags)
+  ## data_with_lags = do.call("rbind", subsetLags_mc)
+  
+
+
+
+  dwl = transform(data_with_lags,
+    opinc = nconv(opinc),
+    opincL1 = nconv(opincL1),
+    opincL2 = nconv(opincL2),
+    opincL3 = nconv(opincL3),
+    invAltA = nconv(invAltA),
+    invAltAL1 = nconv(invAltAL1),
+    invAltAL2 = nconv(invAltAL2),
+    invAltAL3 = nconv(invAltAL3),
+    leverage = nconv(leverage),
+    leverageL1 = nconv(leverageL1),
+    leverageL2 = nconv(leverageL2),
+    leverageL3 = nconv(leverageL3),
+    ppegt = nconv(ppegt),
+    ppegtL1 = nconv(ppegtL1),
+    ppegtL2 = nconv(ppegtL2),
+    ppegtL3 = nconv(ppegtL3),
+    mktbk = nconv(market2book),
+    cf = nconv(cf)
+    )
+
+  N_observations = nrow(dwl)
+
+  ## ######################################
+  invA = dwl$invAltA
+  mean_inv = mean(invA)
+  var_inv =  var(invA)
+  inv_3rd_central = moment(invA, order=3, central=TRUE)
+  inv_skewness = moment(invA, order=3, central=FALSE)
+  inv_kurtosis = moment(invA, order=4, central=FALSE)
+
+  ## operating income
+  opinc = dwl$opinc
+  mean_opr_income = mean(opinc)
+  var_opr_income = var(opinc)
+  opr_inc_3rd_central = moment(opinc, order=3, central=TRUE)
+  opr_inc_skewness = moment(opinc, order=3, central=FALSE)
+  opr_inc_kurtosis = moment(opinc, order=4, central=FALSE)
+
+  ## leverage
+  lev = dwl$leverage
+  mean_lev = mean(lev)
+  var_lev = var(lev)
+  lev_3rd_central = moment(lev, order=3, central=TRUE)
+  lev_skewness =  moment(lev, order=3, central=FALSE)
+  lev_kurtosis =  moment(lev, order=4, central=FALSE)
+
+  ## equity issuance
+  equity = nconv(dwl$equity)
+  ##equity = equity[equity < 0.5]
+  mean_equity_issuance = mean(equity)
+  var_equity_issuance = var(equity)
+  eq_issuance_3rd_central = moment(equity,  order=3, central=TRUE)
+  eq_issuance_skewness =  moment(equity,  order=3, central=FALSE)
+  eq_issuance_kurtosis =  moment(equity,  order=4, central=FALSE)
+
+  ## cash balances
+  cash = nconv(dwl$cf)
+  mean_cash_bal = mean(cash)
+  var_cash_bal = var(cash)
+  cash_bal_3rd_central = moment(cash, order=3, central=TRUE)
+  cash_bal_skewness = moment(cash, order=3, central=FALSE)
+  cash_bal_kurtosis = moment(cash, order=4, central=FALSE)
+  
+
+
+  opinc.autocor = lm(opinc ~ opincL1, data=dwl)
+  ser_cor = coef(opinc.autocor)[2]
+  innov = residuals(opinc.autocor)
+  var_opr_income_innov = var(innov)
+  innov_inc_3rd_central = moment(innov, order=3, central=TRUE)
+  innov_inc_skewness = moment(innov, order=3, central=FALSE)
+  innov_inc_kurtosis = moment(innov, order=4, central=FALSE)
+  
+
+  n = nrow(dwl)
+  X.ser_cor = cbind(1, dwl$opincL1)
+  r = residuals(opinc.autocor)
+  I = solve(t(X.ser_cor)%*%X.ser_cor/n)%*% t(X.ser_cor*r)
+  if.ser_cor = I[2,]
+
+  ## investment auto covariances
+  cov_inv_auto1 = cov(dwl$invAltA, dwl$invAltAL1)
+  cov_inv_auto2 = cov(dwl$invAltA, dwl$invAltAL2)
+  cov_inv_auto3 = cov(dwl$invAltA, dwl$invAltAL3)
+
+  ## operating income auto covarainces
+  cov_opinc_auto1 = cov(dwl$opinc, dwl$opincL1)
+  cov_opinc_auto2 = cov(dwl$opinc, dwl$opincL2)
+  cov_opinc_auto3 = cov(dwl$opinc, dwl$opincL3)
+
+  ## leverage auto covarainces
+  cov_lev_auto1 = cov(dwl$leverage, dwl$leverageL1)
+  cov_lev_auto2 = cov(dwl$leverage, dwl$leverageL2)
+  cov_lev_auto3 = cov(dwl$leverage, dwl$leverageL3)
+
+  ## operating income auto covarainces
+  cov_lev_opincL1 = cov(dwl$leverage, dwl$opincL1)
+  cov_lev_opincL2 = cov(dwl$leverage, dwl$opincL2)
+  cov_lev_opincL3 = cov(dwl$leverage, dwl$opincL3)
+
+
+  ## cov inv opinc
+  cov_inv_opinc = cov(dwl$invAltA, dwl$opinc)
+
+  ## klag3
+  cov_kl3_opinc3 = cov( log(dwl$ppegt/dwl$ppegtL3), dwl$opincL3)
+
+  ## Tobin's Q
+  data.tq = nconv(dwl$tobinsq)
+  mean_tobins_q = mean(data.tq)
+  var_tobins_q = var(data.tq)
+  tobinq_q_3rd_central = moment(data.tq, order=3, central=TRUE)
+  tobinq_q_skewness = moment(data.tq, order=3, central=FALSE)
+  tobinq_q_kurtosis = moment(data.tq, order=4, central=FALSE)
+
+
+  ## SOA stuff
+  ## lev ~ lag(lev) + value/k + (p'-p)/k
+  ## 
+  beta_soa_reg = lm(leverage ~ leverageL1 + mktbk + cf, data=dwl)
+  beta_soa = coef(beta_soa_reg)
+  soa_beta_icept = beta_soa[1]
+  soa_beta_lev1 = beta_soa[2]
+  soa_beta_val = beta_soa[3]
+  soa_beta_cf = beta_soa[4]
+  
+
+  X.soa_beta = cbind(1, dwl$leverageL1, dwl$mktbk, dwl$cf)
+  r = residuals(beta_soa_reg)
+  I = solve(t(X.soa_beta)%*%X.soa_beta/n)%*% t(X.soa_beta*r)
+  if.soa_beta_lev1 = I[2,]
+
+
+  ## IF.mean, ## IF.variance, ## IF.covariance
+
+  if.mean_inv        = IF.mean(invA)
+  if.var_inv         = IF.variance(invA)
+  if.inv_3rd_central = IF.3rdcentral(invA)
+  if.mean_lev        = IF.mean(lev)
+  if.var_lev         = IF.variance(lev)
+  if.mean_opr_income = IF.mean(opinc)
+  ##if.ser_cor
+  if.var_oper_income_innov = IF.variance(innov)
+  if.mean_tobins_q         = IF.mean(data.tq)
+  if.mean_eq               = IF.mean(equity)
+  if.var_eq                = IF.variance(equity)
+  if.mean_cash_bal         = IF.mean(cash)
+  ##if.soa_beta_lev1
+  if.cov_kl3_opinc3        =  IF.covariance(log(dwl$ppegt/dwl$ppegtL3), dwl$opincL3)
+
+  mmmatlist = list(if.mean_inv, if.var_inv, if.inv_3rd_central, if.mean_lev, if.var_lev, if.mean_opr_income, if.ser_cor, if.var_oper_income_innov, if.mean_tobins_q, if.mean_eq, if.var_eq, if.mean_cash_bal, if.soa_beta_lev1, if.cov_kl3_opinc3)
+  
+  influence_stack = do.call("rbind", mmmatlist)
+  gmmweights = solve(influence_stack %*% t(influence_stack)/n)
+
+  mmmatlist_ddw = list(if.mean_inv, if.var_inv, if.inv_3rd_central, if.mean_lev, if.var_lev, if.mean_opr_income, if.ser_cor, if.var_oper_income_innov, if.mean_tobins_q, if.mean_eq, if.var_eq, if.mean_cash_bal)
+  
+  influence_stack_ddw = do.call("rbind", mmmatlist_ddw)
+  gmmweights_ddw = solve(influence_stack_ddw %*% t(influence_stack_ddw)/n)
+
+
+
+  if(is.null(fname)) {
+    fname = "DEFAULT___"
+  }
+
+  ## write out the rawdata.csv and the GMM weighting matrix
+  write.csv(dwl, paste(fname, "rawdata.csv", sep=""))
+  writeRawCSV(gmmweights, paste(fname, "gmmweights.csv", sep=""))
+  writeRawCSV(gmmweights_ddw, paste(fname, "_ddw_gmmweights.csv", sep=""))
+
+  ## write out all of the moments that make this happen
+  mf2 = data.frame(mean_inv,
+    var_inv,
+    mean_lev,
+    var_lev,
+    mean_equity_issuance,
+    var_equity_issuance,
+    mean_tobins_q,
+    var_tobins_q,
+    mean_cash_bal,
+    var_cash_bal,
+    mean_opr_income,
+    var_opr_income,
+    ser_cor,                 ## ser cor
+    var_opr_income_innov,    ## var opr income innov
+    inv_3rd_central,
+    inv_skewness,
+    inv_kurtosis,
+    lev_3rd_central,
+    lev_skewness,
+    lev_kurtosis,
+    innov_inc_3rd_central,
+    innov_inc_skewness,
+    innov_inc_kurtosis,
+    opr_inc_3rd_central,
+    opr_inc_skewness,
+    opr_inc_kurtosis,
+    cash_bal_3rd_central,
+    cash_bal_skewness,
+    cash_bal_kurtosis,
+    tobinq_q_3rd_central,
+    tobinq_q_skewness,
+    tobinq_q_kurtosis,
+    eq_issuance_3rd_central,
+    eq_issuance_skewness,
+    eq_issuance_kurtosis,
+    cov_inv_auto1,
+    cov_inv_auto2,
+    cov_inv_auto3,
+    cov_opinc_auto1,
+    cov_opinc_auto2,
+    cov_opinc_auto3,
+    cov_inv_opinc,
+    cov_kl3_opinc3,
+    cov_lev_auto1,
+    cov_lev_auto2,
+    cov_lev_auto3,
+    cov_lev_opincL1,
+    cov_lev_opincL2,
+    cov_lev_opincL3,
+    soa_beta_icept,
+    soa_beta_lev1,
+    soa_beta_val,
+    soa_beta_cf,
+    N_observations
+    )
+
+  outmat  = t(mf2)
+  colnames(outmat) = "moment"
+  write.csv(outmat, paste(fname, "moments_v2.csv", sep=""))
+  write.csv(gmmweights, paste(fname, "gmm_with_names.csv", sep=""))
+
+  return(mf2)
+}
+
+
+### make influence functions for the whole dataset
+totaldata = makeInfluenceFunctions(data.q, fname = "moments3")
+
+
+### lets run the moments for a couple of different groups
+
+sicSubsets = matrix(
+  c(0001, 999,   # ag
+    1000, 1499,  # mining
+    1500, 1799,  # constr
+    2000, 3999,  # manuf
+    4000, 4999,  # transport
+    5200, 5999,  # retail
+    7000, 8999), # services
+  ncol=2, byrow=TRUE)
+
+industry_level1 = c("ag","mining","constr","manuf","transport","retail","services")
+N_industries = length(industry_level1)
+
+## for each row in sicSubsets
+##  subset the data,
+##  generate a filename
+##  make the moments
+##  stick it in a list
+sicSubsetData = list()
+for(nii in 1:N_industries) {
+  industry = industry_level1[nii]
+  sic_min = sicSubsets[nii, 1]
+  sic_max = sicSubsets[nii, 2]
+  data_industry = subset(data.q, SIC >= sic_min & SIC <= sic_max)
+
+  if(nrow(data_industry) < 100) {
+    print(paste(industry, "doesn't have much data (" , nrow(data_industry), ") skipping"))
+    next 
+  } else {
+    print(paste(industry, "has data (" , nrow(data_industry), ")"))
+  }
+  
+  industry_fname = paste(industry, "_mm1", sep="")
+  sicSubsetData[[nii]] = makeInfluenceFunctions(data_industry, fname = industry_fname)
+  
+}
+
+
+sicLevelData = list()
+for(nii in 1:N_industries) {
+  industry = industry_level1[nii]
+  sic_min = sicSubsets[nii, 1]
+  sic_max = sicSubsets[nii, 2]
+  data_industry = subset(data.q, SIC >= sic_min & SIC <= sic_max)
+  sicLevelData[[nii]] = data_industry
+}
+
+save(sicLevelData, file="sicLevelData.Robj")
+save(industry_level1, file="industry_levels.Robj")