Commits

Faheem Mitha committed 38f7f6a Merge

Branch merge with default.

Comments (0)

Files changed (17)

 
 The following example usage is for mouse 12 RSS
 
-### Loading database tables related to motif data analysis: ###
+### Loading database tables related to sequence data analysis: ###
 
 To load all tables:
 
-    ./cmload all_motif mouse12rss
+    ./cmload all_sequence mouse12rss
 
 This can be divided into the following steps:
 
 a) Load sequence data.
 
-    ./cmload motif mouse12rss
+    ./cmload sequence mouse12rss
 
 b) Load cross-validation data.
 
 
     ./init_db corrmodel
 
-    ./cmload motif -s human12rss
-    ./cmload all_motif human12rss
-    ./cmload all_motif -n 1 human12rss
+    ./cmload sequence -s human12rss
+    ./cmload all_sequence human12rss
+    ./cmload all_sequence -n 1 human12rss
 
-    ./cmload motif -s mouse12rss
-    ./cmload all_motif mouse12rss
-    ./cmload all_motif -n 1 mouse12rss
+    ./cmload sequence -s mouse12rss
+    ./cmload all_sequence mouse12rss
+    ./cmload all_sequence -n 1 mouse12rss
 
 To use the scripts installed in the system, replace `./init_db` with
 `init_db`, and `./cmload` with `cmload`.
     cdf_cmd = "./cm-cdf"
     neglogpp_cmd = "./cm-neglogpp"
     neglogpptable_cmd = "./cm-neglogpptable"
-    datafiletable_cmd = "./cm-datafiletable"
+    genetable_cmd = "./cm-genetable"
 else:
     cdf_cmd = "cm-cdf"
     neglogpp_cmd = "cm-neglogpp"
     neglogpptable_cmd = "cm-neglogpptable"
-    datafiletable_cmd = "cm-datafiletable"
+    genetable_cmd = "cm-genetable"
 
 pdffig_tex = Split("MODEL.tex CDF.tex NEGLOGPPHUMAN.tex NEGLOGPPMOUSE.tex")
 pgffig_pdf = Split("Figure1-model.pdf Figure4-cdf.pdf Figure5-neglogpphuman.pdf Figure6-neglogppmouse.pdf")
-table_tex = Split("NEGLOGPPTABLEHUMAN.tex NEGLOGPPTABLEMOUSE.tex DATAFILETABLEHUMAN.tex DATAFILETABLEMOUSE.tex")
+table_tex = Split("NEGLOGPPTABLEHUMAN.tex NEGLOGPPTABLEMOUSE.tex GENETABLEHUMAN.tex GENETABLEMOUSE.tex")
 env = Environment()
 env.Append(PDFLATEXFLAGS=[ "-shell-escape"])
 
 env.Command("CDF.tex", source=None, action=cdf_cmd)
 env.Command(Split("NEGLOGPPHUMAN.tex NEGLOGPPMOUSE.tex"), source=None, action=neglogpp_cmd)
 env.Command(Split("NEGLOGPPTABLEHUMAN.tex NEGLOGPPTABLEMOUSE.tex"), source=None, action=neglogpptable_cmd)
-env.Command(Split("DATAFILETABLEHUMAN.tex DATAFILETABLEMOUSE.tex"), source=None, action=datafiletable_cmd)
+env.Command(Split("GENETABLEHUMAN.tex GENETABLEMOUSE.tex"), source=None, action=genetable_cmd)
 
 env.Depends("figures.pdf", pdffig_tex + table_tex)
 env.PDF(target=["figures.pdf"]+pgffig_pdf, source="figures.tex")
     from Bio.SeqRecord import SeqRecord
     handle = open(fastafile, "rU")
     newrecords = []
-    datafiles_info = []
+    genes_info = []
     filestring = "\n"
     for record in SeqIO.parse(handle, "fasta"):
         accnum = record.id
             print "result of checkfasta", result
             if result[0] is True:
                 addtodescription = "GOOD ACCESSION NUMBER, RSS FOUND ORIENTATION %s"%result[1]
-                datafiles_info.append((result[1], accnumtrunc+ ".fasta"))
+                genes_info.append((result[1], accnumtrunc+ ".fasta"))
                 filestring += " " + accnumtrunc + ".fasta"
             elif result[0] is False:
                 addtodescription = "GOOD ACCESSION NUMBER, RSS NOT FOUND"
     print newfastafile
     SeqIO.write(newrecords, newfastafile, "fasta")
     nff = open(newfastafile, 'a')
-    nff.write('\n# '+str(tuple(datafiles_info))+'\n')
+    nff.write('\n# '+str(tuple(genes_info))+'\n')
     nff.write(filestring+'\n')
     handle.close()
     nff.close()
 
 get_stat <- function(schema, con)
   {
-    True <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.motifgroup inner join %1$s.motifstat on motifgroup.id=motifstat.motifstatgroup_id where motifgroup.simulated='f';", schema))
-    Sim <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.motifgroup inner join %1$s.motifstat on motifgroup.id=motifstat.motifstatgroup_id where motifgroup.simulated='t';", schema))
+    True <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.sequencegroup inner join %1$s.sequencestat on sequencegroup.id=sequencestat.sequencestatgroup_id where sequencegroup.simulated='f';", schema))
+    Sim <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.sequencegroup inner join %1$s.sequencestat on sequencegroup.id=sequencestat.sequencestatgroup_id where sequencegroup.simulated='t';", schema))
     Data <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.data inner join %1$s.datasubgroup on data.datasubgroup_id=datasubgroup.id inner join %1$s.crossval on datasubgroup.crossval_id=crossval.id inner join %1$s.crossvalgroup on crossval.crossvalgroup_id=crossvalgroup.id where crossvalgroup.samplenum=1;", schema))
     Stat = data.frame(value = c(True$neglogpp, Sim$neglogpp, Data$neglogpp),
       pvalue = c(True$pvalue, Sim$pvalue, Data$pvalue),

cm-datafiletable

-#!/usr/bin/Rscript
-
-# Copyright 2004-2013 Faheem Mitha <faheem@faheem.info>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-
-##http://dba.stackexchange.com/questions/30514/combining-two-similar-sql-queries
-
-#options(error=traceback)
-#options(error=recover)
-#options(error = quote({dump.frames(to.file=TRUE); q()}))
-
-get_datafile <- function(schema, connection)
-{
-  Datafile <- data.frame(dbGetQuery(connection,
-                               sprintf("SELECT accnum, array_to_string(array_agg(orientation), ', ')
-                                        FROM (SELECT * FROM %1$s.datafile ORDER BY accnum, orientation) x
-                                        GROUP BY accnum;", schema)))
-
-  return(Datafile)
-}
-
-## https://github.com/hadley/ggplot2/wiki/labeller
-label_wrap_gen <- function(width = 25)
-{
-  function(variable, value) {
-    lapply(strwrap(as.character(value), width=width, simplify=FALSE),
-          paste, collapse="\n")
-  }
-}
-
-get_bottomstuff <- function(schema, connection, bottomstuffstring)
-{
-  tot <- dbGetQuery(connection, sprintf("SELECT count(*) from %1$s.datafile", schema))
-  if(!isTRUE(all.equal( max(tot), min(tot))))
-    {
-      stop("values of the tot vector in get_bottomstuff must all be the same")
-    }
-  bs = sprintf(bottomstuffstring, tot[1, 1])
-  return(bs)
-}
-
-extenddatatable <- function(dataframe, rownum)
-{
-  rownum1 = nrow(dataframe)
-  if(rownum1<rownum)
-    for(i in (rownum1+1):rownum)
-      {
-        dataframe[i,] = c("", "")
-      }
-  return(dataframe)
-}
-
-make_table <- function(schema, filename, label, caption, bottomstuffstring)
-  {
-    library(Hmisc)
-    library(RPostgreSQL)
-    library(yaml)
-    if(file.exists("../corrmodel/default_conf.yaml"))
-      conf = yaml.load_file("../corrmodel/default_conf.yaml")
-    else if(file.exists("/etc/corrmodel/default_conf.yaml"))
-      conf = yaml.load_file("/etc/corrmodel/default_conf.yaml")
-    else
-      stop("cannot find config file 'default_conf.yaml'")
-
-    tryCatch(expr=
-             {
-               drv <- dbDriver("PostgreSQL")
-               con <- dbConnect(drv, password=conf$password, dbname=conf$dbname)
-               Datafile = get_datafile(schema, con)
-               bottomstuff = get_bottomstuff(schema, con, bottomstuffstring)
-             },
-             finally =
-             {
-               dbDisconnect(con)
-               dbUnloadDriver(drv)}
-             )
-    rownum = nrow(Datafile)
-    tablerows = ceil(rownum/3)
-    rownum1 = tablerows
-    if(rownum <= tablerows) ## case where table has one column. This is a degenerate case
-      {
-        Datafile1 = Datafile[1:rownum1, ]
-        Datafilesplit = Datafile1
-        colnames(Datafilesplit) = c("File", "Orientation")
-      }
-    else if(rownum <= 2*tablerows) ## case where table has two columns. This is a degenerate case
-      {
-        Datafile1 = Datafile[1:tablerows, ]
-        Datafile2 = Datafile[(tablerows+1):rownum, ]
-        Datafile2 = extenddatatable(Datafile2, tablerows)
-        Datafilesplit = cbind(Datafile1, Datafile2)
-        colnames(Datafilesplit) = c("File", "Orientation", "File", "Orientation")
-      }
-    else     ## case where table has three columns.
-      {
-        Datafile1 = Datafile[1:tablerows, ]
-        Datafile2 = Datafile[(tablerows+1):(2*tablerows), ]
-        Datafile3 = Datafile[(2*tablerows+1):rownum, ]
-        Datafile3 = extenddatatable(Datafile3, tablerows)
-        Datafilesplit = cbind(Datafile1, Datafile2, Datafile3)
-        colnames(Datafilesplit) = c("File", "Orientation", "File", "Orientation", "File", "Orientation")
-      }
-
-    ##w <- latex(Datafilesplit, file=filename, rowname=NULL, col.just=c(">{\\centering\\arraybackslash}p{2.5cm}", ">{\\centering\\arraybackslash}p{2.5cm}", ">{\\centering\\arraybackslash}p{2.5cm}", ">{\\centering\\arraybackslash}p{2.5cm}"), style=c("array"), where='!ht', multicol=FALSE,
-               ## center="centering",
-               ## label=label,
-               ## caption=caption,
-               ## insert.bottom=bottomstuff)
-
-    w <- latex(Datafilesplit, file=filename, rowname=NULL, style=c("array"), where='!ht', multicol=FALSE,
-               center="centering",
-               label=label,
-               caption=caption,
-               insert.bottom=bottomstuff)
-    return(Datafilesplit)
-  }
-
-Datafilehuman = make_table("human12rss", "DATAFILETABLEHUMAN.tex", "datafilehumantab", "\\bf{List of Human gene segments containing RSS}",
-    "\\begin{flushleft}\\end{flushleft}")
-
-Datafilemouse = make_table("mouse12rss", "DATAFILETABLEMOUSE.tex", "datafilemousetab", "\\bf{List of Mouse gene segments containing RSS}",
-    "\\begin{flushleft}\\end{flushleft}")
+#!/usr/bin/Rscript
+
+# cm-genetable: generate LaTeX tables (via Hmisc::latex) listing the gene
+# segments (accession number + RSS orientation) stored in the 'gene' table
+# of the human12rss and mouse12rss database schemas.
+
+# Copyright 2004-2013 Faheem Mitha <faheem@faheem.info>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+##http://dba.stackexchange.com/questions/30514/combining-two-similar-sql-queries
+
+#options(error=traceback)
+#options(error=recover)
+#options(error = quote({dump.frames(to.file=TRUE); q()}))
+
+# Return a data frame with one row per accession number and a comma-separated
+# string of the orientations recorded for that accession in <schema>.gene.
+get_gene <- function(schema, connection)
+{
+  Gene <- data.frame(dbGetQuery(connection,
+                               sprintf("SELECT accnum, array_to_string(array_agg(orientation), ', ')
+                                        FROM (SELECT * FROM %1$s.gene ORDER BY accnum, orientation) x
+                                        GROUP BY accnum;", schema)))
+
+  return(Gene)
+}
+
+## https://github.com/hadley/ggplot2/wiki/labeller
+# Facet labeller factory: wraps long facet labels to <width> characters/line.
+label_wrap_gen <- function(width = 25)
+{
+  function(variable, value) {
+    lapply(strwrap(as.character(value), width=width, simplify=FALSE),
+          paste, collapse="\n")
+  }
+}
+
+# Fill the row-count placeholder in <bottomstuffstring> with the total number
+# of rows of <schema>.gene; used as the table's insert.bottom text.
+get_bottomstuff <- function(schema, connection, bottomstuffstring)
+{
+  tot <- dbGetQuery(connection, sprintf("SELECT count(*) from %1$s.gene", schema))
+  if(!isTRUE(all.equal( max(tot), min(tot))))
+    {
+      stop("values of the tot vector in get_bottomstuff must all be the same")
+    }
+  bs = sprintf(bottomstuffstring, tot[1, 1])
+  return(bs)
+}
+
+# Pad the two-column <dataframe> with empty rows until it has <rownum> rows,
+# so the column chunks cbind'ed together in make_table have equal length.
+extenddatatable <- function(dataframe, rownum)
+{
+  rownum1 = nrow(dataframe)
+  if(rownum1<rownum)
+    for(i in (rownum1+1):rownum)
+      {
+        dataframe[i,] = c("", "")
+      }
+  return(dataframe)
+}
+
+# Query <schema>.gene and write a LaTeX table to <filename>, splitting the
+# rows into up to three side-by-side (File, Orientation) column pairs.
+# Returns the reshaped data frame.
+make_table <- function(schema, filename, label, caption, bottomstuffstring)
+  {
+    library(Hmisc)
+    library(RPostgreSQL)
+    library(yaml)
+    if(file.exists("../corrmodel/default_conf.yaml"))
+      conf = yaml.load_file("../corrmodel/default_conf.yaml")
+    else if(file.exists("/etc/corrmodel/default_conf.yaml"))
+      conf = yaml.load_file("/etc/corrmodel/default_conf.yaml")
+    else
+      stop("cannot find config file 'default_conf.yaml'")
+
+    tryCatch(expr=
+             {
+               drv <- dbDriver("PostgreSQL")
+               con <- dbConnect(drv, password=conf$password, dbname=conf$dbname)
+               Gene = get_gene(schema, con)
+               bottomstuff = get_bottomstuff(schema, con, bottomstuffstring)
+             },
+             finally =
+             {
+               # Guard: con/drv are unset if dbDriver or dbConnect failed,
+               # and cleaning up would then error inside finally itself.
+               if (exists("con", inherits=FALSE)) dbDisconnect(con)
+               if (exists("drv", inherits=FALSE)) dbUnloadDriver(drv)}
+             )
+    rownum = nrow(Gene)
+    # base R has ceiling(), not ceil(); ceil() is undefined here and would
+    # error at runtime.
+    tablerows = ceiling(rownum/3)
+    rownum1 = tablerows
+    if(rownum <= tablerows) ## case where table has one column. This is a degenerate case
+      {
+        Gene1 = Gene[1:rownum1, ]
+        Genesplit = Gene1
+        colnames(Genesplit) = c("File", "Orientation")
+      }
+    else if(rownum <= 2*tablerows) ## case where table has two columns. This is a degenerate case
+      {
+        Gene1 = Gene[1:tablerows, ]
+        Gene2 = Gene[(tablerows+1):rownum, ]
+        Gene2 = extenddatatable(Gene2, tablerows)
+        Genesplit = cbind(Gene1, Gene2)
+        colnames(Genesplit) = c("File", "Orientation", "File", "Orientation")
+      }
+    else     ## case where table has three columns.
+      {
+        Gene1 = Gene[1:tablerows, ]
+        Gene2 = Gene[(tablerows+1):(2*tablerows), ]
+        Gene3 = Gene[(2*tablerows+1):rownum, ]
+        Gene3 = extenddatatable(Gene3, tablerows)
+        Genesplit = cbind(Gene1, Gene2, Gene3)
+        colnames(Genesplit) = c("File", "Orientation", "File", "Orientation", "File", "Orientation")
+      }
+
+    ##w <- latex(Genesplit, file=filename, rowname=NULL, col.just=c(">{\\centering\\arraybackslash}p{2.5cm}", ">{\\centering\\arraybackslash}p{2.5cm}", ">{\\centering\\arraybackslash}p{2.5cm}", ">{\\centering\\arraybackslash}p{2.5cm}"), style=c("array"), where='!ht', multicol=FALSE,
+               ## center="centering",
+               ## label=label,
+               ## caption=caption,
+               ## insert.bottom=bottomstuff)
+
+    w <- latex(Genesplit, file=filename, rowname=NULL, style=c("array"), where='!ht', multicol=FALSE,
+               center="centering",
+               label=label,
+               caption=caption,
+               insert.bottom=bottomstuff)
+    return(Genesplit)
+  }
+
+Genehuman = make_table("human12rss", "GENETABLEHUMAN.tex", "genehumantab", "\\bf{List of Human gene segments containing RSS}",
+    "\\begin{flushleft}\\end{flushleft}")
+
+Genemouse = make_table("mouse12rss", "GENETABLEMOUSE.tex", "genemousetab", "\\bf{List of Mouse gene segments containing RSS}",
+    "\\begin{flushleft}\\end{flushleft}")
                        ON         datasubgroup.crossval_id=crossval.id
                        INNER JOIN %1$s.crossvalgroup
                        ON         crossval.crossvalgroup_id=crossvalgroup.id
-                       INNER JOIN %1$s.motifgroup
-                       ON         crossvalgroup.motifgroup_id=motifgroup.id
+                       INNER JOIN %1$s.sequencegroup
+                       ON         crossvalgroup.sequencegroup_id=sequencegroup.id
    WHERE      (data.seqindex = any(crossval.sample)
    OR         data.seqindex IS NULL)
    AND        crossvalgroup.permuted='f'
    AND        crossvalgroup.samplenum=5
-   AND        motifgroup.simulated='f'
-   AND        motifgroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
+   AND        sequencegroup.simulated='f'
+   AND        sequencegroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
 ) AS q
 WHERE q.r <= %2$s
 ", schema, limit)))
   }
 }
 
-simplot <- function(schema, limit1, limit2, datafile, outputfile)
+simplot <- function(schema, limit1, limit2, gene, outputfile)
   {
     library(ggplot2)
     library(gridExtra)
 
 ## First, the query q gives
 
-## r | motif | crossvalnum | neglogpp
+## r | rss | crossvalnum | neglogpp
 
 ## r: is the rank, which corresponds to grouping the sequence rows by
 ## crossvalnum with groups in order of increasing crossvalnum, and then
 ## within each crossvalnumgroup, we order by neglogpp
 
-## motif: 0 if seqindex is null
-##        1 if seqindex is not null
-## i.e. motif is an indicator function for whether a sequence is an RSS or not
+## rss: 0 if seqindex is null
+##      1 if seqindex is not null
+## i.e. rss is an indicator function for whether a sequence is an RSS or not
 
 ## Second, the values sum1, sum2, and sum3, derived from the query q
 ## give:
 {
   Neglogpp <- data.frame(dbGetQuery(connection,
                                sprintf("SELECT crossvalnum,
-SUM(motif) as sum1,
-SUM(CASE WHEN r <= LEAST(40,100) THEN motif ELSE 0 END) as sum2,
-SUM(CASE WHEN r <= GREATEST(40,100) THEN motif ELSE 0 END) as sum3
+SUM(rss) as sum1,
+SUM(CASE WHEN r <= LEAST(40,100) THEN rss ELSE 0 END) as sum2,
+SUM(CASE WHEN r <= GREATEST(40,100) THEN rss ELSE 0 END) as sum3
 FROM (
 SELECT   ROW_NUMBER() OVER (PARTITION BY crossval.crossvalnum ORDER BY crossval.crossvalnum, data.neglogpp) AS r,
-                    (seqindex IS NOT NULL)::INTEGER                                                      AS motif,
+                    (seqindex IS NOT NULL)::INTEGER                                                      AS rss,
                     crossvalnum                                                                                  ,
                     neglogpp
          FROM       %1$s.data
                     ON         datasubgroup.crossval_id=crossval.id
                     INNER JOIN %1$s.crossvalgroup
                     ON         crossval.crossvalgroup_id=crossvalgroup.id
-                    INNER JOIN %1$s.motifgroup
-                    ON         crossvalgroup.motifgroup_id=motifgroup.id
+                    INNER JOIN %1$s.sequencegroup
+                    ON         crossvalgroup.sequencegroup_id=sequencegroup.id
          WHERE      (data.seqindex                        = ANY(crossval.sample)
          OR         data.seqindex                  IS NULL)
          AND        crossvalgroup.permuted='f'
          AND        crossvalgroup.samplenum=5
-         AND        motifgroup.simulated='f'
-         AND        motifgroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
+         AND        sequencegroup.simulated='f'
+         AND        sequencegroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
 ) AS q
 GROUP BY crossvalnum", schema)))
 
 
     Combinedneglogpp <- data.frame(dbGetQuery(connection,
                                       sprintf("SELECT
-SUM(motif) as sum1,
-SUM(CASE WHEN r <= 200 THEN motif ELSE 0 END) as sum2,
-SUM(CASE WHEN r <= 500 THEN motif ELSE 0 END) as sum3
+SUM(rss) as sum1,
+SUM(CASE WHEN r <= 200 THEN rss ELSE 0 END) as sum2,
+SUM(CASE WHEN r <= 500 THEN rss ELSE 0 END) as sum3
 from (
 SELECT ROW_NUMBER() OVER (order by avg(neglogpp)) as r,
-                  datafile_id,
+                  gene_id,
                   index,
                   avg(neglogpp) as avg_neglogpp,
-                  (seqindex is not null)::integer as motif
+                  (seqindex is not null)::integer as rss
        FROM       %1$s.data
                   INNER JOIN %1$s.datasubgroup
                   ON         data.datasubgroup_id=datasubgroup.id
                   ON         datasubgroup.crossval_id=crossval.id
                   INNER JOIN %1$s.crossvalgroup
                   ON         crossval.crossvalgroup_id=crossvalgroup.id
-                  INNER JOIN %1$s.motifgroup
-                  ON         crossvalgroup.motifgroup_id=motifgroup.id
+                  INNER JOIN %1$s.sequencegroup
+                  ON         crossvalgroup.sequencegroup_id=sequencegroup.id
        WHERE      crossvalgroup.permuted='f'
        AND        crossvalgroup.samplenum=1
-       AND        motifgroup.simulated='f'
-       AND        motifgroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
-                  group by datafile_id, index, seqindex order by avg(neglogpp)) AS q", schema)))
+       AND        sequencegroup.simulated='f'
+       AND        sequencegroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
+                  group by gene_id, index, seqindex order by avg(neglogpp)) AS q", schema)))
 print(Combinedneglogpp)
 return(Combinedneglogpp)
   }
 
 # create the top-level parser
 
-from corrmodel.load import load_motif_dataset, load_crossval_dataset, load_all_motifstat, load_allseq, load_all_motif_tables, load_simdata, get_and_load_simresult, load_all_sim_tables, load_all_sim_tables_parallel
+from corrmodel.load import load_sequence_dataset, load_crossval_dataset, load_all_sequencestat, load_allseq, load_all_sequence_tables, load_simdata, get_and_load_simresult, load_all_sim_tables, load_all_sim_tables_parallel
 
 import argparse
 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 subparsers = parser.add_subparsers()
 
-# create the parser for the "motif" command
-parser_motif = subparsers.add_parser('motif', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_motif.add_argument('dataset', help='name of dataset')
-parser_motif.add_argument('-n', '--motifsimnum', action="store", type=int, dest="motifsimnum", help='number of sequences to simulate')
-parser_motif.add_argument('--subset', help='subset')
-parser_motif.add_argument('-s', '--simulated', action="store_true", dest="simulated", help='simulated data set', default=False)
-parser_motif.set_defaults(func=load_motif_dataset)
+# create the parser for the "sequence" command
+parser_sequence = subparsers.add_parser('sequence', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser_sequence.add_argument('dataset', help='name of dataset')
+parser_sequence.add_argument('-n', '--sequencesimnum', action="store", type=int, dest="sequencesimnum", help='number of sequences to simulate')
+parser_sequence.add_argument('--subset', help='subset')
+parser_sequence.add_argument('-s', '--simulated', action="store_true", dest="simulated", help='simulated data set', default=False)
+parser_sequence.set_defaults(func=load_sequence_dataset)
 
 # create the parser for the "crossval" command
 parser_crossval = subparsers.add_parser('crossval', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser_crossval.add_argument("-s", "--search", action="store_true", dest="search", help="model search", default=False)
 parser_crossval.set_defaults(func=load_crossval_dataset)
 
-# create the parser for the "motifstat" command
-parser_motifstat = subparsers.add_parser('motifstat', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_motifstat.add_argument('dataset', help='name of dataset')
-parser_motifstat.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
-parser_motifstat.set_defaults(func=load_all_motifstat)
+# create the parser for the "sequencestat" command
+parser_sequencestat = subparsers.add_parser('sequencestat', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser_sequencestat.add_argument('dataset', help='name of dataset')
+parser_sequencestat.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
+parser_sequencestat.set_defaults(func=load_all_sequencestat)
 
 # create the parser for the "data" command
 parser_data = subparsers.add_parser('data', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser_data.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
 parser_data.set_defaults(func=load_allseq)
 
-# create the parser for the "all_motif" command
-parser_all_motif = subparsers.add_parser('all_motif', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_all_motif.add_argument('dataset', help='name of dataset')
-parser_all_motif.add_argument('--slow', action="store_false", dest="fast", help='switch to slow mode calculation (do not assume pvalues above the neglogppcutoff are 0)')
-parser_all_motif.add_argument('--motifsimnum', action="store", type=int, dest="motifsimnum", help='number of sequences to simulate')
-parser_all_motif.add_argument('--numsim', type=int, help='number of replicates to simulate')
-parser_all_motif.add_argument("-n", "--samplenum", action="store", type=int, dest="samplenum", help="number of samples to do crossvalidation on")
-parser_all_motif.add_argument("-p", "--permuted", action="store_true", dest="permuted", help="permuted motif dataset", default=False)
-parser_all_motif.add_argument("-s", "--simulated", action="store_true", dest="simulated", help="simulated motif dataset")
-parser_all_motif.add_argument("--search", action="store_true", dest="search", help='search for model in crossval step', default=False)
-parser_all_motif.set_defaults(func=load_all_motif_tables)
+# create the parser for the "all_sequence" command
+parser_all_sequence = subparsers.add_parser('all_sequence', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser_all_sequence.add_argument('dataset', help='name of dataset')
+parser_all_sequence.add_argument('--slow', action="store_false", dest="fast", help='switch to slow mode calculation (do not assume pvalues above the neglogppcutoff are 0)')
+parser_all_sequence.add_argument('--sequencesimnum', action="store", type=int, dest="sequencesimnum", help='number of sequences to simulate')
+parser_all_sequence.add_argument('--numsim', type=int, help='number of replicates to simulate')
+parser_all_sequence.add_argument("-n", "--samplenum", action="store", type=int, dest="samplenum", help="number of samples to do crossvalidation on")
+parser_all_sequence.add_argument("-p", "--permuted", action="store_true", dest="permuted", help="permuted sequence dataset", default=False)
+parser_all_sequence.add_argument("-s", "--simulated", action="store_true", dest="simulated", help="simulated sequence dataset")
+parser_all_sequence.add_argument("--search", action="store_true", dest="search", help='search for model in crossval step', default=False)
+parser_all_sequence.set_defaults(func=load_all_sequence_tables)
 
 # create the parser for the "simdata" command
 parser_simdata = subparsers.add_parser('simdata', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

corrmodel/dbschema.py

         return value
 
 class Crossvalgroup(object):
-    def __init__(self, permuted, samplenum, time, motifgroup_id):
+    def __init__(self, permuted, samplenum, time, sequencegroup_id):
         self.permuted = permuted
         self.samplenum = samplenum
         self.time = time
-        self.motifgroup_id = motifgroup_id
+        self.sequencegroup_id = sequencegroup_id
     def __repr__(self):
         return '<Crossvalgroup %s>'%self.id
 
     def __repr__(self):
         return '<Model %s>'%(str((self.cols, self.model)))
 
-class Motifstatgroup(object):
-    def __init__(self, motifgroup_id, numsim):
-        self.motifgroup_id = motifgroup_id
+class Sequencestatgroup(object):
+    def __init__(self, sequencegroup_id, numsim):
+        self.sequencegroup_id = sequencegroup_id
         self.numsim = numsim
     def __repr__(self):
-        return '<Motifstatgroup %s>'%self.id
+        return '<Sequencestatgroup %s>'%self.id
 
-class Motifstat(object):
-    def __init__(self, motifstatgroup_id, neglogpp, pvalue):
-        self.motifstatgroup_id = motifstatgroup_id
+class Sequencestat(object):
+    def __init__(self, sequencestatgroup_id, neglogpp, pvalue):
+        self.sequencestatgroup_id = sequencestatgroup_id
         self.neglogpp = neglogpp
         self.pvalue = pvalue
     def __repr__(self):
-        return '<Motifstat %s>'%self.id
+        return '<Sequencestat %s>'%self.id
 
 class Sequencegroup(object):
-    def __repr__(self):
-        return '<Sequencegroup %s>'%str(self.id)
-
-class Motifgroup(object):
-    def __init__(self, id, subset, model_id, simulated, size=None):
-        self.id = id
+    def __init__(self, subset, model_id, simulated, size=None):
         # the desired subset of the RSS sequence
         self.subset = subset
         self.model_id = model_id
         self.simulated = simulated
         self.size = size
     def __repr__(self):
-        return '<Motifgroup %s>'%str(self.id)
+        return '<Sequencegroup %s>'%str(self.id)
 
-class Motif(object):
-    def __init__(self, motifgroup_id, sequence):
-        self.motifgroup_id = motifgroup_id
+class Sequence(object):
+    def __init__(self, sequencegroup_id, sequence):
+        self.sequencegroup_id = sequencegroup_id
         self.sequence = sequence
     def __repr__(self):
-        return '<Motif %s>'%self.id
+        return '<Sequence %s>'%self.id
 
 class Datagroup(object):
     def __init__(self, crossvalgroup_id, numsim, time=None):
     def __repr__(self):
         return '<Datasubgroup %s>'%self.id
 
-class Datafile(object):
+class Gene(object):
     def __init__(self, accnum, orientation):
         self.accnum = accnum
         self.orientation = orientation
     def __repr__(self):
-        return '<Datafile %s>'%self.id
+        return '<Gene %s>'%self.id
 
 class Data(object):
-    def __init__(self, datasubgroup_id, datafile_id, sequence, index, seqindex, neglogpp, pvalue):
+    def __init__(self, datasubgroup_id, gene_id, sequence, index, seqindex, neglogpp, pvalue):
         self.datasubgroup_id = datasubgroup_id
-        self.datafile_id = datafile_id
+        self.gene_id = gene_id
         self.sequence = sequence
         self.index = index
         self.seqindex = seqindex
         return '<Data %s>'%self.id
 
 class Simgroup(object):
-    def __init__(self, id, rows, cols, model_id=None):
-        self.id = id
+    def __init__(self, rows, cols, model_id=None):
         self.rows = rows
         self.cols = cols
         self.model_id = model_id
         Column('permuted', Boolean),
         Column('samplenum', Integer),
         Column('time', Float),
-        Column('motifgroup_id', Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
-        UniqueConstraint('permuted', 'samplenum', 'motifgroup_id'),
+        Column('sequencegroup_id', Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        UniqueConstraint('permuted', 'samplenum', 'sequencegroup_id'),
         schema = schema
         )
     mapper(Crossvalgroup, crossvalgroup_table)
     """Each row of this table corresponds to a subgroup, which is part
     of a crossvalidation procedure. 'crossvalnum' is the index of this
     subgroup in the crossvalidation. 'sample' is the subgroup, with
-    the indexing relative to the RSS ids in the motif table. Each row
+    the indexing relative to the RSS ids in the sequence table. Each row
     corresponds to a single crossvalidation process, with the
     'training dataset' being the RSS minus the 'sample' subgroup, and
     the 'testing dataset' being the 'sample' subgroup. 'model_id' is
     mapper(Model, model_table)
     return model_table, Model
 
-def make_motifstatgroup_table(meta, schema, name='motifstatgroup'):
+def make_sequencestatgroup_table(meta, schema, name='sequencestatgroup'):
     """
-    We calculate statistics for all motifs in the motif group whose id
-    is 'motifgroup_id', currently based on the distributed implied by
+    We calculate statistics for all sequences in the sequence group whose id
+    is 'sequencegroup_id', currently based on the distribution implied by
     the model in the config file corresponding to 'schema'. A row in
-    this table represents these statistics (motifstat) for the
-    motifgroup with id 'motifgroup_id'. 'numsim' is a parameter for
+    this table represents these statistics (sequencestat) for the
+    sequencegroup with id 'sequencegroup_id'. 'numsim' is a parameter for
     the simulated distribution.
     """
-    motifstatgroup_table = Table(
+    sequencestatgroup_table = Table(
         name, meta,
         #Column('id', Integer, primary_key=True),
-        Column('motifgroup_id', Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True, primary_key=True),
+        Column('sequencegroup_id', Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True, primary_key=True),
         Column('created', TIMESTAMP(), default=now()),
         Column('numsim', Integer),
         schema = schema,
         )
-    mapper(Motifstatgroup, motifstatgroup_table)
-    return motifstatgroup_table, Motifstatgroup
+    mapper(Sequencestatgroup, sequencestatgroup_table)
+    return sequencestatgroup_table, Sequencestatgroup
 
-def make_motifstat_table(meta, schema, name='motifstat'):
+def make_sequencestat_table(meta, schema, name='sequencestat'):
     """
     Each row of this table corresponds to statistics ('neglogpp',
-    'pvalue') for a motif in a motifgroup. These motifs together
-    correspond to the motifstatgroup with id 'motifstatgroup_id'.
-    This motifgroup has id 'motifgroup_id' in the corresponding
-    motifstatgroup with id 'motifstatgroup_id'. FIXME: It would
+    'pvalue') for a sequence in a sequencegroup. These sequences together
+    correspond to the sequencestatgroup with id 'sequencestatgroup_id'.
+    This sequencegroup has id 'sequencegroup_id' in the corresponding
+    sequencestatgroup with id 'sequencestatgroup_id'. FIXME: It would
     perhaps be better if each row corresponded directly to the
-    corresponding entry in the 'motif' table, but this is currently
+    corresponding entry in the 'sequence' table, but this is currently
     not the case.
     """
-    motifstat_table = Table(
+    sequencestat_table = Table(
         name, meta,
         Column('id', Integer, primary_key=True),
-        Column('motifstatgroup_id',  Integer, ForeignKey(schema+'.motifstatgroup.motifgroup_id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        Column('sequencestatgroup_id',  Integer, ForeignKey(schema+'.sequencestatgroup.sequencegroup_id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
         Column('neglogpp', Float),
         Column('pvalue', Float),
         schema = schema,
         )
-    mapper(Motifstat, motifstat_table)
-    return motifstat_table, Motifstat
+    mapper(Sequencestat, sequencestat_table)
+    return sequencestat_table, Sequencestat
 
 def make_sequencegroup_table(meta, schema, name='sequencegroup'):
+    """Each row of this table corresponds to a group of RSS sequences"""
     sequencegroup_table = Table(
         name, meta,
-        Column('id', Integer, primary_key=True),
-        schema = schema,
-        )
-    mapper(Sequencegroup, sequencegroup_table)
-    return sequencegroup_table, Sequencegroup
-
-def make_motifgroup_table(meta, schema, name='motifgroup'):
-    """Each row of this table corresponds to a group of RSS sequences"""
-    motifgroup_table = Table(
-        name, meta,
-        Column('id',  Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True, primary_key=True),
+        Column('id',  Integer, index=True, primary_key=True),
         Column('subset', postgresql.ARRAY(Integer, as_tuple=True)),
         Column('model_id', Integer, ForeignKey(schema+'.model.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
         Column('simulated', Boolean),
         Column('size', Integer),
         schema = schema,
         )
-    mapper(Motifgroup, motifgroup_table)
-    return motifgroup_table, Motifgroup
+    mapper(Sequencegroup, sequencegroup_table)
+    return sequencegroup_table, Sequencegroup
 
-def make_motifgroup_table_index(motifgroup_table):
+def make_sequencegroup_table_index(sequencegroup_table):
     """
-    Add constraint on motifgroup so if simulated = 'f', then subset must be unique.
+    Add constraint on sequencegroup so if simulated = 'f', then subset must be unique.
     """
-    event.listen(motifgroup_table, 'after_create',
-                 DDL("CREATE UNIQUE INDEX motifgroup_subset_key ON %(fullname)s (subset) WHERE simulated = 'f'")
+    event.listen(sequencegroup_table, 'after_create',
+                 DDL("CREATE UNIQUE INDEX sequencegroup_subset_key ON %(fullname)s (subset) WHERE simulated = 'f'")
                  )
 
-def make_motif_table(meta, schema, name='motif'):
+def make_sequence_table(meta, schema, name='sequence'):
     """Each row of this table corresponds to an RSS sequence"""
-    motif_table = Table(
+    sequence_table = Table(
         name, meta,
         Column('id', Integer, primary_key=True),
-        Column('motifgroup_id', Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        Column('sequencegroup_id', Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
         Column('sequence', String),
         schema = schema,
         )
-    mapper(Motif, motif_table)
-    return motif_table, Motif
+    mapper(Sequence, sequence_table)
+    return sequence_table, Sequence
 
 def make_datagroup_table(meta, schema, name='datagroup'):
     """Each row of this table corresponds to a group of Datasubgroup objects
     mapper(Datasubgroup, datasubgroup_table)
     return datasubgroup_table, Datasubgroup
 
-def make_datafile_table(meta, schema, name='datafile'):
-    """Each row of this table corresponds to a data file"""
-    datafile_table = Table(
+def make_gene_table(meta, schema, name='gene'):
+    """Each row of this table corresponds to a gene"""
+    gene_table = Table(
         name, meta,
         Column('id', Integer, primary_key=True),
         Column('accnum', String),
         UniqueConstraint('accnum', 'orientation'),
         schema = schema,
         )
-    mapper(Datafile, datafile_table)
-    return datafile_table, Datafile
+    mapper(Gene, gene_table)
+    return gene_table, Gene
 
 def make_data_table(meta, schema, name='data'):
     """
     for a sequence in the context of a crossvalidation process. Here
     it suffices to identify the crossvalidation process and the
     enclosing round of crossvalidation by specifying the
-    'datasubgroup_id'. The 'datafile_id' gives the sequence data file
-    from which the sequence comes. The 'sequence' is the actual
-    sequence string. The 'index' is the index/location of the sequence
-    within the data file. The 'seqindex' is the id of the sequence, if
-    it is a RSS, in the 'motif' table. The 'seqindex' is empty if the
-    sequence is not an RSS. 'neglogpp' is the negative log posterior
-    predictive probability of the sequence. 'pvalue' is the posterior
-    predictive pvalue of the sequence. The 'neglogpp' and 'pvalue' are
-    calculated relative to the crossvalidation process specified by
+    'datasubgroup_id'. The 'gene_id' gives the gene from which the
+    sequence comes. The 'sequence' is the actual sequence string. The
+    'index' is the index/location of the sequence within the gene. The
+    'seqindex' is the id of the sequence, if it is a RSS, in the
+    'sequence' table. The 'seqindex' is empty if the sequence is not an
+    RSS. 'neglogpp' is the negative log posterior predictive
+    probability of the sequence. 'pvalue' is the posterior predictive
+    pvalue of the sequence. The 'neglogpp' and 'pvalue' are calculated
+    relative to the crossvalidation process specified by
     'datasubgroup_id', specifically, using the 'training dataset'.
     """
     data_table = Table(
         name, meta,
         Column('id', Integer, primary_key=True),
         Column('datasubgroup_id',  Integer, ForeignKey(schema+'.datasubgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
-        Column('datafile_id',  Integer, ForeignKey(schema+'.datafile.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        Column('gene_id',  Integer, ForeignKey(schema+'.gene.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
         Column('sequence', String),
         Column('index', Integer),
         Column('seqindex', Integer, nullable=True),
         Column('neglogpp', Float),
         Column('pvalue', Float),
-        UniqueConstraint('datasubgroup_id', 'datafile_id', 'index'),
+        UniqueConstraint('datasubgroup_id', 'gene_id', 'index'),
         schema = schema,
         )
     mapper(Data, data_table)
     """
     simgroup_table = Table(
         name, meta,
-        Column('id',  Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True, primary_key=True),
+        Column('id',  Integer, index=True, primary_key=True),
         Column('created', TIMESTAMP(), default=now()),
         Column('rows', Integer),
         Column('cols', Integer),

corrmodel/default_conf.yaml

 runs: 50
 convergeruns: 25
 numsim: 100
-motifsimnum: 1000
+sequencesimnum: 1000
 splitnum: 2
 samplenum: 5
 
 
 mouse12rss: &mouse12rsslabel
     fasta : "MM12RSS.fasta"
-    datafile: mouse12rss.fasta
+    gene: mouse12rss.fasta
     rows : 120
     cols : 28
     seqlen : 28
     neglogppcutoff: 40
     model : [[5, 6], [7, 13, 18], [9, 22], [10, 12], [15, 16, 17], [20, 21]]
 
-    #datafiles_info : [['AF018146', 'forward'], ['AF018146', 'reverse'], ['X58411', 'reverse'], ['X58414', 'reverse'], ['AE000665', 'forward'], ['AE000665', 'reverse'], ['M38103', 'reverse'], ['M64239', 'reverse'], ['M38105', 'reverse'], ['X03761', 'reverse'], ['X03057', 'reverse'], ['X17177', 'reverse'], ['X02858', 'reverse'], ['X02862', 'reverse'], ['X03056', 'reverse'], ['X02843', 'reverse'], ['X03058', 'reverse'], ['X02859', 'reverse'], ['X03059', 'reverse'], ['D00080', 'forward'], ['D00081', 'forward'], ['D00082', 'forward'], ['M28132', 'forward'], ['M28134', 'forward'], ['Z72384', 'forward'], ['Z72382', 'forward'], ['J00562', 'forward'], ['K02418', 'forward'], ['K02159', 'forward'], ['K02160', 'forward'], ['K02158', 'forward'], ['K02161', 'forward'], ['K02162', 'forward'], ['X16954', 'forward'], ['X16955', 'forward'], ['Y15968', 'forward'], ['Y15972', 'forward'], ['V01565', 'forward'], ['K00884', 'forward'], ['V01564', 'forward'], ['M14360', 'forward'], ['Y15975', 'forward'], ['Y15976', 'forward'], ['Y15978', 'forward'], ['Y15981', 'forward'], ['AF044198', 'forward'], ['Y15977', 'forward'], ['Y15980', 'forward'], ['Y15982', 'forward'], ['M15520', 'forward'], ['M54906', 'forward'], ['AF029261', 'forward'], ['J00545', 'forward'], ['V01563', 'forward'], ['AJ235963', 'forward'], ['AJ235953', 'forward'], ['AJ235940', 'forward'], ['AJ231200', 'forward'], ['AJ231206', 'forward'], ['AJ231207', 'forward'], ['AJ132683', 'forward'], ['AJ231263', 'forward'], ['AJ235938', 'forward'], ['AJ239198', 'forward'], ['AJ231231', 'forward'], ['AJ231223', 'forward'], ['AJ231225', 'forward'], ['AJ231221', 'forward'], ['AJ231234', 'forward'], ['AJ231209', 'forward'], ['AJ231248', 'forward'], ['AJ231250', 'forward'], ['AJ231273', 'forward'], ['AJ231222', 'forward'], ['AJ235942', 'forward'], ['AJ235943', 'forward'], ['AJ231218', 'forward'], ['AJ231219', 'forward'], ['AJ231216', 'forward'], ['AJ231217', 'forward'], ['AJ231212', 'forward'], ['AJ231213', 'forward'], ['AJ231214', 
'forward'], ['AJ231215', 'forward'], ['AJ231224', 'forward'], ['AJ231226', 'forward'], ['AJ231229', 'forward'], ['AJ235944', 'forward'], ['AJ235946', 'forward'], ['AJ235947', 'forward'], ['AJ235948', 'forward'], ['AJ235958', 'forward'], ['AJ231241', 'forward'], ['AJ239197', 'forward'], ['AJ231244', 'forward'], ['AJ231247', 'forward'], ['AJ231256', 'forward'], ['AJ235949', 'forward'], ['AJ235950', 'forward'], ['AJ235951', 'forward'], ['AJ235956', 'forward'], ['AJ235961', 'forward'], ['AJ235962', 'forward'], ['AJ235967', 'forward'], ['AJ235968', 'forward'], ['AJ235966', 'forward'], ['AJ231258', 'forward'], ['AJ231259', 'forward'], ['AJ235964', 'forward'], ['AJ235973', 'forward'], ['AJ235974', 'forward'], ['AJ231264', 'forward'], ['AJ231269', 'forward'], ['AJ231274', 'forward'], ['AJ235935', 'forward'], ['AJ235936', 'forward'], ['AJ231243', 'forward'], ['J00434', 'reverse'], ['M35332', 'reverse'], ['J00436', 'reverse'], ['M60961', 'reverse'], ['M60955', 'reverse'], ['J00431', 'reverse'], ['J00435', 'reverse'], ['J00437', 'reverse'], ['J00432', 'reverse'], ['J00433', 'reverse'], ['J00438', 'reverse'], ['J00439', 'reverse'], ['D13199', 'reverse'], ['M23243', 'reverse'], ['L32868', 'reverse'], ['J00440', 'reverse'], ['J00434', 'forward'], ['M35332', 'forward'], ['J00436', 'forward'], ['J00431', 'forward'], ['J00435', 'forward'], ['J00437', 'forward'], ['J00432', 'forward'], ['J00433', 'forward'], ['J00438', 'forward'], ['J00439', 'forward'], ['D13199', 'forward'], ['L32868', 'forward'], ['J00440', 'forward']]
+    #genes_info : [['AF018146', 'forward'], ['AF018146', 'reverse'], ['X58411', 'reverse'], ['X58414', 'reverse'], ['AE000665', 'forward'], ['AE000665', 'reverse'], ['M38103', 'reverse'], ['M64239', 'reverse'], ['M38105', 'reverse'], ['X03761', 'reverse'], ['X03057', 'reverse'], ['X17177', 'reverse'], ['X02858', 'reverse'], ['X02862', 'reverse'], ['X03056', 'reverse'], ['X02843', 'reverse'], ['X03058', 'reverse'], ['X02859', 'reverse'], ['X03059', 'reverse'], ['D00080', 'forward'], ['D00081', 'forward'], ['D00082', 'forward'], ['M28132', 'forward'], ['M28134', 'forward'], ['Z72384', 'forward'], ['Z72382', 'forward'], ['J00562', 'forward'], ['K02418', 'forward'], ['K02159', 'forward'], ['K02160', 'forward'], ['K02158', 'forward'], ['K02161', 'forward'], ['K02162', 'forward'], ['X16954', 'forward'], ['X16955', 'forward'], ['Y15968', 'forward'], ['Y15972', 'forward'], ['V01565', 'forward'], ['K00884', 'forward'], ['V01564', 'forward'], ['M14360', 'forward'], ['Y15975', 'forward'], ['Y15976', 'forward'], ['Y15978', 'forward'], ['Y15981', 'forward'], ['AF044198', 'forward'], ['Y15977', 'forward'], ['Y15980', 'forward'], ['Y15982', 'forward'], ['M15520', 'forward'], ['M54906', 'forward'], ['AF029261', 'forward'], ['J00545', 'forward'], ['V01563', 'forward'], ['AJ235963', 'forward'], ['AJ235953', 'forward'], ['AJ235940', 'forward'], ['AJ231200', 'forward'], ['AJ231206', 'forward'], ['AJ231207', 'forward'], ['AJ132683', 'forward'], ['AJ231263', 'forward'], ['AJ235938', 'forward'], ['AJ239198', 'forward'], ['AJ231231', 'forward'], ['AJ231223', 'forward'], ['AJ231225', 'forward'], ['AJ231221', 'forward'], ['AJ231234', 'forward'], ['AJ231209', 'forward'], ['AJ231248', 'forward'], ['AJ231250', 'forward'], ['AJ231273', 'forward'], ['AJ231222', 'forward'], ['AJ235942', 'forward'], ['AJ235943', 'forward'], ['AJ231218', 'forward'], ['AJ231219', 'forward'], ['AJ231216', 'forward'], ['AJ231217', 'forward'], ['AJ231212', 'forward'], ['AJ231213', 'forward'], ['AJ231214', 'forward'], 
['AJ231215', 'forward'], ['AJ231224', 'forward'], ['AJ231226', 'forward'], ['AJ231229', 'forward'], ['AJ235944', 'forward'], ['AJ235946', 'forward'], ['AJ235947', 'forward'], ['AJ235948', 'forward'], ['AJ235958', 'forward'], ['AJ231241', 'forward'], ['AJ239197', 'forward'], ['AJ231244', 'forward'], ['AJ231247', 'forward'], ['AJ231256', 'forward'], ['AJ235949', 'forward'], ['AJ235950', 'forward'], ['AJ235951', 'forward'], ['AJ235956', 'forward'], ['AJ235961', 'forward'], ['AJ235962', 'forward'], ['AJ235967', 'forward'], ['AJ235968', 'forward'], ['AJ235966', 'forward'], ['AJ231258', 'forward'], ['AJ231259', 'forward'], ['AJ235964', 'forward'], ['AJ235973', 'forward'], ['AJ235974', 'forward'], ['AJ231264', 'forward'], ['AJ231269', 'forward'], ['AJ231274', 'forward'], ['AJ235935', 'forward'], ['AJ235936', 'forward'], ['AJ231243', 'forward'], ['J00434', 'reverse'], ['M35332', 'reverse'], ['J00436', 'reverse'], ['M60961', 'reverse'], ['M60955', 'reverse'], ['J00431', 'reverse'], ['J00435', 'reverse'], ['J00437', 'reverse'], ['J00432', 'reverse'], ['J00433', 'reverse'], ['J00438', 'reverse'], ['J00439', 'reverse'], ['D13199', 'reverse'], ['M23243', 'reverse'], ['L32868', 'reverse'], ['J00440', 'reverse'], ['J00434', 'forward'], ['M35332', 'forward'], ['J00436', 'forward'], ['J00431', 'forward'], ['J00435', 'forward'], ['J00437', 'forward'], ['J00432', 'forward'], ['J00433', 'forward'], ['J00438', 'forward'], ['J00439', 'forward'], ['D13199', 'forward'], ['L32868', 'forward'], ['J00440', 'forward']]
 
-    datafiles_info : [['AE000665', 'reverse'], ['AF018146', 'forward'], ['AF018146', 'reverse'], ['AF019412', 'reverse'], ['AF021335', 'forward'], ['AF021335', 'reverse'], ['AF029261', 'forward'], ['AF037352', 'reverse'], ['AF044198', 'forward'], ['AJ132683', 'forward'], ['AJ231200', 'forward'], ['AJ231206', 'forward'], ['AJ231207', 'forward'], ['AJ231209', 'forward'], ['AJ231212', 'forward'], ['AJ231213', 'forward'], ['AJ231214', 'forward'], ['AJ231215', 'forward'], ['AJ231216', 'forward'], ['AJ231217', 'forward'], ['AJ231218', 'forward'], ['AJ231219', 'forward'], ['AJ231221', 'forward'], ['AJ231222', 'forward'], ['AJ231223', 'forward'], ['AJ231224', 'forward'], ['AJ231225', 'forward'], ['AJ231226', 'forward'], ['AJ231228', 'forward'], ['AJ231229', 'forward'], ['AJ231231', 'forward'], ['AJ231234', 'forward'], ['AJ231241', 'forward'], ['AJ231243', 'forward'], ['AJ231244', 'forward'], ['AJ231247', 'forward'], ['AJ231248', 'forward'], ['AJ231250', 'forward'], ['AJ231256', 'forward'], ['AJ231258', 'forward'], ['AJ231259', 'forward'], ['AJ231263', 'forward'], ['AJ231264', 'forward'], ['AJ231269', 'forward'], ['AJ231273', 'forward'], ['AJ231274', 'forward'], ['AJ235935', 'forward'], ['AJ235936', 'forward'], ['AJ235938', 'forward'], ['AJ235940', 'forward'], ['AJ235942', 'forward'], ['AJ235943', 'forward'], ['AJ235944', 'forward'], ['AJ235946', 'forward'], ['AJ235947', 'forward'], ['AJ235948', 'forward'], ['AJ235949', 'forward'], ['AJ235950', 'forward'], ['AJ235951', 'forward'], ['AJ235953', 'forward'], ['AJ235956', 'forward'], ['AJ235958', 'forward'], ['AJ235961', 'forward'], ['AJ235962', 'forward'], ['AJ235963', 'forward'], ['AJ235964', 'forward'], ['AJ235966', 'forward'], ['AJ235967', 'forward'], ['AJ235968', 'forward'], ['AJ235973', 'forward'], ['AJ235974', 'forward'], ['AJ239197', 'forward'], ['AJ239198', 'forward'], ['D00080', 'forward'], ['D00081', 'forward'], ['D00082', 'forward'], ['D13199', 'forward'], ['D13199', 'reverse'], ['J00431', 'forward'], ['J00431', 
'reverse'], ['J00432', 'forward'], ['J00432', 'reverse'], ['J00433', 'forward'], ['J00433', 'reverse'], ['J00434', 'forward'], ['J00434', 'reverse'], ['J00435', 'forward'], ['J00435', 'reverse'], ['J00436', 'forward'], ['J00436', 'reverse'], ['J00437', 'forward'], ['J00437', 'reverse'], ['J00438', 'forward'], ['J00438', 'reverse'], ['J00439', 'forward'], ['J00439', 'reverse'], ['J00440', 'forward'], ['J00440', 'reverse'], ['J00545', 'forward'], ['J00562', 'forward'], ['J00583', 'reverse'], ['J00593', 'reverse'], ['K00884', 'forward'], ['K02158', 'forward'], ['K02159', 'forward'], ['K02160', 'forward'], ['K02161', 'forward'], ['K02162', 'forward'], ['K02418', 'forward'], ['K02802', 'reverse'], ['L32868', 'forward'], ['L32868', 'reverse'], ['M14360', 'forward'], ['M15520', 'forward'], ['M23243', 'reverse'], ['M28132', 'forward'], ['M28134', 'forward'], ['M35332', 'forward'], ['M35332', 'reverse'], ['M38103', 'reverse'], ['M38105', 'reverse'], ['M54906', 'forward'], ['M60955', 'reverse'], ['M60961', 'reverse'], ['M64239', 'reverse'], ['V00813', 'reverse'], ['V01563', 'forward'], ['V01564', 'forward'], ['V01565', 'forward'], ['X00933', 'reverse'], ['X00934', 'reverse'], ['X01018', 'reverse'], ['X02843', 'reverse'], ['X02858', 'reverse'], ['X02859', 'reverse'], ['X02862', 'reverse'], ['X03056', 'reverse'], ['X03057', 'reverse'], ['X03058', 'reverse'], ['X03059', 'reverse'], ['X03761', 'reverse'], ['X05502', 'reverse'], ['X16954', 'forward'], ['X16955', 'forward'], ['X17177', 'reverse'], ['X17179', 'reverse'], ['X58411', 'reverse'], ['X58414', 'reverse'], ['Y15968', 'forward'], ['Y15972', 'forward'], ['Y15975', 'forward'], ['Y15976', 'forward'], ['Y15977', 'forward'], ['Y15978', 'forward'], ['Y15980', 'forward'], ['Y15981', 'forward'], ['Y15982', 'forward'], ['Z72382', 'forward'], ['Z72384', 'forward']]
+    genes_info : [['AE000665', 'reverse'], ['AF018146', 'forward'], ['AF018146', 'reverse'], ['AF019412', 'reverse'], ['AF021335', 'forward'], ['AF021335', 'reverse'], ['AF029261', 'forward'], ['AF037352', 'reverse'], ['AF044198', 'forward'], ['AJ132683', 'forward'], ['AJ231200', 'forward'], ['AJ231206', 'forward'], ['AJ231207', 'forward'], ['AJ231209', 'forward'], ['AJ231212', 'forward'], ['AJ231213', 'forward'], ['AJ231214', 'forward'], ['AJ231215', 'forward'], ['AJ231216', 'forward'], ['AJ231217', 'forward'], ['AJ231218', 'forward'], ['AJ231219', 'forward'], ['AJ231221', 'forward'], ['AJ231222', 'forward'], ['AJ231223', 'forward'], ['AJ231224', 'forward'], ['AJ231225', 'forward'], ['AJ231226', 'forward'], ['AJ231228', 'forward'], ['AJ231229', 'forward'], ['AJ231231', 'forward'], ['AJ231234', 'forward'], ['AJ231241', 'forward'], ['AJ231243', 'forward'], ['AJ231244', 'forward'], ['AJ231247', 'forward'], ['AJ231248', 'forward'], ['AJ231250', 'forward'], ['AJ231256', 'forward'], ['AJ231258', 'forward'], ['AJ231259', 'forward'], ['AJ231263', 'forward'], ['AJ231264', 'forward'], ['AJ231269', 'forward'], ['AJ231273', 'forward'], ['AJ231274', 'forward'], ['AJ235935', 'forward'], ['AJ235936', 'forward'], ['AJ235938', 'forward'], ['AJ235940', 'forward'], ['AJ235942', 'forward'], ['AJ235943', 'forward'], ['AJ235944', 'forward'], ['AJ235946', 'forward'], ['AJ235947', 'forward'], ['AJ235948', 'forward'], ['AJ235949', 'forward'], ['AJ235950', 'forward'], ['AJ235951', 'forward'], ['AJ235953', 'forward'], ['AJ235956', 'forward'], ['AJ235958', 'forward'], ['AJ235961', 'forward'], ['AJ235962', 'forward'], ['AJ235963', 'forward'], ['AJ235964', 'forward'], ['AJ235966', 'forward'], ['AJ235967', 'forward'], ['AJ235968', 'forward'], ['AJ235973', 'forward'], ['AJ235974', 'forward'], ['AJ239197', 'forward'], ['AJ239198', 'forward'], ['D00080', 'forward'], ['D00081', 'forward'], ['D00082', 'forward'], ['D13199', 'forward'], ['D13199', 'reverse'], ['J00431', 'forward'], ['J00431', 
'reverse'], ['J00432', 'forward'], ['J00432', 'reverse'], ['J00433', 'forward'], ['J00433', 'reverse'], ['J00434', 'forward'], ['J00434', 'reverse'], ['J00435', 'forward'], ['J00435', 'reverse'], ['J00436', 'forward'], ['J00436', 'reverse'], ['J00437', 'forward'], ['J00437', 'reverse'], ['J00438', 'forward'], ['J00438', 'reverse'], ['J00439', 'forward'], ['J00439', 'reverse'], ['J00440', 'forward'], ['J00440', 'reverse'], ['J00545', 'forward'], ['J00562', 'forward'], ['J00583', 'reverse'], ['J00593', 'reverse'], ['K00884', 'forward'], ['K02158', 'forward'], ['K02159', 'forward'], ['K02160', 'forward'], ['K02161', 'forward'], ['K02162', 'forward'], ['K02418', 'forward'], ['K02802', 'reverse'], ['L32868', 'forward'], ['L32868', 'reverse'], ['M14360', 'forward'], ['M15520', 'forward'], ['M23243', 'reverse'], ['M28132', 'forward'], ['M28134', 'forward'], ['M35332', 'forward'], ['M35332', 'reverse'], ['M38103', 'reverse'], ['M38105', 'reverse'], ['M54906', 'forward'], ['M60955', 'reverse'], ['M60961', 'reverse'], ['M64239', 'reverse'], ['V00813', 'reverse'], ['V01563', 'forward'], ['V01564', 'forward'], ['V01565', 'forward'], ['X00933', 'reverse'], ['X00934', 'reverse'], ['X01018', 'reverse'], ['X02843', 'reverse'], ['X02858', 'reverse'], ['X02859', 'reverse'], ['X02862', 'reverse'], ['X03056', 'reverse'], ['X03057', 'reverse'], ['X03058', 'reverse'], ['X03059', 'reverse'], ['X03761', 'reverse'], ['X05502', 'reverse'], ['X16954', 'forward'], ['X16955', 'forward'], ['X17177', 'reverse'], ['X17179', 'reverse'], ['X58411', 'reverse'], ['X58414', 'reverse'], ['Y15968', 'forward'], ['Y15972', 'forward'], ['Y15975', 'forward'], ['Y15976', 'forward'], ['Y15977', 'forward'], ['Y15978', 'forward'], ['Y15980', 'forward'], ['Y15981', 'forward'], ['Y15982', 'forward'], ['Z72382', 'forward'], ['Z72384', 'forward']]
 
     subset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
 
 human12rss:
 
     fasta: HS12RSS.fasta
-    datafile: human12rss.fasta
+    gene: human12rss.fasta
     rows: 111
     cols: 28
     seqlen: 28
 
     # 5' to 3' is forward and 3' to 5' is reverse
 
-    datafiles_info : [['AP001216', 'forward'], ['D88255', 'forward'], ['J00232', 'reverse'], ['J00256', 'forward'], ['K02545', 'reverse'], ['M14158', 'reverse'], ['M14159', 'reverse'], ['M14905', 'reverse'], ['M15641', 'reverse'], ['M16747', 'reverse'], ['M18338', 'reverse'], ['M21508', 'reverse'], ['M22153', 'reverse'], ['M23090', 'forward'], ['M35619', 'reverse'], ['M35620', 'reverse'], ['M35622', 'reverse'], ['M35648', 'forward'], ['M64855', 'forward'], ['M64856', 'forward'], ['M64858', 'forward'], ['M94081', 'reverse'], ['U41643', 'forward'], ['V00558', 'forward'], ['X01668', 'forward'], ['X02884', 'reverse'], ['X02885', 'reverse'], ['X02886', 'reverse'], ['X02987', 'reverse'], ['X04457', 'reverse'], ['X05773', 'reverse'], ['X05775', 'reverse'], ['X12683', 'forward'], ['X12684', 'forward'], ['X12687', 'forward'], ['X12691', 'forward'], ['X13972', 'forward'], ['X13972', 'reverse'], ['X17264', 'forward'], ['X51755', 'reverse'], ['X51887', 'forward'], ['X59312', 'forward'], ['X59314', 'forward'], ['X59316', 'forward'], ['X59318', 'forward'], ['X63392', 'forward'], ['X63395', 'forward'], ['X63396', 'forward'], ['X63397', 'forward'], ['X63398', 'forward'], ['X63402', 'forward'], ['X63403', 'forward'], ['X72817', 'forward'], ['X93614', 'forward'], ['X93614', 'reverse'], ['X93616', 'forward'], ['X97051', 'forward'], ['X97051', 'reverse'], ['Z00001', 'forward'], ['Z00008', 'forward'], ['Z00013', 'forward'], ['Z00014', 'forward'], ['Z00023', 'forward']]
+    genes_info : [['AP001216', 'forward'], ['D88255', 'forward'], ['J00232', 'reverse'], ['J00256', 'forward'], ['K02545', 'reverse'], ['M14158', 'reverse'], ['M14159', 'reverse'], ['M14905', 'reverse'], ['M15641', 'reverse'], ['M16747', 'reverse'], ['M18338', 'reverse'], ['M21508', 'reverse'], ['M22153', 'reverse'], ['M23090', 'forward'], ['M35619', 'reverse'], ['M35620', 'reverse'], ['M35622', 'reverse'], ['M35648', 'forward'], ['M64855', 'forward'], ['M64856', 'forward'], ['M64858', 'forward'], ['M94081', 'reverse'], ['U41643', 'forward'], ['V00558', 'forward'], ['X01668', 'forward'], ['X02884', 'reverse'], ['X02885', 'reverse'], ['X02886', 'reverse'], ['X02987', 'reverse'], ['X04457', 'reverse'], ['X05773', 'reverse'], ['X05775', 'reverse'], ['X12683', 'forward'], ['X12684', 'forward'], ['X12687', 'forward'], ['X12691', 'forward'], ['X13972', 'forward'], ['X13972', 'reverse'], ['X17264', 'forward'], ['X51755', 'reverse'], ['X51887', 'forward'], ['X59312', 'forward'], ['X59314', 'forward'], ['X59316', 'forward'], ['X59318', 'forward'], ['X63392', 'forward'], ['X63395', 'forward'], ['X63396', 'forward'], ['X63397', 'forward'], ['X63398', 'forward'], ['X63402', 'forward'], ['X63403', 'forward'], ['X72817', 'forward'], ['X93614', 'forward'], ['X93614', 'reverse'], ['X93616', 'forward'], ['X97051', 'forward'], ['X97051', 'reverse'], ['Z00001', 'forward'], ['Z00008', 'forward'], ['Z00013', 'forward'], ['Z00014', 'forward'], ['Z00023', 'forward']]
 
     subset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
     from utils import file_not_exist, create_engine_wrapper, get_conf, pg_url
     from dbutils import schema_exists
     from sqlalchemy.orm import mapper, relation, sessionmaker
-    from dbschema import make_sequencegroup_table, make_simgroup_table, make_simdata_table, make_simresult_table, make_model_table
+    from dbschema import make_simgroup_table, make_simdata_table, make_simresult_table, make_model_table
     import cPickle, random, cpplib, getmodel
     conf = get_conf()
     dbuser = conf["dbuser"]
     print "making tables corresponding to schema %s and dbstring %s"%(schema, dbstring)
     from sqlalchemy import MetaData
     meta = MetaData()
-    sequencegroup_table, Sequencegroup = make_sequencegroup_table(meta, schema)
     simgroup_table, Simgroup = make_simgroup_table(meta, schema)
     simdata_table, Simdata = make_simdata_table(meta, schema)
     simresult_table, Simresult = make_simresult_table(meta, schema)
     d['db'] = db
     d['meta'] = meta
     d['Model'] = Model
-    d['Sequencegroup'] = Sequencegroup
     d['Simgroup'] = Simgroup
     d['Simdata'] = Simdata
     d['Simresult'] = Simresult
     Add model specified to db.
     """
     from sqlalchemy.orm import sessionmaker
-    tabledict = create_motif_tables(schema)
+    tabledict = create_sequence_tables(schema)
     Model = tabledict["Model"]
     confval = create_db()
     db = confval["db"]
     db.dispose()
     return model, model_id
 
-def create_motif_tables(schema, result={}):
-    """Create all motif-related tables in a particular schema"""
+def create_sequence_tables(schema, result={}):
+    """Create all sequence-related tables in a particular schema"""
     if result:
         return result
     import os, sys, time
     from utils import file_not_exist, create_engine_wrapper, get_conf, pg_url
     from dbutils import schema_exists
     from sqlalchemy.orm import mapper, relation, sessionmaker
-    from dbschema import make_crossvalgroup_table, make_crossval_table, make_motifstat_table, make_motifstatgroup_table, make_sequencegroup_table, make_motifgroup_table, make_motifgroup_table_index, make_motif_table, make_model_table, make_datagroup_table, make_datasubgroup_table, make_datafile_table, make_data_table
+    from dbschema import make_crossvalgroup_table, make_crossval_table, make_sequencestat_table, make_sequencestatgroup_table, make_sequencegroup_table, make_sequencegroup_table_index, make_sequence_table, make_model_table, make_datagroup_table, make_datasubgroup_table, make_gene_table, make_data_table
     import cPickle, random, cpplib, getmodel
     confval = create_db()
     dbname = confval["dbname"]
     meta = MetaData()
     crossvalgroup_table, Crossvalgroup = make_crossvalgroup_table(meta, schema)
     crossval_table, Crossval = make_crossval_table(meta, schema)
-    motifstat_table, Motifstat = make_motifstat_table(meta, schema)
-    motifstatgroup_table, Motifstatgroup = make_motifstatgroup_table(meta, schema)
+    sequencestat_table, Sequencestat = make_sequencestat_table(meta, schema)
+    sequencestatgroup_table, Sequencestatgroup = make_sequencestatgroup_table(meta, schema)
     sequencegroup_table, Sequencegroup = make_sequencegroup_table(meta, schema)
-    motifgroup_table, Motifgroup = make_motifgroup_table(meta, schema)
-    make_motifgroup_table_index(motifgroup_table)
-    motif_table, Motif = make_motif_table(meta, schema)
+    make_sequencegroup_table_index(sequencegroup_table)
+    sequence_table, Sequence = make_sequence_table(meta, schema)
     model_table, Model = make_model_table(meta, schema)
     datagroup_table, Datagroup = make_datagroup_table(meta, schema)
     datasubgroup_table, Datasubgroup = make_datasubgroup_table(meta, schema)
-    datafile_table, Datafile = make_datafile_table(meta, schema)
+    gene_table, Gene = make_gene_table(meta, schema)
     data_table, Data = make_data_table(meta, schema)
     meta.bind = db
     meta.create_all()
     d['meta'] = meta
     d['Crossvalgroup'] = Crossvalgroup
     d['Crossval'] = Crossval
-    d['Motifstatgroup'] = Motifstatgroup
-    d['Motifstat'] = Motifstat
+    d['Sequencestatgroup'] = Sequencestatgroup
+    d['Sequencestat'] = Sequencestat
     d['Sequencegroup'] = Sequencegroup
-    d['Motifgroup'] = Motifgroup
-    d['Motif'] = Motif
+    d['Sequence'] = Sequence
     d['Model'] = Model
     d['Datagroup'] = Datagroup
     d['Datasubgroup'] = Datasubgroup
-    d['Datafile'] = Datafile
+    d['Gene'] = Gene
     d['Data'] = Data
     result.update(d)
     return result
         return str(numpy_string)
     register_adapter(numpy.string_, addapt_numpy_string)
 
-def load_motif_dataset(args):
-    """Load RSS sequences from fasta file into Motif table. If
+def load_sequence_dataset(args):
+    """Load RSS sequences from fasta file into Sequence table. If
     model_id is specified, simulate sequences rather than reading from
     fasta file"""
     schema, subset, simulated = args.dataset, args.subset, args.simulated
     import cpplib
     from getmodel import writearr
     from utils import get_conf
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Model = d['Model']
     Sequencegroup = d['Sequencegroup']
-    Motifgroup = d['Motifgroup']
-    Motif = d['Motif']
+    Sequence = d['Sequence']
     conf = get_conf()
     if subset==None and 'subset' in conf[schema]:
         subset = conf[schema]["subset"]
 
     # If simulated = 'f', then check that subset does not already
     # exist. This is the same as is enforced at the db level by
-    # 'make_motifgroup_table_index' in dbschema.py.
-    if(session.query(Motifgroup).filter(Motifgroup.subset==subset).filter(Motifgroup.simulated==False).count() == 1):
-        print "Motif dataset already loaded, skipping"
+    # 'make_sequencegroup_table_index' in dbschema.py.
+    if(session.query(Sequencegroup).filter(Sequencegroup.subset==subset).filter(Sequencegroup.simulated==False).count() == 1):
+        print "Sequence dataset already loaded, skipping"
         return
 
     if subset is not None:
     mod = cpplib.cpp_model(cols, model)
     model, model_id = write_model_to_db(schema, cols, model)
 
-    # Add Motifgroup object
-    seqg = Sequencegroup()
-    session.add(seqg)
+    # The Sequencegroup row is created below, once sequencesimnum is resolved
     session.commit()
-    if args.motifsimnum is not None and not simulated:
-        sys.exit("--motifsimnum option is only used when dataset is simulated")
-    elif args.motifsimnum is not None and simulated:
-        motifsimnum = args.motifsimnum
+    if args.sequencesimnum is not None and not simulated:
+        sys.exit("--sequencesimnum option is only used when dataset is simulated")
+    elif args.sequencesimnum is not None and simulated:
+        sequencesimnum = args.sequencesimnum
     elif simulated:
-        motifsimnum = conf["motifsimnum"]
+        sequencesimnum = conf["sequencesimnum"]
     else:
-        motifsimnum = None
-    mgroup = Motifgroup(seqg.id, subset, model_id, simulated, motifsimnum)
+        sequencesimnum = None
+    mgroup = Sequencegroup(subset, model_id, simulated, sequencesimnum)
     session.add(mgroup)
     session.commit()
-    motifgroup_id = mgroup.id
+    sequencegroup_id = mgroup.id
     try:
         if simulated:
-            arr = numpy.array(cpplib.cpp_sim_with_data_prob(mod, arr, motifsimnum))
+            arr = numpy.array(cpplib.cpp_sim_with_data_prob(mod, arr, sequencesimnum))
         # Add sequences to db
         mlst = []
         for row in arr:
             seq = ''.join(row)
-            mlst.append(Motif(motifgroup_id, seq))
+            mlst.append(Sequence(sequencegroup_id, seq))
         session.add_all(mlst)
         session.commit()
         session.flush()
         session.close()
         db.dispose()
-        return motifgroup_id
+        return sequencegroup_id
     except:
         session.delete(mgroup)
         session.commit()
     from crossval import crossval
     from utils import arr_to_list, permute_array, get_conf
     import decimal, numpy
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossval = d['Crossval']
     Crossvalgroup = d['Crossvalgroup']
     Model = d['Model']
-    Motifgroup = d['Motifgroup']
+    Sequencegroup = d['Sequencegroup']
     Session = sessionmaker(bind=db)
     session = Session()
     if permuted==True:
         else:
             samplenum = conf["samplenum"]
     subset = conf[schema]["subset"]
-    # motifgroup_id for true motif group (RSS)
-    motifgroup_id = session.query(Motifgroup).filter(Motifgroup.subset==subset).filter(Motifgroup.simulated==False).one().id
-    sequences = session.execute("SELECT id, sequence from %s.motif where motif.motifgroup_id=%s;"%(schema, motifgroup_id)).fetchall()
+    # sequencegroup_id for true sequence group (RSS)
+    sequencegroup_id = session.query(Sequencegroup).filter(Sequencegroup.subset==subset).filter(Sequencegroup.simulated==False).one().id
+    sequences = session.execute("SELECT id, sequence from %s.sequence where sequence.sequencegroup_id=%s;"%(schema, sequencegroup_id)).fetchall()
     cols = len(sequences[0][1])
     t1 = time.time()
     # run model search
     elif not search:
         samplelst = crossval(sequences, samplenum, runs, convergeruns, search)
     t2 = time.time()
-    cg = Crossvalgroup(permuted, samplenum, t2-t1, motifgroup_id)
+    cg = Crossvalgroup(permuted, samplenum, t2-t1, sequencegroup_id)
     session.add(cg)
     session.commit()
 
     return result
 
 @print_timing
-def load_all_motifstat(args):
+def load_all_sequencestat(args):
     """
-    Calls load_motifstat for every entry in motifgroup for which an
-    entry in motifstatgroup with the same motifgroup id is not already
+    Calls load_sequencestat for every entry in sequencegroup for which an
+    entry in sequencestatgroup with the same sequencegroup id is not already
     present.
     """
     schema, numsim = args.dataset, args.numsim
     if numsim is None:
         numsim = conf["numsim"]
     subset = conf[schema]["subset"]
-    # check for motifgroup entries with given subset
-    d = create_motif_tables(schema)
+    # check for sequencegroup entries with given subset
+    d = create_sequence_tables(schema)
     db = d['db']
-    Motifgroup = d['Motifgroup']
-    Motifstatgroup = d['Motifstatgroup']
+    Sequencegroup = d['Sequencegroup']
+    Sequencestatgroup = d['Sequencestatgroup']
     Session = sessionmaker(bind=db)
     session = Session()
-    if session.query(Motifgroup).filter(Motifgroup.subset==subset).count() == 0:
-        sys.exit("no motifgroup entries with subset %s"%(subset))
-    motifgroup_ids = [mg.id for mg in session.query(Motifgroup).filter(Motifgroup.subset==subset).all()]
-    # check whether corresponding entries exist in Motifstatgroup already
-    for motifgroup_id in motifgroup_ids:
-        if session.query(Motifstatgroup).filter(Motifstatgroup.motifgroup_id==motifgroup_id).count()==0:
-            load_motifstat(schema, motifgroup_id, numsim)
+    if session.query(Sequencegroup).filter(Sequencegroup.subset==subset).count() == 0:
+        sys.exit("no sequencegroup entries with subset %s"%(subset))
+    sequencegroup_ids = [mg.id for mg in session.query(Sequencegroup).filter(Sequencegroup.subset==subset).all()]
+    # check whether corresponding entries exist in Sequencestatgroup already
+    for sequencegroup_id in sequencegroup_ids:
+        if session.query(Sequencestatgroup).filter(Sequencestatgroup.sequencegroup_id==sequencegroup_id).count()==0:
+            load_sequencestat(schema, sequencegroup_id, numsim)
 
 @print_timing
-def load_motifstat(schema, motifgroup_id, numsim):
+def load_sequencestat(schema, sequencegroup_id, numsim):
     """
     This loads pvalues and conditional expected statistics into the
-    table Motifstat for motifs in the group whose id is
-    'motifgroup_id'. 'logjtdistlst' is the list of values from the
+    table Sequencestat for sequences in the group whose id is
+    'sequencegroup_id'. 'logjtdistlst' is the list of values from the
     joint distribution simulated by MCMC.
     """
     import cpplib, cPickle, getmodel, random, time
     conf = get_conf()
     # if numsim is None:
     #     numsim = conf["numsim"]
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
-    Motifstatgroup = d['Motifstatgroup']
+    Sequencestatgroup = d['Sequencestatgroup']
     Model = d['Model']
-    Motifstat = d['Motifstat']
+    Sequencestat = d['Sequencestat']
     Session = sessionmaker(bind=db)
     session = Session()
-    msg = Motifstatgroup(motifgroup_id, numsim)
+    msg = Sequencestatgroup(sequencegroup_id, numsim)
     session.add(msg)
     session.commit()
-    arr = arr_from_sequence_table(schema, motifgroup_id)
+    arr = arr_from_sequence_table(schema, sequencegroup_id)
     modval = conf[schema]["model"]
     mod = cpplib.cpp_model(arr.shape[1], modval)
     pvalues = cpplib.cpp_pval_stat(arr, mod, numsim)
     session.commit()
     pvallst = []
     for pval in pvalues:
-        pvallst.append(Motifstat(msg.motifgroup_id, pval[0], pval[1]))
+        pvallst.append(Sequencestat(msg.sequencegroup_id, pval[0], pval[1]))
     session.add_all(pvallst)
     session.commit()
     session.flush()
     session.close()
     db.dispose()
 
-def arr_from_sequence_table(schema, motifgroup_id):
+def arr_from_sequence_table(schema, sequencegroup_id):
     """Convert list of sequences into an array"""
     import numpy
     from sqlalchemy.orm import sessionmaker
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Session = sessionmaker(bind=db)
     session = Session()
-    sequences = session.execute("SELECT sequence from %s.motif where motif.motifgroup_id=%s;"%(schema, motifgroup_id)).fetchall()
+    sequences = session.execute("SELECT sequence from %s.sequence where sequence.sequencegroup_id=%s;"%(schema, sequencegroup_id)).fetchall()
     seqlst = []
     for seq in sequences:
         seqlst.append([s for s in str(seq[0])])
     from getmodel import writearr, getmodel
     from utils import get_conf
     numpy.set_printoptions(threshold='nan')
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Model = d['Model']
     Session = sessionmaker(bind=db)
 
 def drop_data_table_constraints(schema, session):
     session.execute("select * from drop_constraint_if_exists('data', 'data_pkey', '%s');"%(schema))
-    session.execute("select * from drop_constraint_if_exists('data', 'data_datafile_id_fkey', '%s');"%(schema))
+    session.execute("select * from drop_constraint_if_exists('data', 'data_gene_id_fkey', '%s');"%(schema))
 
 def restore_data_table_constraints(schema, session):
     session.execute("ALTER TABLE ONLY %s.data ADD CONSTRAINT data_pkey PRIMARY KEY (id);"%(schema))
-    session.execute("ALTER TABLE ONLY %s.data ADD CONSTRAINT data_datafile_id_fkey FOREIGN KEY (datafile_id) REFERENCES %s.datafile(id) ON UPDATE CASCADE ON DELETE CASCADE;"%(schema, schema))
+    session.execute("ALTER TABLE ONLY %s.data ADD CONSTRAINT data_gene_id_fkey FOREIGN KEY (gene_id) REFERENCES %s.gene(id) ON UPDATE CASCADE ON DELETE CASCADE;"%(schema, schema))
 
 def write_data_file(arglist):
-    datatablefile, startpos, endpos, datastr, sequences, model, arrfreqlst, rownum, logjtdistlst, simstatlst, numsim, datasubgroup_id, datafile_id, seqlen, fast, neglogppcutoff = arglist['datatablefile'], arglist['startpos'], arglist['endpos'], arglist['datastr'], arglist['sequences'], arglist['model'], arglist['arrfreqlst'], arglist['rownum'], arglist['logjtdistlst'], arglist['simstatlst'], arglist['numsim'], arglist['datasubgroup_id'], arglist['datafile_id'], arglist['seqlen'], arglist['fast'], arglist['neglogppcutoff']
+    datatablefile, startpos, endpos, datastr, sequences, model, arrfreqlst, rownum, logjtdistlst, simstatlst, numsim, datasubgroup_id, gene_id, seqlen, fast, neglogppcutoff = arglist['datatablefile'], arglist['startpos'], arglist['endpos'], arglist['datastr'], arglist['sequences'], arglist['model'], arglist['arrfreqlst'], arglist['rownum'], arglist['logjtdistlst'], arglist['simstatlst'], arglist['numsim'], arglist['datasubgroup_id'], arglist['gene_id'], arglist['seqlen'], arglist['fast'], arglist['neglogppcutoff']
     import cpplib
     try:
-        return cpplib.cpp_write_data_file_partial(datatablefile, startpos, endpos, datastr, sequences, model, arrfreqlst, rownum, logjtdistlst, simstatlst, numsim, datasubgroup_id, datafile_id, seqlen, fast, neglogppcutoff)
+        return cpplib.cpp_write_data_file_partial(datatablefile, startpos, endpos, datastr, sequences, model, arrfreqlst, rownum, logjtdistlst, simstatlst, numsim, datasubgroup_id, gene_id, seqlen, fast, neglogppcutoff)
     except Exception, e:
         # Work around Python bug http://bugs.python.org/issue1692335 See also
         # http://stackoverflow.com/questions/8785899/hang-in-python-script-using-sqlalchemy-and-multiprocessing
         import sys
         raise Exception(e.__class__.__name__ + ": " +str(e)), None, sys.exc_info()[2]
 
-def get_datafile(session, tabledict, accnum, orientation):
-    Datafile = tabledict['Datafile']
-    dfqueryresult = session.query(Datafile).filter(Datafile.accnum==accnum).filter(Datafile.orientation==orientation)
+def get_gene(session, tabledict, accnum, orientation):
+    Gene = tabledict['Gene']
+    dfqueryresult = session.query(Gene).filter(Gene.accnum==accnum).filter(Gene.orientation==orientation)
     if dfqueryresult.count()!=0:
         return dfqueryresult[0]
     else:
-        df = Datafile(accnum, orientation)
+        df = Gene(accnum, orientation)
         session.add(df)
         session.commit()
         return df
 @print_timing
 def load_file(schema, datatablefile, accnum, orientation, sequences, dataseqdict, model, arrfreqlst, rownum, logjtdistlst, simstatlst, numsim, tabledict, session, datalst, datasubgroup_id, splitnum, seqlen, fast, neglogppcutoff, tmpdir):
     """
-    This loads sequences from a datafile. which is a section of the
+    This loads sequences from a gene, which is a section of the
     genome, along with some associated statistics, 'margstat' and
     'pvalue', into the Data tables. This function first calls
     'write_data_file' to write data files, then loads those files into
     from utils import get_conf
     conf = get_conf()
     Data = tabledict['Data']
-    df = get_datafile(session, tabledict, accnum, orientation)
+    df = get_gene(session, tabledict, accnum, orientation)
     fasta = os.path.abspath(os.path.expanduser(os.path.join(conf["datadir"], accnum + ".fasta")))
     if orientation == "forward":
         datastr = dataseqdict[accnum]
     # multiple processes here.
     if splitsize == 0:
         splitnum = 1
-        write_data_file({'datatablefile':datatablefile+"."+str(0), 'startpos':0, 'endpos':datastrlen-seqlen, 'datastr':datastr, 'sequences':sequences, 'model':model, 'arrfreqlst':arrfreqlst, 'rownum':rownum, 'logjtdistlst':logjtdistlst, 'simstatlst':simstatlst, 'numsim':numsim, 'datasubgroup_id':datasubgroup_id, 'datafile_id':df.id, 'seqlen':seqlen, 'fast':fast, 'neglogppcutoff':neglogppcutoff})
+        write_data_file({'datatablefile':datatablefile+"."+str(0), 'startpos':0, 'endpos':datastrlen-seqlen, 'datastr':datastr, 'sequences':sequences, 'model':model, 'arrfreqlst':arrfreqlst, 'rownum':rownum, 'logjtdistlst':logjtdistlst, 'simstatlst':simstatlst, 'numsim':numsim, 'datasubgroup_id':datasubgroup_id, 'gene_id':df.id, 'seqlen':seqlen, 'fast':fast, 'neglogppcutoff':neglogppcutoff})
 
     # Write data files
     if splitsize > 0:
         for i in range(splitnum-1):
-            arglist.append({'datatablefile':datatablefile+"."+str(i), 'startpos':i*splitsize, 'endpos':(i+1)*splitsize - 1, 'datastr':datastr, 'sequences':sequences, 'model':model, 'arrfreqlst':arrfreqlst, 'rownum':rownum, 'logjtdistlst':logjtdistlst, 'simstatlst':simstatlst, 'numsim':numsim, 'datasubgroup_id':datasubgroup_id, 'datafile_id':df.id, 'seqlen':seqlen, 'fast':fast, 'neglogppcutoff':neglogppcutoff})
+            arglist.append({'datatablefile':datatablefile+"."+str(i), 'startpos':i*splitsize, 'endpos':(i+1)*splitsize - 1, 'datastr':datastr, 'sequences':sequences, 'model':model, 'arrfreqlst':arrfreqlst, 'rownum':rownum, 'logjtdistlst':logjtdistlst, 'simstatlst':simstatlst, 'numsim':numsim, 'datasubgroup_id':datasubgroup_id, 'gene_id':df.id, 'seqlen':seqlen, 'fast':fast, 'neglogppcutoff':neglogppcutoff})
         # Process to load the remainder of the file sequence that is less than splitsize
-        arglist.append({'datatablefile':datatablefile+"."+str(splitnum-1), 'startpos':(splitnum-1)*splitsize, 'endpos':datastrlen - seqlen, 'datastr':datastr, 'sequences':sequences, 'model':model, 'arrfreqlst':arrfreqlst, 'rownum':rownum, 'logjtdistlst':logjtdistlst, 'simstatlst':simstatlst, 'numsim':numsim, 'datasubgroup_id':datasubgroup_id, 'datafile_id':df.id, 'seqlen':seqlen, 'fast':fast, 'neglogppcutoff':neglogppcutoff})
+        arglist.append({'datatablefile':datatablefile+"."+str(splitnum-1), 'startpos':(splitnum-1)*splitsize, 'endpos':datastrlen - seqlen, 'datastr':datastr, 'sequences':sequences, 'model':model, 'arrfreqlst':arrfreqlst, 'rownum':rownum, 'logjtdistlst':logjtdistlst, 'simstatlst':simstatlst, 'numsim':numsim, 'datasubgroup_id':datasubgroup_id, 'gene_id':df.id, 'seqlen':seqlen, 'fast':fast, 'neglogppcutoff':neglogppcutoff})
 
     r = pool.map_async(write_data_file, arglist)
     r.get()
     drop_data_table_constraints(schema, session)
     # read data files into database
     for i in range(splitnum):
-        session.execute("COMMIT; BEGIN; COPY %s.data(datasubgroup_id, datafile_id, sequence, index, seqindex, neglogpp, pvalue) FROM '%s' WITH DELIMITER AS ',' NULL AS ''; COMMIT;"%(schema, os.path.join(dirpath, datatablefile+"."+str(i))))
+        session.execute("COMMIT; BEGIN; COPY %s.data(datasubgroup_id, gene_id, sequence, index, seqindex, neglogpp, pvalue) FROM '%s' WITH DELIMITER AS ',' NULL AS ''; COMMIT;"%(schema, os.path.join(dirpath, datatablefile+"."+str(i))))
     restore_data_table_constraints(schema, session)
 
 @print_timing
     import cpplib, getmodel, numpy, os, time
     from sqlalchemy.orm import sessionmaker
     from utils import get_conf, randstr, safe_mkdir
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossval = d['Crossval']
     Datagroup = d['Datagroup']
 @print_timing
 def load_allseq_crossval_subgroup(schema, datasubgroup_id, numsim, fast):
     """
-    This loads sequences from a datafile. which is a section of the
+    This loads sequences from a gene. which is a section of the
     genome, along with some associated statistics, 'margstat' and
     'pvalue', into the tables Datagroup and Data. Loading of the Data
     tables is delegated to the function 'load_file'.
     import cpplib, getmodel, numpy, os, time
     from sqlalchemy.orm import sessionmaker
     from utils import get_conf, randstr, safe_mkdir
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossval = d['Crossval']
     Datagroup = d['Datagroup']
     Datasubgroup = d['Datasubgroup']
     Model = d['Model']
-    Motif = d['Motif']
-    Motifgroup = d['Motifgroup']
+    Sequence = d['Sequence']
+    Sequencegroup = d['Sequencegroup']
     Session = sessionmaker(bind=db)
     session = Session()
     t1 = time.time()
         numsim = conf["numsim"]
     splitnum = conf["splitnum"]
     neglogppcutoff = conf[schema]["neglogppcutoff"]
-    motifgroup_id = session.query(Motifgroup).filter(Motifgroup.simulated==False).filter(Motifgroup.subset==subset).one().id
+    sequencegroup_id = session.query(Sequencegroup).filter(Sequencegroup.simulated==False).filter(Sequencegroup.subset==subset).one().id
     dsg = session.query(Datasubgroup).filter(Datasubgroup.id==datasubgroup_id).one()
     crossval_id = dsg.crossval_id
     datagroup_id = dsg.datagroup_id
         print "warning: model id missing in crossval table for crossval id %s - using model id in config"%crossval_id
         modval = conf[schema]["model"]
     idsubset = session.query(Crossval).filter(Crossval.id==crossval_id).one().sample
-    totids = [m.id for m in session.query(Motif).filter(Motif.motifgroup_id==motifgroup_id).all()]
+    totids = [m.id for m in session.query(Sequence).filter(Sequence.sequencegroup_id==sequencegroup_id).all()]
     # check that idsubset is contained in totids
     if not set(idsubset) <= set(totids):
         raise ValueError("idsubset %s is not a subset of totids %s for crossval id %s. Aborting"%(idsubset, totids, crossval_id))
     idlst = tuple(set(totids).difference(set(idsubset)))
-    sequences = session.execute("SELECT sequence from %(schema)s.motif where motif.id in %(idlst)s"%{"schema":schema, "idlst":idlst}).fetchall()
+    sequences = session.execute("SELECT sequence from %(schema)s.sequence where sequence.id in %(idlst)s"%{"schema":schema, "idlst":idlst}).fetchall()
     seqlst = []
     for seq in sequences:
         seqlst.append([s for s in str(seq[0])])
     modlen = cpplib.cpp_modlen(model)
     if modlen != len(subset):
         raise ValueError("the length of the model and the subset in load_allseq must be the same, but the model is %s and the subset is %s"%(model, subset))
-    sqresult = session.execute("SELECT sequence, id from %(schema)s.motif"%{"schema":schema}).fetchall()
+    sqresult = session.execute("SELECT sequence, id from %(schema)s.sequence"%{"schema":schema}).fetchall()
     seqdict = {}
     for s in sqresult:
         if str(s[0]) in sqresult:
     tmpdir = os.path.abspath(os.path.expanduser(os.path.join(conf['tmpdir'], 'tmp_%s'%rstr)))
     try:
         safe_mkdir(tmpdir)
-        datafiles_info = conf[schema]["datafiles_info"]
-        dataseqdict = fastatodict(os.path.join(conf['datadir'], conf[schema]["datafile"]))
-        for f in datafiles_info:
+        genes_info = conf[schema]["genes_info"]
+        dataseqdict = fastatodict(os.path.join(conf['datadir'], conf[schema]["gene"]))
+        for f in genes_info:
             datatablefile = os.path.join(tmpdir, f[0]+"_"+f[1].split(".")[0]+".csv")
             load_file(schema, datatablefile, f[0], f[1], seqdict, dataseqdict, model, arrfreqlst, rownum, logjtdistlst, simstatlst, numsim, d, session, datalst, datasubgroup_id, splitnum, seqlen, fast, neglogppcutoff, tmpdir)
         session.add_all(datalst)
         session.close()
         db.dispose()
 
-def load_all_motif_tables(args):
-    schema, fast, motifsimnum, numsim, samplenum, permuted, simulated, search = args.dataset, args.fast, args.motifsimnum, args.numsim, args.samplenum, args.permuted, args.simulated, args.search
+def load_all_sequence_tables(args):
+    schema, fast, sequencesimnum, numsim, samplenum, permuted, simulated, search = args.dataset, args.fast, args.sequencesimnum, args.numsim, args.samplenum, args.permuted, args.simulated, args.search
     from utils import get_conf
     conf = get_conf()
     args.subset = conf[schema]["subset"]
-    from load import load_motif_dataset, load_crossval_dataset, load_motifstat, load_allseq
-    args.motifgroup_id = load_motif_dataset(args)
+    from load import load_sequence_dataset, load_crossval_dataset, load_sequencestat, load_allseq
+    args.sequencegroup_id = load_sequence_dataset(args)
     args.crossvalgroup_id = load_crossval_dataset(args)
-    load_all_motifstat(args)
+    load_all_sequencestat(args)
     load_allseq(args)
 
 def testdb(schema):
     import numpy
     from sqlalchemy.orm import sessionmaker
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossvalgroup = d['Crossvalgroup']
     Crossval = d['Crossval']
 def select_sequence(schema):
     import numpy
     from sqlalchemy.orm import sessionmaker
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Sequence = d['Sequence']
     Session = sessionmaker(bind=db)
     db = confval["db"]
     meta = tabledict["meta"]
     Model = tabledict["Model"]
-    Sequencegroup = tabledict["Sequencegroup"]
     Simgroup = tabledict["Simgroup"]
     Simdata = tabledict["Simdata"]
     Session = sessionmaker(bind=db)
     session = Session()
-    seqg = Sequencegroup()
-    session.add(seqg)
     session.commit()
     modelquery = session.query(Model)
     # In this case, the model gets added to the db
     colnum, model, model_id = write_config_model_to_db(schema, colnum, model_id, default_model, empty_model)
 
     mod = cpplib.cpp_model(colnum, model)
-    simg = Simgroup(seqg.id, rownum, colnum, model_id)
+    simg = Simgroup(rownum, colnum, model_id)
     session.add(simg)
     session.commit()
     arr = numpy.array(cpplib.cpp_gendata(rownum, mod))
     db = confval["db"]
     meta = tabledict["meta"]
     Model = tabledict["Model"]
-    Sequencegroup = tabledict["Sequencegroup"]
     Simgroup = tabledict["Simgroup"]
     Simdata = tabledict["Simdata"]
     Simresult = tabledict["Simresult"]
   CDFs corresponding to sequences simulated from the distribution
   model $\M$, the RSS, and the set $\mathcal{L}$, which is comprised
   of all sequences of length 28 in the gene segments listed in
-  Tables~\ref{datafilehumantab} and \ref{datafilemousetab}
+  Tables~\ref{genehumantab} and \ref{genemousetab}
   respectively. Additionally, we plot the posterior predictive pvalues
   corresponding to the same simulated sequences from the model $\M$
   used for calculating the ECDF.
 \item License: GNU General Public License (GPL), version 2 or later.
 \end{itemize}
 
-\subsection{Data files}\label{datafiles}
+\subsection{Data files}\label{genes}
 
-\input{DATAFILETABLEHUMAN}
-\input{DATAFILETABLEMOUSE}
+\input{GENETABLEHUMAN}
+\input{GENETABLEMOUSE}
 
 \end{document}
 def getfasta(dataset, dirpath):
     """
     Make multiple wget calls to download a fasta file corresponding to
-    the accession numbers in 'datafiles_info' corresponding to
+    the accession numbers in 'genes_info' corresponding to
     'dataset'. Each call downloads a fasta file. wgetcmd concatenates
     them together. NOTE: The hardwired 30 is the maximum number of
     accession numbers that ebi accepts in a single request.
     import os
     from corrmodel import utils
     conf = utils.get_conf()
-    datafiles_info = conf[dataset]["datafiles_info"]
-    accnumlst = list(set([d[0] for d in datafiles_info]))
+    genes_info = conf[dataset]["genes_info"]
+    accnumlst = list(set([d[0] for d in genes_info]))
     accnumlst.sort()
     lstlen = len(accnumlst)
     try:
 			     const unordered_map<string, int>& sequences, const model& mod, const vector<int> arrfreqlst,
 			     const size_t rownum, const std::pair<vector<double>, vector<int> >& logjtdistlst,
 			     const vector<double>& simstatlst, const size_t numsim, const int datasubgroup_id,
-			     const int datafile_id, const int seqlen, const bool fast, const double neglogppcutoff)
+			     const int gene_id, const int seqlen, const bool fast, const double neglogppcutoff)
 {
   ofstream dtf(datatablefile.c_str(), std::ios::out);
   string seq, seqindstr;
       else
  	pval = pvalue(mod, seqarr, logjtdistlst, datastatlst, simstatlst, numsim);
       stringstream dts;
-      dts << datasubgroup_id << "," << datafile_id << "," << seq << "," << i << "," << seqindstr << "," << neglogpp << "," << pval << endl;
+      dts << datasubgroup_id << "," << gene_id << "," << seq << "," << i << "," << seqindstr << "," << neglogpp << "," << pval << endl;
       dtf << dts.str();
     }
   dtf.close();
 			     const unordered_map<string, int>& sequences, const model& mod, const vector<int> arrfreqlst,
 			     const size_t rownum, const std::pair<vector<double>, vector<int> >& logjstdistlst,
 			     const vector<double>& simstatlst, const size_t numsim, const int datasubgroup_id,
-			     const int datafile_id, const int seqlen, const bool fast=true, const double neglogppcutoff=40);
+			     const int gene_id, const int seqlen, const bool fast=true, const double neglogppcutoff=40);
 
 #endif
   //boost::python::def("cpp_write_data_file_partial", write_data_file_partial);
   boost::python::def("cpp_write_data_file_partial", write_data_file_partial,
 		     write_data_file_partial_overloads("Write part of a data file",
-						       args("datatablefile", "startpos", "endpos", "datastr", "sequences", "mod", "arrfreqlst", "rownum", "logjtdistlst", "simstatlst", "numsim", "datasubgroup_id", "datafile_id", "seqlen", "fast", "neglogppcutoff")));
+						       args("datatablefile", "startpos", "endpos", "datastr", "sequences", "mod", "arrfreqlst", "rownum", "logjtdistlst", "simstatlst", "numsim", "datasubgroup_id", "gene_id", "seqlen", "fast", "neglogppcutoff")));
 }