Commits

Faheem Mitha committed 974a903

Rename 'motif' table to 'sequence', 'motifgroup' table to 'sequencegroup', 'motifstat' table to 'sequencestat', and 'motifstatgroup' table to 'sequencestatgroup'.

  • Participants
  • Parent commits ad98971

Comments (0)

Files changed (8)

 
 The following example usage is for mouse 12 RSS
 
-### Loading database tables related to motif data analysis: ###
+### Loading database tables related to sequence data analysis: ###
 
 To load all tables:
 
-    ./cmload all_motif mouse12rss
+    ./cmload all_sequence mouse12rss
 
 This can be divided into the following steps:
 
 a) Load sequence data.
 
-    ./cmload motif mouse12rss
+    ./cmload sequence mouse12rss
 
 b) Load cross-validation data.
 
 
     ./init_db corrmodel
 
-    ./cmload motif -s human12rss
-    ./cmload all_motif human12rss
-    ./cmload all_motif -n 1 human12rss
+    ./cmload sequence -s human12rss
+    ./cmload all_sequence human12rss
+    ./cmload all_sequence -n 1 human12rss
 
-    ./cmload motif -s mouse12rss
-    ./cmload all_motif mouse12rss
-    ./cmload all_motif -n 1 mouse12rss
+    ./cmload sequence -s mouse12rss
+    ./cmload all_sequence mouse12rss
+    ./cmload all_sequence -n 1 mouse12rss
 
 To use the scripts installed in the system, replace `./init_db` with
 `init_db`, and `./cmload` with `cmload`.
 
 get_stat <- function(schema, con)
   {
-    True <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.motifgroup inner join %1$s.motifstat on motifgroup.id=motifstat.motifstatgroup_id where motifgroup.simulated='f';", schema))
-    Sim <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.motifgroup inner join %1$s.motifstat on motifgroup.id=motifstat.motifstatgroup_id where motifgroup.simulated='t';", schema))
+    True <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.sequencegroup inner join %1$s.sequencestat on sequencegroup.id=sequencestat.sequencestatgroup_id where sequencegroup.simulated='f';", schema))
+    Sim <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.sequencegroup inner join %1$s.sequencestat on sequencegroup.id=sequencestat.sequencestatgroup_id where sequencegroup.simulated='t';", schema))
     Data <- dbGetQuery(con, sprintf("SELECT neglogpp, pvalue from %1$s.data inner join %1$s.datasubgroup on data.datasubgroup_id=datasubgroup.id inner join %1$s.crossval on datasubgroup.crossval_id=crossval.id inner join %1$s.crossvalgroup on crossval.crossvalgroup_id=crossvalgroup.id where crossvalgroup.samplenum=1;", schema))
     Stat = data.frame(value = c(True$neglogpp, Sim$neglogpp, Data$neglogpp),
       pvalue = c(True$pvalue, Sim$pvalue, Data$pvalue),
                        ON         datasubgroup.crossval_id=crossval.id
                        INNER JOIN %1$s.crossvalgroup
                        ON         crossval.crossvalgroup_id=crossvalgroup.id
-                       INNER JOIN %1$s.motifgroup
-                       ON         crossvalgroup.motifgroup_id=motifgroup.id
+                       INNER JOIN %1$s.sequencegroup
+                       ON         crossvalgroup.sequencegroup_id=sequencegroup.id
    WHERE      (data.seqindex = any(crossval.sample)
    OR         data.seqindex IS NULL)
    AND        crossvalgroup.permuted='f'
    AND        crossvalgroup.samplenum=5
-   AND        motifgroup.simulated='f'
-   AND        motifgroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
+   AND        sequencegroup.simulated='f'
+   AND        sequencegroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
 ) AS q
 WHERE q.r <= %2$s
 ", schema, limit)))

File cm-neglogpptable

 
 ## First, the query q gives
 
-## r | motif | crossvalnum | neglogpp
+## r | rss | crossvalnum | neglogpp
 
 ## r: is the rank, which corresponds to grouping the sequence rows by
 ## crossvalnum with groups in order of increasing crossvalnum, and then
 ## within each crossvalnumgroup, we order by neglogpp
 
-## motif: 0 if seqindex is null
-##        1 if seqindex is not null
-## i.e. motif is an indicator function for whether a sequence is an RSS or not
+## rss: 0 if seqindex is null
+##      1 if seqindex is not null
+## i.e. rss is an indicator function for whether a sequence is an RSS or not
 
 ## Second, the values sum1, sum2, and sum3, derived from the query q
 ## give:
 {
   Neglogpp <- data.frame(dbGetQuery(connection,
                                sprintf("SELECT crossvalnum,
-SUM(motif) as sum1,
-SUM(CASE WHEN r <= LEAST(40,100) THEN motif ELSE 0 END) as sum2,
-SUM(CASE WHEN r <= GREATEST(40,100) THEN motif ELSE 0 END) as sum3
+SUM(rss) as sum1,
+SUM(CASE WHEN r <= LEAST(40,100) THEN rss ELSE 0 END) as sum2,
+SUM(CASE WHEN r <= GREATEST(40,100) THEN rss ELSE 0 END) as sum3
 FROM (
 SELECT   ROW_NUMBER() OVER (PARTITION BY crossval.crossvalnum ORDER BY crossval.crossvalnum, data.neglogpp) AS r,
-                    (seqindex IS NOT NULL)::INTEGER                                                      AS motif,
+                    (seqindex IS NOT NULL)::INTEGER                                                      AS rss,
                     crossvalnum                                                                                  ,
                     neglogpp
          FROM       %1$s.data
                     ON         datasubgroup.crossval_id=crossval.id
                     INNER JOIN %1$s.crossvalgroup
                     ON         crossval.crossvalgroup_id=crossvalgroup.id
-                    INNER JOIN %1$s.motifgroup
-                    ON         crossvalgroup.motifgroup_id=motifgroup.id
+                    INNER JOIN %1$s.sequencegroup
+                    ON         crossvalgroup.sequencegroup_id=sequencegroup.id
          WHERE      (data.seqindex                        = ANY(crossval.sample)
          OR         data.seqindex                  IS NULL)
          AND        crossvalgroup.permuted='f'
          AND        crossvalgroup.samplenum=5
-         AND        motifgroup.simulated='f'
-         AND        motifgroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
+         AND        sequencegroup.simulated='f'
+         AND        sequencegroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
 ) AS q
 GROUP BY crossvalnum", schema)))
 
 
     Combinedneglogpp <- data.frame(dbGetQuery(connection,
                                       sprintf("SELECT
-SUM(motif) as sum1,
-SUM(CASE WHEN r <= 200 THEN motif ELSE 0 END) as sum2,
-SUM(CASE WHEN r <= 500 THEN motif ELSE 0 END) as sum3
+SUM(rss) as sum1,
+SUM(CASE WHEN r <= 200 THEN rss ELSE 0 END) as sum2,
+SUM(CASE WHEN r <= 500 THEN rss ELSE 0 END) as sum3
 from (
 SELECT ROW_NUMBER() OVER (order by avg(neglogpp)) as r,
                   gene_id,
                   index,
                   avg(neglogpp) as avg_neglogpp,
-                  (seqindex is not null)::integer as motif
+                  (seqindex is not null)::integer as rss
        FROM       %1$s.data
                   INNER JOIN %1$s.datasubgroup
                   ON         data.datasubgroup_id=datasubgroup.id
                   ON         datasubgroup.crossval_id=crossval.id
                   INNER JOIN %1$s.crossvalgroup
                   ON         crossval.crossvalgroup_id=crossvalgroup.id
-                  INNER JOIN %1$s.motifgroup
-                  ON         crossvalgroup.motifgroup_id=motifgroup.id
+                  INNER JOIN %1$s.sequencegroup
+                  ON         crossvalgroup.sequencegroup_id=sequencegroup.id
        WHERE      crossvalgroup.permuted='f'
        AND        crossvalgroup.samplenum=1
-       AND        motifgroup.simulated='f'
-       AND        motifgroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
+       AND        sequencegroup.simulated='f'
+       AND        sequencegroup.subset='{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27}'
                   group by gene_id, index, seqindex order by avg(neglogpp)) AS q", schema)))
 print(Combinedneglogpp)
 return(Combinedneglogpp)
 
 # create the top-level parser
 
-from corrmodel.load import load_motif_dataset, load_crossval_dataset, load_all_motifstat, load_allseq, load_all_motif_tables, load_simdata, get_and_load_simresult, load_all_sim_tables, load_all_sim_tables_parallel
+from corrmodel.load import load_sequence_dataset, load_crossval_dataset, load_all_sequencestat, load_allseq, load_all_sequence_tables, load_simdata, get_and_load_simresult, load_all_sim_tables, load_all_sim_tables_parallel
 
 import argparse
 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 subparsers = parser.add_subparsers()
 
-# create the parser for the "motif" command
-parser_motif = subparsers.add_parser('motif', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_motif.add_argument('dataset', help='name of dataset')
-parser_motif.add_argument('-n', '--motifsimnum', action="store", type=int, dest="motifsimnum", help='number of sequences to simulate')
-parser_motif.add_argument('--subset', help='subset')
-parser_motif.add_argument('-s', '--simulated', action="store_true", dest="simulated", help='simulated data set', default=False)
-parser_motif.set_defaults(func=load_motif_dataset)
+# create the parser for the "sequence" command
+parser_sequence = subparsers.add_parser('sequence', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser_sequence.add_argument('dataset', help='name of dataset')
+parser_sequence.add_argument('-n', '--sequencesimnum', action="store", type=int, dest="sequencesimnum", help='number of sequences to simulate')
+parser_sequence.add_argument('--subset', help='subset')
+parser_sequence.add_argument('-s', '--simulated', action="store_true", dest="simulated", help='simulated data set', default=False)
+parser_sequence.set_defaults(func=load_sequence_dataset)
 
 # create the parser for the "crossval" command
 parser_crossval = subparsers.add_parser('crossval', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser_crossval.add_argument("-s", "--search", action="store_true", dest="search", help="model search", default=False)
 parser_crossval.set_defaults(func=load_crossval_dataset)
 
-# create the parser for the "motifstat" command
-parser_motifstat = subparsers.add_parser('motifstat', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_motifstat.add_argument('dataset', help='name of dataset')
-parser_motifstat.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
-parser_motifstat.set_defaults(func=load_all_motifstat)
+# create the parser for the "sequencestat" command
+parser_sequencestat = subparsers.add_parser('sequencestat', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser_sequencestat.add_argument('dataset', help='name of dataset')
+parser_sequencestat.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
+parser_sequencestat.set_defaults(func=load_all_sequencestat)
 
 # create the parser for the "data" command
 parser_data = subparsers.add_parser('data', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser_data.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
 parser_data.set_defaults(func=load_allseq)
 
-# create the parser for the "all_motif" command
-parser_all_motif = subparsers.add_parser('all_motif', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_all_motif.add_argument('dataset', help='name of dataset')
-parser_all_motif.add_argument('--slow', action="store_false", dest="fast", help='switch to slow mode calculation (do not assume pvalues above the neglogppcutoff are 0)')
-parser_all_motif.add_argument('--motifsimnum', action="store", type=int, dest="motifsimnum", help='number of sequences to simulate')
-parser_all_motif.add_argument('--numsim', type=int, help='number of replicates to simulate')
-parser_all_motif.add_argument("-n", "--samplenum", action="store", type=int, dest="samplenum", help="number of samples to do crossvalidation on")
-parser_all_motif.add_argument("-p", "--permuted", action="store_true", dest="permuted", help="permuted motif dataset", default=False)
-parser_all_motif.add_argument("-s", "--simulated", action="store_true", dest="simulated", help="simulated motif dataset")
-parser_all_motif.add_argument("--search", action="store_true", dest="search", help='search for model in crossval step', default=False)
-parser_all_motif.set_defaults(func=load_all_motif_tables)
+# create the parser for the "all_sequence" command
+parser_all_sequence = subparsers.add_parser('all_sequence', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser_all_sequence.add_argument('dataset', help='name of dataset')
+parser_all_sequence.add_argument('--slow', action="store_false", dest="fast", help='switch to slow mode calculation (do not assume pvalues above the neglogppcutoff are 0)')
+parser_all_sequence.add_argument('--sequencesimnum', action="store", type=int, dest="sequencesimnum", help='number of sequences to simulate')
+parser_all_sequence.add_argument('--numsim', type=int, help='number of replicates to simulate')
+parser_all_sequence.add_argument("-n", "--samplenum", action="store", type=int, dest="samplenum", help="number of samples to do crossvalidation on")
+parser_all_sequence.add_argument("-p", "--permuted", action="store_true", dest="permuted", help="permuted sequence dataset", default=False)
+parser_all_sequence.add_argument("-s", "--simulated", action="store_true", dest="simulated", help="simulated sequence dataset")
+parser_all_sequence.add_argument("--search", action="store_true", dest="search", help='search for model in crossval step', default=False)
+parser_all_sequence.set_defaults(func=load_all_sequence_tables)
 
 # create the parser for the "simdata" command
 parser_simdata = subparsers.add_parser('simdata', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

File corrmodel/dbschema.py

         return value
 
 class Crossvalgroup(object):
-    def __init__(self, permuted, samplenum, time, motifgroup_id):
+    def __init__(self, permuted, samplenum, time, sequencegroup_id):
         self.permuted = permuted
         self.samplenum = samplenum
         self.time = time
-        self.motifgroup_id = motifgroup_id
+        self.sequencegroup_id = sequencegroup_id
     def __repr__(self):
         return '<Crossvalgroup %s>'%self.id
 
     def __repr__(self):
         return '<Model %s>'%(str((self.cols, self.model)))
 
-class Motifstatgroup(object):
-    def __init__(self, motifgroup_id, numsim):
-        self.motifgroup_id = motifgroup_id
+class Sequencestatgroup(object):
+    def __init__(self, sequencegroup_id, numsim):
+        self.sequencegroup_id = sequencegroup_id
         self.numsim = numsim
     def __repr__(self):
-        return '<Motifstatgroup %s>'%self.id
+        return '<Sequencestatgroup %s>'%self.id
 
-class Motifstat(object):
-    def __init__(self, motifstatgroup_id, neglogpp, pvalue):
-        self.motifstatgroup_id = motifstatgroup_id
+class Sequencestat(object):
+    def __init__(self, sequencestatgroup_id, neglogpp, pvalue):
+        self.sequencestatgroup_id = sequencestatgroup_id
         self.neglogpp = neglogpp
         self.pvalue = pvalue
     def __repr__(self):
-        return '<Motifstat %s>'%self.id
+        return '<Sequencestat %s>'%self.id
 
-class Motifgroup(object):
+class Sequencegroup(object):
     def __init__(self, subset, model_id, simulated, size=None):
         # the desired subset of the RSS sequence
         self.subset = subset
         self.simulated = simulated
         self.size = size
     def __repr__(self):
-        return '<Motifgroup %s>'%str(self.id)
+        return '<Sequencegroup %s>'%str(self.id)
 
-class Motif(object):
-    def __init__(self, motifgroup_id, sequence):
-        self.motifgroup_id = motifgroup_id
+class Sequence(object):
+    def __init__(self, sequencegroup_id, sequence):
+        self.sequencegroup_id = sequencegroup_id
         self.sequence = sequence
     def __repr__(self):
-        return '<Motif %s>'%self.id
+        return '<Sequence %s>'%self.id
 
 class Datagroup(object):
     def __init__(self, crossvalgroup_id, numsim, time=None):
         Column('permuted', Boolean),
         Column('samplenum', Integer),
         Column('time', Float),
-        Column('motifgroup_id', Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
-        UniqueConstraint('permuted', 'samplenum', 'motifgroup_id'),
+        Column('sequencegroup_id', Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        UniqueConstraint('permuted', 'samplenum', 'sequencegroup_id'),
         schema = schema
         )
     mapper(Crossvalgroup, crossvalgroup_table)
     """Each row of this table corresponds to a subgroup, which is part
     of a crossvalidation procedure. 'crossvalnum' is the index of this
     subgroup in the crossvalidation. 'sample' is the subgroup, with
-    the indexing relative to the RSS ids in the motif table. Each row
+    the indexing relative to the RSS ids in the sequence table. Each row
     corresponds to a single crossvalidation process, with the
     'training dataset' being the RSS minus the 'sample' subgroup, and
     the 'testing dataset' being the 'sample' subgroup. 'model_id' is
     mapper(Model, model_table)
     return model_table, Model
 
-def make_motifstatgroup_table(meta, schema, name='motifstatgroup'):
+def make_sequencestatgroup_table(meta, schema, name='sequencestatgroup'):
     """
-    We calculate statistics for all motifs in the motif group whose id
-    is 'motifgroup_id', currently based on the distribution implied by
+    We calculate statistics for all sequences in the sequence group whose id
+    is 'sequencegroup_id', currently based on the distribution implied by
     the model in the config file corresponding to 'schema'. A row in
-    this table represents these statistics (motifstat) for the
-    motifgroup with id 'motifgroup_id'. 'numsim' is a parameter for
+    this table represents these statistics (sequencestat) for the
+    sequencegroup with id 'sequencegroup_id'. 'numsim' is a parameter for
     the simulated distribution.
     """
-    motifstatgroup_table = Table(
+    sequencestatgroup_table = Table(
         name, meta,
         #Column('id', Integer, primary_key=True),
-        Column('motifgroup_id', Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True, primary_key=True),
+        Column('sequencegroup_id', Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True, primary_key=True),
         Column('created', TIMESTAMP(), default=now()),
         Column('numsim', Integer),
         schema = schema,
         )
-    mapper(Motifstatgroup, motifstatgroup_table)
-    return motifstatgroup_table, Motifstatgroup
+    mapper(Sequencestatgroup, sequencestatgroup_table)
+    return sequencestatgroup_table, Sequencestatgroup
 
-def make_motifstat_table(meta, schema, name='motifstat'):
+def make_sequencestat_table(meta, schema, name='sequencestat'):
     """
     Each row of this table corresponds to statistics ('neglogpp',
-    'pvalue') for a motif in a motifgroup. These motifs together
-    correspond to the motifstatgroup with id 'motifstatgroup_id'.
-    This motifgroup has id 'motifgroup_id' in the corresponding
-    motifstatgroup with id 'motifstatgroup_id'. FIXME: It would
+    'pvalue') for a sequence in a sequencegroup. These sequences together
+    correspond to the sequencestatgroup with id 'sequencestatgroup_id'.
+    This sequencegroup has id 'sequencegroup_id' in the corresponding
+    sequencestatgroup with id 'sequencestatgroup_id'. FIXME: It would
     perhaps be better if each row corresponded directly to the
-    corresponding entry in the 'motif' table, but this is currently
+    corresponding entry in the 'sequence' table, but this is currently
     not the case.
     """
-    motifstat_table = Table(
+    sequencestat_table = Table(
         name, meta,
         Column('id', Integer, primary_key=True),
-        Column('motifstatgroup_id',  Integer, ForeignKey(schema+'.motifstatgroup.motifgroup_id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        Column('sequencestatgroup_id',  Integer, ForeignKey(schema+'.sequencestatgroup.sequencegroup_id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
         Column('neglogpp', Float),
         Column('pvalue', Float),
         schema = schema,
         )
-    mapper(Motifstat, motifstat_table)
-    return motifstat_table, Motifstat
+    mapper(Sequencestat, sequencestat_table)
+    return sequencestat_table, Sequencestat
 
-def make_motifgroup_table(meta, schema, name='motifgroup'):
+def make_sequencegroup_table(meta, schema, name='sequencegroup'):
     """Each row of this table corresponds to a group of RSS sequences"""
-    motifgroup_table = Table(
+    sequencegroup_table = Table(
         name, meta,
         Column('id',  Integer, index=True, primary_key=True),
         Column('subset', postgresql.ARRAY(Integer, as_tuple=True)),
         Column('size', Integer),
         schema = schema,
         )
-    mapper(Motifgroup, motifgroup_table)
-    return motifgroup_table, Motifgroup
+    mapper(Sequencegroup, sequencegroup_table)
+    return sequencegroup_table, Sequencegroup
 
-def make_motifgroup_table_index(motifgroup_table):
+def make_sequencegroup_table_index(sequencegroup_table):
     """
-    Add constraint on motifgroup so if simulated = 'f', then subset must be unique.
+    Add constraint on sequencegroup so if simulated = 'f', then subset must be unique.
     """
-    event.listen(motifgroup_table, 'after_create',
-                 DDL("CREATE UNIQUE INDEX motifgroup_subset_key ON %(fullname)s (subset) WHERE simulated = 'f'")
+    event.listen(sequencegroup_table, 'after_create',
+                 DDL("CREATE UNIQUE INDEX sequencegroup_subset_key ON %(fullname)s (subset) WHERE simulated = 'f'")
                  )
 
-def make_motif_table(meta, schema, name='motif'):
+def make_sequence_table(meta, schema, name='sequence'):
     """Each row of this table corresponds to an RSS sequence"""
-    motif_table = Table(
+    sequence_table = Table(
         name, meta,
         Column('id', Integer, primary_key=True),
-        Column('motifgroup_id', Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
+        Column('sequencegroup_id', Integer, ForeignKey(schema+'.sequencegroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
         Column('sequence', String),
         schema = schema,
         )
-    mapper(Motif, motif_table)
-    return motif_table, Motif
+    mapper(Sequence, sequence_table)
+    return sequence_table, Sequence
 
 def make_datagroup_table(meta, schema, name='datagroup'):
     """Each row of this table corresponds to a group of Datasubgroup objects
     sequence comes. The 'sequence' is the actual sequence string. The
     'index' is the index/location of the sequence within the gene. The
     'seqindex' is the id of the sequence, if it is an RSS, in the
-    'motif' table. The 'seqindex' is empty if the sequence is not an
+    'sequence' table. The 'seqindex' is empty if the sequence is not an
     RSS. 'neglogpp' is the negative log posterior predictive
     probability of the sequence. 'pvalue' is the posterior predictive
     pvalue of the sequence. The 'neglogpp' and 'pvalue' are calculated

File corrmodel/default_conf.yaml

 runs: 50
 convergeruns: 25
 numsim: 100
-motifsimnum: 1000
+sequencesimnum: 1000
 splitnum: 2
 samplenum: 5
 

File corrmodel/load.py

     Add model specified to db.
     """
     from sqlalchemy.orm import sessionmaker
-    tabledict = create_motif_tables(schema)
+    tabledict = create_sequence_tables(schema)
     Model = tabledict["Model"]
     confval = create_db()
     db = confval["db"]
     db.dispose()
     return model, model_id
 
-def create_motif_tables(schema, result={}):
-    """Create all motif-related tables in a particular schema"""
+def create_sequence_tables(schema, result={}):
+    """Create all sequence-related tables in a particular schema"""
     if result:
         return result
     import os, sys, time
     from utils import file_not_exist, create_engine_wrapper, get_conf, pg_url
     from dbutils import schema_exists
     from sqlalchemy.orm import mapper, relation, sessionmaker
-    from dbschema import make_crossvalgroup_table, make_crossval_table, make_motifstat_table, make_motifstatgroup_table, make_motifgroup_table, make_motifgroup_table_index, make_motif_table, make_model_table, make_datagroup_table, make_datasubgroup_table, make_gene_table, make_data_table
+    from dbschema import make_crossvalgroup_table, make_crossval_table, make_sequencestat_table, make_sequencestatgroup_table, make_sequencegroup_table, make_sequencegroup_table_index, make_sequence_table, make_model_table, make_datagroup_table, make_datasubgroup_table, make_gene_table, make_data_table
     import cPickle, random, cpplib, getmodel
     confval = create_db()
     dbname = confval["dbname"]
     meta = MetaData()
     crossvalgroup_table, Crossvalgroup = make_crossvalgroup_table(meta, schema)
     crossval_table, Crossval = make_crossval_table(meta, schema)
-    motifstat_table, Motifstat = make_motifstat_table(meta, schema)
-    motifstatgroup_table, Motifstatgroup = make_motifstatgroup_table(meta, schema)
-    motifgroup_table, Motifgroup = make_motifgroup_table(meta, schema)
-    make_motifgroup_table_index(motifgroup_table)
-    motif_table, Motif = make_motif_table(meta, schema)
+    sequencestat_table, Sequencestat = make_sequencestat_table(meta, schema)
+    sequencestatgroup_table, Sequencestatgroup = make_sequencestatgroup_table(meta, schema)
+    sequencegroup_table, Sequencegroup = make_sequencegroup_table(meta, schema)
+    make_sequencegroup_table_index(sequencegroup_table)
+    sequence_table, Sequence = make_sequence_table(meta, schema)
     model_table, Model = make_model_table(meta, schema)
     datagroup_table, Datagroup = make_datagroup_table(meta, schema)
     datasubgroup_table, Datasubgroup = make_datasubgroup_table(meta, schema)
     d['meta'] = meta
     d['Crossvalgroup'] = Crossvalgroup
     d['Crossval'] = Crossval
-    d['Motifstatgroup'] = Motifstatgroup
-    d['Motifstat'] = Motifstat
-    d['Motifgroup'] = Motifgroup
-    d['Motif'] = Motif
+    d['Sequencestatgroup'] = Sequencestatgroup
+    d['Sequencestat'] = Sequencestat
+    d['Sequencegroup'] = Sequencegroup
+    d['Sequence'] = Sequence
     d['Model'] = Model
     d['Datagroup'] = Datagroup
     d['Datasubgroup'] = Datasubgroup
         return str(numpy_string)
     register_adapter(numpy.string_, addapt_numpy_string)
 
-def load_motif_dataset(args):
-    """Load RSS sequences from fasta file into Motif table. If
+def load_sequence_dataset(args):
+    """Load RSS sequences from fasta file into Sequence table. If
     model_id is specified, simulate sequences rather than reading from
     fasta file"""
     schema, subset, simulated = args.dataset, args.subset, args.simulated
     import cpplib
     from getmodel import writearr
     from utils import get_conf
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Model = d['Model']
-    Motifgroup = d['Motifgroup']
-    Motif = d['Motif']
+    Sequencegroup = d['Sequencegroup']
+    Sequence = d['Sequence']
     conf = get_conf()
     if subset==None and 'subset' in conf[schema]:
         subset = conf[schema]["subset"]
 
     # If simulated = 'f', then check that subset does not already
     # exist. This is the same constraint as is enforced at the db level by
-    # 'make_motifgroup_table_index' in dbschema.py.
-    if(session.query(Motifgroup).filter(Motifgroup.subset==subset).filter(Motifgroup.simulated==False).count() == 1):
-        print "Motif dataset already loaded, skipping"
+    # 'make_sequencegroup_table_index' in dbschema.py.
+    if(session.query(Sequencegroup).filter(Sequencegroup.subset==subset).filter(Sequencegroup.simulated==False).count() == 1):
+        print "Sequence dataset already loaded, skipping"
         return
 
     if subset is not None:
     mod = cpplib.cpp_model(cols, model)
     model, model_id = write_model_to_db(schema, cols, model)
 
-    # Add Motifgroup object
+    # Add Sequencegroup object
     session.commit()
-    if args.motifsimnum is not None and not simulated:
-        sys.exit("--motifsimnum option is only used when dataset is simulated")
-    elif args.motifsimnum is not None and simulated:
-        motifsimnum = args.motifsimnum
+    if args.sequencesimnum is not None and not simulated:
+        sys.exit("--sequencesimnum option is only used when dataset is simulated")
+    elif args.sequencesimnum is not None and simulated:
+        sequencesimnum = args.sequencesimnum
     elif simulated:
-        motifsimnum = conf["motifsimnum"]
+        sequencesimnum = conf["sequencesimnum"]
     else:
-        motifsimnum = None
-    mgroup = Motifgroup(subset, model_id, simulated, motifsimnum)
+        sequencesimnum = None
+    mgroup = Sequencegroup(subset, model_id, simulated, sequencesimnum)
     session.add(mgroup)
     session.commit()
-    motifgroup_id = mgroup.id
+    sequencegroup_id = mgroup.id
     try:
         if simulated:
-            arr = numpy.array(cpplib.cpp_sim_with_data_prob(mod, arr, motifsimnum))
+            arr = numpy.array(cpplib.cpp_sim_with_data_prob(mod, arr, sequencesimnum))
         # Add sequences to db
         mlst = []
         for row in arr:
             seq = ''.join(row)
-            mlst.append(Motif(motifgroup_id, seq))
+            mlst.append(Sequence(sequencegroup_id, seq))
         session.add_all(mlst)
         session.commit()
         session.flush()
         session.close()
         db.dispose()
-        return motifgroup_id
+        return sequencegroup_id
     except:
         session.delete(mgroup)
         session.commit()
     from crossval import crossval
     from utils import arr_to_list, permute_array, get_conf
     import decimal, numpy
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossval = d['Crossval']
     Crossvalgroup = d['Crossvalgroup']
     Model = d['Model']
-    Motifgroup = d['Motifgroup']
+    Sequencegroup = d['Sequencegroup']
     Session = sessionmaker(bind=db)
     session = Session()
     if permuted==True:
         else:
             samplenum = conf["samplenum"]
     subset = conf[schema]["subset"]
-    # motifgroup_id for true motif group (RSS)
-    motifgroup_id = session.query(Motifgroup).filter(Motifgroup.subset==subset).filter(Motifgroup.simulated==False).one().id
-    sequences = session.execute("SELECT id, sequence from %s.motif where motif.motifgroup_id=%s;"%(schema, motifgroup_id)).fetchall()
+    # sequencegroup_id for true sequence group (RSS)
+    sequencegroup_id = session.query(Sequencegroup).filter(Sequencegroup.subset==subset).filter(Sequencegroup.simulated==False).one().id
+    sequences = session.execute("SELECT id, sequence from %s.sequence where sequence.sequencegroup_id=%s;"%(schema, sequencegroup_id)).fetchall()
     cols = len(sequences[0][1])
     t1 = time.time()
     # run model search
     elif not search:
         samplelst = crossval(sequences, samplenum, runs, convergeruns, search)
     t2 = time.time()
-    cg = Crossvalgroup(permuted, samplenum, t2-t1, motifgroup_id)
+    cg = Crossvalgroup(permuted, samplenum, t2-t1, sequencegroup_id)
     session.add(cg)
     session.commit()
 
     return result
 
 @print_timing
-def load_all_motifstat(args):
+def load_all_sequencestat(args):
     """
-    Calls load_motifstat for every entry in motifgroup for which an
-    entry in motifstatgroup with the same motifgroup id is not already
+    Calls load_sequencestat for every entry in sequencegroup for which an
+    entry in sequencestatgroup with the same sequencegroup id is not already
     present.
     """
     # args.dataset doubles as the schema name; args.numsim may be None.
     schema, numsim = args.dataset, args.numsim
     if numsim is None:
         # Fall back to the configured simulation count.
         # NOTE(review): conf is not assigned inside this function; it has to
         # come from an enclosing scope -- confirm.
         numsim = conf["numsim"]
     subset = conf[schema]["subset"]
-    # check for motifgroup entries with given subset
-    d = create_motif_tables(schema)
+    # check for sequencegroup entries with given subset
+    d = create_sequence_tables(schema)
     db = d['db']
-    Motifgroup = d['Motifgroup']
-    Motifstatgroup = d['Motifstatgroup']
+    Sequencegroup = d['Sequencegroup']
+    Sequencestatgroup = d['Sequencestatgroup']
     Session = sessionmaker(bind=db)
     session = Session()
     # Exit the process (sys.exit) when no group row matches the configured subset.
-    if session.query(Motifgroup).filter(Motifgroup.subset==subset).count() == 0:
-        sys.exit("no motifgroup entries with subset %s"%(subset))
-    motifgroup_ids = [mg.id for mg in session.query(Motifgroup).filter(Motifgroup.subset==subset).all()]
-    # check whether corresponding entries exist in Motifstatgroup already
-    for motifgroup_id in motifgroup_ids:
-        if session.query(Motifstatgroup).filter(Motifstatgroup.motifgroup_id==motifgroup_id).count()==0:
-            load_motifstat(schema, motifgroup_id, numsim)
+    if session.query(Sequencegroup).filter(Sequencegroup.subset==subset).count() == 0:
+        sys.exit("no sequencegroup entries with subset %s"%(subset))
+    sequencegroup_ids = [mg.id for mg in session.query(Sequencegroup).filter(Sequencegroup.subset==subset).all()]
+    # check whether corresponding entries exist in Sequencestatgroup already
     # Groups that already have a stat-group row are skipped, so only the
     # missing ones are loaded.
+    for sequencegroup_id in sequencegroup_ids:
+        if session.query(Sequencestatgroup).filter(Sequencestatgroup.sequencegroup_id==sequencegroup_id).count()==0:
+            load_sequencestat(schema, sequencegroup_id, numsim)
     # NOTE(review): the session opened above is never closed in this function.
 
 @print_timing
-def load_motifstat(schema, motifgroup_id, numsim):
+def load_sequencestat(schema, sequencegroup_id, numsim):
     """
     This loads pvalues and conditional expected statistics into the
-    table Motifstat for motifs in the group whose id is
-    'motifgroup_id'. 'logjtdistlst' is the list of values from the
+    table Sequencestat for sequences in the group whose id is
+    'sequencegroup_id'. 'logjtdistlst' is the list of values from the
     joint distribution simulated by MCMC.
     """
     # NOTE(review): the docstring names 'logjtdistlst', but nothing in this
     # function uses that name.
     # NOTE(review): of the modules imported below only cpplib is referenced
     # in this function.
     import cpplib, cPickle, getmodel, random, time
     conf = get_conf()
     # The numsim fallback below is disabled; numsim is used exactly as passed in.
     # if numsim is None:
     #     numsim = conf["numsim"]
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
-    Motifstatgroup = d['Motifstatgroup']
+    Sequencestatgroup = d['Sequencestatgroup']
     # NOTE(review): Model is looked up here but not used afterwards.
     Model = d['Model']
-    Motifstat = d['Motifstat']
+    Sequencestat = d['Sequencestat']
     Session = sessionmaker(bind=db)
     session = Session()
     # Create the stat-group row and commit it before any statistics are computed.
-    msg = Motifstatgroup(motifgroup_id, numsim)
+    msg = Sequencestatgroup(sequencegroup_id, numsim)
     session.add(msg)
     session.commit()
-    arr = arr_from_sequence_table(schema, motifgroup_id)
+    arr = arr_from_sequence_table(schema, sequencegroup_id)
     # The model is built from the second dimension of arr and the model value
     # configured for this schema.
     modval = conf[schema]["model"]
     mod = cpplib.cpp_model(arr.shape[1], modval)
     pvalues = cpplib.cpp_pval_stat(arr, mod, numsim)
     # NOTE(review): nothing has been added to the session since the previous
     # commit, so this commit appears to be redundant.
     session.commit()
     pvallst = []
     # Each element of pvalues is read as a pair: positions 0 and 1 become the
     # last two constructor arguments of the row object.
     for pval in pvalues:
-        pvallst.append(Motifstat(msg.motifgroup_id, pval[0], pval[1]))
+        pvallst.append(Sequencestat(msg.sequencegroup_id, pval[0], pval[1]))
     session.add_all(pvallst)
     session.commit()
     # NOTE(review): flush() directly after commit() looks redundant -- confirm.
     session.flush()
     # Release the session, then call dispose() on d['db'].
     session.close()
     db.dispose()
 
-def arr_from_sequence_table(schema, motifgroup_id):
+def arr_from_sequence_table(schema, sequencegroup_id):
     """Convert list of sequences into an array"""
     import numpy
     from sqlalchemy.orm import sessionmaker
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Session = sessionmaker(bind=db)
     session = Session()
-    sequences = session.execute("SELECT sequence from %s.motif where motif.motifgroup_id=%s;"%(schema, motifgroup_id)).fetchall()
+    sequences = session.execute("SELECT sequence from %s.sequence where sequence.sequencegroup_id=%s;"%(schema, sequencegroup_id)).fetchall()
     seqlst = []
     for seq in sequences:
         seqlst.append([s for s in str(seq[0])])
     from getmodel import writearr, getmodel
     from utils import get_conf
     numpy.set_printoptions(threshold='nan')
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Model = d['Model']
     Session = sessionmaker(bind=db)
     import cpplib, getmodel, numpy, os, time
     from sqlalchemy.orm import sessionmaker
     from utils import get_conf, randstr, safe_mkdir
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossval = d['Crossval']
     Datagroup = d['Datagroup']
     import cpplib, getmodel, numpy, os, time
     from sqlalchemy.orm import sessionmaker
     from utils import get_conf, randstr, safe_mkdir
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossval = d['Crossval']
     Datagroup = d['Datagroup']
     Datasubgroup = d['Datasubgroup']
     Model = d['Model']
-    Motif = d['Motif']
-    Motifgroup = d['Motifgroup']
+    Sequence = d['Sequence']
+    Sequencegroup = d['Sequencegroup']
     Session = sessionmaker(bind=db)
     session = Session()
     t1 = time.time()
         numsim = conf["numsim"]
     splitnum = conf["splitnum"]
     neglogppcutoff = conf[schema]["neglogppcutoff"]
-    motifgroup_id = session.query(Motifgroup).filter(Motifgroup.simulated==False).filter(Motifgroup.subset==subset).one().id
+    sequencegroup_id = session.query(Sequencegroup).filter(Sequencegroup.simulated==False).filter(Sequencegroup.subset==subset).one().id
     dsg = session.query(Datasubgroup).filter(Datasubgroup.id==datasubgroup_id).one()
     crossval_id = dsg.crossval_id
     datagroup_id = dsg.datagroup_id
         print "warning: model id missing in crossval table for crossval id %s - using model id in config"%crossval_id
         modval = conf[schema]["model"]
     idsubset = session.query(Crossval).filter(Crossval.id==crossval_id).one().sample
-    totids = [m.id for m in session.query(Motif).filter(Motif.motifgroup_id==motifgroup_id).all()]
+    totids = [m.id for m in session.query(Sequence).filter(Sequence.sequencegroup_id==sequencegroup_id).all()]
     # check that idsubset is contained in totids
     if not set(idsubset) <= set(totids):
         raise ValueError("idsubset %s is not a subset of totids %s for crossval id %s. Aborting"%(idsubset, totids, crossval_id))
     idlst = tuple(set(totids).difference(set(idsubset)))
-    sequences = session.execute("SELECT sequence from %(schema)s.motif where motif.id in %(idlst)s"%{"schema":schema, "idlst":idlst}).fetchall()
+    sequences = session.execute("SELECT sequence from %(schema)s.sequence where sequence.id in %(idlst)s"%{"schema":schema, "idlst":idlst}).fetchall()
     seqlst = []
     for seq in sequences:
         seqlst.append([s for s in str(seq[0])])
     modlen = cpplib.cpp_modlen(model)
     if modlen != len(subset):
         raise ValueError("the length of the model and the subset in load_allseq must be the same, but the model is %s and the subset is %s"%(model, subset))
-    sqresult = session.execute("SELECT sequence, id from %(schema)s.motif"%{"schema":schema}).fetchall()
+    sqresult = session.execute("SELECT sequence, id from %(schema)s.sequence"%{"schema":schema}).fetchall()
     seqdict = {}
     for s in sqresult:
         if str(s[0]) in sqresult:
         session.close()
         db.dispose()
 
-def load_all_motif_tables(args):
-    schema, fast, motifsimnum, numsim, samplenum, permuted, simulated, search = args.dataset, args.fast, args.motifsimnum, args.numsim, args.samplenum, args.permuted, args.simulated, args.search
+def load_all_sequence_tables(args):
+    schema, fast, sequencesimnum, numsim, samplenum, permuted, simulated, search = args.dataset, args.fast, args.sequencesimnum, args.numsim, args.samplenum, args.permuted, args.simulated, args.search
     # Runs the loaders in order: sequence data, cross-validation data,
     # per-group statistics, then load_allseq.
     # Only schema is read below; the other unpacked names are unused here,
     # although the unpacking does require each attribute to exist on args.
     from utils import get_conf
     conf = get_conf()
     # Intermediate results (subset, group ids) are attached to args, and the
     # same args object is then handed to the next loader.
     args.subset = conf[schema]["subset"]
-    from load import load_motif_dataset, load_crossval_dataset, load_motifstat, load_allseq
-    args.motifgroup_id = load_motif_dataset(args)
+    from load import load_sequence_dataset, load_crossval_dataset, load_sequencestat, load_allseq
+    args.sequencegroup_id = load_sequence_dataset(args)
     args.crossvalgroup_id = load_crossval_dataset(args)
     # NOTE(review): the all-groups stat loader called next is not in the import
     # list above (only the single-group loader is); it must already be in
     # scope -- confirm.
-    load_all_motifstat(args)
+    load_all_sequencestat(args)
     load_allseq(args)
 
 def testdb(schema):
     import numpy
     from sqlalchemy.orm import sessionmaker
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Crossvalgroup = d['Crossvalgroup']
     Crossval = d['Crossval']
 def select_sequence(schema):
     import numpy
     from sqlalchemy.orm import sessionmaker
-    d = create_motif_tables(schema)
+    d = create_sequence_tables(schema)
     db = d['db']
     Sequence = d['Sequence']
     Session = sessionmaker(bind=db)