Commits

Faheem Mitha committed dee4ccc Merge

Branch merge with default.

Comments (0)

Files changed (5)

 
     ./cmload motif mouse12rss
 
-b) Load crossvalidation data.
+b) Load cross-validation data.
 
     ./cmload crossval mouse12rss
 
 c) Load stat data for reference RSS corresponding to specified
-crossvalidation setup.
+cross-validation setup.
 
     ./cmload stat mouse12rss
 
 To use the scripts installed in the system, replace `./init_db` with
 `init_db`, and `./cmload` with `cmload`.
 
-To generate the `figures.pdf` file, containing the figures from the
-corrmodel paper, run either
+To generate the `figures.pdf` file, containing the figures and tables
+from the corrmodel paper, run either
 
     scons figures.pdf
 
 local scripts or the scripts installed in the system. The default is
 the local scripts.
 
+NB: Since the subsets used for cross-validation are chosen randomly,
+and the results in the figures and tables depend on the choice of
+these subsets, these results will differ between runs of the software.
+
 Please send comments and questions on this document to Faheem Mitha at
 faheem@faheem.info.
 
 
 # create the top-level parser
 
-from corrmodel.load import load_motif_dataset, load_crossval_dataset, load_all_motifstat, load_allseq, load_all_motif_tables, load_simdata, get_and_load_simresult, load_all_sim_tables, load_all_sim_tables_parallel, load_jtpval
+from corrmodel.load import load_motif_dataset, load_crossval_dataset, load_all_motifstat, load_allseq, load_all_motif_tables, load_simdata, get_and_load_simresult, load_all_sim_tables, load_all_sim_tables_parallel
 
 import argparse
 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser_motifstat.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
 parser_motifstat.set_defaults(func=load_all_motifstat)
 
-# create the parser for the "jtpval" command
-parser_jtpval = subparsers.add_parser('jtpval', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser_jtpval.add_argument('dataset', help='name of dataset')
-parser_jtpval.add_argument('-n', '--numsim', type=int, help='number of replicates to simulate')
-parser_jtpval.add_argument('motifgroup_id', type=int, help='motifgroup id')
-parser_jtpval.set_defaults(func=load_jtpval)
-
 # create the parser for the "data" command
 parser_data = subparsers.add_parser('data', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser_data.add_argument('dataset', help='name of dataset')

corrmodel/dbschema.py

     def __repr__(self):
         return '<Motifstat %s>'%self.id
 
-class Jtpval(object):
-    def __init__(self, motifgroup_id, pvalue, numsim):
-        self.motifgroup_id = motifgroup_id
-        self.pvalue = pvalue
-        self.numsim = numsim
-    def __repr__(self):
-        return '<Jtpval %s>'%self.id
-
 class Sequencegroup(object):
     def __repr__(self):
         return '<Sequencegroup %s>'%str(self.id)
     mapper(Motifstat, motifstat_table)
     return motifstat_table, Motifstat
 
-def make_jtpval_table(meta, schema, name='jtpval'):
-    jtpval_table = Table(
-        name, meta,
-        Column('id', Integer, primary_key=True),
-        Column('motifgroup_id',  Integer, ForeignKey(schema+'.motifgroup.id', onupdate='CASCADE', ondelete='CASCADE'), index=True),
-        Column('pvalue', Float),
-        Column('numsim', Integer),
-        schema = schema,
-        )
-    mapper(Jtpval, jtpval_table)
-    return jtpval_table, Jtpval
-
 def make_sequencegroup_table(meta, schema, name='sequencegroup'):
     sequencegroup_table = Table(
         name, meta,

corrmodel/load.py

     from utils import file_not_exist, create_engine_wrapper, get_conf, pg_url
     from dbutils import schema_exists
     from sqlalchemy.orm import mapper, relation, sessionmaker
-    from dbschema import make_crossvalgroup_table, make_crossval_table, make_jtpval_table, make_motifstat_table, make_motifstatgroup_table, make_sequencegroup_table, make_motifgroup_table, make_motifgroup_table_index, make_motif_table, make_model_table, make_datagroup_table, make_datasubgroup_table, make_datafile_table, make_data_table
+    from dbschema import make_crossvalgroup_table, make_crossval_table, make_motifstat_table, make_motifstatgroup_table, make_sequencegroup_table, make_motifgroup_table, make_motifgroup_table_index, make_motif_table, make_model_table, make_datagroup_table, make_datasubgroup_table, make_datafile_table, make_data_table
     import cPickle, random, cpplib, getmodel
     confval = create_db()
     dbname = confval["dbname"]
     meta = MetaData()
     crossvalgroup_table, Crossvalgroup = make_crossvalgroup_table(meta, schema)
     crossval_table, Crossval = make_crossval_table(meta, schema)
-    jtpval_table, Jtpval = make_jtpval_table(meta, schema)
     motifstat_table, Motifstat = make_motifstat_table(meta, schema)
     motifstatgroup_table, Motifstatgroup = make_motifstatgroup_table(meta, schema)
     sequencegroup_table, Sequencegroup = make_sequencegroup_table(meta, schema)
     d['meta'] = meta
     d['Crossvalgroup'] = Crossvalgroup
     d['Crossval'] = Crossval
-    d['Jtpval'] = Jtpval
     d['Motifstatgroup'] = Motifstatgroup
     d['Motifstat'] = Motifstat
     d['Sequencegroup'] = Sequencegroup
         load_simresult(args)
     return simresults[0]
 
-@print_timing
-def load_jtpval(args):
-    """
-    This loads pvalues and conditional expectated statistics into the
-    table Motifstat. 'logjtdistlst' is the list of values from the joint
-    distribution simulated by MCMC.
-    """
-    dataset, motifgroup_id, = args.dataset, args.motifgroup_id,
-    import cpplib, cPickle, getmodel, random, time
-    from load import create_motif_tables, arr_from_sequence_table
-    from sqlalchemy.orm import mapper, relation, sessionmaker
-    from utils import get_conf
-    conf = get_conf()
-    if args.numsim is not None:
-        numsim = args.numsim
-    else:
-        numsim = conf["numsim"]
-    d = create_motif_tables(dataset)
-    db = d['db']
-    Jtpval = d['Jtpval']
-    Session = sessionmaker(bind=db)
-    session = Session()
-    arr = arr_from_sequence_table(dataset, motifgroup_id)
-    modval = conf[dataset]["model"]
-    mod = cpplib.cpp_model(arr.shape[1], modval)
-    pvalue = cpplib.cpp_jtpval(arr, mod, numsim)
-    session.add(Jtpval(motifgroup_id, pvalue, numsim))
-    session.commit()
-    session.close()
-    db.dispose()
 
 \setlength{\abovecaptionskip}{0pt}
 
+\DeclareMathOperator{\G}{\mathcal{G}}
+\DeclareMathOperator{\Gc}{\mathcal{G}^{\mathsf{c}}}
+\DeclareMathOperator{\Lt}{\mathcal{L}}
+\DeclareMathOperator{\Lp}{\mathcal{L}^{\prime}}
 \DeclareMathOperator{\Multinom}{Multinom}
 \DeclareMathOperator{\M}{\mathcal{M}}
 \DeclareMathOperator{\p}{\mathbf{p}}
+\DeclareMathOperator{\q}{\mathbf{q}}
 \DeclareMathOperator{\Si}{\mathbf{S}_i}
+\DeclareMathOperator{\set}{\mathbf{S}}
 \DeclareMathOperator{\X}{\mathbf{X}}
 \DeclareMathOperator{\x}{\mathbf{x}}
 \DeclareMathOperator{\Y}{\mathbf{Y}}
 \subsection{Doing prediction on nucleotide sequence data sets}\label{testbio}
 \newcommand{\neglogppcaption}[2] {%
   These plots show how well the model predicts #1 12 RSS
-  sequences. The panels labelled 0 to 4 correspond to 5 different
-  training sets, each $4/5th$ of the RSS. The values calculated are
-  the negative log of the posterior predictive probabilities of all
-  sequences of length 28. Each panel corresponds to a training set,
-  and shows the highest ranked 40 values, excluding the sequences in
-  the training set, with the green lines corresponding to RSS
-  sequences. The panel labelled COMBINED shows the highest ranked 200
-  averaged values, where the values are averaged across the training
-  sets. See also the discussion in Section~\ref{testbio}, and
-  Table~\ref{#2} for numerical results.}
+  sequences. The panels labelled 0 to 4 correspond to 5
+  cross-validation processes comprising one cross-validation
+  round, as discussed in Section~\ref{testbio}.
+  Each panel shows the highest ranked 40 neglogpp values in $\Lp$ with
+  the green lines corresponding to known RSS sequences in $\G$. Table~\ref{#2} gives
+  numerical results for the same cross-validation round.}
 
 \makefigure{neglogpphuman}{Figure5-neglogpphuman}{NEGLOGPPHUMAN}{%
-  \neglogppcaption{human}{neglogpphuman}
+  \neglogppcaption{human}{neglogpphumantab}
   %
 }
 
 \makefigure{neglogppmouse}{Figure6-neglogppmouse}{NEGLOGPPMOUSE}{%
-  \neglogppcaption{mouse}{neglogppmouse}
+  \neglogppcaption{mouse}{neglogppmousetab}
   %
 }