Commits

Olivier Grisel  committed a85ac32

factorise out the method to build training sets out of brain history

  • Participants
  • Parent commits 7a7e9b3
  • Branches sgd

Comments (0)

Files changed (1)

File src/gumby/brain.py

             if True, use cross validation to perform parameter
             estimation
 
+        ``max_sample_size``:
+            if not None and positive integer, randomly sample a subset with
+            the given size to feed it to the learner so as to ensure bounded
+            training time for instance by trading it for optimization error
+
+        ``do_shuffle``:
+            if True, shuffle the data set before feeding it to the model (some
+            models, such as stochastic learners, do not like datasets with
+            high sequential dependency)
+
+            TODO: verify this claim in a test case
+
         ``nr_folds``:
             number of folds to use in cross validations performed during
             parameters estimation
         """
         t0 = time()
         # collect data to build the training set
-        # TODO: factor out in an external method?
+        m_inputs, m_outputs = self._build_training_set(
+            max_sample_size, do_shuffle, **kw)
+
+        # perform the actual model training
+        if find_parameters:
+            self.model.find_parameters(m_inputs, m_outputs, **kw)
+        self.model.train(m_inputs, m_outputs, epochs=3,
+                         compute_loss=True, stopping_tol=1e-4)
+        self.logger.info("training on layer %d on %d data points [%0.3fs]",
+                         id(self), len(m_inputs), time()-t0)
+
+    def _build_training_set(self, max_sample_size=None, do_shuffle=True, **kw):
+        """Build the training set out of the recorded histories
+
+        ``max_sample_size``:
+            if not None and a positive integer, randomly sample a subset with
+            the given size so as to ensure bounded training time
+
+        ``do_shuffle``:
+            if True, shuffle the data set before returning it
+        """
         m_outputs = []
         m_inputs = []
         for t in xrange(self.history_size - self.temporal_neighborhood - 1):
         if do_shuffle:
             m_inputs, m_outputs = shuffle(m_inputs, m_outputs)
 
-        # perform the actual model training
-        if find_parameters:
-            self.model.find_parameters(m_inputs, m_outputs, **kw)
-        self.model.train(m_inputs, m_outputs, epochs=3,
-                         compute_loss=True, stopping_tol=1e-4)
-        self.logger.info("training on layer %d on %d data points [%0.3fs]",
-                         id(self), len(m_inputs), time()-t0)
+        return m_inputs, m_outputs
 
     def _build_model_input(self, position, time_origin=0):
         """Helper method to build an input vector for the model"""