Commits

Flashpoint committed b2d7bdc

updated bluebook_competition.py

Comments (0)

Files changed (1)

bluebook_competition.py

 #                                                     #
 # Using several regressors with stacking              #
 #                                                     #
-# Author: Flashpoint                                  #
+# Author: Vid Jelen                                   #
 #                                                     #
 #######################################################
 
     ###############################################################################
     """Binarize values... e.g. [3,2,1,0] into [0,0,1], [0,1,0], [1,0,0], [0,0,0]"""
     ###############################################################################
-    binarized_columns = [parse_year(train["YearMade"])]
+    binarized_columns = [parse_year(train["YearMade"]), parse_auctioneerID(train["auctioneerID"]), parse_usageband(train["UsageBand"])]
 
     columns = set(train.columns)
     columns.remove("SalesID")
     """Binarize some additional features"""
     #######################################
     columns.remove("YearMade")
+    columns.remove("auctioneerID")
+    columns.remove("UsageBand")
 
     train_fea = get_date_dataframe(train["saledate"])
     test_fea = get_date_dataframe(test["saledate"])
     # Map each distinct string value onto a consecutive integer code and fill the empty values
     for col in columns:
         if train[col].dtype == np.dtype('object'):
-            s = np.unique(train[col].fillna(-1).values)
+            s = np.unique(train[col].fillna(0).values)
             mapping = pd.Series([x[0] for x in enumerate(s)], index = s)
-            train_fea = train_fea.join(train[col].map(mapping).fillna(-1))
-            test_fea = test_fea.join(test[col].map(mapping).fillna(-1))
+            train_fea = train_fea.join(train[col].map(mapping).fillna(0))
+            test_fea = test_fea.join(test[col].map(mapping).fillna(0))
         else:
             train_fea = train_fea.join(train[col].fillna(0))
             test_fea = test_fea.join(test[col].fillna(0))
     pickle.dump((train, train_fea, test_fea), open(pickle_file, "wb"), -1)
 
 else:
-    train, train_fea, test_fea = pickle.load(open(pickle_file, "rb"))
+    train, test, train_fea, test_fea = pickle.load(open(pickle_file, "rb"))
 
 # This actually makes the predictions worse... not worth using
 """Tree-based Feature selection"""
 train_sample = train_fea.drop(rows)
 train_targets = train["SalePrice"].drop(rows)
 
-del train   # free up some memory
-del rows    # free up some memory
+del rows    # for saving memory
 
 #----------------------------------#
 #-----------Stacking---------------#
 """Gradient Boosting Regressor"""
 gbr = GradientBoostingRegressor()
 gbr_predictions = train_predict(train_sample, train_targets, valid_sample, gbr)
+gbr_predictions[gbr_predictions < 0] = np.mean(gbr_predictions[gbr_predictions > 0]) # Set all the negative predictions to the mean prediction
 
 """Multivariate Adaptive Regression Splines - MARS"""
 orange_sample = train_sample.join(train_targets)
 mars_predictions = np.array([mars_regr(x).value for x in orange_ending])
 mars_predictions[mars_predictions < 0] = np.mean(mars_predictions[mars_predictions > 0]) # Set all the negative predictions to the mean prediction
 
-del orange_sample   # free up some memory
-del orange_data     # free up some memory
-del orange_target   # free up some memory
-del orange_ending   # free up some memory
+del orange_sample   # for saving memory
+del orange_data     # for saving memory
+del orange_target   # for saving memory
+del orange_ending   # for saving memory
 
 # SVM takes too much time to train
 #"""Support Vector Machine Regression"""
 error = rmsle(valid_targets, rf_predictions)
 print("Random Forests regression RMSLE: %f" % error)
 
-del error   # free up some memory
+del error # for saving memory
 
 # Create the level-1 dataset from level-0 predictions
-level_1_data = np.array([lr_predictions, brr_predictions,lass_predictions, knn_predictions, dt_predictions, gbr_predictions, mars_predictions, rf_predictions]).transpose() #mars_predictions
+level_1_data = np.array([rf_predictions]).transpose()#lr_predictions, brr_predictions,lass_predictions, knn_predictions, dt_predictions, gbr_predictions, mars_predictions, rf_predictions]).transpose()
 
 # Create a level-1 stacking generalizer
 l1_rf = RandomForestRegressor(n_estimators=50, n_jobs=-1)
 l1_rf.fit(level_1_data, valid_targets)
 
-del level_1_data    # free up some memory
-del train_sample    # free up some memory
-del train_targets   # free up some memory
-del valid_sample    # free up some memory
-del valid_targets   # free up some memory
+del level_1_data    # for saving memory
+del train_sample    # for saving memory
+del train_targets   # for saving memory
+del valid_sample    # for saving memory
+del valid_targets   # for saving memory
+
+mars_ending = convert_dataframe_to_orange(test_fea)
+
+# Fit the regressors on 100% of the training dataset
+regr.fit(train_fea, train["SalePrice"])
+brr.fit(train_fea, train["SalePrice"])
+lass.fit(train_fea, train["SalePrice"])
+knn.fit(train_fea, train["SalePrice"])
+dt.fit(train_fea, train["SalePrice"])
+gbr.fit(train_fea, train["SalePrice"])
+rf.fit(train_fea, train["SalePrice"])
+orange_sample = train_fea.join(train["SalePrice"])
+orange_data = convert_dataframe_to_orange(orange_sample)
+mars_regr = earth.EarthLearner(orange_data, terms=30, degree=2, penalty=1.0)
+
+del train # for saving memory
 
 # Now use the regressors on the test dataset
-final_mars = np.array([mars_final_regr(x).value for x in mars_ending])
+final_mars = np.array([mars_regr(x).value for x in mars_ending])
 final_regr = regr.predict(test_fea)
 final_brr = brr.predict(test_fea)
 final_lass = lass.predict(test_fea)
 final_dt = dt.predict(test_fea)
 final_gbr = gbr.predict(test_fea)
 final_rf = rf.predict(test_fea)
-final_regr[final_regr < 0] = np.mean(final_regr[final_regr > 0]) # Set all the negative predictions to the mean prediction
-final_brr[final_brr < 0] = np.mean(final_brr[final_brr > 0]) # Set all the negative predictions to the mean prediction
-final_lass[final_lass < 0] = np.mean(final_lass[final_lass > 0]) # Set all the negative predictions to the mean prediction
-final_mars[final_mars < 0] = np.mean(final_mars[final_mars > 0]) # Set all the negative predictions to the mean prediction
+final_gbr[final_gbr < 0] = np.mean(final_gbr[final_gbr > 0])        # Set all the negative predictions to the mean prediction
+final_regr[final_regr < 0] = np.mean(final_regr[final_regr > 0])    # Set all the negative predictions to the mean prediction
+final_brr[final_brr < 0] = np.mean(final_brr[final_brr > 0])        # Set all the negative predictions to the mean prediction
+final_lass[final_lass < 0] = np.mean(final_lass[final_lass > 0])    # Set all the negative predictions to the mean prediction
+final_mars[final_mars < 0] = np.mean(final_mars[final_mars > 0])    # Set all the negative predictions to the mean prediction
 
-predictions = np.array([final_regr, final_brr, final_lass, final_knn, final_dt, final_gbr, final_mars, final_rf]).transpose() #final_mars
+predictions = np.array([final_rf]).transpose()#final_regr, final_brr, final_lass, final_knn, final_dt, final_gbr, final_mars, final_rf]).transpose()
 
 # Use the level-1 stacking generalizer on the test data predictions to make final predictions
 final_predictions = l1_rf.predict(predictions)
 
 test = test.join(pd.DataFrame({"SalePrice": final_predictions}))
-test.ix[test.SalePrice < 0, "SalePrice"] = np.mean(test["SalePrice"]) #Set all the negative predictions to the mean prediction
+test.ix[test.SalePrice < 0, "SalePrice"] = np.mean(test.ix[test.SalePrice > 0, "SalePrice"]) #Set all the negative predictions to the mean prediction
 test[["SalesID", "SalePrice"]].to_csv(out_file, index=False)
 
 """Used for timing purposes"""
 print("\nSeconds elapsed: {}".format(time.time() - start))
+