HTTPS SSH

What is this about?


An effort to standardize ML experiments and to move away from script-like code that dumps results to files, towards a structured framework that stores results in a database. Experiments can then be repeated and/or resumed with the same parameters and in the same way, and their results compared with a simple query.

The tools (so far)


The basics are:

Python, the scikit stack, pandas and pymodm for the models and the connection to MongoDB. (SQLAlchemy was initially employed, but as the project took form it became evident that document-like storage was a better fit.)

A bit about the architecture


Is it functional?


Well, it is semi-functional. There are many cases to be considered architecturally, e.g. proper feature extraction and selection, different types of learning etc.

Give us an example


import pandas as pd

from experiments_suite.db.mongo import conn
from experiments_suite.models.config_mongo import (
    AlgorithmConfig,
    DatasetConfig,
    ExperimentSuiteConfig,
    TransformerConfig,
    VectorizerConfig,
)
from experiments_suite.models.dataset import DataSet
from experiments_suite.models.experiment_suite import ExperimentSuite
from experiments_suite.utils import ClsType

# define a custom train/test split function: returns (train_df, test_df, None, None)
# NOTE: the function's source is serialized into DatasetConfig.str_split and may be
# re-executed later, so it must be self-contained -- hence the local imports.
def split_df(df):
    import numpy as np
    import pandas as pd  # local import so the serialized source stands on its own
    # shuffle each class separately and mark ~75% of its rows as training data
    malicious = df[df["class"] == -1].sample(frac=1)
    malicious["is_train"] = np.random.uniform(0, 1, len(malicious)) <= .75
    normal = df[df["class"] == 1].sample(frac=1)
    normal["is_train"] = np.random.uniform(0, 1, len(normal)) <= .75
    df = pd.concat([malicious, normal], ignore_index=True)
    df = df.sample(frac=1)  # shuffle the combined frame

    # boolean masks instead of the unidiomatic `== True` / `== False` comparisons
    return df[df["is_train"]], df[~df["is_train"]], None, None

# just a dummy dataset as a pandas DataFrame - it is important to have a column
# that holds the class when we are considering supervised solutions
df = pd.DataFrame([[1, 2, 3], [5, 6, 7], [5, 6, 7], [2, 4, 3], [5, 5, 7]], columns=["A", "B", "class"])

# set the dataset once
dataset = DataSet(df)

# here you can configure the class column (if any) and the train/test split:
# either a fraction of the data or a custom split function
dataset_config = DatasetConfig(class_col="class", split=0.9)
dataset_config_custom_split = DatasetConfig(class_col="class", split=split_df)
# configure the algorithms to be compared
cls_config = AlgorithmConfig(alg_type=ClsType.LinearSvm)
cls_config2 = AlgorithmConfig(alg_type=ClsType.KMeans, alg_kwargs={"n_clusters": 2})
cls_config3 = AlgorithmConfig(alg_type=ClsType.Perceptron)
vec_config = VectorizerConfig()  # the way the features are going to be vectorized - if necessary
transformer_config = TransformerConfig()  # feature transformation step (placeholder for now)
# still under construction: a way to perform feature extraction
# feature_config = FeatureExtractionConfig(column_names=[''])  # the feature extraction process

experiment_suite_cfg = ExperimentSuiteConfig()
experiment_suite_cfg.alg_configs = [cls_config, cls_config2, cls_config3]
experiment_suite_cfg.vec_configs = [vec_config]
experiment_suite_cfg.ds_configs = [dataset_config, dataset_config_custom_split]
# no feature extraction yet: FeatureExtractionConfig is under construction,
# so referencing the (commented-out) feature_config here would raise NameError
experiment_suite_cfg.feature_extraction_configs = []
experiment_suite_cfg.transformation_configs = [transformer_config]
experiment_suite = ExperimentSuite(config=experiment_suite_cfg)

# start the process
experiment_suite.start(dataset)
# save experiments and dataset
experiment_suite.save()
dataset.save()

Experiment Suite is saved in this form:

{
    "_id" : ObjectId("58b996d854bbed5223492f7c"),
    "experiments" : [ 
        ObjectId("58b996d954bbed5242492f7e"), 
        ObjectId("58b996d954bbed5245492f7e"), 
        ObjectId("58b996d954bbed5246492f7e"), 
        ObjectId("58b996d954bbed524b492f7e"), 
        ObjectId("58b996da54bbed5242492f81"), 
        ObjectId("58b996da54bbed5245492f81")
    ],
    "config" : {
        "_id" : ObjectId("58b996d854bbed5223492f7b"),
        "alg_configs" : [ 
            {
                "_id" : ObjectId("58b996d854bbed5223492f74"),
                "alg_type" : 0,
                "alg_args" : null,
                "alg_kwargs" : null,
                "_cls" : "experiments_suite.models.config_mongo.AlgorithmConfig"
            }, 
            {
                "_id" : ObjectId("58b996d854bbed5223492f75"),
                "alg_type" : 202,
                "alg_args" : null,
                "alg_kwargs" : {
                    "n_clusters" : 2
                },
                "_cls" : "experiments_suite.models.config_mongo.AlgorithmConfig"
            }, 
            {
                "_id" : ObjectId("58b996d854bbed5223492f76"),
                "alg_type" : 16,
                "alg_args" : null,
                "alg_kwargs" : null,
                "_cls" : "experiments_suite.models.config_mongo.AlgorithmConfig"
            }
        ],
        "feature_extraction_configs" : [],
        "ds_configs" : [ 
            {
                "_id" : ObjectId("58b996d854bbed5223492f77"),
                "class_col" : "class",
                "str_split" : "def split_df(df):\n    import numpy as np\n    malicious = df[df[\"class\"] == -1].sample(frac=1)\n    malicious[\"is_train\"] = np.random.uniform(0, 1, len(malicious)) <= .75\n    normal = df[df[\"class\"] == 1].sample(frac=1)\n    normal[\"is_train\"] = np.random.uniform(0, 1, len(normal)) <= .75\n    df = pd.concat([malicious, normal], ignore_index=True)\n    df = df.sample(frac=1)  # shuffle\n\n    return df[df[\"is_train\"] == True], df[df[\"is_train\"] == False], None, None\n",
                "shuffle" : true,
                "cv" : -1,
                "_cls" : "experiments_suite.models.config_mongo.DatasetConfig"
            }, 
            {
                "_id" : ObjectId("58b996d854bbed5223492f78"),
                "class_col" : "class",
                "str_split" : "0.9",
                "shuffle" : true,
                "cv" : -1,
                "_cls" : "experiments_suite.models.config_mongo.DatasetConfig"
            }
        ],
        "transformation_configs" : [ 
            {
                "_id" : ObjectId("58b996d854bbed5223492f79"),
                "_cls" : "experiments_suite.models.config_mongo.TransformerConfig"
            }
        ],
        "resume" : true,
        "auto_save" : true,
        "parallelize" : true,
        "_cls" : "experiments_suite.models.config_mongo.ExperimentSuiteConfig"
    },
    "parameters" : [ 
        {
            "alg_config" : {
                "alg_type" : 0,
                "alg_args" : null,
                "alg_kwargs" : null
            },
            "transformer_config" : {
                "vec_type" : 1
            },
            "ds_config" : {
                "class_col" : "class",
                "shuffle" : true,
                "split" : "def split_df(df):\n    import numpy as np\n    malicious = df[df[\"class\"] == -1].sample(frac=1)\n    malicious[\"is_train\"] = np.random.uniform(0, 1, len(malicious)) <= .75\n    normal = df[df[\"class\"] == 1].sample(frac=1)\n    normal[\"is_train\"] = np.random.uniform(0, 1, len(normal)) <= .75\n    df = pd.concat([malicious, normal], ignore_index=True)\n    df = df.sample(frac=1)  # shuffle\n\n    return df[df[\"is_train\"] == True], df[df[\"is_train\"] == False], None, None\n",
                "cv" : -1
            }
        }, 
        {
            "alg_config" : {
                "alg_type" : 0,
                "alg_args" : null,
                "alg_kwargs" : null
            },
            "transformer_config" : {
                "vec_type" : 1
            },
            "ds_config" : {
                "class_col" : "class",
                "shuffle" : true,
                "split" : "0.9",
                "cv" : -1
            }
        }, 
        {
            "alg_config" : {
                "alg_type" : 202,
                "alg_args" : null,
                "alg_kwargs" : {
                    "n_clusters" : 2
                }
            },
            "transformer_config" : {
                "vec_type" : 1
            },
            "ds_config" : {
                "class_col" : "class",
                "shuffle" : true,
                "split" : "def split_df(df):\n    import numpy as np\n    malicious = df[df[\"class\"] == -1].sample(frac=1)\n    malicious[\"is_train\"] = np.random.uniform(0, 1, len(malicious)) <= .75\n    normal = df[df[\"class\"] == 1].sample(frac=1)\n    normal[\"is_train\"] = np.random.uniform(0, 1, len(normal)) <= .75\n    df = pd.concat([malicious, normal], ignore_index=True)\n    df = df.sample(frac=1)  # shuffle\n\n    return df[df[\"is_train\"] == True], df[df[\"is_train\"] == False], None, None\n",
                "cv" : -1
            }
        }, 
        {
            "alg_config" : {
                "alg_type" : 202,
                "alg_args" : null,
                "alg_kwargs" : {
                    "n_clusters" : 2
                }
            },
            "transformer_config" : {
                "vec_type" : 1
            },
            "ds_config" : {
                "class_col" : "class",
                "shuffle" : true,
                "split" : "0.9",
                "cv" : -1
            }
        }, 
        {
            "alg_config" : {
                "alg_type" : 16,
                "alg_args" : null,
                "alg_kwargs" : null
            },
            "transformer_config" : {
                "vec_type" : 1
            },
            "ds_config" : {
                "class_col" : "class",
                "shuffle" : true,
                "split" : "def split_df(df):\n    import numpy as np\n    malicious = df[df[\"class\"] == -1].sample(frac=1)\n    malicious[\"is_train\"] = np.random.uniform(0, 1, len(malicious)) <= .75\n    normal = df[df[\"class\"] == 1].sample(frac=1)\n    normal[\"is_train\"] = np.random.uniform(0, 1, len(normal)) <= .75\n    df = pd.concat([malicious, normal], ignore_index=True)\n    df = df.sample(frac=1)  # shuffle\n\n    return df[df[\"is_train\"] == True], df[df[\"is_train\"] == False], None, None\n",
                "cv" : -1
            }
        }, 
        {
            "alg_config" : {
                "alg_type" : 16,
                "alg_args" : null,
                "alg_kwargs" : null
            },
            "transformer_config" : {
                "vec_type" : 1
            },
            "ds_config" : {
                "class_col" : "class",
                "shuffle" : true,
                "split" : "0.9",
                "cv" : -1
            }
        }
    ],
    "date_created" : ISODate("2017-03-03T16:16:24.632Z"),
    "last_updated" : ISODate("2017-03-03T16:16:24.632Z"),
    "_cls" : "experiments_suite.models.experiment_suite.ExperimentSuite"
}

An example of how experiments are saved:

{
    "_id" : ObjectId("58b996d954bbed5242492f7e"),
    "dataset" : ObjectId("58b996d854bbed5223492f7a"),
    "config" : {
        "_id" : ObjectId("58b996d954bbed5242492f7d"),
        "memory_limit" : -1,
        "alg_config" : {
            "_id" : ObjectId("58b996d854bbed5223492f74"),
            "alg_type" : 0,
            "alg_args" : null,
            "alg_kwargs" : null,
            "_cls" : "experiments_suite.models.config_mongo.AlgorithmConfig"
        },
        "feature_extraction_config" : null,
        "vec_config" : null,
        "transformer_config" : {
            "_id" : ObjectId("58b996d854bbed5223492f79"),
            "_cls" : "experiments_suite.models.config_mongo.TransformerConfig"
        },
        "ds_config" : {
            "_id" : ObjectId("58b996d854bbed5223492f77"),
            "class_col" : "class",
            "str_split" : "def split_df(df):\n    import numpy as np\n    malicious = df[df[\"class\"] == -1].sample(frac=1)\n    malicious[\"is_train\"] = np.random.uniform(0, 1, len(malicious)) <= .75\n    normal = df[df[\"class\"] == 1].sample(frac=1)\n    normal[\"is_train\"] = np.random.uniform(0, 1, len(normal)) <= .75\n    df = pd.concat([malicious, normal], ignore_index=True)\n    df = df.sample(frac=1)  # shuffle\n\n    return df[df[\"is_train\"] == True], df[df[\"is_train\"] == False], None, None\n",
            "shuffle" : true,
            "cv" : -1,
            "_cls" : "experiments_suite.models.config_mongo.DatasetConfig"
        },
        "steps" : null,
        "_cls" : "experiments_suite.models.config_mongo.ExperimentConfig"
    },
    "metrics" : {
        "pk" : ObjectId("58b996d954bbed5242492f7f"),
        "predictions" : [ /*shortened to be easier to view here */
            1, 
            1, 
            1, 
            -1, 
            -1, 
            1, 
            1
        ],
        "ground_truth" : [ /*shortened to be easier to view here */
            1, 
            1, 
            -1, 
            -1, 
            -1, 
            1, 
            1
        ],
        "accuracy" : 0.860902255639098,
        "precision" : 0.876228667510561,
        "recall" : 0.860902255639098,
        "f_score" : 0.867849638715144,
        "cosine_similarity" : 0.721804511278195,
        "mse" : 0.556390977443609,
        "classification_report" : "             precision    recall  f1-score   support\n\n         -1       0.35      0.44      0.39        27\n          1       0.94      0.91      0.92       239\n\navg / total       0.88      0.86      0.87       266\n",
        "classification_report_as_list" : [ 
            [ 
                -1, 
                0.353, 
                0.444, 
                0.393, 
                27
            ], 
            [ 
                1, 
                0.935, 
                0.908, 
                0.921, 
                239
            ]
        ],
        "_cls" : "experiments_suite.models.metrics.BasicMetrics"
    },
    "_cls" : "experiments_suite.models.experiment.Experiment"
}

Resuming

# we can then retrieve the suite that contains a given experiment,
# querying by any of the stored fields (here: an experiment's primary key)
retrieved = ExperimentSuite.objects.raw({"experiments": experiment_suite.experiments[0].pk})

An example of resuming experiments with the same configuration - say something went wrong:

# reset things: drop every stored suite, experiment and dataset
ExperimentSuite.objects.delete()
Experiment.objects.delete()
DataSet.objects.delete()
... 
... 
...

# use interruptingcow to simulate an interruption mid-run
from interruptingcow import timeout
try:
    with timeout(2, exception=RuntimeError):
        experiment_suite.start(dataset)
except RuntimeError:
    # Logger.warn is a deprecated alias; warning() is the supported spelling
    log.warning("Moooo!")
experiment_suite.save()
dataset.save()

assert ExperimentSuite.objects.all().count() == 1  # sanity check
assert Experiment.objects.all().count() >= 1  # at least one experiment was saved before interruptingcow interrupted :)
retrieved = ExperimentSuite.objects.raw({"experiments": experiment_suite.experiments[0].pk})
assert len(retrieved.first().experiments) < 6  # check that experiments have not completed
e_suite = retrieved.first()
assert e_suite.config.resume  # sanity check: resuming must be enabled
# start again using the retrieved suite
e_suite.start(dataset)
e_suite.save()
assert len(e_suite.experiments) == 6  # all experiments completed after resuming

What's yet to be done and timeframe


Since this is a side project, the timeframe for now is nonexistent.

So, aside from functional FeatureExtraction configuration and implementation, it would be fun to try dask.