Snippets

Aerobotics SageMaker hyperparameter tuning job config for optimising TensorFlow CPU parallelism settings

Created by Michael Malahe
{
  "HyperParameterTuningJobName": "job-name",
  "HyperParameterTuningJobConfig": {
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "Type": "Minimize",
      "MetricName": "train_duration"
    },
    "ParameterRanges": {
      "IntegerParameterRanges": [
        {
          "Name": "inter_op_parallelism_threads",
          "MinValue": "8",
          "MaxValue": "48",
          "ScalingType": "Auto"
        },
        {
          "Name": "intra_op_parallelism_threads",
          "MinValue": "1",
          "MaxValue": "4",
          "ScalingType": "Auto"
        },
        {
          "Name": "n_parallel_fetch",
          "MinValue": "1",
          "MaxValue": "16",
          "ScalingType": "Auto"
        },
        {
          "Name": "n_parallel_proc",
          "MinValue": "1",
          "MaxValue": "16",
          "ScalingType": "Auto"
        },
        {
          "Name": "n_prefetch_fetch",
          "MinValue": "16",
          "MaxValue": "48",
          "ScalingType": "Auto"
        },
        {
          "Name": "n_prefetch_proc",
          "MinValue": "16",
          "MaxValue": "48",
          "ScalingType": "Auto"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 128,
      "MaxParallelTrainingJobs": 3
    }
  },
  "TrainingJobDefinition": {
    "StaticHyperParameters": {
      "dataset_filename": "s3://bucket-name/train",
      "val_dataset_filename": "s3://bucket-name/val"
    },
    "AlgorithmSpecification": {
      "TrainingImage": "xxxx.dkr.ecr.us-east-1.amazonaws.com/training/image",
      "TrainingInputMode": "File",
      "MetricDefinitions": [
        {
          "Name": "train_duration",
          "Regex": "- train_duration=(.*?) -"
        }
      ]
    },
    "RoleArn": "arn:aws:iam::xxxxxx:role/service-role/AmazonSageMaker-ExecutionRole-xxxx",
    "OutputDataConfig": {
      "S3OutputPath": "s3://bucket-name/training-outputs/"
    },
    "ResourceConfig": {
      "InstanceType": "ml.p3.2xlarge",
      "InstanceCount": 1,
      "VolumeSizeInGB": 1
    },
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 1800
    }
  }
}

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.