AWS Data Pipeline
Developer Guide (API Version 2012-10-29)

Attach EBS volumes to cluster nodes

You can attach EBS volumes to any type of node in the EMR cluster within your pipeline. To attach EBS volumes to nodes, use the coreEbsConfiguration, masterEbsConfiguration, and taskEbsConfiguration fields in your EmrCluster configuration.

The following example Amazon EMR cluster uses Amazon EBS volumes for its master, task, and core nodes. For more information, see Amazon EBS volumes in Amazon EMR in the Amazon EMR Management Guide.

These configurations are optional. You can use them in any pipeline that uses an EmrCluster object.

In the pipeline, choose the EmrCluster object configuration, choose Master EBS Configuration, Core EBS Configuration, or Task EBS Configuration, and then enter configuration details similar to the following example.

{
  "objects": [
    {
      "id": "TableBackupActivity",
      "name": "TableBackupActivity",
      "type": "EmrActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "output": {
        "ref": "S3BackupLocation"
      },
      "step": "s3://dynamodb-emr-#{myDDBRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}",
      "maximumRetries": "2",
      "resizeClusterBeforeRunning": "false"
    },
    {
      "id": "DDBSourceTable",
      "name": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}",
      "readThroughputPercent": "#{myDDBReadThroughputRatio}"
    },
    {
      "id": "S3BackupLocation",
      "name": "S3BackupLocation",
      "type": "S3DataNode",
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}"
    },
    {
      "id": "EmrClusterForBackup",
      "name": "EmrClusterForBackup",
      "type": "EmrCluster",
      "region": "#{myDDBRegion}",
      "releaseLabel": "emr-4.7.0",
      "keyPair": "user-key-pair",
      "subnetId": "#{mySubnetId}",
      "emrManagedMasterSecurityGroupId": "#{myMasterSecurityGroup}",
      "emrManagedSlaveSecurityGroupId": "#{mySlaveSecurityGroup}",
      "masterInstanceType": "m4.xlarge",
      "coreInstanceType": "m4.xlarge",
      "coreInstanceCount": "1",
      "taskInstanceType": "m4.xlarge",
      "taskInstanceCount": "1",
      "masterEbsConfiguration": {
        "ref": "EBSConfiguration"
      },
      "coreEbsConfiguration": {
        "ref": "EBSConfiguration"
      },
      "taskEbsConfiguration": {
        "ref": "EBSConfiguration"
      }
    },
    {
      "id": "EBSConfiguration",
      "name": "EBSConfiguration",
      "type": "EbsConfiguration",
      "ebsOptimized": "true",
      "ebsBlockDeviceConfig": [
        {
          "ref": "EbsBlockDeviceConfig"
        }
      ]
    },
    {
      "id": "EbsBlockDeviceConfig",
      "name": "EbsBlockDeviceConfig",
      "type": "EbsBlockDeviceConfig",
      "volumesPerInstance": "2",
      "volumeSpecification": {
        "ref": "VolumeSpecification"
      }
    },
    {
      "id": "VolumeSpecification",
      "name": "VolumeSpecification",
      "type": "VolumeSpecification",
      "volumeType": "io1",
      "sizeInGB": "500",
      "iops": "1000"
    },
    {
      "id": "Default",
      "name": "Default",
      "scheduleType": "ONDEMAND",
      "failureAndRerunMode": "CASCADE",
      "role": "DataPipelineDefaultRole",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "pipelineLogUri": "#{myPipelineLogUri}"
    }
  ],
  "parameters": [
    {
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey",
      "description": "Output S3 folder"
    },
    {
      "id": "myDDBTableName",
      "type": "String",
      "description": "Source DynamoDB table name"
    },
    {
      "id": "myDDBReadThroughputRatio",
      "type": "Double",
      "description": "DynamoDB read throughput ratio",
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0"
    },
    {
      "id": "myDDBRegion",
      "type": "String",
      "description": "Region of the DynamoDB table",
      "default": "us-east-1",
      "watermark": "us-east-1"
    }
  ],
  "values": {
    "myDDBRegion": "us-east-1",
    "myDDBTableName": "ddb_table",
    "myDDBReadThroughputRatio": "0.25",
    "myOutputS3Loc": "s3://s3_path",
    "mySubnetId": "subnet_id",
    "mySlaveSecurityGroup": "slave security group",
    "myMasterSecurityGroup": "master security group",
    "myPipelineLogUri": "s3://s3_path"
  }
}