| « PreviousNext » | |
![]() ![]() ![]() | Did this page help you? Yes | No | Tell us about it... |
This example pipeline definition creates an Amazon EMR cluster that uses Hive to extract data from Apache web logs in Amazon S3 and write it to a CSV file in Amazon S3.
Note
You can process tab-delimited (TSV) data files the same way this sample processes
comma-delimited (CSV) files: change the "type" of the MyInputDataType and
MyOutputDataType objects from "CSV" to "TSV".
{
"objects": [
{
"startDateTime": "2012-05-04T00:00:00",
"id": "MyEmrResourcePeriod",
"period": "1 day",
"type": "Schedule",
"endDateTime": "2012-05-05T00:00:00"
},
{
"id": "MyHiveActivity",
"maximumRetries": "10",
"type": "HiveActivity",
"schedule": {
"ref": "MyEmrResourcePeriod"
},
"runsOn": {
"ref": "MyEmrResource"
},
"input": {
"ref": "MyInputData"
},
"output": {
"ref": "MyOutputData"
},
"hiveScript": "INSERT OVERWRITE TABLE ${output1} select * from ${input1};"
},
{
"schedule": {
"ref": "MyEmrResourcePeriod"
},
"masterInstanceType": "m1.small",
"coreInstanceType": "m1.small",
"enableDebugging": "true",
"keyPair": "test-pair",
"id": "MyEmrResource",
"coreInstanceCount": "1",
"actionOnTaskFailure": "continue",
"maximumRetries": "2",
"type": "EmrCluster",
"actionOnResourceFailure": "retryAll",
"terminateAfter": "10 hour"
},
{
"id": "MyInputData",
"type": "S3DataNode",
"schedule": {
"ref": "MyEmrResourcePeriod"
},
"directoryPath": "s3://test-hive/input",
"dataFormat": {
"ref": "MyInputDataType"
}
},
{
"id": "MyOutputData",
"type": "S3DataNode",
"schedule": {
"ref": "MyEmrResourcePeriod"
},
"directoryPath": "s3://test-hive/output",
"dataFormat": {
"ref": "MyOutputDataType"
}
},
{
"id": "MyOutputDataType",
"type": "CSV",
"column": [
"Name STRING",
"Age STRING",
"Surname STRING"
]
},
{
"id": "MyInputDataType",
"type": "CSV",
"column": [
"Name STRING",
"Age STRING",
"Surname STRING"
]
}
]
}