There are more AWS SDK examples available in the AWS Doc SDK Examples GitHub repo.
Getting started with Amazon Textract
The following code example shows how to:
Create an S3 bucket
Upload a document to S3
Clean up resources
- Bash
- AWS CLI with Bash script
Note
There's more on GitHub. Find the complete example and learn how to set up and run in the Sample developer tutorials
repository.
#!/bin/bash
#
# Amazon Textract Getting Started Tutorial Script
#
# Demonstrates how to use Amazon Textract to analyze document text:
#   1. create an S3 bucket (or reuse the shared "tutorial-prereqs-bucket" one),
#   2. generate a sample PNG if ./document.png does not exist,
#   3. upload the document to S3,
#   4. run `aws textract analyze-document` and print a block-type summary,
#   5. delete the S3 object, the bucket (if this script created it) and the
#      temporary JSON parameter files.
#
# Requires: aws CLI (configured), openssl, base64; ImageMagick `convert` is
# optional. Writes textract-tutorial.log and textract-analysis-results.json
# to the current directory.

set -euo pipefail

# Set up logging with restricted permissions
LOG_FILE="textract-tutorial.log"
touch "$LOG_FILE"
chmod 600 "$LOG_FILE"
exec > >(tee -a "$LOG_FILE") 2>&1

echo "==================================================="
echo "Amazon Textract Getting Started Tutorial"
echo "==================================================="
echo "This script will guide you through using Amazon Textract to analyze document text."
echo ""

# Set to true once this run has actually uploaded the document, so the error
# cleanup never deletes a pre-existing object from a shared bucket.
DOCUMENT_UPLOADED=false

#######################################
# Masks credential-looking fragments in command output before it is
# echoed (and therefore logged).
# Inputs:  text on stdin
# Outputs: redacted text on stdout
#######################################
redact_secrets() {
  # -E (ERE) is used because `\|` alternation in a basic regex is a GNU
  # extension and does not work with BSD sed.
  sed -E 's/(aws_secret_access_key|Authorization|X-Amz-Security-Token|Token).*/\1=***REDACTED***/g'
}

#######################################
# Prints how many times a fixed string occurs on stdin.
# Arguments: $1 - literal string to count
# Outputs:   a single integer on stdout (0 when there is no match)
#######################################
count_occurrences() {
  local needle=$1
  # grep exits 1 when nothing matches; under pipefail that would fail the
  # pipeline even though "0" is a perfectly valid answer.
  { grep -oF -- "$needle" || true; } | wc -l | tr -d '[:space:]'
}

#######################################
# Aborts the script when a command failed.
# A command counts as failed when its exit code is non-zero or when its
# output contains the word "error" (case-insensitive heuristic).
# Arguments: $1 - exit code, $2 - captured output, $3 - command description
# Returns:   0 when the command succeeded; otherwise exits with status 1.
#            Resource cleanup is performed by the EXIT trap.
#######################################
check_error() {
  local exit_code=$1
  local output=$2
  local cmd=$3

  if [[ "$exit_code" -ne 0 ]] || grep -qi "error" <<<"$output"; then
    echo "ERROR: Command failed: $cmd"
    redact_secrets <<<"$output"
    # No explicit cleanup call here: `exit` fires the EXIT trap, and calling
    # cleanup_on_error directly as well would run the cleanup twice.
    exit 1
  fi
}

#######################################
# Deletes every object version and delete marker in a bucket.
# Needed because versioning is enabled on the tutorial bucket and
# `aws s3 rb --force` does not remove versioned objects.
# Best effort: failures are reported but never abort the caller.
# Arguments: $1 - bucket name
#######################################
purge_object_versions() {
  local bucket=$1
  local kind listing key version_id

  for kind in Versions DeleteMarkers; do
    listing=$(aws s3api list-object-versions --bucket "$bucket" \
      --query "${kind}[].[Key,VersionId]" --output text 2>/dev/null) || continue
    while IFS=$'\t' read -r key version_id; do
      # An empty result is rendered as the single word "None".
      if [[ -z "$key" || "$key" == "None" || -z "$version_id" ]]; then
        continue
      fi
      aws s3api delete-object --bucket "$bucket" --key "$key" \
        --version-id "$version_id" > /dev/null 2>&1 \
        || echo "Failed to delete version $version_id of $key"
    done <<<"$listing"
  done
  return 0
}

#######################################
# Cleans up resources on error. Installed as the EXIT trap, so it runs on
# every abnormal exit; the trap is removed once the script has succeeded.
# Globals: DOCUMENT_NAME, BUCKET_NAME, BUCKET_IS_SHARED, DOCUMENT_UPLOADED
#######################################
cleanup_on_error() {
  echo "Error encountered. Cleaning up resources..."

  # Clean up temporary JSON files
  rm -f document.json features.json

  if [[ "${DOCUMENT_UPLOADED:-false}" == "true" \
    && -n "${DOCUMENT_NAME:-}" && -n "${BUCKET_NAME:-}" ]]; then
    echo "Deleting document from S3..."
    aws s3 rm "s3://${BUCKET_NAME}/${DOCUMENT_NAME}" || echo "Failed to delete document"
  fi

  if [[ -n "${BUCKET_NAME:-}" && "${BUCKET_IS_SHARED:-false}" == "false" ]]; then
    echo "Deleting S3 bucket..."
    purge_object_versions "$BUCKET_NAME"
    aws s3 rb "s3://${BUCKET_NAME}" --force || echo "Failed to delete bucket"
  fi
}

# Set up trap for cleanup on exit
trap cleanup_on_error EXIT

# NOTE: throughout the script exit codes are captured with `|| STATUS=$?`.
# A plain `VAR=$(cmd); STATUS=$?` would never reach the second statement on
# failure, because `set -e` aborts at the failing assignment.

# Verify AWS CLI is installed and configured
echo "Verifying AWS CLI configuration..."
if ! command -v aws &> /dev/null; then
  echo "ERROR: AWS CLI is not installed."
  exit 1
fi

AWS_CONFIG_STATUS=0
AWS_CONFIG_OUTPUT=$(aws configure list 2>&1) || AWS_CONFIG_STATUS=$?
if [[ "$AWS_CONFIG_STATUS" -ne 0 ]]; then
  echo "ERROR: AWS CLI is not properly configured."
  redact_secrets <<<"$AWS_CONFIG_OUTPUT"
  exit 1
fi

# Verify AWS region is configured. `aws configure get` exits non-zero when
# the value is not set, so guard it and fall back to the region environment
# variables before giving up.
CONFIGURED_REGION=$(aws configure get region 2>/dev/null) || CONFIGURED_REGION=""
AWS_REGION="${CONFIGURED_REGION:-${AWS_REGION:-${AWS_DEFAULT_REGION:-}}}"
if [[ -z "$AWS_REGION" ]]; then
  echo "ERROR: No AWS region configured. Please run 'aws configure' to set a default region."
  exit 1
fi

# Check that the installed CLI knows the textract command
echo "Checking if Amazon Textract is available in region $AWS_REGION..."
TEXTRACT_CHECK_STATUS=0
aws textract help > /dev/null 2>&1 || TEXTRACT_CHECK_STATUS=$?
if [[ "$TEXTRACT_CHECK_STATUS" -ne 0 ]]; then
  echo "ERROR: Amazon Textract may not be available in region $AWS_REGION."
  exit 1
fi

# Generate a random identifier for S3 bucket
RANDOM_ID=$(openssl rand -hex 6)

# Check for shared prereq bucket (absence of the stack is not an error)
PREREQ_BUCKET=$(aws cloudformation describe-stacks --stack-name tutorial-prereqs-bucket \
  --query 'Stacks[0].Outputs[?OutputKey==`BucketName`].OutputValue' --output text 2>/dev/null || echo "")
if [[ -n "$PREREQ_BUCKET" && "$PREREQ_BUCKET" != "None" ]]; then
  BUCKET_NAME="$PREREQ_BUCKET"
  BUCKET_IS_SHARED=true
  echo "Using shared bucket: $BUCKET_NAME"
else
  BUCKET_IS_SHARED=false
  BUCKET_NAME="textract-${RANDOM_ID}"
fi

DOCUMENT_NAME="document.png"
RESOURCES_CREATED=()

# Step 1: Create S3 bucket
if [[ "$BUCKET_IS_SHARED" == false ]]; then
  echo "Creating S3 bucket: $BUCKET_NAME"
  CREATE_BUCKET_STATUS=0
  CREATE_BUCKET_OUTPUT=$(aws s3 mb "s3://$BUCKET_NAME" --region "$AWS_REGION" 2>&1) \
    || CREATE_BUCKET_STATUS=$?
  # Check first so that failure output is only ever printed redacted.
  check_error "$CREATE_BUCKET_STATUS" "$CREATE_BUCKET_OUTPUT" "aws s3 mb s3://$BUCKET_NAME"
  echo "$CREATE_BUCKET_OUTPUT"

  # Apply security settings to bucket. These are deliberately best effort:
  # a failure here must not abort the tutorial.
  aws s3api put-bucket-versioning --bucket "$BUCKET_NAME" \
    --versioning-configuration Status=Enabled 2>&1 || true
  aws s3api put-bucket-encryption --bucket "$BUCKET_NAME" \
    --server-side-encryption-configuration \
    '{"Rules": [{"ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}}]}' 2>&1 || true
  aws s3api put-bucket-acl --bucket "$BUCKET_NAME" --acl private 2>&1 || true

  RESOURCES_CREATED+=("S3 Bucket: $BUCKET_NAME")
fi

# Step 2: Check if sample document exists, if not create a simple one
if [[ ! -f "$DOCUMENT_NAME" ]]; then
  echo "Sample document not found. Generating a sample document..."

  # Prefer ImageMagick; if it is missing or fails, fall back to an embedded
  # minimal PNG instead of aborting.
  if command -v convert &> /dev/null \
    && convert -size 400x300 xc:white -pointsize 20 -fill black \
      -draw "text 50,50 'Sample Document'" "$DOCUMENT_NAME"; then
    chmod 600 "$DOCUMENT_NAME"
    echo "Generated sample document: $DOCUMENT_NAME"
  else
    # Fallback: create a minimal valid PNG using base64
    echo "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" \
      | base64 -d > "$DOCUMENT_NAME"
    chmod 600 "$DOCUMENT_NAME"
    echo "Created minimal sample document: $DOCUMENT_NAME"
  fi
fi

# Step 3: Upload document to S3
echo "Uploading document to S3..."
UPLOAD_STATUS=0
UPLOAD_OUTPUT=$(aws s3 cp "./$DOCUMENT_NAME" "s3://$BUCKET_NAME/" --sse AES256 2>&1) \
  || UPLOAD_STATUS=$?
# Record the upload before check_error so cleanup removes the object even if
# the "error" text heuristic trips on an otherwise successful upload.
if [[ "$UPLOAD_STATUS" -eq 0 ]]; then
  DOCUMENT_UPLOADED=true
fi
check_error "$UPLOAD_STATUS" "$UPLOAD_OUTPUT" "aws s3 cp ./$DOCUMENT_NAME s3://$BUCKET_NAME/"
echo "$UPLOAD_OUTPUT"

RESOURCES_CREATED+=("S3 Object: s3://$BUCKET_NAME/$DOCUMENT_NAME")

# Step 4: Analyze document with Amazon Textract
echo "Analyzing document with Amazon Textract..."
echo "This may take a few seconds..."

# Create a JSON file for the document parameter to avoid shell escaping issues
cat > document.json << EOF
{
  "S3Object": {
    "Bucket": "${BUCKET_NAME}",
    "Name": "${DOCUMENT_NAME}"
  }
}
EOF
chmod 600 document.json

# Create a JSON file for the feature types parameter
cat > features.json << 'EOF'
["TABLES","FORMS","SIGNATURES"]
EOF
chmod 600 features.json

# --output json is forced because the summary below greps for JSON text and
# the user's configured default output format may be text, yaml or table.
ANALYZE_STATUS=0
ANALYZE_OUTPUT=$(aws textract analyze-document \
  --document file://document.json \
  --feature-types file://features.json \
  --output json 2>&1) || ANALYZE_STATUS=$?

if [[ "$ANALYZE_STATUS" -ne 0 ]]; then
  echo "ERROR: Document analysis failed"
  redact_secrets <<<"$ANALYZE_OUTPUT"
  exit 1
fi

echo "Analysis complete."

# Save the analysis results to a file with restricted permissions
echo "$ANALYZE_OUTPUT" > textract-analysis-results.json
chmod 600 textract-analysis-results.json
echo "Analysis results saved to textract-analysis-results.json"
RESOURCES_CREATED+=("Local file: textract-analysis-results.json")

# Display a summary of the analysis
echo ""
echo "==================================================="
echo "Analysis Summary"
echo "==================================================="

# A missing "Pages" field must not abort the script, hence the guard.
PAGES=$(grep -m1 -o '"Pages": [0-9]*' <<<"$ANALYZE_OUTPUT" | awk '{print $2}') || PAGES=""
echo "Document pages: $PAGES"

BLOCKS_COUNT=$(count_occurrences '"BlockType":' <<<"$ANALYZE_OUTPUT")
echo "Total blocks detected: $BLOCKS_COUNT"

# Count different block types by matching the JSON text
PAGE_COUNT=$(count_occurrences '"BlockType": "PAGE"' <<<"$ANALYZE_OUTPUT")
LINE_COUNT=$(count_occurrences '"BlockType": "LINE"' <<<"$ANALYZE_OUTPUT")
WORD_COUNT=$(count_occurrences '"BlockType": "WORD"' <<<"$ANALYZE_OUTPUT")
TABLE_COUNT=$(count_occurrences '"BlockType": "TABLE"' <<<"$ANALYZE_OUTPUT")
CELL_COUNT=$(count_occurrences '"BlockType": "CELL"' <<<"$ANALYZE_OUTPUT")
KEY_VALUE_COUNT=$(count_occurrences '"BlockType": "KEY_VALUE_SET"' <<<"$ANALYZE_OUTPUT")
SIGNATURE_COUNT=$(count_occurrences '"BlockType": "SIGNATURE"' <<<"$ANALYZE_OUTPUT")

echo "Pages: $PAGE_COUNT"
echo "Lines of text: $LINE_COUNT"
echo "Words: $WORD_COUNT"
echo "Tables: $TABLE_COUNT"
echo "Table cells: $CELL_COUNT"
echo "Key-value pairs: $KEY_VALUE_COUNT"
echo "Signatures: $SIGNATURE_COUNT"
echo ""

# Cleanup confirmation
echo ""
echo "==================================================="
echo "RESOURCES CREATED"
echo "==================================================="
for resource in "${RESOURCES_CREATED[@]}"; do
  echo "- $resource"
done
echo ""
echo "==================================================="
echo "CLEANUP CONFIRMATION"
echo "==================================================="
echo "Cleaning up resources..."

# Delete document from S3
echo "Deleting document from S3..."
DELETE_DOC_STATUS=0
DELETE_DOC_OUTPUT=$(aws s3 rm "s3://$BUCKET_NAME/$DOCUMENT_NAME" 2>&1) || DELETE_DOC_STATUS=$?
check_error "$DELETE_DOC_STATUS" "$DELETE_DOC_OUTPUT" "aws s3 rm s3://$BUCKET_NAME/$DOCUMENT_NAME"
echo "$DELETE_DOC_OUTPUT"
DOCUMENT_UPLOADED=false

# Delete S3 bucket (only if not shared)
if [[ "$BUCKET_IS_SHARED" == false ]]; then
  echo "Deleting S3 bucket..."
  # Versioning was enabled above, so the `s3 rm` only added a delete marker;
  # all versions must be removed before the bucket can be deleted.
  purge_object_versions "$BUCKET_NAME"
  DELETE_BUCKET_STATUS=0
  DELETE_BUCKET_OUTPUT=$(aws s3 rb "s3://$BUCKET_NAME" --force 2>&1) || DELETE_BUCKET_STATUS=$?
  check_error "$DELETE_BUCKET_STATUS" "$DELETE_BUCKET_OUTPUT" "aws s3 rb s3://$BUCKET_NAME --force"
  echo "$DELETE_BUCKET_OUTPUT"
fi

# Delete local JSON files
rm -f document.json features.json

echo "Cleanup complete. The analysis results file (textract-analysis-results.json) has been kept."

echo ""
echo "==================================================="
echo "Tutorial complete!"
echo "==================================================="
echo "You have successfully analyzed a document using Amazon Textract."
echo "The analysis results are available in textract-analysis-results.json"
echo ""

# Everything succeeded: disarm the error cleanup.
trap - EXIT
Get started with document analysis
Amazon Transcribe