diff --git a/lambda-s3-download/README.md b/lambda-s3-download/README.md new file mode 100644 index 000000000..72289449c --- /dev/null +++ b/lambda-s3-download/README.md @@ -0,0 +1,96 @@ +# Lambda S3 Download + +This pattern deploys a Lambda function that downloads a file from a URL and uploads it to an S3 bucket using multipart upload. It streams the file in configurable chunks through `/tmp`, making it capable of handling files larger than Lambda's memory and storage limits. + +Important: this application uses various AWS services and there are costs associated with these services after the Free Tier usage - please see the [AWS Pricing page](https://aws.amazon.com/pricing/) for details. You are responsible for any AWS costs incurred. No warranty is implied in this example. + +## Requirements + +* [Create an AWS account](https://portal.aws.amazon.com/gp/aws/developer/registration/index.html) if you do not already have one and log in. The IAM user that you use must have sufficient permissions to make necessary AWS service calls and manage AWS resources. +* [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) installed and configured +* [Git Installed](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) +* [AWS Serverless Application Model](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) (AWS SAM) installed + +## Deployment Instructions + +1. Create a new directory, navigate to that directory in a terminal and clone the GitHub repository: + ``` + git clone https://github.com/aws-samples/serverless-patterns + ``` +1. Change directory to the pattern directory: + ``` + cd serverless-patterns/lambda-s3-download + ``` +1. Build the application: + ``` + sam build + ``` +1. Deploy the application: + ``` + sam deploy --guided + ``` +1. 
During the prompts: + * Enter a stack name + * Enter the desired AWS Region + * Enter the target S3 bucket name (the bucket must already exist) + * Allow SAM CLI to create IAM roles with the required permissions + + Once you have run `sam deploy --guided` mode once and saved arguments to a configuration file (samconfig.toml), you can use `sam deploy` in future to use these defaults. + +1. Note the outputs from the SAM deployment process. These contain the resource names and/or ARNs which are used for testing. + +## How it works + +The Lambda function: + +1. Receives a download URL and filename via the event payload +2. Initiates an S3 multipart upload with SHA256 checksums +3. Streams the file from the URL in chunks (default 128 MB), writing each chunk to `/tmp` and uploading it as a multipart part +4. Cleans up each chunk from `/tmp` after uploading to stay within the 10 GB ephemeral storage limit +5. Completes the multipart upload and returns the S3 object checksum +6. If any step fails, aborts the multipart upload to avoid orphaned parts + +The function is configured with a 15-minute timeout, 1 GB memory, and 10 GB ephemeral storage. + +## Testing + +Invoke the Lambda function with a test event: + +```bash +aws lambda invoke \ + --function-name FUNCTION_NAME \ + --cli-binary-format raw-in-base64-out \ + --payload '{ + "download_url": "https://example.com/file.zip", + "download_filename": "file.zip" + }' \ + response.json +``` + +Optional event parameters: + +| Parameter | Description | Default | +|---|---|---| +| `target_bucket` | S3 bucket name (overrides the deployed parameter) | Value from template parameter | +| `target_bucket_region` | S3 bucket region | Lambda's region | +| `chunk_size_mb` | Size of each download chunk in MB (clamped between 5 and 5120) | 128 | + +## Known Limitations + +- The Lambda function has a 15-minute maximum timeout. 
If the download and upload combined take longer than that, the function will be killed mid-stream and the multipart upload will be left incomplete. Consider setting an [S3 lifecycle rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpu-abort-incomplete-mpu-lifecycle-config.html) on the target bucket to auto-clean incomplete multipart uploads. +- The `download_filename` should be a flat filename (e.g. `file.zip`). If it contains slashes (e.g. `path/to/file.zip`), the temporary file path in `/tmp` will include subdirectories that may not exist, causing a write failure. + +## Cleanup + +1. Delete the stack + ```bash + aws cloudformation delete-stack --stack-name STACK_NAME + ``` +1. Confirm the stack has been deleted + ```bash + aws cloudformation list-stacks --query "StackSummaries[?contains(StackName,'STACK_NAME')].StackStatus" + ``` +---- +Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +SPDX-License-Identifier: MIT-0 diff --git a/lambda-s3-download/example-pattern.json b/lambda-s3-download/example-pattern.json new file mode 100644 index 000000000..fc07c7fe1 --- /dev/null +++ b/lambda-s3-download/example-pattern.json @@ -0,0 +1,60 @@ +{ + "title": "Lambda S3 Download", + "description": "A Lambda function that downloads a file from a URL and uploads it to S3 using multipart upload with SHA256 checksums.", + "language": "Python", + "level": "300", + "framework": "SAM", + "introBox": { + "headline": "How it works", + "text": [ + "This pattern deploys a Lambda function that streams a file from a URL and uploads it to an S3 bucket using multipart upload.", + "The file is downloaded in configurable chunks (default 128 MB, clamped between 5 MB and 5 GB) and written to /tmp before being uploaded as individual parts. Each chunk is cleaned up from /tmp after upload, allowing the function to handle files larger than Lambda's memory or ephemeral storage limits.", + "SHA256 checksums are calculated for each part and verified on completion. 
import json
import os
from pathlib import Path

import boto3
import requests

# S3 multipart upload limits: every part except the last must be 5 MB-5 GB.
MIN_CHUNK_MB = 5
MAX_CHUNK_MB = 5120
DEFAULT_CHUNK_MB = 128


def lambda_handler(event, context):
    """Stream a file from a URL into S3 via multipart upload.

    The file is downloaded in chunks through /tmp, so files larger than
    Lambda's memory can be handled; each chunk is removed after upload to
    stay within the ephemeral storage limit.

    Event payload:
        download_url (str, required): URL of the file to download.
        download_filename (str, required): S3 object key. Should be a flat
            filename — slashes would produce a /tmp path whose parent
            directories do not exist (documented known limitation).
        target_bucket (str, optional): overrides the TARGET_BUCKET env var.
        target_bucket_region (str, optional): defaults to the Lambda region.
        chunk_size_mb (int, optional): per-part chunk size in MB, clamped
            to [5, 5120]; defaults to 128.

    Returns:
        dict with statusCode 200 and the object's SHA256 checksum on
        success, or statusCode 500 (after aborting the multipart upload,
        so no orphaned parts accumulate) on failure.
    """
    target_bucket = event.get("target_bucket", os.environ["TARGET_BUCKET"])
    target_bucket_region = event.get("target_bucket_region", os.environ.get("AWS_REGION"))

    download_url = event["download_url"]
    download_filename = event["download_filename"]

    # Clamp the chunk size into S3's valid part-size range so every
    # downloaded chunk is an acceptable multipart part and does not
    # exhaust Lambda memory / ephemeral storage.
    chunk_size_mb = min(max(int(event.get("chunk_size_mb", DEFAULT_CHUNK_MB)), MIN_CHUNK_MB), MAX_CHUNK_MB)

    # Open a multipart S3 upload request with per-part SHA256 checksums.
    s3 = boto3.client("s3", region_name=target_bucket_region)
    upload_request = s3.create_multipart_upload(
        Bucket=target_bucket, Key=download_filename, ChecksumAlgorithm="SHA256"
    )
    upload_id = upload_request["UploadId"]
    parts = []

    try:
        with requests.get(download_url, stream=True) as download_request:
            # Fail fast on HTTP errors (e.g. 404): without this, the error
            # body would be silently uploaded to S3 as the file content.
            download_request.raise_for_status()

            for part_number, chunk in enumerate(
                download_request.iter_content(chunk_size=chunk_size_mb * 1024 * 1024),
                start=1,
            ):
                download_target = Path("/tmp", f"{download_filename}_{part_number}")
                try:
                    download_target.write_bytes(chunk)
                    with download_target.open("rb") as download_file:
                        part_upload = s3.upload_part(
                            Body=download_file,
                            Bucket=target_bucket,
                            Key=download_filename,
                            PartNumber=part_number,
                            UploadId=upload_id,
                            ChecksumAlgorithm="SHA256",
                        )
                finally:
                    # Remove the chunk even if upload_part raised, so a
                    # failure cannot leave files filling up /tmp.
                    download_target.unlink(missing_ok=True)

                parts.append({
                    "ETag": part_upload["ETag"],
                    "ChecksumSHA256": part_upload["ChecksumSHA256"],
                    "PartNumber": part_number,
                })

        s3.complete_multipart_upload(
            Bucket=target_bucket,
            Key=download_filename,
            MultipartUpload={"Parts": parts},
            UploadId=upload_id,
        )
        # Fetch the composite checksum S3 computed over all parts.
        objectSummary = s3.get_object_attributes(
            Bucket=target_bucket, Key=download_filename, ObjectAttributes=["Checksum"]
        )

        return {
            "statusCode": 200,
            "body": json.dumps({
                "message": f"{download_filename} uploaded successfully",
                "bucket": target_bucket,
                "key": download_filename,
                "checksum_sha256": objectSummary["Checksum"]["ChecksumSHA256"],
                "parts": len(parts),
            })
        }

    except Exception as e:
        # Abort so S3 does not retain (and bill for) orphaned parts.
        s3.abort_multipart_upload(Bucket=target_bucket, Key=download_filename, UploadId=upload_id)
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"Download/Upload failed: {str(e)}"})
        }
b/lambda-s3-download/src/requirements.txt new file mode 100644 index 000000000..245db934d --- /dev/null +++ b/lambda-s3-download/src/requirements.txt @@ -0,0 +1,3 @@ +boto3 +requests + diff --git a/lambda-s3-download/template.yaml b/lambda-s3-download/template.yaml new file mode 100644 index 000000000..8c40ab8b9 --- /dev/null +++ b/lambda-s3-download/template.yaml @@ -0,0 +1,31 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: Lambda function that downloads a file from a URL and uploads it to S3 using multipart upload + +Parameters: + TargetBucketName: + Type: String + Description: Name of the S3 bucket to upload files to + +Resources: + DownloadFunction: + Type: AWS::Serverless::Function + Properties: + Handler: app.lambda_handler + Runtime: python3.12 + CodeUri: src/ + Timeout: 900 + MemorySize: 1024 + EphemeralStorage: + Size: 10240 + Environment: + Variables: + TARGET_BUCKET: !Ref TargetBucketName + Policies: + - S3CrudPolicy: + BucketName: !Ref TargetBucketName + +Outputs: + DownloadFunctionArn: + Description: Lambda function ARN + Value: !GetAtt DownloadFunction.Arn