feat(s3): add script to download S3 bucket contents
dannysteenman committed Jan 7, 2025
1 parent 6830204 commit a0cafea
Showing 2 changed files with 153 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -82,6 +82,7 @@ This collection includes Python and Bash scripts for managing various AWS services
| S3 | [s3_delete_empty_buckets.py](s3/s3_delete_empty_buckets.py) | Deletes empty S3 buckets |
| S3 | [s3_list_old_files.py](s3/s3_list_old_files.py) | Lists old files in S3 |
| S3 | [s3_search_bucket_and_delete.py](s3/s3_search_bucket_and_delete.py) | Deletes S3 bucket and its contents |
| S3 | [s3_search_bucket_and_download.py](s3/s3_search_bucket_and_download.py) | Finds S3 buckets by name and downloads all their contents |
| S3 | [s3_search_file.py](s3/s3_search_file.py) | Searches for files in S3 bucket |
| S3 | [s3_search_key.py](s3/s3_search_key.py) | Searches for a key in S3 bucket |
| S3 | [s3_search_multiple_keys.py](s3/s3_search_multiple_keys.py) | Searches for multiple keys in S3 bucket |
152 changes: 152 additions & 0 deletions s3/s3_search_bucket_and_download.py
@@ -0,0 +1,152 @@
"""
Description: This script searches for S3 buckets whose names contain a given string and downloads
all of their objects locally, maintaining the original folder structure. It provides detailed
logging and supports a dry-run mode.
Key features:
- Supports dry run mode for safe execution
- Provides detailed logging of all operations
- Maintains the original folder structure when downloading
- Implements error handling for robustness
- Shows progress of downloads
- Allows specifying a custom target path for each bucket's contents
Usage:
python s3_search_bucket_and_download.py <bucket-name> [--dry-run] [--output-dir <path>] [--target-path <path>]
Author: Danny Steenman
License: MIT
"""

import argparse
import logging
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

import boto3
from botocore.config import Config
from botocore.exceptions import BotoCoreError, ClientError


def setup_logging():
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    return logging.getLogger(__name__)


# Module-level logger so every function can reference it, even when the
# module is imported rather than run as a script.
logger = setup_logging()


def get_s3_client():
try:
config = Config(
max_pool_connections=50, # Increase concurrent connections
retries={"max_attempts": 10, "mode": "adaptive"}, # Add retry logic
)
return boto3.client("s3", config=config)
    except (BotoCoreError, ClientError) as e:
logger.error(f"Failed to create S3 client: {e}")
sys.exit(1)


def get_bucket_size(s3_client, bucket_name):
    try:
        # Paginate: list_objects_v2 returns at most 1,000 keys per call, so a
        # single call would undercount any bucket with more objects than that.
        paginator = s3_client.get_paginator("list_objects_v2")
        total_size = 0
        for page in paginator.paginate(Bucket=bucket_name):
            total_size += sum(obj["Size"] for obj in page.get("Contents", []))
        return total_size
    except ClientError as e:
        logger.error(f"Failed to get bucket size for {bucket_name}: {e}")
        return 0


def download_object(s3_client, bucket_name, obj_key, output_dir, dry_run=False):
    local_path = os.path.join(output_dir, obj_key)

    # Keys ending in "/" are zero-byte "folder" placeholders (e.g. created via
    # the console); download_file would fail on them, so skip them outright.
    if obj_key.endswith("/"):
        return

    if dry_run:
        logger.info(f"Would download: s3://{bucket_name}/{obj_key} to {local_path}")
        return

    try:
        # Create parent directories only when actually downloading, so a dry
        # run leaves the local filesystem untouched.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        s3_client.download_file(bucket_name, obj_key, local_path)
        logger.info(f"Downloaded: s3://{bucket_name}/{obj_key} to {local_path}")
    except ClientError as e:
        logger.error(f"Failed to download s3://{bucket_name}/{obj_key}: {e}")


def download_bucket_contents(s3_client, bucket_name, output_dir, dry_run=False):
try:
paginator = s3_client.get_paginator("list_objects_v2")
total_objects = 0
downloaded_objects = 0

        # The bucket is paginated twice: first to count objects for accurate
        # progress reporting, then again to stream download tasks to the pool.
for page in paginator.paginate(Bucket=bucket_name):
total_objects += len(page.get("Contents", []))

logger.info(f"Total objects to download: {total_objects}")

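        # A single boto3 client is shared across the worker threads; boto3
        # clients are generally thread-safe, and max_pool_connections was
        # raised in get_s3_client so the connection pool can keep up with
        # the workers. max_workers is a throughput knob worth tuning.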
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for page in paginator.paginate(Bucket=bucket_name):
for obj in page.get("Contents", []):
future = executor.submit(download_object, s3_client, bucket_name, obj["Key"], output_dir, dry_run)
futures.append(future)

for future in as_completed(futures):
downloaded_objects += 1
if downloaded_objects % 100 == 0 or downloaded_objects == total_objects:
                    logger.info(f"Progress: {downloaded_objects}/{total_objects} objects processed")

logger.info(f"{'Would download' if dry_run else 'Downloaded'} {total_objects} objects from {bucket_name}")

except ClientError as e:
logger.error(f"Failed to download contents of bucket {bucket_name}: {e}")


def main(target_bucket_name, output_dir, target_path, dry_run=False):
s3_client = get_s3_client()

try:
response = s3_client.list_buckets()
except ClientError as e:
logger.error(f"Failed to list buckets: {e}")
sys.exit(1)

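    # Substring match: any bucket whose name contains the search term is
    # selected, so several buckets may be processed in one run.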
found_buckets = [bucket["Name"] for bucket in response["Buckets"] if target_bucket_name in bucket["Name"]]

if not found_buckets:
logger.info(f"No buckets found containing the name: {target_bucket_name}")
return

for bucket_name in found_buckets:
logger.info(f"Found bucket: {bucket_name}")
size_bytes = get_bucket_size(s3_client, bucket_name)
size_gb = size_bytes / (1024**3) # Convert bytes to gigabytes
logger.info(f"Bucket size: {size_gb:.2f} GB")

if target_path:
bucket_output_dir = os.path.join(output_dir, target_path)
else:
bucket_output_dir = os.path.join(output_dir, bucket_name)

        if dry_run:
            logger.info(f"Dry run: listing objects in {bucket_name} that would be downloaded to {bucket_output_dir}")
        # Dry-run mode flows through the same code path, so per-object logging
        # and object counts are exercised without writing anything locally.
        download_bucket_contents(s3_client, bucket_name, bucket_output_dir, dry_run)

logger.info("Operation completed.")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download S3 bucket contents while maintaining folder structure")
parser.add_argument("bucket_name", help="Name of the bucket to search for and download")
parser.add_argument(
"--dry-run", action="store_true", help="Perform a dry run without actually downloading anything"
)
parser.add_argument(
"--output-dir", default=".", help="Base output directory for downloaded files (default: current directory)"
)
parser.add_argument(
"--target-path", help="Specific target path within the output directory to store bucket contents"
)
args = parser.parse_args()

main(args.bucket_name, args.output_dir, args.target_path, args.dry_run)
