feat(s3): add script to download S3 bucket contents
commit a0cafea (1 parent: 6830204)

Showing 2 changed files with 153 additions and 0 deletions.
@@ -0,0 +1,152 @@
""" | ||
Description: This script searches for a specified S3 bucket and downloads all objects locally, | ||
maintaining the original folder structure. It provides detailed logging and supports a dry-run mode. | ||
Key features: | ||
- Supports dry run mode for safe execution | ||
- Provides detailed logging of all operations | ||
- Maintains the original folder structure when downloading | ||
- Implements error handling for robustness | ||
- Shows progress of downloads | ||
- Allows specifying a custom target path for each bucket's contents | ||
Usage: | ||
python s3_search_bucket_and_download.py <bucket-name> [--dry-run] [--output-dir <path>] [--target-path <path>] | ||
Author: Danny Steenman | ||
License: MIT | ||
""" | ||

import argparse
import logging
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

import boto3
from botocore.config import Config
from botocore.exceptions import ClientError


def setup_logging():
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    return logging.getLogger(__name__)


# Module-level logger so every function below can log, even if this file is imported
logger = setup_logging()


def get_s3_client():
    try:
        config = Config(
            max_pool_connections=50,  # Increase concurrent connections for the thread pool
            retries={"max_attempts": 10, "mode": "adaptive"},  # Retry throttled and transient errors
        )
        return boto3.client("s3", config=config)
    except ClientError as e:
        logger.error(f"Failed to create S3 client: {e}")
        sys.exit(1)


def get_bucket_size(s3_client, bucket_name):
    try:
        # list_objects_v2 returns at most 1,000 keys per call, so paginate
        # to sum the size of every object in the bucket
        paginator = s3_client.get_paginator("list_objects_v2")
        total_size = 0
        for page in paginator.paginate(Bucket=bucket_name):
            total_size += sum(obj["Size"] for obj in page.get("Contents", []))
        return total_size
    except ClientError as e:
        logger.error(f"Failed to get bucket size for {bucket_name}: {e}")
        return 0


def download_object(s3_client, bucket_name, obj_key, output_dir, dry_run=False):
    # Keys ending in "/" are zero-byte "folder" placeholders; skip them so
    # download_file doesn't try to write a file at a directory path
    if obj_key.endswith("/"):
        return

    local_path = os.path.join(output_dir, obj_key)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    if dry_run:
        logger.info(f"Would download: s3://{bucket_name}/{obj_key} to {local_path}")
    else:
        try:
            s3_client.download_file(bucket_name, obj_key, local_path)
            logger.info(f"Downloaded: s3://{bucket_name}/{obj_key} to {local_path}")
        except ClientError as e:
            logger.error(f"Failed to download s3://{bucket_name}/{obj_key}: {e}")


def download_bucket_contents(s3_client, bucket_name, output_dir, dry_run=False):
    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        total_objects = 0
        downloaded_objects = 0

        # First pass over the listing: count total objects so progress can be reported
        for page in paginator.paginate(Bucket=bucket_name):
            total_objects += len(page.get("Contents", []))

        logger.info(f"Total objects to download: {total_objects}")

        # Second pass: download in parallel, staying well under the client's 50-connection pool
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for page in paginator.paginate(Bucket=bucket_name):
                for obj in page.get("Contents", []):
                    future = executor.submit(download_object, s3_client, bucket_name, obj["Key"], output_dir, dry_run)
                    futures.append(future)

            for future in as_completed(futures):
                downloaded_objects += 1
                if downloaded_objects % 100 == 0 or downloaded_objects == total_objects:
                    logger.info(f"Progress: {downloaded_objects}/{total_objects} objects downloaded")

        logger.info(f"{'Would download' if dry_run else 'Downloaded'} {total_objects} objects from {bucket_name}")

    except ClientError as e:
        logger.error(f"Failed to download contents of bucket {bucket_name}: {e}")


def main(target_bucket_name, output_dir, target_path, dry_run=False):
    s3_client = get_s3_client()

    try:
        response = s3_client.list_buckets()
    except ClientError as e:
        logger.error(f"Failed to list buckets: {e}")
        sys.exit(1)

    # Substring match: every bucket whose name contains the search term is processed
    found_buckets = [bucket["Name"] for bucket in response["Buckets"] if target_bucket_name in bucket["Name"]]

    if not found_buckets:
        logger.info(f"No buckets found containing the name: {target_bucket_name}")
        return

    for bucket_name in found_buckets:
        logger.info(f"Found bucket: {bucket_name}")
        size_bytes = get_bucket_size(s3_client, bucket_name)
        size_gb = size_bytes / (1024**3)  # Convert bytes to gigabytes
        logger.info(f"Bucket size: {size_gb:.2f} GB")

        # Note: with --target-path, all matching buckets share the same output directory
        if target_path:
            bucket_output_dir = os.path.join(output_dir, target_path)
        else:
            bucket_output_dir = os.path.join(output_dir, bucket_name)

        if dry_run:
            logger.info(f"Dry run: would download all contents from {bucket_name} to {bucket_output_dir}")
        # Pass dry_run through so a dry run still logs the per-object "Would download" lines
        download_bucket_contents(s3_client, bucket_name, bucket_output_dir, dry_run)

    logger.info("Operation completed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download S3 bucket contents while maintaining folder structure")
    parser.add_argument("bucket_name", help="Name of the bucket to search for and download")
    parser.add_argument(
        "--dry-run", action="store_true", help="Perform a dry run without actually downloading anything"
    )
    parser.add_argument(
        "--output-dir", default=".", help="Base output directory for downloaded files (default: current directory)"
    )
    parser.add_argument(
        "--target-path", help="Specific target path within the output directory to store bucket contents"
    )
    args = parser.parse_args()

    main(args.bucket_name, args.output_dir, args.target_path, args.dry_run)
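

# Credentials note (an assumption about the runtime environment, not part of the script):
# boto3 resolves credentials through its standard chain (environment variables, the shared
# config/credentials files, or an instance/task role). The caller needs at least
# s3:ListAllMyBuckets, s3:ListBucket, and s3:GetObject on the buckets involved.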