From 50aaa9ce26561ae3d83240c5f0540067673c1662 Mon Sep 17 00:00:00 2001
From: George Khananaev <106206490+georgekhananaev@users.noreply.github.com>
Date: Tue, 3 Jun 2025 00:12:11 +0700
Subject: [PATCH] Added pytest + some tests. Added AWS S3 Support (optional,
 for cloud image storage)

---
 README.md                        | 146 ++++++++++++++++++++++
 modules/image_handler.py         |  47 ++++++-
 modules/s3_handler.py            | 177 +++++++++++++++++++++++++++
 requirements.txt                 |   4 +-
 tests/README.md                  |  54 +++++++++
 tests/__init__.py                |   1 +
 tests/conftest.py                |  39 ++++++
 tests/test_mongodb_connection.py |  90 ++++++++++++++
 tests/test_s3_connection.py      | 202 +++++++++++++++++++++++++++++++
 9 files changed, 755 insertions(+), 5 deletions(-)
 create mode 100644 modules/s3_handler.py
 create mode 100644 tests/README.md
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_mongodb_connection.py
 create mode 100644 tests/test_s3_connection.py

diff --git a/README.md b/README.md
index 9a06734..42ad1d8 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@
   - Snags EVERY damn photo from reviews and profiles
   - Hoards local paths or swaps URLs to your domain like a boss
   - Multi-threaded downloading that would make NASA jealous
+  - **S3 Cloud Storage**: Auto-upload images to AWS S3 with custom folder structure
 - **Time-Bending Magic**: Transforms Google's vague "2 weeks ago" garbage into precise ISO timestamps
 - **Sort Any Damn Way**: Newest, highest, lowest, relevance - we've got you covered
 - **Metadata on Steroids**: Inject custom parameters into every review record
@@ -32,6 +33,7 @@
 Python 3.10+ (don't even try with 3.9, seriously)
 Chrome browser (the fresher the better)
 MongoDB (optional, but c'mon, live a little)
+AWS S3 Account (optional, for cloud image storage)
 Coffee (mandatory for watching thousands of reviews roll in)
 ```
 
@@ -94,6 +96,19 @@ image_dir: "review_images"    # Directory to store downloaded images
 download_threads: 4           # Number of threads for downloading images
 store_local_paths: true       # Whether to store local image paths in documents
 
+# S3 settings (optional)
+use_s3: false                 # Whether to upload images to S3
+s3:
+  aws_access_key_id: ""       # AWS Access Key ID
+  aws_secret_access_key: ""   # AWS Secret Access Key
+  region_name: "us-east-1"    # AWS region
+  bucket_name: ""             # S3 bucket name
+  prefix: "reviews/"          # Base prefix for uploaded files
+  profiles_folder: "profiles/"    # Folder name for profile images within prefix
+  reviews_folder: "reviews/"      # Folder name for review images within prefix
+  delete_local_after_upload: false  # Delete local files after successful S3 upload
+  s3_base_url: ""             # Custom S3 base URL for accessing files (if empty, uses AWS default)
+
 # URL replacement settings
 replace_urls: true           # Whether to replace original URLs with custom ones
 custom_url_base: "https://yourdomain.com/images"  # Base URL for replacement
@@ -150,6 +165,12 @@ python start.py --url "https://maps.app.goo.gl/YOUR_URL" --download-images true
 # Every. Single. Picture. With your domain stamped all over 'em.
 ```
 
+6. S3 Cloud Storage Beast Mode:
+```bash
+python start.py --url "https://maps.app.goo.gl/YOUR_URL" --download-images true --use-s3 true
+# Downloads locally AND uploads to S3. Best of both worlds, baby!
+```
+
 ### Command Line Arguments
 
 ```
@@ -243,6 +264,7 @@ When running with default settings, the scraper creates:
 3. `review_images/` - Directory containing downloaded images:
    - `review_images/profiles/` - Profile pictures
    - `review_images/reviews/` - Review images
+4. **S3 Bucket** (when enabled) - Images uploaded to your configured S3 bucket with custom folder structure
 
 ## 🔄 Integration Examples
 
@@ -304,6 +326,12 @@ print(f"Reviews with images: {len(reviews_with_images)}")
    - Run with `sudo` if you're getting permission errors (not ideal but gets the job done)
    - Some images vanish from Google's CDN faster than your ex. Nothing we can do about that.
 
+5. **S3 Upload Chaos**
+   - Double-check your AWS credentials and bucket permissions
+   - Make sure your bucket exists and is in the specified region
+   - Uploads set the `public-read` object ACL, so the bucket must have ACLs enabled (Object Ownership setting) and must not block public ACLs
+   - AWS charges for every API call, so don't go crazy with test uploads
+
 ### Operation Logs (AKA "What The Hell Is It Doing?")
 
 We don't just log, we OBSESSIVELY document the scraper's every breath:
@@ -353,11 +381,129 @@ A: Damn straight. We've pulled 50k+ reviews without breaking a sweat. The MongoD
 **Q: I found a bug/have a killer feature idea!**  
 A: Jump on GitHub and file an issue or PR. But do your homework first – if you're reporting something already in the README, we'll roast you publicly.
 
+## ☁️ AWS S3 Setup Guide
+
+Want to store your images in the cloud like a boss? Here's how to set up S3 integration:
+
+### 1. Create an S3 Bucket
+
+1. Log into [AWS Console](https://console.aws.amazon.com/s3/)
+2. Click "Create bucket"
+3. Choose a unique bucket name (e.g., `your-company-reviews`)
+4. Select your preferred region
+5. **Important**: Under "Block public access settings" - UNCHECK "Block all public access" if you want images to be publicly accessible
+6. Create the bucket
+
+### 2. Set Bucket Permissions
+
+For public image access, add this bucket policy (replace `your-bucket-name`):
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Sid": "PublicReadGetObject",
+      "Effect": "Allow",
+      "Principal": "*",
+      "Action": "s3:GetObject",
+      "Resource": "arn:aws:s3:::your-bucket-name/*"
+    }
+  ]
+}
+```
+
+### 3. Create IAM User for API Access
+
+1. Go to [IAM Console](https://console.aws.amazon.com/iam/)
+2. Create a new user with programmatic access
+3. Attach this policy (replace `your-bucket-name`):
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "s3:PutObject",
+        "s3:PutObjectAcl",
+        "s3:GetObject",
+        "s3:DeleteObject"
+      ],
+      "Resource": "arn:aws:s3:::your-bucket-name/*"
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "s3:ListBucket"
+      ],
+      "Resource": "arn:aws:s3:::your-bucket-name"
+    }
+  ]
+}
+```
+
+4. Save the Access Key ID and Secret Access Key
+
+### 4. Configure Your Scraper
+
+Update your `config.yaml`:
+
+```yaml
+use_s3: true
+s3:
+  aws_access_key_id: "YOUR_ACCESS_KEY_ID"
+  aws_secret_access_key: "YOUR_SECRET_ACCESS_KEY"
+  region_name: "us-east-1"  # Match your bucket region
+  bucket_name: "your-bucket-name"
+  prefix: "google_reviews/"
+  profiles_folder: "profiles/"
+  reviews_folder: "reviews/"
+  delete_local_after_upload: false  # Keep local copies
+  s3_base_url: ""  # Leave empty for default AWS URLs
+```
+
+### 5. Test Your Setup
+
+Run the included tests to verify everything works:
+
+```bash
+# Install dependencies
+pip install -r requirements.txt
+
+# Test S3 connection
+pytest tests/test_s3_connection.py -v
+```
+
+### 6. Folder Structure
+
+Your S3 bucket will organize images like this:
+```
+your-bucket/
+├── google_reviews/
+│   ├── profiles/
+│   │   ├── user123.jpg
+│   │   └── user456.jpg
+│   └── reviews/
+│       ├── review789.jpg
+│       └── review101.jpg
+```
+
+### Pro Tips:
+
+- **Cost Optimization**: Enable S3 Intelligent-Tiering for automatic cost savings
+- **CDN**: Add CloudFront distribution for faster global image delivery
+- **Security**: Use IAM roles instead of hardcoded keys in production
+- **Monitoring**: Enable S3 access logging to track usage
+
 ## 🌐 Links
 
 - [Python Documentation](https://docs.python.org/3/)
 - [Selenium Documentation](https://selenium-python.readthedocs.io/)
 - [MongoDB Documentation](https://docs.mongodb.com/)
+- [AWS S3 Documentation](https://docs.aws.amazon.com/s3/)
+- [Boto3 Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html)
 
 ---
 
diff --git a/modules/image_handler.py b/modules/image_handler.py
index 4896e06..22a8d6b 100644
--- a/modules/image_handler.py
+++ b/modules/image_handler.py
@@ -11,6 +11,8 @@ from urllib.parse import urlparse
 
 import requests
 
+from modules.s3_handler import S3Handler
+
 # Logger
 log = logging.getLogger("scraper")
 
@@ -34,6 +36,10 @@ class ImageHandler:
         # Subdirectories for different image types
         self.profile_dir = self.image_dir / "profiles"
         self.review_dir = self.image_dir / "reviews"
+        
+        # Initialize S3 handler
+        self.s3_handler = S3Handler(config)
+        self.use_s3 = config.get("use_s3", False)
 
     def ensure_directories(self):
         """Ensure all image directories exist"""
@@ -206,6 +212,31 @@ class ImageHandler:
                 if custom_url:
                     url_to_custom_url[url] = custom_url
 
+        # Upload to S3 if enabled
+        s3_url_mapping = {}
+        if self.use_s3 and self.s3_handler.enabled and url_to_filename:
+            log.info("Uploading images to S3...")
+            
+            # Prepare files for S3 upload
+            files_to_upload = {}
+            for url, filename in url_to_filename.items():
+                # Determine if it's a profile image
+                is_profile = any(url == profile_url for profile_url in profile_urls)
+                
+                # Get local file path
+                local_path = (self.profile_dir if is_profile else self.review_dir) / filename
+                
+                if local_path.exists():
+                    files_to_upload[filename] = (local_path, is_profile)
+            
+            # Upload to S3
+            s3_results = self.s3_handler.upload_images_batch(files_to_upload)
+            
+            # Create mapping from original URL to S3 URL
+            for url, filename in url_to_filename.items():
+                if filename in s3_results:
+                    s3_url_mapping[url] = s3_results[filename]
+
         # Update review documents
         for review_id, review in reviews.items():
             # Find the original URLs to use for lookup - important for both user_images and profile_picture
@@ -241,7 +272,10 @@ class ImageHandler:
                     # Create custom URLs for each image
                     custom_images = []
                     for url in user_images_original:
-                        if url in url_to_custom_url:
+                        # Prefer S3 URL if available
+                        if url in s3_url_mapping:
+                            custom_images.append(s3_url_mapping[url])
+                        elif url in url_to_custom_url:
                             custom_images.append(url_to_custom_url[url])
                         elif not self.is_not_custom_url(url):  # Already a custom URL
                             custom_images.append(url)
@@ -262,8 +296,10 @@ class ImageHandler:
                     if self.preserve_original_urls and "original_profile_picture" not in review:
                         review["original_profile_picture"] = review["profile_picture"]
 
-                    # Replace with custom URL if we have one for this profile image
-                    if profile_picture_original in url_to_custom_url:
+                    # Replace with S3 URL if available, otherwise use custom URL
+                    if profile_picture_original in s3_url_mapping:
+                        review["profile_picture"] = s3_url_mapping[profile_picture_original]
+                    elif profile_picture_original in url_to_custom_url:
                         review["profile_picture"] = url_to_custom_url[profile_picture_original]
                     elif not self.is_not_custom_url(review["profile_picture"]):
                         # If current URL is already a custom URL, keep it
@@ -277,7 +313,10 @@ class ImageHandler:
                                 review["profile_picture"] = custom_url
 
         log.info(f"Downloaded {len(url_to_filename)} images")
+        if self.use_s3 and s3_url_mapping:
+            log.info(f"Uploaded {len(s3_url_mapping)} images to S3")
         if self.replace_urls:
-            log.info(f"Replaced URLs for {len(url_to_custom_url)} images")
+            total_replaced = len(s3_url_mapping) + len(url_to_custom_url)
+            log.info(f"Replaced URLs for {total_replaced} images")
 
         return reviews
diff --git a/modules/s3_handler.py b/modules/s3_handler.py
new file mode 100644
index 0000000..32361fc
--- /dev/null
+++ b/modules/s3_handler.py
@@ -0,0 +1,177 @@
+"""
+S3 upload handler for Google Maps Reviews Scraper.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+import boto3
+from botocore.exceptions import ClientError
+
+log = logging.getLogger("scraper")
+
+
+class S3Handler:
+    """Upload downloaded review/profile images to AWS S3 and build their URLs."""
+
+    def __init__(self, config: Dict[str, Any]):
+        """Read the ``s3`` config section and, if ``use_s3`` is set, connect to the bucket."""
+        self.enabled = config.get("use_s3", False)
+        # A present-but-empty ``s3:`` section parses to None, so fall back to {}.
+        s3_config = config.get("s3") or {}
+        # Attributes are always set so a disabled handler is still safe to inspect.
+        self.aws_access_key_id = s3_config.get("aws_access_key_id", "")
+        self.aws_secret_access_key = s3_config.get("aws_secret_access_key", "")
+        self.region_name = s3_config.get("region_name", "us-east-1")
+        self.bucket_name = s3_config.get("bucket_name", "")
+        # An empty prefix stays empty; appending "/" would give keys a leading slash.
+        prefix = s3_config.get("prefix", "reviews/").rstrip("/")
+        self.prefix = f"{prefix}/" if prefix else ""
+        self.profiles_folder = s3_config.get("profiles_folder", "profiles/").strip("/")
+        self.reviews_folder = s3_config.get("reviews_folder", "reviews/").strip("/")
+        self.delete_local_after_upload = s3_config.get("delete_local_after_upload", False)
+        self.s3_base_url = s3_config.get("s3_base_url", "")
+        self.s3_client = None
+
+        if not self.enabled:
+            return
+
+        # Validate required settings
+        if not self.bucket_name:
+            log.error("S3 bucket_name is required when use_s3 is enabled")
+            self.enabled = False
+            return
+
+        # Initialize S3 client
+        try:
+            session_kwargs = {"region_name": self.region_name}
+            # Use credentials if provided, otherwise rely on environment/IAM
+            if self.aws_access_key_id and self.aws_secret_access_key:
+                session_kwargs.update({
+                    "aws_access_key_id": self.aws_access_key_id,
+                    "aws_secret_access_key": self.aws_secret_access_key
+                })
+
+            self.s3_client = boto3.client("s3", **session_kwargs)
+
+            # Test connection by checking if bucket exists
+            self.s3_client.head_bucket(Bucket=self.bucket_name)
+            log.info(f"S3 handler initialized successfully for bucket: {self.bucket_name}")
+
+        except ClientError as e:
+            error_code = e.response.get('Error', {}).get('Code', '')
+            if error_code == '404':
+                log.error(f"S3 bucket '{self.bucket_name}' not found")
+            elif error_code == '403':
+                log.error(f"Access denied to S3 bucket '{self.bucket_name}'")
+            else:
+                log.error(f"Error connecting to S3: {e}")
+            self.enabled = False
+
+        except Exception as e:
+            log.error(f"Error initializing S3 client: {e}")
+            self.enabled = False
+
+    @staticmethod
+    def _guess_content_type(local_path: Path) -> str:
+        """Return the MIME type implied by the file extension (default ``image/jpeg``)."""
+        import mimetypes  # stdlib; imported here because only this helper needs it
+        return mimetypes.guess_type(local_path.name)[0] or "image/jpeg"
+
+    def get_s3_url(self, key: str) -> str:
+        """Return the public URL for ``key``: custom ``s3_base_url`` if set, else the AWS one."""
+        if self.s3_base_url:
+            return f"{self.s3_base_url.rstrip('/')}/{key}"
+        return f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/{key}"
+
+    def upload_file(self, local_path: Path, s3_key: str) -> Optional[str]:
+        """
+        Upload a file to S3.
+
+        Args:
+            local_path: Path to local file
+            s3_key: S3 key (path) for the uploaded file
+
+        Returns:
+            S3 URL if successful, None if failed
+        """
+        if not self.enabled:
+            return None
+
+        if not local_path.exists():
+            log.warning(f"Local file does not exist: {local_path}")
+            return None
+
+        try:
+            self.s3_client.upload_file(
+                str(local_path),
+                self.bucket_name,
+                s3_key,
+                ExtraArgs={
+                    # Serve PNG/WebP/GIF with their real type instead of always JPEG.
+                    'ContentType': self._guess_content_type(local_path),
+                    'ACL': 'public-read'  # Make images publicly readable
+                }
+            )
+            s3_url = self.get_s3_url(s3_key)
+
+            # Delete local file if requested
+            if self.delete_local_after_upload:
+                try:
+                    local_path.unlink()
+                    log.debug(f"Deleted local file: {local_path}")
+                except Exception as e:
+                    log.warning(f"Failed to delete local file {local_path}: {e}")
+
+            log.debug(f"Uploaded {local_path} to s3://{self.bucket_name}/{s3_key}")
+            return s3_url
+
+        except ClientError as e:
+            log.error(f"Failed to upload {local_path} to S3: {e}")
+            return None
+        except Exception as e:
+            log.error(f"Unexpected error uploading {local_path} to S3: {e}")
+            return None
+
+    def upload_image(self, local_path: Path, filename: str, is_profile: bool = False) -> Optional[str]:
+        """
+        Upload an image to S3 with appropriate folder structure.
+
+        Args:
+            local_path: Path to local image file
+            filename: Name of the file
+            is_profile: Whether this is a profile image
+
+        Returns:
+            S3 URL if successful, None if failed
+        """
+        if not self.enabled:
+            return None
+
+        # Key layout: <prefix><profiles_folder|reviews_folder>/<filename>
+        folder = self.profiles_folder if is_profile else self.reviews_folder
+        return self.upload_file(local_path, f"{self.prefix}{folder}/{filename}")
+
+    def upload_images_batch(self, image_files: Dict[str, tuple]) -> Dict[str, str]:
+        """
+        Upload multiple images to S3.
+
+        Args:
+            image_files: Dict mapping filename to (local_path, is_profile) tuple
+
+        Returns:
+            Dict mapping filename to S3 URL for successful uploads
+        """
+        if not self.enabled:
+            return {}
+
+        results = {}
+        for filename, (local_path, is_profile) in image_files.items():
+            s3_url = self.upload_image(local_path, filename, is_profile)
+            if s3_url:
+                results[filename] = s3_url
+        if results:
+            log.info(f"Successfully uploaded {len(results)} images to S3")
+        return results
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 18c46cf..728ee14 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,6 @@ pymongo==4.12.0
 pyyaml==6.0.1
 certifi==2024.7.4
 webdriver-manager==4.0.2
-setuptools==79.0.1
\ No newline at end of file
+setuptools==79.0.1
+boto3==1.35.1
+pytest==7.4.3
\ No newline at end of file
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..ad2650b
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,54 @@
+# Tests
+
+This directory contains pytest tests for the Google Reviews Scraper.
+
+## Running Tests
+
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+2. Run all tests:
+```bash
+pytest tests/
+```
+
+3. Run specific test files:
+```bash
+pytest tests/test_mongodb_connection.py
+pytest tests/test_s3_connection.py
+```
+
+4. Run with verbose output:
+```bash
+pytest tests/ -v
+```
+
+## Test Coverage
+
+### MongoDB Connection Tests (`test_mongodb_connection.py`)
+- Tests MongoDB connection when enabled in config
+- Validates MongoDB configuration parameters
+- Tests basic database operations (insert/find/delete)
+- Skips tests when MongoDB is disabled
+
+### S3 Connection Tests (`test_s3_connection.py`)
+- Tests S3 connection when enabled in config
+- Validates S3 configuration parameters
+- Tests file upload/download operations
+- Tests S3Handler class initialization
+- Skips tests when S3 is disabled
+
+## Configuration
+
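+Tests use the main `config.yaml` file in the project root and run against the live services it points to (they briefly write and delete a test document/object). Make sure your configuration is properly set up: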
+
+- For MongoDB tests: Ensure `use_mongodb: true` and valid MongoDB credentials
+- For S3 tests: Ensure `use_s3: true` and valid AWS credentials
+
+## Test Results
+
+- Tests will be skipped if the corresponding service (MongoDB/S3) is disabled in config
+- Failed connection tests indicate configuration or service availability issues
+- All tests should pass when services are properly configured and accessible
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..739954c
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# Tests package
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..cad1082
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,39 @@
+"""
+Test configuration and fixtures for Google Reviews Scraper tests.
+"""
+
+import pytest
+import yaml
+from pathlib import Path
+
+
+@pytest.fixture
+def config():
+    """Load config.yaml from the project root; an empty file yields an empty dict."""
+    config_path = Path(__file__).parent.parent / "config.yaml"
+    with open(config_path, 'r', encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
+
+
+@pytest.fixture
+def mongodb_config(config):
+    """Return the ``mongodb`` section of the config, or {} when it is missing or null."""
+    return config.get("mongodb") or {}
+
+
+@pytest.fixture
+def s3_config(config):
+    """Return the ``s3`` section of the config, or {} when it is missing or null."""
+    return config.get("s3") or {}
+
+
+@pytest.fixture
+def use_mongodb(config):
+    """Return the ``use_mongodb`` flag from the loaded config (False when absent)."""
+    return config["use_mongodb"] if "use_mongodb" in config else False
+
+
+@pytest.fixture
+def use_s3(config):
+    """Return the ``use_s3`` flag from the loaded config (False when absent)."""
+    return config["use_s3"] if "use_s3" in config else False
\ No newline at end of file
diff --git a/tests/test_mongodb_connection.py b/tests/test_mongodb_connection.py
new file mode 100644
index 0000000..6e8992f
--- /dev/null
+++ b/tests/test_mongodb_connection.py
@@ -0,0 +1,90 @@
+"""
+Test MongoDB connection functionality.
+"""
+
+import pytest
+from pymongo import MongoClient
+from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
+
+
+class TestMongoDBConnection:
+    """Live integration tests: MongoDB connectivity and basic CRUD against the configured server."""
+
+    def test_mongodb_connection_when_enabled(self, use_mongodb, mongodb_config):
+        """Ping the configured server, then insert, read back and delete a throwaway document."""
+        if not use_mongodb:
+            pytest.skip("MongoDB is disabled in configuration")
+
+        if not mongodb_config:
+            pytest.fail("MongoDB is enabled but no MongoDB configuration found")
+
+        uri = mongodb_config.get("uri")
+        if not uri:
+            pytest.fail("MongoDB URI not found in configuration")
+
+        client = None
+        try:
+            # Create MongoDB client with shorter timeout for testing
+            client = MongoClient(uri, serverSelectionTimeoutMS=5000)
+            # Test connection by pinging the server
+            client.admin.command('ping')
+
+            # Test database access
+            database_name = mongodb_config.get("database", "reviews")
+            db = client[database_name]
+
+            # Test collection access
+            collection_name = mongodb_config.get("collection", "google_reviews")
+            collection = db[collection_name]
+
+            # Verify we can perform basic operations: insert and delete a dummy document
+            test_doc = {"_id": "test_connection", "test": True}
+            collection.insert_one(test_doc)
+
+            # Verify document was inserted
+            found_doc = collection.find_one({"_id": "test_connection"})
+            assert found_doc is not None
+            assert found_doc["test"] is True
+
+            # Clean up test document
+            collection.delete_one({"_id": "test_connection"})
+
+            # Verify document was deleted
+            found_doc = collection.find_one({"_id": "test_connection"})
+            assert found_doc is None
+
+        except ServerSelectionTimeoutError as e:  # subclass of ConnectionFailure: must come first
+            pytest.fail(f"MongoDB server selection timeout: {e}")
+        except ConnectionFailure as e:
+            pytest.fail(f"Failed to connect to MongoDB: {e}")
+        except Exception as e:
+            pytest.fail(f"Unexpected error testing MongoDB: {e}")
+        finally:
+            if client is not None:
+                client.close()
+
+    def test_mongodb_config_validation(self, use_mongodb, mongodb_config):
+        """Test that MongoDB configuration is valid when enabled"""
+        if not use_mongodb:
+            pytest.skip("MongoDB is disabled in configuration")
+
+        # Check required configuration fields
+        assert "uri" in mongodb_config, "MongoDB URI is required"
+        assert "database" in mongodb_config, "MongoDB database name is required"
+        assert "collection" in mongodb_config, "MongoDB collection name is required"
+
+        # Validate URI format
+        uri = mongodb_config["uri"]
+        assert uri.startswith("mongodb://") or uri.startswith("mongodb+srv://"), "Invalid MongoDB URI format"
+
+        # Validate names are not empty
+        assert mongodb_config["database"].strip(), "Database name cannot be empty"
+        assert mongodb_config["collection"].strip(), "Collection name cannot be empty"
+
+    def test_mongodb_skipped_when_disabled(self, use_mongodb):
+        """Test that MongoDB tests are skipped when disabled"""
+        if use_mongodb:
+            pytest.skip("MongoDB is enabled, this test is for disabled state")
+
+        # This test passes if we reach here, meaning MongoDB is properly disabled
+        assert True
\ No newline at end of file
diff --git a/tests/test_s3_connection.py b/tests/test_s3_connection.py
new file mode 100644
index 0000000..917d103
--- /dev/null
+++ b/tests/test_s3_connection.py
@@ -0,0 +1,202 @@
+"""
+Test S3 connection functionality.
+"""
+
+import pytest
+import boto3
+from botocore.exceptions import ClientError, NoCredentialsError
+from pathlib import Path
+import tempfile
+import os
+
+
+class TestS3Connection:
+    """Live integration tests: S3 connectivity, upload/download round trip, and S3Handler setup."""
+
+    def test_s3_connection_when_enabled(self, use_s3, s3_config):
+        """Test S3 connection when S3 is enabled in config"""
+        if not use_s3:
+            pytest.skip("S3 is disabled in configuration")
+
+        if not s3_config:
+            pytest.fail("S3 is enabled but no S3 configuration found")
+
+        # Validate required configuration
+        bucket_name = s3_config.get("bucket_name")
+        if not bucket_name:
+            pytest.fail("S3 bucket name not found in configuration")
+
+        region_name = s3_config.get("region_name", "us-east-1")
+
+        try:
+            # Create S3 client with credentials from config
+            session_kwargs = {"region_name": region_name}
+
+            aws_access_key_id = s3_config.get("aws_access_key_id")
+            aws_secret_access_key = s3_config.get("aws_secret_access_key")
+
+            if aws_access_key_id and aws_secret_access_key:
+                session_kwargs.update({
+                    "aws_access_key_id": aws_access_key_id,
+                    "aws_secret_access_key": aws_secret_access_key
+                })
+
+            s3_client = boto3.client("s3", **session_kwargs)
+
+            # Test bucket access by checking if bucket exists
+            s3_client.head_bucket(Bucket=bucket_name)
+
+        except NoCredentialsError:
+            pytest.fail("AWS credentials not found. Check your configuration or environment.")
+        except ClientError as e:
+            error_code = e.response.get('Error', {}).get('Code', '')
+            if error_code == '404':
+                pytest.fail(f"S3 bucket '{bucket_name}' not found")
+            elif error_code == '403':
+                pytest.fail(f"Access denied to S3 bucket '{bucket_name}'. Check your credentials and permissions.")
+            else:
+                pytest.fail(f"S3 client error: {e}")
+        except Exception as e:
+            pytest.fail(f"Unexpected error testing S3 connection: {e}")
+
+    def test_s3_upload_download_when_enabled(self, use_s3, s3_config):
+        """Upload a small object under the reviews folder, download it back and compare bytes."""
+        if not use_s3:
+            pytest.skip("S3 is disabled in configuration")
+
+        if not s3_config:
+            pytest.fail("S3 is enabled but no S3 configuration found")
+
+        bucket_name = s3_config.get("bucket_name")
+        if not bucket_name:
+            pytest.fail("S3 bucket name not found in configuration")
+
+        region_name = s3_config.get("region_name", "us-east-1")
+        prefix = s3_config.get("prefix", "reviews/").rstrip("/") + "/"
+        reviews_folder = s3_config.get("reviews_folder", "reviews/").strip("/")
+
+        try:
+            # Create S3 client
+            session_kwargs = {"region_name": region_name}
+
+            aws_access_key_id = s3_config.get("aws_access_key_id")
+            aws_secret_access_key = s3_config.get("aws_secret_access_key")
+
+            if aws_access_key_id and aws_secret_access_key:
+                session_kwargs.update({
+                    "aws_access_key_id": aws_access_key_id,
+                    "aws_secret_access_key": aws_secret_access_key
+                })
+
+            s3_client = boto3.client("s3", **session_kwargs)
+
+            # Create a temporary test file
+            test_content = b"This is a test file for S3 upload"
+            # Test with reviews folder structure
+            test_key = f"{prefix}{reviews_folder}/test_file.txt"
+
+            with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+                tmp_file.write(test_content)
+                tmp_file_path = tmp_file.name
+            # Bound up front so the cleanup below cannot raise NameError if the upload fails
+            download_path = None
+            try:
+                # Test upload
+                s3_client.upload_file(
+                    tmp_file_path,
+                    bucket_name,
+                    test_key,
+                    ExtraArgs={'ACL': 'public-read'}
+                )
+
+                # Test that file exists in S3
+                s3_client.head_object(Bucket=bucket_name, Key=test_key)
+
+                # Test download
+                with tempfile.NamedTemporaryFile(delete=False) as download_file:
+                    download_path = download_file.name
+
+                s3_client.download_file(bucket_name, test_key, download_path)
+
+                # Verify downloaded content matches uploaded content
+                with open(download_path, 'rb') as f:
+                    downloaded_content = f.read()
+
+                assert downloaded_content == test_content, "Downloaded content doesn't match uploaded content"
+
+                # Clean up S3 object
+                s3_client.delete_object(Bucket=bucket_name, Key=test_key)
+
+            finally:
+                # Clean up temporary files
+                if os.path.exists(tmp_file_path):
+                    os.unlink(tmp_file_path)
+                if download_path and os.path.exists(download_path):
+                    os.unlink(download_path)
+
+        except ClientError as e:
+            error_code = e.response.get('Error', {}).get('Code', '')
+            if error_code == '403':
+                pytest.fail("Access denied during S3 operations. Check your permissions.")
+            else:
+                pytest.fail(f"S3 operation failed: {e}")
+        except Exception as e:
+            pytest.fail(f"Unexpected error during S3 test: {e}")
+
+    def test_s3_config_validation(self, use_s3, s3_config):
+        """Test that S3 configuration is valid when enabled"""
+        if not use_s3:
+            pytest.skip("S3 is disabled in configuration")
+
+        # Check required configuration fields
+        assert "bucket_name" in s3_config, "S3 bucket_name is required"
+        assert s3_config["bucket_name"].strip(), "S3 bucket_name cannot be empty"
+
+        # Check optional fields have reasonable defaults
+        region_name = s3_config.get("region_name", "us-east-1")
+        assert region_name.strip(), "S3 region_name cannot be empty"
+
+        # Validate prefix format if provided
+        prefix = s3_config.get("prefix", "")
+        if prefix and not prefix.endswith("/"):
+            # This is not an error, but log a warning that prefix should end with "/"
+            pass
+
+    def test_s3_skipped_when_disabled(self, use_s3):
+        """Test that S3 tests are skipped when disabled"""
+        if use_s3:
+            pytest.skip("S3 is enabled, this test is for disabled state")
+
+        # This test passes if we reach here, meaning S3 is properly disabled
+        assert True
+
+    def test_s3_handler_initialization(self, config):
+        """Check that S3Handler honours ``use_s3`` and loads (or rejects) the bucket settings."""
+        try:
+            # Import the S3Handler class
+            import sys
+            sys.path.append(str(Path(__file__).parent.parent))
+            from modules.s3_handler import S3Handler
+
+            # Test initialization
+            s3_handler = S3Handler(config)
+
+            if not config.get("use_s3", False):
+                # Disabled in config: the handler must not enable itself
+                assert not s3_handler.enabled, "S3Handler enabled state should match config use_s3 setting"
+            else:
+                # If S3 is enabled, check that configuration was loaded
+                s3_config = config.get("s3") or {}
+                bucket_name = s3_config.get("bucket_name", "")
+                if bucket_name:
+                    assert s3_handler.bucket_name == bucket_name, "S3Handler should load bucket name from config"
+                    # With a bucket configured the handler must also have reached it
+                    assert s3_handler.enabled, "S3Handler enabled state should match config use_s3 setting"
+                else:
+                    # If no bucket name, handler should be disabled
+                    assert not s3_handler.enabled, "S3Handler should be disabled when bucket_name is missing"
+
+        except ImportError:
+            pytest.fail("Could not import S3Handler class")
+        except Exception as e:
+            pytest.fail(f"Error testing S3Handler initialization: {e}")
\ No newline at end of file