#!/usr/bin/env python3
"""
Fetch CMS Hospital General Information from CMS.gov Data API

Source: https://data.cms.gov/provider-data/dataset/xubh-q36u
API: https://data.cms.gov/provider-data/api/1/datastore/query/xubh-q36u

Downloads active hospital locations with coordinates, ownership type, and status.
"""

import requests
import json
import csv
from datetime import datetime
from pathlib import Path

# Configuration
API_URL = "https://data.cms.gov/provider-data/api/1/datastore/query/xubh-q36u"
OUTPUT_DIR = Path(__file__).parent.parent.parent / "data"
CACHE_DIR = Path(__file__).parent.parent / "cache"

def fetch_cms_hospitals(limit=10000, offset=0):
    """
    Fetch hospital records from the CMS API, paginating until exhausted.

    Args:
        limit: Requested page size; capped at 500 (the CMS API per-request maximum).
        offset: Record offset to start fetching from.

    Returns:
        List of raw hospital record dicts (possibly partial/empty if a
        request fails mid-way — errors stop pagination rather than raise).
    """
    all_hospitals = []
    page = 0
    # Loop-invariant: the effective per-request page size never changes.
    page_size = min(limit, 500)  # CMS API max is 500 per request

    while True:
        print(f"Fetching page {page + 1} (offset {offset})...")

        params = {
            "limit": page_size,
            "offset": offset
        }

        try:
            response = requests.get(API_URL, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError also covers JSON decode failures: on requests < 2.27
            # a malformed body raises a plain ValueError, which is NOT a
            # RequestException subclass and would otherwise crash the loop.
            print(f"Error fetching data: {e}")
            break

        results = data.get('results', [])

        if not results:
            print(f"No more results. Total hospitals: {len(all_hospitals)}")
            break

        all_hospitals.extend(results)
        print(f"  Retrieved {len(results)} hospitals (total: {len(all_hospitals)})")

        # A short page means this was the last page.
        if len(results) < page_size:
            break

        offset += len(results)
        page += 1

    return all_hospitals

def process_hospital_data(hospitals):
    """
    Process and clean hospital data for visualization use.

    Args:
        hospitals: List of raw hospital records

    Returns:
        List of cleaned hospital dictionaries
    """
    # Fields copied straight through from the raw record, defaulting to ''.
    # Order matters: it becomes the CSV column order downstream.
    simple_fields = (
        'facility_id',
        'facility_name',
        'address',
        'city',
        'state',
        'zip_code',
        'county_name',
        'phone_number',
        'hospital_type',
        'hospital_ownership',
        'emergency_services',
    )

    def _coordinates(raw):
        # Coordinates live in a nested 'location' dict; anything else
        # (missing key, None, unexpected type) yields empty strings.
        location = raw.get('location')
        if isinstance(location, dict):
            return location.get('lat', ''), location.get('lon', '')
        return '', ''

    cleaned = []
    for raw in hospitals:
        record = {name: raw.get(name, '') for name in simple_fields}
        record['latitude'], record['longitude'] = _coordinates(raw)
        cleaned.append(record)

    return cleaned

def save_to_csv(hospitals, output_file):
    """Save hospital data to CSV file."""
    if not hospitals:
        print("No hospitals to save")
        return

    # Make sure the destination directory exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Column order mirrors the key order of the first record.
    columns = list(hospitals[0])

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for row in hospitals:
            writer.writerow(row)

    print(f"\n✓ Saved {len(hospitals):,} hospitals to {output_file}")

def save_metadata(hospitals, metadata_file):
    """
    Write a JSON metadata descriptor for the saved hospital dataset.

    Args:
        hospitals: List of processed hospital dicts; used only for the
            row/column counts and column names recorded in the metadata.
        metadata_file: Path to the JSON file to create (parents are created).
    """
    metadata = {
        "dataset_name": "CMS Hospital General Information",
        "source": "Centers for Medicare & Medicaid Services",
        "url": "https://data.cms.gov/provider-data/dataset/xubh-q36u",
        "api_url": API_URL,
        "downloaded_date": datetime.now().isoformat(),
        "description": "Hospital locations, ownership, type, and emergency services from CMS",
        "rows": len(hospitals),
        "columns": len(hospitals[0]) if hospitals else 0,
        "column_names": list(hospitals[0].keys()) if hospitals else [],
        "geographic_level": "Facility (Hospital)",
        "key_fields": [
            "facility_id - CMS Certification Number (CCN)",
            "facility_name - Hospital name",
            "latitude, longitude - Hospital coordinates",
            "hospital_type - Critical Access, Acute Care, etc.",
            "hospital_ownership - Government, Proprietary, Voluntary",
            "emergency_services - Yes/No"
        ],
        "use_cases": [
            "Healthcare desert identification",
            "Hospital accessibility analysis",
            "Rural healthcare planning",
            "Emergency services coverage"
        ]
    }

    metadata_file.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 avoids platform-default encodings (which can fail on
    # non-ASCII facility/column text); ensure_ascii=False keeps any such
    # characters human-readable in the output file.
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print(f"✓ Saved metadata to {metadata_file}")

def main():
    """Fetch, process, and persist the CMS hospital dataset end to end."""
    divider = "=" * 70
    print(divider)
    print("CMS Hospital General Information Fetcher")
    print(divider)
    print()

    # Step 1: download the raw records from the CMS API.
    print("Fetching hospital data from CMS API...")
    raw_records = fetch_cms_hospitals()
    if not raw_records:
        print("No hospital data retrieved. Exiting.")
        return

    # Step 2: normalize the records for downstream visualization use.
    print(f"\nProcessing {len(raw_records):,} hospital records...")
    cleaned = process_hospital_data(raw_records)

    # Step 3: persist the CSV and its metadata side by side.
    save_to_csv(cleaned, OUTPUT_DIR / "cms_hospitals_2025.csv")
    save_metadata(cleaned, OUTPUT_DIR / "cms_hospitals_2025_metadata.json")

    # Step 4: keep a small raw sample in the cache for schema reference.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    sample_path = CACHE_DIR / f"cms_hospitals_raw_{datetime.now().strftime('%Y%m%d')}.json"
    with open(sample_path, 'w') as fh:
        json.dump(raw_records[:100], fh, indent=2)  # Save first 100 for reference
    print(f"✓ Saved sample raw data to {sample_path}")

    print("\n" + divider)
    print(f"SUCCESS! {len(cleaned):,} hospitals ready for healthcare desert analysis")
    print(divider)

if __name__ == "__main__":
    main()
