#!/usr/bin/env python3
"""
Food Deserts Data Collection
Fetches data on food access, grocery stores, SNAP usage, and nutritional outcomes
"""

import pandas as pd
import requests
import json
from pathlib import Path
from datetime import datetime
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file, if present
load_dotenv()

# Configuration: where final outputs and intermediate cached downloads live.
# Both directories are created up front so fetchers can write without checks.
OUTPUT_DIR = Path("data")
CACHE_DIR = Path("cache")
OUTPUT_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)

# Census API key; an empty string disables the Census-based fetchers
CENSUS_API_KEY = os.getenv("CENSUS_API_KEY", "")
# Reuse previously downloaded data unless USE_CACHED_DATA is set to "false"
USE_CACHED = os.getenv("USE_CACHED_DATA", "true").lower() == "true"

# Data year
CENSUS_YEAR = 2022  # Latest ACS 5-year
USDA_YEAR = 2019    # Latest Food Access Research Atlas

def save_to_cache(df, filename):
    """Persist a DataFrame to the cache directory as CSV.

    Args:
        df: DataFrame to cache.
        filename: Base name (without extension) for the cache file.
    """
    # Bug fix: the path was previously the literal "(unknown).csv", so the
    # filename argument was ignored and every dataset clobbered one file.
    cache_path = CACHE_DIR / f"{filename}.csv"
    df.to_csv(cache_path, index=False)
    print(f"✓ Cached: {cache_path}")

def load_cached_data(filename):
    """Load a cached dataset if caching is enabled and the file exists.

    Args:
        filename: Base name (without extension) of the cache file.

    Returns:
        DataFrame (all columns read as str, preserving leading zeros in
        FIPS codes), or None so callers fall through to a fresh fetch.
    """
    # Bug fix: the path was previously the literal "(unknown).csv", so the
    # filename argument was ignored and all datasets resolved to one file.
    cache_path = CACHE_DIR / f"{filename}.csv"
    if USE_CACHED and cache_path.exists():
        print(f"✓ Using cached: {cache_path}")
        return pd.read_csv(cache_path, dtype=str)
    return None

def fetch_usda_food_access():
    """
    Fetch USDA Food Access Research Atlas data
    Source: https://www.ers.usda.gov/data-products/food-access-research-atlas/
    """
    print("\n📊 Fetching USDA Food Access Research Atlas...")

    # Serve from cache when available to avoid re-downloading the workbook.
    cached = load_cached_data("usda_food_access")
    if cached is not None:
        return cached

    # USDA publishes the 2019 Atlas as a downloadable Excel workbook.
    # Note: URL updated Dec 2025
    url = "https://www.ers.usda.gov/webdocs/DataFiles/80591/FoodAccessResearchAtlasData2019.xlsx"

    try:
        print(f"  Downloading from: {url}")
        atlas = pd.read_excel(url, engine='openpyxl')

        # Tract-level access, low-income, and SNAP columns of interest.
        wanted = [
            'CensusTract', 'State', 'County', 'Urban', 'POP2010',
            'LowIncomeTracts', 'LILATracts_1And10', 'LILATracts_halfAnd10',
            'LILATracts_1And20', 'LILATracts_Vehicle',
            'lapophalf', 'lapop1', 'lapop10', 'lapop20',
            'lalowihalf', 'lalowi1', 'lalowi10', 'lalowi20',
            'lasnaphalf', 'lasnap1', 'lasnap10', 'lasnap20'
        ]
        # Keep only columns actually present in this vintage of the file.
        atlas = atlas[[col for col in wanted if col in atlas.columns]]

        save_to_cache(atlas, "usda_food_access")
        return atlas

    except Exception as e:
        print(f"  ❌ Error fetching USDA data: {e}")
        print(f"  Download manually from: https://www.ers.usda.gov/data-products/food-access-research-atlas/")
        return pd.DataFrame()

def fetch_snap_participation():
    """
    Fetch SNAP (food stamps) participation by county
    Source: Census Bureau SAIPE or USDA FNS

    Returns:
        DataFrame keyed by county FIPS with poverty counts/rate and a
        rough estimate of SNAP participation; empty DataFrame on failure.
    """
    print("\n📊 Fetching SNAP participation data...")

    cached = load_cached_data("snap_participation")
    if cached is not None:
        return cached

    # USDA Food and Nutrition Service provides state-level data
    # For county-level, we'll estimate from Census poverty + participation rates

    if not CENSUS_API_KEY:
        # Fix: previous message claimed "using sample data" but no sample
        # data was generated; this source is simply skipped.
        print("  ⚠️  No Census API key")
        return pd.DataFrame()

    try:
        # Get poverty data as proxy for SNAP eligibility (ACS table B17001)
        url = f"https://api.census.gov/data/{CENSUS_YEAR}/acs/acs5"
        params = {
            'get': 'NAME,B17001_001E,B17001_002E',  # Total pop, poverty pop
            'for': 'county:*',
            'key': CENSUS_API_KEY
        }

        # timeout prevents an indefinite hang; raise_for_status surfaces
        # HTTP errors here instead of as confusing JSON decode failures.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # First row of the Census payload is the header.
        df = pd.DataFrame(data[1:], columns=data[0])
        df['fips'] = df['state'] + df['county']
        df.columns = ['name', 'total_pop', 'poverty_pop', 'state', 'county', 'fips']

        # Calculate poverty rate
        df['total_pop'] = pd.to_numeric(df['total_pop'])
        df['poverty_pop'] = pd.to_numeric(df['poverty_pop'])
        df['poverty_rate'] = (df['poverty_pop'] / df['total_pop'] * 100).round(2)

        # Estimate SNAP participation (rough proxy)
        # Typically 60-75% of eligible people participate
        df['snap_eligible_est'] = df['poverty_pop']
        df['snap_participants_est'] = (df['poverty_pop'] * 0.67).round(0).astype(int)

        save_to_cache(df, "snap_participation")
        return df

    except Exception as e:
        print(f"  ❌ Error: {e}")
        return pd.DataFrame()

def fetch_grocery_store_density():
    """
    Fetch grocery store and food retailer density by county
    Source: County Business Patterns (CBP) - Census Bureau

    Returns:
        DataFrame keyed by county FIPS with establishment and employee
        counts for food and beverage stores; empty DataFrame on failure.
    """
    print("\n📊 Fetching grocery store density...")

    cached = load_cached_data("grocery_density")
    if cached is not None:
        return cached

    if not CENSUS_API_KEY:
        print("  ⚠️  No Census API key")
        return pd.DataFrame()

    try:
        # NAICS code 445 = Food and beverage stores
        # 4451 = Grocery stores
        # 4452 = Specialty food stores
        # 4453 = Beer, wine, and liquor stores

        year = 2021  # Latest CBP data
        url = f"https://api.census.gov/data/{year}/cbp"
        params = {
            'get': 'NAME,ESTAB,EMP',
            'for': 'county:*',
            'NAICS2017': '445',  # Food and beverage stores
            'key': CENSUS_API_KEY
        }

        # timeout prevents an indefinite hang; raise_for_status surfaces
        # HTTP errors here instead of as confusing JSON decode failures.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # First row of the Census payload is the header.
        df = pd.DataFrame(data[1:], columns=data[0])
        df['fips'] = df['state'] + df['county']
        df.columns = ['name', 'establishments', 'employees', 'state', 'county', 'fips']

        # CBP suppresses some small-county values; coerce those to NaN.
        df['establishments'] = pd.to_numeric(df['establishments'], errors='coerce')
        df['employees'] = pd.to_numeric(df['employees'], errors='coerce')

        save_to_cache(df, "grocery_density")
        return df

    except Exception as e:
        print(f"  ❌ Error: {e}")
        return pd.DataFrame()

def fetch_vehicle_access():
    """
    Fetch vehicle ownership data - critical for food access in rural areas
    Source: Census ACS

    Returns:
        DataFrame keyed by county FIPS with household counts and the
        percentage of households without a vehicle; empty on failure.
    """
    print("\n📊 Fetching vehicle access data...")

    cached = load_cached_data("vehicle_access")
    if cached is not None:
        return cached

    if not CENSUS_API_KEY:
        print("  ⚠️  No Census API key")
        return pd.DataFrame()

    try:
        url = f"https://api.census.gov/data/{CENSUS_YEAR}/acs/acs5"
        params = {
            'get': 'NAME,B25044_001E,B25044_003E,B25044_010E',  # Total households, no vehicle (owner), no vehicle (renter)
            'for': 'county:*',
            'key': CENSUS_API_KEY
        }

        # timeout prevents an indefinite hang; raise_for_status surfaces
        # HTTP errors here instead of as confusing JSON decode failures.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # First row of the Census payload is the header.
        df = pd.DataFrame(data[1:], columns=data[0])
        df['fips'] = df['state'] + df['county']
        df.columns = ['name', 'total_households', 'no_vehicle_owner', 'no_vehicle_renter', 'state', 'county', 'fips']

        df['total_households'] = pd.to_numeric(df['total_households'])
        df['no_vehicle_owner'] = pd.to_numeric(df['no_vehicle_owner'])
        df['no_vehicle_renter'] = pd.to_numeric(df['no_vehicle_renter'])

        # Combine owner- and renter-occupied households lacking a vehicle.
        df['no_vehicle_total'] = df['no_vehicle_owner'] + df['no_vehicle_renter']
        df['no_vehicle_pct'] = (df['no_vehicle_total'] / df['total_households'] * 100).round(2)

        save_to_cache(df, "vehicle_access")
        return df

    except Exception as e:
        print(f"  ❌ Error: {e}")
        return pd.DataFrame()

def merge_all_data():
    """Merge all food desert datasets"""
    print("\n🔗 Merging all datasets...")

    # Fetch every source up front (USDA is fetched for its cache side
    # effect even though the county-level merge below doesn't join it).
    usda = fetch_usda_food_access()
    snap = fetch_snap_participation()
    grocery = fetch_grocery_store_density()
    vehicles = fetch_vehicle_access()

    # Start with USDA data (tract level) or SNAP data (county level)
    if snap.empty:
        return pd.DataFrame()

    merged = snap.copy()

    # County-level left joins keyed on the 5-digit FIPS code.
    if not grocery.empty:
        merged = merged.merge(
            grocery[['fips', 'establishments', 'employees']],
            on='fips', how='left', suffixes=('', '_grocery'))

    if not vehicles.empty:
        merged = merged.merge(
            vehicles[['fips', 'no_vehicle_total', 'no_vehicle_pct']],
            on='fips', how='left')

    # Save merged dataset
    output_path = OUTPUT_DIR / "food_desert_merged.csv"
    merged.to_csv(output_path, index=False)
    print(f"✓ Saved: {output_path}")

    # Record provenance alongside the data.
    metadata = {
        'created': datetime.now().isoformat(),
        'rows': len(merged),
        'columns': list(merged.columns),
        'sources': [
            'Census ACS (poverty, vehicle access)',
            'Census CBP (grocery stores)',
            'USDA Food Access Research Atlas (recommended)'
        ]
    }
    meta_path = OUTPUT_DIR / "food_desert_merged_metadata.json"
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    return merged

if __name__ == "__main__":
    # Run the full pipeline and report a one-line summary.
    banner = "=" * 60
    print(banner)
    print("FOOD DESERTS DATA COLLECTION")
    print(banner)

    result = merge_all_data()

    if result.empty:
        print("\n❌ No data collected - check API keys and network")
    else:
        print(f"\n✓ SUCCESS: {len(result)} counties processed")
        print(f"\nColumns: {', '.join(result.columns)}")
