#!/usr/bin/env python3
"""
Wild Data Fetcher
Fetches "quirky" and unconventional datasets for exploratory visualization.

Datasets:
- NUFORC UFO Sightings (Hugging Face / NUFORC)
- NASA Meteorite Landings (NASA Open Data)
- USGS Earthquakes (USGS Real-time Feeds)
- NYC 311 Service Requests (NYC Open Data)
- OpenFoodFacts (Product Database)
- GDELT (Global Events)

Planned (no fetcher implemented yet):
- NOAA Storm Events (NOAA NCEI)
- DogSpeak/Audio (Hugging Face)

Usage:
    python fetch_wild_data.py
"""

import json
import requests
import pandas as pd
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv

# Try importing huggingface_hub
try:
    from huggingface_hub import hf_hub_download
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("⚠️  huggingface_hub not installed. Some datasets (UFO, DogSpeak) may be skipped.")

# Load environment variables
load_dotenv()

# Configuration
# Point to repository root (data_trove/), not tools/fetchers/
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / 'data'
CACHE_DIR = BASE_DIR / 'cache'
WILD_DATA_DIR = DATA_DIR / 'wild'
WILD_CACHE_DIR = CACHE_DIR / 'wild'

# Ensure directories exist
WILD_DATA_DIR.mkdir(exist_ok=True, parents=True)
WILD_CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Metadata tracking
metadata = {
    'collection_date': datetime.now().isoformat(),
    'sources': {},
    'files': {},
    'record_counts': {}
}

def fetch_ufo_sightings():
    """
    Fetch UFO Sightings from NUFORC via Hugging Face.
    """
    print("\n🛸 Fetching UFO Sightings...")
    
    if not HF_AVAILABLE:
        print("   ✗ Skipping: huggingface_hub not installed")
        return False

    repo_id = "kcimc/NUFORC" # Alternate repo
    
    try:
        # We'll try to download the README to confirm access
        hf_hub_download(repo_id=repo_id, filename="README.md", repo_type="dataset", cache_dir=WILD_CACHE_DIR)
        
        print(f"   ✓ Access verified for {repo_id}")
        metadata['sources']['ufo'] = f"https://huggingface.co/datasets/{repo_id}"
        return True
    except Exception as e:
        print(f"   ✗ Error accessing UFO data: {e}")
        return False
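
# A minimal sketch for pulling actual data files once access is verified.
# list_repo_files is a standard huggingface_hub helper; the assumption here
# is only that the repo contains CSV files -- inspect the listing before
# relying on any particular filename.
def download_ufo_files(repo_id="kcimc/NUFORC"):
    """List the dataset repo's files and download any CSVs into the cache."""
    from huggingface_hub import list_repo_files  # local import: optional dependency
    downloaded = []
    for name in list_repo_files(repo_id, repo_type="dataset"):
        if name.endswith('.csv'):
            path = hf_hub_download(repo_id=repo_id, filename=name,
                                   repo_type="dataset", cache_dir=WILD_CACHE_DIR)
            downloaded.append(path)
    return downloaded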

def fetch_nasa_meteorites():
    """
    Fetch Meteorite Landings from NASA legacy JSON endpoint.

    Note: Socrata API endpoints (gh4g-9sfh, y77d-th95) return 404 as of 2026-01.
    Using legacy S3-backed JSON endpoint instead.

    The endpoint returns a dict with 'meta' and 'data' keys. The 'data' key
    contains the actual meteorite records.
    """
    print("\n☄️  Fetching NASA Meteorite Landings...")
    # Legacy endpoint (redirects to S3)
    urls = [
        "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.json"
    ]
    cache_file = WILD_CACHE_DIR / 'nasa_meteorites.json'

    for url in urls:
        try:
            print(f"   Trying {url}...")
            # Legacy endpoint returns full dataset without pagination
            response = requests.get(url, timeout=60, allow_redirects=True)

            if response.status_code == 200:
                data = response.json()
                with open(cache_file, 'w') as f:
                    json.dump(data, f)

                # Count actual records (data is in 'data' key)
                record_count = len(data.get('data', []))
                print(f"   ✓ Downloaded {record_count:,} meteorite landings")
                metadata['sources']['meteorites'] = url
                metadata['files']['meteorites'] = str(cache_file)
                metadata['record_counts']['meteorites'] = record_count
                return True
            else:
                print(f"   Status {response.status_code}: {response.text[:100]}")

        except Exception as e:
            print(f"   ✗ Error: {e}")

    print("   ✗ Failed to fetch meteorites from known endpoints")
    return False
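
# A minimal sketch for loading the cached meteorite JSON into pandas. The
# endpoint only guarantees 'meta' and 'data' keys; the column lookup via
# meta['view']['columns'][*]['fieldName'] follows the usual Socrata
# rows.json layout and is an assumption -- verify it against the payload.
def load_meteorites_dataframe(cache_file=None):
    """Return the cached meteorite landings as a pandas DataFrame."""
    cache_file = cache_file or WILD_CACHE_DIR / 'nasa_meteorites.json'
    with open(cache_file) as f:
        payload = json.load(f)
    columns = [c['fieldName'] for c in payload['meta']['view']['columns']]
    df = pd.DataFrame(payload['data'], columns=columns)
    # Socrata prefixes system columns with ':'; keep only the data columns
    return df.loc[:, [c for c in df.columns if not c.startswith(':')]]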

def fetch_usgs_earthquakes():
    """
    Fetch USGS Earthquakes (Past 30 Days): significant events, plus all M4.5+ for context.
    """
    print("\n🌍 Fetching USGS Earthquakes...")
    # Summary feeds: significant earthquakes and all M4.5+, past 30 days
    feeds = {
        'significant': "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/significant_month.geojson",
        'm45_plus': "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/4.5_month.geojson",
    }

    any_ok = False
    for label, url in feeds.items():
        cache_file = WILD_CACHE_DIR / f'usgs_earthquakes_{label}_month.geojson'
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            with open(cache_file, 'w') as f:
                f.write(response.text)

            data = response.json()
            count = data.get('metadata', {}).get('count', 0)
            print(f"   ✓ Downloaded {count} earthquakes ({label})")

            metadata['sources'][f'earthquakes_{label}'] = url
            metadata['files'][f'earthquakes_{label}'] = str(cache_file)
            any_ok = True
        except Exception as e:
            print(f"   ✗ Error fetching earthquakes ({label}): {e}")
    return any_ok
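
# Quick-look sketch: flatten a cached USGS GeoJSON feed into a DataFrame.
# Summary feeds store magnitude/place/time under feature['properties'] and
# the epicenter under feature['geometry']['coordinates'] as [lon, lat, depth_km].
def quakes_to_dataframe(geojson_file):
    """Flatten a cached USGS GeoJSON feed into one row per earthquake."""
    with open(geojson_file) as f:
        feed = json.load(f)
    rows = []
    for feat in feed.get('features', []):
        props = feat.get('properties', {})
        coords = feat.get('geometry', {}).get('coordinates', [None, None, None])
        rows.append({
            'time_ms': props.get('time'),  # milliseconds since the Unix epoch
            'mag': props.get('mag'),
            'place': props.get('place'),
            'lon': coords[0], 'lat': coords[1], 'depth_km': coords[2],
        })
    return pd.DataFrame(rows)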

def fetch_nyc_311():
    """
    Fetch NYC 311 Service Requests (Sample).
    """
    print("\n🗽 Fetching NYC 311 Service Requests (Sample)...")
    url = "https://data.cityofnewyork.us/resource/erm2-nwe9.json"
    cache_file = WILD_CACHE_DIR / 'nyc_311_sample.json'
    
    try:
        # Fetch the 1,000 most recent complaints (SoQL $order, newest first)
        params = {
            '$limit': 1000,
            '$order': 'created_date DESC'
        }
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        
        data = response.json()
        with open(cache_file, 'w') as f:
            json.dump(data, f)
            
        print(f"   ✓ Downloaded {len(data)} 311 requests")
        metadata['sources']['nyc_311'] = url
        metadata['files']['nyc_311'] = str(cache_file)
        return True
    except Exception as e:
        print(f"   ✗ Error fetching NYC 311: {e}")
        return False
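
# Quick-look sketch: tally complaint types in the cached 311 sample. The
# 'complaint_type' field name comes from the erm2-nwe9 schema; confirm it
# against the downloaded JSON before building anything on top of it.
def top_311_complaints(n=10):
    """Return the n most common complaint types in the cached sample."""
    with open(WILD_CACHE_DIR / 'nyc_311_sample.json') as f:
        rows = json.load(f)
    types = pd.Series([r.get('complaint_type', 'Unknown') for r in rows])
    return types.value_counts().head(n)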

def fetch_openfoodfacts_sample():
    """
    Fetch OpenFoodFacts Product Sample.
    """
    print("\n🥫 Fetching OpenFoodFacts Sample...")
    # Search for a category to get a diverse list, e.g., 'snacks'
    url = "https://world.openfoodfacts.org/cgi/search.pl"
    params = {
        'search_terms': 'snack',
        'search_simple': 1,
        'action': 'process',
        'json': 1,
        'page_size': 50
    }
    cache_file = WILD_CACHE_DIR / 'openfoodfacts_sample.json'
    
    try:
        # OpenFoodFacts asks API clients to identify themselves with a
        # descriptive User-Agent; the generous 120s timeout is because the
        # search endpoint can be slow
        headers = {'User-Agent': 'wild-data-fetcher/1.0 (exploratory research)'}
        response = requests.get(url, params=params, headers=headers, timeout=120)
        response.raise_for_status()
        
        data = response.json()
        products = data.get('products', [])
        
        with open(cache_file, 'w') as f:
            json.dump(products, f)
            
        print(f"   ✓ Downloaded {len(products)} food products")
        metadata['sources']['food_facts'] = url
        metadata['files']['food_facts'] = str(cache_file)
        return True
    except Exception as e:
        print(f"   ✗ Error fetching OpenFoodFacts: {e}")
        return False
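
# Quick-look sketch: pull a few fields of interest from the cached product
# sample. 'product_name', 'brands', and 'nutriscore_grade' are common
# OpenFoodFacts fields but not guaranteed on every product, hence .get().
def food_sample_table():
    """Return product name, brand, and Nutri-Score for the cached sample."""
    with open(WILD_CACHE_DIR / 'openfoodfacts_sample.json') as f:
        products = json.load(f)
    return pd.DataFrame([{
        'product': p.get('product_name'),
        'brands': p.get('brands'),
        'nutriscore': p.get('nutriscore_grade'),
    } for p in products])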

def fetch_gdelt_events():
    """
    Fetch GDELT 2.0 Events (Last 15 minutes).
    """
    print("\n📰 Fetching GDELT Global Events...")
    # GDELT updates every 15 minutes. We need to get the latest URL from the file list.
    list_url = "http://data.gdeltproject.org/gdeltv2/lastupdate.txt"
    
    try:
        response = requests.get(list_url, timeout=30)
        response.raise_for_status()
        
        # Parse the first line for the export CSV URL
        # Each line is: <size> <md5-hash> <url>; the first line is the events export
        latest_line = response.text.splitlines()[0]
        csv_url = latest_line.split()[2]
        
        print(f"   Downloading latest events from: {csv_url}")
        
        event_response = requests.get(csv_url, timeout=60)
        event_response.raise_for_status()
        
        filename = csv_url.split('/')[-1]
        cache_file = WILD_CACHE_DIR / filename
        
        with open(cache_file, 'wb') as f:
            f.write(event_response.content)
            
        print(f"   ✓ Downloaded GDELT events to {cache_file}")
        metadata['sources']['gdelt'] = csv_url
        metadata['files']['gdelt'] = str(cache_file)
        return True
    except Exception as e:
        print(f"   ✗ Error fetching GDELT: {e}")
        return False
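
# Sketch for peeking at a downloaded GDELT slice. The export file is a
# zipped, tab-separated CSV with no header row (61 columns in GDELT 2.0);
# pandas can read the zip directly. Column names are omitted on purpose --
# consult the GDELT 2.0 event codebook for the full schema.
def peek_gdelt(cache_file, nrows=5):
    """Read the first few rows of a cached GDELT export zip."""
    return pd.read_csv(cache_file, sep='\t', header=None,
                       compression='zip', nrows=nrows)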

def main():
    print("=" * 60)
    print("WILD DATA ACTUATION")
    print("=" * 60)

    # NASA meteorite fetcher updated to use legacy endpoint (2026-01-19)
    fetch_nasa_meteorites()
    fetch_usgs_earthquakes()
    fetch_nyc_311()
    fetch_openfoodfacts_sample()
    fetch_gdelt_events()
    fetch_ufo_sightings()  # Requires huggingface_hub
    
    # Save metadata
    meta_path = WILD_DATA_DIR / 'wild_metadata.json'
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=2)
        
    print("\n" + "=" * 60)
    print(f"✓ Metadata saved to {meta_path}")
    print("=" * 60)

if __name__ == "__main__":
    main()
