#!/usr/bin/env python3
# -----------------------------------------------------------------------------
# File Purpose: Orchestrates niche dataset downloads (cables, space, Glottolog,
#               AS topology) into local cache for downstream visualizations.
# Primary Functions/Classes: fetch_submarine_cables, fetch_space_debris,
#                            fetch_endangered_languages, fetch_internet_topology
# Inputs and Outputs (I/O): Input is an optional --target CLI selector.
#                            Outputs are cached files under ./cache and
#                            metadata JSON in ./data.
# -----------------------------------------------------------------------------
"""
Niche Data Fetcher
Downloads and caches the "Hidden Gem" datasets identified in Phase 5.
"""

import json
import shutil
import argparse
from pathlib import Path
from datetime import datetime

import requests

# Configuration
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / 'data'
CACHE_DIR = BASE_DIR / 'cache'

# Ensure directories exist
DATA_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

metadata = {
    'collection_date': datetime.now().isoformat(),
    'sources': {},
    'files': {}
}

def fetch_submarine_cables():
    """
    Fetch TeleGeography Submarine Cable Map data.
    Source: https://github.com/telegeography/www.submarinecablemap.com
    """
    print("\n🌊 Fetching Submarine Cables...")
    # TeleGeography provides a structured public API endpoint for the map
    url = "https://www.submarinecablemap.com/api/v3/cable/cable-geo.json"
    cache_file = CACHE_DIR / 'submarine_cables.json'
    
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        
        with open(cache_file, 'wb') as f:
            f.write(response.content)
            
        print(f"   ✓ Downloaded Submarine Cables to {cache_file}")
        
        metadata['sources']['cables'] = url
        metadata['files']['cables'] = str(cache_file)
        return True
    except Exception as e:
        print(f"   ✗ Error fetching cables: {e}")
        return False
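
# A minimal sketch of consuming the cached cable file. It assumes the endpoint
# returns a GeoJSON FeatureCollection ('features' with 'geometry'/'properties');
# verify that, and the 'name' property key, against an actual download.
def summarize_submarine_cables(cache_file=CACHE_DIR / 'submarine_cables.json'):
    """Print a quick summary of the cached cable GeoJSON."""
    with open(cache_file) as f:
        data = json.load(f)
    features = data.get('features', [])
    print(f"Cables in file: {len(features)}")
    for feature in features[:5]:
        print("  -", feature.get('properties', {}).get('name', '<unnamed>'))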

def fetch_space_debris():
    """
    Fetch active satellite and space station element sets (GP data as JSON).
    Source: Celestrak
    """
    print("\n🛰️  Fetching Space Debris (Celestrak)...")
    # The full catalog is large; the 'active' group plus space stations is
    # enough for visualization purposes.
    targets = [
        ("active_satellites", "https://celestrak.org/NORAD/elements/gp.php?GROUP=active&FORMAT=json"),
        ("space_stations", "https://celestrak.org/NORAD/elements/gp.php?GROUP=stations&FORMAT=json")
    ]
    
    success = True
    for name, url in targets:
        cache_file = CACHE_DIR / f'celestrak_{name}.json'
        try:
            print(f"   ...fetching {name}")
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            
            with open(cache_file, 'wb') as f:
                f.write(response.content)
                
            metadata['sources'][f'space_{name}'] = url
            metadata['files'][f'space_{name}'] = str(cache_file)
        except Exception as e:
            print(f"   ✗ Error fetching {name}: {e}")
            success = False
            
    if success:
        print("   ✓ Downloaded Space Data")
    return success
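
# A minimal sketch of reading the cached Celestrak GP data. FORMAT=json returns
# a list of OMM-style records; field names such as OBJECT_NAME, NORAD_CAT_ID
# and MEAN_MOTION follow that convention, but confirm them against a real
# response before relying on them.
def list_space_stations(cache_file=CACHE_DIR / 'celestrak_space_stations.json'):
    """Print catalog number, name, and mean motion for each cached record."""
    with open(cache_file) as f:
        records = json.load(f)
    for rec in records:
        print(f"{rec.get('NORAD_CAT_ID')}: {rec.get('OBJECT_NAME')} "
              f"({rec.get('MEAN_MOTION')} rev/day)")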

def fetch_endangered_languages():
    """
    Fetch the Glottolog language catalog (languoids with coordinates).
    Source: glottolog/glottolog-cldf (Cross-Linguistic Data Formats).
    """
    print("\n🗣️  Fetching Endangered Languages (Glottolog)...")
    # Glottolog publishes its languoid catalog (names, codes, coordinates) as
    # CLDF CSV on GitHub. The UNESCO Atlas has no stable direct download, so we
    # rely on the Glottolog release instead. The repository's default branch
    # has changed over time, so try both 'master' and 'main'.
    urls = [
        "https://raw.githubusercontent.com/glottolog/glottolog-cldf/master/cldf/languages.csv",
        "https://raw.githubusercontent.com/glottolog/glottolog-cldf/main/cldf/languages.csv",
    ]
    cache_file = CACHE_DIR / 'glottolog_languages.csv'
    
    for url in urls:
        try:
            print(f"   ...trying {url}")
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            
            with open(cache_file, 'wb') as f:
                f.write(response.content)
                
            print(f"   ✓ Downloaded Glottolog Data to {cache_file}")
            metadata['sources']['languages'] = url
            metadata['files']['languages'] = str(cache_file)
            return True
        except Exception as e:
            print(f"   ✗ Error fetching languages from {url}: {e}")
            continue
    
    print("   ✗ Failed to download Glottolog language catalog from all sources.")
    return False
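
# A minimal sketch of loading the cached Glottolog CSV. CLDF language tables
# conventionally carry ID, Name, Latitude and Longitude columns; those header
# names are assumptions here -- confirm them against the downloaded file.
def load_language_points(cache_file=CACHE_DIR / 'glottolog_languages.csv'):
    """Return (name, lat, lon) tuples for languages that have coordinates."""
    import csv  # local import: only this helper needs it
    points = []
    with open(cache_file, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            lat, lon = row.get('Latitude'), row.get('Longitude')
            if lat and lon:
                points.append((row.get('Name'), float(lat), float(lon)))
    return points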

def fetch_internet_topology():
    """
    Fetch CAIDA AS Relationships.
    Source: CAIDA publicdata (serial-1 releases)
    """
    print("\n🕸️  Fetching Internet Topology (CAIDA)...")
    # CAIDA publishes as-relationships snapshots at a predictable URL:
    #   https://publicdata.caida.org/datasets/as-relationships/serial-1/YYYYMMDD.as-rel.txt.bz2
    # There is no 'latest' alias, so we probe a few likely release dates
    # (newest first, starting from the most recent release known at the time
    # of writing) and fail gracefully if none resolve.
    dates = ["20251201", "20251101", "20240101"]
    
    for date in dates:
        url = f"https://publicdata.caida.org/datasets/as-relationships/serial-1/{date}.as-rel.txt.bz2"
        cache_file = CACHE_DIR / f'caida_as_relationships_{date}.txt.bz2'
        
        try:
            print(f"   ...trying {url}")
            response = requests.get(url, stream=True, timeout=60)
            if response.status_code == 200:
                with open(cache_file, 'wb') as f:
                    shutil.copyfileobj(response.raw, f)
                print(f"   ✓ Downloaded Internet Topology ({date}) to {cache_file}")
                metadata['sources']['topology'] = url
                metadata['files']['topology'] = str(cache_file)
                return True
            print(f"   ✗ {date}: HTTP {response.status_code}")
        except Exception as e:
            print(f"   ✗ {date}: {e}")
            continue
            
    print("   ✗ Failed to fetch CAIDA Topology from predicted URLs.")
    return False
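
# A minimal sketch of parsing the cached as-rel file. The serial-1 format is
# 'provider|customer|-1' or 'peer|peer|0', one link per line, with '#' comment
# lines; pass the cache path for whichever date was actually fetched.
def count_as_relationships(cache_file):
    """Count provider-customer (-1) and peer-peer (0) links in an as-rel dump."""
    import bz2  # local import: only this helper needs it
    counts = {'-1': 0, '0': 0}
    with bz2.open(cache_file, 'rt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            rel = line.strip().split('|')[2]
            counts[rel] = counts.get(rel, 0) + 1
    return counts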

FETCHERS = {
    'cables': fetch_submarine_cables,
    'space': fetch_space_debris,
    'glottolog': fetch_endangered_languages,
    'topology': fetch_internet_topology,
}


def run(selected_targets):
    for target in selected_targets:
        fetcher = FETCHERS.get(target)
        if fetcher:
            fetcher()


def main():
    print("=" * 60)
    print("NICHE DATA ACTUATION")
    print("=" * 60)
    
    parser = argparse.ArgumentParser(description="Fetch niche datasets into cache.")
    parser.add_argument(
        "--target",
        choices=["all", "cables", "space", "glottolog", "topology"],
        default="all",
        help="Limit fetch to a single dataset; defaults to all.",
    )
    args = parser.parse_args()

    selected = list(FETCHERS.keys()) if args.target == "all" else [args.target]
    run(selected)
    
    # Save metadata
    meta_path = DATA_DIR / 'niche_metadata.json'
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=2)
        
    print("\n" + "=" * 60)
    print(f"✓ Metadata saved to {meta_path}")
    print("=" * 60)

if __name__ == "__main__":
    main()
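
# Usage sketch (assuming this file is saved as fetch_niche_data.py):
#   python fetch_niche_data.py                  # fetch all datasets
#   python fetch_niche_data.py --target cables  # submarine cables only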
