#!/usr/bin/env python3
"""
Geographic Data Fetcher
Fetches geographic reference data for enriching datasets with coordinates.

Data Sources:
1. Natural Earth - Country centroids and geographic data
2. Glottolog - Language coordinates and classifications

Usage:
    python3 fetch_geographic_data.py

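Output:
    - data/geographic/country_centroids.json
    - data/geographic/country_centroids_metadata.json
    - data/linguistic/glottolog_coordinates.json
    - data/linguistic/glottolog_coordinates_metadata.json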
"""

import json
import requests
import pandas as pd
import geopandas as gpd
from pathlib import Path
from datetime import datetime
import zipfile
import io

# Directory layout. BASE_DIR is the third ancestor directory of this file;
# every other location is derived from it.
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR.joinpath('data')
GEOGRAPHIC_DIR = DATA_DIR.joinpath('geographic')
LINGUISTIC_DIR = DATA_DIR.joinpath('linguistic')
CACHE_DIR = BASE_DIR.joinpath('cache', 'geographic')

# Create the output and cache directories (including missing parents) at
# import time; directories that already exist are left untouched.
for _directory in (GEOGRAPHIC_DIR, LINGUISTIC_DIR, CACHE_DIR):
    _directory.mkdir(parents=True, exist_ok=True)
del _directory  # keep the loop variable out of the module namespace

def _blank_if_missing(value):
    """Return '' for a null attribute value (None or float NaN), else the value unchanged."""
    # NaN is the only float that compares unequal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return value


def fetch_natural_earth_centroids():
    """
    Fetch country label points from Natural Earth and return them as records.

    Downloads the 1:10m admin-0 label-points shapefile ZIP, extracts it under
    CACHE_DIR / 'natural_earth', reads it with geopandas and converts every
    feature into a plain dict.

    Returns:
        tuple: (centroids, metadata). centroids is a list of dicts with the
        keys iso_a2, iso_a3, name, name_long, continent, region_un,
        subregion, longitude, latitude and source. metadata is a dict
        describing the download (source URL, timestamp, record count,
        field descriptions).

    Raises:
        Exception: any download, extraction or parsing error is printed and
            then re-raised unchanged.
    """
    print("\n" + "=" * 60)
    print("NATURAL EARTH COUNTRY CENTROIDS")
    print("=" * 60)

    # Natural Earth Admin 0 - Sovereignty Label Points
    url = "https://naciscdn.org/naturalearth/10m/cultural/ne_10m_admin_0_label_points.zip"

    # Output key -> shapefile attribute column.
    # NOTE(review): confirm the label-points layer really carries these
    # columns; any column that is absent is exported as '' for every record.
    attribute_columns = {
        'iso_a2': 'ISO_A2',
        'iso_a3': 'ISO_A3',
        'name': 'NAME',
        'name_long': 'NAME_LONG',
        'continent': 'CONTINENT',
        'region_un': 'REGION_UN',
        'subregion': 'SUBREGION',
    }

    print(f"\n📥 Downloading from: {url}")

    try:
        # Download the shapefile ZIP
        response = requests.get(url, timeout=120)
        response.raise_for_status()

        print(f"   ✓ Downloaded {len(response.content) / 1024:.1f} KB")

        # Unpack the in-memory archive into the cache directory; the
        # shapefile is then read from disk.
        extract_dir = CACHE_DIR / 'natural_earth'
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            z.extractall(extract_dir)

        shapefile_path = extract_dir / 'ne_10m_admin_0_label_points.shp'
        gdf = gpd.read_file(shapefile_path)

        print(f"   ✓ Loaded {len(gdf)} country centroids")

        # Make absent columns visible instead of silently writing blanks.
        missing_columns = [
            column for column in attribute_columns.values()
            if column not in gdf.columns
        ]
        if missing_columns:
            print(f"   ⚠ Columns missing from shapefile (exported as ''): {', '.join(missing_columns)}")

        # Extract relevant fields and coordinates
        centroids = []
        for _, row in gdf.iterrows():
            # row.get() only applies its default when the column is absent,
            # so null/NaN cell values are normalised to '' explicitly.
            centroid = {
                key: _blank_if_missing(row.get(column, ''))
                for key, column in attribute_columns.items()
            }
            # A feature without geometry gets null coordinates instead of
            # aborting the whole export.
            geometry = row.geometry
            centroid['longitude'] = geometry.x if geometry is not None else None
            centroid['latitude'] = geometry.y if geometry is not None else None
            centroid['source'] = 'Natural Earth 1:10m Admin 0 Label Points'
            centroids.append(centroid)

        # Create metadata
        metadata = {
            'source': 'Natural Earth Data',
            'dataset': 'Admin 0 - Sovereignty Label Points (1:10m)',
            'source_url': url,
            'download_date': datetime.now().isoformat(),
            'record_count': len(centroids),
            'description': 'Optimized centroid points for country labeling',
            'license': 'Public Domain',
            'coordinate_system': 'WGS84 (EPSG:4326)',
            'fields': {
                'iso_a2': '2-letter ISO country code',
                'iso_a3': '3-letter ISO country code',
                'name': 'Country name',
                'name_long': 'Formal country name',
                'continent': 'Continent',
                'region_un': 'UN region classification',
                'subregion': 'UN subregion',
                'longitude': 'Longitude (decimal degrees)',
                'latitude': 'Latitude (decimal degrees)'
            }
        }

        return centroids, metadata

    except Exception as e:
        print(f"   ✗ Error fetching Natural Earth data: {e}")
        raise

def fetch_glottolog_coordinates():
    """
    Fetch the Glottolog languages-and-dialects coordinate CSV as records.

    Downloads the CSV, parses it with pandas and converts each row into a
    plain dict. Missing cells (NaN in pandas) become None so that the
    records serialise to valid JSON (`null`) rather than the non-standard
    `NaN` token.

    Returns:
        tuple: (languages, metadata). languages is a list of dicts, one per
        CSV row, keyed by the CSV column names. metadata is a dict
        describing the download (source URL, timestamp, record count,
        number of rows with both coordinates, field descriptions).

    Raises:
        Exception: any download or parsing error is printed and then
            re-raised unchanged.
    """
    print("\n" + "=" * 60)
    print("GLOTTOLOG LANGUAGE COORDINATES")
    print("=" * 60)

    # Glottolog languages with coordinates (CSV served from cdstar.eva.mpg.de)
    url = "https://cdstar.eva.mpg.de/bitstreams/EAEA0-E62D-ED67-FD05-0/languages_and_dialects_geo.csv"

    print(f"\n📥 Downloading from: {url}")

    try:
        # Download the CSV
        response = requests.get(url, timeout=120)
        response.raise_for_status()

        print(f"   ✓ Downloaded {len(response.content) / 1024:.1f} KB")

        # requests falls back to ISO-8859-1 for text/* responses that declare
        # no charset, which would garble non-ASCII language names.
        # Assumes the published CSV is UTF-8 — TODO confirm against the source.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = 'utf-8'

        # Parse CSV
        df = pd.read_csv(io.StringIO(response.text))

        print(f"   ✓ Loaded {len(df)} language coordinates")

        # Convert to list of dicts, mapping missing cells (NaN) to None so
        # json.dump later writes `null` instead of the invalid token `NaN`.
        languages = [
            {
                column: (value if pd.notna(value) else None)
                for column, value in record.items()
            }
            for record in df.to_dict('records')
        ]

        # Count languages with valid coordinates
        valid_coords = sum(
            1 for lang in languages
            if lang.get('latitude') is not None and lang.get('longitude') is not None
        )

        # An empty CSV must not trigger a division by zero.
        total = len(languages)
        coverage_pct = 100 * valid_coords / total if total else 0.0

        # Create metadata
        # NOTE(review): the 'fields' descriptions are static text; verify
        # they match the columns actually present in the downloaded CSV.
        metadata = {
            'source': 'Glottolog',
            'source_url': url,
            'download_date': datetime.now().isoformat(),
            'record_count': total,
            'coordinates_available': valid_coords,
            'description': 'Geographic coordinates for world languages from Glottolog database',
            'license': 'CC-BY-4.0',
            'coordinate_system': 'WGS84 (EPSG:4326)',
            'fields': {
                'glottocode': 'Unique Glottolog identifier',
                'name': 'Language name',
                'isocodes': 'ISO 639-3 codes (comma-separated if multiple)',
                'level': 'Language vs. Dialect',
                'macroarea': 'Geographic macro-area',
                'latitude': 'Latitude (decimal degrees)',
                'longitude': 'Longitude (decimal degrees)',
                'family_id': 'Language family identifier',
                'family_name': 'Language family name',
                'parent_id': 'Parent language/dialect identifier',
                'bookkeeping': 'Glottolog classification status'
            },
            'coverage': f'{valid_coords} of {total} languages have coordinates ({coverage_pct:.1f}%)'
        }

        print(f"   ✓ {valid_coords} languages have coordinates ({coverage_pct:.1f}%)")

        return languages, metadata

    except Exception as e:
        print(f"   ✗ Error fetching Glottolog data: {e}")
        raise

def save_data(centroids, centroids_meta, languages, languages_meta):
    """
    Write both datasets and their metadata as JSON and print a summary.

    Args:
        centroids: country records, written to
            GEOGRAPHIC_DIR / 'country_centroids.json'.
        centroids_meta: metadata dict for the centroids; must contain
            'record_count'.
        languages: language records, written to
            LINGUISTIC_DIR / 'glottolog_coordinates.json'.
        languages_meta: metadata dict for the languages; must contain
            'record_count' and 'coordinates_available'.
    """

    def write_json(target, payload, **options):
        # All files are UTF-8 with a two-space indent; callers pass any
        # extra json.dump options (the data files keep non-ASCII text as-is).
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=2, **options)

    # --- Country centroids -------------------------------------------------
    centroids_file = GEOGRAPHIC_DIR / 'country_centroids.json'
    write_json(centroids_file, centroids, ensure_ascii=False)
    write_json(GEOGRAPHIC_DIR / 'country_centroids_metadata.json', centroids_meta)

    centroids_kb = centroids_file.stat().st_size / 1024
    print(f"\n💾 Saved country centroids to: {centroids_file}")
    print(f"   File size: {centroids_kb:.1f} KB")
    print(f"   Records: {centroids_meta['record_count']}")

    # --- Glottolog language coordinates -------------------------------------
    languages_file = LINGUISTIC_DIR / 'glottolog_coordinates.json'
    write_json(languages_file, languages, ensure_ascii=False)
    write_json(LINGUISTIC_DIR / 'glottolog_coordinates_metadata.json', languages_meta)

    languages_mb = languages_file.stat().st_size / (1024 * 1024)
    print(f"\n💾 Saved language coordinates to: {languages_file}")
    print(f"   File size: {languages_mb:.2f} MB")
    print(f"   Records: {languages_meta['record_count']}")
    print(f"   With coordinates: {languages_meta['coordinates_available']}")

    rule = "=" * 60
    print("\n" + rule)
    print("GEOGRAPHIC DATA FETCH COMPLETE")
    print(rule)

def main():
    """
    Run the fetch-and-save pipeline and report the outcome as an exit code.

    Returns:
        int: 0 when both datasets were fetched and saved, 1 when any step
        raised (the error and its traceback are printed first).
    """
    rule = "=" * 60
    for line in (rule, "GEOGRAPHIC COORDINATE FETCHER", rule):
        print(line)

    try:
        # Natural Earth first, then Glottolog, then persist both results.
        country_records, country_meta = fetch_natural_earth_centroids()
        language_records, language_meta = fetch_glottolog_coordinates()
        save_data(country_records, country_meta, language_records, language_meta)

        print("\n✅ All geographic data fetched successfully!")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1
    else:
        exit_code = 0

    return exit_code

if __name__ == "__main__":
    # `exit` is injected by the `site` module for interactive sessions and is
    # not guaranteed to exist (e.g. under `python -S`); raising SystemExit
    # passes main()'s return code to the OS without that dependency.
    raise SystemExit(main())
