#!/usr/bin/env python3
"""
External Linguistic Data Fetcher
Downloads WALS, PHOIBLE, and Glottolog datasets.

Sources (all Creative Commons licensed, no authentication required):
1. WALS (World Atlas of Language Structures) - 2,679 languages, 192 typological features
2. PHOIBLE - 2,186 languages, phoneme inventories
3. Glottolog - 26,000+ languoids with macroareas, endangerment

Usage:
    python3 fetch_external_linguistic_data.py

Output:
    - data/linguistic/wals_languages.csv
    - data/linguistic/wals_values.csv
    - data/linguistic/phoible.csv
    - data/linguistic/glottolog_languoid.csv
"""

import requests
from pathlib import Path
from datetime import datetime
import zipfile
import io

# Configuration
# Paths are resolved three levels up from this file — assumes the script lives
# two directories below the repo root (TODO confirm project layout).
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / 'data' / 'linguistic'   # final CSV outputs land here
CACHE_DIR = BASE_DIR / 'cache' / 'linguistic'  # created but unused in this script

# Ensure directories exist
DATA_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Data source URLs (direct downloads, no auth required)
# Each entry: 'url' to fetch, 'output' filename under DATA_DIR, human-readable
# 'description', and optional 'is_zip' flag (archive containing the CSV).
SOURCES = {
    'wals_languages': {
        'url': 'https://raw.githubusercontent.com/cldf-datasets/wals/master/cldf/languages.csv',
        'output': 'wals_languages.csv',
        'description': 'WALS language index with ISO 639-3 codes'
    },
    'wals_values': {
        'url': 'https://raw.githubusercontent.com/cldf-datasets/wals/master/cldf/values.csv',
        'output': 'wals_values.csv',
        'description': 'WALS typological feature values'
    },
    'wals_parameters': {
        'url': 'https://raw.githubusercontent.com/cldf-datasets/wals/master/cldf/parameters.csv',
        'output': 'wals_parameters.csv',
        'description': 'WALS feature definitions (192 typological parameters)'
    },
    'phoible': {
        'url': 'https://raw.githubusercontent.com/phoible/dev/master/data/phoible.csv',
        'output': 'phoible.csv',
        'description': 'PHOIBLE phoneme inventories for 2,186 languages'
    },
    'glottolog_languoid': {
        'url': 'https://cdstar.eva.mpg.de/bitstreams/EAEA0-E7DE-FA06-8817-0/glottolog_languoid.csv.zip',
        'output': 'glottolog_languoid.csv',
        'description': 'Glottolog complete languoid data (26,000+ entries)',
        'is_zip': True
    }
}

def download_file(name, config):
    """Download one dataset described by *config* and save it under DATA_DIR.

    Args:
        name: Dataset identifier (key in SOURCES), used only for log output.
        config: Dict with keys 'url' and 'output', plus optional 'is_zip'
            (True when the URL serves a ZIP archive containing the CSV).

    Returns:
        int: Number of bytes written to the output file.

    Raises:
        requests.HTTPError: On a non-2xx response (via raise_for_status).
        ValueError: If a ZIP archive contains no '.csv' member.
    """
    print(f"\n📥 Downloading {name}...")
    print(f"   URL: {config['url']}")

    try:
        response = requests.get(config['url'], timeout=180)
        response.raise_for_status()

        file_size_mb = len(response.content) / (1024 * 1024)
        print(f"   ✓ Downloaded {file_size_mb:.2f} MB")

        output_path = DATA_DIR / config['output']

        # Handle ZIP files
        if config.get('is_zip'):
            print(f"   📦 Extracting ZIP...")
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                # Extract the first CSV member found in the archive.
                for file_info in z.filelist:
                    if file_info.filename.endswith('.csv'):
                        content = z.read(file_info)
                        output_path.write_bytes(content)
                        print(f"   ✓ Extracted to: {output_path}")
                        return len(content)
                # Previously this fell through and returned None, which made
                # main() crash on `total_size += size` — fail loudly instead.
                raise ValueError(f"No CSV file found in ZIP for {name}")
        else:
            # Save directly
            output_path.write_bytes(response.content)
            print(f"   ✓ Saved to: {output_path}")
            return len(response.content)

    except Exception as e:
        # Log here for context, then re-raise so main() can count the failure.
        print(f"   ✗ Error downloading {name}: {e}")
        raise

def main():
    """Download every dataset in SOURCES and print a summary.

    Failures are reported per-dataset and do not abort the run.

    Returns:
        int: Process exit code — 0 if at least one dataset was downloaded,
        1 if every download failed (so CI/shell callers can detect it).
    """
    print("=" * 70)
    print("EXTERNAL LINGUISTIC DATA FETCHER")
    print("=" * 70)
    print(f"\nTarget directory: {DATA_DIR}")
    print(f"Downloading {len(SOURCES)} datasets...")

    total_size = 0
    downloaded = []

    for name, config in SOURCES.items():
        try:
            size = download_file(name, config)
            total_size += size
            downloaded.append(name)
        except Exception as e:
            # Best-effort: report why this source failed and move on.
            print(f"\n⚠️  Failed to download {name} ({e}), continuing...")
            continue

    print("\n" + "=" * 70)
    print("DOWNLOAD SUMMARY")
    print("=" * 70)
    print(f"\n✅ Successfully downloaded {len(downloaded)}/{len(SOURCES)} datasets")
    print(f"📦 Total size: {total_size / (1024 * 1024):.1f} MB")

    print("\n📊 Downloaded datasets:")
    for name in downloaded:
        output_file = DATA_DIR / SOURCES[name]['output']
        file_size_mb = output_file.stat().st_size / (1024 * 1024)
        print(f"   • {SOURCES[name]['output']}: {file_size_mb:.1f} MB")
        print(f"     {SOURCES[name]['description']}")

    print("\n" + "=" * 70)
    print("✅ External data fetch complete!")
    print("=" * 70)

    # Non-zero exit when nothing was fetched; previously this always
    # returned 0, masking total failure from calling scripts.
    return 0 if downloaded else 1

if __name__ == "__main__":
    # raise SystemExit instead of the site-provided exit() builtin, which is
    # intended for interactive use and may be absent when run with -S.
    raise SystemExit(main())
