#!/usr/bin/env python3
"""
ISO 639-3 Language Codes Fetcher
Fetches ISO 639-3 language codes from SIL International.

ISO 639-3 is the international standard for language identification codes,
covering over 7,000 languages. Maintained by SIL International.

Usage:
    python3 fetch_iso_codes.py

Output:
    - data/linguistic/iso_639_3.json
    - data/linguistic/iso_639_3_metadata.json
"""

import csv
import json
import sys
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
import requests

# Configuration
# BASE_DIR climbs three levels from this file — presumably the repository
# root (e.g. <root>/scripts/linguistic/fetch_iso_codes.py); verify if the
# script is relocated.
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / 'data'
LINGUISTIC_DIR = DATA_DIR / 'linguistic'  # destination for the JSON outputs
CACHE_DIR = BASE_DIR / 'cache' / 'linguistic'  # NOTE(review): created below but not used in this file

# Ensure directories exist (idempotent: exist_ok avoids errors on re-runs,
# parents=True creates intermediate directories). Runs at import time.
LINGUISTIC_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# ISO 639-3 Data Source
# Official tab-separated code table published by SIL International,
# the ISO 639-3 registration authority.
ISO_639_3_URL = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"

def fetch_iso_639_3():
    """
    Fetch ISO 639-3 language codes from SIL International.

    Returns tuple of (DataFrame, metadata_dict)
    """
    print("=" * 60)
    print("ISO 639-3 LANGUAGE CODES FETCHER")
    print("=" * 60)
    print(f"\n📥 Fetching from: {ISO_639_3_URL}")

    try:
        # Download the tab-separated file
        response = requests.get(ISO_639_3_URL, timeout=60)
        response.raise_for_status()

        # Parse as TSV (tab-separated values)
        df = pd.read_csv(StringIO(response.text), sep='\t')

        record_count = len(df)
        print(f"   ✓ Downloaded {record_count:,} language codes")

        # Convert to list of dicts for JSON export
        languages = df.to_dict('records')

        # Create metadata
        metadata = {
            'source': 'SIL International ISO 639-3',
            'source_url': ISO_639_3_URL,
            'download_date': datetime.now().isoformat(),
            'record_count': record_count,
            'description': 'ISO 639-3 international standard for language identification',
            'fields': {
                'Id': '3-letter language code (ISO 639-3)',
                'Part2B': '3-letter bibliographic code (ISO 639-2/B)',
                'Part2T': '3-letter terminological code (ISO 639-2/T)',
                'Part1': '2-letter code (ISO 639-1)',
                'Scope': 'Individual, Macrolanguage, or Special',
                'Language_Type': 'Living, Extinct, Ancient, Historic, Constructed',
                'Ref_Name': 'Reference language name',
                'Comment': 'Additional notes'
            },
            'license': 'SIL Open Font License / CC-BY-SA',
            'coverage': f'{record_count} languages worldwide',
            'language_types': df['Language_Type'].value_counts().to_dict() if 'Language_Type' in df.columns else {},
            'scopes': df['Scope'].value_counts().to_dict() if 'Scope' in df.columns else {}
        }

        return languages, metadata

    except Exception as e:
        print(f"   ✗ Error fetching ISO 639-3: {e}")
        raise

def save_data(languages, metadata):
    """Save language codes and metadata to JSON files.

    Args:
        languages: list of per-language dicts (from fetch_iso_639_3).
        metadata: dict describing the download (from fetch_iso_639_3).

    Side effects:
        Writes iso_639_3.json and iso_639_3_metadata.json into
        LINGUISTIC_DIR and prints a summary to stdout.
    """

    # Save main data file
    iso_file = LINGUISTIC_DIR / 'iso_639_3.json'
    with open(iso_file, 'w', encoding='utf-8') as f:
        json.dump(languages, f, indent=2, ensure_ascii=False)

    file_size_mb = iso_file.stat().st_size / (1024 * 1024)
    print(f"\n💾 Saved ISO 639-3 codes to: {iso_file}")
    print(f"   File size: {file_size_mb:.2f} MB")

    # Save metadata. ensure_ascii=False for consistency with the main data
    # file above — metadata values (e.g. value_counts keys derived from the
    # data) may contain non-ASCII characters.
    meta_file = LINGUISTIC_DIR / 'iso_639_3_metadata.json'
    with open(meta_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved metadata to: {meta_file}")

    # Print summary statistics
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total languages: {metadata['record_count']:,}")

    if metadata.get('language_types'):
        print("\nBy type:")
        # Largest categories first for readability.
        for lang_type, count in sorted(metadata['language_types'].items(), key=lambda x: x[1], reverse=True):
            print(f"  {lang_type}: {count:,}")

    if metadata.get('scopes'):
        print("\nBy scope:")
        for scope, count in sorted(metadata['scopes'].items(), key=lambda x: x[1], reverse=True):
            print(f"  {scope}: {count:,}")

    print("=" * 60)

def main():
    """Entry point: fetch the ISO 639-3 table and write it to disk.

    Returns:
        int: 0 on success, 1 if any step raised an exception.
    """
    try:
        languages, metadata = fetch_iso_639_3()
        save_data(languages, metadata)
        print("\n✅ ISO 639-3 fetch complete!")
    except Exception as exc:
        # Report and signal failure via a non-zero exit status.
        print(f"\n❌ Error: {exc}")
        return 1
    return 0

if __name__ == "__main__":
    # sys.exit rather than the builtin exit(): exit() is an interactive
    # convenience injected by the site module and is not guaranteed to
    # exist under `python -S` or in frozen/embedded interpreters.
    sys.exit(main())