#!/usr/bin/env python3
"""
Accessibility Data Fetcher
Actuates the capture of public accessibility datasets identified in Phase 3.
"""

import os
import json
import requests
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv

# Try importing huggingface_hub
# Optional dependency: the Hugging Face fetcher checks HF_AVAILABLE and
# skips itself cleanly when the package is missing.
try:
    from huggingface_hub import hf_hub_download, list_datasets
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("⚠️  huggingface_hub not installed. Some datasets may not be accessible.")

# Load environment variables
# (e.g. HF tokens from a local .env file; no specific keys are read here)
load_dotenv()

# Configuration
# DATA_DIR holds the final metadata output; CACHE_DIR holds raw downloads.
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / 'data'
CACHE_DIR = BASE_DIR / 'cache'

# Ensure directories exist
DATA_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Metadata tracking
# Mutated in place by each fetch_* function: 'sources' maps dataset key ->
# URL actually used; 'record_counts' maps dataset key -> entry/line count.
metadata = {
    'collection_date': datetime.now().isoformat(),
    'sources': {},
    'record_counts': {}
}

def fetch_domesday_dataset():
    """
    Fetch the Domesday Dataset (AAC device purchases in the UK).

    Tries a list of known raw-GitHub URLs in order (main branch first,
    then master, then an older data path) and caches the first successful
    download under CACHE_DIR. On success, records the winning URL and the
    file's line count (includes the CSV header row) in the module-level
    ``metadata`` dict.

    Returns:
        bool: True if any URL succeeded, False otherwise.
    """
    print("\n🗣️  Fetching Domesday Dataset...")

    # Try main branch first, then master
    urls = [
        "https://raw.githubusercontent.com/EqualityTime/DomesdayDataset/main/Data/Clean/domesday_clean.csv",
        "https://raw.githubusercontent.com/EqualityTime/DomesdayDataset/master/Data/Clean/domesday_clean.csv",
        "https://raw.githubusercontent.com/EqualityTime/DomesdayDataset/master/Data/2014-04-01/Domesday.csv"
    ]
    cache_file = CACHE_DIR / 'domesday_aac.csv'

    for url in urls:
        try:
            print(f"   Trying {url}...")
            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                with open(cache_file, 'wb') as f:
                    f.write(response.content)
                # Fixed typo: was "Downloaded ... from to {cache_file}"
                print(f"   ✓ Downloaded Domesday Dataset to {cache_file}")

                # Count lines (includes the CSV header row).
                # errors='replace' so a stray non-UTF-8 byte can't abort
                # the count after a successful download.
                with open(cache_file, 'r', encoding='utf-8', errors='replace') as f:
                    count = sum(1 for _ in f)

                metadata['sources']['domesday'] = url
                metadata['record_counts']['domesday'] = count
                return True
        except (requests.RequestException, OSError) as e:
            # This mirror failed (network or disk error); report and move
            # on to the next candidate URL instead of swallowing silently.
            print(f"   … {e}")
            continue

    print("   ✗ Failed to fetch Domesday Dataset from any known URL")
    return False

def fetch_wlasl_metadata():
    """
    Fetch the WLASL (Word-Level ASL) metadata/index JSON.

    Source: dxli94/WLASL GitHub. Caches the index file under CACHE_DIR
    and records the URL and number of top-level entries (glosses) in the
    module-level ``metadata`` dict.

    Returns:
        bool: True on success, False on any download/parse error.
    """
    # Fixed banner: "Tw" was a mojibake'd emoji in the original.
    print("\n🤟 Fetching WLASL Metadata...")

    url = "https://raw.githubusercontent.com/dxli94/WLASL/master/start_kit/WLASL_v0.3.json"
    cache_file = CACHE_DIR / 'wlasl_index.json'

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        with open(cache_file, 'wb') as f:
            f.write(response.content)

        print(f"   ✓ Downloaded WLASL Index to {cache_file}")

        # Count top-level entries; explicit UTF-8 since the default text
        # encoding is platform-dependent (JSON is UTF-8 by spec).
        with open(cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            count = len(data)

        metadata['sources']['wlasl'] = url
        metadata['record_counts']['wlasl'] = count
        return True

    except Exception as e:
        # Best-effort fetcher: report and let the remaining datasets run.
        print(f"   ✗ Error fetching WLASL: {e}")
        return False

def fetch_vizwiz_annotations():
    """
    Fetch VizWiz VQA Annotations (validation set).
    Source: VizWiz.org / various mirrors.
    Using a simplified direct link or placeholder if too large.
    """
    print("\n👁️  Fetching VizWiz Annotations (Validation)...")

    # Using the official detailed annotation link for Validation
    url = "https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations/val.json"
    cache_file = CACHE_DIR / 'vizwiz_val_annotations.json'

    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        # Cache the raw bytes, then re-parse from disk so the cached copy
        # is what actually gets counted.
        cache_file.write_bytes(resp.content)
        print(f"   ✓ Downloaded VizWiz Annotations to {cache_file}")

        annotations = json.loads(cache_file.read_text())
        record_total = len(annotations)  # Usually a list of annotations

        metadata['sources']['vizwiz'] = url
        metadata['record_counts']['vizwiz'] = record_total
        return True

    except Exception as e:
        print(f"   ✗ Error fetching VizWiz: {e}")
        return False

def fetch_aac_conversations():
    """
    Fetch AAC Conversations dataset sample from Hugging Face.
    """
    print("\n🤖 Fetching AAC Conversations (Hugging Face)...")

    # Guard clause: nothing to do without the optional dependency.
    if not HF_AVAILABLE:
        print("   ✗ Skipping: huggingface_hub not installed")
        return False

    # Dataset ID found via search: willwade/AACConversations
    dataset_id = "willwade/AACConversations"
    print(f"   ⬇️  Downloading {dataset_id}...")

    try:
        # Pull only the README: this verifies the repo exists and is
        # accessible without downloading the full dataset.
        readme_path = hf_hub_download(
            repo_id=dataset_id,
            filename="README.md",
            repo_type="dataset",
            cache_dir=CACHE_DIR,
        )
        print(f"   ✓ Verified access (README at {readme_path})")

        metadata['sources']['aac_conversations'] = f"https://huggingface.co/datasets/{dataset_id}"
        # detailed stats require full load, skipping to save bandwidth
        return True

    except Exception as e:
        print(f"   ✗ Error fetching AAC Conversations: {e}")
        return False

def main():
    """
    Run every dataset fetcher in sequence, then persist the collection
    metadata (sources and record counts) to DATA_DIR as JSON.

    Each fetcher is best-effort: a failure prints its own diagnostic and
    the remaining fetchers still run.
    """
    print("=" * 60)
    print("ACCESSIBILITY DATA ACTUATION")
    print("=" * 60)

    fetch_domesday_dataset()
    fetch_wlasl_metadata()
    fetch_vizwiz_annotations()
    fetch_aac_conversations()

    # Save metadata. Explicit UTF-8 because the default text encoding is
    # platform-dependent (cp1252 on Windows); ensure_ascii=False keeps any
    # non-ASCII characters readable in the output file.
    meta_path = DATA_DIR / 'accessibility_metadata.json'
    with open(meta_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 60)
    print(f"✓ Metadata saved to {meta_path}")
    print("=" * 60)

if __name__ == "__main__":
    main()
