#!/usr/bin/env python3
"""
Emoji Usage Trends Data Fetcher
Fetches emoji usage statistics and trends over time.

Datasets:
- Unicode Emoji Frequency data
- Emoji usage trends from multiple sources

Usage:
    python fetch_emoji_data.py
    USE_CACHED_DATA=true python fetch_emoji_data.py
"""

import json
import os
import re
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv

# Load environment variables from a .env file, if present (supplies USE_CACHED_DATA).
load_dotenv()

# Configuration
# Three .parent hops assume this file lives at <repo>/tools/fetchers/<name>.py,
# so BASE_DIR is the repository root — TODO confirm if the file is ever moved.
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / 'data' / 'cultural'  # final JSON outputs consumed downstream
CACHE_DIR = BASE_DIR / 'tools' / 'fetchers' / 'cache' / 'emoji'  # raw fetch cache

# Ensure directories exist
DATA_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Use cached data flag: set USE_CACHED_DATA=true to reuse fresh-enough cached fetches.
USE_CACHED = os.getenv('USE_CACHED_DATA', 'false').lower() == 'true'


class EmojiDataFetcher:
    """Fetches emoji usage trends and statistics.

    Downloads the official Unicode emoji list, derives per-category
    summaries, and generates a mock usage timeline for visualization.
    Outputs are written as JSON under ``DATA_DIR``; raw fetches are
    cached under ``CACHE_DIR``.
    """

    def __init__(self):
        self.cache_dir = CACHE_DIR
        self.data_dir = DATA_DIR

    def _get_cache_path(self, data_type: str) -> Path:
        """Return the date-stamped path used when *writing* a cache file."""
        date_str = datetime.now().strftime('%Y%m%d')
        return self.cache_dir / f"{data_type}_{date_str}.json"

    def _find_latest_cache(self, data_type: str):
        """Return the most recently modified cache file for data_type, or None.

        Cache file names carry a date stamp, so checking only today's path
        would miss a perfectly valid cache written on a previous day (the
        freshness window is 7 days). Glob all matching files and pick the
        newest by mtime instead.
        """
        candidates = list(self.cache_dir.glob(f"{data_type}_*.json"))
        if not candidates:
            return None
        return max(candidates, key=lambda p: p.stat().st_mtime)

    def _is_cache_valid(self, cache_path: Path, max_age_hours: int = 168) -> bool:
        """Check if cache exists and is recent enough (default: 7 days)."""
        if not cache_path.exists():
            return False
        file_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
        return file_age < timedelta(hours=max_age_hours)

    def _fetch_with_cache(self, data_type: str, fetch_func, max_age_hours: int = 168):
        """Fetch data with caching logic.

        Reuses the newest fresh-enough cache file when USE_CACHED is set;
        otherwise calls ``fetch_func()``. Empty results (i.e. failed
        fetches) are NOT written to the cache, so a transient network
        error cannot poison the cache for the next ``max_age_hours``.
        """
        cached = self._find_latest_cache(data_type)
        if USE_CACHED and cached is not None and self._is_cache_valid(cached, max_age_hours):
            print(f"   Using cached data from {cached}")
            with open(cached, encoding='utf-8') as f:
                return json.load(f)

        print("   Fetching fresh data...")
        data = fetch_func()
        if data:
            # Only cache successful, non-empty fetches.
            cache_path = self._get_cache_path(data_type)
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"   Cached to {cache_path}")
        return data

    def fetch_unicode_emoji_list(self):
        """
        Fetch Unicode emoji list with categories and metadata.
        Source: Unicode.org emoji test data

        Returns a list of dicts with keys: emoji, codepoints, status,
        name, version, group, subgroup. Returns [] on fetch failure.
        """
        print("\n😀 Fetching Unicode Emoji List...")

        def fetch():
            # emoji-test.txt lists every emoji sequence with its
            # qualification status, organized under group/subgroup headers.
            url = "https://unicode.org/Public/emoji/latest/emoji-test.txt"

            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                # The file is UTF-8, but the server may omit a charset, in
                # which case requests falls back to ISO-8859-1 and garbles
                # every emoji character. Force the correct codec.
                response.encoding = 'utf-8'

                emoji_data = []
                current_group = None
                current_subgroup = None

                # Since Emoji 5.0, the trailing comment prefixes the name
                # with the introducing version, e.g. "E1.0 grinning face".
                version_re = re.compile(r'^(E\d+(?:\.\d+)?)\s+(.*)$')

                for line in response.text.split('\n'):
                    line = line.strip()

                    # Parse group/subgroup headers, e.g. "# group: Smileys & Emotion"
                    if line.startswith('# group:'):
                        current_group = line.replace('# group:', '').strip()
                        continue
                    elif line.startswith('# subgroup:'):
                        current_subgroup = line.replace('# subgroup:', '').strip()
                        continue

                    # Data lines: "<codepoints> ; <status> # <emoji> E<ver> <name>"
                    if line and not line.startswith('#') and ';' in line:
                        parts = line.split(';')
                        if len(parts) >= 2:
                            codepoints = parts[0].strip()
                            rest = parts[1].strip()

                            # Extract status and emoji character
                            if '#' in rest:
                                status_part, desc_part = rest.split('#', 1)
                                status = status_part.strip()

                                # Extract emoji character and name
                                desc_parts = desc_part.strip().split(' ', 1)
                                if len(desc_parts) >= 2:
                                    emoji_char = desc_parts[0]
                                    name = desc_parts[1].strip()

                                    # Split the "E<version>" prefix out of the
                                    # name so 'name' is the actual emoji name.
                                    version = None
                                    match = version_re.match(name)
                                    if match:
                                        version, name = match.groups()

                                    emoji_data.append({
                                        'emoji': emoji_char,
                                        'codepoints': codepoints,
                                        'status': status,
                                        'name': name,
                                        'version': version,
                                        'group': current_group,
                                        'subgroup': current_subgroup
                                    })

                print(f"   ✓ Parsed {len(emoji_data)} emoji from Unicode")
                return emoji_data

            except requests.RequestException as e:
                # Best-effort: report and return [] so the pipeline can
                # continue; _fetch_with_cache will not cache the empty result.
                print(f"   ✗ Error fetching Unicode emoji list: {e}")
                return []

        return self._fetch_with_cache('unicode_emoji_list', fetch)

    def generate_emoji_categories(self, emoji_list):
        """Generate emoji by category summary.

        Args:
            emoji_list: records produced by fetch_unicode_emoji_list().

        Returns:
            dict with totals, per-group and per-status counts, and the
            full per-group listings; None when emoji_list is empty.
        """
        print("\n📊 Generating Emoji Categories...")

        if not emoji_list:
            print("   ✗ No emoji data available")
            return None

        df = pd.DataFrame(emoji_list)

        # Count by group
        category_counts = df.groupby('group').size().to_dict()

        # Count by status
        status_counts = df.groupby('status').size().to_dict()

        categories = {
            'total_emoji': len(df),
            'by_group': category_counts,
            'by_status': status_counts,
            # Dict comprehension instead of groupby().apply(): apply over a
            # frame that still contains the grouping column is deprecated in
            # recent pandas, and this form is clearer anyway.
            'groups': {
                group: sub[['emoji', 'name', 'subgroup']].to_dict('records')
                for group, sub in df.groupby('group')
            },
        }

        output_path = self.data_dir / 'emoji_by_category.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(categories, f, indent=2, ensure_ascii=False)

        print(f"   ✓ Saved emoji categories to {output_path}")
        return categories

    def generate_mock_usage_timeline(self, emoji_list):
        """
        Generate mock emoji usage timeline based on introduction year.

        Note: Real usage data requires Twitter API or Google Ngrams data.
        This generates plausible trends based on emoji characteristics.

        Returns the timeline as a list of record dicts (also written to
        emoji_usage_timeline.json), or None when emoji_list is empty.
        """
        print("\n📈 Generating Emoji Usage Timeline...")

        if not emoji_list:
            print("   ✗ No emoji data available")
            return None

        # Popular emoji with estimated rise years (based on real trends)
        popular_emoji = {
            '😂': {'peak_year': 2016, 'name': 'face with tears of joy'},
            '❤️': {'peak_year': 2015, 'name': 'red heart'},
            '🔥': {'peak_year': 2017, 'name': 'fire'},
            '💀': {'peak_year': 2021, 'name': 'skull'},
            '😭': {'peak_year': 2018, 'name': 'loudly crying face'},
            '🙏': {'peak_year': 2019, 'name': 'folded hands'},
            '💯': {'peak_year': 2016, 'name': 'hundred points'},
            '😍': {'peak_year': 2017, 'name': 'smiling face with heart-eyes'},
            '🤔': {'peak_year': 2020, 'name': 'thinking face'},
            '👀': {'peak_year': 2019, 'name': 'eyes'},
        }

        timeline = []
        years = range(2016, 2027)  # 2016-2026 inclusive

        for emoji_char, info in popular_emoji.items():
            for year in years:
                # Plausible usage score: rises to 100 at peak_year,
                # then declines 8 points/year with a floor of 40.
                peak_year = info['peak_year']
                if year < peak_year:
                    usage_score = 30 + (year - 2016) * 10
                elif year == peak_year:
                    usage_score = 100
                else:
                    usage_score = max(40, 100 - (year - peak_year) * 8)

                timeline.append({
                    'emoji': emoji_char,
                    'name': info['name'],
                    'year': year,
                    'usage_score': usage_score,
                    'rank': None  # filled in below
                })

        # Calculate rank by year; dense ranks are whole numbers, so cast to
        # int to serialize cleanly (avoids "1.0" in the JSON output).
        df = pd.DataFrame(timeline)
        df['rank'] = (
            df.groupby('year')['usage_score']
            .rank(ascending=False, method='dense')
            .astype(int)
        )
        timeline = df.to_dict('records')

        output_path = self.data_dir / 'emoji_usage_timeline.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(timeline, f, indent=2, ensure_ascii=False)

        print(f"   ✓ Saved emoji usage timeline to {output_path}")
        print(f"   ℹ️  Note: Timeline uses mock data. Real usage requires Twitter/social media API.")
        return timeline

    def save_metadata(self, emoji_count, categories_count):
        """Save dataset metadata describing the emitted files and their limits."""
        metadata = {
            'dataset_name': 'Emoji Usage Trends',
            'last_updated': datetime.now().strftime('%Y-%m-%d'),
            'source': 'Unicode Consortium emoji-test.txt',
            'record_count': {
                'total_emoji': emoji_count,
                'categories': categories_count
            },
            'fields': {
                'emoji': 'Unicode emoji character',
                'codepoints': 'Unicode codepoint(s)',
                'status': 'Emoji status (fully-qualified, minimally-qualified, unqualified)',
                'name': 'Official emoji name',
                'version': 'Emoji version that introduced it (e.g. E1.0), when available',
                'group': 'Top-level category (e.g., Smileys & Emotion)',
                'subgroup': 'Subcategory (e.g., face-smiling)'
            },
            'notes': (
                'Unicode emoji list with categories. '
                'Usage timeline is mock data for visualization purposes. '
                'Real usage trends require Twitter API or Google Ngrams data.'
            ),
            'data_limitations': (
                'Usage timeline is generated mock data based on known emoji trends (2016-2026). '
                'For production use, integrate with: '
                '1) Twitter API v2 for real-time usage, '
                '2) Emojipedia Trends API, '
                '3) Google Ngrams for historical text analysis'
            )
        }

        meta_path = self.data_dir / 'emoji_metadata.json'
        with open(meta_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

        print(f"\n✓ Metadata saved to {meta_path}")


def main():
    """Entry point: fetch the Unicode emoji list and derive all artifacts."""
    banner = "=" * 60
    print(banner)
    print("EMOJI USAGE TRENDS DATA FETCHER")
    print(banner)

    fetcher = EmojiDataFetcher()
    emoji_list = fetcher.fetch_unicode_emoji_list()

    if emoji_list:
        # Derive category summary and (mock) usage timeline, then record
        # dataset metadata describing both outputs.
        categories = fetcher.generate_emoji_categories(emoji_list)
        fetcher.generate_mock_usage_timeline(emoji_list)

        group_count = len(categories['by_group']) if categories else 0
        fetcher.save_metadata(emoji_count=len(emoji_list), categories_count=group_count)

    print("\n" + banner)
    print("✓ Emoji data fetching complete")
    print(banner)


# Script entry point: run the full fetch pipeline when invoked directly.
if __name__ == "__main__":
    main()
