#!/usr/bin/env python3
"""
Wikipedia Pageviews Data Fetcher
Fetches Wikipedia article pageview statistics and trending topics.

Datasets:
- Wikipedia Pageviews API (2015-present)
- Top articles by day/month
- Historical pageview trends for specific articles

Usage:
    python fetch_wikipedia_pageviews.py
    USE_CACHED_DATA=true python fetch_wikipedia_pageviews.py

Note: No API key required. Uses Wikimedia REST API.
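
Library use (the same class can be imported instead of running the script;
the date below is illustrative):
    from fetch_wikipedia_pageviews import WikipediaPageviewsFetcher
    fetcher = WikipediaPageviewsFetcher()
    top = fetcher.fetch_top_articles_by_day(2024, 1, 15, limit=100)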
"""

import os
import json
import requests
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configuration
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / 'data' / 'attention'
CACHE_DIR = BASE_DIR / 'tools' / 'fetchers' / 'cache' / 'wikipedia'

# Ensure directories exist
DATA_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# When true, reuse same-day cache files; fresh fetches always write the cache
USE_CACHED = os.getenv('USE_CACHED_DATA', 'false').lower() == 'true'

# Wikimedia API base URL
WIKI_BASE_URL = "https://wikimedia.org/api/rest_v1"
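# Endpoints used below (interactive docs: https://wikimedia.org/api/rest_v1/):
#   /metrics/pageviews/top/{project}/{access}/{yyyy}/{mm}/{dd}
#   /metrics/pageviews/per-article/{project}/{access}/{agent}/{title}/daily/{start}/{end}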


class WikipediaPageviewsFetcher:
    """Fetches Wikipedia pageview statistics."""

    def __init__(self):
        self.cache_dir = CACHE_DIR
        self.data_dir = DATA_DIR
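        # Wikimedia's User-Agent policy asks clients to send a descriptive
        # agent string with contact info; generic or missing agents may be
        # throttled or blocked.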
        self.headers = {
            'User-Agent': 'DataTroveBot/1.0 (https://dr.eamer.dev; luke@lukesteuber.com)'
        }

    def _get_cache_path(self, data_type: str) -> Path:
        """Generate cache path with date stamp."""
        date_str = datetime.now().strftime('%Y%m%d')
        return self.cache_dir / f"{data_type}_{date_str}.json"

    def _is_cache_valid(self, cache_path: Path, max_age_hours: int = 24) -> bool:
        """Check if cache exists and is recent enough (default: 24 hours)."""
        if not cache_path.exists():
            return False
        file_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
        return file_age < timedelta(hours=max_age_hours)

    def _fetch_with_cache(self, data_type: str, fetch_func, max_age_hours: int = 24):
        """Fetch data with caching logic."""
        cache_path = self._get_cache_path(data_type)

        if USE_CACHED and self._is_cache_valid(cache_path, max_age_hours):
            print(f"   Using cached data from {cache_path}")
            with open(cache_path) as f:
                return json.load(f)
        else:
            print(f"   Fetching fresh data...")
            data = fetch_func()
            with open(cache_path, 'w') as f:
                json.dump(data, f, indent=2)
            print(f"   Cached to {cache_path}")
            return data

    def fetch_top_articles_by_day(self, year=None, month=None, day=None, limit=1000):
        """
        Fetch top articles for a specific day.

        Args:
            year: Year, e.g. 2024 (if any date component is omitted, the
                whole date defaults to yesterday)
            month: Month (1-12)
            day: Day (1-31)
            limit: Maximum number of articles to keep after filtering (default: 1000)
        """
        # Default the entire date to yesterday if any component is missing
        if not all([year, month, day]):
            yesterday = datetime.now() - timedelta(days=1)
            year = yesterday.year
            month = yesterday.month
            day = yesterday.day

        date_str = f"{year}/{month:02d}/{day:02d}"
        print(f"\n📊 Fetching Top {limit} Articles for {date_str}...")

        def fetch():
            url = f"{WIKI_BASE_URL}/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month:02d}/{day:02d}"

            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                data = response.json()

                items = data.get('items', [])
                # Guard against an empty 'items' list before indexing
                articles = items[0].get('articles', []) if items else []

                # Filter out special pages first, then truncate, so `limit`
                # counts real articles rather than raw ranked entries
                filtered = [
                    article for article in articles
                    if not article['article'].startswith(('Special:', 'Main_Page', '-'))
                ][:limit]

                print(f"   ✓ Fetched {len(filtered)} top articles")
                return filtered

            except Exception as e:
                print(f"   ✗ Error fetching top articles: {e}")
                return []

        return self._fetch_with_cache(f'top_articles_{year}{month:02d}{day:02d}', fetch)

    def fetch_article_pageviews(self, article_title, start_date, end_date):
        """
        Fetch pageview statistics for a specific article over time.

        Args:
            article_title: Article title (e.g., 'Python_(programming_language)')
            start_date: Start date (YYYYMMDD)
            end_date: End date (YYYYMMDD)
        """
        print(f"\n📈 Fetching pageviews for '{article_title}'...")

        def fetch():
            # URL-encode the article title; safe='' also encodes slashes,
            # which would otherwise break the endpoint path
            encoded_title = requests.utils.quote(article_title, safe='')
            url = (
                f"{WIKI_BASE_URL}/metrics/pageviews/per-article/"
                f"en.wikipedia/all-access/user/{encoded_title}/daily/{start_date}/{end_date}"
            )
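            # Each returned item describes one day: 'project', 'article',
            # 'timestamp' (YYYYMMDDHH), and 'views'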

            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                data = response.json()

                items = data.get('items', [])
                print(f"   ✓ Fetched {len(items)} daily pageview records")
                return items

            except Exception as e:
                print(f"   ✗ Error fetching article pageviews: {e}")
                return []

        # Article titles can contain filesystem-unfriendly characters
        safe_title = article_title.replace('/', '_').replace(':', '_')
        cache_key = f'article_{safe_title}_{start_date}_{end_date}'
        return self._fetch_with_cache(cache_key, fetch)

    def fetch_trending_articles(self, days_back=30):
        """
        Fetch trending articles over the last N days.

        Args:
            days_back: Number of days to look back (default: 30)
        """
        print(f"\n🔥 Fetching Trending Articles (last {days_back} days)...")

        all_articles = {}
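        # Each day's top-100 list is fetched (and cached) individually, so a
        # cold 30-day run issues about 30 API requests; with
        # USE_CACHED_DATA=true, same-day reruns are served from cache instead.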

        for i in range(days_back):
            date = datetime.now() - timedelta(days=i+1)
            year = date.year
            month = date.month
            day = date.day

            # Fetch top 100 for each day
            daily_top = self.fetch_top_articles_by_day(year, month, day, limit=100)

            if daily_top:
                for article in daily_top:
                    title = article['article']
                    views = article['views']

                    if title not in all_articles:
                        all_articles[title] = {
                            'title': title,
                            'total_views': 0,
                            'days_in_top_100': 0,
                            'peak_views': 0,
                            'daily_views': []
                        }

                    all_articles[title]['total_views'] += views
                    all_articles[title]['days_in_top_100'] += 1
                    all_articles[title]['peak_views'] = max(
                        all_articles[title]['peak_views'], views
                    )
                    all_articles[title]['daily_views'].append({
                        'date': date.strftime('%Y-%m-%d'),
                        'views': views
                    })

        # Convert to sorted list
        trending = sorted(
            all_articles.values(),
            key=lambda x: x['total_views'],
            reverse=True
        )

        top_trending = trending[:500]
        output_path = self.data_dir / 'wikipedia_trending.json'
        with open(output_path, 'w') as f:
            json.dump(top_trending, f, indent=2)

        print(f"   ✓ Saved {len(top_trending)} trending articles to {output_path}")
        return top_trending

    def analyze_event_articles(self):
        """
        Analyze pageviews for major event-related articles.

        This tracks articles that typically spike during major events.
        """
        print("\n🌍 Analyzing Event-Related Articles...")

        # Articles that typically correlate with major events
        event_articles = [
            'Donald_Trump',
            'Joe_Biden',
            'Climate_change',
            'COVID-19_pandemic',
            'Artificial_intelligence',
            'Russia',
            'Israel',
            'Taylor_Swift',
            'Elon_Musk',
            'United_States'
        ]

        # Fetch last 90 days of data
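        # End at yesterday: daily counts are aggregated after the day closes
        # (UTC), so the current day's data is generally not yet available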
        end_date = datetime.now() - timedelta(days=1)
        start_date = end_date - timedelta(days=90)

        start_str = start_date.strftime('%Y%m%d')
        end_str = end_date.strftime('%Y%m%d')

        all_trends = []

        for article in event_articles:
            pageviews = self.fetch_article_pageviews(article, start_str, end_str)

            if pageviews:
                # Calculate statistics
                views_series = [item['views'] for item in pageviews]
                avg_views = sum(views_series) / len(views_series) if views_series else 0
                max_views = max(views_series) if views_series else 0

                all_trends.append({
                    'article': article,
                    'avg_daily_views': round(avg_views),
                    'peak_views': max_views,
                    'total_views': sum(views_series),
                    'timeline': pageviews
                })

        output_path = self.data_dir / 'wikipedia_event_articles.json'
        with open(output_path, 'w') as f:
            json.dump(all_trends, f, indent=2)

        print(f"   ✓ Saved event article analysis to {output_path}")
        return all_trends

    def save_metadata(self, trending_count, event_count, date_range):
        """Save dataset metadata."""
        metadata = {
            'dataset_name': 'Wikipedia Pageviews Trends',
            'last_updated': datetime.now().strftime('%Y-%m-%d'),
            'source': 'Wikimedia REST API (Pageviews)',
            'api_url': 'https://wikimedia.org/api/rest_v1',
            'record_count': {
                'trending_articles': trending_count,
                'event_articles': event_count
            },
            'date_range': date_range,
            'fields': {
                'article': 'Wikipedia article title',
                'total_views': 'Total pageviews over analysis period',
                'days_in_top_100': 'Number of days article appeared in top 100',
                'peak_views': 'Maximum daily pageviews',
                'avg_daily_views': 'Average daily pageviews',
                'timeline': 'Array of daily pageview counts with dates'
            },
            'notes': (
                'Wikipedia pageview data from the Wikimedia REST API. '
                'Data is available from July 2015 onwards. '
                'Trending articles are ranked by aggregated views over the last 30 days. '
                'Event articles track major topics that spike during news events.'
            ),
            'api_key_required': False,
            'rate_limits': '5000 requests per hour per IP'
        }

        meta_path = self.data_dir / 'wikipedia_pageviews_metadata.json'
        with open(meta_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        print(f"\n✓ Metadata saved to {meta_path}")


def main():
    print("=" * 60)
    print("WIKIPEDIA PAGEVIEWS DATA FETCHER")
    print("=" * 60)

    fetcher = WikipediaPageviewsFetcher()

    # Fetch top articles from yesterday (cached to disk as a side effect)
    yesterday = datetime.now() - timedelta(days=1)
    fetcher.fetch_top_articles_by_day(
        year=yesterday.year,
        month=yesterday.month,
        day=yesterday.day,
        limit=1000
    )

    # Fetch trending articles (last 30 days)
    trending = fetcher.fetch_trending_articles(days_back=30)

    # Analyze event-related articles
    event_articles = fetcher.analyze_event_articles()

    # Save metadata
    if trending and event_articles:
        fetcher.save_metadata(
            trending_count=len(trending),
            event_count=len(event_articles),
            date_range='Last 90 days'
        )

    print("\n" + "=" * 60)
    print("✓ Wikipedia pageviews fetching complete")
    print("=" * 60)


if __name__ == "__main__":
    main()
