#!/usr/bin/env python3
"""
Parking Tickets Data Fetcher
Fetches NYC parking violation data from NYC Open Data.

Datasets:
- NYC Parking Violations (millions of tickets annually)
- Violation types, locations, times
- Sample dataset for visualization

Usage:
    python fetch_parking_tickets.py
    USE_CACHED_DATA=true python fetch_parking_tickets.py

Note: No API key required for NYC Open Data Socrata API.
"""

import os
import json
import requests
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Load environment variables from a .env file, if present (no-op otherwise)
load_dotenv()

# Configuration
# BASE_DIR resolves three levels up from this file — presumably the repo root,
# given CACHE_DIR re-enters tools/fetchers below; verify if the file moves.
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / 'data' / 'urban'  # analysis artifacts (JSON/CSV) land here
CACHE_DIR = BASE_DIR / 'tools' / 'fetchers' / 'cache' / 'parking'  # raw API response cache

# Ensure directories exist
DATA_DIR.mkdir(exist_ok=True, parents=True)
CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Use cached data flag
# When USE_CACHED_DATA=true, a same-day cache file younger than the max age
# is reused instead of hitting the API again.
USE_CACHED = os.getenv('USE_CACHED_DATA', 'false').lower() == 'true'

# NYC Open Data Socrata API
# Current fiscal year parking violations dataset
NYC_PARKING_URL = "https://data.cityofnewyork.us/resource/pvqr-7yc4.json"


class ParkingTicketsFetcher:
    """Fetches NYC parking violations data."""

    def __init__(self):
        self.cache_dir = CACHE_DIR
        self.data_dir = DATA_DIR

    def _get_cache_path(self, data_type: str) -> Path:
        """Generate cache path with date stamp."""
        date_str = datetime.now().strftime('%Y%m%d')
        return self.cache_dir / f"{data_type}_{date_str}.json"

    def _is_cache_valid(self, cache_path: Path, max_age_hours: int = 24) -> bool:
        """Check if cache exists and is recent enough (default: 24 hours)."""
        if not cache_path.exists():
            return False
        file_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
        return file_age < timedelta(hours=max_age_hours)

    def _fetch_with_cache(self, data_type: str, fetch_func, max_age_hours: int = 24):
        """Fetch data with caching logic."""
        cache_path = self._get_cache_path(data_type)

        if USE_CACHED and self._is_cache_valid(cache_path, max_age_hours):
            print(f"   Using cached data from {cache_path}")
            with open(cache_path) as f:
                return json.load(f)
        else:
            print(f"   Fetching fresh data...")
            data = fetch_func()
            with open(cache_path, 'w') as f:
                json.dump(data, f, indent=2)
            print(f"   Cached to {cache_path}")
            return data

    def fetch_recent_violations(self, limit=10000):
        """
        Fetch recent parking violations from NYC Open Data.

        Args:
            limit: Number of violations to fetch (default: 10,000)
                  Note: Full dataset has millions of records
        """
        print(f"\n🚗 Fetching Recent NYC Parking Violations (limit: {limit:,})...")

        def fetch():
            try:
                # Use Socrata API with limit
                params = {
                    '$limit': limit,
                    '$order': 'issue_date DESC'
                }

                response = requests.get(NYC_PARKING_URL, params=params, timeout=120)
                response.raise_for_status()

                data = response.json()
                print(f"   ✓ Fetched {len(data):,} violation records")
                return data

            except Exception as e:
                print(f"   ✗ Error fetching parking violations: {e}")
                return []

        return self._fetch_with_cache('parking_violations_sample', fetch)

    def analyze_violation_patterns(self, violations):
        """
        Analyze violation patterns from raw data.

        Args:
            violations: List of violation records
        """
        print("\n📊 Analyzing Violation Patterns...")

        if not violations:
            print("   ✗ No data to analyze")
            return None

        df = pd.DataFrame(violations)

        # Count by violation code
        if 'violation_code' in df.columns:
            top_violations = (
                df.groupby('violation_code')
                .size()
                .reset_index(name='count')
                .sort_values('count', ascending=False)
                .head(20)
            )

            # Add violation description if available
            if 'violation_description' in df.columns:
                desc_map = df.drop_duplicates('violation_code')[
                    ['violation_code', 'violation_description']
                ].set_index('violation_code')['violation_description'].to_dict()

                top_violations['description'] = top_violations['violation_code'].map(desc_map)

            violations_output = self.data_dir / 'top_violation_types.json'
            top_violations.to_json(violations_output, orient='records', indent=2)
            print(f"   ✓ Saved top violation types to {violations_output}")

        # Count by location (street)
        if 'street_name' in df.columns:
            top_streets = (
                df.groupby('street_name')
                .size()
                .reset_index(name='count')
                .sort_values('count', ascending=False)
                .head(50)
            )

            streets_output = self.data_dir / 'worst_streets_parking.json'
            top_streets.to_json(streets_output, orient='records', indent=2)
            print(f"   ✓ Saved worst parking streets to {streets_output}")

        # Analyze by time of day (if violation_time available)
        if 'violation_time' in df.columns:
            # Parse time (format: HHMMx where x is A/P for AM/PM)
            df['hour'] = df['violation_time'].astype(str).str[:2].apply(
                lambda x: int(x) if x.isdigit() else 0
            )

            hour_counts = (
                df.groupby('hour')
                .size()
                .reset_index(name='count')
                .sort_values('hour')
            )

            time_output = self.data_dir / 'violations_by_hour.json'
            hour_counts.to_json(time_output, orient='records', indent=2)
            print(f"   ✓ Saved violations by hour to {time_output}")

        # Analyze by vehicle make
        if 'vehicle_make' in df.columns:
            top_makes = (
                df.groupby('vehicle_make')
                .size()
                .reset_index(name='count')
                .sort_values('count', ascending=False)
                .head(20)
            )

            makes_output = self.data_dir / 'violations_by_vehicle.json'
            top_makes.to_json(makes_output, orient='records', indent=2)
            print(f"   ✓ Saved violations by vehicle make to {makes_output}")

        return {
            'top_violations': top_violations.to_dict('records') if 'violation_code' in df.columns else [],
            'top_streets': top_streets.to_dict('records') if 'street_name' in df.columns else [],
            'hour_distribution': hour_counts.to_dict('records') if 'violation_time' in df.columns else [],
            'top_makes': top_makes.to_dict('records') if 'vehicle_make' in df.columns else []
        }

    def export_sample_csv(self, violations):
        """
        Export sample violations to CSV for easy analysis.

        Args:
            violations: List of violation records
        """
        print("\n💾 Exporting Sample to CSV...")

        if not violations:
            print("   ✗ No data to export")
            return

        df = pd.DataFrame(violations)

        # Select key columns if available
        key_columns = [
            'summons_number', 'issue_date', 'violation_code', 'violation_description',
            'street_name', 'vehicle_make', 'vehicle_color', 'violation_time',
            'fine_amount', 'issuing_agency'
        ]

        available_columns = [col for col in key_columns if col in df.columns]

        if available_columns:
            df_export = df[available_columns]
        else:
            df_export = df

        output_path = self.data_dir / 'parking_violations_sample.csv'
        df_export.to_csv(output_path, index=False)

        print(f"   ✓ Saved {len(df_export):,} records to {output_path}")

    def save_metadata(self, record_count):
        """Save dataset metadata."""
        metadata = {
            'dataset_name': 'NYC Parking Violations',
            'last_updated': datetime.now().strftime('%Y-%m-%d'),
            'source': 'NYC Open Data - Department of Finance',
            'source_url': 'https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued/pvqr-7yc4',
            'api_endpoint': NYC_PARKING_URL,
            'record_count': record_count,
            'sample_size': 'First 10,000 recent violations',
            'fields': {
                'summons_number': 'Unique summons identifier',
                'issue_date': 'Date violation was issued',
                'violation_code': 'Numeric violation code',
                'violation_description': 'Human-readable violation description',
                'street_name': 'Street where violation occurred',
                'vehicle_make': 'Vehicle manufacturer',
                'vehicle_color': 'Vehicle color',
                'violation_time': 'Time of violation (HHMMA/P format)',
                'fine_amount': 'Fine amount in dollars',
                'issuing_agency': 'Agency that issued the ticket'
            },
            'notes': (
                'NYC parking violations from Department of Finance. '
                'Dataset contains millions of records. '
                'This is a sample of 10,000 recent violations for visualization. '
                'Full dataset available via NYC Open Data API with pagination.'
            ),
            'update_frequency': 'Daily',
            'api_key_required': False,
            'rate_limits': 'Socrata API: No authentication required for public data'
        }

        meta_path = self.data_dir / 'parking_violations_metadata.json'
        with open(meta_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        print(f"\n✓ Metadata saved to {meta_path}")


def main():
    """Entry point: fetch a 10,000-record sample, analyze it, export artifacts.

    Skips analysis/export entirely when the fetch returns no records.
    """
    print("=" * 60)
    print("NYC PARKING TICKETS DATA FETCHER")
    print("=" * 60)

    fetcher = ParkingTicketsFetcher()

    # Fetch recent violations (10,000 sample)
    violations = fetcher.fetch_recent_violations(limit=10000)

    if violations:
        # Analyze patterns (writes JSON artifacts as a side effect; the
        # returned summary is unused here, so it is not kept).
        fetcher.analyze_violation_patterns(violations)

        # Export sample CSV
        fetcher.export_sample_csv(violations)

        # Save metadata
        fetcher.save_metadata(record_count=len(violations))

    print("\n" + "=" * 60)
    print("✓ Parking tickets data fetching complete")
    print("=" * 60)


if __name__ == "__main__":
    main()
