# Bjorn/data_consolidator.py

"""
data_consolidator.py - Data Consolidation Engine for Deep Learning
═══════════════════════════════════════════════════════════════════════════

Purpose:
    Consolidate logged features into training-ready datasets.
    Prepare data exports for deep learning on external PC.

Features:
    - Aggregate features across time windows
    - Compute statistical features
    - Create feature vectors for neural networks
    - Export in formats ready for TensorFlow/PyTorch
    - Incremental consolidation (low memory footprint)

Author: Bjorn Team
Version: 2.0.0
"""

import json
import csv
import time
import gzip
import heapq
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
from logger import Logger

# Module-level logger shared by every method of DataConsolidator.
# NOTE(review): level=20 presumably means INFO (as in stdlib logging) — confirm against logger.Logger.
logger = Logger(name="data_consolidator.py", level=20)

# `requests` is an optional dependency: when it is not installed the name is
# bound to None so that importing this module still succeeds.
try:
    import requests
except ImportError:
    requests = None


class DataConsolidator:
    """
    Consolidates raw feature logs into training datasets.
    Optimized for Raspberry Pi Zero - processes in batches.
    """

    def __init__(self, shared_data, export_dir: Optional[str] = None):
        """
        Initialize data consolidator

        Args:
            shared_data: SharedData instance. Must expose ``db`` and, when
                ``export_dir`` is not given, either ``ml_exports_dir`` or
                ``data_dir``.
            export_dir: Directory for export files. When None, falls back to
                ``shared_data.ml_exports_dir`` and then to
                ``<shared_data.data_dir>/ml_exports``.
        """
        self.shared_data = shared_data
        self.db = shared_data.db

        if export_dir is not None:
            self.export_dir = Path(export_dir)
        else:
            # Resolve the fallback lazily: ``data_dir`` is only read when
            # ``ml_exports_dir`` is absent or empty, so a shared_data object
            # that only defines ``ml_exports_dir`` no longer raises.
            configured_dir = getattr(shared_data, 'ml_exports_dir', None)
            if configured_dir:
                self.export_dir = Path(configured_dir)
            else:
                self.export_dir = Path(shared_data.data_dir) / "ml_exports"

        self.export_dir.mkdir(parents=True, exist_ok=True)
        # Server health state consumed by orchestrator fallback logic.
        self.last_server_attempted = False
        self.last_server_contact_ok = None
        # Upload backoff bookkeeping: monotonic deadline and current delay.
        self._upload_backoff_until = 0.0
        self._upload_backoff_current_s = 0.0

        logger.info(f"DataConsolidator initialized, exports: {self.export_dir}")

    def _set_server_contact_state(self, attempted: bool, ok: Optional[bool]) -> None:
        """
        Record the outcome of the latest server contact.

        Args:
            attempted: Whether a request to the server was actually started.
            ok: Result of that request; ignored (stored as None) when no
                attempt was made.
        """
        self.last_server_attempted = bool(attempted)
        if attempted:
            self.last_server_contact_ok = ok
        else:
            # No attempt means there is no result to report.
            self.last_server_contact_ok = None

    def _apply_upload_backoff(self, base_backoff_s: int, max_backoff_s: int = 3600) -> int:
        """
        Advance the exponential upload retry backoff by one step.

        The delay starts at the base value and doubles on every call
        (base -> base*2 -> base*4 ...), never exceeding max_backoff_s.
        The base is floored at 10 seconds and the cap is never below the base.

        Returns:
            The delay (seconds) applied for the next retry window.
        """
        floor_s = max(10, int(base_backoff_s))
        ceiling_s = max(floor_s, int(max_backoff_s))
        previous_s = float(getattr(self, "_upload_backoff_current_s", 0.0) or 0.0)

        # First failure uses the base delay; later failures double the last one.
        next_delay = (
            floor_s
            if previous_s <= 0
            else min(ceiling_s, max(floor_s, int(previous_s * 2)))
        )

        self._upload_backoff_current_s = float(next_delay)
        self._upload_backoff_until = time.monotonic() + next_delay
        return int(next_delay)

    # ═══════════════════════════════════════════════════════════════════════
    # CONSOLIDATION ENGINE
    # ═══════════════════════════════════════════════════════════════════════

    def consolidate_features(
        self,
        batch_size: Optional[int] = None,
        max_batches: Optional[int] = None
    ) -> Dict[str, int]:
        """
        Consolidate raw features into aggregated feature vectors.
        Processes unconsolidated records in batches.

        Args:
            batch_size: Records fetched per batch. Defaults to
                ``shared_data.ai_batch_size`` (100 when unset); always
                clamped to the range 1..5000.
            max_batches: Stop after this many batches. None or 0 means
                "no limit".

        Returns:
            Counters: 'records_processed' and 'records_aggregated' (records
            successfully folded into an aggregate row), 'batches_completed',
            and 'errors'. Failures are counted, never raised.
        """
        if batch_size is None:
            batch_size = int(getattr(self.shared_data, "ai_batch_size", 100))
        batch_size = max(1, min(int(batch_size), 5000))
        stats = {
            'records_processed': 0,
            'records_aggregated': 0,
            'batches_completed': 0,
            'errors': 0
        }
        # SQLite builds older than 3.32 allow only 999 bound variables per
        # statement, so the "mark as consolidated" UPDATE is issued in chunks.
        id_chunk_size = 500

        try:
            # Get unconsolidated records
            unconsolidated = self.db.query("""
                SELECT COUNT(*) as cnt
                FROM ml_features
                WHERE consolidated=0
            """)[0]['cnt']

            if unconsolidated == 0:
                logger.info("No unconsolidated features to process")
                return stats

            logger.info(f"Consolidating {unconsolidated} feature records...")

            batch_count = 0
            while True:
                if max_batches and batch_count >= max_batches:
                    break

                # Fetch batch (batch_size is a clamped int, safe to inline)
                batch = self.db.query(f"""
                    SELECT * FROM ml_features
                    WHERE consolidated=0
                    ORDER BY timestamp
                    LIMIT {batch_size}
                """)

                if not batch:
                    break

                # Process batch
                for record in batch:
                    try:
                        self._consolidate_single_record(record)
                        stats['records_processed'] += 1
                        stats['records_aggregated'] += 1
                    except Exception as e:
                        logger.error(f"Error consolidating record {record['id']}: {e}")
                        stats['errors'] += 1

                # Mark the whole batch as consolidated. Failed records are
                # marked too; otherwise the same rows would be fetched again
                # on the next loop iteration.
                record_ids = [r['id'] for r in batch]
                for start in range(0, len(record_ids), id_chunk_size):
                    chunk = record_ids[start:start + id_chunk_size]
                    placeholders = ','.join('?' * len(chunk))
                    self.db.execute(f"""
                        UPDATE ml_features
                        SET consolidated=1
                        WHERE id IN ({placeholders})
                    """, chunk)

                stats['batches_completed'] += 1
                batch_count += 1

                # Progress log
                if batch_count % 10 == 0:
                    logger.info(
                        f"Consolidation progress: {stats['records_processed']} records, "
                        f"{stats['batches_completed']} batches"
                    )

            logger.success(
                f"Consolidation complete: {stats['records_processed']} records processed, "
                f"{stats['errors']} errors"
            )

        except Exception as e:
            logger.error(f"Consolidation failed: {e}")
            stats['errors'] += 1

        return stats

    def _consolidate_single_record(self, record: Dict[str, Any]):
        """
        Process a single feature record into aggregated form.
        Computes statistical features and feature vectors.

        Args:
            record: One ``ml_features`` row as a dict. The four ``*_features``
                columns hold JSON text; NULL or empty columns are treated as
                an empty feature set.

        Raises:
            Exception: Any failure (malformed JSON, missing column, DB error)
                is logged and re-raised so the caller can count it.
        """
        def _load_features(column: str) -> Dict[str, Any]:
            # ``record.get(column, '{}')`` would still return None for a NULL
            # column and make json.loads raise TypeError, so check the value.
            raw = record.get(column)
            if raw is None or raw == '':
                return {}
            return json.loads(raw)

        try:
            # Parse JSON fields once — reused by _build_feature_vector to avoid double-parsing
            host_features = _load_features('host_features')
            network_features = _load_features('network_features')
            temporal_features = _load_features('temporal_features')
            action_features = _load_features('action_features')

            # Combine all features (later groups win on key collisions)
            all_features = {
                **host_features,
                **network_features,
                **temporal_features,
                **action_features
            }

            # Build numerical feature vector — pass already-parsed dicts to avoid re-parsing
            feature_vector = self._build_feature_vector(
                host_features, network_features, temporal_features, action_features
            )

            # Determine time window; unparseable or unexpected timestamps
            # fall back to the current time.
            raw_ts = record['timestamp']
            if isinstance(raw_ts, str):
                try:
                    timestamp = datetime.fromisoformat(raw_ts)
                except ValueError:
                    timestamp = datetime.now()
            elif isinstance(raw_ts, datetime):
                timestamp = raw_ts
            else:
                timestamp = datetime.now()

            # Truncate to the start of the hour: this is the aggregation key.
            hourly_window = timestamp.replace(minute=0, second=0, microsecond=0).isoformat()

            # Update or insert aggregated record
            self._update_aggregated_features(
                mac_address=record['mac_address'],
                time_window='hourly',
                timestamp=hourly_window,
                action_name=record['action_name'],
                success=record['success'],
                duration=record['duration_seconds'],
                reward=record['reward'],
                feature_vector=feature_vector,
                all_features=all_features
            )

        except Exception as e:
            logger.error(f"Error consolidating single record: {e}")
            raise

    def _build_feature_vector(
        self,
        host_features: Dict[str, Any],
        network_features: Dict[str, Any],
        temporal_features: Dict[str, Any],
        action_features: Dict[str, Any],
    ) -> Dict[str, float]:
        """
        Turn the four parsed feature groups into one named feature dictionary.

        The groups arrive already decoded, so no JSON parsing happens here;
        the work is delegated to ai_utils.extract_neural_features_dict.
        """
        # Imported at call time, as in the rest of this class's ai_utils usage.
        from ai_utils import extract_neural_features_dict as build_named_features

        feature_groups = {
            "host_features": host_features,
            "network_features": network_features,
            "temporal_features": temporal_features,
            "action_features": action_features,
        }
        return build_named_features(**feature_groups)

    def _update_aggregated_features(
        self,
        mac_address: str,
        time_window: str,
        timestamp: str,
        action_name: str,
        success: int,
        duration: float,
        reward: float,
        feature_vector: Dict[str, float],
        all_features: Dict[str, Any]
    ):
        """
        Update or insert aggregated feature record.
        Accumulates statistics over the time window.

        The aggregate row is keyed by (mac_address, time_window, computed_at).
        On update, success rate, average duration and every named feature are
        kept as running means; total_reward is a running sum.

        Args:
            mac_address: Host key of the aggregate row.
            time_window: Window label (e.g. 'hourly').
            timestamp: Window start, stored in ``computed_at``.
            action_name: Accepted for interface compatibility; not stored.
            success: 1/0 outcome of the action.
            duration: Action duration in seconds.
            reward: Reward earned by the action.
            feature_vector: Named numeric features of the new record.
            all_features: Accepted for interface compatibility; not stored.

        Raises:
            Exception: Database errors are logged and re-raised.
        """
        try:
            # Check if record exists
            existing = self.db.query("""
                SELECT * FROM ml_features_aggregated
                WHERE mac_address=? AND time_window=? AND computed_at=?
            """, (mac_address, time_window, timestamp))

            if existing:
                # Update existing record
                old = existing[0]
                old_total = old['total_actions']
                new_total = old_total + 1

                # Decode the stored vector. A NULL, corrupt or legacy
                # list-shaped value is treated as "no previous features"
                # instead of failing the whole record.
                raw_vector = old['feature_vector']
                try:
                    old_vector = json.loads(raw_vector) if raw_vector else {}
                except (TypeError, ValueError) as e:
                    logger.debug(f"Ignoring unreadable stored feature vector: {e}")
                    old_vector = {}
                if not isinstance(old_vector, dict):  # Migration handle
                    old_vector = {}

                # Merge feature vectors (running mean of each named feature;
                # a feature missing on one side counts as 0.0)
                merged_vector = {}
                for k in set(old_vector) | set(feature_vector):
                    v_old = old_vector.get(k, 0.0)
                    v_new = feature_vector.get(k, 0.0)
                    merged_vector[k] = (v_old * old_total + v_new) / new_total

                self.db.execute("""
                    UPDATE ml_features_aggregated
                    SET total_actions=total_actions+1,
                        success_rate=(success_rate*total_actions + ?)/(total_actions+1),
                        avg_duration=(avg_duration*total_actions + ?)/(total_actions+1),
                        total_reward=total_reward + ?,
                        feature_vector=?
                    WHERE mac_address=? AND time_window=? AND computed_at=?
                """, (
                    success,
                    duration,
                    reward,
                    json.dumps(merged_vector),
                    mac_address,
                    time_window,
                    timestamp
                ))
            else:
                # Insert new record
                self.db.execute("""
                    INSERT INTO ml_features_aggregated (
                        mac_address, time_window, computed_at,
                        total_actions, success_rate, avg_duration, total_reward,
                        feature_vector
                    ) VALUES (?, ?, ?, 1, ?, ?, ?, ?)
                """, (
                    mac_address,
                    time_window,
                    timestamp,
                    float(success),
                    duration,
                    reward,
                    json.dumps(feature_vector)
                ))

        except Exception as e:
            logger.error(f"Error updating aggregated features: {e}")
            raise

    # ═══════════════════════════════════════════════════════════════════════
    # EXPORT FUNCTIONS
    # ═══════════════════════════════════════════════════════════════════════

    def export_for_training(
        self,
        format: str = 'csv',
        compress: bool = True,
        max_records: Optional[int] = None
    ) -> Tuple[str, int]:
        """
        Export consolidated features for deep learning training.

        Args:
            format: 'csv', 'jsonl', or 'parquet'. When 'parquet' is requested
                but pyarrow is not installed, a gzip-compressed CSV is written
                instead and its real path is returned.
            compress: Whether to gzip the output (ignored for parquet)
            max_records: Maximum rows to export. None uses
                ``shared_data.ai_export_max_records`` (1000 when unset). The
                value is always clamped to the range 100..20000.

        Returns:
            Tuple of (file_path, record_count); ("", 0) when nothing is
            pending export.

        Raises:
            ValueError: Unsupported ``format``.
            Exception: Any export or database failure is logged and re-raised.
        """
        try:
            if max_records is None:
                max_records = int(getattr(self.shared_data, "ai_export_max_records", 1000))
            max_records = max(100, min(int(max_records), 20000))

            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

            # Fetch data
            # NOTE(review): the join matches on mac_address only, so a record
            # appears once per aggregate row of its host — confirm whether it
            # should also match the record's time window.
            limit_clause = f"LIMIT {max_records}"
            records = self.db.query(f"""
                SELECT
                    mf.*,
                    mfa.feature_vector,
                    mfa.success_rate as aggregated_success_rate,
                    mfa.total_actions as aggregated_total_actions
                FROM ml_features mf
                LEFT JOIN ml_features_aggregated mfa
                    ON mf.mac_address = mfa.mac_address
                WHERE mf.consolidated=1 AND mf.export_batch_id IS NULL
                ORDER BY mf.timestamp DESC
                {limit_clause}
            """)

            if not records:
                logger.warning("No consolidated records to export")
                return "", 0

            # Decide the real output format up front so the filename that is
            # recorded and returned always matches the file written to disk.
            if format == 'parquet':
                try:
                    import pyarrow.parquet  # noqa: F401 - availability probe
                except ImportError:
                    logger.error("Parquet export requires pyarrow. Falling back to CSV.")
                    format = 'csv'
                    compress = True

            # Generate filename
            base_filename = f"bjorn_training_{timestamp}.{format}"
            if compress and format != 'parquet':
                base_filename += '.gz'
            filepath = self.export_dir / base_filename

            # Extract IDs before export so we can free the records list early.
            # dict.fromkeys de-duplicates (the join may repeat ids) while
            # keeping order.
            record_ids = list(dict.fromkeys(r['id'] for r in records))

            # Export based on format
            if format == 'csv':
                count = self._export_csv(records, filepath, compress)
            elif format == 'jsonl':
                count = self._export_jsonl(records, filepath, compress)
            elif format == 'parquet':
                count = self._export_parquet(records, filepath)
            else:
                raise ValueError(f"Unsupported format: {format}")

            # Free the large records list immediately after export — record_ids is all we still need
            del records

            # Create export batch record
            batch_id = self._create_export_batch(filepath, count)

            # Update records with batch ID. Chunked because SQLite builds
            # older than 3.32 accept at most 999 bound variables per statement.
            id_chunk_size = 500
            for start in range(0, len(record_ids), id_chunk_size):
                chunk = record_ids[start:start + id_chunk_size]
                placeholders = ','.join('?' * len(chunk))
                self.db.execute(f"""
                    UPDATE ml_features
                    SET export_batch_id=?
                    WHERE id IN ({placeholders})
                """, [batch_id] + chunk)
            del record_ids

            logger.success(
                f"Exported {count} records to {filepath} "
                f"(batch_id={batch_id})"
            )

            return str(filepath), count

        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise

    def _export_csv(
        self,
        records: List[Dict],
        filepath: Path,
        compress: bool
    ) -> int:
        """
        Write records to a CSV file, one flattened row per record.

        Each row holds the core columns, every scalar entry of the four JSON
        feature columns (prefixed with the column name) and the entries of the
        feature vector ('feat_<name>' for dicts, 'feature_<index>' for lists).
        The header is the sorted union of all row keys.

        Args:
            records: Rows to export.
            filepath: Destination file.
            compress: Write through gzip when True.

        Returns:
            Number of rows written.
        """
        core_columns = (
            'timestamp', 'mac_address', 'ip_address', 'action_name',
            'success', 'duration_seconds', 'reward',
        )
        json_columns = ('host_features', 'network_features', 'temporal_features', 'action_features')

        rows = []
        columns = set()
        for record in records:
            row = {name: record[name] for name in core_columns}

            # Flatten scalar values of each JSON feature column
            for field in json_columns:
                try:
                    decoded = json.loads(record.get(field, '{}'))
                    for key, value in decoded.items():
                        if isinstance(value, (int, float, bool, str)):
                            row[f"{field}_{key}"] = value
                except Exception as e:
                    logger.debug(f"Skip bad JSON in {field}: {e}")

            # Flatten the feature vector (named dict or legacy positional list)
            raw_vector = record.get('feature_vector')
            if raw_vector:
                try:
                    vector = json.loads(raw_vector)
                    if isinstance(vector, dict):
                        row.update((f'feat_{k}', v) for k, v in vector.items())
                    elif isinstance(vector, list):
                        row.update((f'feature_{i}', v) for i, v in enumerate(vector))
                except Exception as e:
                    logger.debug(f"Skip bad feature vector: {e}")

            rows.append(row)
            columns.update(row.keys())

        # Sorted header keeps the column order consistent
        header = sorted(columns)

        opener, mode = (gzip.open, 'wt') if compress else (open, 'w')
        with opener(filepath, mode, newline='', encoding='utf-8') as handle:
            if rows:
                writer = csv.DictWriter(handle, fieldnames=header)
                writer.writeheader()
                writer.writerows(rows)

        return len(rows)

    def _export_jsonl(
        self,
        records: List[Dict],
        filepath: Path,
        compress: bool
    ) -> int:
        """
        Write records as JSON Lines, one JSON object per line.

        JSON text stored in the feature columns, 'raw_event' and
        'feature_vector' is decoded so the output nests real objects; values
        that cannot be decoded become empty objects.

        Args:
            records: Rows to export (left unmodified).
            filepath: Destination file.
            compress: Write through gzip when True.

        Returns:
            Number of records written.
        """
        json_columns = ('host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event')

        if compress:
            sink = gzip.open(filepath, 'wt', encoding='utf-8')
        else:
            sink = open(filepath, 'w', encoding='utf-8')

        with sink as out:
            for record in records:
                # Work on a shallow copy so the caller's rows stay untouched.
                row = dict(record)

                for column in json_columns:
                    try:
                        decoded = json.loads(row.get(column, '{}'))
                    except Exception:
                        decoded = {}
                    row[column] = decoded

                vector_text = row.get('feature_vector')
                if vector_text:
                    try:
                        row['feature_vector'] = json.loads(vector_text)
                    except Exception:
                        row['feature_vector'] = {}

                out.write(json.dumps(row) + '\n')

        return len(records)

    def _export_parquet(self, records: List[Dict], filepath: Path) -> int:
        """
        Export records as Parquet (requires pyarrow).

        JSON text columns are decoded before conversion. NULL, empty or
        malformed JSON becomes an empty object, matching the JSON Lines
        exporter.

        When pyarrow is not installed, a gzip-compressed CSV is written
        instead, next to the requested path with a '.csv.gz' suffix.

        Args:
            records: Rows to export.
            filepath: Destination '.parquet' file.

        Returns:
            Number of records written.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError:
            logger.error("Parquet export requires pyarrow. Falling back to CSV.")
            # The fallback is gzip-compressed, so name it accordingly.
            return self._export_csv(records, filepath.with_suffix('.csv.gz'), compress=True)

        def _decode(raw):
            # Tolerate NULL/empty/malformed JSON instead of aborting the export.
            if not raw:
                return {}
            try:
                return json.loads(raw)
            except (TypeError, ValueError):
                return {}

        json_columns = ('host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event')

        # Flatten records
        flattened = []
        for r in records:
            flat = dict(r)
            # Parse JSON fields
            for field in json_columns:
                flat[field] = _decode(r.get(field))

            if r.get('feature_vector'):
                flat['feature_vector'] = _decode(r['feature_vector'])

            flattened.append(flat)

        # Convert to Arrow table
        table = pa.Table.from_pylist(flattened)

        # Write parquet
        pq.write_table(table, filepath, compression='snappy')

        return len(records)

    def _create_export_batch(self, filepath: Path, count: int) -> int:
        """
        Register a finished export in ml_export_batches.

        Inserts one row with status 'exported' and returns the id reported by
        last_insert_rowid() immediately afterwards.
        """
        insert_args = (str(filepath), count)
        self.db.execute("""
            INSERT INTO ml_export_batches (file_path, record_count, status)
            VALUES (?, ?, 'exported')
        """, insert_args)

        # Read back the id of the row that was just inserted
        id_rows = self.db.query("SELECT last_insert_rowid() as id")
        return id_rows[0]['id']

    # ═══════════════════════════════════════════════════════════════════════
    # UTILITY METHODS
    # ═══════════════════════════════════════════════════════════════════════

    def get_export_stats(self) -> Dict[str, Any]:
        """
        Get statistics about exports.

        Returns:
            Dict with 'total_export_batches', 'total_records_exported',
            'last_export_time' (None when nothing was exported yet) and
            'pending_export_count'. An empty dict is returned when the
            database query fails.
        """
        try:
            batches = self.db.query("""
                SELECT COUNT(*) as total_batches,
                       SUM(record_count) as total_records,
                       MAX(created_at) as last_export
                FROM ml_export_batches
                WHERE status='exported'
            """)[0]

            pending = self.db.query("""
                SELECT COUNT(*) as cnt
                FROM ml_features
                WHERE consolidated=1 AND export_batch_id IS NULL
            """)[0]['cnt']

            # SUM() over zero rows yields NULL, so the key exists with a None
            # value and a .get() default would never apply; coerce to 0.
            return {
                'total_export_batches': batches.get('total_batches') or 0,
                'total_records_exported': batches.get('total_records') or 0,
                'last_export_time': batches.get('last_export'),
                'pending_export_count': pending
            }
        except Exception as e:
            logger.error(f"Error getting export stats: {e}")
            return {}

    def flush_pending_uploads(self, max_files: int = 3) -> int:
        """
        Try again to upload export files that are still waiting on disk.

        At most max_files of the oldest files in export_dir are attempted.
        The loop stops at the first failure that followed a real server
        attempt, so an unreachable server is not retried for every file.

        Returns:
            The number of successfully transferred files.
        """
        budget = int(max_files)
        if budget <= 0:
            return 0

        # Pending uploads are simply the export files still present on disk.
        transferred = 0
        for pending_path in self._list_pending_export_files(limit=budget):
            if self.upload_to_server(pending_path):
                transferred += 1
                continue
            # Stop early when server is unreachable to avoid repeated noise.
            if self.last_server_attempted and self.last_server_contact_ok is False:
                break
        return transferred

    def _list_pending_export_files(self, limit: int = 3) -> List[str]:
        """
        List the oldest 'bjorn_training_*' files found in export_dir.

        The backlog is defined by what is on disk: every matching regular
        file counts as pending. Files are ordered by modification time,
        oldest first; a file whose mtime cannot be read sorts last.

        Args:
            limit: Maximum number of paths to return; values <= 0 yield [].

        Returns:
            Up to ``limit`` file paths as strings, or [] on any filesystem error.
        """
        wanted = int(limit)
        if wanted <= 0:
            return []

        def _mtime_or_inf(candidate: Path) -> float:
            try:
                return candidate.stat().st_mtime
            except Exception:
                return float("inf")

        try:
            root = Path(self.export_dir)
            if not root.exists():
                return []

            # heapq keeps only the N oldest entries in memory while scanning.
            candidates = (entry for entry in root.glob("bjorn_training_*") if entry.is_file())
            return [str(entry) for entry in heapq.nsmallest(wanted, candidates, key=_mtime_or_inf)]
        except Exception:
            return []

    def _mark_batch_status(self, filepath: str, status: str, notes: str = "") -> None:
        """
        Update ml_export_batches status for a given file path (best-effort).

        Args:
            filepath: Value matched against the ``file_path`` column.
            status: New status to store.
            notes: Optional free-text note; None is stored as an empty string.

        Database errors never propagate: they are logged at debug level so a
        bookkeeping failure cannot break the upload flow.
        """
        try:
            self.db.execute(
                """
                UPDATE ml_export_batches
                SET status=?, notes=?
                WHERE file_path=?
                """,
                (status, notes or "", str(filepath)),
            )
        except Exception as e:
            # Deliberately non-fatal, but leave a trace for troubleshooting.
            logger.debug(f"Could not mark export batch {filepath} as {status}: {e}")

    def _safe_delete_uploaded_export(self, filepath: Path) -> None:
        """
        Delete a successfully-uploaded export file if configured to do so.

        Nothing is deleted when the config flag
        ``ai_delete_export_after_upload`` is falsy (it defaults to True) or
        when the resolved path is not located under export_dir.

        Args:
            filepath: File to delete; a Path or a plain string path.

        Never raises: failures are logged at debug level.
        """
        try:
            if not bool(self.shared_data.config.get("ai_delete_export_after_upload", True)):
                return

            # Accept str as well as Path so callers holding a string path
            # do not silently skip the deletion.
            fp = Path(filepath).resolve()
            base = Path(self.export_dir).resolve()
            # Safety: only delete files under export_dir.
            if base not in fp.parents:
                return

            try:
                fp.unlink()
            except FileNotFoundError:
                # Already gone: nothing left to do.
                pass
        except Exception as e:
            logger.debug(f"Could not delete uploaded export {filepath}: {e}")

    def upload_to_server(self, filepath: str) -> bool:
        """
        Upload export file to AI Validation Server.

        The upload is skipped (returning False without contacting the server)
        when ``requests`` is not installed, ``ai_server_url`` is not
        configured, a retry backoff window is still active, or the file does
        not exist. ``last_server_attempted`` / ``last_server_contact_ok`` are
        updated so callers can tell a skipped upload from a failed one.

        Args:
            filepath: Path to the file to upload

        Returns:
            True if upload successful (HTTP 200), False otherwise. Never
            raises for network or I/O errors; those trigger the exponential
            backoff instead.
        """
        self._set_server_contact_state(False, None)
        try:
            import requests
        except ImportError:
            requests = None

        if requests is None:
            logger.info_throttled(
                "AI upload skipped: requests not installed",
                key="ai_upload_no_requests",
                interval_s=600.0,
            )
            return False

        url = self.shared_data.config.get("ai_server_url")
        if not url:
            logger.info_throttled(
                "AI upload skipped: ai_server_url not configured",
                key="ai_upload_no_url",
                interval_s=600.0,
            )
            return False

        # A malformed config value must not crash the upload path.
        try:
            backoff_s = max(10, int(self.shared_data.config.get("ai_upload_retry_backoff_s", 120)))
        except (TypeError, ValueError):
            backoff_s = 120
        max_backoff_s = 3600
        now_mono = time.monotonic()
        if now_mono < self._upload_backoff_until:
            remaining = int(self._upload_backoff_until - now_mono)
            logger.debug(f"AI upload backoff active ({remaining}s remaining)")
            logger.info_throttled(
                "AI upload deferred: backoff active",
                key="ai_upload_backoff_active",
                interval_s=180.0,
            )
            return False

        try:
            filepath = Path(filepath)

            if not filepath.exists():
                logger.warning(f"AI upload skipped: file not found: {filepath}")
                self._mark_batch_status(str(filepath), "missing", "file not found")
                return False

            # Get MAC address for unique identification
            try:
                from ai_utils import get_system_mac
                mac = get_system_mac()
            except ImportError:
                mac = "unknown"

            # Strip trailing slashes so a configured "http://host/" does not
            # turn into "http://host//upload".
            endpoint = f"{str(url).rstrip('/')}/upload"

            logger.debug(f"Uploading {filepath.name} to AI Server ({url}) unique_id={mac}")
            self._set_server_contact_state(True, None)

            with open(filepath, 'rb') as f:
                files = {'file': f}
                # Send MAC as query param
                # Server expects ?mac_addr=...
                params = {'mac_addr': mac}

                # Short timeout to avoid blocking
                response = requests.post(endpoint, files=files, params=params, timeout=10)

            if response.status_code == 200:
                self._set_server_contact_state(True, True)
                # Success resets the backoff window.
                self._upload_backoff_until = 0.0
                self._upload_backoff_current_s = 0.0
                logger.success(f"Uploaded {filepath.name} successfully")
                self._mark_batch_status(str(filepath), "transferred", "uploaded")
                self._safe_delete_uploaded_export(filepath)
                return True
            else:
                self._set_server_contact_state(True, False)
                next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
                logger.debug(
                    f"AI upload HTTP failure for {filepath.name}: status={response.status_code}, "
                    f"next retry in {next_retry_s}s"
                )
                logger.info_throttled(
                    f"AI upload deferred (HTTP {response.status_code})",
                    key=f"ai_upload_http_{response.status_code}",
                    interval_s=300.0,
                )
                return False

        except Exception as e:
            self._set_server_contact_state(True, False)
            next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
            logger.debug(f"AI upload exception for {filepath}: {e} (next retry in {next_retry_s}s)")
            logger.info_throttled(
                "AI upload deferred: server unreachable (retry later)",
                key="ai_upload_exception",
                interval_s=300.0,
            )
            return False

    def cleanup_old_exports(self, days: int = 30):
        """
        Delete export files older than N days.

        Removes the files referenced by ml_export_batches rows whose
        ``created_at`` is older than the cutoff, then deletes those rows.
        A file that cannot be removed is logged and skipped so one bad entry
        does not block the rest of the cleanup. Errors are logged, never raised.

        Args:
            days: Age threshold in days.
        """
        try:
            cutoff = datetime.now() - timedelta(days=days)
            # Both sides go through SQLite's datetime() so the comparison is
            # chronological whether created_at uses ' ' or 'T' as separator
            # (a plain string comparison orders ' ' before 'T').
            # NOTE(review): cutoff is local time; created_at is presumably UTC
            # if it defaults to CURRENT_TIMESTAMP — confirm against the schema.
            cutoff_param = cutoff.isoformat(sep=' ', timespec='seconds')

            old_batches = self.db.query("""
                SELECT file_path FROM ml_export_batches
                WHERE datetime(created_at) < datetime(?)
            """, (cutoff_param,))

            deleted = 0
            for batch in old_batches:
                raw_path = batch['file_path']
                if not raw_path:
                    continue
                filepath = Path(raw_path)
                try:
                    filepath.unlink()
                    deleted += 1
                except FileNotFoundError:
                    # Already removed (e.g. deleted after a successful upload).
                    continue
                except OSError as e:
                    logger.warning(f"Could not delete old export {filepath}: {e}")

            # Clean up database records
            self.db.execute("""
                DELETE FROM ml_export_batches
                WHERE datetime(created_at) < datetime(?)
            """, (cutoff_param,))

            logger.info(f"Cleaned up {deleted} old export files")

        except Exception as e:
            logger.error(f"Cleanup failed: {e}")


# ═══════════════════════════════════════════════════════════════════════════
# END OF FILE
# ═══════════════════════════════════════════════════════════════════════════