mirror of
https://github.com/infinition/Bjorn.git
synced 2026-03-14 00:21:58 +00:00
Add RLUtils class for managing RL/AI dashboard endpoints
- Implemented methods for fetching AI stats, training history, and recent experiences. - Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling. - Included helper methods for querying the database and sending JSON responses. - Integrated model metadata extraction for visualization purposes.
This commit is contained in:
829
data_consolidator.py
Normal file
829
data_consolidator.py
Normal file
@@ -0,0 +1,829 @@
|
||||
"""
|
||||
data_consolidator.py - Data Consolidation Engine for Deep Learning
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
Purpose:
|
||||
Consolidate logged features into training-ready datasets.
|
||||
Prepare data exports for deep learning on external PC.
|
||||
|
||||
Features:
|
||||
- Aggregate features across time windows
|
||||
- Compute statistical features
|
||||
- Create feature vectors for neural networks
|
||||
- Export in formats ready for TensorFlow/PyTorch
|
||||
- Incremental consolidation (low memory footprint)
|
||||
|
||||
Author: Bjorn Team
|
||||
Version: 2.0.0
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
import gzip
|
||||
import heapq
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
from pathlib import Path
|
||||
from logger import Logger
|
||||
|
||||
logger = Logger(name="data_consolidator.py", level=20)
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
requests = None
|
||||
|
||||
|
||||
class DataConsolidator:
    """
    Consolidates raw feature logs into training datasets.
    Optimized for Raspberry Pi Zero - processes in batches.
    """

    def __init__(self, shared_data, export_dir: str = None):
        """
        Initialize the consolidator.

        Args:
            shared_data: SharedData instance (provides .db and path config)
            export_dir: Directory for export files; when None, falls back to
                shared_data.ml_exports_dir or <data_dir>/ml_exports
        """
        self.shared_data = shared_data
        self.db = shared_data.db

        if export_dir is not None:
            self.export_dir = Path(export_dir)
        else:
            # Default to shared_data paths (cross-platform).
            fallback = Path(shared_data.data_dir) / "ml_exports"
            self.export_dir = Path(getattr(shared_data, 'ml_exports_dir', fallback))

        self.export_dir.mkdir(parents=True, exist_ok=True)

        # Server health state consumed by orchestrator fallback logic.
        self.last_server_attempted = False
        self.last_server_contact_ok = None
        self._upload_backoff_until = 0.0
        self._upload_backoff_current_s = 0.0

        logger.info(f"DataConsolidator initialized, exports: {self.export_dir}")
||||
|
||||
def _set_server_contact_state(self, attempted: bool, ok: Optional[bool]) -> None:
|
||||
self.last_server_attempted = bool(attempted)
|
||||
self.last_server_contact_ok = ok if attempted else None
|
||||
|
||||
def _apply_upload_backoff(self, base_backoff_s: int, max_backoff_s: int = 3600) -> int:
|
||||
"""
|
||||
Exponential upload retry backoff:
|
||||
base -> base*2 -> base*4 ... capped at max_backoff_s.
|
||||
Returns the delay (seconds) applied for the next retry window.
|
||||
"""
|
||||
base = max(10, int(base_backoff_s))
|
||||
cap = max(base, int(max_backoff_s))
|
||||
prev = float(getattr(self, "_upload_backoff_current_s", 0.0) or 0.0)
|
||||
|
||||
if prev <= 0:
|
||||
delay = base
|
||||
else:
|
||||
delay = min(cap, max(base, int(prev * 2)))
|
||||
|
||||
self._upload_backoff_current_s = float(delay)
|
||||
self._upload_backoff_until = time.monotonic() + delay
|
||||
return int(delay)
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# CONSOLIDATION ENGINE
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
def consolidate_features(
|
||||
self,
|
||||
batch_size: int = None,
|
||||
max_batches: Optional[int] = None
|
||||
) -> Dict[str, int]:
|
||||
"""
|
||||
Consolidate raw features into aggregated feature vectors.
|
||||
Processes unconsolidated records in batches.
|
||||
"""
|
||||
if batch_size is None:
|
||||
batch_size = int(getattr(self.shared_data, "ai_batch_size", 100))
|
||||
batch_size = max(1, min(int(batch_size), 5000))
|
||||
stats = {
|
||||
'records_processed': 0,
|
||||
'records_aggregated': 0,
|
||||
'batches_completed': 0,
|
||||
'errors': 0
|
||||
}
|
||||
|
||||
try:
|
||||
# Get unconsolidated records
|
||||
unconsolidated = self.db.query("""
|
||||
SELECT COUNT(*) as cnt
|
||||
FROM ml_features
|
||||
WHERE consolidated=0
|
||||
""")[0]['cnt']
|
||||
|
||||
if unconsolidated == 0:
|
||||
logger.info("No unconsolidated features to process")
|
||||
return stats
|
||||
|
||||
logger.info(f"Consolidating {unconsolidated} feature records...")
|
||||
|
||||
batch_count = 0
|
||||
while True:
|
||||
if max_batches and batch_count >= max_batches:
|
||||
break
|
||||
|
||||
# Fetch batch
|
||||
batch = self.db.query(f"""
|
||||
SELECT * FROM ml_features
|
||||
WHERE consolidated=0
|
||||
ORDER BY timestamp
|
||||
LIMIT {batch_size}
|
||||
""")
|
||||
|
||||
if not batch:
|
||||
break
|
||||
|
||||
# Process batch
|
||||
for record in batch:
|
||||
try:
|
||||
self._consolidate_single_record(record)
|
||||
stats['records_processed'] += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error consolidating record {record['id']}: {e}")
|
||||
stats['errors'] += 1
|
||||
|
||||
# Mark as consolidated
|
||||
record_ids = [r['id'] for r in batch]
|
||||
placeholders = ','.join('?' * len(record_ids))
|
||||
self.db.execute(f"""
|
||||
UPDATE ml_features
|
||||
SET consolidated=1
|
||||
WHERE id IN ({placeholders})
|
||||
""", record_ids)
|
||||
|
||||
stats['batches_completed'] += 1
|
||||
batch_count += 1
|
||||
|
||||
# Progress log
|
||||
if batch_count % 10 == 0:
|
||||
logger.info(
|
||||
f"Consolidation progress: {stats['records_processed']} records, "
|
||||
f"{stats['batches_completed']} batches"
|
||||
)
|
||||
|
||||
logger.success(
|
||||
f"Consolidation complete: {stats['records_processed']} records processed, "
|
||||
f"{stats['errors']} errors"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Consolidation failed: {e}")
|
||||
stats['errors'] += 1
|
||||
|
||||
return stats
|
||||
|
||||
def _consolidate_single_record(self, record: Dict[str, Any]):
|
||||
"""
|
||||
Process a single feature record into aggregated form.
|
||||
Computes statistical features and feature vectors.
|
||||
"""
|
||||
try:
|
||||
# Parse JSON fields once — reused by _build_feature_vector to avoid double-parsing
|
||||
host_features = json.loads(record.get('host_features', '{}'))
|
||||
network_features = json.loads(record.get('network_features', '{}'))
|
||||
temporal_features = json.loads(record.get('temporal_features', '{}'))
|
||||
action_features = json.loads(record.get('action_features', '{}'))
|
||||
|
||||
# Combine all features
|
||||
all_features = {
|
||||
**host_features,
|
||||
**network_features,
|
||||
**temporal_features,
|
||||
**action_features
|
||||
}
|
||||
|
||||
# Build numerical feature vector — pass already-parsed dicts to avoid re-parsing
|
||||
feature_vector = self._build_feature_vector(
|
||||
host_features, network_features, temporal_features, action_features
|
||||
)
|
||||
|
||||
# Determine time window
|
||||
raw_ts = record['timestamp']
|
||||
if isinstance(raw_ts, str):
|
||||
try:
|
||||
timestamp = datetime.fromisoformat(raw_ts)
|
||||
except ValueError:
|
||||
timestamp = datetime.now()
|
||||
elif isinstance(raw_ts, datetime):
|
||||
timestamp = raw_ts
|
||||
else:
|
||||
timestamp = datetime.now()
|
||||
|
||||
hourly_window = timestamp.replace(minute=0, second=0, microsecond=0).isoformat()
|
||||
|
||||
# Update or insert aggregated record
|
||||
self._update_aggregated_features(
|
||||
mac_address=record['mac_address'],
|
||||
time_window='hourly',
|
||||
timestamp=hourly_window,
|
||||
action_name=record['action_name'],
|
||||
success=record['success'],
|
||||
duration=record['duration_seconds'],
|
||||
reward=record['reward'],
|
||||
feature_vector=feature_vector,
|
||||
all_features=all_features
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error consolidating single record: {e}")
|
||||
raise
|
||||
|
||||
def _build_feature_vector(
|
||||
self,
|
||||
host_features: Dict[str, Any],
|
||||
network_features: Dict[str, Any],
|
||||
temporal_features: Dict[str, Any],
|
||||
action_features: Dict[str, Any],
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
Build a named feature dictionary from already-parsed feature dicts.
|
||||
Accepts pre-parsed dicts so JSON is never decoded twice per record.
|
||||
Uses shared ai_utils for consistency.
|
||||
"""
|
||||
from ai_utils import extract_neural_features_dict
|
||||
|
||||
return extract_neural_features_dict(
|
||||
host_features=host_features,
|
||||
network_features=network_features,
|
||||
temporal_features=temporal_features,
|
||||
action_features=action_features,
|
||||
)
|
||||
|
||||
def _update_aggregated_features(
|
||||
self,
|
||||
mac_address: str,
|
||||
time_window: str,
|
||||
timestamp: str,
|
||||
action_name: str,
|
||||
success: int,
|
||||
duration: float,
|
||||
reward: float,
|
||||
feature_vector: Dict[str, float],
|
||||
all_features: Dict[str, Any]
|
||||
):
|
||||
"""
|
||||
Update or insert aggregated feature record.
|
||||
Accumulates statistics over the time window.
|
||||
"""
|
||||
try:
|
||||
# Check if record exists
|
||||
existing = self.db.query("""
|
||||
SELECT * FROM ml_features_aggregated
|
||||
WHERE mac_address=? AND time_window=? AND computed_at=?
|
||||
""", (mac_address, time_window, timestamp))
|
||||
|
||||
if existing:
|
||||
# Update existing record
|
||||
old = existing[0]
|
||||
new_total = old['total_actions'] + 1
|
||||
# ... typical stats update ...
|
||||
|
||||
# Merge feature vectors (average each named feature)
|
||||
old_vector = json.loads(old['feature_vector']) # Now a Dict
|
||||
if isinstance(old_vector, list): # Migration handle
|
||||
old_vector = {}
|
||||
|
||||
merged_vector = {}
|
||||
# Combine keys from both
|
||||
all_keys = set(old_vector.keys()) | set(feature_vector.keys())
|
||||
for k in all_keys:
|
||||
v_old = old_vector.get(k, 0.0)
|
||||
v_new = feature_vector.get(k, 0.0)
|
||||
merged_vector[k] = (v_old * old['total_actions'] + v_new) / new_total
|
||||
|
||||
self.db.execute("""
|
||||
UPDATE ml_features_aggregated
|
||||
SET total_actions=total_actions+1,
|
||||
success_rate=(success_rate*total_actions + ?)/(total_actions+1),
|
||||
avg_duration=(avg_duration*total_actions + ?)/(total_actions+1),
|
||||
total_reward=total_reward + ?,
|
||||
feature_vector=?
|
||||
WHERE mac_address=? AND time_window=? AND computed_at=?
|
||||
""", (
|
||||
success,
|
||||
duration,
|
||||
reward,
|
||||
json.dumps(merged_vector),
|
||||
mac_address,
|
||||
time_window,
|
||||
timestamp
|
||||
))
|
||||
else:
|
||||
# Insert new record
|
||||
self.db.execute("""
|
||||
INSERT INTO ml_features_aggregated (
|
||||
mac_address, time_window, computed_at,
|
||||
total_actions, success_rate, avg_duration, total_reward,
|
||||
feature_vector
|
||||
) VALUES (?, ?, ?, 1, ?, ?, ?, ?)
|
||||
""", (
|
||||
mac_address,
|
||||
time_window,
|
||||
timestamp,
|
||||
float(success),
|
||||
duration,
|
||||
reward,
|
||||
json.dumps(feature_vector)
|
||||
))
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating aggregated features: {e}")
|
||||
raise
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# EXPORT FUNCTIONS
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
def export_for_training(
|
||||
self,
|
||||
format: str = 'csv',
|
||||
compress: bool = True,
|
||||
max_records: Optional[int] = None
|
||||
) -> Tuple[str, int]:
|
||||
"""
|
||||
Export consolidated features for deep learning training.
|
||||
|
||||
Args:
|
||||
format: 'csv', 'jsonl', or 'parquet'
|
||||
compress: Whether to gzip the output
|
||||
max_records: Maximum records to export (None = all)
|
||||
|
||||
Returns:
|
||||
Tuple of (file_path, record_count)
|
||||
"""
|
||||
try:
|
||||
if max_records is None:
|
||||
max_records = int(getattr(self.shared_data, "ai_export_max_records", 1000))
|
||||
max_records = max(100, min(int(max_records), 20000))
|
||||
|
||||
# Generate filename
|
||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
base_filename = f"bjorn_training_{timestamp}.{format}"
|
||||
|
||||
if compress and format != 'parquet':
|
||||
base_filename += '.gz'
|
||||
|
||||
filepath = self.export_dir / base_filename
|
||||
|
||||
# Fetch data
|
||||
limit_clause = f"LIMIT {max_records}"
|
||||
records = self.db.query(f"""
|
||||
SELECT
|
||||
mf.*,
|
||||
mfa.feature_vector,
|
||||
mfa.success_rate as aggregated_success_rate,
|
||||
mfa.total_actions as aggregated_total_actions
|
||||
FROM ml_features mf
|
||||
LEFT JOIN ml_features_aggregated mfa
|
||||
ON mf.mac_address = mfa.mac_address
|
||||
WHERE mf.consolidated=1 AND mf.export_batch_id IS NULL
|
||||
ORDER BY mf.timestamp DESC
|
||||
{limit_clause}
|
||||
""")
|
||||
|
||||
if not records:
|
||||
logger.warning("No consolidated records to export")
|
||||
return "", 0
|
||||
|
||||
# Extract IDs before export so we can free the records list early
|
||||
record_ids = [r['id'] for r in records]
|
||||
|
||||
# Export based on format
|
||||
if format == 'csv':
|
||||
count = self._export_csv(records, filepath, compress)
|
||||
elif format == 'jsonl':
|
||||
count = self._export_jsonl(records, filepath, compress)
|
||||
elif format == 'parquet':
|
||||
count = self._export_parquet(records, filepath)
|
||||
else:
|
||||
raise ValueError(f"Unsupported format: {format}")
|
||||
|
||||
# Free the large records list immediately after export — record_ids is all we still need
|
||||
del records
|
||||
|
||||
# Create export batch record
|
||||
batch_id = self._create_export_batch(filepath, count)
|
||||
|
||||
# Update records with batch ID
|
||||
placeholders = ','.join('?' * len(record_ids))
|
||||
self.db.execute(f"""
|
||||
UPDATE ml_features
|
||||
SET export_batch_id=?
|
||||
WHERE id IN ({placeholders})
|
||||
""", [batch_id] + record_ids)
|
||||
del record_ids
|
||||
|
||||
logger.success(
|
||||
f"Exported {count} records to {filepath} "
|
||||
f"(batch_id={batch_id})"
|
||||
)
|
||||
|
||||
return str(filepath), count
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Export failed: {e}")
|
||||
raise
|
||||
|
||||
def _export_csv(
|
||||
self,
|
||||
records: List[Dict],
|
||||
filepath: Path,
|
||||
compress: bool
|
||||
) -> int:
|
||||
"""Export records as CSV"""
|
||||
open_func = gzip.open if compress else open
|
||||
mode = 'wt' if compress else 'w'
|
||||
|
||||
# 1. Flatten all records first to collect all possible fieldnames
|
||||
flattened = []
|
||||
all_fieldnames = set()
|
||||
|
||||
for r in records:
|
||||
flat = {
|
||||
'timestamp': r['timestamp'],
|
||||
'mac_address': r['mac_address'],
|
||||
'ip_address': r['ip_address'],
|
||||
'action_name': r['action_name'],
|
||||
'success': r['success'],
|
||||
'duration_seconds': r['duration_seconds'],
|
||||
'reward': r['reward']
|
||||
}
|
||||
|
||||
# Parse and flatten features
|
||||
for field in ['host_features', 'network_features', 'temporal_features', 'action_features']:
|
||||
try:
|
||||
features = json.loads(r.get(field, '{}'))
|
||||
for k, v in features.items():
|
||||
if isinstance(v, (int, float, bool, str)):
|
||||
flat_key = f"{field}_{k}"
|
||||
flat[flat_key] = v
|
||||
except Exception as e:
|
||||
logger.debug(f"Skip bad JSON in {field}: {e}")
|
||||
|
||||
# Add named feature vector
|
||||
if r.get('feature_vector'):
|
||||
try:
|
||||
vector = json.loads(r['feature_vector'])
|
||||
if isinstance(vector, dict):
|
||||
for k, v in vector.items():
|
||||
flat[f'feat_{k}'] = v
|
||||
elif isinstance(vector, list):
|
||||
for i, v in enumerate(vector):
|
||||
flat[f'feature_{i}'] = v
|
||||
except Exception as e:
|
||||
logger.debug(f"Skip bad feature vector: {e}")
|
||||
|
||||
flattened.append(flat)
|
||||
all_fieldnames.update(flat.keys())
|
||||
|
||||
# 2. Sort fieldnames for consistency
|
||||
sorted_fieldnames = sorted(list(all_fieldnames))
|
||||
all_fieldnames = None # Free the set
|
||||
|
||||
# 3. Write CSV
|
||||
with open_func(filepath, mode, newline='', encoding='utf-8') as f:
|
||||
if flattened:
|
||||
writer = csv.DictWriter(f, fieldnames=sorted_fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(flattened)
|
||||
|
||||
count = len(flattened)
|
||||
flattened = None # Free the expanded list
|
||||
return count
|
||||
|
||||
def _export_jsonl(
|
||||
self,
|
||||
records: List[Dict],
|
||||
filepath: Path,
|
||||
compress: bool
|
||||
) -> int:
|
||||
"""Export records as JSON Lines"""
|
||||
open_func = gzip.open if compress else open
|
||||
mode = 'wt' if compress else 'w'
|
||||
|
||||
with open_func(filepath, mode, encoding='utf-8') as f:
|
||||
for r in records:
|
||||
# Avoid mutating `records` in place to keep memory growth predictable.
|
||||
row = dict(r)
|
||||
for field in ['host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event']:
|
||||
try:
|
||||
row[field] = json.loads(row.get(field, '{}'))
|
||||
except Exception:
|
||||
row[field] = {}
|
||||
|
||||
if row.get('feature_vector'):
|
||||
try:
|
||||
row['feature_vector'] = json.loads(row['feature_vector'])
|
||||
except Exception:
|
||||
row['feature_vector'] = {}
|
||||
|
||||
f.write(json.dumps(row) + '\n')
|
||||
|
||||
return len(records)
|
||||
|
||||
def _export_parquet(self, records: List[Dict], filepath: Path) -> int:
|
||||
"""Export records as Parquet (requires pyarrow)"""
|
||||
try:
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
# Flatten records
|
||||
flattened = []
|
||||
for r in records:
|
||||
flat = dict(r)
|
||||
# Parse JSON fields
|
||||
for field in ['host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event']:
|
||||
flat[field] = json.loads(r.get(field, '{}'))
|
||||
|
||||
if r.get('feature_vector'):
|
||||
flat['feature_vector'] = json.loads(r['feature_vector'])
|
||||
|
||||
flattened.append(flat)
|
||||
|
||||
# Convert to Arrow table
|
||||
table = pa.Table.from_pylist(flattened)
|
||||
|
||||
# Write parquet
|
||||
pq.write_table(table, filepath, compression='snappy')
|
||||
|
||||
return len(records)
|
||||
|
||||
except ImportError:
|
||||
logger.error("Parquet export requires pyarrow. Falling back to CSV.")
|
||||
return self._export_csv(records, filepath.with_suffix('.csv'), compress=True)
|
||||
|
||||
def _create_export_batch(self, filepath: Path, count: int) -> int:
|
||||
"""Create export batch record and return batch ID"""
|
||||
result = self.db.execute("""
|
||||
INSERT INTO ml_export_batches (file_path, record_count, status)
|
||||
VALUES (?, ?, 'exported')
|
||||
""", (str(filepath), count))
|
||||
|
||||
# Get the inserted ID
|
||||
batch_id = self.db.query("SELECT last_insert_rowid() as id")[0]['id']
|
||||
return batch_id
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# UTILITY METHODS
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
def get_export_stats(self) -> Dict[str, Any]:
|
||||
"""Get statistics about exports"""
|
||||
try:
|
||||
batches = self.db.query("""
|
||||
SELECT COUNT(*) as total_batches,
|
||||
SUM(record_count) as total_records,
|
||||
MAX(created_at) as last_export
|
||||
FROM ml_export_batches
|
||||
WHERE status='exported'
|
||||
""")[0]
|
||||
|
||||
pending = self.db.query("""
|
||||
SELECT COUNT(*) as cnt
|
||||
FROM ml_features
|
||||
WHERE consolidated=1 AND export_batch_id IS NULL
|
||||
""")[0]['cnt']
|
||||
|
||||
return {
|
||||
'total_export_batches': batches.get('total_batches', 0),
|
||||
'total_records_exported': batches.get('total_records', 0),
|
||||
'last_export_time': batches.get('last_export'),
|
||||
'pending_export_count': pending
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting export stats: {e}")
|
||||
return {}
|
||||
|
||||
def flush_pending_uploads(self, max_files: int = 3) -> int:
|
||||
"""
|
||||
Retry uploads for previously exported batches that were not transferred yet.
|
||||
Returns the number of successfully transferred files.
|
||||
"""
|
||||
max_files = max(0, int(max_files))
|
||||
if max_files <= 0:
|
||||
return 0
|
||||
|
||||
# No heavy "reliquat" tracking needed: pending uploads = files present in export_dir.
|
||||
files = self._list_pending_export_files(limit=max_files)
|
||||
ok = 0
|
||||
for fp in files:
|
||||
if self.upload_to_server(fp):
|
||||
ok += 1
|
||||
else:
|
||||
# Stop early when server is unreachable to avoid repeated noise.
|
||||
if self.last_server_attempted and self.last_server_contact_ok is False:
|
||||
break
|
||||
return ok
|
||||
|
||||
def _list_pending_export_files(self, limit: int = 3) -> List[str]:
|
||||
"""
|
||||
Return oldest export files present in export_dir.
|
||||
This makes the backlog naturally equal to the number of files on disk.
|
||||
"""
|
||||
limit = max(0, int(limit))
|
||||
if limit <= 0:
|
||||
return []
|
||||
|
||||
try:
|
||||
d = Path(self.export_dir)
|
||||
if not d.exists():
|
||||
return []
|
||||
|
||||
def _safe_mtime(path: Path) -> float:
|
||||
try:
|
||||
return path.stat().st_mtime
|
||||
except Exception:
|
||||
return float("inf")
|
||||
|
||||
# Keep only the N oldest files in memory instead of sorting all candidates.
|
||||
files_iter = (p for p in d.glob("bjorn_training_*") if p.is_file())
|
||||
oldest = heapq.nsmallest(limit, files_iter, key=_safe_mtime)
|
||||
return [str(p) for p in oldest]
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _mark_batch_status(self, filepath: str, status: str, notes: str = "") -> None:
|
||||
"""Update ml_export_batches status for a given file path (best-effort)."""
|
||||
try:
|
||||
self.db.execute(
|
||||
"""
|
||||
UPDATE ml_export_batches
|
||||
SET status=?, notes=?
|
||||
WHERE file_path=?
|
||||
""",
|
||||
(status, notes or "", str(filepath)),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _safe_delete_uploaded_export(self, filepath: Path) -> None:
|
||||
"""Delete a successfully-uploaded export file if configured to do so."""
|
||||
try:
|
||||
if not bool(self.shared_data.config.get("ai_delete_export_after_upload", True)):
|
||||
return
|
||||
|
||||
fp = filepath.resolve()
|
||||
base = Path(self.export_dir).resolve()
|
||||
# Safety: only delete files under export_dir.
|
||||
if base not in fp.parents:
|
||||
return
|
||||
|
||||
fp.unlink(missing_ok=True) # Python 3.8+ supports missing_ok
|
||||
except TypeError:
|
||||
# Python < 3.8 fallback (not expected here, but safe)
|
||||
try:
|
||||
if filepath.exists():
|
||||
filepath.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
    def upload_to_server(self, filepath: str) -> bool:
        """
        Upload export file to AI Validation Server.

        Also drives the contact-state flags (last_server_attempted /
        last_server_contact_ok) and the exponential retry backoff window.

        Args:
            filepath: Path to the file to upload

        Returns:
            True if upload successful
        """
        # Reset to "no attempt yet"; updated again once we actually reach
        # the network call below.
        self._set_server_contact_state(False, None)
        try:
            import requests
        except ImportError:
            requests = None

        if requests is None:
            # Throttled so a permanently missing dependency does not spam logs.
            logger.info_throttled(
                "AI upload skipped: requests not installed",
                key="ai_upload_no_requests",
                interval_s=600.0,
            )
            return False

        url = self.shared_data.config.get("ai_server_url")
        if not url:
            logger.info_throttled(
                "AI upload skipped: ai_server_url not configured",
                key="ai_upload_no_url",
                interval_s=600.0,
            )
            return False

        # While a previous failure's backoff window is still open, skip the
        # attempt entirely (monotonic clock, immune to wall-clock changes).
        backoff_s = max(10, int(self.shared_data.config.get("ai_upload_retry_backoff_s", 120)))
        max_backoff_s = 3600
        now_mono = time.monotonic()
        if now_mono < self._upload_backoff_until:
            remaining = int(self._upload_backoff_until - now_mono)
            logger.debug(f"AI upload backoff active ({remaining}s remaining)")
            logger.info_throttled(
                "AI upload deferred: backoff active",
                key="ai_upload_backoff_active",
                interval_s=180.0,
            )
            return False

        try:
            filepath = Path(filepath)

            if not filepath.exists():
                logger.warning(f"AI upload skipped: file not found: {filepath}")
                self._mark_batch_status(str(filepath), "missing", "file not found")
                return False

            # Get MAC address for unique identification
            try:
                from ai_utils import get_system_mac
                mac = get_system_mac()
            except ImportError:
                mac = "unknown"

            logger.debug(f"Uploading {filepath.name} to AI Server ({url}) unique_id={mac}")
            # An attempt is now in flight; outcome unknown until we hear back.
            self._set_server_contact_state(True, None)

            with open(filepath, 'rb') as f:
                files = {'file': f}
                # Send MAC as query param
                # Server expects ?mac_addr=...
                params = {'mac_addr': mac}

                # Short timeout to avoid blocking
                response = requests.post(f"{url}/upload", files=files, params=params, timeout=10)

            if response.status_code == 200:
                # Success: record healthy contact and clear the backoff state.
                self._set_server_contact_state(True, True)
                self._upload_backoff_until = 0.0
                self._upload_backoff_current_s = 0.0
                logger.success(f"Uploaded {filepath.name} successfully")
                self._mark_batch_status(str(filepath), "transferred", "uploaded")
                self._safe_delete_uploaded_export(filepath)
                return True
            else:
                # HTTP-level failure: mark contact as bad and widen the
                # backoff window before the next retry.
                self._set_server_contact_state(True, False)
                next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
                logger.debug(
                    f"AI upload HTTP failure for {filepath.name}: status={response.status_code}, "
                    f"next retry in {next_retry_s}s"
                )
                logger.info_throttled(
                    f"AI upload deferred (HTTP {response.status_code})",
                    key=f"ai_upload_http_{response.status_code}",
                    interval_s=300.0,
                )
                return False

        except Exception as e:
            # Network/IO failure: same treatment as an HTTP error.
            self._set_server_contact_state(True, False)
            next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
            logger.debug(f"AI upload exception for {filepath}: {e} (next retry in {next_retry_s}s)")
            logger.info_throttled(
                "AI upload deferred: server unreachable (retry later)",
                key="ai_upload_exception",
                interval_s=300.0,
            )
            return False
||||
|
||||
def cleanup_old_exports(self, days: int = 30):
|
||||
"""Delete export files older than N days"""
|
||||
try:
|
||||
cutoff = datetime.now() - timedelta(days=days)
|
||||
|
||||
old_batches = self.db.query("""
|
||||
SELECT file_path FROM ml_export_batches
|
||||
WHERE created_at < ?
|
||||
""", (cutoff.isoformat(),))
|
||||
|
||||
deleted = 0
|
||||
for batch in old_batches:
|
||||
filepath = Path(batch['file_path'])
|
||||
if filepath.exists():
|
||||
filepath.unlink()
|
||||
deleted += 1
|
||||
|
||||
# Clean up database records
|
||||
self.db.execute("""
|
||||
DELETE FROM ml_export_batches
|
||||
WHERE created_at < ?
|
||||
""", (cutoff.isoformat(),))
|
||||
|
||||
logger.info(f"Cleaned up {deleted} old export files")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Cleanup failed: {e}")
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# END OF FILE
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
Reference in New Issue
Block a user