Add RLUtils class for managing RL/AI dashboard endpoints

- Implemented methods for fetching AI stats, training history, and recent experiences.
- Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling.
- Included helper methods for querying the database and sending JSON responses.
- Integrated model metadata extraction for visualization purposes.
This commit is contained in:
Fabien POLLY
2026-02-18 22:36:10 +01:00
parent b8a13cc698
commit eb20b168a6
684 changed files with 53278 additions and 27977 deletions

656
Bjorn.py
View File

@@ -1,173 +1,625 @@
# Bjorn.py
# Main entry point and supervisor for the Bjorn project
# Manages lifecycle of threads, health monitoring, and crash protection.
# OPTIMIZED FOR PI ZERO 2: Low CPU overhead, aggressive RAM management.
import atexit
import gc
import logging
import os
import re
import signal
import subprocess
import sys
import threading
import time
import tracemalloc

from comment import Commentaireia
from display import Display, handle_exit_display
from init_shared import shared_data
from logger import Logger
from orchestrator import Orchestrator
from runtime_state_updater import RuntimeStateUpdater
from webapp import web_thread

logger = Logger(name="Bjorn.py", level=logging.DEBUG)

# Guards so overlapping SIGINT/SIGTERM deliveries run exactly one shutdown.
_shutdown_lock = threading.Lock()
_shutdown_started = False

# Single-instance lock state. fcntl is POSIX-only, so it is imported
# defensively and the locking degrades gracefully when unavailable.
_instance_lock_fd = None
_instance_lock_path = "/tmp/bjorn_160226.lock"

try:
    import fcntl
except Exception:
    fcntl = None
def _release_instance_lock():
    """Unlock and close the single-instance lock file, if currently held."""
    global _instance_lock_fd
    fd = _instance_lock_fd
    if fd is None:
        return
    # Clear the global first so a re-entrant call is a no-op.
    _instance_lock_fd = None
    if fcntl is not None:
        try:
            fcntl.flock(fd.fileno(), fcntl.LOCK_UN)
        except Exception:
            pass
    try:
        fd.close()
    except Exception:
        # Best-effort cleanup during shutdown; nothing useful to do on failure.
        pass
def _acquire_instance_lock() -> bool:
    """Ensure only one Bjorn_160226 process can run at once.

    Returns False only when another live process provably holds the lock.
    Any other failure is fail-open (returns True) so a broken lock file
    cannot keep the application from starting.
    """
    global _instance_lock_fd
    if _instance_lock_fd is not None:
        return True  # Lock already held by this process.
    try:
        lock_file = open(_instance_lock_path, "a+", encoding="utf-8")
    except Exception as exc:
        logger.error(f"Unable to open instance lock file {_instance_lock_path}: {exc}")
        return True
    if fcntl is None:
        # No flock support on this platform: record the handle and proceed.
        _instance_lock_fd = lock_file
        return True
    try:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        # Lock acquired: stamp our PID into the file for diagnostics.
        lock_file.seek(0)
        lock_file.truncate()
        lock_file.write(str(os.getpid()))
        lock_file.flush()
    except OSError:
        # Lock is held elsewhere; try to report the owner's PID.
        owner_pid = "unknown"
        try:
            lock_file.seek(0)
            owner_pid = lock_file.read().strip() or "unknown"
        except Exception:
            pass
        logger.critical(f"Another Bjorn instance is already running (pid={owner_pid}).")
        try:
            lock_file.close()
        except Exception:
            pass
        return False
    _instance_lock_fd = lock_file
    return True
class HealthMonitor(threading.Thread):
    """Periodic runtime health logger (threads/fd/rss/queue/epd metrics)."""

    def __init__(self, shared_data_, interval_s: int = 60):
        super().__init__(daemon=True, name="HealthMonitor")
        self.shared_data = shared_data_
        # Clamp to >= 10 s so a misconfigured interval cannot busy-loop.
        self.interval_s = max(10, int(interval_s))
        self._stop_event = threading.Event()
        # tracemalloc snapshot state for periodic diff reports.
        self._tm_prev_snapshot = None
        self._tm_last_report = 0.0

    def stop(self):
        """Ask the monitor loop to exit at its next wakeup."""
        self._stop_event.set()

    def _fd_count(self) -> int:
        """Return the number of open file descriptors, or -1 if /proc is unavailable."""
        try:
            return len(os.listdir("/proc/self/fd"))
        except Exception:
            return -1

    def _rss_kb(self) -> int:
        """Return resident set size in KiB from /proc/self/status, or -1 on failure."""
        try:
            with open("/proc/self/status", "r", encoding="utf-8") as fh:
                for line in fh:
                    if line.startswith("VmRSS:"):
                        parts = line.split()
                        if len(parts) >= 2:
                            return int(parts[1])
        except Exception:
            pass
        return -1

    def _queue_counts(self):
        """Return (pending, running, scheduled) counts from action_queue; -1 each on error."""
        pending = running = scheduled = -1
        try:
            # Using query_one safe method from database
            row = self.shared_data.db.query_one(
                """
                SELECT
                SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) AS pending,
                SUM(CASE WHEN status='running' THEN 1 ELSE 0 END) AS running,
                SUM(CASE WHEN status='scheduled' THEN 1 ELSE 0 END) AS scheduled
                FROM action_queue
                """
            )
            if row:
                pending = int(row.get("pending") or 0)
                running = int(row.get("running") or 0)
                scheduled = int(row.get("scheduled") or 0)
        except Exception as exc:
            logger.error_throttled(
                f"Health monitor queue count query failed: {exc}",
                key="health_queue_counts",
                interval_s=120,
            )
        return pending, running, scheduled

    def run(self):
        while not self._stop_event.wait(self.interval_s):
            try:
                threads = threading.enumerate()
                thread_count = len(threads)
                top_threads = ",".join(t.name for t in threads[:8])
                fd_count = self._fd_count()
                rss_kb = self._rss_kb()
                pending, running, scheduled = self._queue_counts()
                # Lock to safely read shared metrics without race conditions
                with self.shared_data.health_lock:
                    display_metrics = dict(getattr(self.shared_data, "display_runtime_metrics", {}) or {})
                epd_enabled = int(display_metrics.get("epd_enabled", 0))
                epd_failures = int(display_metrics.get("failed_updates", 0))
                epd_reinit = int(display_metrics.get("reinit_attempts", 0))
                epd_headless = int(display_metrics.get("headless", 0))
                epd_last_success = display_metrics.get("last_success_epoch", 0)
                # FIX: previously several collected metrics (fd_count, queue
                # running/scheduled, thread names, epd enabled/headless/last
                # success) were computed but never logged; emit them all.
                logger.info(
                    "health "
                    f"thread_count={thread_count} "
                    f"top_threads={top_threads} "
                    f"fd_count={fd_count} "
                    f"rss_kb={rss_kb} "
                    f"queue_pending={pending} "
                    f"queue_running={running} "
                    f"queue_scheduled={scheduled} "
                    f"epd_enabled={epd_enabled} "
                    f"epd_failures={epd_failures} "
                    f"epd_reinit={epd_reinit} "
                    f"epd_headless={epd_headless} "
                    f"epd_last_success={epd_last_success} "
                )
                # Optional: tracemalloc report (only if enabled via PYTHONTRACEMALLOC or tracemalloc.start()).
                try:
                    if tracemalloc.is_tracing():
                        now = time.monotonic()
                        tm_interval = float(self.shared_data.config.get("tracemalloc_report_interval_s", 300) or 300)
                        if tm_interval > 0 and (now - self._tm_last_report) >= tm_interval:
                            self._tm_last_report = now
                            top_n = int(self.shared_data.config.get("tracemalloc_top_n", 10) or 10)
                            top_n = max(3, min(top_n, 25))
                            snap = tracemalloc.take_snapshot()
                            if self._tm_prev_snapshot is not None:
                                # Diff against the previous snapshot to show growth.
                                stats = snap.compare_to(self._tm_prev_snapshot, "lineno")[:top_n]
                                logger.info(f"mem_top (tracemalloc diff, top_n={top_n})")
                                for st in stats:
                                    logger.info(f"mem_top {st}")
                            else:
                                stats = snap.statistics("lineno")[:top_n]
                                logger.info(f"mem_top (tracemalloc, top_n={top_n})")
                                for st in stats:
                                    logger.info(f"mem_top {st}")
                            self._tm_prev_snapshot = snap
                except Exception as exc:
                    logger.error_throttled(
                        f"Health monitor tracemalloc failure: {exc}",
                        key="health_tracemalloc_error",
                        interval_s=300,
                    )
            except Exception as exc:
                logger.error_throttled(
                    f"Health monitor loop failure: {exc}",
                    key="health_loop_error",
                    interval_s=120,
                )
class Bjorn:
    """Main class for Bjorn. Manages orchestration lifecycle."""

    def __init__(self, shared_data_):
        self.shared_data = shared_data_
        self.commentaire_ia = Commentaireia()
        self.orchestrator_thread = None
        self.orchestrator = None
        self.network_connected = False
        self.wifi_connected = False
        self.previous_network_connected = None
        self._orch_lock = threading.Lock()
        self._last_net_check = 0  # Throttling for network scan
        self._last_orch_stop_attempt = 0.0

    def run(self):
        """Main loop for Bjorn. Waits for network and starts/stops Orchestrator based on mode."""
        if hasattr(self.shared_data, "startup_delay") and self.shared_data.startup_delay > 0:
            logger.info(f"Waiting for startup delay: {self.shared_data.startup_delay} seconds")
            time.sleep(self.shared_data.startup_delay)
        backoff_s = 1.0
        while not self.shared_data.should_exit:
            try:
                # Manual mode must stop orchestration so the user keeps full control.
                if self.shared_data.operation_mode == "MANUAL":
                    # Avoid spamming stop requests if already stopped.
                    if self.orchestrator_thread is not None and self.orchestrator_thread.is_alive():
                        self.stop_orchestrator()
                else:
                    self.check_and_start_orchestrator()
                time.sleep(5)
                backoff_s = 1.0  # Reset backoff on success
            except Exception as exc:
                logger.error(f"Bjorn main loop error: {exc}")
                logger.error_throttled(
                    "Bjorn main loop entering backoff due to repeated errors",
                    key="bjorn_main_loop_backoff",
                    interval_s=60,
                )
                # Exponential backoff keeps a persistent fault from spinning the CPU.
                time.sleep(backoff_s)
                backoff_s = min(backoff_s * 2.0, 30.0)

    def check_and_start_orchestrator(self):
        """Start the orchestrator when the network is up; no-op in MANUAL mode."""
        if self.shared_data.operation_mode == "MANUAL":
            return
        if self.is_network_connected():
            self.wifi_connected = True
            if self.orchestrator_thread is None or not self.orchestrator_thread.is_alive():
                self.start_orchestrator()
        else:
            self.wifi_connected = False
            logger.info_throttled(
                "Waiting for network connection to start Orchestrator...",
                key="bjorn_wait_network",
                interval_s=30,
            )

    def start_orchestrator(self):
        """Start the orchestrator thread (idempotent, gated on network state)."""
        self.is_network_connected()  # Refresh the cached connectivity flag first.
        with self._orch_lock:
            # Re-check network inside lock
            if not self.network_connected:
                return
            if self.orchestrator_thread is not None and self.orchestrator_thread.is_alive():
                logger.debug("Orchestrator thread is already running.")
                return
            logger.info("Starting Orchestrator thread...")
            self.shared_data.orchestrator_should_exit = False
            self.orchestrator = Orchestrator()
            self.orchestrator_thread = threading.Thread(
                target=self.orchestrator.run,
                daemon=True,
                name="OrchestratorMain",
            )
            self.orchestrator_thread.start()
            logger.info("Orchestrator thread started.")

    def stop_orchestrator(self):
        """Stop the orchestrator thread and leave the application in MANUAL mode."""
        with self._orch_lock:
            thread = self.orchestrator_thread
            if thread is None or not thread.is_alive():
                self.orchestrator_thread = None
                self.orchestrator = None
                return
            # Keep MANUAL sticky so supervisor does not auto-restart orchestration.
            try:
                self.shared_data.operation_mode = "MANUAL"
            except Exception:
                pass
            now = time.time()
            # Throttle the "stopping" log to at most one line per 10 s of retries.
            if now - self._last_orch_stop_attempt >= 10.0:
                logger.info("Stop requested: stopping Orchestrator")
            self._last_orch_stop_attempt = now
            self.shared_data.orchestrator_should_exit = True
            self.shared_data.queue_event.set()  # Wake up thread
            thread.join(timeout=10.0)
            if thread.is_alive():
                # Leave references intact so a later call can retry the join.
                logger.warning_throttled(
                    "Orchestrator thread did not stop gracefully",
                    key="orch_stop_not_graceful",
                    interval_s=20,
                )
                return
            self.orchestrator_thread = None
            self.orchestrator = None
            self.shared_data.bjorn_orch_status = "IDLE"
            self.shared_data.bjorn_status_text2 = ""
            self.shared_data.manual_mode = True

    def is_network_connected(self):
        """Checks for network connectivity with throttling and low-CPU checks."""
        now = time.time()
        # Throttling: Do not scan more than once every 10 seconds
        if now - self._last_net_check < 10:
            return self.network_connected
        self._last_net_check = now

        def interface_has_ip(interface_name):
            # True when the interface exists and holds an IPv4 address.
            try:
                # OPTIMIZATION: Check /sys/class/net first to avoid spawning
                # a subprocess if the interface doesn't exist.
                if not os.path.exists(f"/sys/class/net/{interface_name}"):
                    return False
                result = subprocess.run(
                    ["ip", "-4", "addr", "show", interface_name],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    timeout=2,
                )
                if result.returncode != 0:
                    return False
                # "inet " (with trailing space) matches IPv4 only, not inet6.
                return "inet " in result.stdout
            except Exception:
                return False

        eth_connected = interface_has_ip("eth0")
        wifi_connected = interface_has_ip("wlan0")
        self.network_connected = eth_connected or wifi_connected
        # Log only on state transitions to keep the journal quiet.
        if self.network_connected != self.previous_network_connected:
            if self.network_connected:
                logger.info(f"Network status changed: Connected (eth0={eth_connected}, wlan0={wifi_connected})")
            else:
                logger.warning("Network status changed: Connection lost")
            self.previous_network_connected = self.network_connected
        return self.network_connected

    @staticmethod
    def start_display(old_display=None):
        """Start the display thread; returns (thread, Display instance)."""
        # Ensure the previous Display's controller is fully stopped to release frames
        if old_display is not None:
            try:
                old_display.display_controller.stop(timeout=3.0)
            except Exception:
                pass
        display = Display(shared_data)
        display_thread = threading.Thread(
            target=display.run,
            daemon=True,
            name="DisplayMain",
        )
        display_thread.start()
        return display_thread, display
def _request_shutdown():
    """Signals all threads to stop (idempotent; sets flags but does not join)."""
    shared_data.should_exit = True
    shared_data.orchestrator_should_exit = True
    shared_data.display_should_exit = True
    shared_data.webapp_should_exit = True
    # Wake the orchestrator if it is blocked waiting on its work queue.
    shared_data.queue_event.set()
def handle_exit(
    sig,
    frame,
    display_thread,
    bjorn_thread,
    web_thread_obj,
    health_thread=None,
    runtime_state_thread=None,
    from_signal=False,
):
    """Orderly shutdown of all threads and resources.

    Callable both from signal handlers (from_signal=True, exits the process)
    and from the supervisor's normal exit path. A second signal delivered
    while a shutdown is already in progress force-exits immediately.
    """
    global _shutdown_started
    with _shutdown_lock:
        if _shutdown_started:
            if from_signal:
                # Second Ctrl-C/TERM: the user wants out now, skip graceful path.
                logger.warning("Forcing exit (SIGINT/SIGTERM received twice)")
                os._exit(130)
            return
        _shutdown_started = True
    logger.info(f"Shutdown signal received: {sig}")
    _request_shutdown()
    # 1. Stop Display (handles EPD cleanup)
    try:
        handle_exit_display(sig, frame, display_thread)
    except Exception:
        pass
    # 2. Stop Health Monitor
    try:
        if health_thread and hasattr(health_thread, "stop"):
            health_thread.stop()
    except Exception:
        pass
    # 2b. Stop Runtime State Updater
    try:
        if runtime_state_thread and hasattr(runtime_state_thread, "stop"):
            runtime_state_thread.stop()
    except Exception:
        pass
    # 3. Stop Web Server
    try:
        if web_thread_obj and hasattr(web_thread_obj, "shutdown"):
            web_thread_obj.shutdown()
    except Exception:
        pass
    # 4. Join all threads (bounded joins so shutdown cannot hang forever)
    for thread in (display_thread, bjorn_thread, web_thread_obj, health_thread, runtime_state_thread):
        try:
            if thread and thread.is_alive():
                thread.join(timeout=5.0)
        except Exception:
            pass
    # 5. Close Database (Prevent corruption)
    try:
        if hasattr(shared_data, "db") and hasattr(shared_data.db, "close"):
            shared_data.db.close()
    except Exception as exc:
        logger.error(f"Database shutdown error: {exc}")
    logger.info("Bjorn stopped. Clean exit.")
    _release_instance_lock()
    if from_signal:
        sys.exit(0)
def _install_thread_excepthook():
    """Route unhandled worker-thread exceptions into the application logger."""

    def _log_unhandled(args):
        logger.error(f"Unhandled thread exception: {args.thread.name} - {args.exc_type.__name__}: {args.exc_value}")
        # Deliberately no forced shutdown here: minor thread glitches should
        # not kill the app, and the Crash Shield handles restarts.

    threading.excepthook = _log_unhandled
if __name__ == "__main__":
    logger.info("Starting threads")
    # Refuse to start a second instance (fail-open on lock-file errors).
    if not _acquire_instance_lock():
        sys.exit(1)
    atexit.register(_release_instance_lock)
    _install_thread_excepthook()
    display_thread = None
    display_instance = None
    bjorn_thread = None
    health_thread = None
    runtime_state_thread = None
    last_gc_time = time.time()
    try:
        logger.info("Bjorn Startup: Loading config...")
        shared_data.load_config()
        logger.info("Starting Runtime State Updater...")
        runtime_state_thread = RuntimeStateUpdater(shared_data)
        runtime_state_thread.start()
        logger.info("Starting Display...")
        shared_data.display_should_exit = False
        display_thread, display_instance = Bjorn.start_display()
        logger.info("Starting Bjorn Core...")
        bjorn = Bjorn(shared_data)
        shared_data.bjorn_instance = bjorn
        bjorn_thread = threading.Thread(target=bjorn.run, daemon=True, name="BjornMain")
        bjorn_thread.start()
        if shared_data.config.get("websrv", False):
            logger.info("Starting Web Server...")
            if not web_thread.is_alive():
                web_thread.start()
        health_interval = int(shared_data.config.get("health_log_interval", 60))
        health_thread = HealthMonitor(shared_data, interval_s=health_interval)
        health_thread.start()
        # Signal Handlers
        exit_handler = lambda s, f: handle_exit(
            s,
            f,
            display_thread,
            bjorn_thread,
            web_thread,
            health_thread,
            runtime_state_thread,
            True,
        )
        signal.signal(signal.SIGINT, exit_handler)
        signal.signal(signal.SIGTERM, exit_handler)
        # --- SUPERVISOR LOOP (Crash Shield) ---
        restart_times = []
        max_restarts = 5
        restart_window_s = 300
        logger.info("Bjorn Supervisor running.")
        while not shared_data.should_exit:
            time.sleep(2)  # CPU Friendly polling
            now = time.time()
            # --- OPTIMIZATION: Periodic Garbage Collection ---
            # Forces cleanup of circular references and free RAM every 2 mins
            if now - last_gc_time > 120:
                gc.collect()
                last_gc_time = now
                logger.debug("System: Forced Garbage Collection executed.")
            # --- CRASH SHIELD: Bjorn Thread ---
            if bjorn_thread and not bjorn_thread.is_alive() and not shared_data.should_exit:
                restart_times = [t for t in restart_times if (now - t) <= restart_window_s]
                restart_times.append(now)
                if len(restart_times) <= max_restarts:
                    logger.warning("Crash Shield: Restarting Bjorn Main Thread")
                    bjorn_thread = threading.Thread(target=bjorn.run, daemon=True, name="BjornMain")
                    bjorn_thread.start()
                else:
                    logger.critical("Crash Shield: Bjorn exceeded restart budget. Shutting down.")
                    _request_shutdown()
                    break
            # --- CRASH SHIELD: Display Thread ---
            if display_thread and not display_thread.is_alive() and not shared_data.should_exit:
                restart_times = [t for t in restart_times if (now - t) <= restart_window_s]
                restart_times.append(now)
                if len(restart_times) <= max_restarts:
                    logger.warning("Crash Shield: Restarting Display Thread")
                    display_thread, display_instance = Bjorn.start_display(old_display=display_instance)
                else:
                    logger.critical("Crash Shield: Display exceeded restart budget. Shutting down.")
                    _request_shutdown()
                    break
            # --- CRASH SHIELD: Runtime State Updater ---
            if runtime_state_thread and not runtime_state_thread.is_alive() and not shared_data.should_exit:
                restart_times = [t for t in restart_times if (now - t) <= restart_window_s]
                restart_times.append(now)
                if len(restart_times) <= max_restarts:
                    logger.warning("Crash Shield: Restarting Runtime State Updater")
                    runtime_state_thread = RuntimeStateUpdater(shared_data)
                    runtime_state_thread.start()
                else:
                    logger.critical("Crash Shield: Runtime State Updater exceeded restart budget. Shutting down.")
                    _request_shutdown()
                    break
        # Exit cleanup
        if health_thread:
            health_thread.stop()
        if runtime_state_thread:
            runtime_state_thread.stop()
        handle_exit(
            signal.SIGTERM,
            None,
            display_thread,
            bjorn_thread,
            web_thread,
            health_thread,
            runtime_state_thread,
            False,
        )
    except Exception as exc:
        logger.critical(f"Critical bootstrap failure: {exc}")
        _request_shutdown()
        # Try to clean up anyway
        try:
            handle_exit(
                signal.SIGTERM,
                None,
                display_thread,
                bjorn_thread,
                web_thread,
                health_thread,
                runtime_state_thread,
                False,
            )
        except Exception:
            # FIX: was a bare `except:` which also swallowed SystemExit.
            pass
        sys.exit(1)