Add RLUtils class for managing RL/AI dashboard endpoints

- Implemented methods for fetching AI stats, training history, and recent experiences.
- Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling.
- Included helper methods for querying the database and sending JSON responses.
- Integrated model metadata extraction for visualization purposes.
This commit is contained in:
Fabien POLLY
2026-02-18 22:36:10 +01:00
parent b8a13cc698
commit eb20b168a6
684 changed files with 53278 additions and 27977 deletions

656
Bjorn.py
View File

@@ -1,173 +1,625 @@
# Bjorn.py
# Main entry point and supervisor for the Bjorn project
# Manages lifecycle of threads, health monitoring, and crash protection.
# OPTIMIZED FOR PI ZERO 2: Low CPU overhead, aggressive RAM management.
import atexit
import gc
import logging
import os
import re
import signal
import subprocess
import sys
import threading
import time
import tracemalloc

from comment import Commentaireia
from display import Display, handle_exit_display
from init_shared import shared_data
from logger import Logger
from orchestrator import Orchestrator
from runtime_state_updater import RuntimeStateUpdater
from webapp import web_thread

logger = Logger(name="Bjorn.py", level=logging.DEBUG)

# Guards so overlapping SIGINT/SIGTERM deliveries run exactly one shutdown.
_shutdown_lock = threading.Lock()
_shutdown_started = False

# Single-instance lock state. fcntl is POSIX-only, so it is imported
# defensively and the locking degrades gracefully when unavailable.
_instance_lock_fd = None
_instance_lock_path = "/tmp/bjorn_160226.lock"

try:
    import fcntl
except Exception:
    fcntl = None
def _release_instance_lock():
    """Unlock and close the single-instance lock file, if currently held."""
    global _instance_lock_fd
    fd = _instance_lock_fd
    if fd is None:
        return
    # Clear the global first so a re-entrant call is a no-op.
    _instance_lock_fd = None
    if fcntl is not None:
        try:
            fcntl.flock(fd.fileno(), fcntl.LOCK_UN)
        except Exception:
            pass
    try:
        fd.close()
    except Exception:
        # Best-effort cleanup during shutdown; nothing useful to do on failure.
        pass
def _acquire_instance_lock() -> bool:
    """Ensure only one Bjorn_160226 process can run at once.

    Returns False only when another live process provably holds the lock.
    Any other failure is fail-open (returns True) so a broken lock file
    cannot keep the application from starting.
    """
    global _instance_lock_fd
    if _instance_lock_fd is not None:
        return True  # Lock already held by this process.
    try:
        lock_file = open(_instance_lock_path, "a+", encoding="utf-8")
    except Exception as exc:
        logger.error(f"Unable to open instance lock file {_instance_lock_path}: {exc}")
        return True
    if fcntl is None:
        # No flock support on this platform: record the handle and proceed.
        _instance_lock_fd = lock_file
        return True
    try:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        # Lock acquired: stamp our PID into the file for diagnostics.
        lock_file.seek(0)
        lock_file.truncate()
        lock_file.write(str(os.getpid()))
        lock_file.flush()
    except OSError:
        # Lock is held elsewhere; try to report the owner's PID.
        owner_pid = "unknown"
        try:
            lock_file.seek(0)
            owner_pid = lock_file.read().strip() or "unknown"
        except Exception:
            pass
        logger.critical(f"Another Bjorn instance is already running (pid={owner_pid}).")
        try:
            lock_file.close()
        except Exception:
            pass
        return False
    _instance_lock_fd = lock_file
    return True
class HealthMonitor(threading.Thread):
    """Periodic runtime health logger (threads/fd/rss/queue/epd metrics)."""

    def __init__(self, shared_data_, interval_s: int = 60):
        super().__init__(daemon=True, name="HealthMonitor")
        self.shared_data = shared_data_
        # Clamp to >= 10 s so a misconfigured interval cannot busy-loop.
        self.interval_s = max(10, int(interval_s))
        self._stop_event = threading.Event()
        # tracemalloc snapshot state for periodic diff reports.
        self._tm_prev_snapshot = None
        self._tm_last_report = 0.0

    def stop(self):
        """Ask the monitor loop to exit at its next wakeup."""
        self._stop_event.set()

    def _fd_count(self) -> int:
        """Return the number of open file descriptors, or -1 if /proc is unavailable."""
        try:
            return len(os.listdir("/proc/self/fd"))
        except Exception:
            return -1

    def _rss_kb(self) -> int:
        """Return resident set size in KiB from /proc/self/status, or -1 on failure."""
        try:
            with open("/proc/self/status", "r", encoding="utf-8") as fh:
                for line in fh:
                    if line.startswith("VmRSS:"):
                        parts = line.split()
                        if len(parts) >= 2:
                            return int(parts[1])
        except Exception:
            pass
        return -1

    def _queue_counts(self):
        """Return (pending, running, scheduled) counts from action_queue; -1 each on error."""
        pending = running = scheduled = -1
        try:
            # Using query_one safe method from database
            row = self.shared_data.db.query_one(
                """
                SELECT
                SUM(CASE WHEN status='pending' THEN 1 ELSE 0 END) AS pending,
                SUM(CASE WHEN status='running' THEN 1 ELSE 0 END) AS running,
                SUM(CASE WHEN status='scheduled' THEN 1 ELSE 0 END) AS scheduled
                FROM action_queue
                """
            )
            if row:
                pending = int(row.get("pending") or 0)
                running = int(row.get("running") or 0)
                scheduled = int(row.get("scheduled") or 0)
        except Exception as exc:
            logger.error_throttled(
                f"Health monitor queue count query failed: {exc}",
                key="health_queue_counts",
                interval_s=120,
            )
        return pending, running, scheduled

    def run(self):
        while not self._stop_event.wait(self.interval_s):
            try:
                threads = threading.enumerate()
                thread_count = len(threads)
                top_threads = ",".join(t.name for t in threads[:8])
                fd_count = self._fd_count()
                rss_kb = self._rss_kb()
                pending, running, scheduled = self._queue_counts()
                # Lock to safely read shared metrics without race conditions
                with self.shared_data.health_lock:
                    display_metrics = dict(getattr(self.shared_data, "display_runtime_metrics", {}) or {})
                epd_enabled = int(display_metrics.get("epd_enabled", 0))
                epd_failures = int(display_metrics.get("failed_updates", 0))
                epd_reinit = int(display_metrics.get("reinit_attempts", 0))
                epd_headless = int(display_metrics.get("headless", 0))
                epd_last_success = display_metrics.get("last_success_epoch", 0)
                # FIX: previously several collected metrics (fd_count, queue
                # running/scheduled, thread names, epd enabled/headless/last
                # success) were computed but never logged; emit them all.
                logger.info(
                    "health "
                    f"thread_count={thread_count} "
                    f"top_threads={top_threads} "
                    f"fd_count={fd_count} "
                    f"rss_kb={rss_kb} "
                    f"queue_pending={pending} "
                    f"queue_running={running} "
                    f"queue_scheduled={scheduled} "
                    f"epd_enabled={epd_enabled} "
                    f"epd_failures={epd_failures} "
                    f"epd_reinit={epd_reinit} "
                    f"epd_headless={epd_headless} "
                    f"epd_last_success={epd_last_success} "
                )
                # Optional: tracemalloc report (only if enabled via PYTHONTRACEMALLOC or tracemalloc.start()).
                try:
                    if tracemalloc.is_tracing():
                        now = time.monotonic()
                        tm_interval = float(self.shared_data.config.get("tracemalloc_report_interval_s", 300) or 300)
                        if tm_interval > 0 and (now - self._tm_last_report) >= tm_interval:
                            self._tm_last_report = now
                            top_n = int(self.shared_data.config.get("tracemalloc_top_n", 10) or 10)
                            top_n = max(3, min(top_n, 25))
                            snap = tracemalloc.take_snapshot()
                            if self._tm_prev_snapshot is not None:
                                # Diff against the previous snapshot to show growth.
                                stats = snap.compare_to(self._tm_prev_snapshot, "lineno")[:top_n]
                                logger.info(f"mem_top (tracemalloc diff, top_n={top_n})")
                                for st in stats:
                                    logger.info(f"mem_top {st}")
                            else:
                                stats = snap.statistics("lineno")[:top_n]
                                logger.info(f"mem_top (tracemalloc, top_n={top_n})")
                                for st in stats:
                                    logger.info(f"mem_top {st}")
                            self._tm_prev_snapshot = snap
                except Exception as exc:
                    logger.error_throttled(
                        f"Health monitor tracemalloc failure: {exc}",
                        key="health_tracemalloc_error",
                        interval_s=300,
                    )
            except Exception as exc:
                logger.error_throttled(
                    f"Health monitor loop failure: {exc}",
                    key="health_loop_error",
                    interval_s=120,
                )
class Bjorn:
    """Main class for Bjorn. Manages orchestration lifecycle."""

    def __init__(self, shared_data_):
        self.shared_data = shared_data_
        self.commentaire_ia = Commentaireia()
        self.orchestrator_thread = None
        self.orchestrator = None
        self.network_connected = False
        self.wifi_connected = False
        self.previous_network_connected = None
        self._orch_lock = threading.Lock()
        self._last_net_check = 0  # Throttling for network scan
        self._last_orch_stop_attempt = 0.0

    def run(self):
        """Main loop for Bjorn. Waits for network and starts/stops Orchestrator based on mode."""
        if hasattr(self.shared_data, "startup_delay") and self.shared_data.startup_delay > 0:
            logger.info(f"Waiting for startup delay: {self.shared_data.startup_delay} seconds")
            time.sleep(self.shared_data.startup_delay)
        backoff_s = 1.0
        while not self.shared_data.should_exit:
            try:
                # Manual mode must stop orchestration so the user keeps full control.
                if self.shared_data.operation_mode == "MANUAL":
                    # Avoid spamming stop requests if already stopped.
                    if self.orchestrator_thread is not None and self.orchestrator_thread.is_alive():
                        self.stop_orchestrator()
                else:
                    self.check_and_start_orchestrator()
                time.sleep(5)
                backoff_s = 1.0  # Reset backoff on success
            except Exception as exc:
                logger.error(f"Bjorn main loop error: {exc}")
                logger.error_throttled(
                    "Bjorn main loop entering backoff due to repeated errors",
                    key="bjorn_main_loop_backoff",
                    interval_s=60,
                )
                # Exponential backoff keeps a persistent fault from spinning the CPU.
                time.sleep(backoff_s)
                backoff_s = min(backoff_s * 2.0, 30.0)

    def check_and_start_orchestrator(self):
        """Start the orchestrator when the network is up; no-op in MANUAL mode."""
        if self.shared_data.operation_mode == "MANUAL":
            return
        if self.is_network_connected():
            self.wifi_connected = True
            if self.orchestrator_thread is None or not self.orchestrator_thread.is_alive():
                self.start_orchestrator()
        else:
            self.wifi_connected = False
            logger.info_throttled(
                "Waiting for network connection to start Orchestrator...",
                key="bjorn_wait_network",
                interval_s=30,
            )

    def start_orchestrator(self):
        """Start the orchestrator thread (idempotent, gated on network state)."""
        self.is_network_connected()  # Refresh the cached connectivity flag first.
        with self._orch_lock:
            # Re-check network inside lock
            if not self.network_connected:
                return
            if self.orchestrator_thread is not None and self.orchestrator_thread.is_alive():
                logger.debug("Orchestrator thread is already running.")
                return
            logger.info("Starting Orchestrator thread...")
            self.shared_data.orchestrator_should_exit = False
            self.orchestrator = Orchestrator()
            self.orchestrator_thread = threading.Thread(
                target=self.orchestrator.run,
                daemon=True,
                name="OrchestratorMain",
            )
            self.orchestrator_thread.start()
            logger.info("Orchestrator thread started.")

    def stop_orchestrator(self):
        """Stop the orchestrator thread and leave the application in MANUAL mode."""
        with self._orch_lock:
            thread = self.orchestrator_thread
            if thread is None or not thread.is_alive():
                self.orchestrator_thread = None
                self.orchestrator = None
                return
            # Keep MANUAL sticky so supervisor does not auto-restart orchestration.
            try:
                self.shared_data.operation_mode = "MANUAL"
            except Exception:
                pass
            now = time.time()
            # Throttle the "stopping" log to at most one line per 10 s of retries.
            if now - self._last_orch_stop_attempt >= 10.0:
                logger.info("Stop requested: stopping Orchestrator")
            self._last_orch_stop_attempt = now
            self.shared_data.orchestrator_should_exit = True
            self.shared_data.queue_event.set()  # Wake up thread
            thread.join(timeout=10.0)
            if thread.is_alive():
                # Leave references intact so a later call can retry the join.
                logger.warning_throttled(
                    "Orchestrator thread did not stop gracefully",
                    key="orch_stop_not_graceful",
                    interval_s=20,
                )
                return
            self.orchestrator_thread = None
            self.orchestrator = None
            self.shared_data.bjorn_orch_status = "IDLE"
            self.shared_data.bjorn_status_text2 = ""
            self.shared_data.manual_mode = True

    def is_network_connected(self):
        """Checks for network connectivity with throttling and low-CPU checks."""
        now = time.time()
        # Throttling: Do not scan more than once every 10 seconds
        if now - self._last_net_check < 10:
            return self.network_connected
        self._last_net_check = now

        def interface_has_ip(interface_name):
            # True when the interface exists and holds an IPv4 address.
            try:
                # OPTIMIZATION: Check /sys/class/net first to avoid spawning
                # a subprocess if the interface doesn't exist.
                if not os.path.exists(f"/sys/class/net/{interface_name}"):
                    return False
                result = subprocess.run(
                    ["ip", "-4", "addr", "show", interface_name],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    timeout=2,
                )
                if result.returncode != 0:
                    return False
                # "inet " (with trailing space) matches IPv4 only, not inet6.
                return "inet " in result.stdout
            except Exception:
                return False

        eth_connected = interface_has_ip("eth0")
        wifi_connected = interface_has_ip("wlan0")
        self.network_connected = eth_connected or wifi_connected
        # Log only on state transitions to keep the journal quiet.
        if self.network_connected != self.previous_network_connected:
            if self.network_connected:
                logger.info(f"Network status changed: Connected (eth0={eth_connected}, wlan0={wifi_connected})")
            else:
                logger.warning("Network status changed: Connection lost")
            self.previous_network_connected = self.network_connected
        return self.network_connected

    @staticmethod
    def start_display(old_display=None):
        """Start the display thread; returns (thread, Display instance)."""
        # Ensure the previous Display's controller is fully stopped to release frames
        if old_display is not None:
            try:
                old_display.display_controller.stop(timeout=3.0)
            except Exception:
                pass
        display = Display(shared_data)
        display_thread = threading.Thread(
            target=display.run,
            daemon=True,
            name="DisplayMain",
        )
        display_thread.start()
        return display_thread, display
def _request_shutdown():
    """Signals all threads to stop (idempotent; sets flags but does not join)."""
    shared_data.should_exit = True
    shared_data.orchestrator_should_exit = True
    shared_data.display_should_exit = True
    shared_data.webapp_should_exit = True
    # Wake the orchestrator if it is blocked waiting on its work queue.
    shared_data.queue_event.set()
def handle_exit(
    sig,
    frame,
    display_thread,
    bjorn_thread,
    web_thread_obj,
    health_thread=None,
    runtime_state_thread=None,
    from_signal=False,
):
    """Orderly shutdown of all threads and resources.

    Callable both from signal handlers (from_signal=True, exits the process)
    and from the supervisor's normal exit path. A second signal delivered
    while a shutdown is already in progress force-exits immediately.
    """
    global _shutdown_started
    with _shutdown_lock:
        if _shutdown_started:
            if from_signal:
                # Second Ctrl-C/TERM: the user wants out now, skip graceful path.
                logger.warning("Forcing exit (SIGINT/SIGTERM received twice)")
                os._exit(130)
            return
        _shutdown_started = True
    logger.info(f"Shutdown signal received: {sig}")
    _request_shutdown()
    # 1. Stop Display (handles EPD cleanup)
    try:
        handle_exit_display(sig, frame, display_thread)
    except Exception:
        pass
    # 2. Stop Health Monitor
    try:
        if health_thread and hasattr(health_thread, "stop"):
            health_thread.stop()
    except Exception:
        pass
    # 2b. Stop Runtime State Updater
    try:
        if runtime_state_thread and hasattr(runtime_state_thread, "stop"):
            runtime_state_thread.stop()
    except Exception:
        pass
    # 3. Stop Web Server
    try:
        if web_thread_obj and hasattr(web_thread_obj, "shutdown"):
            web_thread_obj.shutdown()
    except Exception:
        pass
    # 4. Join all threads (bounded joins so shutdown cannot hang forever)
    for thread in (display_thread, bjorn_thread, web_thread_obj, health_thread, runtime_state_thread):
        try:
            if thread and thread.is_alive():
                thread.join(timeout=5.0)
        except Exception:
            pass
    # 5. Close Database (Prevent corruption)
    try:
        if hasattr(shared_data, "db") and hasattr(shared_data.db, "close"):
            shared_data.db.close()
    except Exception as exc:
        logger.error(f"Database shutdown error: {exc}")
    logger.info("Bjorn stopped. Clean exit.")
    _release_instance_lock()
    if from_signal:
        sys.exit(0)
def _install_thread_excepthook():
    """Route unhandled worker-thread exceptions into the application logger."""

    def _log_unhandled(args):
        logger.error(f"Unhandled thread exception: {args.thread.name} - {args.exc_type.__name__}: {args.exc_value}")
        # Deliberately no forced shutdown here: minor thread glitches should
        # not kill the app, and the Crash Shield handles restarts.

    threading.excepthook = _log_unhandled
if __name__ == "__main__":
    logger.info("Starting threads")
    # Refuse to start a second instance (fail-open on lock-file errors).
    if not _acquire_instance_lock():
        sys.exit(1)
    atexit.register(_release_instance_lock)
    _install_thread_excepthook()
    display_thread = None
    display_instance = None
    bjorn_thread = None
    health_thread = None
    runtime_state_thread = None
    last_gc_time = time.time()
    try:
        logger.info("Bjorn Startup: Loading config...")
        shared_data.load_config()
        logger.info("Starting Runtime State Updater...")
        runtime_state_thread = RuntimeStateUpdater(shared_data)
        runtime_state_thread.start()
        logger.info("Starting Display...")
        shared_data.display_should_exit = False
        display_thread, display_instance = Bjorn.start_display()
        logger.info("Starting Bjorn Core...")
        bjorn = Bjorn(shared_data)
        shared_data.bjorn_instance = bjorn
        bjorn_thread = threading.Thread(target=bjorn.run, daemon=True, name="BjornMain")
        bjorn_thread.start()
        if shared_data.config.get("websrv", False):
            logger.info("Starting Web Server...")
            if not web_thread.is_alive():
                web_thread.start()
        health_interval = int(shared_data.config.get("health_log_interval", 60))
        health_thread = HealthMonitor(shared_data, interval_s=health_interval)
        health_thread.start()
        # Signal Handlers
        exit_handler = lambda s, f: handle_exit(
            s,
            f,
            display_thread,
            bjorn_thread,
            web_thread,
            health_thread,
            runtime_state_thread,
            True,
        )
        signal.signal(signal.SIGINT, exit_handler)
        signal.signal(signal.SIGTERM, exit_handler)
        # --- SUPERVISOR LOOP (Crash Shield) ---
        restart_times = []
        max_restarts = 5
        restart_window_s = 300
        logger.info("Bjorn Supervisor running.")
        while not shared_data.should_exit:
            time.sleep(2)  # CPU Friendly polling
            now = time.time()
            # --- OPTIMIZATION: Periodic Garbage Collection ---
            # Forces cleanup of circular references and free RAM every 2 mins
            if now - last_gc_time > 120:
                gc.collect()
                last_gc_time = now
                logger.debug("System: Forced Garbage Collection executed.")
            # --- CRASH SHIELD: Bjorn Thread ---
            if bjorn_thread and not bjorn_thread.is_alive() and not shared_data.should_exit:
                restart_times = [t for t in restart_times if (now - t) <= restart_window_s]
                restart_times.append(now)
                if len(restart_times) <= max_restarts:
                    logger.warning("Crash Shield: Restarting Bjorn Main Thread")
                    bjorn_thread = threading.Thread(target=bjorn.run, daemon=True, name="BjornMain")
                    bjorn_thread.start()
                else:
                    logger.critical("Crash Shield: Bjorn exceeded restart budget. Shutting down.")
                    _request_shutdown()
                    break
            # --- CRASH SHIELD: Display Thread ---
            if display_thread and not display_thread.is_alive() and not shared_data.should_exit:
                restart_times = [t for t in restart_times if (now - t) <= restart_window_s]
                restart_times.append(now)
                if len(restart_times) <= max_restarts:
                    logger.warning("Crash Shield: Restarting Display Thread")
                    display_thread, display_instance = Bjorn.start_display(old_display=display_instance)
                else:
                    logger.critical("Crash Shield: Display exceeded restart budget. Shutting down.")
                    _request_shutdown()
                    break
            # --- CRASH SHIELD: Runtime State Updater ---
            if runtime_state_thread and not runtime_state_thread.is_alive() and not shared_data.should_exit:
                restart_times = [t for t in restart_times if (now - t) <= restart_window_s]
                restart_times.append(now)
                if len(restart_times) <= max_restarts:
                    logger.warning("Crash Shield: Restarting Runtime State Updater")
                    runtime_state_thread = RuntimeStateUpdater(shared_data)
                    runtime_state_thread.start()
                else:
                    logger.critical("Crash Shield: Runtime State Updater exceeded restart budget. Shutting down.")
                    _request_shutdown()
                    break
        # Exit cleanup
        if health_thread:
            health_thread.stop()
        if runtime_state_thread:
            runtime_state_thread.stop()
        handle_exit(
            signal.SIGTERM,
            None,
            display_thread,
            bjorn_thread,
            web_thread,
            health_thread,
            runtime_state_thread,
            False,
        )
    except Exception as exc:
        logger.critical(f"Critical bootstrap failure: {exc}")
        _request_shutdown()
        # Try to clean up anyway
        try:
            handle_exit(
                signal.SIGTERM,
                None,
                display_thread,
                bjorn_thread,
                web_thread,
                health_thread,
                runtime_state_thread,
                False,
            )
        except Exception:
            # FIX: was a bare `except:` which also swallowed SystemExit.
            pass
        sys.exit(1)