Add RLUtils class for managing RL/AI dashboard endpoints

- Implemented methods for fetching AI stats, training history, and recent experiences.
- Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling.
- Included helper methods for querying the database and sending JSON responses.
- Integrated model metadata extraction for visualization purposes.
This commit is contained in:
Fabien POLLY
2026-02-18 22:36:10 +01:00
parent b8a13cc698
commit eb20b168a6
684 changed files with 53278 additions and 27977 deletions

536
web_utils/debug_utils.py Normal file
View File

@@ -0,0 +1,536 @@
"""
Debug / Profiling utilities for the Bjorn Debug page.
Exposes process-level and per-thread metrics via /proc (no external deps).
Designed for Pi Zero 2: lightweight reads, no subprocess spawning.
OPTIMIZED: minimal allocations, cached tracemalloc, /proc/self/smaps for C memory.
"""
import json
import os
import sys
import threading
import time
import tracemalloc
from logger import Logger
logger = Logger(name="debug_utils")
_SC_CLK_TCK = os.sysconf("SC_CLK_TCK") if hasattr(os, "sysconf") else 100
# ---------------------------------------------------------------------------
# /proc helpers
# ---------------------------------------------------------------------------
def _read_proc_status():
result = {}
try:
with open("/proc/self/status", "r", encoding="utf-8") as f:
for line in f:
if line.startswith("VmRSS:"):
result["vm_rss_kb"] = int(line.split()[1])
elif line.startswith("VmSize:"):
result["vm_size_kb"] = int(line.split()[1])
elif line.startswith("VmPeak:"):
result["vm_peak_kb"] = int(line.split()[1])
elif line.startswith("VmSwap:"):
result["vm_swap_kb"] = int(line.split()[1])
elif line.startswith("FDSize:"):
result["fd_slots"] = int(line.split()[1])
elif line.startswith("Threads:"):
result["kernel_threads"] = int(line.split()[1])
elif line.startswith("RssAnon:"):
result["rss_anon_kb"] = int(line.split()[1])
elif line.startswith("RssFile:"):
result["rss_file_kb"] = int(line.split()[1])
elif line.startswith("RssShmem:"):
result["rss_shmem_kb"] = int(line.split()[1])
except Exception:
pass
return result
def _fd_count():
try:
return len(os.listdir("/proc/self/fd"))
except Exception:
return -1
def _read_open_files():
"""Read open FDs — reuses a single dict to minimize allocations."""
fd_dir = "/proc/self/fd"
fd_map = {}
try:
fds = os.listdir(fd_dir)
except Exception:
return []
for fd in fds:
try:
target = os.readlink(fd_dir + "/" + fd)
except Exception:
target = "???"
if target.startswith("/"):
ftype = "device" if "/dev/" in target else "proc" if target.startswith("/proc/") else "temp" if (target.startswith("/tmp/") or target.startswith("/run/")) else "file"
elif target.startswith("socket:"):
ftype = "socket"
elif target.startswith("pipe:"):
ftype = "pipe"
elif target.startswith("anon_inode:"):
ftype = "anon"
else:
ftype = "other"
entry = fd_map.get(target)
if entry is None:
entry = {"target": target, "type": ftype, "count": 0, "fds": []}
fd_map[target] = entry
entry["count"] += 1
if len(entry["fds"]) < 5:
entry["fds"].append(int(fd))
result = sorted(fd_map.values(), key=lambda x: (-x["count"], x["target"]))
return result
def _read_thread_stats():
threads = []
task_dir = "/proc/self/task"
try:
tids = os.listdir(task_dir)
except Exception:
return threads
for tid in tids:
try:
with open(task_dir + "/" + tid + "/stat", "r", encoding="utf-8") as f:
raw = f.read()
i1 = raw.find("(")
i2 = raw.rfind(")")
if i1 < 0 or i2 < 0:
continue
name = raw[i1 + 1:i2]
fields = raw[i2 + 2:].split()
state = fields[0] if fields else "?"
utime = int(fields[11]) if len(fields) > 11 else 0
stime = int(fields[12]) if len(fields) > 12 else 0
threads.append({
"tid": int(tid),
"name": name,
"state": state,
"cpu_ticks": utime + stime,
})
except Exception:
continue
return threads
def _get_python_threads_rich():
"""Enumerate Python threads with target + current frame. Minimal allocations."""
frames = sys._current_frames()
result = []
for t in threading.enumerate():
ident = t.ident
nid = getattr(t, "native_id", None)
# Target function info
target = getattr(t, "_target", None)
if target is not None:
tf = getattr(target, "__qualname__", getattr(target, "__name__", "?"))
tm = getattr(target, "__module__", "")
# Source file — use __code__ directly (avoids importing inspect)
tfile = ""
code = getattr(target, "__code__", None)
if code:
tfile = getattr(code, "co_filename", "")
else:
tf = "(main)" if t.name == "MainThread" else "(no target)"
tm = ""
tfile = ""
# Current stack — top 5 frames, build compact strings directly
stack = []
frame = frames.get(ident)
depth = 0
while frame is not None and depth < 5:
co = frame.f_code
fn = co.co_filename
# Shorten: last 2 path components
sep = fn.rfind("/")
if sep > 0:
sep2 = fn.rfind("/", 0, sep)
short = fn[sep2 + 1:] if sep2 >= 0 else fn
else:
short = fn
stack.append({
"file": short,
"line": frame.f_lineno,
"func": co.co_name,
})
frame = frame.f_back
depth += 1
# Release frame reference immediately
del frame
result.append({
"name": t.name,
"daemon": t.daemon,
"alive": t.is_alive(),
"ident": ident,
"native_id": nid,
"target_func": tf,
"target_module": tm,
"target_file": tfile,
"stack_top": stack,
})
# Release all frame references
del frames
return result
def _system_cpu_mem():
result = {"cpu_count": 1, "mem_total_kb": 0, "mem_available_kb": 0}
try:
with open("/proc/meminfo", "r", encoding="utf-8") as f:
for line in f:
if line.startswith("MemTotal:"):
result["mem_total_kb"] = int(line.split()[1])
elif line.startswith("MemAvailable:"):
result["mem_available_kb"] = int(line.split()[1])
except Exception:
pass
try:
result["cpu_count"] = len(os.sched_getaffinity(0))
except Exception:
try:
result["cpu_count"] = os.cpu_count() or 1
except Exception:
pass
return result
def _read_smaps_rollup():
"""
Read /proc/self/smaps_rollup for a breakdown of what consumes RSS.
This shows: Shared_Clean, Shared_Dirty, Private_Clean, Private_Dirty,
which helps identify C extension memory vs Python heap vs mmap.
"""
result = {}
try:
with open("/proc/self/smaps_rollup", "r", encoding="utf-8") as f:
for line in f:
parts = line.split()
if len(parts) >= 2:
key = parts[0].rstrip(":")
if key in ("Rss", "Pss", "Shared_Clean", "Shared_Dirty",
"Private_Clean", "Private_Dirty", "Referenced",
"Anonymous", "Swap", "Locked"):
result[key.lower() + "_kb"] = int(parts[1])
except Exception:
pass
return result
# ---------------------------------------------------------------------------
# Cached tracemalloc — take snapshot at most every 5s to reduce overhead
# ---------------------------------------------------------------------------
_tm_cache_lock = threading.Lock()  # guards _tm_cache and _tm_cache_time
_tm_cache = None  # cached result tuple: (current, peak, by_file, by_line)
_tm_cache_time = 0.0  # time.monotonic() of the last snapshot refresh
_TM_CACHE_TTL = 5.0  # seconds between tracemalloc snapshots (they are slow)
def _get_tracemalloc_cached():
"""Return cached tracemalloc data, refreshing at most every 5s."""
global _tm_cache, _tm_cache_time
if not tracemalloc.is_tracing():
return 0, 0, [], []
now = time.monotonic()
with _tm_cache_lock:
if _tm_cache is not None and (now - _tm_cache_time) < _TM_CACHE_TTL:
return _tm_cache
# Take snapshot outside the lock (it's slow)
current, peak = tracemalloc.get_traced_memory()
snap = tracemalloc.take_snapshot()
# Single statistics call — use lineno (more useful), derive file-level client-side
stats_line = snap.statistics("lineno")[:30]
top_by_line = []
file_agg = {}
for s in stats_line:
frame = s.traceback[0] if s.traceback else None
if frame is None:
continue
fn = frame.filename
sep = fn.rfind("/")
if sep > 0:
sep2 = fn.rfind("/", 0, sep)
short = fn[sep2 + 1:] if sep2 >= 0 else fn
else:
short = fn
top_by_line.append({
"file": short,
"full_path": fn,
"line": frame.lineno,
"size_kb": round(s.size / 1024, 1),
"count": s.count,
})
# Aggregate by file
if fn not in file_agg:
file_agg[fn] = {"file": short, "full_path": fn, "size_kb": 0, "count": 0}
file_agg[fn]["size_kb"] += round(s.size / 1024, 1)
file_agg[fn]["count"] += s.count
# Also get file-level stats for files that don't appear in line-level top
stats_file = snap.statistics("filename")[:20]
for s in stats_file:
fn = str(s.traceback) if hasattr(s.traceback, '__str__') else ""
# traceback for filename stats is just the filename
raw_fn = s.traceback[0].filename if s.traceback else fn
if raw_fn not in file_agg:
sep = raw_fn.rfind("/")
if sep > 0:
sep2 = raw_fn.rfind("/", 0, sep)
short = raw_fn[sep2 + 1:] if sep2 >= 0 else raw_fn
else:
short = raw_fn
file_agg[raw_fn] = {"file": short, "full_path": raw_fn, "size_kb": 0, "count": 0}
entry = file_agg[raw_fn]
# Use the larger of aggregated or direct stats
direct_kb = round(s.size / 1024, 1)
if direct_kb > entry["size_kb"]:
entry["size_kb"] = direct_kb
if s.count > entry["count"]:
entry["count"] = s.count
top_by_file = sorted(file_agg.values(), key=lambda x: -x["size_kb"])[:20]
# Release snapshot immediately
del snap
result = (current, peak, top_by_file, top_by_line)
with _tm_cache_lock:
_tm_cache = result
_tm_cache_time = now
return result
# ---------------------------------------------------------------------------
# Snapshot + history ring buffer
# ---------------------------------------------------------------------------
_MAX_HISTORY = 120  # ring-buffer capacity for the metrics history
_history_lock = threading.Lock()  # guards _history
_history = []  # rolling list of compact per-snapshot metric dicts
_prev_thread_ticks = {}  # tid -> cumulative CPU ticks at the previous snapshot
_prev_proc_ticks = 0  # process-wide cumulative ticks at the previous snapshot
_prev_wall = 0.0  # wall-clock time.time() of the previous snapshot
def _take_snapshot():
    """Build one full debug snapshot dict and append a compact row to _history.

    CPU percentages are computed from the delta in /proc tick counters since
    the previous call, so the first call reports ~0%. Mutates the module
    globals that track the previous tick counters and wall-clock time.
    Not thread-safe by itself; assumed to be called from one place at a time.
    """
    global _prev_thread_ticks, _prev_proc_ticks, _prev_wall
    now = time.time()
    # Seconds since last snapshot; assume 1s on the very first call.
    wall_delta = now - _prev_wall if _prev_wall > 0 else 1.0
    # Max clock ticks one CPU could have consumed over the interval.
    tick_budget = wall_delta * _SC_CLK_TCK
    # Process-level metrics from /proc
    status = _read_proc_status()
    fd_open = _fd_count()
    sys_info = _system_cpu_mem()
    smaps = _read_smaps_rollup()
    # Per-thread CPU usage from /proc tick deltas
    raw_threads = _read_thread_stats()
    thread_details = []
    new_ticks_map = {}
    total_proc_ticks = 0
    for t in raw_threads:
        tid = t["tid"]
        # Unknown (new) threads default to their own tick count -> delta 0.
        prev = _prev_thread_ticks.get(tid, t["cpu_ticks"])
        delta = max(0, t["cpu_ticks"] - prev)
        cpu_pct = (delta / tick_budget * 100.0) if tick_budget > 0 else 0.0
        new_ticks_map[tid] = t["cpu_ticks"]
        total_proc_ticks += t["cpu_ticks"]
        thread_details.append({
            "tid": tid,
            "name": t["name"],
            "state": t["state"],
            "cpu_pct": round(cpu_pct, 2),
            "cpu_ticks_total": t["cpu_ticks"],
        })
    thread_details.sort(key=lambda x: x["cpu_pct"], reverse=True)
    # Process-wide CPU% from the sum of thread ticks.
    proc_delta = total_proc_ticks - _prev_proc_ticks if _prev_proc_ticks else 0
    proc_cpu_pct = (proc_delta / tick_budget * 100.0) if tick_budget > 0 else 0.0
    _prev_thread_ticks = new_ticks_map
    _prev_proc_ticks = total_proc_ticks
    _prev_wall = now
    # Python-level thread info (targets + current stacks)
    py_threads = _get_python_threads_rich()
    # Match kernel TIDs to Python threads via native_id (3.8+)
    native_to_py = {}
    for pt in py_threads:
        nid = pt.get("native_id")
        if nid is not None:
            native_to_py[nid] = pt
    for td in thread_details:
        pt = native_to_py.get(td["tid"])
        if pt:
            td["py_name"] = pt["name"]
            td["py_target"] = pt.get("target_func", "")
            td["py_module"] = pt.get("target_module", "")
            td["py_file"] = pt.get("target_file", "")
            if pt.get("stack_top"):
                top = pt["stack_top"][0]
                td["py_current"] = f"{top['file']}:{top['line']} {top['func']}()"
    # tracemalloc (cached; refreshes at most every 5s)
    tm_current, tm_peak, tm_by_file, tm_by_line = _get_tracemalloc_cached()
    # Open file descriptors grouped by target
    open_files = _read_open_files()
    # Memory breakdown
    rss_kb = status.get("vm_rss_kb", 0)
    tm_current_kb = round(tm_current / 1024, 1)
    # C/native memory = RSS - Python traced (approximation)
    rss_anon_kb = status.get("rss_anon_kb", 0)
    rss_file_kb = status.get("rss_file_kb", 0)
    snapshot = {
        "ts": round(now, 3),
        "proc_cpu_pct": round(proc_cpu_pct, 2),
        "rss_kb": rss_kb,
        "vm_size_kb": status.get("vm_size_kb", 0),
        "vm_peak_kb": status.get("vm_peak_kb", 0),
        "vm_swap_kb": status.get("vm_swap_kb", 0),
        "fd_open": fd_open,
        "fd_slots": status.get("fd_slots", 0),
        "kernel_threads": status.get("kernel_threads", 0),
        "py_thread_count": len(py_threads),
        "sys_cpu_count": sys_info["cpu_count"],
        "sys_mem_total_kb": sys_info["mem_total_kb"],
        "sys_mem_available_kb": sys_info["mem_available_kb"],
        # Memory breakdown
        "rss_anon_kb": rss_anon_kb,
        "rss_file_kb": rss_file_kb,
        "rss_shmem_kb": status.get("rss_shmem_kb", 0),
        "private_dirty_kb": smaps.get("private_dirty_kb", 0),
        "private_clean_kb": smaps.get("private_clean_kb", 0),
        "shared_dirty_kb": smaps.get("shared_dirty_kb", 0),
        "shared_clean_kb": smaps.get("shared_clean_kb", 0),
        # Detailed data
        "threads": thread_details,
        "py_threads": py_threads,
        "tracemalloc_active": tracemalloc.is_tracing(),
        "tracemalloc_current_kb": tm_current_kb,
        "tracemalloc_peak_kb": round(tm_peak / 1024, 1),
        "tracemalloc_by_file": tm_by_file,
        "tracemalloc_by_line": tm_by_line,
        "open_files": open_files,
    }
    # Append a compact row to the bounded history ring buffer.
    with _history_lock:
        _history.append({
            "ts": snapshot["ts"],
            "proc_cpu_pct": snapshot["proc_cpu_pct"],
            "rss_kb": rss_kb,
            "fd_open": fd_open,
            "py_thread_count": snapshot["py_thread_count"],
            "kernel_threads": snapshot["kernel_threads"],
            "vm_swap_kb": snapshot["vm_swap_kb"],
            "private_dirty_kb": snapshot["private_dirty_kb"],
        })
        if len(_history) > _MAX_HISTORY:
            del _history[: len(_history) - _MAX_HISTORY]
    return snapshot
# ---------------------------------------------------------------------------
# WebUtils class
# ---------------------------------------------------------------------------
class DebugUtils:
    """HTTP endpoint handlers for the Bjorn debug/profiling page."""

    def __init__(self, shared_data):
        self.shared_data = shared_data

    def get_snapshot(self, handler):
        """Serve a full debug snapshot as JSON (500 on failure)."""
        try:
            self._send_json(handler, _take_snapshot())
        except Exception as exc:
            logger.error(f"debug snapshot error: {exc}")
            self._send_json(handler, {"error": str(exc)}, status=500)

    def get_history(self, handler):
        """Serve the rolling metrics history as JSON (500 on failure)."""
        try:
            with _history_lock:
                rows = list(_history)
            self._send_json(handler, {"history": rows})
        except Exception as exc:
            logger.error(f"debug history error: {exc}")
            self._send_json(handler, {"error": str(exc)}, status=500)

    def toggle_tracemalloc(self, data):
        """Start, stop, or query tracemalloc.

        data["action"] is one of "start", "stop", "status" (default).
        Stopping also invalidates the cached snapshot data.
        """
        global _tm_cache, _tm_cache_time
        action = data.get("action", "status")
        try:
            if action == "start":
                if not tracemalloc.is_tracing():
                    tracemalloc.start(int(data.get("nframes", 10)))
                return {"status": "ok", "tracing": True}
            if action == "stop":
                if tracemalloc.is_tracing():
                    tracemalloc.stop()
                with _tm_cache_lock:
                    _tm_cache = None
                    _tm_cache_time = 0.0
                return {"status": "ok", "tracing": False}
            return {"status": "ok", "tracing": tracemalloc.is_tracing()}
        except Exception as exc:
            return {"status": "error", "message": str(exc)}

    def get_gc_stats(self, handler):
        """Serve garbage-collector counters and thresholds as JSON."""
        import gc
        try:
            gen_counts = gc.get_count()
            gen_thresholds = gc.get_threshold()
            self._send_json(handler, {
                "gc_enabled": gc.isenabled(),
                "counts": {"gen0": gen_counts[0], "gen1": gen_counts[1], "gen2": gen_counts[2]},
                "thresholds": {"gen0": gen_thresholds[0], "gen1": gen_thresholds[1], "gen2": gen_thresholds[2]},
            })
        except Exception as exc:
            self._send_json(handler, {"error": str(exc)}, status=500)

    def force_gc(self, data):
        """Run a full collection; returns the number of objects collected."""
        import gc
        try:
            return {"status": "ok", "collected": gc.collect()}
        except Exception as exc:
            return {"status": "error", "message": str(exc)}

    @staticmethod
    def _send_json(handler, data, status=200):
        """Write a JSON response body with the given HTTP status."""
        body = json.dumps(data, default=str).encode("utf-8")
        handler.send_response(status)
        handler.send_header("Content-Type", "application/json")
        handler.end_headers()
        handler.wfile.write(body)