mirror of
https://github.com/infinition/Bjorn.git
synced 2026-03-10 14:42:04 +00:00
Add RLUtils class for managing RL/AI dashboard endpoints
- Implemented methods for fetching AI stats, training history, and recent experiences.
- Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling.
- Included helper methods for querying the database and sending JSON responses.
- Integrated model metadata extraction for visualization purposes.
This commit is contained in:
536
web_utils/debug_utils.py
Normal file
536
web_utils/debug_utils.py
Normal file
@@ -0,0 +1,536 @@
|
||||
"""
|
||||
Debug / Profiling utilities for the Bjorn Debug page.
|
||||
Exposes process-level and per-thread metrics via /proc (no external deps).
|
||||
Designed for Pi Zero 2: lightweight reads, no subprocess spawning.
|
||||
OPTIMIZED: minimal allocations, cached tracemalloc, /proc/self/smaps for C memory.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import tracemalloc
|
||||
|
||||
from logger import Logger
|
||||
|
||||
# Module-level logger for the debug utilities.
logger = Logger(name="debug_utils")

# Clock ticks per second as reported by os.sysconf; falls back to 100 when
# os.sysconf is not available. Used to convert an elapsed-seconds interval
# into a tick budget when deriving CPU percentages from /proc tick counters.
_SC_CLK_TCK = os.sysconf("SC_CLK_TCK") if hasattr(os, "sysconf") else 100
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# /proc helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _read_proc_status():
    """Collect selected counters from /proc/self/status.

    Returns a dict mapping snapshot keys (``vm_rss_kb``, ``fd_slots``,
    ``kernel_threads``, ...) to the integer found in the second
    whitespace-separated column of the matching line. Any failure (missing
    file, unparsable value) stops parsing; whatever was gathered up to that
    point is returned.
    """
    # Label before the first ":" in the status file -> key in the result.
    wanted = {
        "VmRSS": "vm_rss_kb",
        "VmSize": "vm_size_kb",
        "VmPeak": "vm_peak_kb",
        "VmSwap": "vm_swap_kb",
        "FDSize": "fd_slots",
        "Threads": "kernel_threads",
        "RssAnon": "rss_anon_kb",
        "RssFile": "rss_file_kb",
        "RssShmem": "rss_shmem_kb",
    }
    collected = {}
    try:
        with open("/proc/self/status", "r", encoding="utf-8") as status_file:
            for row in status_file:
                label, colon, _rest = row.partition(":")
                out_key = wanted.get(label) if colon else None
                if out_key is not None:
                    collected[out_key] = int(row.split()[1])
    except Exception:
        pass
    return collected
|
||||
|
||||
|
||||
def _fd_count():
    """Return the number of entries in /proc/self/fd, or -1 if it cannot be listed."""
    try:
        entries = os.listdir("/proc/self/fd")
    except Exception:
        return -1
    return len(entries)
|
||||
|
||||
|
||||
def _classify_fd_target(target):
    """Map a /proc/self/fd symlink target to a coarse category label."""
    if target.startswith("/"):
        # Prefix tests only: a substring test would label any path that merely
        # contains a "dev" directory (e.g. /home/user/dev/x) as a device.
        if target.startswith("/dev/"):
            return "device"
        if target.startswith("/proc/"):
            return "proc"
        if target.startswith(("/tmp/", "/run/")):
            return "temp"
        return "file"
    for prefix, label in (("socket:", "socket"), ("pipe:", "pipe"), ("anon_inode:", "anon")):
        if target.startswith(prefix):
            return label
    return "other"


def _read_open_files():
    """List the process's open file descriptors, grouped by link target.

    Each entry is a dict with:
      * ``target`` - the readlink() result for the descriptor ("???" when the
        link could not be read),
      * ``type``   - one of device / proc / temp / file / socket / pipe /
        anon / other,
      * ``count``  - how many descriptors point at that target,
      * ``fds``    - up to five of the descriptor numbers.

    Entries are sorted by descending count, then by target. Returns an empty
    list when /proc/self/fd cannot be listed.
    """
    fd_dir = "/proc/self/fd"
    try:
        fds = os.listdir(fd_dir)
    except Exception:
        return []

    grouped = {}
    for fd in fds:
        try:
            target = os.readlink(fd_dir + "/" + fd)
        except Exception:
            # Typically the descriptor went away after the directory listing.
            target = "???"

        entry = grouped.get(target)
        if entry is None:
            entry = grouped[target] = {
                "target": target,
                "type": _classify_fd_target(target),
                "count": 0,
                "fds": [],
            }
        entry["count"] += 1
        # Keep only a small sample of fd numbers per target to bound payload size.
        if len(entry["fds"]) < 5:
            entry["fds"].append(int(fd))

    return sorted(grouped.values(), key=lambda e: (-e["count"], e["target"]))
|
||||
|
||||
|
||||
def _read_thread_stats():
    """Read name, state and accumulated CPU ticks for each entry in /proc/self/task.

    Returns a list of dicts with keys ``tid``, ``name``, ``state`` and
    ``cpu_ticks`` (utime + stime). Tasks whose stat file cannot be read or
    parsed are skipped; if the task directory itself cannot be listed an
    empty list is returned.
    """
    task_root = "/proc/self/task"
    collected = []
    try:
        task_ids = os.listdir(task_root)
    except Exception:
        return collected

    for task_id in task_ids:
        try:
            with open(task_root + "/" + task_id + "/stat", "r", encoding="utf-8") as stat_file:
                text = stat_file.read()

            # The thread name sits between the first "(" and the last ")";
            # using rfind keeps names that themselves contain ")" intact.
            open_paren = text.find("(")
            close_paren = text.rfind(")")
            if open_paren < 0 or close_paren < 0:
                continue

            # Columns after ") ": index 0 is the state, 11 utime, 12 stime.
            tail = text[close_paren + 2:].split()
            user_ticks = int(tail[11]) if len(tail) > 11 else 0
            sys_ticks = int(tail[12]) if len(tail) > 12 else 0

            collected.append({
                "tid": int(task_id),
                "name": text[open_paren + 1:close_paren],
                "state": tail[0] if tail else "?",
                "cpu_ticks": user_ticks + sys_ticks,
            })
        except Exception:
            continue
    return collected
|
||||
|
||||
|
||||
def _get_python_threads_rich():
    """Describe every thread known to ``threading``: identity, target and stack top.

    Returns a list of dicts with keys ``name``, ``daemon``, ``alive``,
    ``ident``, ``native_id``, ``target_func``, ``target_module``,
    ``target_file`` and ``stack_top``. ``stack_top`` holds up to five frames,
    innermost first, each as ``{"file", "line", "func"}`` where ``file`` is
    shortened to its last two path components.
    """
    live_frames = sys._current_frames()
    described = []

    for thread in threading.enumerate():
        thread_ident = thread.ident
        native = getattr(thread, "native_id", None)

        # Describe the callable the thread was started with, if it has one.
        entry_point = getattr(thread, "_target", None)
        if entry_point is None:
            func_name = "(main)" if thread.name == "MainThread" else "(no target)"
            module_name = ""
            source_file = ""
        else:
            func_name = getattr(entry_point, "__qualname__", getattr(entry_point, "__name__", "?"))
            module_name = getattr(entry_point, "__module__", "")
            # Read the file from __code__ directly rather than via inspect.
            code_obj = getattr(entry_point, "__code__", None)
            source_file = getattr(code_obj, "co_filename", "") if code_obj else ""

        # Walk outward from the innermost frame, keeping at most five entries.
        top_frames = []
        current = live_frames.get(thread_ident)
        while current is not None and len(top_frames) < 5:
            code = current.f_code
            path = code.co_filename
            last_sep = path.rfind("/")
            if last_sep > 0:
                prev_sep = path.rfind("/", 0, last_sep)
                tail = path if prev_sep < 0 else path[prev_sep + 1:]
            else:
                tail = path
            top_frames.append({
                "file": tail,
                "line": current.f_lineno,
                "func": code.co_name,
            })
            current = current.f_back
        # Drop the frame reference before moving on to the next thread.
        del current

        described.append({
            "name": thread.name,
            "daemon": thread.daemon,
            "alive": thread.is_alive(),
            "ident": thread_ident,
            "native_id": native,
            "target_func": func_name,
            "target_module": module_name,
            "target_file": source_file,
            "stack_top": top_frames,
        })

    # Drop the frame mapping so no frame objects are kept alive by this call.
    del live_frames
    return described
|
||||
|
||||
|
||||
def _system_cpu_mem():
    """Return system-wide CPU count and memory totals.

    The result always has the keys ``cpu_count``, ``mem_total_kb`` and
    ``mem_available_kb``. Memory figures come from /proc/meminfo and stay 0
    if it cannot be read. The CPU count is the size of this process's
    scheduler affinity set, falling back to ``os.cpu_count()`` and finally 1.
    """
    info = {"cpu_count": 1, "mem_total_kb": 0, "mem_available_kb": 0}
    meminfo_fields = {"MemTotal": "mem_total_kb", "MemAvailable": "mem_available_kb"}

    try:
        with open("/proc/meminfo", "r", encoding="utf-8") as meminfo:
            for row in meminfo:
                label, colon, _rest = row.partition(":")
                if colon and label in meminfo_fields:
                    info[meminfo_fields[label]] = int(row.split()[1])
    except Exception:
        pass

    try:
        info["cpu_count"] = len(os.sched_getaffinity(0))
    except Exception:
        try:
            info["cpu_count"] = os.cpu_count() or 1
        except Exception:
            pass

    return info
|
||||
|
||||
|
||||
def _read_smaps_rollup():
    """Read selected totals from /proc/self/smaps_rollup.

    For each tracked field (Rss, Pss, Shared_Clean, Shared_Dirty,
    Private_Clean, Private_Dirty, Referenced, Anonymous, Swap, Locked) the
    second column is stored as an int under ``<lowercased field>_kb``.
    Returns whatever was parsed before any error; an empty dict when the
    file is unavailable.
    """
    tracked = (
        "Rss", "Pss", "Shared_Clean", "Shared_Dirty",
        "Private_Clean", "Private_Dirty", "Referenced",
        "Anonymous", "Swap", "Locked",
    )
    totals = {}
    try:
        with open("/proc/self/smaps_rollup", "r", encoding="utf-8") as rollup:
            for row in rollup:
                tokens = row.split()
                if len(tokens) < 2:
                    continue
                label = tokens[0].rstrip(":")
                if label not in tracked:
                    continue
                totals[label.lower() + "_kb"] = int(tokens[1])
    except Exception:
        pass
    return totals
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cached tracemalloc — take snapshot at most every 5s to reduce overhead
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_tm_cache_lock = threading.Lock()
_tm_cache = None  # (current, peak, by_file, by_line)
_tm_cache_time = 0.0  # time.monotonic() at which _tm_cache was produced
_TM_CACHE_TTL = 5.0  # seconds


def _tm_short_path(path):
    """Return the last two components of *path* (or *path* unchanged if it has fewer)."""
    sep = path.rfind("/")
    if sep > 0:
        sep2 = path.rfind("/", 0, sep)
        if sep2 >= 0:
            return path[sep2 + 1:]
    return path


def _get_tracemalloc_cached():
    """Return tracemalloc data, refreshing at most every ``_TM_CACHE_TTL`` seconds.

    Returns a 4-tuple ``(current_bytes, peak_bytes, top_by_file, top_by_line)``:
      * ``top_by_line`` - up to 30 dicts ``{"file", "full_path", "line",
        "size_kb", "count"}`` from the "lineno" statistics,
      * ``top_by_file`` - up to 20 dicts ``{"file", "full_path", "size_kb",
        "count"}``, largest first.

    When tracemalloc is not tracing (including when it is stopped while the
    snapshot is being taken) the result is ``(0, 0, [], [])``.
    """
    global _tm_cache, _tm_cache_time

    if not tracemalloc.is_tracing():
        return 0, 0, [], []

    now = time.monotonic()
    with _tm_cache_lock:
        if _tm_cache is not None and (now - _tm_cache_time) < _TM_CACHE_TTL:
            return _tm_cache

    # Take the snapshot outside the lock (it is slow).
    try:
        current, peak = tracemalloc.get_traced_memory()
        snap = tracemalloc.take_snapshot()
    except RuntimeError:
        # Tracing was stopped between the is_tracing() check and the snapshot.
        return 0, 0, [], []

    top_by_line = []
    # full path -> [bytes, block count]. Bytes are accumulated unrounded and
    # converted to KB once at the end, so rounding error does not compound.
    file_totals = {}

    for stat in snap.statistics("lineno")[:30]:
        if not stat.traceback:
            continue
        frame = stat.traceback[0]
        path = frame.filename
        top_by_line.append({
            "file": _tm_short_path(path),
            "full_path": path,
            "line": frame.lineno,
            "size_kb": round(stat.size / 1024, 1),
            "count": stat.count,
        })
        totals = file_totals.setdefault(path, [0, 0])
        totals[0] += stat.size
        totals[1] += stat.count

    # File-level statistics cover files absent from the line-level top list
    # and correct the partial sums above: keep the larger of the two figures.
    for stat in snap.statistics("filename")[:20]:
        if not stat.traceback:
            continue
        path = stat.traceback[0].filename
        totals = file_totals.setdefault(path, [0, 0])
        if stat.size > totals[0]:
            totals[0] = stat.size
        if stat.count > totals[1]:
            totals[1] = stat.count

    # Release the snapshot as soon as the statistics have been extracted.
    del snap

    ranked = sorted(file_totals.items(), key=lambda item: -item[1][0])[:20]
    top_by_file = [
        {
            "file": _tm_short_path(path),
            "full_path": path,
            "size_kb": round(size_bytes / 1024, 1),
            "count": count,
        }
        for path, (size_bytes, count) in ranked
    ]

    result = (current, peak, top_by_file, top_by_line)
    with _tm_cache_lock:
        _tm_cache = result
        _tm_cache_time = now

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Snapshot + history ring buffer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_MAX_HISTORY = 120  # maximum number of compact samples kept in _history
_history_lock = threading.Lock()
_history = []
_prev_thread_ticks = {}  # tid -> cumulative CPU ticks at the previous sample
_prev_proc_ticks = 0  # sum of all thread ticks at the previous sample
_prev_wall = 0.0  # time.monotonic() of the previous sample (0.0 = none yet)


def _annotate_with_python_threads(thread_details, py_threads):
    """Copy Python-level name/target/current-frame info onto matching kernel thread rows."""
    by_native_id = {}
    for pt in py_threads:
        nid = pt.get("native_id")
        if nid is not None:
            by_native_id[nid] = pt

    for td in thread_details:
        pt = by_native_id.get(td["tid"])
        if not pt:
            continue
        td["py_name"] = pt["name"]
        td["py_target"] = pt.get("target_func", "")
        td["py_module"] = pt.get("target_module", "")
        td["py_file"] = pt.get("target_file", "")
        if pt.get("stack_top"):
            top = pt["stack_top"][0]
            td["py_current"] = f"{top['file']}:{top['line']} {top['func']}()"


def _take_snapshot():
    """Build a full debug snapshot and append a compact sample to the history.

    CPU percentages are computed from the change in per-thread tick counters
    since the previous call, relative to the ticks available in the elapsed
    interval (``_SC_CLK_TCK`` per second). The first call has no baseline, so
    it reports 0 for all CPU figures.

    Returns the snapshot dict (process/system memory figures, FD counts,
    kernel and Python thread details, tracemalloc data and open files). As a
    side effect it updates the module-level previous-sample state and trims
    ``_history`` to ``_MAX_HISTORY`` entries.
    """
    global _prev_thread_ticks, _prev_proc_ticks, _prev_wall

    now = time.time()  # wall-clock timestamp reported in the snapshot
    # Measure the interval with the monotonic clock: the wall clock can jump
    # (e.g. an NTP step), which would give a negative or inflated tick budget.
    mono_now = time.monotonic()
    wall_delta = mono_now - _prev_wall if _prev_wall > 0 else 1.0
    tick_budget = wall_delta * _SC_CLK_TCK

    # Process-level
    status = _read_proc_status()
    fd_open = _fd_count()
    sys_info = _system_cpu_mem()
    smaps = _read_smaps_rollup()

    # Thread CPU from /proc
    raw_threads = _read_thread_stats()
    thread_details = []
    new_ticks_map = {}
    total_proc_ticks = 0
    live_delta_sum = 0

    for t in raw_threads:
        tid = t["tid"]
        ticks = t["cpu_ticks"]
        # A thread seen for the first time has no baseline: report 0 for it.
        prev = _prev_thread_ticks.get(tid, ticks)
        delta = max(0, ticks - prev)
        cpu_pct = (delta / tick_budget * 100.0) if tick_budget > 0 else 0.0
        new_ticks_map[tid] = ticks
        total_proc_ticks += ticks
        live_delta_sum += delta
        thread_details.append({
            "tid": tid,
            "name": t["name"],
            "state": t["state"],
            "cpu_pct": round(cpu_pct, 2),
            "cpu_ticks_total": ticks,
        })

    thread_details.sort(key=lambda x: x["cpu_pct"], reverse=True)

    if _prev_proc_ticks:
        # The summed total drops when a thread exits, which would make the
        # plain difference negative; the per-thread delta sum is a floor that
        # is never below zero.
        proc_delta = max(live_delta_sum, total_proc_ticks - _prev_proc_ticks)
    else:
        proc_delta = 0
    proc_cpu_pct = (proc_delta / tick_budget * 100.0) if tick_budget > 0 else 0.0

    _prev_thread_ticks = new_ticks_map
    _prev_proc_ticks = total_proc_ticks
    _prev_wall = mono_now

    # Python threads, matched to kernel TIDs via native_id
    py_threads = _get_python_threads_rich()
    _annotate_with_python_threads(thread_details, py_threads)

    # tracemalloc (cached inside the helper)
    tm_current, tm_peak, tm_by_file, tm_by_line = _get_tracemalloc_cached()

    # Open files
    open_files = _read_open_files()

    rss_kb = status.get("vm_rss_kb", 0)

    snapshot = {
        "ts": round(now, 3),
        "proc_cpu_pct": round(proc_cpu_pct, 2),
        "rss_kb": rss_kb,
        "vm_size_kb": status.get("vm_size_kb", 0),
        "vm_peak_kb": status.get("vm_peak_kb", 0),
        "vm_swap_kb": status.get("vm_swap_kb", 0),
        "fd_open": fd_open,
        "fd_slots": status.get("fd_slots", 0),
        "kernel_threads": status.get("kernel_threads", 0),
        "py_thread_count": len(py_threads),
        "sys_cpu_count": sys_info["cpu_count"],
        "sys_mem_total_kb": sys_info["mem_total_kb"],
        "sys_mem_available_kb": sys_info["mem_available_kb"],
        # Memory breakdown
        "rss_anon_kb": status.get("rss_anon_kb", 0),
        "rss_file_kb": status.get("rss_file_kb", 0),
        "rss_shmem_kb": status.get("rss_shmem_kb", 0),
        "private_dirty_kb": smaps.get("private_dirty_kb", 0),
        "private_clean_kb": smaps.get("private_clean_kb", 0),
        "shared_dirty_kb": smaps.get("shared_dirty_kb", 0),
        "shared_clean_kb": smaps.get("shared_clean_kb", 0),
        # Data
        "threads": thread_details,
        "py_threads": py_threads,
        "tracemalloc_active": tracemalloc.is_tracing(),
        "tracemalloc_current_kb": round(tm_current / 1024, 1),
        "tracemalloc_peak_kb": round(tm_peak / 1024, 1),
        "tracemalloc_by_file": tm_by_file,
        "tracemalloc_by_line": tm_by_line,
        "open_files": open_files,
    }

    # Only a compact subset of each snapshot is kept for the history view.
    with _history_lock:
        _history.append({
            "ts": snapshot["ts"],
            "proc_cpu_pct": snapshot["proc_cpu_pct"],
            "rss_kb": rss_kb,
            "fd_open": fd_open,
            "py_thread_count": snapshot["py_thread_count"],
            "kernel_threads": snapshot["kernel_threads"],
            "vm_swap_kb": snapshot["vm_swap_kb"],
            "private_dirty_kb": snapshot["private_dirty_kb"],
        })
        if len(_history) > _MAX_HISTORY:
            del _history[: len(_history) - _MAX_HISTORY]

    return snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DebugUtils class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class DebugUtils:
    """Endpoints for the Debug page: snapshots, history, tracemalloc and GC controls.

    Methods taking ``handler`` write a JSON HTTP response through it; methods
    taking ``data`` return a plain dict describing the outcome.
    """

    def __init__(self, shared_data):
        self.shared_data = shared_data

    def get_snapshot(self, handler):
        """Send a fresh debug snapshot as JSON, or a 500 JSON error on failure."""
        try:
            data = _take_snapshot()
            self._send_json(handler, data)
        except Exception as exc:
            logger.error(f"debug snapshot error: {exc}")
            self._send_json(handler, {"error": str(exc)}, status=500)

    def get_history(self, handler):
        """Send the retained history samples as ``{"history": [...]}``."""
        try:
            # Copy under the lock so serialisation happens without holding it.
            with _history_lock:
                data = list(_history)
            self._send_json(handler, {"history": data})
        except Exception as exc:
            logger.error(f"debug history error: {exc}")
            self._send_json(handler, {"error": str(exc)}, status=500)

    def toggle_tracemalloc(self, data):
        """Start, stop or query tracemalloc.

        ``data`` may carry ``action`` ("start", "stop", anything else means
        status) and, for "start", ``nframes`` (default 10). A missing/empty
        ``data`` is treated as a status query. Returns
        ``{"status": "ok", "tracing": bool}`` or
        ``{"status": "error", "message": str}``.
        """
        global _tm_cache, _tm_cache_time
        try:
            data = data or {}
            action = data.get("action", "status")
            if action == "start":
                if not tracemalloc.is_tracing():
                    tracemalloc.start(int(data.get("nframes", 10)))
                return {"status": "ok", "tracing": True}
            elif action == "stop":
                if tracemalloc.is_tracing():
                    tracemalloc.stop()
                # Drop cached statistics so stale data is not served later.
                with _tm_cache_lock:
                    _tm_cache = None
                    _tm_cache_time = 0.0
                return {"status": "ok", "tracing": False}
            else:
                return {"status": "ok", "tracing": tracemalloc.is_tracing()}
        except Exception as exc:
            return {"status": "error", "message": str(exc)}

    def get_gc_stats(self, handler):
        """Send garbage-collector state (enabled flag, counts, thresholds) as JSON."""
        import gc
        try:
            counts = gc.get_count()
            thresholds = gc.get_threshold()
            self._send_json(handler, {
                "gc_enabled": gc.isenabled(),
                "counts": {"gen0": counts[0], "gen1": counts[1], "gen2": counts[2]},
                "thresholds": {"gen0": thresholds[0], "gen1": thresholds[1], "gen2": thresholds[2]},
            })
        except Exception as exc:
            self._send_json(handler, {"error": str(exc)}, status=500)

    def force_gc(self, data):
        """Run a full garbage collection and report ``gc.collect()``'s return value."""
        import gc
        try:
            return {"status": "ok", "collected": gc.collect()}
        except Exception as exc:
            return {"status": "error", "message": str(exc)}

    @staticmethod
    def _send_json(handler, data, status=200):
        """Serialise ``data`` to JSON and write it as a complete HTTP response.

        The body is encoded before any header is emitted, so a serialisation
        error raises without leaving a half-sent response on the connection.
        Values json cannot encode natively are converted with ``str``.
        """
        body = json.dumps(data, default=str).encode("utf-8")
        handler.send_response(status)
        handler.send_header("Content-Type", "application/json")
        handler.send_header("Content-Length", str(len(body)))
        handler.end_headers()
        handler.wfile.write(body)
|
||||
Reference in New Issue
Block a user