Files
Bjorn/data_consolidator.py
Fabien POLLY eb20b168a6 Add RLUtils class for managing RL/AI dashboard endpoints
- Implemented methods for fetching AI stats, training history, and recent experiences.
- Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling.
- Included helper methods for querying the database and sending JSON responses.
- Integrated model metadata extraction for visualization purposes.
2026-02-18 22:36:10 +01:00

830 lines
32 KiB
Python

"""
data_consolidator.py - Data Consolidation Engine for Deep Learning
═══════════════════════════════════════════════════════════════════════════
Purpose:
Consolidate logged features into training-ready datasets.
Prepare data exports for deep learning on external PC.
Features:
- Aggregate features across time windows
- Compute statistical features
- Create feature vectors for neural networks
- Export in formats ready for TensorFlow/PyTorch
- Incremental consolidation (low memory footprint)
Author: Bjorn Team
Version: 2.0.0
"""
import json
import csv
import time
import gzip
import heapq
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
from logger import Logger
logger = Logger(name="data_consolidator.py", level=20)
try:
import requests
except ImportError:
requests = None
class DataConsolidator:
"""
Consolidates raw feature logs into training datasets.
Optimized for Raspberry Pi Zero - processes in batches.
"""
def __init__(self, shared_data, export_dir: str = None):
    """
    Set up the consolidator and make sure its export directory exists.

    Args:
        shared_data: SharedData instance. Its `db` attribute is kept as
            `self.db`; `ml_exports_dir` (or `data_dir`/ml_exports when that
            attribute is absent) is the default export location.
        export_dir: Directory for export files. When None, the default
            location derived from `shared_data` is used.
    """
    self.shared_data = shared_data
    self.db = shared_data.db

    # An explicit directory wins; otherwise use the shared_data layout
    # (built with pathlib so it stays cross-platform).
    if export_dir is not None:
        chosen_dir = export_dir
    else:
        fallback_dir = Path(shared_data.data_dir) / "ml_exports"
        chosen_dir = getattr(shared_data, 'ml_exports_dir', fallback_dir)
    self.export_dir = Path(chosen_dir)
    self.export_dir.mkdir(parents=True, exist_ok=True)

    # Server health state consumed by orchestrator fallback logic.
    self.last_server_attempted = False
    self.last_server_contact_ok = None

    # Upload retry window: monotonic deadline plus the delay last applied.
    self._upload_backoff_until = 0.0
    self._upload_backoff_current_s = 0.0

    logger.info(f"DataConsolidator initialized, exports: {self.export_dir}")
def _set_server_contact_state(self, attempted: bool, ok: Optional[bool]) -> None:
self.last_server_attempted = bool(attempted)
self.last_server_contact_ok = ok if attempted else None
def _apply_upload_backoff(self, base_backoff_s: int, max_backoff_s: int = 3600) -> int:
"""
Exponential upload retry backoff:
base -> base*2 -> base*4 ... capped at max_backoff_s.
Returns the delay (seconds) applied for the next retry window.
"""
base = max(10, int(base_backoff_s))
cap = max(base, int(max_backoff_s))
prev = float(getattr(self, "_upload_backoff_current_s", 0.0) or 0.0)
if prev <= 0:
delay = base
else:
delay = min(cap, max(base, int(prev * 2)))
self._upload_backoff_current_s = float(delay)
self._upload_backoff_until = time.monotonic() + delay
return int(delay)
# ═══════════════════════════════════════════════════════════════════════
# CONSOLIDATION ENGINE
# ═══════════════════════════════════════════════════════════════════════
def consolidate_features(
    self,
    batch_size: int = None,
    max_batches: Optional[int] = None
) -> Dict[str, int]:
    """
    Consolidate raw features into aggregated feature vectors.

    Repeatedly fetches the oldest `ml_features` rows with `consolidated=0`,
    passes each one to `_consolidate_single_record`, then flags the whole
    batch as consolidated.

    Args:
        batch_size: Rows fetched per batch. Defaults to
            `shared_data.ai_batch_size` (100 when unset); clamped to 1..5000.
        max_batches: Stop after this many batches. None or 0 means no limit.

    Returns:
        Dict of counters:
            records_processed: rows consolidated without error
            records_aggregated: rows folded into the aggregate table
            batches_completed: batches fetched and flagged
            errors: per-record failures, plus one if the run itself aborted
    """
    if batch_size is None:
        batch_size = int(getattr(self.shared_data, "ai_batch_size", 100))
    batch_size = max(1, min(int(batch_size), 5000))

    # Older SQLite builds accept at most 999 bound parameters per statement,
    # so the "mark consolidated" UPDATE is issued in slices of this size.
    id_chunk_size = 500

    stats = {
        'records_processed': 0,
        'records_aggregated': 0,
        'batches_completed': 0,
        'errors': 0
    }
    try:
        # Get unconsolidated records
        unconsolidated = self.db.query("""
            SELECT COUNT(*) as cnt
            FROM ml_features
            WHERE consolidated=0
        """)[0]['cnt']
        if unconsolidated == 0:
            logger.info("No unconsolidated features to process")
            return stats

        logger.info(f"Consolidating {unconsolidated} feature records...")
        batch_count = 0
        while True:
            if max_batches and batch_count >= max_batches:
                break

            # Fetch batch (LIMIT is bound rather than interpolated into the SQL)
            batch = self.db.query("""
                SELECT * FROM ml_features
                WHERE consolidated=0
                ORDER BY timestamp
                LIMIT ?
            """, (batch_size,))
            if not batch:
                break

            # Process batch
            for record in batch:
                try:
                    self._consolidate_single_record(record)
                except Exception as e:
                    logger.error(f"Error consolidating record {record['id']}: {e}")
                    stats['errors'] += 1
                else:
                    stats['records_processed'] += 1
                    stats['records_aggregated'] += 1

            # Mark as consolidated. Every fetched row is flagged, including
            # the ones that failed: the SELECT above always returns the
            # oldest unflagged rows, so a failing row left unflagged would
            # be fetched again on every iteration.
            record_ids = [r['id'] for r in batch]
            for start in range(0, len(record_ids), id_chunk_size):
                chunk = record_ids[start:start + id_chunk_size]
                placeholders = ','.join('?' * len(chunk))
                self.db.execute(f"""
                    UPDATE ml_features
                    SET consolidated=1
                    WHERE id IN ({placeholders})
                """, chunk)

            stats['batches_completed'] += 1
            batch_count += 1

            # Progress log
            if batch_count % 10 == 0:
                logger.info(
                    f"Consolidation progress: {stats['records_processed']} records, "
                    f"{stats['batches_completed']} batches"
                )

        logger.success(
            f"Consolidation complete: {stats['records_processed']} records processed, "
            f"{stats['errors']} errors"
        )
    except Exception as e:
        logger.error(f"Consolidation failed: {e}")
        stats['errors'] += 1
    return stats
def _consolidate_single_record(self, record: Dict[str, Any]):
"""
Process a single feature record into aggregated form.
Computes statistical features and feature vectors.
"""
try:
# Parse JSON fields once — reused by _build_feature_vector to avoid double-parsing
host_features = json.loads(record.get('host_features', '{}'))
network_features = json.loads(record.get('network_features', '{}'))
temporal_features = json.loads(record.get('temporal_features', '{}'))
action_features = json.loads(record.get('action_features', '{}'))
# Combine all features
all_features = {
**host_features,
**network_features,
**temporal_features,
**action_features
}
# Build numerical feature vector — pass already-parsed dicts to avoid re-parsing
feature_vector = self._build_feature_vector(
host_features, network_features, temporal_features, action_features
)
# Determine time window
raw_ts = record['timestamp']
if isinstance(raw_ts, str):
try:
timestamp = datetime.fromisoformat(raw_ts)
except ValueError:
timestamp = datetime.now()
elif isinstance(raw_ts, datetime):
timestamp = raw_ts
else:
timestamp = datetime.now()
hourly_window = timestamp.replace(minute=0, second=0, microsecond=0).isoformat()
# Update or insert aggregated record
self._update_aggregated_features(
mac_address=record['mac_address'],
time_window='hourly',
timestamp=hourly_window,
action_name=record['action_name'],
success=record['success'],
duration=record['duration_seconds'],
reward=record['reward'],
feature_vector=feature_vector,
all_features=all_features
)
except Exception as e:
logger.error(f"Error consolidating single record: {e}")
raise
def _build_feature_vector(
    self,
    host_features: Dict[str, Any],
    network_features: Dict[str, Any],
    temporal_features: Dict[str, Any],
    action_features: Dict[str, Any],
) -> Dict[str, float]:
    """
    Turn the four parsed feature groups into one named feature dictionary.

    The caller passes dicts that are already decoded, so no JSON parsing
    happens here; the actual extraction is delegated to
    `ai_utils.extract_neural_features_dict`.

    Args:
        host_features: Parsed host feature group.
        network_features: Parsed network feature group.
        temporal_features: Parsed temporal feature group.
        action_features: Parsed action feature group.

    Returns:
        Whatever `extract_neural_features_dict` returns for these groups.
    """
    # Function-scope import, as in the rest of this class's ai_utils usage.
    from ai_utils import extract_neural_features_dict

    parsed_groups = {
        'host_features': host_features,
        'network_features': network_features,
        'temporal_features': temporal_features,
        'action_features': action_features,
    }
    return extract_neural_features_dict(**parsed_groups)
def _update_aggregated_features(
self,
mac_address: str,
time_window: str,
timestamp: str,
action_name: str,
success: int,
duration: float,
reward: float,
feature_vector: Dict[str, float],
all_features: Dict[str, Any]
):
"""
Update or insert aggregated feature record.
Accumulates statistics over the time window.
"""
try:
# Check if record exists
existing = self.db.query("""
SELECT * FROM ml_features_aggregated
WHERE mac_address=? AND time_window=? AND computed_at=?
""", (mac_address, time_window, timestamp))
if existing:
# Update existing record
old = existing[0]
new_total = old['total_actions'] + 1
# ... typical stats update ...
# Merge feature vectors (average each named feature)
old_vector = json.loads(old['feature_vector']) # Now a Dict
if isinstance(old_vector, list): # Migration handle
old_vector = {}
merged_vector = {}
# Combine keys from both
all_keys = set(old_vector.keys()) | set(feature_vector.keys())
for k in all_keys:
v_old = old_vector.get(k, 0.0)
v_new = feature_vector.get(k, 0.0)
merged_vector[k] = (v_old * old['total_actions'] + v_new) / new_total
self.db.execute("""
UPDATE ml_features_aggregated
SET total_actions=total_actions+1,
success_rate=(success_rate*total_actions + ?)/(total_actions+1),
avg_duration=(avg_duration*total_actions + ?)/(total_actions+1),
total_reward=total_reward + ?,
feature_vector=?
WHERE mac_address=? AND time_window=? AND computed_at=?
""", (
success,
duration,
reward,
json.dumps(merged_vector),
mac_address,
time_window,
timestamp
))
else:
# Insert new record
self.db.execute("""
INSERT INTO ml_features_aggregated (
mac_address, time_window, computed_at,
total_actions, success_rate, avg_duration, total_reward,
feature_vector
) VALUES (?, ?, ?, 1, ?, ?, ?, ?)
""", (
mac_address,
time_window,
timestamp,
float(success),
duration,
reward,
json.dumps(feature_vector)
))
except Exception as e:
logger.error(f"Error updating aggregated features: {e}")
raise
# ═══════════════════════════════════════════════════════════════════════
# EXPORT FUNCTIONS
# ═══════════════════════════════════════════════════════════════════════
def export_for_training(
    self,
    format: str = 'csv',
    compress: bool = True,
    max_records: Optional[int] = None
) -> Tuple[str, int]:
    """
    Export consolidated features for deep learning training.

    Selects consolidated rows that have no export batch yet, writes them to
    a timestamped file in `export_dir`, registers an export batch and tags
    the exported rows with its id.

    Args:
        format: 'csv', 'jsonl', or 'parquet'
        compress: Whether to gzip the output (ignored for parquet)
        max_records: Maximum records to export. None uses
            `shared_data.ai_export_max_records` (1000 when unset); the value
            is clamped to 100..20000.

    Returns:
        Tuple of (file_path, record_count); ("", 0) when nothing is pending.

    Raises:
        ValueError: Unsupported `format` (raised only when rows are pending).
        Exception: Any other failure is logged and re-raised.
    """
    # Older SQLite builds accept at most 999 bound parameters per statement,
    # so the batch-id UPDATE is issued in slices of this size.
    id_chunk_size = 500
    try:
        if max_records is None:
            max_records = int(getattr(self.shared_data, "ai_export_max_records", 1000))
        max_records = max(100, min(int(max_records), 20000))

        # Generate filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        base_filename = f"bjorn_training_{timestamp}.{format}"
        if compress and format != 'parquet':
            base_filename += '.gz'
        filepath = self.export_dir / base_filename

        # Fetch data.
        # NOTE(review): the join is on mac_address only. If the aggregate
        # table holds several rows per MAC (it is keyed by MAC + window +
        # computed_at elsewhere in this class), each feature row is returned
        # once per aggregate row - confirm whether that is intended.
        records = self.db.query("""
            SELECT
            mf.*,
            mfa.feature_vector,
            mfa.success_rate as aggregated_success_rate,
            mfa.total_actions as aggregated_total_actions
            FROM ml_features mf
            LEFT JOIN ml_features_aggregated mfa
            ON mf.mac_address = mfa.mac_address
            WHERE mf.consolidated=1 AND mf.export_batch_id IS NULL
            ORDER BY mf.timestamp DESC
            LIMIT ?
        """, (max_records,))
        if not records:
            logger.warning("No consolidated records to export")
            return "", 0

        # Extract IDs before export so the records list can be freed early.
        # De-duplicated (order kept) because the join may repeat a row.
        record_ids = list(dict.fromkeys(r['id'] for r in records))

        # Export based on format
        if format == 'csv':
            count = self._export_csv(records, filepath, compress)
        elif format == 'jsonl':
            count = self._export_jsonl(records, filepath, compress)
        elif format == 'parquet':
            count = self._export_parquet(records, filepath)
        else:
            raise ValueError(f"Unsupported format: {format}")

        # Free the large records list immediately after export
        del records

        # The parquet exporter falls back to CSV when pyarrow is missing and
        # then writes next to the requested path with a different suffix.
        # Record the file that really exists so later status updates match.
        if format == 'parquet' and not filepath.exists():
            for candidate in (filepath.with_suffix('.csv'), filepath.with_suffix('.csv.gz')):
                if candidate.exists():
                    filepath = candidate
                    break

        # Create export batch record
        batch_id = self._create_export_batch(filepath, count)

        # Update records with batch ID
        for start in range(0, len(record_ids), id_chunk_size):
            chunk = record_ids[start:start + id_chunk_size]
            placeholders = ','.join('?' * len(chunk))
            self.db.execute(f"""
                UPDATE ml_features
                SET export_batch_id=?
                WHERE id IN ({placeholders})
            """, [batch_id] + chunk)
        del record_ids

        logger.success(
            f"Exported {count} records to {filepath} "
            f"(batch_id={batch_id})"
        )
        return str(filepath), count
    except Exception as e:
        logger.error(f"Export failed: {e}")
        raise
def _export_csv(
self,
records: List[Dict],
filepath: Path,
compress: bool
) -> int:
"""Export records as CSV"""
open_func = gzip.open if compress else open
mode = 'wt' if compress else 'w'
# 1. Flatten all records first to collect all possible fieldnames
flattened = []
all_fieldnames = set()
for r in records:
flat = {
'timestamp': r['timestamp'],
'mac_address': r['mac_address'],
'ip_address': r['ip_address'],
'action_name': r['action_name'],
'success': r['success'],
'duration_seconds': r['duration_seconds'],
'reward': r['reward']
}
# Parse and flatten features
for field in ['host_features', 'network_features', 'temporal_features', 'action_features']:
try:
features = json.loads(r.get(field, '{}'))
for k, v in features.items():
if isinstance(v, (int, float, bool, str)):
flat_key = f"{field}_{k}"
flat[flat_key] = v
except Exception as e:
logger.debug(f"Skip bad JSON in {field}: {e}")
# Add named feature vector
if r.get('feature_vector'):
try:
vector = json.loads(r['feature_vector'])
if isinstance(vector, dict):
for k, v in vector.items():
flat[f'feat_{k}'] = v
elif isinstance(vector, list):
for i, v in enumerate(vector):
flat[f'feature_{i}'] = v
except Exception as e:
logger.debug(f"Skip bad feature vector: {e}")
flattened.append(flat)
all_fieldnames.update(flat.keys())
# 2. Sort fieldnames for consistency
sorted_fieldnames = sorted(list(all_fieldnames))
all_fieldnames = None # Free the set
# 3. Write CSV
with open_func(filepath, mode, newline='', encoding='utf-8') as f:
if flattened:
writer = csv.DictWriter(f, fieldnames=sorted_fieldnames)
writer.writeheader()
writer.writerows(flattened)
count = len(flattened)
flattened = None # Free the expanded list
return count
def _export_jsonl(
self,
records: List[Dict],
filepath: Path,
compress: bool
) -> int:
"""Export records as JSON Lines"""
open_func = gzip.open if compress else open
mode = 'wt' if compress else 'w'
with open_func(filepath, mode, encoding='utf-8') as f:
for r in records:
# Avoid mutating `records` in place to keep memory growth predictable.
row = dict(r)
for field in ['host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event']:
try:
row[field] = json.loads(row.get(field, '{}'))
except Exception:
row[field] = {}
if row.get('feature_vector'):
try:
row['feature_vector'] = json.loads(row['feature_vector'])
except Exception:
row['feature_vector'] = {}
f.write(json.dumps(row) + '\n')
return len(records)
def _export_parquet(self, records: List[Dict], filepath: Path) -> int:
    """
    Export records as Parquet (requires pyarrow).

    The JSON-encoded columns are decoded before the Arrow table is built; a
    column that is NULL, empty or not valid JSON becomes `{}`, matching the
    JSON Lines exporter.

    When pyarrow is not installed the records are written as gzip-compressed
    CSV instead, to `filepath` with its suffix replaced by `.csv.gz`.

    Args:
        records: Row dicts to export.
        filepath: Destination `.parquet` file.

    Returns:
        Number of records written.
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError:
        logger.error("Parquet export requires pyarrow. Falling back to CSV.")
        # The fallback output is gzip data, so the name must say so;
        # a plain `.csv` suffix would mislabel compressed content.
        return self._export_csv(records, filepath.with_suffix('.csv.gz'), compress=True)

    json_fields = ('host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event')

    # Flatten records
    flattened = []
    for r in records:
        flat = dict(r)
        # Parse JSON fields; one bad column must not abort the whole export.
        for field in json_fields:
            try:
                flat[field] = json.loads(r.get(field) or '{}')
            except (TypeError, ValueError):
                flat[field] = {}
        if r.get('feature_vector'):
            try:
                flat['feature_vector'] = json.loads(r['feature_vector'])
            except (TypeError, ValueError):
                flat['feature_vector'] = {}
        flattened.append(flat)

    # Convert to Arrow table
    table = pa.Table.from_pylist(flattened)
    # Write parquet
    pq.write_table(table, filepath, compression='snappy')
    return len(records)
def _create_export_batch(self, filepath: Path, count: int) -> int:
"""Create export batch record and return batch ID"""
result = self.db.execute("""
INSERT INTO ml_export_batches (file_path, record_count, status)
VALUES (?, ?, 'exported')
""", (str(filepath), count))
# Get the inserted ID
batch_id = self.db.query("SELECT last_insert_rowid() as id")[0]['id']
return batch_id
# ═══════════════════════════════════════════════════════════════════════
# UTILITY METHODS
# ═══════════════════════════════════════════════════════════════════════
def get_export_stats(self) -> Dict[str, Any]:
    """
    Get statistics about exports.

    Returns:
        Dict with:
            total_export_batches: batches currently in status 'exported'
            total_records_exported: sum of their record counts (0 if none)
            last_export_time: newest `created_at` among them, or None
            pending_export_count: consolidated rows not yet in any batch
        An empty dict when the queries fail (the error is logged).
    """
    try:
        batches = self.db.query("""
            SELECT COUNT(*) as total_batches,
            SUM(record_count) as total_records,
            MAX(created_at) as last_export
            FROM ml_export_batches
            WHERE status='exported'
        """)[0]
        pending = self.db.query("""
            SELECT COUNT(*) as cnt
            FROM ml_features
            WHERE consolidated=1 AND export_batch_id IS NULL
        """)[0]['cnt']
        return {
            # SUM() over zero rows is NULL, and `.get(key, 0)` would return
            # that None because the key exists; `or 0` gives a real number.
            'total_export_batches': batches.get('total_batches') or 0,
            'total_records_exported': batches.get('total_records') or 0,
            'last_export_time': batches.get('last_export'),
            'pending_export_count': pending
        }
    except Exception as e:
        logger.error(f"Error getting export stats: {e}")
        return {}
def flush_pending_uploads(self, max_files: int = 3) -> int:
    """
    Retry the upload of export files that are still sitting in `export_dir`.

    Args:
        max_files: Upper bound on files tried in this call; zero or a
            negative value does nothing.

    Returns:
        Number of files transferred successfully.
    """
    budget = max(0, int(max_files))
    if budget == 0:
        return 0

    # Pending uploads are simply the export files still present on disk.
    transferred = 0
    for candidate in self._list_pending_export_files(limit=budget):
        if self.upload_to_server(candidate):
            transferred += 1
            continue
        # A failed attempt that actually reached out to the server means it
        # is unreachable or refusing: stop instead of repeating the noise.
        server_down = self.last_server_attempted and self.last_server_contact_ok is False
        if server_down:
            break
    return transferred
def _list_pending_export_files(self, limit: int = 3) -> List[str]:
"""
Return oldest export files present in export_dir.
This makes the backlog naturally equal to the number of files on disk.
"""
limit = max(0, int(limit))
if limit <= 0:
return []
try:
d = Path(self.export_dir)
if not d.exists():
return []
def _safe_mtime(path: Path) -> float:
try:
return path.stat().st_mtime
except Exception:
return float("inf")
# Keep only the N oldest files in memory instead of sorting all candidates.
files_iter = (p for p in d.glob("bjorn_training_*") if p.is_file())
oldest = heapq.nsmallest(limit, files_iter, key=_safe_mtime)
return [str(p) for p in oldest]
except Exception:
return []
def _mark_batch_status(self, filepath: str, status: str, notes: str = "") -> None:
"""Update ml_export_batches status for a given file path (best-effort)."""
try:
self.db.execute(
"""
UPDATE ml_export_batches
SET status=?, notes=?
WHERE file_path=?
""",
(status, notes or "", str(filepath)),
)
except Exception:
pass
def _safe_delete_uploaded_export(self, filepath: Path) -> None:
"""Delete a successfully-uploaded export file if configured to do so."""
try:
if not bool(self.shared_data.config.get("ai_delete_export_after_upload", True)):
return
fp = filepath.resolve()
base = Path(self.export_dir).resolve()
# Safety: only delete files under export_dir.
if base not in fp.parents:
return
fp.unlink(missing_ok=True) # Python 3.8+ supports missing_ok
except TypeError:
# Python < 3.8 fallback (not expected here, but safe)
try:
if filepath.exists():
filepath.unlink()
except Exception:
pass
except Exception:
pass
def upload_to_server(self, filepath: str) -> bool:
    """
    Upload export file to AI Validation Server.

    The call first resets the server contact state to "not attempted".
    It then returns False without contacting the server when `requests`
    is not installed, `ai_server_url` is not configured, a retry backoff
    window is still open, or the file does not exist (the batch is then
    marked "missing").

    Otherwise the file is POSTed to `<ai_server_url>/upload` with the
    system MAC as the `mac_addr` query parameter. HTTP 200 counts as
    success: the backoff is cleared, the batch is marked "transferred"
    and the local file is handed to `_safe_delete_uploaded_export`. Any
    other status, or any exception inside the upload section, records a
    failed contact and extends the backoff.

    Args:
        filepath: Path to the file to upload
    Returns:
        True if upload successful
    """
    # Start from "no contact attempted"; updated below once a request is made.
    self._set_server_contact_state(False, None)
    # Function-scope import: a missing `requests` only disables uploads.
    try:
        import requests
    except ImportError:
        requests = None
    if requests is None:
        logger.info_throttled(
            "AI upload skipped: requests not installed",
            key="ai_upload_no_requests",
            interval_s=600.0,
        )
        return False
    url = self.shared_data.config.get("ai_server_url")
    if not url:
        logger.info_throttled(
            "AI upload skipped: ai_server_url not configured",
            key="ai_upload_no_url",
            interval_s=600.0,
        )
        return False
    # Base retry delay from config (never below 10s); the cap is fixed at one hour.
    backoff_s = max(10, int(self.shared_data.config.get("ai_upload_retry_backoff_s", 120)))
    max_backoff_s = 3600
    # Honour a backoff window opened by an earlier failure.
    now_mono = time.monotonic()
    if now_mono < self._upload_backoff_until:
        remaining = int(self._upload_backoff_until - now_mono)
        logger.debug(f"AI upload backoff active ({remaining}s remaining)")
        logger.info_throttled(
            "AI upload deferred: backoff active",
            key="ai_upload_backoff_active",
            interval_s=180.0,
        )
        return False
    try:
        filepath = Path(filepath)
        if not filepath.exists():
            logger.warning(f"AI upload skipped: file not found: {filepath}")
            self._mark_batch_status(str(filepath), "missing", "file not found")
            return False
        # Get MAC address for unique identification
        try:
            from ai_utils import get_system_mac
            mac = get_system_mac()
        except ImportError:
            mac = "unknown"
        logger.debug(f"Uploading {filepath.name} to AI Server ({url}) unique_id={mac}")
        # From here on a request is being made; the outcome is filled in below.
        self._set_server_contact_state(True, None)
        with open(filepath, 'rb') as f:
            files = {'file': f}
            # Send MAC as query param
            # Server expects ?mac_addr=...
            params = {'mac_addr': mac}
            # Short timeout to avoid blocking
            response = requests.post(f"{url}/upload", files=files, params=params, timeout=10)
        if response.status_code == 200:
            # Success: record the contact and clear any pending backoff.
            self._set_server_contact_state(True, True)
            self._upload_backoff_until = 0.0
            self._upload_backoff_current_s = 0.0
            logger.success(f"Uploaded {filepath.name} successfully")
            self._mark_batch_status(str(filepath), "transferred", "uploaded")
            self._safe_delete_uploaded_export(filepath)
            return True
        else:
            # Server answered with a non-200 status: back off before retrying.
            self._set_server_contact_state(True, False)
            next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
            logger.debug(
                f"AI upload HTTP failure for {filepath.name}: status={response.status_code}, "
                f"next retry in {next_retry_s}s"
            )
            logger.info_throttled(
                f"AI upload deferred (HTTP {response.status_code})",
                key=f"ai_upload_http_{response.status_code}",
                interval_s=300.0,
            )
            return False
    except Exception as e:
        # Any exception raised in the block above (not only network errors)
        # is treated as a failed server contact and triggers backoff.
        self._set_server_contact_state(True, False)
        next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
        logger.debug(f"AI upload exception for {filepath}: {e} (next retry in {next_retry_s}s)")
        logger.info_throttled(
            "AI upload deferred: server unreachable (retry later)",
            key="ai_upload_exception",
            interval_s=300.0,
        )
        return False
def cleanup_old_exports(self, days: int = 30):
    """
    Delete export files older than N days, then their batch records.

    Failures are logged, not raised. If removing a file fails for a reason
    other than the file already being gone, the cleanup stops and the batch
    records are kept.

    Args:
        days: Age threshold in days, compared against `created_at`.
    """
    try:
        # NOTE(review): the cutoff is a local-time ISO string ('T' separator).
        # Confirm it matches how `created_at` is stored; SQLite's
        # CURRENT_TIMESTAMP uses UTC and a space separator.
        cutoff_iso = (datetime.now() - timedelta(days=days)).isoformat()
        old_batches = self.db.query("""
            SELECT file_path FROM ml_export_batches
            WHERE created_at < ?
        """, (cutoff_iso,))

        deleted = 0
        for batch in old_batches:
            # EAFP: an exists() check followed by unlink() can race with
            # another deleter (uploaded exports are removed elsewhere), and
            # the resulting error would abort the whole cleanup.
            try:
                Path(batch['file_path']).unlink()
            except FileNotFoundError:
                continue
            deleted += 1

        # Clean up database records
        self.db.execute("""
            DELETE FROM ml_export_batches
            WHERE created_at < ?
        """, (cutoff_iso,))
        logger.info(f"Cleaned up {deleted} old export files")
    except Exception as e:
        logger.error(f"Cleanup failed: {e}")
# ═══════════════════════════════════════════════════════════════════════════
# END OF FILE
# ═══════════════════════════════════════════════════════════════════════════