mirror of
https://github.com/infinition/Bjorn.git
synced 2026-03-14 00:21:58 +00:00
Add RLUtils class for managing RL/AI dashboard endpoints
- Implemented methods for fetching AI stats, training history, and recent experiences. - Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling. - Included helper methods for querying the database and sending JSON responses. - Integrated model metadata extraction for visualization purposes.
This commit is contained in:
829
data_consolidator.py
Normal file
829
data_consolidator.py
Normal file
@@ -0,0 +1,829 @@
|
||||
"""
|
||||
data_consolidator.py - Data Consolidation Engine for Deep Learning
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
Purpose:
|
||||
Consolidate logged features into training-ready datasets.
|
||||
Prepare data exports for deep learning on external PC.
|
||||
|
||||
Features:
|
||||
- Aggregate features across time windows
|
||||
- Compute statistical features
|
||||
- Create feature vectors for neural networks
|
||||
- Export in formats ready for TensorFlow/PyTorch
|
||||
- Incremental consolidation (low memory footprint)
|
||||
|
||||
Author: Bjorn Team
|
||||
Version: 2.0.0
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
import gzip
|
||||
import heapq
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
from pathlib import Path
|
||||
from logger import Logger
|
||||
|
||||
logger = Logger(name="data_consolidator.py", level=20)
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
requests = None
|
||||
|
||||
|
||||
class DataConsolidator:
    """
    Consolidates raw feature logs into training datasets.
    Optimized for Raspberry Pi Zero - processes in batches.
    """

    def __init__(self, shared_data, export_dir: str = None):
        """
        Initialize the consolidator.

        Args:
            shared_data: SharedData instance (provides .db and path config)
            export_dir: Directory for export files; when None, falls back to
                shared_data.ml_exports_dir or <data_dir>/ml_exports
        """
        self.shared_data = shared_data
        self.db = shared_data.db

        if export_dir is not None:
            self.export_dir = Path(export_dir)
        else:
            # Default to shared_data paths (cross-platform).
            fallback = Path(shared_data.data_dir) / "ml_exports"
            self.export_dir = Path(getattr(shared_data, 'ml_exports_dir', fallback))

        self.export_dir.mkdir(parents=True, exist_ok=True)

        # Server health state consumed by orchestrator fallback logic.
        self.last_server_attempted = False
        self.last_server_contact_ok = None
        self._upload_backoff_until = 0.0
        self._upload_backoff_current_s = 0.0

        logger.info(f"DataConsolidator initialized, exports: {self.export_dir}")
||||
|
||||
def _set_server_contact_state(self, attempted: bool, ok: Optional[bool]) -> None:
|
||||
self.last_server_attempted = bool(attempted)
|
||||
self.last_server_contact_ok = ok if attempted else None
|
||||
|
||||
def _apply_upload_backoff(self, base_backoff_s: int, max_backoff_s: int = 3600) -> int:
|
||||
"""
|
||||
Exponential upload retry backoff:
|
||||
base -> base*2 -> base*4 ... capped at max_backoff_s.
|
||||
Returns the delay (seconds) applied for the next retry window.
|
||||
"""
|
||||
base = max(10, int(base_backoff_s))
|
||||
cap = max(base, int(max_backoff_s))
|
||||
prev = float(getattr(self, "_upload_backoff_current_s", 0.0) or 0.0)
|
||||
|
||||
if prev <= 0:
|
||||
delay = base
|
||||
else:
|
||||
delay = min(cap, max(base, int(prev * 2)))
|
||||
|
||||
self._upload_backoff_current_s = float(delay)
|
||||
self._upload_backoff_until = time.monotonic() + delay
|
||||
return int(delay)
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# CONSOLIDATION ENGINE
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
def consolidate_features(
|
||||
self,
|
||||
batch_size: int = None,
|
||||
max_batches: Optional[int] = None
|
||||
) -> Dict[str, int]:
|
||||
"""
|
||||
Consolidate raw features into aggregated feature vectors.
|
||||
Processes unconsolidated records in batches.
|
||||
"""
|
||||
if batch_size is None:
|
||||
batch_size = int(getattr(self.shared_data, "ai_batch_size", 100))
|
||||
batch_size = max(1, min(int(batch_size), 5000))
|
||||
stats = {
|
||||
'records_processed': 0,
|
||||
'records_aggregated': 0,
|
||||
'batches_completed': 0,
|
||||
'errors': 0
|
||||
}
|
||||
|
||||
try:
|
||||
# Get unconsolidated records
|
||||
unconsolidated = self.db.query("""
|
||||
SELECT COUNT(*) as cnt
|
||||
FROM ml_features
|
||||
WHERE consolidated=0
|
||||
""")[0]['cnt']
|
||||
|
||||
if unconsolidated == 0:
|
||||
logger.info("No unconsolidated features to process")
|
||||
return stats
|
||||
|
||||
logger.info(f"Consolidating {unconsolidated} feature records...")
|
||||
|
||||
batch_count = 0
|
||||
while True:
|
||||
if max_batches and batch_count >= max_batches:
|
||||
break
|
||||
|
||||
# Fetch batch
|
||||
batch = self.db.query(f"""
|
||||
SELECT * FROM ml_features
|
||||
WHERE consolidated=0
|
||||
ORDER BY timestamp
|
||||
LIMIT {batch_size}
|
||||
""")
|
||||
|
||||
if not batch:
|
||||
break
|
||||
|
||||
# Process batch
|
||||
for record in batch:
|
||||
try:
|
||||
self._consolidate_single_record(record)
|
||||
stats['records_processed'] += 1
|
||||
except Exception as e:
|
||||
logger.error(f"Error consolidating record {record['id']}: {e}")
|
||||
stats['errors'] += 1
|
||||
|
||||
# Mark as consolidated
|
||||
record_ids = [r['id'] for r in batch]
|
||||
placeholders = ','.join('?' * len(record_ids))
|
||||
self.db.execute(f"""
|
||||
UPDATE ml_features
|
||||
SET consolidated=1
|
||||
WHERE id IN ({placeholders})
|
||||
""", record_ids)
|
||||
|
||||
stats['batches_completed'] += 1
|
||||
batch_count += 1
|
||||
|
||||
# Progress log
|
||||
if batch_count % 10 == 0:
|
||||
logger.info(
|
||||
f"Consolidation progress: {stats['records_processed']} records, "
|
||||
f"{stats['batches_completed']} batches"
|
||||
)
|
||||
|
||||
logger.success(
|
||||
f"Consolidation complete: {stats['records_processed']} records processed, "
|
||||
f"{stats['errors']} errors"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Consolidation failed: {e}")
|
||||
stats['errors'] += 1
|
||||
|
||||
return stats
|
||||
|
||||
def _consolidate_single_record(self, record: Dict[str, Any]):
|
||||
"""
|
||||
Process a single feature record into aggregated form.
|
||||
Computes statistical features and feature vectors.
|
||||
"""
|
||||
try:
|
||||
# Parse JSON fields once — reused by _build_feature_vector to avoid double-parsing
|
||||
host_features = json.loads(record.get('host_features', '{}'))
|
||||
network_features = json.loads(record.get('network_features', '{}'))
|
||||
temporal_features = json.loads(record.get('temporal_features', '{}'))
|
||||
action_features = json.loads(record.get('action_features', '{}'))
|
||||
|
||||
# Combine all features
|
||||
all_features = {
|
||||
**host_features,
|
||||
**network_features,
|
||||
**temporal_features,
|
||||
**action_features
|
||||
}
|
||||
|
||||
# Build numerical feature vector — pass already-parsed dicts to avoid re-parsing
|
||||
feature_vector = self._build_feature_vector(
|
||||
host_features, network_features, temporal_features, action_features
|
||||
)
|
||||
|
||||
# Determine time window
|
||||
raw_ts = record['timestamp']
|
||||
if isinstance(raw_ts, str):
|
||||
try:
|
||||
timestamp = datetime.fromisoformat(raw_ts)
|
||||
except ValueError:
|
||||
timestamp = datetime.now()
|
||||
elif isinstance(raw_ts, datetime):
|
||||
timestamp = raw_ts
|
||||
else:
|
||||
timestamp = datetime.now()
|
||||
|
||||
hourly_window = timestamp.replace(minute=0, second=0, microsecond=0).isoformat()
|
||||
|
||||
# Update or insert aggregated record
|
||||
self._update_aggregated_features(
|
||||
mac_address=record['mac_address'],
|
||||
time_window='hourly',
|
||||
timestamp=hourly_window,
|
||||
action_name=record['action_name'],
|
||||
success=record['success'],
|
||||
duration=record['duration_seconds'],
|
||||
reward=record['reward'],
|
||||
feature_vector=feature_vector,
|
||||
all_features=all_features
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error consolidating single record: {e}")
|
||||
raise
|
||||
|
||||
def _build_feature_vector(
|
||||
self,
|
||||
host_features: Dict[str, Any],
|
||||
network_features: Dict[str, Any],
|
||||
temporal_features: Dict[str, Any],
|
||||
action_features: Dict[str, Any],
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
Build a named feature dictionary from already-parsed feature dicts.
|
||||
Accepts pre-parsed dicts so JSON is never decoded twice per record.
|
||||
Uses shared ai_utils for consistency.
|
||||
"""
|
||||
from ai_utils import extract_neural_features_dict
|
||||
|
||||
return extract_neural_features_dict(
|
||||
host_features=host_features,
|
||||
network_features=network_features,
|
||||
temporal_features=temporal_features,
|
||||
action_features=action_features,
|
||||
)
|
||||
|
||||
def _update_aggregated_features(
|
||||
self,
|
||||
mac_address: str,
|
||||
time_window: str,
|
||||
timestamp: str,
|
||||
action_name: str,
|
||||
success: int,
|
||||
duration: float,
|
||||
reward: float,
|
||||
feature_vector: Dict[str, float],
|
||||
all_features: Dict[str, Any]
|
||||
):
|
||||
"""
|
||||
Update or insert aggregated feature record.
|
||||
Accumulates statistics over the time window.
|
||||
"""
|
||||
try:
|
||||
# Check if record exists
|
||||
existing = self.db.query("""
|
||||
SELECT * FROM ml_features_aggregated
|
||||
WHERE mac_address=? AND time_window=? AND computed_at=?
|
||||
""", (mac_address, time_window, timestamp))
|
||||
|
||||
if existing:
|
||||
# Update existing record
|
||||
old = existing[0]
|
||||
new_total = old['total_actions'] + 1
|
||||
# ... typical stats update ...
|
||||
|
||||
# Merge feature vectors (average each named feature)
|
||||
old_vector = json.loads(old['feature_vector']) # Now a Dict
|
||||
if isinstance(old_vector, list): # Migration handle
|
||||
old_vector = {}
|
||||
|
||||
merged_vector = {}
|
||||
# Combine keys from both
|
||||
all_keys = set(old_vector.keys()) | set(feature_vector.keys())
|
||||
for k in all_keys:
|
||||
v_old = old_vector.get(k, 0.0)
|
||||
v_new = feature_vector.get(k, 0.0)
|
||||
merged_vector[k] = (v_old * old['total_actions'] + v_new) / new_total
|
||||
|
||||
self.db.execute("""
|
||||
UPDATE ml_features_aggregated
|
||||
SET total_actions=total_actions+1,
|
||||
success_rate=(success_rate*total_actions + ?)/(total_actions+1),
|
||||
avg_duration=(avg_duration*total_actions + ?)/(total_actions+1),
|
||||
total_reward=total_reward + ?,
|
||||
feature_vector=?
|
||||
WHERE mac_address=? AND time_window=? AND computed_at=?
|
||||
""", (
|
||||
success,
|
||||
duration,
|
||||
reward,
|
||||
json.dumps(merged_vector),
|
||||
mac_address,
|
||||
time_window,
|
||||
timestamp
|
||||
))
|
||||
else:
|
||||
# Insert new record
|
||||
self.db.execute("""
|
||||
INSERT INTO ml_features_aggregated (
|
||||
mac_address, time_window, computed_at,
|
||||
total_actions, success_rate, avg_duration, total_reward,
|
||||
feature_vector
|
||||
) VALUES (?, ?, ?, 1, ?, ?, ?, ?)
|
||||
""", (
|
||||
mac_address,
|
||||
time_window,
|
||||
timestamp,
|
||||
float(success),
|
||||
duration,
|
||||
reward,
|
||||
json.dumps(feature_vector)
|
||||
))
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating aggregated features: {e}")
|
||||
raise
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# EXPORT FUNCTIONS
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
def export_for_training(
|
||||
self,
|
||||
format: str = 'csv',
|
||||
compress: bool = True,
|
||||
max_records: Optional[int] = None
|
||||
) -> Tuple[str, int]:
|
||||
"""
|
||||
Export consolidated features for deep learning training.
|
||||
|
||||
Args:
|
||||
format: 'csv', 'jsonl', or 'parquet'
|
||||
compress: Whether to gzip the output
|
||||
max_records: Maximum records to export (None = all)
|
||||
|
||||
Returns:
|
||||
Tuple of (file_path, record_count)
|
||||
"""
|
||||
try:
|
||||
if max_records is None:
|
||||
max_records = int(getattr(self.shared_data, "ai_export_max_records", 1000))
|
||||
max_records = max(100, min(int(max_records), 20000))
|
||||
|
||||
# Generate filename
|
||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
base_filename = f"bjorn_training_{timestamp}.{format}"
|
||||
|
||||
if compress and format != 'parquet':
|
||||
base_filename += '.gz'
|
||||
|
||||
filepath = self.export_dir / base_filename
|
||||
|
||||
# Fetch data
|
||||
limit_clause = f"LIMIT {max_records}"
|
||||
records = self.db.query(f"""
|
||||
SELECT
|
||||
mf.*,
|
||||
mfa.feature_vector,
|
||||
mfa.success_rate as aggregated_success_rate,
|
||||
mfa.total_actions as aggregated_total_actions
|
||||
FROM ml_features mf
|
||||
LEFT JOIN ml_features_aggregated mfa
|
||||
ON mf.mac_address = mfa.mac_address
|
||||
WHERE mf.consolidated=1 AND mf.export_batch_id IS NULL
|
||||
ORDER BY mf.timestamp DESC
|
||||
{limit_clause}
|
||||
""")
|
||||
|
||||
if not records:
|
||||
logger.warning("No consolidated records to export")
|
||||
return "", 0
|
||||
|
||||
# Extract IDs before export so we can free the records list early
|
||||
record_ids = [r['id'] for r in records]
|
||||
|
||||
# Export based on format
|
||||
if format == 'csv':
|
||||
count = self._export_csv(records, filepath, compress)
|
||||
elif format == 'jsonl':
|
||||
count = self._export_jsonl(records, filepath, compress)
|
||||
elif format == 'parquet':
|
||||
count = self._export_parquet(records, filepath)
|
||||
else:
|
||||
raise ValueError(f"Unsupported format: {format}")
|
||||
|
||||
# Free the large records list immediately after export — record_ids is all we still need
|
||||
del records
|
||||
|
||||
# Create export batch record
|
||||
batch_id = self._create_export_batch(filepath, count)
|
||||
|
||||
# Update records with batch ID
|
||||
placeholders = ','.join('?' * len(record_ids))
|
||||
self.db.execute(f"""
|
||||
UPDATE ml_features
|
||||
SET export_batch_id=?
|
||||
WHERE id IN ({placeholders})
|
||||
""", [batch_id] + record_ids)
|
||||
del record_ids
|
||||
|
||||
logger.success(
|
||||
f"Exported {count} records to {filepath} "
|
||||
f"(batch_id={batch_id})"
|
||||
)
|
||||
|
||||
return str(filepath), count
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Export failed: {e}")
|
||||
raise
|
||||
|
||||
def _export_csv(
|
||||
self,
|
||||
records: List[Dict],
|
||||
filepath: Path,
|
||||
compress: bool
|
||||
) -> int:
|
||||
"""Export records as CSV"""
|
||||
open_func = gzip.open if compress else open
|
||||
mode = 'wt' if compress else 'w'
|
||||
|
||||
# 1. Flatten all records first to collect all possible fieldnames
|
||||
flattened = []
|
||||
all_fieldnames = set()
|
||||
|
||||
for r in records:
|
||||
flat = {
|
||||
'timestamp': r['timestamp'],
|
||||
'mac_address': r['mac_address'],
|
||||
'ip_address': r['ip_address'],
|
||||
'action_name': r['action_name'],
|
||||
'success': r['success'],
|
||||
'duration_seconds': r['duration_seconds'],
|
||||
'reward': r['reward']
|
||||
}
|
||||
|
||||
# Parse and flatten features
|
||||
for field in ['host_features', 'network_features', 'temporal_features', 'action_features']:
|
||||
try:
|
||||
features = json.loads(r.get(field, '{}'))
|
||||
for k, v in features.items():
|
||||
if isinstance(v, (int, float, bool, str)):
|
||||
flat_key = f"{field}_{k}"
|
||||
flat[flat_key] = v
|
||||
except Exception as e:
|
||||
logger.debug(f"Skip bad JSON in {field}: {e}")
|
||||
|
||||
# Add named feature vector
|
||||
if r.get('feature_vector'):
|
||||
try:
|
||||
vector = json.loads(r['feature_vector'])
|
||||
if isinstance(vector, dict):
|
||||
for k, v in vector.items():
|
||||
flat[f'feat_{k}'] = v
|
||||
elif isinstance(vector, list):
|
||||
for i, v in enumerate(vector):
|
||||
flat[f'feature_{i}'] = v
|
||||
except Exception as e:
|
||||
logger.debug(f"Skip bad feature vector: {e}")
|
||||
|
||||
flattened.append(flat)
|
||||
all_fieldnames.update(flat.keys())
|
||||
|
||||
# 2. Sort fieldnames for consistency
|
||||
sorted_fieldnames = sorted(list(all_fieldnames))
|
||||
all_fieldnames = None # Free the set
|
||||
|
||||
# 3. Write CSV
|
||||
with open_func(filepath, mode, newline='', encoding='utf-8') as f:
|
||||
if flattened:
|
||||
writer = csv.DictWriter(f, fieldnames=sorted_fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(flattened)
|
||||
|
||||
count = len(flattened)
|
||||
flattened = None # Free the expanded list
|
||||
return count
|
||||
|
||||
def _export_jsonl(
|
||||
self,
|
||||
records: List[Dict],
|
||||
filepath: Path,
|
||||
compress: bool
|
||||
) -> int:
|
||||
"""Export records as JSON Lines"""
|
||||
open_func = gzip.open if compress else open
|
||||
mode = 'wt' if compress else 'w'
|
||||
|
||||
with open_func(filepath, mode, encoding='utf-8') as f:
|
||||
for r in records:
|
||||
# Avoid mutating `records` in place to keep memory growth predictable.
|
||||
row = dict(r)
|
||||
for field in ['host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event']:
|
||||
try:
|
||||
row[field] = json.loads(row.get(field, '{}'))
|
||||
except Exception:
|
||||
row[field] = {}
|
||||
|
||||
if row.get('feature_vector'):
|
||||
try:
|
||||
row['feature_vector'] = json.loads(row['feature_vector'])
|
||||
except Exception:
|
||||
row['feature_vector'] = {}
|
||||
|
||||
f.write(json.dumps(row) + '\n')
|
||||
|
||||
return len(records)
|
||||
|
||||
def _export_parquet(self, records: List[Dict], filepath: Path) -> int:
|
||||
"""Export records as Parquet (requires pyarrow)"""
|
||||
try:
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
# Flatten records
|
||||
flattened = []
|
||||
for r in records:
|
||||
flat = dict(r)
|
||||
# Parse JSON fields
|
||||
for field in ['host_features', 'network_features', 'temporal_features', 'action_features', 'raw_event']:
|
||||
flat[field] = json.loads(r.get(field, '{}'))
|
||||
|
||||
if r.get('feature_vector'):
|
||||
flat['feature_vector'] = json.loads(r['feature_vector'])
|
||||
|
||||
flattened.append(flat)
|
||||
|
||||
# Convert to Arrow table
|
||||
table = pa.Table.from_pylist(flattened)
|
||||
|
||||
# Write parquet
|
||||
pq.write_table(table, filepath, compression='snappy')
|
||||
|
||||
return len(records)
|
||||
|
||||
except ImportError:
|
||||
logger.error("Parquet export requires pyarrow. Falling back to CSV.")
|
||||
return self._export_csv(records, filepath.with_suffix('.csv'), compress=True)
|
||||
|
||||
def _create_export_batch(self, filepath: Path, count: int) -> int:
|
||||
"""Create export batch record and return batch ID"""
|
||||
result = self.db.execute("""
|
||||
INSERT INTO ml_export_batches (file_path, record_count, status)
|
||||
VALUES (?, ?, 'exported')
|
||||
""", (str(filepath), count))
|
||||
|
||||
# Get the inserted ID
|
||||
batch_id = self.db.query("SELECT last_insert_rowid() as id")[0]['id']
|
||||
return batch_id
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# UTILITY METHODS
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
def get_export_stats(self) -> Dict[str, Any]:
|
||||
"""Get statistics about exports"""
|
||||
try:
|
||||
batches = self.db.query("""
|
||||
SELECT COUNT(*) as total_batches,
|
||||
SUM(record_count) as total_records,
|
||||
MAX(created_at) as last_export
|
||||
FROM ml_export_batches
|
||||
WHERE status='exported'
|
||||
""")[0]
|
||||
|
||||
pending = self.db.query("""
|
||||
SELECT COUNT(*) as cnt
|
||||
FROM ml_features
|
||||
WHERE consolidated=1 AND export_batch_id IS NULL
|
||||
""")[0]['cnt']
|
||||
|
||||
return {
|
||||
'total_export_batches': batches.get('total_batches', 0),
|
||||
'total_records_exported': batches.get('total_records', 0),
|
||||
'last_export_time': batches.get('last_export'),
|
||||
'pending_export_count': pending
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting export stats: {e}")
|
||||
return {}
|
||||
|
||||
def flush_pending_uploads(self, max_files: int = 3) -> int:
|
||||
"""
|
||||
Retry uploads for previously exported batches that were not transferred yet.
|
||||
Returns the number of successfully transferred files.
|
||||
"""
|
||||
max_files = max(0, int(max_files))
|
||||
if max_files <= 0:
|
||||
return 0
|
||||
|
||||
# No heavy "reliquat" tracking needed: pending uploads = files present in export_dir.
|
||||
files = self._list_pending_export_files(limit=max_files)
|
||||
ok = 0
|
||||
for fp in files:
|
||||
if self.upload_to_server(fp):
|
||||
ok += 1
|
||||
else:
|
||||
# Stop early when server is unreachable to avoid repeated noise.
|
||||
if self.last_server_attempted and self.last_server_contact_ok is False:
|
||||
break
|
||||
return ok
|
||||
|
||||
def _list_pending_export_files(self, limit: int = 3) -> List[str]:
|
||||
"""
|
||||
Return oldest export files present in export_dir.
|
||||
This makes the backlog naturally equal to the number of files on disk.
|
||||
"""
|
||||
limit = max(0, int(limit))
|
||||
if limit <= 0:
|
||||
return []
|
||||
|
||||
try:
|
||||
d = Path(self.export_dir)
|
||||
if not d.exists():
|
||||
return []
|
||||
|
||||
def _safe_mtime(path: Path) -> float:
|
||||
try:
|
||||
return path.stat().st_mtime
|
||||
except Exception:
|
||||
return float("inf")
|
||||
|
||||
# Keep only the N oldest files in memory instead of sorting all candidates.
|
||||
files_iter = (p for p in d.glob("bjorn_training_*") if p.is_file())
|
||||
oldest = heapq.nsmallest(limit, files_iter, key=_safe_mtime)
|
||||
return [str(p) for p in oldest]
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _mark_batch_status(self, filepath: str, status: str, notes: str = "") -> None:
|
||||
"""Update ml_export_batches status for a given file path (best-effort)."""
|
||||
try:
|
||||
self.db.execute(
|
||||
"""
|
||||
UPDATE ml_export_batches
|
||||
SET status=?, notes=?
|
||||
WHERE file_path=?
|
||||
""",
|
||||
(status, notes or "", str(filepath)),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _safe_delete_uploaded_export(self, filepath: Path) -> None:
|
||||
"""Delete a successfully-uploaded export file if configured to do so."""
|
||||
try:
|
||||
if not bool(self.shared_data.config.get("ai_delete_export_after_upload", True)):
|
||||
return
|
||||
|
||||
fp = filepath.resolve()
|
||||
base = Path(self.export_dir).resolve()
|
||||
# Safety: only delete files under export_dir.
|
||||
if base not in fp.parents:
|
||||
return
|
||||
|
||||
fp.unlink(missing_ok=True) # Python 3.8+ supports missing_ok
|
||||
except TypeError:
|
||||
# Python < 3.8 fallback (not expected here, but safe)
|
||||
try:
|
||||
if filepath.exists():
|
||||
filepath.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
    def upload_to_server(self, filepath: str) -> bool:
        """
        Upload export file to AI Validation Server.

        Also drives the contact-state flags (last_server_attempted /
        last_server_contact_ok) and the exponential retry backoff window.

        Args:
            filepath: Path to the file to upload

        Returns:
            True if upload successful
        """
        # Reset to "no attempt yet"; updated again once we actually reach
        # the network call below.
        self._set_server_contact_state(False, None)
        try:
            import requests
        except ImportError:
            requests = None

        if requests is None:
            # Throttled so a permanently missing dependency does not spam logs.
            logger.info_throttled(
                "AI upload skipped: requests not installed",
                key="ai_upload_no_requests",
                interval_s=600.0,
            )
            return False

        url = self.shared_data.config.get("ai_server_url")
        if not url:
            logger.info_throttled(
                "AI upload skipped: ai_server_url not configured",
                key="ai_upload_no_url",
                interval_s=600.0,
            )
            return False

        # While a previous failure's backoff window is still open, skip the
        # attempt entirely (monotonic clock, immune to wall-clock changes).
        backoff_s = max(10, int(self.shared_data.config.get("ai_upload_retry_backoff_s", 120)))
        max_backoff_s = 3600
        now_mono = time.monotonic()
        if now_mono < self._upload_backoff_until:
            remaining = int(self._upload_backoff_until - now_mono)
            logger.debug(f"AI upload backoff active ({remaining}s remaining)")
            logger.info_throttled(
                "AI upload deferred: backoff active",
                key="ai_upload_backoff_active",
                interval_s=180.0,
            )
            return False

        try:
            filepath = Path(filepath)

            if not filepath.exists():
                logger.warning(f"AI upload skipped: file not found: {filepath}")
                self._mark_batch_status(str(filepath), "missing", "file not found")
                return False

            # Get MAC address for unique identification
            try:
                from ai_utils import get_system_mac
                mac = get_system_mac()
            except ImportError:
                mac = "unknown"

            logger.debug(f"Uploading {filepath.name} to AI Server ({url}) unique_id={mac}")
            # An attempt is now in flight; outcome unknown until we hear back.
            self._set_server_contact_state(True, None)

            with open(filepath, 'rb') as f:
                files = {'file': f}
                # Send MAC as query param
                # Server expects ?mac_addr=...
                params = {'mac_addr': mac}

                # Short timeout to avoid blocking
                response = requests.post(f"{url}/upload", files=files, params=params, timeout=10)

            if response.status_code == 200:
                # Success: record healthy contact and clear the backoff state.
                self._set_server_contact_state(True, True)
                self._upload_backoff_until = 0.0
                self._upload_backoff_current_s = 0.0
                logger.success(f"Uploaded {filepath.name} successfully")
                self._mark_batch_status(str(filepath), "transferred", "uploaded")
                self._safe_delete_uploaded_export(filepath)
                return True
            else:
                # HTTP-level failure: mark contact as bad and widen the
                # backoff window before the next retry.
                self._set_server_contact_state(True, False)
                next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
                logger.debug(
                    f"AI upload HTTP failure for {filepath.name}: status={response.status_code}, "
                    f"next retry in {next_retry_s}s"
                )
                logger.info_throttled(
                    f"AI upload deferred (HTTP {response.status_code})",
                    key=f"ai_upload_http_{response.status_code}",
                    interval_s=300.0,
                )
                return False

        except Exception as e:
            # Network/IO failure: same treatment as an HTTP error.
            self._set_server_contact_state(True, False)
            next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
            logger.debug(f"AI upload exception for {filepath}: {e} (next retry in {next_retry_s}s)")
            logger.info_throttled(
                "AI upload deferred: server unreachable (retry later)",
                key="ai_upload_exception",
                interval_s=300.0,
            )
            return False
||||
|
||||
def cleanup_old_exports(self, days: int = 30):
|
||||
"""Delete export files older than N days"""
|
||||
try:
|
||||
cutoff = datetime.now() - timedelta(days=days)
|
||||
|
||||
old_batches = self.db.query("""
|
||||
SELECT file_path FROM ml_export_batches
|
||||
WHERE created_at < ?
|
||||
""", (cutoff.isoformat(),))
|
||||
|
||||
deleted = 0
|
||||
for batch in old_batches:
|
||||
filepath = Path(batch['file_path'])
|
||||
if filepath.exists():
|
||||
filepath.unlink()
|
||||
deleted += 1
|
||||
|
||||
# Clean up database records
|
||||
self.db.execute("""
|
||||
DELETE FROM ml_export_batches
|
||||
WHERE created_at < ?
|
||||
""", (cutoff.isoformat(),))
|
||||
|
||||
logger.info(f"Cleaned up {deleted} old export files")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Cleanup failed: {e}")
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# END OF FILE
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
Reference in New Issue
Block a user