mirror of
https://github.com/infinition/Bjorn.git
synced 2026-03-09 06:01:59 +00:00
- Implemented methods for fetching AI stats, training history, and recent experiences. - Added functionality to set operation mode (MANUAL, AUTO, AI) with appropriate handling. - Included helper methods for querying the database and sending JSON responses. - Integrated model metadata extraction for visualization purposes.
830 lines
32 KiB
Python
830 lines
32 KiB
Python
"""
|
|
data_consolidator.py - Data Consolidation Engine for Deep Learning
|
|
═══════════════════════════════════════════════════════════════════════════
|
|
|
|
Purpose:
|
|
Consolidate logged features into training-ready datasets.
|
|
Prepare data exports for deep learning on external PC.
|
|
|
|
Features:
|
|
- Aggregate features across time windows
|
|
- Compute statistical features
|
|
- Create feature vectors for neural networks
|
|
- Export in formats ready for TensorFlow/PyTorch
|
|
- Incremental consolidation (low memory footprint)
|
|
|
|
Author: Bjorn Team
|
|
Version: 2.0.0
|
|
"""
|
|
|
|
import json
|
|
import csv
|
|
import time
|
|
import gzip
|
|
import heapq
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
from pathlib import Path
|
|
from logger import Logger
|
|
|
|
logger = Logger(name="data_consolidator.py", level=20)
|
|
|
|
try:
|
|
import requests
|
|
except ImportError:
|
|
requests = None
|
|
|
|
|
|
class DataConsolidator:
    """
    Consolidates raw feature logs into training datasets.
    Optimized for Raspberry Pi Zero - processes in batches.

    The class reads rows from ``ml_features``, folds them into hourly rows of
    ``ml_features_aggregated``, writes export files (CSV / JSONL / Parquet)
    into ``export_dir``, records them in ``ml_export_batches`` and optionally
    uploads them to the configured AI server.
    """

    # JSON-encoded feature columns stored on every ml_features row.
    _FEATURE_FIELDS = (
        'host_features',
        'network_features',
        'temporal_features',
        'action_features',
    )
    # JSON columns decoded for the structured (JSONL / Parquet) exports.
    _EXPORT_JSON_FIELDS = _FEATURE_FIELDS + ('raw_event',)
    # Max ids bound per "WHERE id IN (...)" statement; stays well below the
    # default bound-variable limit of older SQLite builds.
    _SQL_ID_CHUNK = 500

    def __init__(self, shared_data, export_dir: Optional[str] = None):
        """
        Initialize data consolidator

        Args:
            shared_data: SharedData instance (must expose ``db``; ``data_dir``
                is only needed when neither ``export_dir`` nor
                ``shared_data.ml_exports_dir`` is available)
            export_dir: Directory for export files
        """
        self.shared_data = shared_data
        self.db = shared_data.db

        if export_dir is None:
            # Default to shared_data path (cross-platform). data_dir is only
            # touched when ml_exports_dir is not provided.
            configured = getattr(shared_data, 'ml_exports_dir', None)
            if configured is None:
                configured = Path(shared_data.data_dir) / "ml_exports"
            self.export_dir = Path(configured)
        else:
            self.export_dir = Path(export_dir)

        self.export_dir.mkdir(parents=True, exist_ok=True)

        # Server health state consumed by orchestrator fallback logic.
        self.last_server_attempted = False
        self.last_server_contact_ok = None
        self._upload_backoff_until = 0.0
        self._upload_backoff_current_s = 0.0

        logger.info(f"DataConsolidator initialized, exports: {self.export_dir}")

    # ═══════════════════════════════════════════════════════════════════════
    # INTERNAL HELPERS
    # ═══════════════════════════════════════════════════════════════════════

    @staticmethod
    def _load_json_field(raw: Any) -> Any:
        """
        Decode a JSON text column.

        NULL (``None``) and empty strings decode to ``{}`` instead of raising.
        Malformed JSON still raises, so callers keep their own error policy.
        """
        if raw is None or raw == '':
            return {}
        return json.loads(raw)

    @staticmethod
    def _pyarrow_available() -> bool:
        """Return True when the optional ``pyarrow`` package can be imported."""
        import importlib.util

        try:
            return importlib.util.find_spec("pyarrow") is not None
        except (ImportError, ValueError):
            return False

    def _update_features_by_id(
        self,
        set_clause: str,
        set_params: Tuple[Any, ...],
        record_ids: List[Any],
    ) -> None:
        """
        Run ``UPDATE ml_features SET <set_clause> WHERE id IN (...)`` in chunks.

        ``set_clause`` must be a trusted, internally built SQL fragment.
        Chunking keeps the number of bound parameters per statement small;
        each chunk is its own statement, so a failure mid-way leaves earlier
        chunks applied.
        """
        for start in range(0, len(record_ids), self._SQL_ID_CHUNK):
            chunk = record_ids[start:start + self._SQL_ID_CHUNK]
            placeholders = ','.join('?' * len(chunk))
            self.db.execute(f"""
                UPDATE ml_features
                SET {set_clause}
                WHERE id IN ({placeholders})
            """, [*set_params, *chunk])

    def _set_server_contact_state(self, attempted: bool, ok: Optional[bool]) -> None:
        """
        Record the outcome of the latest upload attempt.

        ``last_server_contact_ok`` is forced to ``None`` whenever no attempt
        was made, whatever ``ok`` says.
        """
        self.last_server_attempted = bool(attempted)
        self.last_server_contact_ok = ok if attempted else None

    def _apply_upload_backoff(self, base_backoff_s: int, max_backoff_s: int = 3600) -> int:
        """
        Exponential upload retry backoff:
        base -> base*2 -> base*4 ... capped at max_backoff_s.
        Returns the delay (seconds) applied for the next retry window.

        The base is never below 10 seconds and the cap is never below the base.
        """
        base = max(10, int(base_backoff_s))
        cap = max(base, int(max_backoff_s))
        prev = float(getattr(self, "_upload_backoff_current_s", 0.0) or 0.0)

        if prev <= 0:
            delay = base
        else:
            delay = min(cap, max(base, int(prev * 2)))

        self._upload_backoff_current_s = float(delay)
        self._upload_backoff_until = time.monotonic() + delay
        return int(delay)

    # ═══════════════════════════════════════════════════════════════════════
    # CONSOLIDATION ENGINE
    # ═══════════════════════════════════════════════════════════════════════

    def consolidate_features(
        self,
        batch_size: Optional[int] = None,
        max_batches: Optional[int] = None
    ) -> Dict[str, int]:
        """
        Consolidate raw features into aggregated feature vectors.
        Processes unconsolidated records in batches.

        Args:
            batch_size: Rows fetched per batch (clamped to 1..5000). Defaults
                to ``shared_data.ai_batch_size`` or 100.
            max_batches: Stop after this many batches (None/0 = no limit).

        Returns:
            Dict with ``records_processed``, ``records_aggregated``,
            ``batches_completed`` and ``errors`` counters. Failures are
            logged and counted, never raised.
        """
        if batch_size is None:
            batch_size = int(getattr(self.shared_data, "ai_batch_size", 100))
        batch_size = max(1, min(int(batch_size), 5000))
        stats = {
            'records_processed': 0,
            'records_aggregated': 0,
            'batches_completed': 0,
            'errors': 0
        }

        try:
            # Get unconsolidated records
            unconsolidated = self.db.query("""
                SELECT COUNT(*) as cnt
                FROM ml_features
                WHERE consolidated=0
            """)[0]['cnt']

            if unconsolidated == 0:
                logger.info("No unconsolidated features to process")
                return stats

            logger.info(f"Consolidating {unconsolidated} feature records...")

            batch_count = 0
            while True:
                if max_batches and batch_count >= max_batches:
                    break

                # Fetch batch (batch_size is an int clamped above, so the
                # interpolation cannot inject SQL).
                batch = self.db.query(f"""
                    SELECT * FROM ml_features
                    WHERE consolidated=0
                    ORDER BY timestamp
                    LIMIT {batch_size}
                """)

                if not batch:
                    break

                # Process batch
                for record in batch:
                    try:
                        self._consolidate_single_record(record)
                    except Exception as e:
                        logger.error(f"Error consolidating record {record['id']}: {e}")
                        stats['errors'] += 1
                    else:
                        stats['records_processed'] += 1
                        # Every successfully processed record was folded into
                        # an aggregated row.
                        stats['records_aggregated'] += 1

                # Mark as consolidated. Failed records are marked as well:
                # otherwise the same rows would be fetched again forever.
                self._update_features_by_id(
                    "consolidated=1", (), [r['id'] for r in batch]
                )

                stats['batches_completed'] += 1
                batch_count += 1

                # Progress log
                if batch_count % 10 == 0:
                    logger.info(
                        f"Consolidation progress: {stats['records_processed']} records, "
                        f"{stats['batches_completed']} batches"
                    )

            logger.success(
                f"Consolidation complete: {stats['records_processed']} records processed, "
                f"{stats['errors']} errors"
            )

        except Exception as e:
            logger.error(f"Consolidation failed: {e}")
            stats['errors'] += 1

        return stats

    def _consolidate_single_record(self, record: Dict[str, Any]):
        """
        Process a single feature record into aggregated form.
        Computes statistical features and feature vectors.

        Raises:
            Exception: any parsing or database error is logged and re-raised
                so the caller can count it.
        """
        try:
            # Parse JSON fields once — reused by _build_feature_vector to avoid
            # double-parsing. NULL columns and non-object JSON count as empty.
            parsed = {}
            for field in self._FEATURE_FIELDS:
                value = self._load_json_field(record.get(field))
                parsed[field] = value if isinstance(value, dict) else {}

            host_features = parsed['host_features']
            network_features = parsed['network_features']
            temporal_features = parsed['temporal_features']
            action_features = parsed['action_features']

            # Combine all features (later groups win on key collisions)
            all_features = {
                **host_features,
                **network_features,
                **temporal_features,
                **action_features
            }

            # Build numerical feature vector from the already-parsed dicts
            feature_vector = self._build_feature_vector(
                host_features, network_features, temporal_features, action_features
            )

            # Determine time window; unparseable timestamps fall back to "now"
            raw_ts = record['timestamp']
            if isinstance(raw_ts, str):
                try:
                    timestamp = datetime.fromisoformat(raw_ts)
                except ValueError:
                    timestamp = datetime.now()
            elif isinstance(raw_ts, datetime):
                timestamp = raw_ts
            else:
                timestamp = datetime.now()

            hourly_window = timestamp.replace(minute=0, second=0, microsecond=0).isoformat()

            # Update or insert aggregated record
            self._update_aggregated_features(
                mac_address=record['mac_address'],
                time_window='hourly',
                timestamp=hourly_window,
                action_name=record['action_name'],
                success=record['success'],
                duration=record['duration_seconds'],
                reward=record['reward'],
                feature_vector=feature_vector,
                all_features=all_features
            )

        except Exception as e:
            logger.error(f"Error consolidating single record: {e}")
            raise

    def _build_feature_vector(
        self,
        host_features: Dict[str, Any],
        network_features: Dict[str, Any],
        temporal_features: Dict[str, Any],
        action_features: Dict[str, Any],
    ) -> Dict[str, float]:
        """
        Build a named feature dictionary from already-parsed feature dicts.
        Accepts pre-parsed dicts so JSON is never decoded twice per record.
        Uses shared ai_utils for consistency.
        """
        from ai_utils import extract_neural_features_dict

        return extract_neural_features_dict(
            host_features=host_features,
            network_features=network_features,
            temporal_features=temporal_features,
            action_features=action_features,
        )

    def _update_aggregated_features(
        self,
        mac_address: str,
        time_window: str,
        timestamp: str,
        action_name: str,
        success: int,
        duration: float,
        reward: float,
        feature_vector: Dict[str, float],
        all_features: Dict[str, Any]
    ):
        """
        Update or insert aggregated feature record.
        Accumulates statistics over the time window.

        The row is keyed by (mac_address, time_window, computed_at=timestamp).
        ``action_name`` and ``all_features`` are accepted for interface
        compatibility but are not stored by this method.

        Raises:
            Exception: database/parsing errors are logged and re-raised.
        """
        try:
            # Check if record exists
            existing = self.db.query("""
                SELECT * FROM ml_features_aggregated
                WHERE mac_address=? AND time_window=? AND computed_at=?
            """, (mac_address, time_window, timestamp))

            if existing:
                # Update existing record
                old = existing[0]
                old_count = old['total_actions']
                new_total = old_count + 1

                # Merge feature vectors (running average of each named feature).
                old_vector = self._load_json_field(old['feature_vector'])
                if not isinstance(old_vector, dict):
                    # Migration handle: legacy rows stored a positional list.
                    old_vector = {}

                merged_vector = {}
                # A key missing on either side contributes 0.0.
                for k in set(old_vector) | set(feature_vector):
                    v_old = old_vector.get(k, 0.0)
                    v_new = feature_vector.get(k, 0.0)
                    merged_vector[k] = (v_old * old_count + v_new) / new_total

                # The scalar statistics are recomputed inside SQL from the
                # stored values.
                self.db.execute("""
                    UPDATE ml_features_aggregated
                    SET total_actions=total_actions+1,
                        success_rate=(success_rate*total_actions + ?)/(total_actions+1),
                        avg_duration=(avg_duration*total_actions + ?)/(total_actions+1),
                        total_reward=total_reward + ?,
                        feature_vector=?
                    WHERE mac_address=? AND time_window=? AND computed_at=?
                """, (
                    success,
                    duration,
                    reward,
                    json.dumps(merged_vector),
                    mac_address,
                    time_window,
                    timestamp
                ))
            else:
                # Insert new record
                self.db.execute("""
                    INSERT INTO ml_features_aggregated (
                        mac_address, time_window, computed_at,
                        total_actions, success_rate, avg_duration, total_reward,
                        feature_vector
                    ) VALUES (?, ?, ?, 1, ?, ?, ?, ?)
                """, (
                    mac_address,
                    time_window,
                    timestamp,
                    float(success),
                    duration,
                    reward,
                    json.dumps(feature_vector)
                ))

        except Exception as e:
            logger.error(f"Error updating aggregated features: {e}")
            raise

    # ═══════════════════════════════════════════════════════════════════════
    # EXPORT FUNCTIONS
    # ═══════════════════════════════════════════════════════════════════════

    def export_for_training(
        self,
        format: str = 'csv',
        compress: bool = True,
        max_records: Optional[int] = None
    ) -> Tuple[str, int]:
        """
        Export consolidated features for deep learning training.

        Args:
            format: 'csv', 'jsonl', or 'parquet'. When 'parquet' is requested
                but pyarrow is not installed, a gzip-compressed CSV is
                written instead.
            compress: Whether to gzip the output
            max_records: Maximum records to export (None = use
                ``shared_data.ai_export_max_records`` or 1000; always clamped
                to 100..20000)

        Returns:
            Tuple of (file_path, record_count); ("", 0) when nothing is
            pending.

        Raises:
            ValueError: unsupported ``format``.
            Exception: any other failure is logged and re-raised.
        """
        try:
            if max_records is None:
                max_records = int(getattr(self.shared_data, "ai_export_max_records", 1000))
            max_records = max(100, min(int(max_records), 20000))

            # Decide the fallback before naming the file so the recorded batch
            # path always matches the file actually written.
            if format == 'parquet' and not self._pyarrow_available():
                logger.error("Parquet export requires pyarrow. Falling back to CSV.")
                format = 'csv'
                compress = True

            # Generate filename
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            base_filename = f"bjorn_training_{timestamp}.{format}"

            if compress and format != 'parquet':
                base_filename += '.gz'

            filepath = self.export_dir / base_filename

            # Fetch data (max_records is a clamped int)
            limit_clause = f"LIMIT {max_records}"
            records = self.db.query(f"""
                SELECT
                    mf.*,
                    mfa.feature_vector,
                    mfa.success_rate as aggregated_success_rate,
                    mfa.total_actions as aggregated_total_actions
                FROM ml_features mf
                LEFT JOIN ml_features_aggregated mfa
                    ON mf.mac_address = mfa.mac_address
                WHERE mf.consolidated=1 AND mf.export_batch_id IS NULL
                ORDER BY mf.timestamp DESC
                {limit_clause}
            """)

            if not records:
                logger.warning("No consolidated records to export")
                return "", 0

            # The join matches on mac_address only, so a feature row comes back
            # once per aggregated window of that host. Keep the first
            # occurrence of every id so nothing is exported twice.
            # NOTE(review): the LIMIT is still applied before de-duplication;
            # joining on the hourly window as well would need the schema.
            seen_ids = set()
            unique_records = []
            for row in records:
                row_id = row['id']
                if row_id in seen_ids:
                    continue
                seen_ids.add(row_id)
                unique_records.append(row)
            records = unique_records
            del unique_records, seen_ids

            # Extract IDs before export so we can free the records list early
            record_ids = [r['id'] for r in records]

            # Export based on format
            if format == 'csv':
                count = self._export_csv(records, filepath, compress)
            elif format == 'jsonl':
                count = self._export_jsonl(records, filepath, compress)
            elif format == 'parquet':
                count = self._export_parquet(records, filepath)
            else:
                raise ValueError(f"Unsupported format: {format}")

            # Free the large records list immediately after export —
            # record_ids is all we still need
            del records

            # Create export batch record
            batch_id = self._create_export_batch(filepath, count)

            # Update records with batch ID
            self._update_features_by_id("export_batch_id=?", (batch_id,), record_ids)
            del record_ids

            logger.success(
                f"Exported {count} records to {filepath} "
                f"(batch_id={batch_id})"
            )

            return str(filepath), count

        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise

    def _export_csv(
        self,
        records: List[Dict],
        filepath: Path,
        compress: bool
    ) -> int:
        """
        Export records as CSV.

        Scalar entries of the JSON feature columns become
        ``<column>_<key>`` fields; the aggregated feature vector becomes
        ``feat_<name>`` (dict) or ``feature_<index>`` (legacy list) fields.
        The header is the sorted union of all fields seen.

        Returns:
            Number of rows written.
        """
        open_func = gzip.open if compress else open
        mode = 'wt' if compress else 'w'

        # 1. Flatten all records first to collect all possible fieldnames
        flattened = []
        all_fieldnames = set()

        for r in records:
            flat = {
                'timestamp': r['timestamp'],
                'mac_address': r['mac_address'],
                'ip_address': r['ip_address'],
                'action_name': r['action_name'],
                'success': r['success'],
                'duration_seconds': r['duration_seconds'],
                'reward': r['reward']
            }

            # Parse and flatten features (nested values are skipped)
            for field in self._FEATURE_FIELDS:
                try:
                    features = self._load_json_field(r.get(field))
                    for k, v in features.items():
                        if isinstance(v, (int, float, bool, str)):
                            flat[f"{field}_{k}"] = v
                except Exception as e:
                    logger.debug(f"Skip bad JSON in {field}: {e}")

            # Add named feature vector
            if r.get('feature_vector'):
                try:
                    vector = json.loads(r['feature_vector'])
                    if isinstance(vector, dict):
                        for k, v in vector.items():
                            flat[f'feat_{k}'] = v
                    elif isinstance(vector, list):
                        for i, v in enumerate(vector):
                            flat[f'feature_{i}'] = v
                except Exception as e:
                    logger.debug(f"Skip bad feature vector: {e}")

            flattened.append(flat)
            all_fieldnames.update(flat)

        # 2. Sort fieldnames for consistency
        sorted_fieldnames = sorted(all_fieldnames)
        all_fieldnames = None  # Free the set

        # 3. Write CSV (rows missing a field get an empty cell)
        with open_func(filepath, mode, newline='', encoding='utf-8') as f:
            if flattened:
                writer = csv.DictWriter(f, fieldnames=sorted_fieldnames)
                writer.writeheader()
                writer.writerows(flattened)

        count = len(flattened)
        flattened = None  # Free the expanded list
        return count

    def _export_jsonl(
        self,
        records: List[Dict],
        filepath: Path,
        compress: bool
    ) -> int:
        """
        Export records as JSON Lines.

        JSON text columns are decoded so each line holds nested objects;
        columns that are missing, NULL or malformed become ``{}``.

        Returns:
            Number of records written.
        """
        open_func = gzip.open if compress else open
        mode = 'wt' if compress else 'w'

        with open_func(filepath, mode, encoding='utf-8') as f:
            for r in records:
                # Avoid mutating `records` in place to keep memory growth predictable.
                row = dict(r)
                for field in self._EXPORT_JSON_FIELDS:
                    try:
                        row[field] = self._load_json_field(row.get(field))
                    except Exception:
                        row[field] = {}

                if row.get('feature_vector'):
                    try:
                        row['feature_vector'] = json.loads(row['feature_vector'])
                    except Exception:
                        row['feature_vector'] = {}

                f.write(json.dumps(row) + '\n')

        return len(records)

    def _export_parquet(self, records: List[Dict], filepath: Path) -> int:
        """
        Export records as Parquet (requires pyarrow).

        When pyarrow cannot be imported, a gzip-compressed CSV is written
        next to the requested path with a ``.csv.gz`` suffix instead.

        Returns:
            Number of records written.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq

            # Flatten records
            flattened = []
            for r in records:
                flat = dict(r)
                # Parse JSON fields (NULL columns decode to {})
                for field in self._EXPORT_JSON_FIELDS:
                    flat[field] = self._load_json_field(r.get(field))

                if r.get('feature_vector'):
                    flat['feature_vector'] = json.loads(r['feature_vector'])

                flattened.append(flat)

            # Convert to Arrow table
            table = pa.Table.from_pylist(flattened)

            # Write parquet
            pq.write_table(table, filepath, compression='snappy')

            return len(records)

        except ImportError:
            logger.error("Parquet export requires pyarrow. Falling back to CSV.")
            # The fallback is gzip-compressed, so name it accordingly.
            return self._export_csv(records, filepath.with_suffix('.csv.gz'), compress=True)

    def _create_export_batch(self, filepath: Path, count: int) -> int:
        """
        Create export batch record and return batch ID.

        NOTE(review): the id is read back with ``last_insert_rowid()`` in a
        separate query; this presumably relies on ``self.db`` reusing the same
        connection for both calls — confirm against the DB wrapper.
        """
        self.db.execute("""
            INSERT INTO ml_export_batches (file_path, record_count, status)
            VALUES (?, ?, 'exported')
        """, (str(filepath), count))

        # Get the inserted ID
        return self.db.query("SELECT last_insert_rowid() as id")[0]['id']

    # ═══════════════════════════════════════════════════════════════════════
    # UTILITY METHODS
    # ═══════════════════════════════════════════════════════════════════════

    def get_export_stats(self) -> Dict[str, Any]:
        """
        Get statistics about exports.

        Returns:
            Dict with ``total_export_batches``, ``total_records_exported``,
            ``last_export_time`` and ``pending_export_count``; an empty dict
            when the queries fail.
        """
        try:
            batches = self.db.query("""
                SELECT COUNT(*) as total_batches,
                       SUM(record_count) as total_records,
                       MAX(created_at) as last_export
                FROM ml_export_batches
                WHERE status='exported'
            """)[0]

            pending = self.db.query("""
                SELECT COUNT(*) as cnt
                FROM ml_features
                WHERE consolidated=1 AND export_batch_id IS NULL
            """)[0]['cnt']

            return {
                # SUM() yields NULL when there are no rows, so coerce the
                # counters to 0 instead of leaking None.
                'total_export_batches': batches.get('total_batches') or 0,
                'total_records_exported': batches.get('total_records') or 0,
                'last_export_time': batches.get('last_export'),
                'pending_export_count': pending
            }
        except Exception as e:
            logger.error(f"Error getting export stats: {e}")
            return {}

    def flush_pending_uploads(self, max_files: int = 3) -> int:
        """
        Retry uploads for previously exported batches that were not transferred yet.
        Returns the number of successfully transferred files.
        """
        max_files = max(0, int(max_files))
        if max_files <= 0:
            return 0

        # No heavy "reliquat" tracking needed: pending uploads = files present in export_dir.
        uploaded = 0
        for pending_file in self._list_pending_export_files(limit=max_files):
            if self.upload_to_server(pending_file):
                uploaded += 1
            elif self.last_server_attempted and self.last_server_contact_ok is False:
                # Stop early when server is unreachable to avoid repeated noise.
                break
        return uploaded

    def _list_pending_export_files(self, limit: int = 3) -> List[str]:
        """
        Return oldest export files present in export_dir.
        This makes the backlog naturally equal to the number of files on disk.

        Only regular files named ``bjorn_training_*`` are considered; the
        result is ordered oldest first by modification time. Any error
        yields an empty list.
        """
        limit = max(0, int(limit))
        if limit <= 0:
            return []

        try:
            d = Path(self.export_dir)
            if not d.exists():
                return []

            def _safe_mtime(path: Path) -> float:
                # Files whose stat fails sort last.
                try:
                    return path.stat().st_mtime
                except Exception:
                    return float("inf")

            # Keep only the N oldest files in memory instead of sorting all candidates.
            files_iter = (p for p in d.glob("bjorn_training_*") if p.is_file())
            oldest = heapq.nsmallest(limit, files_iter, key=_safe_mtime)
            return [str(p) for p in oldest]
        except Exception:
            return []

    def _mark_batch_status(self, filepath: str, status: str, notes: str = "") -> None:
        """Update ml_export_batches status for a given file path (best-effort)."""
        try:
            self.db.execute(
                """
                UPDATE ml_export_batches
                SET status=?, notes=?
                WHERE file_path=?
                """,
                (status, notes or "", str(filepath)),
            )
        except Exception as e:
            # Best-effort: never let bookkeeping break an upload, but leave a trace.
            logger.debug(f"Could not mark batch status for {filepath}: {e}")

    def _safe_delete_uploaded_export(self, filepath: Path) -> None:
        """
        Delete a successfully-uploaded export file if configured to do so.

        Nothing is deleted when ``ai_delete_export_after_upload`` is falsy or
        when the resolved path is not located under ``export_dir``. Errors
        are swallowed (best-effort cleanup).
        """
        try:
            if not bool(self.shared_data.config.get("ai_delete_export_after_upload", True)):
                return

            fp = Path(filepath).resolve()
            base = Path(self.export_dir).resolve()
            # Safety: only delete files under export_dir.
            if base not in fp.parents:
                return

            try:
                fp.unlink()
            except FileNotFoundError:
                # Already gone; same effect as missing_ok=True on any Python 3.
                pass
        except Exception as e:
            logger.debug(f"Could not delete uploaded export {filepath}: {e}")

    def upload_to_server(self, filepath: str) -> bool:
        """
        Upload export file to AI Validation Server.

        The call is skipped (returns False without contacting the server)
        when ``requests`` is missing, ``ai_server_url`` is not configured, a
        retry backoff window is active, or the file does not exist. Failed
        attempts extend the exponential backoff; a 200 response resets it,
        marks the batch as transferred and optionally deletes the file.

        Args:
            filepath: Path to the file to upload

        Returns:
            True if upload successful
        """
        self._set_server_contact_state(False, None)
        try:
            import requests
        except ImportError:
            requests = None

        if requests is None:
            logger.info_throttled(
                "AI upload skipped: requests not installed",
                key="ai_upload_no_requests",
                interval_s=600.0,
            )
            return False

        url = self.shared_data.config.get("ai_server_url")
        if not url:
            logger.info_throttled(
                "AI upload skipped: ai_server_url not configured",
                key="ai_upload_no_url",
                interval_s=600.0,
            )
            return False

        backoff_s = max(10, int(self.shared_data.config.get("ai_upload_retry_backoff_s", 120)))
        max_backoff_s = 3600
        now_mono = time.monotonic()
        if now_mono < self._upload_backoff_until:
            remaining = int(self._upload_backoff_until - now_mono)
            logger.debug(f"AI upload backoff active ({remaining}s remaining)")
            logger.info_throttled(
                "AI upload deferred: backoff active",
                key="ai_upload_backoff_active",
                interval_s=180.0,
            )
            return False

        try:
            filepath = Path(filepath)

            if not filepath.exists():
                logger.warning(f"AI upload skipped: file not found: {filepath}")
                self._mark_batch_status(str(filepath), "missing", "file not found")
                return False

            # Get MAC address for unique identification
            try:
                from ai_utils import get_system_mac
                mac = get_system_mac()
            except ImportError:
                mac = "unknown"

            logger.debug(f"Uploading {filepath.name} to AI Server ({url}) unique_id={mac}")
            self._set_server_contact_state(True, None)

            # Tolerate a configured URL with a trailing slash.
            endpoint = str(url).rstrip('/') + "/upload"

            with open(filepath, 'rb') as f:
                files = {'file': f}
                # Send MAC as query param
                # Server expects ?mac_addr=...
                params = {'mac_addr': mac}

                # Short timeout to avoid blocking
                response = requests.post(endpoint, files=files, params=params, timeout=10)

            if response.status_code == 200:
                self._set_server_contact_state(True, True)
                self._upload_backoff_until = 0.0
                self._upload_backoff_current_s = 0.0
                logger.success(f"Uploaded {filepath.name} successfully")
                self._mark_batch_status(str(filepath), "transferred", "uploaded")
                self._safe_delete_uploaded_export(filepath)
                return True

            self._set_server_contact_state(True, False)
            next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
            logger.debug(
                f"AI upload HTTP failure for {filepath.name}: status={response.status_code}, "
                f"next retry in {next_retry_s}s"
            )
            logger.info_throttled(
                f"AI upload deferred (HTTP {response.status_code})",
                key=f"ai_upload_http_{response.status_code}",
                interval_s=300.0,
            )
            return False

        except Exception as e:
            self._set_server_contact_state(True, False)
            next_retry_s = self._apply_upload_backoff(backoff_s, max_backoff_s)
            logger.debug(f"AI upload exception for {filepath}: {e} (next retry in {next_retry_s}s)")
            logger.info_throttled(
                "AI upload deferred: server unreachable (retry later)",
                key="ai_upload_exception",
                interval_s=300.0,
            )
            return False

    def cleanup_old_exports(self, days: int = 30):
        """
        Delete export files older than N days.

        Files referenced by ``ml_export_batches`` rows created before the
        cutoff are removed from disk, then those rows are deleted. Failures
        are logged, never raised.

        NOTE(review): the cutoff is compared as an ISO-8601 string of local
        time; confirm this matches how ``created_at`` is stored.
        """
        try:
            cutoff = datetime.now() - timedelta(days=days)
            cutoff_iso = cutoff.isoformat()

            old_batches = self.db.query("""
                SELECT file_path FROM ml_export_batches
                WHERE created_at < ?
            """, (cutoff_iso,))

            deleted = 0
            for batch in old_batches:
                filepath = Path(batch['file_path'])
                try:
                    if filepath.exists():
                        filepath.unlink()
                        deleted += 1
                except OSError as e:
                    # One undeletable file must not abort the whole cleanup.
                    logger.warning(f"Could not delete old export {filepath}: {e}")

            # Clean up database records
            self.db.execute("""
                DELETE FROM ml_export_batches
                WHERE created_at < ?
            """, (cutoff_iso,))

            logger.info(f"Cleaned up {deleted} old export files")

        except Exception as e:
            logger.error(f"Cleanup failed: {e}")
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# END OF FILE
|
|
# ═══════════════════════════════════════════════════════════════════════════
|