# Bjorn/feature_logger.py

"""
feature_logger.py - Dynamic Feature Logging Engine for Bjorn
═══════════════════════════════════════════════════════════════════════════

Purpose:
    Automatically capture ALL relevant features from action executions
    for deep learning model training. No manual feature declaration needed.

Architecture:
    - Automatic feature extraction from all data sources
    - Time-series aggregation
    - Network topology features
    - Action success patterns
    - Lightweight storage optimized for Pi Zero
    - Export format ready for deep learning

Author: Bjorn Team (Enhanced AI Version)
Version: 2.0.0
"""

import json
import time
import hashlib
import random
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict, deque
from logger import Logger

# Module-level logger used by every method of FeatureLogger in this file.
# NOTE(review): level=20 presumably corresponds to INFO, as in the numeric
# levels of the stdlib `logging` module - confirm against the project Logger.
logger = Logger(name="feature_logger.py", level=20)


class FeatureLogger:
    """
    Captures comprehensive features from network reconnaissance
    and action execution for deep learning.
    """

    def __init__(self, shared_data):
        """
        Initialize the feature logger and make sure its tables exist.

        Args:
            shared_data: Shared application state. This method reads its
                ``db`` attribute (the database handle) and the optional
                ``ai_feature_hosts_limit`` attribute, which caps how many
                hosts are kept in the in-memory history (default 512,
                never lower than 64).
        """
        self.shared_data = shared_data
        self.db = shared_data.db

        # A None or non-numeric limit must not abort start-up: fall back to
        # the default instead of letting int() raise out of the constructor.
        default_limit = 512
        raw_limit = getattr(self.shared_data, "ai_feature_hosts_limit", default_limit)
        try:
            hosts_limit = int(raw_limit)
        except (TypeError, ValueError, OverflowError):
            logger.error(
                f"Invalid ai_feature_hosts_limit {raw_limit!r}; using {default_limit}"
            )
            hosts_limit = default_limit
        self._max_hosts_tracked = max(64, hosts_limit)

        # Rolling windows for temporal features (memory efficient)
        self.recent_actions = deque(maxlen=100)
        self.host_history = defaultdict(lambda: deque(maxlen=50))

        # Initialize feature tables
        self._ensure_tables_exist()

        logger.info("FeatureLogger initialized - auto-discovery mode enabled")

    # ═══════════════════════════════════════════════════════════════════════
    # DATABASE SCHEMA
    # ═══════════════════════════════════════════════════════════════════════

    def _ensure_tables_exist(self):
        """
        Create the ML feature tables and indexes when they are missing.

        The statements run in order: the ``ml_features`` table, its two
        indexes, the ``ml_features_aggregated`` table and the
        ``ml_export_batches`` table. The first failing statement stops the
        sequence; the exception is logged and not re-raised.
        """
        schema_statements = (
            # Main feature log table
            """
                CREATE TABLE IF NOT EXISTS ml_features (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,

                    -- Identifiers
                    mac_address TEXT,
                    ip_address TEXT,
                    action_name TEXT,

                    -- Context features (JSON)
                    host_features TEXT,      -- Vendor, ports, services, etc.
                    network_features TEXT,   -- Topology, neighbors, subnets
                    temporal_features TEXT,  -- Time patterns, sequences
                    action_features TEXT,    -- Action-specific metadata

                    -- Outcome
                    success INTEGER,
                    duration_seconds REAL,
                    reward REAL,

                    -- Raw event data (for replay)
                    raw_event TEXT,

                    -- Consolidation status
                    consolidated INTEGER DEFAULT 0,
                    export_batch_id INTEGER
                )
            """,
            # Indexes for fast queries
            """
                CREATE INDEX IF NOT EXISTS idx_ml_features_mac
                ON ml_features(mac_address, timestamp DESC)
            """,
            """
                CREATE INDEX IF NOT EXISTS idx_ml_features_consolidated
                ON ml_features(consolidated, timestamp)
            """,
            # Aggregated features table (pre-computed for efficiency)
            """
                CREATE TABLE IF NOT EXISTS ml_features_aggregated (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    computed_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,

                    mac_address TEXT,
                    time_window TEXT,  -- 'hourly', 'daily', 'weekly'

                    -- Aggregated metrics
                    total_actions INTEGER,
                    success_rate REAL,
                    avg_duration REAL,
                    total_reward REAL,

                    -- Action distribution
                    action_counts TEXT,  -- JSON: {action_name: count}

                    -- Discovery metrics
                    new_ports_found INTEGER,
                    new_services_found INTEGER,
                    credentials_found INTEGER,

                    -- Feature vector (for DL)
                    feature_vector TEXT,  -- JSON array of numerical features

                    UNIQUE(mac_address, time_window, computed_at)
                )
            """,
            # Export batches tracking
            """
                CREATE TABLE IF NOT EXISTS ml_export_batches (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    record_count INTEGER,
                    file_path TEXT,
                    status TEXT DEFAULT 'pending',  -- pending, exported, transferred
                    notes TEXT
                )
            """,
        )

        try:
            for statement in schema_statements:
                self.db.execute(statement)
            logger.info("ML feature tables initialized")
        except Exception as e:
            logger.error(f"Failed to create ML tables: {e}")

    # ═══════════════════════════════════════════════════════════════════════
    # AUTOMATIC FEATURE EXTRACTION
    # ═══════════════════════════════════════════════════════════════════════

    def log_action_execution(
        self,
        mac_address: str,
        ip_address: str,
        action_name: str,
        success: bool,
        duration: float,
        reward: float,
        raw_event: Dict[str, Any]
    ):
        """
        Log a complete action execution with automatically extracted features.

        The host, network, temporal and action features are extracted, stored
        as JSON in one ``ml_features`` row, and the in-memory rolling windows
        are updated afterwards (so the temporal features describe the history
        *before* this execution). Any failure is logged and not re-raised.

        Args:
            mac_address: Target MAC address; the call is skipped when empty
            ip_address: Target IP address
            action_name: Name of executed action
            success: Whether action succeeded
            duration: Execution time in seconds
            reward: Calculated reward value
            raw_event: Complete event data (for replay/debugging)
        """
        try:
            # Shield against missing MAC
            if not mac_address:
                logger.debug("Skipping ML log: missing MAC address")
                return

            # Extract features from multiple sources
            host_features = self._extract_host_features(mac_address, ip_address)
            network_features = self._extract_network_features(mac_address)
            temporal_features = self._extract_temporal_features(mac_address, action_name)
            action_features = self._extract_action_features(action_name, raw_event)

            def to_json(payload):
                # default=str: host rows and raw events may carry datetime
                # (or other non-JSON) values. Stringifying them is preferable
                # to losing the whole training record on a TypeError.
                return json.dumps(payload, default=str)

            # Store in database
            self.db.execute("""
                INSERT INTO ml_features (
                    mac_address, ip_address, action_name,
                    host_features, network_features, temporal_features, action_features,
                    success, duration_seconds, reward, raw_event
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                mac_address, ip_address, action_name,
                to_json(host_features),
                to_json(network_features),
                to_json(temporal_features),
                to_json(action_features),
                1 if success else 0,
                duration,
                reward,
                to_json(raw_event)
            ))

            # Update rolling windows; both entries share one timestamp
            logged_at = time.time()
            self.recent_actions.append({
                'mac': mac_address,
                'action': action_name,
                'success': success,
                'timestamp': logged_at
            })

            self.host_history[mac_address].append({
                'action': action_name,
                'success': success,
                'timestamp': logged_at
            })
            self._prune_host_history()

            logger.debug(
                f"Logged features for {action_name} on {mac_address} "
                f"(success={success}, features={len(host_features)}+{len(network_features)}+"
                f"{len(temporal_features)}+{len(action_features)})"
            )

            # Prune old database records to save disk space (keep last 1000)
            if random.random() < 0.05: # 5% chance to prune to avoid overhead every hit
                self._prune_database_records()

        except Exception as e:
            logger.error(f"Failed to log action execution: {e}")

    def _prune_host_history(self):
        """
        Bound host_history keys to avoid unbounded growth over very long runtimes.

        When more than ``self._max_hosts_tracked`` hosts are tracked, the hosts
        whose most recent entry is oldest are dropped until the limit is met.
        Hosts with an empty history rank as oldest. Failures are logged at
        debug level and never propagate, since pruning is best-effort.
        """
        overflow = len(self.host_history) - self._max_hosts_tracked
        if overflow <= 0:
            return

        def last_activity(item):
            _, entries = item
            return entries[-1]['timestamp'] if entries else 0.0

        try:
            # sorted() is stable, so hosts with equal timestamps keep their
            # insertion order; the first `overflow` items are the stalest.
            stalest = sorted(self.host_history.items(), key=last_activity)[:overflow]
            for mac, _ in stalest:
                self.host_history.pop(mac, None)
        except Exception as e:
            logger.debug(f"Failed to prune host history: {e}")

    def _prune_database_records(self, limit: int = 1000):
        """
        Keep the ml_features table within a reasonable size limit.

        Deletes every row except the ``limit`` most recent ones. Failures are
        logged at debug level and not re-raised.

        Args:
            limit: Number of newest rows to keep (coerced with ``int``).
        """
        try:
            keep = int(limit)
            # The limit is bound as a parameter instead of being formatted into
            # the SQL text. `id DESC` breaks ties: `timestamp` defaults to
            # CURRENT_TIMESTAMP (one-second resolution), so rows logged within
            # the same second would otherwise be kept or dropped arbitrarily.
            self.db.execute("""
                DELETE FROM ml_features
                WHERE id NOT IN (
                    SELECT id FROM ml_features
                    ORDER BY timestamp DESC, id DESC
                    LIMIT ?
                )
            """, (keep,))
        except Exception as e:
            logger.debug(f"Failed to prune ml_features: {e}")

    def _extract_host_features(self, mac: str, ip: str) -> Dict[str, Any]:
        """
        Extract features about the target host.
        Auto-discovers all relevant attributes from database.

        Args:
            mac: MAC address used to look the host up
            ip: Target IP address (currently unused; the addresses stored on
                the host row are used instead)

        Returns:
            Feature dict. Empty when the host is unknown; partially filled
            when an error interrupts extraction (the error is logged).
        """
        features = {}

        try:
            # Get host data
            host = self.db.get_host_by_mac(mac)
            if not host:
                return features

            # Basic identifiers (hashed for privacy if needed)
            features['mac_hash'] = hashlib.md5(mac.encode()).hexdigest()[:8]
            features['vendor_oui'] = mac[:8].upper() if mac else None

            # Vendor classification
            vendor = host.get('vendor', '')
            features['vendor'] = vendor
            features['vendor_category'] = self._categorize_vendor(vendor)

            # Network interfaces
            ips = [p.strip() for p in (host.get('ips', '') or '').split(';') if p.strip()]
            features['ip_count'] = len(ips)
            features['has_multiple_ips'] = len(ips) > 1

            # Subnet classification (based on the first stored address)
            if ips:
                features['subnet'] = '.'.join(ips[0].split('.')[:3]) + '.0/24'
                features['is_private'] = self._is_private_ip(ips[0])

            # Open ports
            ports_str = host.get('ports', '') or ''
            ports = [int(p) for p in ports_str.split(';') if p.strip().isdigit()]
            features['port_count'] = len(ports)
            features['ports'] = sorted(ports)[:20]  # Limit to the 20 lowest ports

            # Port profiles (auto-detect common patterns)
            features['port_profile'] = self._detect_port_profile(ports)
            features['has_ssh'] = 22 in ports
            features['has_http'] = 80 in ports or 8080 in ports
            features['has_https'] = 443 in ports
            features['has_smb'] = 445 in ports
            features['has_rdp'] = 3389 in ports
            features['has_database'] = any(p in ports for p in [3306, 5432, 1433, 27017])

            # Services detected
            services = self._get_services_for_host(mac)
            features['service_count'] = len(services)
            features['services'] = services

            # Hostnames
            hostnames = [h.strip() for h in (host.get('hostnames', '') or '').split(';') if h.strip()]
            features['hostname_count'] = len(hostnames)
            if hostnames:
                features['primary_hostname'] = hostnames[0]
                features['hostname_hints'] = self._extract_hostname_hints(hostnames[0])

            # First/last seen. datetime values are stored as ISO strings so the
            # feature dict stays JSON-serializable.
            first_seen_raw = host.get('first_seen')
            last_seen_raw = host.get('last_seen')
            features['first_seen'] = (
                first_seen_raw.isoformat() if isinstance(first_seen_raw, datetime)
                else first_seen_raw
            )
            features['last_seen'] = (
                last_seen_raw.isoformat() if isinstance(last_seen_raw, datetime)
                else last_seen_raw
            )

            # Calculate age
            if first_seen_raw:
                first_seen_dt = None
                if isinstance(first_seen_raw, datetime):
                    first_seen_dt = first_seen_raw
                elif isinstance(first_seen_raw, str):
                    try:
                        first_seen_dt = datetime.fromisoformat(first_seen_raw)
                    except ValueError:
                        first_seen_dt = None

                if first_seen_dt is None:
                    # Unparseable value: treated as "seen just now" (age 0)
                    age_hours = 0.0
                else:
                    # Use "now" in the same timezone as the stored value;
                    # subtracting a naive and an aware datetime raises TypeError.
                    now = datetime.now(first_seen_dt.tzinfo)
                    age_hours = (now - first_seen_dt).total_seconds() / 3600

                features['age_hours'] = round(age_hours, 2)
                features['is_new'] = age_hours < 24

            # Credentials found
            creds = self._get_credentials_for_host(mac)
            features['credential_count'] = len(creds)
            features['has_credentials'] = len(creds) > 0

            # OS fingerprinting hints
            features['os_hints'] = self._guess_os(vendor, ports, hostnames)

        except Exception as e:
            logger.error(f"Error extracting host features: {e}")

        return features

    def _extract_network_features(self, mac: str) -> Dict[str, Any]:
        """
        Build topology and relationship features for the network around a host.

        Covers the overall host count, the /24-style subnet distribution, how
        many hosts share the target's vendor, how many other hosts have a
        similar open-port set, and the share of hosts seen recently.

        Args:
            mac: MAC address of the target host

        Returns:
            Feature dict; partially filled when an error interrupts
            extraction (the error is logged).
        """
        def split_field(raw):
            # ';'-separated column -> list of non-empty, stripped items
            return [item.strip() for item in (raw or '').split(';') if item.strip()]

        def open_ports_of(row):
            raw_ports = row.get('ports', '') or ''
            return {int(tok) for tok in raw_ports.split(';') if tok.strip().isdigit()}

        features = {}

        try:
            hosts = self.db.get_all_hosts()

            # Network size
            features['total_hosts'] = len(hosts)

            # Subnet distribution, keyed by the first address of each host
            hosts_per_subnet = defaultdict(int)
            for row in hosts:
                addresses = split_field(row.get('ips', ''))
                if not addresses:
                    continue
                prefix = '.'.join(addresses[0].split('.')[:3])
                hosts_per_subnet[prefix + '.0'] += 1

            features['subnet_count'] = len(hosts_per_subnet)
            features['largest_subnet_size'] = max(hosts_per_subnet.values(), default=0)

            # Hosts sharing the target's vendor (the target itself is counted)
            target = self.db.get_host_by_mac(mac)
            reference_ports = set()
            if target:
                target_vendor = target.get('vendor', '')
                features['similar_vendor_count'] = len(
                    [row for row in hosts if row.get('vendor') == target_vendor]
                )
                reference_ports = open_ports_of(target)

            # Other hosts whose port set has Jaccard similarity > 0.5
            if reference_ports:
                lookalikes = 0
                for row in hosts:
                    if row.get('mac_address') == mac:
                        continue
                    candidate_ports = open_ports_of(row)
                    if not candidate_ports:
                        continue
                    shared = len(reference_ports & candidate_ports)
                    combined = len(reference_ports | candidate_ports)
                    jaccard = shared / combined if combined > 0 else 0
                    if jaccard > 0.5:
                        lookalikes += 1

                features['similar_port_profile_count'] = lookalikes

            # Network activity level
            active = len(
                [row for row in hosts if self._is_recently_active(row.get('last_seen'))]
            )
            if hosts:
                features['active_host_ratio'] = round(active / len(hosts), 2)
            else:
                features['active_host_ratio'] = 0

        except Exception as e:
            logger.error(f"Error extracting network features: {e}")

        return features

    def _extract_temporal_features(self, mac: str, action: str) -> Dict[str, Any]:
        """
        Extract time-based and sequence features.
        Discovers temporal patterns in attack sequences.

        Args:
            mac: MAC address of the target host
            action: Name of the action about to be logged (used to detect
                repeated attempts in the host's in-memory history)

        Returns:
            Feature dict; partially filled when an error interrupts
            extraction (the error is logged).
        """
        features = {}

        try:
            # Current time features
            now = datetime.now()
            features['hour_of_day'] = now.hour
            features['day_of_week'] = now.weekday()
            features['is_weekend'] = now.weekday() >= 5
            features['is_night'] = now.hour < 6 or now.hour >= 22

            # Action history for this host (.get avoids creating an entry in
            # the defaultdict for hosts that have no history yet)
            history = list(self.host_history.get(mac, []))
            features['previous_action_count'] = len(history)

            if history:
                # Last action
                last = history[-1]
                features['last_action'] = last['action']
                features['last_action_success'] = last['success']
                features['seconds_since_last'] = round(time.time() - last['timestamp'], 1)

                # Success rate history
                successes = sum(1 for h in history if h['success'])
                features['historical_success_rate'] = round(successes / len(history), 2)

                # Action sequence
                recent_sequence = [h['action'] for h in history[-5:]]
                features['recent_action_sequence'] = recent_sequence

                # Repeated action detection
                same_action_count = sum(1 for h in history if h['action'] == action)
                features['same_action_attempts'] = same_action_count
                features['is_retry'] = same_action_count > 0

            # Global action patterns
            recent = list(self.recent_actions)
            if recent:
                # Action distribution in recent history
                action_counts = defaultdict(int)
                for a in recent:
                    action_counts[a['action']] += 1

                features['most_common_recent_action'] = max(
                    action_counts.items(),
                    key=lambda x: x[1]
                )[0] if action_counts else None

                # Global success rate
                global_successes = sum(1 for a in recent if a['success'])
                features['global_success_rate'] = round(
                    global_successes / len(recent), 2
                )

            # Time since first seen
            host = self.db.get_host_by_mac(mac)
            if host and host.get('first_seen'):
                ts = host['first_seen']
                first_seen = None
                if isinstance(ts, datetime):
                    first_seen = ts
                elif isinstance(ts, str):
                    try:
                        first_seen = datetime.fromisoformat(ts)
                    except ValueError:
                        first_seen = None

                if first_seen is None:
                    # Unparseable value: treated as "discovered just now"
                    elapsed_hours = 0.0
                else:
                    # Compare against "now" in the stored value's timezone;
                    # mixing naive and aware datetimes raises TypeError.
                    reference = (
                        datetime.now(first_seen.tzinfo)
                        if first_seen.tzinfo is not None else now
                    )
                    elapsed_hours = (reference - first_seen).total_seconds() / 3600

                features['hours_since_discovery'] = round(elapsed_hours, 1)

        except Exception as e:
            logger.error(f"Error extracting temporal features: {e}")

        return features

    def _extract_action_features(self, action_name: str, raw_event: Dict) -> Dict[str, Any]:
        """
        Extract action-specific features.
        Auto-discovers relevant metadata from action execution.

        Args:
            action_name: Name of the executed action
            raw_event: Event data; the optional ``port`` and ``metadata`` keys
                are read. ``None`` is treated as an empty event.

        Returns:
            Feature dict with the action name/type, the target port when it is
            numeric, one ``meta_<key>`` entry per scalar metadata value, and
            the current operation mode.
        """
        features = {}

        try:
            features['action_name'] = action_name

            # Action type classification
            features['action_type'] = self._classify_action_type(action_name)

            # A missing event must not cost us the remaining features
            event = raw_event or {}

            # Port-specific actions. A non-numeric port is skipped on its own
            # instead of aborting the whole extraction.
            port = event.get('port')
            if port:
                try:
                    port_number = int(port)
                except (TypeError, ValueError):
                    port_number = None
                if port_number is not None:
                    features['target_port'] = port_number
                    features['is_standard_port'] = port_number < 1024

            # Extract any additional metadata from raw event
            # This allows actions to add custom features
            if 'metadata' in event:
                metadata = event['metadata']
                if isinstance(metadata, dict):
                    # Flatten scalar metadata values into features
                    for key, value in metadata.items():
                        if isinstance(value, (int, float, bool, str)):
                            features[f'meta_{key}'] = value

            # Execution context
            features['operation_mode'] = self.shared_data.operation_mode

        except Exception as e:
            logger.error(f"Error extracting action features: {e}")

        return features

    # ═══════════════════════════════════════════════════════════════════════
    # HELPER METHODS
    # ═══════════════════════════════════════════════════════════════════════

    def _categorize_vendor(self, vendor: str) -> str:
        """
        Map a vendor string to a coarse device category.

        Matching is a case-insensitive substring test against keyword lists,
        tried in a fixed order; the first category with a hit wins.

        Returns:
            'unknown' for an empty vendor, the matching category name, or
            'other' when no keyword matches.
        """
        if not vendor:
            return 'unknown'

        needle_space = vendor.lower()

        # Order matters: earlier categories shadow later ones that share a
        # keyword (e.g. 'apple' and 'xiaomi' appear twice).
        keyword_groups = (
            ('networking', ('cisco', 'juniper', 'ubiquiti', 'mikrotik', 'tp-link',
                            'netgear', 'asus', 'd-link', 'linksys')),
            ('iot', ('hikvision', 'dahua', 'axis', 'hanwha', 'tuya', 'sonoff',
                     'shelly', 'xiaomi', 'yeelight')),
            ('nas', ('synology', 'qnap', 'netapp', 'truenas', 'unraid')),
            ('compute', ('raspberry', 'intel', 'apple', 'dell', 'hp', 'lenovo', 'acer')),
            ('virtualization', ('vmware', 'microsoft', 'citrix', 'proxmox')),
            ('mobile', ('apple', 'samsung', 'huawei', 'xiaomi', 'google', 'oneplus')),
        )

        for label, keywords in keyword_groups:
            for keyword in keywords:
                if keyword in needle_space:
                    return label

        return 'other'

    def _is_private_ip(self, ip: str) -> bool:
        """
        Check if IP is in private range.

        Only dotted-quad IPv4 strings are recognised; the private ranges
        tested are 10.0.0.0/8, 172.16.0.0/12 and 192.168.0.0/16.

        Args:
            ip: Address to test

        Returns:
            True for a well-formed address inside one of those ranges; False
            for anything else, including malformed or non-string input.
        """
        if not ip or not isinstance(ip, str):
            return False

        parts = ip.strip().split('.')
        if len(parts) != 4:
            return False

        # Validate every octet, not just the two that decide the range, so
        # strings such as "10.999.0.1" or "10.0.x.1" are rejected.
        if not all(part.isdigit() for part in parts):
            return False
        try:
            octets = [int(part) for part in parts]
        except ValueError:
            return False
        if any(octet > 255 for octet in octets):
            return False

        first, second = octets[0], octets[1]
        return (
            first == 10 or
            (first == 172 and 16 <= second <= 31) or
            (first == 192 and second == 168)
        )

    def _detect_port_profile(self, ports: List[int]) -> str:
        """
        Guess a device profile from the set of open ports.

        Each known profile is scored by how many of its signature ports are
        open. The best-scoring profile is returned when at least two ports
        match; on a tie the profile listed first wins.

        Returns:
            'unknown' when no ports are given, a profile name, or 'generic'
            when no profile matches on two or more ports.
        """
        if not ports:
            return 'unknown'

        open_ports = set(ports)

        signatures = (
            ('camera', {554, 80, 8000, 37777}),
            ('web_server', {80, 443, 8080, 8443}),
            ('nas', {5000, 5001, 548, 139, 445}),
            ('database', {3306, 5432, 1433, 27017, 6379}),
            ('linux_server', {22, 80, 443}),
            ('windows_server', {135, 139, 445, 3389}),
            ('printer', {9100, 515, 631}),
            ('router', {22, 23, 80, 443, 161}),
        )

        scored = [(name, len(open_ports & signature)) for name, signature in signatures]
        # max() returns the first maximal element, i.e. the earliest profile
        winner, hits = max(scored, key=lambda pair: pair[1])

        if hits < 2:
            return 'generic'
        return winner

    def _get_services_for_host(self, mac: str) -> List[str]:
        """
        Get list of detected services for host.

        Args:
            mac: MAC address of the host

        Returns:
            Distinct, non-empty service names from ``port_services``; an empty
            list when the lookup fails.
        """
        try:
            results = self.db.query("""
                SELECT DISTINCT service
                FROM port_services
                WHERE mac_address=?
            """, (mac,))

            return [r['service'] for r in results if r.get('service')]
        except Exception as e:
            # Best-effort lookup: only ordinary errors are absorbed, so
            # KeyboardInterrupt/SystemExit still propagate.
            logger.debug(f"Failed to load services for {mac}: {e}")
            return []

    def _extract_hostname_hints(self, hostname: str) -> List[str]:
        """
        Derive device-type hints from a hostname.

        A hint is emitted when any of its keywords occurs as a
        case-insensitive substring of the hostname. Hints come back in a
        fixed order: nas, camera, router, server, printer, iot.
        """
        if not hostname:
            return []

        lowered = hostname.lower()

        hint_keywords = (
            ('nas', ('nas', 'storage', 'diskstation')),
            ('camera', ('cam', 'ipc', 'nvr', 'dvr')),
            ('router', ('router', 'gateway', 'gw')),
            ('server', ('server', 'srv', 'host')),
            ('printer', ('printer', 'print')),
            ('iot', ('iot', 'sensor', 'smart')),
        )

        return [
            hint
            for hint, words in hint_keywords
            if any(word in lowered for word in words)
        ]

    def _get_credentials_for_host(self, mac: str) -> List[Dict]:
        """
        Get credentials found for host.

        Args:
            mac: MAC address of the host

        Returns:
            Whatever ``self.db.query`` returns for the ``creds`` lookup
            (service, user, port per row); an empty list when it fails.
        """
        try:
            return self.db.query("""
                SELECT service, user, port
                FROM creds
                WHERE mac_address=?
            """, (mac,))
        except Exception as e:
            # Best-effort lookup: only ordinary errors are absorbed, so
            # KeyboardInterrupt/SystemExit still propagate.
            logger.debug(f"Failed to load credentials for {mac}: {e}")
            return []

    def _guess_os(self, vendor: str, ports: List[int], hostnames: List[str]) -> str:
        """
        Guess the operating system from vendor, open ports and hostname.

        The heuristics are evaluated in priority order and the first one that
        matches decides the result: vendor/RDP indicators, then port
        combinations, then distribution names in the first hostname.

        Returns:
            'windows', 'macos', 'linux', or 'unknown' when nothing matches or
            no indicators were supplied.
        """
        if not (vendor or ports or hostnames):
            return 'unknown'

        vendor_lc = (vendor or '').lower()
        open_ports = set(ports or [])
        first_name = hostnames[0].lower() if hostnames else ''
        linux_distros = ('ubuntu', 'debian', 'centos', 'rhel')

        # (verdict, matched) pairs, highest priority first
        rules = (
            # Strong indicators
            ('windows', 'microsoft' in vendor_lc or 3389 in open_ports),
            ('macos', 'apple' in vendor_lc or 'mac' in first_name),
            ('linux', 'raspberry' in vendor_lc),
            # Port-based guessing
            ('linux', open_ports.issuperset({22, 80})),
            ('windows', open_ports.issuperset({135, 139, 445})),
            # Hostname hints
            ('linux', any(distro in first_name for distro in linux_distros)),
        )

        for verdict, matched in rules:
            if matched:
                return verdict

        return 'unknown'

    def _is_recently_active(self, last_seen: Optional[str]) -> bool:
        """
        Check if host was active in last 24h.

        Args:
            last_seen: ISO-format timestamp string, or a ``datetime`` object
                (both naive and timezone-aware values are supported)

        Returns:
            True when ``last_seen`` is less than 24 hours before now; False
            for missing, unparseable or unsupported values.
        """
        if not last_seen:
            return False

        if isinstance(last_seen, str):
            try:
                last_seen_dt = datetime.fromisoformat(last_seen)
            except ValueError:
                return False
        elif isinstance(last_seen, datetime):
            last_seen_dt = last_seen
        else:
            return False

        # Take "now" in the timestamp's own timezone (None -> naive local
        # time); subtracting naive from aware datetimes raises TypeError.
        now = datetime.now(last_seen_dt.tzinfo)
        hours_ago = (now - last_seen_dt).total_seconds() / 3600
        return hours_ago < 24

    def _classify_action_type(self, action_name: str) -> str:
        """
        Map an action name to a high-level category.

        Categories are tested in order against the lower-cased name using
        substring keywords; the first category with a hit is returned and
        'other' is the fallback.
        """
        lowered = action_name.lower()

        keyword_table = (
            ('bruteforce', ('brute', 'crack')),
            ('enumeration', ('scan', 'enum')),
            ('exploitation', ('exploit',)),
            ('extraction', ('dump', 'extract')),
        )

        for category, keywords in keyword_table:
            if any(keyword in lowered for keyword in keywords):
                return category

        return 'other'

    # ═══════════════════════════════════════════════════════════════════════
    # FEATURE AGGREGATION & EXPORT
    # ═══════════════════════════════════════════════════════════════════════

    def get_stats(self) -> Dict[str, Any]:
        """
        Report feature-logging counters.

        Returns:
            Dict with the total and unconsolidated ``ml_features`` row counts
            (the latter also reported as ``ready_for_export``) plus the sizes
            of the in-memory action buffer and host history. An empty dict
            when a query fails (the error is logged).
        """
        def row_count(sql):
            return self.db.query(sql)[0]['cnt']

        try:
            logged_total = row_count("SELECT COUNT(*) as cnt FROM ml_features")
            pending = row_count(
                "SELECT COUNT(*) as cnt FROM ml_features WHERE consolidated=0"
            )

            stats = {}
            stats['total_features_logged'] = logged_total
            stats['unconsolidated_count'] = pending
            stats['ready_for_export'] = pending
            stats['recent_actions_buffer'] = len(self.recent_actions)
            stats['hosts_tracked'] = len(self.host_history)
            return stats
        except Exception as e:
            logger.error(f"Error getting feature stats: {e}")
            return {}


# ═══════════════════════════════════════════════════════════════════════════
# END OF FILE
# ═══════════════════════════════════════════════════════════════════════════