diff --git a/.replit b/.replit index 3dc4618..7950bb9 100644 --- a/.replit +++ b/.replit @@ -18,6 +18,10 @@ externalPort = 80 localPort = 41303 externalPort = 3002 +[[ports]] +localPort = 42815 +externalPort = 3001 + [[ports]] localPort = 43471 externalPort = 3003 diff --git a/attached_assets/Pasted--deployment-train-hybrid-production-sh--1764007807845_1764007807845.txt b/attached_assets/Pasted--deployment-train-hybrid-production-sh--1764007807845_1764007807845.txt new file mode 100644 index 0000000..d6671a2 --- /dev/null +++ b/attached_assets/Pasted--deployment-train-hybrid-production-sh--1764007807845_1764007807845.txt @@ -0,0 +1,54 @@ + ./deployment/train_hybrid_production.sh +======================================================================= + TRAINING HYBRID ML DETECTOR - DATI REALI +======================================================================= + +📂 Caricamento credenziali database da .env... +✅ Credenziali caricate: + Host: localhost + Port: 5432 + Database: ids_database + User: ids_user + Password: ****** (nascosta) + +🎯 Parametri training: + Periodo: ultimi 7 giorni + Max records: 1000000 + +🐍 Python: /opt/ids/python_ml/venv/bin/python + +📊 Verifica dati disponibili nel database... + primo_log | ultimo_log | periodo_totale | totale_records +---------------------+---------------------+----------------+---------------- + 2025-11-22 10:03:21 | 2025-11-24 17:58:17 | 2 giorni | 234,316,667 +(1 row) + + +🚀 Avvio training... + +======================================================================= +[WARNING] Extended Isolation Forest not available, using standard IF + +====================================================================== + IDS HYBRID ML TRAINING - UNSUPERVISED MODE +====================================================================== +[TRAIN] Loading last 7 days of real traffic from database... 
+ +❌ Error: column "dest_ip" does not exist +LINE 5: dest_ip, + ^ + +Traceback (most recent call last): + File "/opt/ids/python_ml/train_hybrid.py", line 365, in main + train_unsupervised(args) + File "/opt/ids/python_ml/train_hybrid.py", line 91, in train_unsupervised + logs_df = train_on_real_traffic(db_config, days=args.days) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/ids/python_ml/train_hybrid.py", line 50, in train_on_real_traffic + cursor.execute(query, (days,)) + File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/psycopg2/extras.py", line 236, in execute + return super().execute(query, vars) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +psycopg2.errors.UndefinedColumn: column "dest_ip" does not exist +LINE 5: dest_ip, + ^ diff --git a/deployment/run_ml_training.sh b/deployment/run_ml_training.sh new file mode 100644 index 0000000..8a0b133 --- /dev/null +++ b/deployment/run_ml_training.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# +# ML Training Wrapper - Esecuzione Automatica via Systemd +# Carica credenziali da .env in modo sicuro +# + +set -e + +IDS_ROOT="/opt/ids" +ENV_FILE="$IDS_ROOT/.env" +PYTHON_ML_DIR="$IDS_ROOT/python_ml" +VENV_PYTHON="$PYTHON_ML_DIR/venv/bin/python" +LOG_DIR="/var/log/ids" + +# Crea directory log se non esiste +mkdir -p "$LOG_DIR" + +# File log dedicato +LOG_FILE="$LOG_DIR/ml-training.log" + +# Funzione logging +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" +} + +log "=========================================" +log "ML Training - Avvio automatico" +log "=========================================" + +# Verifica .env +if [ ! -f "$ENV_FILE" ]; then + log "ERROR: File .env non trovato: $ENV_FILE" + exit 1 +fi + +# Carica variabili ambiente +log "Caricamento credenziali database..." 
+set -a +source "$ENV_FILE" +set +a + +# Verifica credenziali +if [ -z "$PGPASSWORD" ]; then + log "ERROR: PGPASSWORD non trovata in .env" + exit 1 +fi + +DB_HOST="${PGHOST:-localhost}" +DB_PORT="${PGPORT:-5432}" +DB_NAME="${PGDATABASE:-ids}" +DB_USER="${PGUSER:-postgres}" + +log "Database: $DB_USER@$DB_HOST:$DB_PORT/$DB_NAME" + +# Verifica venv +if [ ! -f "$VENV_PYTHON" ]; then + log "ERROR: Venv Python non trovato: $VENV_PYTHON" + exit 1 +fi + +# Parametri training +DAYS="${ML_TRAINING_DAYS:-7}" # Default 7 giorni, configurabile via env var + +log "Training ultimi $DAYS giorni di traffico..." + +# Esegui training +cd "$PYTHON_ML_DIR" +"$VENV_PYTHON" train_hybrid.py --train --source database \ + --db-host "$DB_HOST" \ + --db-port "$DB_PORT" \ + --db-name "$DB_NAME" \ + --db-user "$DB_USER" \ + --db-password "$PGPASSWORD" \ + --days "$DAYS" 2>&1 | tee -a "$LOG_FILE" + +# Check exit code +if [ ${PIPESTATUS[0]} -eq 0 ]; then + log "=========================================" + log "βœ… Training completato con successo!" + log "=========================================" + log "Modelli salvati in: $PYTHON_ML_DIR/models/" + log "" + log "Il ML backend caricherΓ  automaticamente i nuovi modelli al prossimo riavvio." 
+ log "Per applicare immediatamente: sudo systemctl restart ids-ml-backend" + exit 0 +else + log "=========================================" + log "❌ ERRORE durante il training" + log "=========================================" + log "Controlla log completo: $LOG_FILE" + exit 1 +fi diff --git a/deployment/setup_ml_training_timer.sh b/deployment/setup_ml_training_timer.sh new file mode 100644 index 0000000..d41b778 --- /dev/null +++ b/deployment/setup_ml_training_timer.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# Setup ML Training Systemd Timer +# Configura training automatico settimanale del modello ML hybrid +# + +set -e + +echo "================================================================" +echo " SETUP ML TRAINING TIMER - Training Automatico Settimanale" +echo "================================================================" +echo "" + +# Verifica root +if [ "$EUID" -ne 0 ]; then + echo "❌ ERRORE: Questo script deve essere eseguito come root" + echo " Usa: sudo $0" + exit 1 +fi + +IDS_ROOT="/opt/ids" +SYSTEMD_DIR="/etc/systemd/system" + +# Verifica directory IDS +if [ ! -d "$IDS_ROOT" ]; then + echo "❌ ERRORE: Directory IDS non trovata: $IDS_ROOT" + exit 1 +fi + +echo "πŸ“ Directory IDS: $IDS_ROOT" +echo "" + +# 1. Copia systemd files +echo "πŸ“‹ Step 1: Installazione systemd units..." + +cp "$IDS_ROOT/deployment/systemd/ids-ml-training.service" "$SYSTEMD_DIR/" +cp "$IDS_ROOT/deployment/systemd/ids-ml-training.timer" "$SYSTEMD_DIR/" + +echo " βœ… Service copiato: $SYSTEMD_DIR/ids-ml-training.service" +echo " βœ… Timer copiato: $SYSTEMD_DIR/ids-ml-training.timer" +echo "" + +# 2. Rendi eseguibile script +echo "πŸ”§ Step 2: Permessi script..." +chmod +x "$IDS_ROOT/deployment/run_ml_training.sh" +echo " βœ… Script eseguibile: $IDS_ROOT/deployment/run_ml_training.sh" +echo "" + +# 3. Reload systemd +echo "πŸ”„ Step 3: Reload systemd daemon..." +systemctl daemon-reload +echo " βœ… Daemon reloaded" +echo "" + +# 4. 
Enable e start timer +echo "πŸš€ Step 4: Attivazione timer..." +systemctl enable ids-ml-training.timer +systemctl start ids-ml-training.timer +echo " βœ… Timer attivato e avviato" +echo "" + +# 5. Verifica status +echo "πŸ“Š Step 5: Verifica configurazione..." +echo "" +echo "Timer status:" +systemctl status ids-ml-training.timer --no-pager +echo "" +echo "Prossima esecuzione:" +systemctl list-timers ids-ml-training.timer --no-pager +echo "" + +echo "================================================================" +echo "βœ… SETUP COMPLETATO!" +echo "================================================================" +echo "" +echo "πŸ“… Schedule: Ogni LunedΓ¬ alle 03:00 AM" +echo "πŸ“ Log: /var/log/ids/ml-training.log" +echo "" +echo "πŸ” COMANDI UTILI:" +echo "" +echo " # Verifica timer attivo" +echo " systemctl status ids-ml-training.timer" +echo "" +echo " # Vedi prossima esecuzione" +echo " systemctl list-timers ids-ml-training.timer" +echo "" +echo " # Esegui training manualmente ORA" +echo " sudo systemctl start ids-ml-training.service" +echo "" +echo " # Vedi log training" +echo " journalctl -u ids-ml-training.service -f" +echo " tail -f /var/log/ids/ml-training.log" +echo "" +echo " # Disabilita training automatico" +echo " sudo systemctl stop ids-ml-training.timer" +echo " sudo systemctl disable ids-ml-training.timer" +echo "" +echo "================================================================" diff --git a/deployment/systemd/ids-ml-training.service b/deployment/systemd/ids-ml-training.service new file mode 100644 index 0000000..b146e78 --- /dev/null +++ b/deployment/systemd/ids-ml-training.service @@ -0,0 +1,30 @@ +[Unit] +Description=IDS ML Hybrid Detector Training +Documentation=https://github.com/your-repo/ids +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=oneshot +User=root +WorkingDirectory=/opt/ids/python_ml + +# Carica environment file per credenziali database +EnvironmentFile=/opt/ids/.env + +# 
Esegui training +ExecStart=/opt/ids/deployment/run_ml_training.sh + +# Timeout generoso (training può richiedere fino a 30 min) +TimeoutStartSec=1800 + +# Log +StandardOutput=journal +StandardError=journal +SyslogIdentifier=ids-ml-training + +# Restart policy +Restart=no + +[Install] +WantedBy=multi-user.target diff --git a/deployment/systemd/ids-ml-training.timer b/deployment/systemd/ids-ml-training.timer new file mode 100644 index 0000000..74b080a --- /dev/null +++ b/deployment/systemd/ids-ml-training.timer @@ -0,0 +1,17 @@ +[Unit] +Description=IDS ML Training - Weekly Retraining +Documentation=https://github.com/your-repo/ids +Requires=ids-ml-training.service + +[Timer] +# Esecuzione settimanale: ogni Lunedì alle 03:00 AM +OnCalendar=Mon *-*-* 03:00:00 + +# Persistenza: se il server era spento, esegui al prossimo boot +Persistent=true + +# Accuratezza: 5 minuti di tolleranza +AccuracySec=5min + +[Install] +WantedBy=timers.target diff --git a/python_ml/ml_hybrid_detector.py b/python_ml/ml_hybrid_detector.py index 87be785..5cc42f5 100644 --- a/python_ml/ml_hybrid_detector.py +++ b/python_ml/ml_hybrid_detector.py @@ -102,8 +102,19 @@ class MLHybridDetector: group = group.sort_values('timestamp') # Volume features (5) - total_packets = group['packets'].sum() if 'packets' in group.columns else len(group) - total_bytes = group['bytes'].sum() if 'bytes' in group.columns else 0 + # Handle different database schemas + if 'packets' in group.columns: + total_packets = group['packets'].sum() + else: + total_packets = len(group) # Each row = 1 packet + + if 'bytes' in group.columns: + total_bytes = group['bytes'].sum() + elif 'packet_length' in group.columns: + total_bytes = group['packet_length'].sum() # Use packet_length from MikroTik logs + else: + total_bytes = 0 + conn_count = len(group) avg_packet_size = total_bytes / max(total_packets, 1) bytes_per_second = total_bytes / max((group['timestamp'].max() - group['timestamp'].min()).total_seconds(), 1) @@ -151,6 +162,9 
@@ class MLHybridDetector: if 'bytes' in group.columns and 'packets' in group.columns: group['packet_size'] = group['bytes'] / group['packets'].replace(0, 1) packet_size_variance = group['packet_size'].std() + elif 'packet_length' in group.columns: + # Use packet_length directly for variance + packet_size_variance = group['packet_length'].std() else: packet_size_variance = 0 diff --git a/python_ml/train_hybrid.py b/python_ml/train_hybrid.py index 912fca6..78fc009 100644 --- a/python_ml/train_hybrid.py +++ b/python_ml/train_hybrid.py @@ -35,11 +35,10 @@ def train_on_real_traffic(db_config: dict, days: int = 7) -> pd.DataFrame: SELECT timestamp, source_ip, - dest_ip, - dest_port, + destination_ip as dest_ip, + destination_port as dest_port, protocol, - packets, - bytes, + packet_length, action FROM network_logs WHERE timestamp > NOW() - INTERVAL '%s days' diff --git a/replit.md b/replit.md index 431bc74..3e8179d 100644 --- a/replit.md +++ b/replit.md @@ -123,4 +123,32 @@ The IDS employs a React-based frontend for real-time monitoring, detection visua - `requirements.txt`: Rimosso `eif==2.0.2` e `Cython==3.0.5` (non più necessari) - `deployment/install_ml_deps.sh`: Semplificato da 4 a 2 step, nessuna compilazione - `deployment/CHECKLIST_ML_HYBRID.md`: Aggiornato con nuove istruzioni semplificate + +### 🔄 Database Schema Adaptation & Auto-Training (24 Nov 2025 - 23:30) +- **Database Schema Fix**: Adattato ML detector allo schema reale `network_logs` + - Query SQL corretta: `destination_ip` (non `dest_ip`), `destination_port` (non `dest_port`) + - Feature extraction: supporto `packet_length` invece di `packets`/`bytes` separati + - Backward compatible: funziona sia con schema MikroTik che dataset CICIDS2017 +- **Training Automatico Settimanale**: + - Script wrapper: `deployment/run_ml_training.sh` (carica credenziali da .env) + - Systemd service: `ids-ml-training.service` + - Systemd timer: `ids-ml-training.timer` (ogni Lunedì 03:00 AM) + - Setup automatico: 
`./deployment/setup_ml_training_timer.sh` + - Log persistenti: `/var/log/ids/ml-training.log` +- **Workflow Completo**: + 1. Timer systemd esegue training settimanale automatico + 2. Script carica ultimi 7 giorni di traffico dal database (234M+ records) + 3. Training Hybrid ML (IF + Ensemble + Feature Selection) + 4. Modelli salvati in `python_ml/models/` + 5. ML backend li carica automaticamente al prossimo riavvio +- **Files creati**: + - `deployment/run_ml_training.sh` - Wrapper sicuro per training + - `deployment/train_hybrid_production.sh` - Script training manuale completo + - `deployment/systemd/ids-ml-training.service` - Service systemd + - `deployment/systemd/ids-ml-training.timer` - Timer settimanale + - `deployment/setup_ml_training_timer.sh` - Setup automatico +- **Files modificati**: + - `python_ml/train_hybrid.py` - Query SQL adattata allo schema DB reale + - `python_ml/ml_hybrid_detector.py` - Supporto `packet_length`, backward compatible + - `python_ml/dataset_loader.py` - Fix timestamp mancante in dataset sintetico - **Impatto**: Sistema userà automaticamente sklearn IF tramite fallback, tutti gli 8 checkpoint fail-fast funzionano identicamente \ No newline at end of file