#!/usr/bin/env python3
"""Simplified anomaly-detection training for MikroTik logs (v01)."""

import argparse
import logging
import os
import sys
import time
import traceback
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sqlalchemy import create_engine
from sqlalchemy.sql import text

warnings.filterwarnings('ignore')

# Simplified logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('analysis_debug.log')
    ]
)

# Database configuration
DB_USER = os.environ.get('DB_USER', 'root')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'Hdgtejskjjc0-')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_NAME = os.environ.get('DB_DATABASE', 'LOG_MIKROTIK')

# Directory for the models
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models')
try:
    os.makedirs(MODEL_DIR, exist_ok=True)
except Exception as e:
    logging.error(f"Error creating the models directory: {e}")
    MODEL_DIR = '.'

# Model paths
IF_MODEL_PATH = os.path.join(MODEL_DIR, 'isolation_forest.joblib')
LOF_MODEL_PATH = os.path.join(MODEL_DIR, 'lof.joblib')
SVM_MODEL_PATH = os.path.join(MODEL_DIR, 'svm.joblib')
ENSEMBLE_MODEL_PATH = os.path.join(MODEL_DIR, 'ensemble_weights.joblib')
PREPROCESSOR_PATH = os.path.join(MODEL_DIR, 'preprocessor.joblib')

# Simplified parameters
TRAINING_FREQUENCY_HOURS = 12
MAX_TRAINING_SAMPLES = 50000  # Drastically reduced
MIN_TRAINING_SAMPLES = 500
CONNECTION_TIMEOUT = 15  # Reduced timeout
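# Typical environment setup (illustrative shell commands; the values are
# placeholders, and the hardcoded defaults above are only fallbacks):
#
#   export DB_USER=root
#   export DB_PASSWORD='<secret>'
#   export DB_HOST=localhost
#   export DB_DATABASE=LOG_MIKROTIK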
# Colors for terminal output
class Colors:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    END = '\033[0m'


def log_phase(message):
    print(f"\n{Colors.BOLD}{Colors.GREEN}▶ {message}{Colors.END}\n")
    logging.info(f"PHASE: {message}")


def log_result(message):
    print(f"{Colors.BLUE}✓ {message}{Colors.END}")
    logging.info(f"RESULT: {message}")


def log_warning(message):
    print(f"{Colors.YELLOW}⚠ {message}{Colors.END}")
    logging.warning(message)


def log_error(message):
    print(f"{Colors.RED}✗ {message}{Colors.END}")
    logging.error(message)


def connect_to_database():
    """Simplified database connection."""
    try:
        log_phase("Connecting to the database")
        connection_string = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"

        # Minimal pool configuration
        engine = create_engine(
            connection_string,
            pool_size=2,
            max_overflow=3,
            pool_recycle=1800,
            pool_pre_ping=True,
            pool_timeout=CONNECTION_TIMEOUT,
            echo=False,
            connect_args={
                'connection_timeout': CONNECTION_TIMEOUT,
                'autocommit': True
            }
        )

        # Quick connection test
        with engine.connect() as conn:
            result = conn.execute(text("SELECT 1")).fetchone()
            if result[0] == 1:
                log_result("Database connected")
                return engine
        return None
    except Exception as e:
        log_error(f"Connection error: {e}")
        return None


def extract_data_simple(engine, max_records=50000):
    """Data extraction with a deliberately simple query."""
    try:
        log_phase(f"Extracting data (max {max_records} records)")

        # VERY simple query: just the latest N records
        query = text("""
            SELECT ID, Data, Ora, Host, IndirizzoIP,
                   Messaggio1, Messaggio2, Messaggio3
            FROM Esterna
            ORDER BY ID DESC
            LIMIT :max_records
        """)

        log_result("Running simple query...")
        start_time = time.time()
        df = pd.read_sql(query, engine, params={"max_records": max_records})
        elapsed = time.time() - start_time

        log_result(f"Extracted {len(df)} records in {elapsed:.1f} seconds")
        return df
    except Exception as e:
        log_error(f"Extraction error: {e}")
        return pd.DataFrame()
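def extract_data_chunked(engine, max_records=50000, chunksize=10000):
    """Illustrative alternative, not called by main(): stream the same query
    in chunks via pandas' `chunksize` parameter to bound peak memory when the
    table is large. The chunk size here is an arbitrary example value."""
    query = text("""
        SELECT ID, Data, Ora, Host, IndirizzoIP,
               Messaggio1, Messaggio2, Messaggio3
        FROM Esterna
        ORDER BY ID DESC
        LIMIT :max_records
    """)
    frames = list(pd.read_sql(query, engine,
                              params={"max_records": max_records},
                              chunksize=chunksize))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()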
def prepare_data_simple(df):
    """Ultra-simplified data preparation."""
    try:
        log_phase("Simplified data preparation")

        if df.empty:
            log_error("Empty DataFrame")
            return None, None

        # Minimal feature set: exactly 50 features for maximum speed
        feature_data = {}
        n_samples = len(df)

        # 1. Basic temporal features (10 features)
        if 'Data' in df.columns and 'Ora' in df.columns:
            try:
                df['Data'] = pd.to_datetime(df['Data'], errors='coerce')
                df['Ora'] = pd.to_timedelta(df['Ora'].astype(str), errors='coerce')
                df['Timestamp'] = df['Data'] + df['Ora']
                feature_data['hour'] = df['Timestamp'].dt.hour.fillna(0).values
                feature_data['day'] = df['Timestamp'].dt.dayofweek.fillna(0).values
                feature_data['minute'] = df['Timestamp'].dt.minute.fillna(0).values
            except Exception:
                feature_data['hour'] = np.zeros(n_samples)
                feature_data['day'] = np.zeros(n_samples)
                feature_data['minute'] = np.zeros(n_samples)
        else:
            feature_data['hour'] = np.zeros(n_samples)
            feature_data['day'] = np.zeros(n_samples)
            feature_data['minute'] = np.zeros(n_samples)

        # Add 7 simple temporal features (minimal-noise placeholders)
        for i in range(7):
            feature_data[f'time_{i}'] = np.random.random(n_samples) * 0.1

        # 2. Simple protocol features (15 features)
        if 'Messaggio1' in df.columns:
            proto_data = df['Messaggio1'].fillna('').astype(str)
            # Most common protocols
            protocols = ['TCP', 'UDP', 'ICMP', 'HTTP', 'SSH', 'FTP', 'DNS']
            for i, protocol in enumerate(protocols):
                feature_data[f'proto_{i}'] = proto_data.str.contains(
                    protocol, case=False).astype(int).values
            # Fill the remaining slots
            for i in range(len(protocols), 15):
                feature_data[f'proto_{i}'] = np.zeros(n_samples)
        else:
            for i in range(15):
                feature_data[f'proto_{i}'] = np.zeros(n_samples)

        # 3. Simple Host features (5 features)
        if 'Host' in df.columns:
            host_data = df['Host'].fillna('').astype(str)
            feature_data['host_fibra'] = host_data.str.contains(
                'FIBRA', case=False).astype(int).values
            feature_data['host_empty'] = df['Host'].isna().astype(int).values
            feature_data['host_len'] = host_data.str.len().values / 100.0  # Normalized
        else:
            feature_data['host_fibra'] = np.zeros(n_samples)
            feature_data['host_empty'] = np.zeros(n_samples)
            feature_data['host_len'] = np.zeros(n_samples)
        # Pad up to 5 host features (in both branches, so the count stays fixed)
        for i in range(3, 5):
            feature_data[f'host_{i}'] = np.zeros(n_samples)

        # 4. Simple IP features (10 features)
        if 'Messaggio2' in df.columns:
            ip_data = df['Messaggio2'].str.split(':').str[0].fillna('unknown').astype(str)
            # Simple hash buckets per IP
            for i in range(10):
                feature_data[f'ip_{i}'] = (
                    pd.util.hash_array(ip_data.values) % (2 ** (i + 3))
                ) / (2 ** (i + 3))
        else:
            for i in range(10):
                feature_data[f'ip_{i}'] = np.zeros(n_samples)

        # 5. Simple ID features (10 features)
        if 'ID' in df.columns:
            id_values = df['ID'].fillna(0).values
            id_normalized = (id_values - id_values.min()) / (
                id_values.max() - id_values.min() + 1)
            for i in range(10):
                feature_data[f'id_{i}'] = np.roll(id_normalized, i) * (0.9 ** i)
        else:
            for i in range(10):
                feature_data[f'id_{i}'] = np.zeros(n_samples)

        # Check that we have exactly 50 features
        total_features = len(feature_data)
        if total_features != 50:
            log_warning(f"Feature count: {total_features}, adjusting to 50")
            if total_features < 50:
                for i in range(total_features, 50):
                    feature_data[f'extra_{i}'] = np.zeros(n_samples)
            else:
                # Drop the excess features
                keys_to_remove = list(feature_data.keys())[50:]
                for key in keys_to_remove:
                    del feature_data[key]

        # Build the numpy array
        feature_names = sorted(feature_data.keys())
        X = np.column_stack([feature_data[name] for name in feature_names])

        # Simple preprocessor
        preprocessor = {
            'feature_columns': feature_names,
            'n_features': len(feature_names)
        }

        # Save the preprocessor
        try:
            dump(preprocessor, PREPROCESSOR_PATH)
            log_result(f"Preprocessor saved: {X.shape[1]} features")
        except Exception as e:
            log_warning(f"Error saving preprocessor: {e}")

        log_result(f"Data prepared: {X.shape[0]} samples, {X.shape[1]} features")
        return X, preprocessor
    except Exception as e:
        log_error(f"Data preparation error: {e}")
        return None, None
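def load_preprocessor(path=PREPROCESSOR_PATH):
    """Illustrative helper, not called by main(): reload the saved feature
    metadata so a downstream scorer can verify it rebuilds the same 50
    columns in the same sorted order that prepare_data_simple() produced."""
    preprocessor = load(path)
    expected = preprocessor['feature_columns']
    assert len(expected) == preprocessor['n_features']
    assert expected == sorted(expected)  # columns are ordered by sorted name
    return preprocessor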
def train_models_simple(X):
    """Simplified, fast model training."""
    try:
        log_phase("Simplified model training")

        if X.shape[0] < MIN_TRAINING_SAMPLES:
            log_error(f"Too few samples: {X.shape[0]} < {MIN_TRAINING_SAMPLES}")
            return None, None, None, None

        # Subsample if necessary
        if X.shape[0] > MAX_TRAINING_SAMPLES:
            log_warning(f"Sampling from {X.shape[0]} down to {MAX_TRAINING_SAMPLES}")
            indices = np.random.choice(X.shape[0], MAX_TRAINING_SAMPLES, replace=False)
            X_train = X[indices]
        else:
            X_train = X

        log_result(f"Training on {X_train.shape[0]} samples")

        # 1. Fast Isolation Forest
        log_result("Training Isolation Forest...")
        if_model = IsolationForest(
            n_estimators=30,  # Heavily reduced
            contamination=0.05,
            random_state=42,
            n_jobs=2,  # Limited to avoid overload
            max_samples=min(500, X_train.shape[0]),
            max_features=0.7
        )
        if_model.fit(X_train)
        log_result("✓ Isolation Forest OK")

        # 2. Fast LOF (very small sample)
        log_result("Training LOF...")
        lof_sample_size = min(5000, X_train.shape[0])
        if X_train.shape[0] > lof_sample_size:
            lof_indices = np.random.choice(X_train.shape[0], lof_sample_size, replace=False)
            lof_sample = X_train[lof_indices]
        else:
            lof_sample = X_train

        lof_model = LocalOutlierFactor(
            n_neighbors=max(1, min(5, lof_sample.shape[0] // 50)),  # guard against 0
            contamination=0.05,
            novelty=True,
            n_jobs=2
        )
        lof_model.fit(lof_sample)
        log_result("✓ LOF OK")

        # 3. Fast SVM (tiny sample)
        log_result("Training SVM...")
        svm_sample_size = min(2000, X_train.shape[0])
        if X_train.shape[0] > svm_sample_size:
            svm_indices = np.random.choice(X_train.shape[0], svm_sample_size, replace=False)
            svm_sample = X_train[svm_indices]
        else:
            svm_sample = X_train

        svm_model = OneClassSVM(
            kernel='rbf',
            gamma='scale',
            nu=0.05
        )
        svm_model.fit(svm_sample)
        log_result("✓ SVM OK")

        # 4. Save the models
        log_result("Saving models...")
        try:
            dump(if_model, IF_MODEL_PATH)
            dump(lof_model, LOF_MODEL_PATH)
            dump(svm_model, SVM_MODEL_PATH)

            ensemble_weights = {
                'isolation_forest': 0.70,  # Highest weight for IF
                'lof': 0.20,
                'svm': 0.10
            }
            dump(ensemble_weights, ENSEMBLE_MODEL_PATH)
            log_result("✓ Models saved")
        except Exception as e:
            log_error(f"Save error: {e}")
            return None, None, None, None

        return if_model, lof_model, svm_model, ensemble_weights
    except Exception as e:
        log_error(f"Training error: {e}")
        return None, None, None, None


def save_model_timestamp():
    """Save the training timestamp (simplified)."""
    try:
        with open(os.path.join(MODEL_DIR, 'last_training.txt'), 'w') as f:
            f.write(datetime.now().isoformat())
        log_result("Timestamp saved")
        return True
    except Exception as e:
        log_warning(f"Error saving timestamp: {e}")
        return False


def needs_training(force_training=False):
    """Check whether training is needed (simplified)."""
    if force_training:
        log_result("Retraining forced")
        return True

    try:
        timestamp_file = os.path.join(MODEL_DIR, 'last_training.txt')
        if not os.path.exists(timestamp_file):
            log_result("No previous training")
            return True

        with open(timestamp_file, 'r') as f:
            last_trained_str = f.read().strip()

        last_trained = datetime.fromisoformat(last_trained_str)
        now = datetime.now()
        hours_diff = (now - last_trained).total_seconds() / 3600

        if hours_diff >= TRAINING_FREQUENCY_HOURS:
            log_result(f"Last training: {hours_diff:.1f} hours ago - training needed")
            return True
        else:
            log_result(f"Last training: {hours_diff:.1f} hours ago - not needed")
            return False
    except Exception as e:
        log_warning(f"Check error: {e}")
        return True
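def score_with_ensemble(X, if_model, lof_model, svm_model, weights):
    """Illustrative sketch, not called by main(): combine the three detectors
    using the saved ensemble weights. All three scikit-learn models expose
    decision_function() (LOF does because it was fitted with novelty=True),
    and higher scores mean "more normal". The three score ranges are not on
    a common scale, so a production scorer would likely standardize each
    model's output before taking the weighted sum."""
    return (weights['isolation_forest'] * if_model.decision_function(X)
            + weights['lof'] * lof_model.decision_function(X)
            + weights['svm'] * svm_model.decision_function(X))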
def test_database_connection():
    """Simple connection test."""
    try:
        engine = connect_to_database()
        if not engine:
            return False

        with engine.connect() as conn:
            result = conn.execute(text("SELECT 1")).fetchone()
            if result and result[0] == 1:
                log_result("Database test OK")

                # Quick check on the Esterna table
                try:
                    conn.execute(text("SELECT COUNT(*) FROM Esterna")).fetchone()
                    log_result("Esterna table accessible")
                except Exception:
                    log_error("Esterna table not accessible")
                    return False

                return True
        return False
    except Exception as e:
        log_error(f"Test error: {e}")
        return False


def main():
    """Simplified main entry point."""
    parser = argparse.ArgumentParser(description='Simplified training v01')
    parser.add_argument('--force-training', action='store_true', help='Force retraining')
    parser.add_argument('--test', action='store_true', help='Connection test')
    parser.add_argument('--max-records', type=int, default=50000, help='Max records (default: 50k)')
    parser.add_argument('--debug', action='store_true', help='Debug logging')
    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    log_phase("SIMPLIFIED training system v01")
    log_result(f"Config: max {args.max_records} records")

    start_time = time.time()

    # Quick test mode
    if args.test:
        log_phase("Connection test")
        if test_database_connection():
            log_result("Test PASSED")
            sys.exit(0)
        else:
            log_error("Test FAILED")
            sys.exit(1)

    # Connection test
    if not test_database_connection():
        log_error("Database unreachable")
        sys.exit(1)

    # Check whether training is needed
    if not needs_training(args.force_training):
        log_result("Training not needed")
        sys.exit(0)

    try:
        # Connection
        engine = connect_to_database()
        if not engine:
            log_error("Connection failed")
            sys.exit(1)

        # Simple extraction
        df = extract_data_simple(engine, args.max_records)
        if df.empty:
            log_error("No data extracted")
            sys.exit(1)

        # Simple preparation
        X, preprocessor = prepare_data_simple(df)
        if X is None:
            log_error("Data preparation failed")
            sys.exit(1)

        # Simple training
        models = train_models_simple(X)
        if all(m is not None for m in models):
            log_phase("SUCCESS!")

            # Save timestamp
            save_model_timestamp()

            # Final statistics
            elapsed = time.time() - start_time
            log_result(f"Total time: {elapsed:.1f} seconds")
            log_result(f"Samples: {X.shape[0]}")
            log_result(f"Features: {X.shape[1]}")
        else:
            log_error("Training failed")
            sys.exit(1)
    except Exception as e:
        log_error(f"General error: {e}")
        logging.error(traceback.format_exc())
        sys.exit(1)


if __name__ == "__main__":
    main()
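# Typical invocations (the script name is a placeholder):
#
#   python3 train_simple.py --test                # connectivity check only
#   python3 train_simple.py --force-training      # retrain regardless of the timestamp
#   python3 train_simple.py --max-records 10000   # cap the extraction size
#
# To match TRAINING_FREQUENCY_HOURS, a 12-hour schedule could look like this
# (illustrative crontab entry):
#
#   0 */12 * * * /usr/bin/python3 /path/to/train_simple.py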