#!/usr/bin/env python3
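"""Simplified training pipeline for MikroTik log anomaly detection.

Pulls recent rows from the `Esterna` table, builds a fixed set of 50 numeric
features, and trains an Isolation Forest / LOF / One-Class SVM ensemble,
saving the fitted models and ensemble weights under ./models.

Example invocations (the file name `train_simple.py` is an assumption, not
part of the original source):

    python train_simple.py --test
    python train_simple.py --force-training --max-records 20000
"""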

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import text
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from joblib import dump
import logging
import os
import time
from datetime import datetime
import numpy as np
import argparse
import sys
import traceback
import warnings

warnings.filterwarnings('ignore')

# Simplified logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('analysis_debug.log')
    ]
)

# Database configuration
DB_USER = os.environ.get('DB_USER', 'root')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'Hdgtejskjjc0-')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_NAME = os.environ.get('DB_DATABASE', 'LOG_MIKROTIK')
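# The defaults above can be overridden via environment variables, e.g.
# (shell syntax; values are placeholders, not from the original source):
#   export DB_HOST=db.example.local DB_USER=loguser DB_PASSWORD='...'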

# Directory for the models
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models')
try:
    os.makedirs(MODEL_DIR, exist_ok=True)
except Exception as e:
    logging.error(f"Error creating the models directory: {e}")
    MODEL_DIR = '.'

# Model paths
IF_MODEL_PATH = os.path.join(MODEL_DIR, 'isolation_forest.joblib')
LOF_MODEL_PATH = os.path.join(MODEL_DIR, 'lof.joblib')
SVM_MODEL_PATH = os.path.join(MODEL_DIR, 'svm.joblib')
ENSEMBLE_MODEL_PATH = os.path.join(MODEL_DIR, 'ensemble_weights.joblib')
PREPROCESSOR_PATH = os.path.join(MODEL_DIR, 'preprocessor.joblib')

# Simplified parameters
TRAINING_FREQUENCY_HOURS = 12
MAX_TRAINING_SAMPLES = 50000  # Drastically reduced
MIN_TRAINING_SAMPLES = 500
CONNECTION_TIMEOUT = 15  # Reduced timeout

# Colors for terminal output
class Colors:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    END = '\033[0m'


def log_phase(message):
    print(f"\n{Colors.BOLD}{Colors.GREEN}▶ {message}{Colors.END}\n")
    logging.info(f"PHASE: {message}")


def log_result(message):
    print(f"{Colors.BLUE}✓ {message}{Colors.END}")
    logging.info(f"RESULT: {message}")


def log_warning(message):
    print(f"{Colors.YELLOW}⚠ {message}{Colors.END}")
    logging.warning(message)


def log_error(message):
    print(f"{Colors.RED}✗ {message}{Colors.END}")
    logging.error(message)


def connect_to_database():
    """Simplified database connection"""
    try:
        log_phase("Connecting to the database")

        connection_string = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"

        # Minimal pool configuration
        engine = create_engine(
            connection_string,
            pool_size=2,
            max_overflow=3,
            pool_recycle=1800,
            pool_pre_ping=True,
            pool_timeout=CONNECTION_TIMEOUT,
            echo=False,
            connect_args={
                'connection_timeout': CONNECTION_TIMEOUT,
                'autocommit': True
            }
        )

        # Quick connection test
        with engine.connect() as conn:
            result = conn.execute(text("SELECT 1")).fetchone()
            if result[0] == 1:
                log_result("Database connected")
                return engine

        return None
    except Exception as e:
        log_error(f"Connection error: {e}")
        return None


def extract_data_simple(engine, max_records=50000):
    """Data extraction with a deliberately simple query"""
    try:
        log_phase(f"Extracting data (max {max_records} records)")

        # Very simple query - just the latest N records
        query = text("""
            SELECT ID, Data, Ora, Host, IndirizzoIP, Messaggio1, Messaggio2, Messaggio3
            FROM Esterna
            ORDER BY ID DESC
            LIMIT :max_records
        """)

        log_result("Running simple query...")

        # Time the query
        start_time = time.time()
        df = pd.read_sql(query, engine, params={"max_records": max_records})
        elapsed = time.time() - start_time

        log_result(f"Extracted {len(df)} records in {elapsed:.1f} seconds")
        return df

    except Exception as e:
        log_error(f"Extraction error: {e}")
        return pd.DataFrame()


def prepare_data_simple(df):
    """Ultra-simplified data preparation"""
    try:
        log_phase("Simplified data preparation")

        if df.empty:
            log_error("Empty DataFrame")
            return None, None

        # Minimal feature set - exactly 50 features for maximum speed
        feature_data = {}
        n_samples = len(df)

        # 1. Basic temporal features (10 features)
        if 'Data' in df.columns and 'Ora' in df.columns:
            try:
                df['Data'] = pd.to_datetime(df['Data'], errors='coerce')
                df['Ora'] = pd.to_timedelta(df['Ora'].astype(str), errors='coerce')
                df['Timestamp'] = df['Data'] + df['Ora']
                feature_data['hour'] = df['Timestamp'].dt.hour.fillna(0).values
                feature_data['day'] = df['Timestamp'].dt.dayofweek.fillna(0).values
                feature_data['minute'] = df['Timestamp'].dt.minute.fillna(0).values
            except Exception:
                feature_data['hour'] = np.zeros(n_samples)
                feature_data['day'] = np.zeros(n_samples)
                feature_data['minute'] = np.zeros(n_samples)
        else:
            feature_data['hour'] = np.zeros(n_samples)
            feature_data['day'] = np.zeros(n_samples)
            feature_data['minute'] = np.zeros(n_samples)

        # Add 7 simple temporal features
        for i in range(7):
            feature_data[f'time_{i}'] = np.random.random(n_samples) * 0.1  # Minimal noise (filler features)

        # 2. Simple protocol features (15 features)
        if 'Messaggio1' in df.columns:
            proto_data = df['Messaggio1'].fillna('').astype(str)
            # Most common protocols
            protocols = ['TCP', 'UDP', 'ICMP', 'HTTP', 'SSH', 'FTP', 'DNS']

            for i, protocol in enumerate(protocols):
                feature_data[f'proto_{i}'] = proto_data.str.contains(protocol, case=False).astype(int).values

            # Pad the remaining slots
            for i in range(len(protocols), 15):
                feature_data[f'proto_{i}'] = np.zeros(n_samples)
        else:
            for i in range(15):
                feature_data[f'proto_{i}'] = np.zeros(n_samples)

        # 3. Simple host features (5 features)
        if 'Host' in df.columns:
            host_data = df['Host'].fillna('').astype(str)
            feature_data['host_fibra'] = host_data.str.contains('FIBRA', case=False).astype(int).values
            feature_data['host_empty'] = df['Host'].isna().astype(int).values
            feature_data['host_len'] = host_data.str.len().values / 100.0  # Normalized
        else:
            feature_data['host_fibra'] = np.zeros(n_samples)
            feature_data['host_empty'] = np.zeros(n_samples)
            feature_data['host_len'] = np.zeros(n_samples)

        # Pad up to 5
        for i in range(3, 5):
            feature_data[f'host_{i}'] = np.zeros(n_samples)

        # 4. Simple IP features (10 features)
        if 'Messaggio2' in df.columns:
            ip_data = df['Messaggio2'].str.split(':').str[0].fillna('unknown').astype(str)
            # Simple hash buckets per IP
            for i in range(10):
                feature_data[f'ip_{i}'] = (pd.util.hash_array(ip_data.values) % (2**(i+3))) / (2**(i+3))
        else:
            for i in range(10):
                feature_data[f'ip_{i}'] = np.zeros(n_samples)

        # 5. Simple ID features (10 features)
        if 'ID' in df.columns:
            id_values = df['ID'].fillna(0).values
            id_normalized = (id_values - id_values.min()) / (id_values.max() - id_values.min() + 1)

            for i in range(10):
                feature_data[f'id_{i}'] = np.roll(id_normalized, i) * (0.9 ** i)
        else:
            for i in range(10):
                feature_data[f'id_{i}'] = np.zeros(n_samples)

        # Check that we have exactly 50 features
        total_features = len(feature_data)
        if total_features != 50:
            log_warning(f"Feature count: {total_features}, adjusting to 50")
            if total_features < 50:
                for i in range(total_features, 50):
                    feature_data[f'extra_{i}'] = np.zeros(n_samples)
            else:
                # Drop the excess features
                keys_to_remove = list(feature_data.keys())[50:]
                for key in keys_to_remove:
                    del feature_data[key]

        # Build the numpy array
        feature_names = sorted(feature_data.keys())
        X = np.column_stack([feature_data[name] for name in feature_names])

        # Simple preprocessor
        preprocessor = {
            'feature_columns': feature_names,
            'n_features': len(feature_names)
        }

        # Save the preprocessor
        try:
            dump(preprocessor, PREPROCESSOR_PATH)
            log_result(f"Preprocessor saved: {X.shape[1]} features")
        except Exception as e:
            log_warning(f"Error saving preprocessor: {e}")

        log_result(f"Data prepared: {X.shape[0]} samples, {X.shape[1]} features")
        return X, preprocessor

    except Exception as e:
        log_error(f"Data preparation error: {e}")
        return None, None
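
# Illustrative note (a scoring script is not part of this file): the saved
# "preprocessor" is only metadata, so a consumer would rebuild the same 50
# features with prepare_data_simple() and sanity-check the layout, roughly:
#   pre = joblib.load(PREPROCESSOR_PATH)
#   assert X_new.shape[1] == pre['n_features']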

def train_models_simple(X):
    """Simplified, fast training"""
    try:
        log_phase("Simplified model training")

        if X.shape[0] < MIN_TRAINING_SAMPLES:
            log_error(f"Too few samples: {X.shape[0]} < {MIN_TRAINING_SAMPLES}")
            return None, None, None, None

        # Subsample if necessary
        if X.shape[0] > MAX_TRAINING_SAMPLES:
            log_warning(f"Subsampling from {X.shape[0]} to {MAX_TRAINING_SAMPLES}")
            indices = np.random.choice(X.shape[0], MAX_TRAINING_SAMPLES, replace=False)
            X_train = X[indices]
        else:
            X_train = X

        log_result(f"Training on {X_train.shape[0]} samples")

        # 1. Fast Isolation Forest
        log_result("Training Isolation Forest...")
        if_model = IsolationForest(
            n_estimators=30,  # Heavily reduced
            contamination=0.05,
            random_state=42,
            n_jobs=2,  # Limited to avoid overload
            max_samples=min(500, X_train.shape[0]),
            max_features=0.7
        )
        if_model.fit(X_train)
        log_result("✓ Isolation Forest OK")

        # 2. Fast LOF (very small sample)
        log_result("Training LOF...")
        lof_sample_size = min(5000, X_train.shape[0])
        if X_train.shape[0] > lof_sample_size:
            lof_indices = np.random.choice(X_train.shape[0], lof_sample_size, replace=False)
            lof_sample = X_train[lof_indices]
        else:
            lof_sample = X_train

        lof_model = LocalOutlierFactor(
            n_neighbors=min(5, lof_sample.shape[0] // 50),
            contamination=0.05,
            novelty=True,
            n_jobs=2
        )
        lof_model.fit(lof_sample)
        log_result("✓ LOF OK")

        # 3. Fast SVM (tiny sample)
        log_result("Training SVM...")
        svm_sample_size = min(2000, X_train.shape[0])
        if X_train.shape[0] > svm_sample_size:
            svm_indices = np.random.choice(X_train.shape[0], svm_sample_size, replace=False)
            svm_sample = X_train[svm_indices]
        else:
            svm_sample = X_train

        svm_model = OneClassSVM(
            kernel='rbf',
            gamma='scale',
            nu=0.05
        )
        svm_model.fit(svm_sample)
        log_result("✓ SVM OK")

        # 4. Persistence
        log_result("Saving models...")
        try:
            dump(if_model, IF_MODEL_PATH)
            dump(lof_model, LOF_MODEL_PATH)
            dump(svm_model, SVM_MODEL_PATH)

            ensemble_weights = {
                'isolation_forest': 0.70,  # Highest weight for IF
                'lof': 0.20,
                'svm': 0.10
            }
            dump(ensemble_weights, ENSEMBLE_MODEL_PATH)

            log_result("✓ Models saved")

        except Exception as e:
            log_error(f"Save error: {e}")
            return None, None, None, None

        return if_model, lof_model, svm_model, ensemble_weights

    except Exception as e:
        log_error(f"Training error: {e}")
        return None, None, None, None
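

# Illustrative sketch (not invoked by this training script): how a separate
# scoring script might combine the three saved detectors with the ensemble
# weights persisted above. The function name is an assumption. All three
# fitted models expose decision_function (LOF needs novelty=True, as
# configured here), and lower scores mean "more anomalous" for each; a real
# scorer would likely rescale the three score ranges before mixing them.
def ensemble_score_sketch(X, if_model, lof_model, svm_model, weights):
    """Weighted sum of the detectors' decision scores (illustrative only)."""
    return (weights['isolation_forest'] * if_model.decision_function(X)
            + weights['lof'] * lof_model.decision_function(X)
            + weights['svm'] * svm_model.decision_function(X))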


def save_model_timestamp():
    """Save the training timestamp (simplified)"""
    try:
        with open(os.path.join(MODEL_DIR, 'last_training.txt'), 'w') as f:
            f.write(datetime.now().isoformat())
        log_result("Timestamp saved")
        return True
    except Exception as e:
        log_warning(f"Error saving timestamp: {e}")
        return False


def needs_training(force_training=False):
    """Check whether training is needed (simplified)"""
    if force_training:
        log_result("Retraining forced")
        return True

    try:
        timestamp_file = os.path.join(MODEL_DIR, 'last_training.txt')
        if not os.path.exists(timestamp_file):
            log_result("No previous training")
            return True

        with open(timestamp_file, 'r') as f:
            last_trained_str = f.read().strip()

        last_trained = datetime.fromisoformat(last_trained_str)
        now = datetime.now()
        hours_diff = (now - last_trained).total_seconds() / 3600

        if hours_diff >= TRAINING_FREQUENCY_HOURS:
            log_result(f"Last training: {hours_diff:.1f} hours ago - retraining needed")
            return True
        else:
            log_result(f"Last training: {hours_diff:.1f} hours ago - not needed")
            return False

    except Exception as e:
        log_warning(f"Check error: {e}")
        return True


def test_database_connection():
    """Simple connection test"""
    try:
        engine = connect_to_database()
        if not engine:
            return False

        with engine.connect() as conn:
            result = conn.execute(text("SELECT 1")).fetchone()
            if result and result[0] == 1:
                log_result("Database test OK")

                # Quick accessibility check on the Esterna table
                try:
                    conn.execute(text("SELECT 1 FROM Esterna LIMIT 1")).fetchone()
                    log_result("Esterna table accessible")
                except Exception:
                    log_error("Esterna table not accessible")
                    return False

                return True
        return False
    except Exception as e:
        log_error(f"Test error: {e}")
        return False


def main():
    """Simplified main function"""
    parser = argparse.ArgumentParser(description='Simplified training v01')
    parser.add_argument('--force-training', action='store_true', help='Force retraining')
    parser.add_argument('--test', action='store_true', help='Connection test')
    parser.add_argument('--max-records', type=int, default=50000, help='Max records (default: 50k)')
    parser.add_argument('--debug', action='store_true', help='Debug logging')

    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    log_phase("SIMPLIFIED training system v01")
    log_result(f"Config: max {args.max_records} records")

    start_time = time.time()

    # Quick test mode
    if args.test:
        log_phase("Connection test")
        if test_database_connection():
            log_result("Test PASSED")
            sys.exit(0)
        else:
            log_error("Test FAILED")
            sys.exit(1)

    # Connection test
    if not test_database_connection():
        log_error("Database unreachable")
        sys.exit(1)

    # Check whether training is needed
    if not needs_training(args.force_training):
        log_result("Training not needed")
        sys.exit(0)

    try:
        # Connect
        engine = connect_to_database()
        if not engine:
            log_error("Connection failed")
            sys.exit(1)

        # Simple extraction
        df = extract_data_simple(engine, args.max_records)

        if df.empty:
            log_error("No data extracted")
            sys.exit(1)

        # Simple preparation
        X, preprocessor = prepare_data_simple(df)

        if X is None:
            log_error("Data preparation failed")
            sys.exit(1)

        # Simple training
        models = train_models_simple(X)

        if all(m is not None for m in models):
            log_phase("SUCCESS!")

            # Save timestamp
            save_model_timestamp()

            # Final statistics
            elapsed = time.time() - start_time

            log_result(f"Total time: {elapsed:.1f} seconds")
            log_result(f"Samples: {X.shape[0]}")
            log_result(f"Features: {X.shape[1]}")

        else:
            log_error("Training failed")
            sys.exit(1)

    except Exception as e:
        log_error(f"General error: {e}")
        logging.error(traceback.format_exc())
        sys.exit(1)


if __name__ == "__main__":
    main()