ids.alfacom.it/extracted_idf/analisys_01.py

#!/usr/bin/env python3
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import text
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from joblib import dump, load
import logging
import gc
import os
import time
from datetime import datetime, timedelta
import numpy as np
import argparse
import sys
import traceback
import warnings
warnings.filterwarnings('ignore')

# Simplified logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('analisys_debug.log')
    ]
)

# Database configuration
DB_USER = os.environ.get('DB_USER', 'root')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'Hdgtejskjjc0-')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_NAME = os.environ.get('DB_DATABASE', 'LOG_MIKROTIK')

# Directory for the models
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models')
try:
    os.makedirs(MODEL_DIR, exist_ok=True)
except Exception as e:
    logging.error(f"Error creating the models directory: {e}")
    MODEL_DIR = '.'

# Model paths
IF_MODEL_PATH = os.path.join(MODEL_DIR, 'isolation_forest.joblib')
LOF_MODEL_PATH = os.path.join(MODEL_DIR, 'lof.joblib')
SVM_MODEL_PATH = os.path.join(MODEL_DIR, 'svm.joblib')
ENSEMBLE_MODEL_PATH = os.path.join(MODEL_DIR, 'ensemble_weights.joblib')
PREPROCESSOR_PATH = os.path.join(MODEL_DIR, 'preprocessor.joblib')

# Simplified parameters
TRAINING_FREQUENCY_HOURS = 12
MAX_TRAINING_SAMPLES = 50000  # Drastically reduced
MIN_TRAINING_SAMPLES = 500
CONNECTION_TIMEOUT = 15  # Reduced timeout

# ANSI colors for terminal output
class Colors:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    END = '\033[0m'

def log_phase(message):
    print(f"\n{Colors.BOLD}{Colors.GREEN}{message}{Colors.END}\n")
    logging.info(f"PHASE: {message}")

def log_result(message):
    print(f"{Colors.BLUE}{message}{Colors.END}")
    logging.info(f"RESULT: {message}")

def log_warning(message):
    print(f"{Colors.YELLOW}{message}{Colors.END}")
    logging.warning(message)

def log_error(message):
    print(f"{Colors.RED}{message}{Colors.END}")
    logging.error(message)

def connect_to_database():
    """Simplified database connection"""
    try:
        log_phase("Connecting to the database")
        connection_string = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"
        # Minimal pool configuration
        engine = create_engine(
            connection_string,
            pool_size=2,
            max_overflow=3,
            pool_recycle=1800,
            pool_pre_ping=True,
            pool_timeout=CONNECTION_TIMEOUT,
            echo=False,
            connect_args={
                'connection_timeout': CONNECTION_TIMEOUT,
                'autocommit': True
            }
        )
        # Quick connection test
        with engine.connect() as conn:
            result = conn.execute(text("SELECT 1")).fetchone()
            if result[0] == 1:
                log_result("Database connected")
                return engine
        return None
    except Exception as e:
        log_error(f"Connection error: {e}")
        return None

def extract_data_simple(engine, max_records=50000):
    """Data extraction with a very simple query"""
    try:
        log_phase(f"Extracting data (max {max_records} records)")
        # VERY simple query - just the latest N records
        query = text("""
            SELECT ID, Data, Ora, Host, IndirizzoIP, Messaggio1, Messaggio2, Messaggio3
            FROM Esterna
            ORDER BY ID DESC
            LIMIT :max_records
        """)
        log_result("Running simple query...")
        # Time the query (no per-query timeout is enforced here)
        start_time = time.time()
        df = pd.read_sql(query, engine, params={"max_records": max_records})
        elapsed = time.time() - start_time
        log_result(f"Extracted {len(df)} records in {elapsed:.1f} seconds")
        return df
    except Exception as e:
        log_error(f"Extraction error: {e}")
        return pd.DataFrame()
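
# Illustrative only (not called anywhere in this script): for datasets too
# large to hold in memory at once, pandas can stream the same query in
# chunks. This is a minimal sketch under that assumption; the chunk size is
# an arbitrary example value, not a tuned setting from this pipeline.
def extract_data_chunked(engine, max_records=50000, chunksize=10000):
    query = text("""
        SELECT ID, Data, Ora, Host, IndirizzoIP, Messaggio1, Messaggio2, Messaggio3
        FROM Esterna
        ORDER BY ID DESC
        LIMIT :max_records
    """)
    # read_sql with chunksize yields DataFrames instead of one big frame
    chunks = pd.read_sql(query, engine, params={"max_records": max_records},
                         chunksize=chunksize)
    return pd.concat(chunks, ignore_index=True)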

def prepare_data_simple(df):
    """Ultra-simplified data preparation"""
    try:
        log_phase("Simplified data preparation")
        if df.empty:
            log_error("Empty DataFrame")
            return None, None
        # Minimal feature set - exactly 50 features for maximum speed
        feature_data = {}
        n_samples = len(df)
        # 1. Basic temporal features (10 features)
        if 'Data' in df.columns and 'Ora' in df.columns:
            try:
                df['Data'] = pd.to_datetime(df['Data'], errors='coerce')
                df['Ora'] = pd.to_timedelta(df['Ora'].astype(str), errors='coerce')
                df['Timestamp'] = df['Data'] + df['Ora']
                feature_data['hour'] = df['Timestamp'].dt.hour.fillna(0).values
                feature_data['day'] = df['Timestamp'].dt.dayofweek.fillna(0).values
                feature_data['minute'] = df['Timestamp'].dt.minute.fillna(0).values
            except Exception:
                feature_data['hour'] = np.zeros(n_samples)
                feature_data['day'] = np.zeros(n_samples)
                feature_data['minute'] = np.zeros(n_samples)
        else:
            feature_data['hour'] = np.zeros(n_samples)
            feature_data['day'] = np.zeros(n_samples)
            feature_data['minute'] = np.zeros(n_samples)
        # Add 7 simple temporal features
        for i in range(7):
            feature_data[f'time_{i}'] = np.random.random(n_samples) * 0.1  # Minimal noise
        # 2. Simple protocol features (15 features)
        if 'Messaggio1' in df.columns:
            proto_data = df['Messaggio1'].fillna('').astype(str)
            # Most common protocols
            protocols = ['TCP', 'UDP', 'ICMP', 'HTTP', 'SSH', 'FTP', 'DNS']
            for i, protocol in enumerate(protocols):
                feature_data[f'proto_{i}'] = proto_data.str.contains(protocol, case=False).astype(int).values
            # Fill the remaining slots
            for i in range(len(protocols), 15):
                feature_data[f'proto_{i}'] = np.zeros(n_samples)
        else:
            for i in range(15):
                feature_data[f'proto_{i}'] = np.zeros(n_samples)
        # 3. Simple Host features (5 features)
        if 'Host' in df.columns:
            host_data = df['Host'].fillna('').astype(str)
            feature_data['host_fibra'] = host_data.str.contains('FIBRA', case=False).astype(int).values
            feature_data['host_empty'] = df['Host'].isna().astype(int).values
            feature_data['host_len'] = host_data.str.len().values / 100.0  # Normalized
        else:
            feature_data['host_fibra'] = np.zeros(n_samples)
            feature_data['host_empty'] = np.zeros(n_samples)
            feature_data['host_len'] = np.zeros(n_samples)
        # Pad up to 5
        for i in range(3, 5):
            feature_data[f'host_{i}'] = np.zeros(n_samples)
        # 4. Simple IP features (10 features)
        if 'Messaggio2' in df.columns:
            ip_data = df['Messaggio2'].str.split(':').str[0].fillna('unknown').astype(str)
            # Simple hash per IP, bucketed at several resolutions
            for i in range(10):
                feature_data[f'ip_{i}'] = (pd.util.hash_array(ip_data.values) % (2**(i+3))) / (2**(i+3))
        else:
            for i in range(10):
                feature_data[f'ip_{i}'] = np.zeros(n_samples)
        # 5. Simple ID features (10 features)
        if 'ID' in df.columns:
            id_values = df['ID'].fillna(0).values
            id_normalized = (id_values - id_values.min()) / (id_values.max() - id_values.min() + 1)
            for i in range(10):
                feature_data[f'id_{i}'] = np.roll(id_normalized, i) * (0.9 ** i)
        else:
            for i in range(10):
                feature_data[f'id_{i}'] = np.zeros(n_samples)
        # Verify that we have exactly 50 features
        total_features = len(feature_data)
        if total_features != 50:
            log_warning(f"Feature count: {total_features}, adjusting to 50")
            if total_features < 50:
                for i in range(total_features, 50):
                    feature_data[f'extra_{i}'] = np.zeros(n_samples)
            else:
                # Drop the surplus features
                keys_to_remove = list(feature_data.keys())[50:]
                for key in keys_to_remove:
                    del feature_data[key]
        # Build the numpy array (sorted names give a deterministic column order)
        feature_names = sorted(feature_data.keys())
        X = np.column_stack([feature_data[name] for name in feature_names])
        # Simple preprocessor
        preprocessor = {
            'feature_columns': feature_names,
            'n_features': len(feature_names)
        }
        # Save the preprocessor
        try:
            dump(preprocessor, PREPROCESSOR_PATH)
            log_result(f"Preprocessor saved: {X.shape[1]} features")
        except Exception as e:
            log_warning(f"Error saving preprocessor: {e}")
        log_result(f"Data prepared: {X.shape[0]} samples, {X.shape[1]} features")
        return X, preprocessor
    except Exception as e:
        log_error(f"Data preparation error: {e}")
        return None, None
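
# Illustrative only (not called anywhere in this script): a minimal sketch of
# how the saved preprocessor could be used at scoring time to check that new
# data gets the same 50-column layout the models were trained on. Everything
# here beyond prepare_data_simple and PREPROCESSOR_PATH is an assumption.
# Note: prepare_data_simple also re-dumps the preprocessor as a side effect.
def build_feature_matrix(df):
    X, _ = prepare_data_simple(df)
    preprocessor = load(PREPROCESSOR_PATH)
    if X is None or X.shape[1] != preprocessor['n_features']:
        raise ValueError("Feature layout no longer matches the saved preprocessor")
    return X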

def train_models_simple(X):
    """Simplified, fast training"""
    try:
        log_phase("Simplified model training")
        if X.shape[0] < MIN_TRAINING_SAMPLES:
            log_error(f"Too few samples: {X.shape[0]} < {MIN_TRAINING_SAMPLES}")
            return None, None, None, None
        # Subsample if necessary
        if X.shape[0] > MAX_TRAINING_SAMPLES:
            log_warning(f"Sampling down from {X.shape[0]} to {MAX_TRAINING_SAMPLES}")
            indices = np.random.choice(X.shape[0], MAX_TRAINING_SAMPLES, replace=False)
            X_train = X[indices]
        else:
            X_train = X
        log_result(f"Training on {X_train.shape[0]} samples")
        # 1. Fast Isolation Forest
        log_result("Training Isolation Forest...")
        if_model = IsolationForest(
            n_estimators=30,  # Heavily reduced
            contamination=0.05,
            random_state=42,
            n_jobs=2,  # Capped to avoid overloading the host
            max_samples=min(500, X_train.shape[0]),
            max_features=0.7
        )
        if_model.fit(X_train)
        log_result("✓ Isolation Forest OK")
        # 2. Fast LOF (very small sample)
        log_result("Training LOF...")
        lof_sample_size = min(5000, X_train.shape[0])
        if X_train.shape[0] > lof_sample_size:
            lof_indices = np.random.choice(X_train.shape[0], lof_sample_size, replace=False)
            lof_sample = X_train[lof_indices]
        else:
            lof_sample = X_train
        lof_model = LocalOutlierFactor(
            n_neighbors=min(5, lof_sample.shape[0] // 50),
            contamination=0.05,
            novelty=True,  # Required so the fitted model can score unseen data
            n_jobs=2
        )
        lof_model.fit(lof_sample)
        log_result("✓ LOF OK")
        # 3. Fast SVM (tiny sample)
        log_result("Training SVM...")
        svm_sample_size = min(2000, X_train.shape[0])
        if X_train.shape[0] > svm_sample_size:
            svm_indices = np.random.choice(X_train.shape[0], svm_sample_size, replace=False)
            svm_sample = X_train[svm_indices]
        else:
            svm_sample = X_train
        svm_model = OneClassSVM(
            kernel='rbf',
            gamma='scale',
            nu=0.05
        )
        svm_model.fit(svm_sample)
        log_result("✓ SVM OK")
        # 4. Persist the models
        log_result("Saving models...")
        try:
            dump(if_model, IF_MODEL_PATH)
            dump(lof_model, LOF_MODEL_PATH)
            dump(svm_model, SVM_MODEL_PATH)
            ensemble_weights = {
                'isolation_forest': 0.70,  # Highest weight for IF
                'lof': 0.20,
                'svm': 0.10
            }
            dump(ensemble_weights, ENSEMBLE_MODEL_PATH)
            log_result("✓ Models saved")
        except Exception as e:
            log_error(f"Save error: {e}")
            return None, None, None, None
        return if_model, lof_model, svm_model, ensemble_weights
    except Exception as e:
        log_error(f"Training error: {e}")
        return None, None, None, None
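
# Illustrative only (not called anywhere in this script): a minimal sketch of
# how the saved ensemble weights might be combined at scoring time. This
# script only trains; the min-max normalization below and the lack of a
# threshold are assumptions, not part of the original pipeline. All three
# fitted estimators expose decision_function(), where lower values indicate
# more anomalous samples.
def score_ensemble(X):
    if_model = load(IF_MODEL_PATH)
    lof_model = load(LOF_MODEL_PATH)
    svm_model = load(SVM_MODEL_PATH)
    weights = load(ENSEMBLE_MODEL_PATH)

    def normalized(scores):
        # Rescale each model's decision scores to [0, 1] so the weights
        # operate on comparable ranges (a hypothetical choice)
        span = scores.max() - scores.min()
        return (scores - scores.min()) / (span + 1e-9)

    combined = (
        weights['isolation_forest'] * normalized(if_model.decision_function(X))
        + weights['lof'] * normalized(lof_model.decision_function(X))
        + weights['svm'] * normalized(svm_model.decision_function(X))
    )
    # Lower combined score = more anomalous; the caller picks the threshold
    return combined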

def save_model_timestamp():
    """Save the training timestamp (simplified)"""
    try:
        with open(os.path.join(MODEL_DIR, 'last_training.txt'), 'w') as f:
            f.write(datetime.now().isoformat())
        log_result("Timestamp saved")
        return True
    except Exception as e:
        log_warning(f"Error saving timestamp: {e}")
        return False

def needs_training(force_training=False):
    """Check whether retraining is needed (simplified)"""
    if force_training:
        log_result("Retraining forced")
        return True
    try:
        timestamp_file = os.path.join(MODEL_DIR, 'last_training.txt')
        if not os.path.exists(timestamp_file):
            log_result("No previous training found")
            return True
        with open(timestamp_file, 'r') as f:
            last_trained_str = f.read().strip()
        last_trained = datetime.fromisoformat(last_trained_str)
        now = datetime.now()
        hours_diff = (now - last_trained).total_seconds() / 3600
        if hours_diff >= TRAINING_FREQUENCY_HOURS:
            log_result(f"Last training: {hours_diff:.1f} hours ago - retraining needed")
            return True
        else:
            log_result(f"Last training: {hours_diff:.1f} hours ago - not needed")
            return False
    except Exception as e:
        log_warning(f"Check error: {e}")
        return True

def test_database_connection():
    """Simple connection test"""
    try:
        engine = connect_to_database()
        if not engine:
            return False
        with engine.connect() as conn:
            result = conn.execute(text("SELECT 1")).fetchone()
            if result and result[0] == 1:
                log_result("Database test OK")
                # Quick check on the Esterna table
                try:
                    result = conn.execute(text("SELECT COUNT(*) FROM Esterna LIMIT 1")).fetchone()
                    log_result("Esterna table reachable")
                except Exception:
                    log_error("Esterna table not reachable")
                    return False
                return True
            return False
    except Exception as e:
        log_error(f"Test error: {e}")
        return False

def main():
    """Simplified main entry point"""
    parser = argparse.ArgumentParser(description='Simplified training v01')
    parser.add_argument('--force-training', action='store_true', help='Force retraining')
    parser.add_argument('--test', action='store_true', help='Connection test only')
    parser.add_argument('--max-records', type=int, default=50000, help='Max records (default: 50k)')
    parser.add_argument('--debug', action='store_true', help='Debug logging')
    args = parser.parse_args()
    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    log_phase("SIMPLIFIED training system v01")
    log_result(f"Config: max {args.max_records} records")
    start_time = time.time()
    # Quick test mode
    if args.test:
        log_phase("Connection test")
        if test_database_connection():
            log_result("Test PASSED")
            sys.exit(0)
        else:
            log_error("Test FAILED")
            sys.exit(1)
    # Connection check
    if not test_database_connection():
        log_error("Database unreachable")
        sys.exit(1)
    # Check whether training is needed
    if not needs_training(args.force_training):
        log_result("Training not needed")
        sys.exit(0)
    try:
        # Connect
        engine = connect_to_database()
        if not engine:
            log_error("Connection failed")
            sys.exit(1)
        # Simple extraction
        df = extract_data_simple(engine, args.max_records)
        if df.empty:
            log_error("No data extracted")
            sys.exit(1)
        # Simple preparation
        X, preprocessor = prepare_data_simple(df)
        if X is None:
            log_error("Data preparation failed")
            sys.exit(1)
        # Simple training
        models = train_models_simple(X)
        if all(m is not None for m in models):
            log_phase("SUCCESS!")
            # Save the timestamp
            save_model_timestamp()
            # Final statistics
            elapsed = time.time() - start_time
            log_result(f"Total time: {elapsed:.1f} seconds")
            log_result(f"Samples: {X.shape[0]}")
            log_result(f"Features: {X.shape[1]}")
        else:
            log_error("Training failed")
            sys.exit(1)
    except Exception as e:
        log_error(f"General error: {e}")
        logging.error(traceback.format_exc())
        sys.exit(1)

if __name__ == "__main__":
    main()
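
# Typical invocations (based on the argparse flags defined above):
#   python analisys_01.py --test                 # connection test only
#   python analisys_01.py                        # train if the 12h window has elapsed
#   python analisys_01.py --force-training       # retrain regardless of the timestamp
#   python analisys_01.py --max-records 20000    # cap the extraction size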