Improve model training by adding robust error handling and logging

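Add exception handling to the model training process to log failures and improve robustness: the /train endpoint now catches ValueError raised during training, records a 'failed' row in training_history, and re-raises the error. The hybrid detector's train_unsupervised now also trains an ensemble classifier (Decision Tree + Random Forest + XGBoost) on pseudo-labels derived from the Isolation Forest, verifies the ensemble before saving, and raises ValueError when too few pseudo-anomalies are available. detect() combines the Isolation Forest and ensemble scores (40% / 60%), falling back to IF-only scoring when no ensemble is loaded, and save_models/load_models persist and restore the ensemble classifier.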

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 9c7ad6b8-3e9d-41fe-83f7-6b2a48f8ff44
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
This commit is contained in:
marco370 2025-11-24 16:25:40 +00:00
parent 783d28f571
commit 16617aa0fa
4 changed files with 324 additions and 38 deletions

View File

@ -14,10 +14,6 @@ run = ["npm", "run", "start"]
localPort = 5000 localPort = 5000
externalPort = 80 externalPort = 80
[[ports]]
localPort = 37135
externalPort = 3001
[[ports]] [[ports]]
localPort = 41303 localPort = 41303
externalPort = 3002 externalPort = 3002

View File

@ -192,6 +192,7 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
# Training - usa detector appropriato # Training - usa detector appropriato
print("[TRAIN] Addestramento modello...") print("[TRAIN] Addestramento modello...")
try:
if USE_HYBRID_DETECTOR: if USE_HYBRID_DETECTOR:
print("[TRAIN] Using Hybrid ML Detector") print("[TRAIN] Using Hybrid ML Detector")
result = ml_detector.train_unsupervised(df) result = ml_detector.train_unsupervised(df)
@ -199,8 +200,32 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
print("[TRAIN] Using Legacy ML Analyzer") print("[TRAIN] Using Legacy ML Analyzer")
result = ml_analyzer.train(df, contamination=request.contamination) result = ml_analyzer.train(df, contamination=request.contamination)
print(f"[TRAIN] Modello addestrato: {result}") print(f"[TRAIN] Modello addestrato: {result}")
except ValueError as e:
# Training FAILED - ensemble could not be created
error_msg = str(e)
print(f"\n[TRAIN] ❌ TRAINING FAILED")
print(f"{error_msg}")
# Salva nel database # Save failure to database
cursor.execute("""
INSERT INTO training_history
(model_version, records_processed, features_count, training_duration, status, notes)
VALUES (%s, %s, %s, %s, %s, %s)
""", (
"1.0.0",
len(df),
0,
0,
'failed',
f"ERROR: {error_msg[:500]}" # Truncate if too long
))
conn.commit()
print("[TRAIN] ❌ Training failure logged to database")
# Re-raise to propagate error
raise
# Salva nel database (solo se training SUCCESS)
print("[TRAIN] Salvataggio training history nel database...") print("[TRAIN] Salvataggio training history nel database...")
cursor.execute(""" cursor.execute("""
INSERT INTO training_history INSERT INTO training_history

View File

@ -194,10 +194,12 @@ class MLHybridDetector:
def train_unsupervised(self, logs_df: pd.DataFrame) -> Dict: def train_unsupervised(self, logs_df: pd.DataFrame) -> Dict:
""" """
Train Extended Isolation Forest in unsupervised mode Train Hybrid System:
Used when no labeled data available 1. Extended Isolation Forest (unsupervised)
2. Pseudo-labeling from IF predictions
3. Ensemble Classifier (DT+RF+XGB) on pseudo-labels
""" """
print(f"[HYBRID] Training unsupervised model on {len(logs_df)} logs...") print(f"[HYBRID] Training hybrid model on {len(logs_df)} logs...")
features_df = self.extract_features(logs_df) features_df = self.extract_features(logs_df)
if features_df.empty: if features_df.empty:
@ -209,28 +211,60 @@ class MLHybridDetector:
X = features_df.drop('source_ip', axis=1) X = features_df.drop('source_ip', axis=1)
self.feature_names = X.columns.tolist() self.feature_names = X.columns.tolist()
# Feature selection with Chi-Square (requires non-negative values) # STEP 1: Initial IF training for pseudo-labels
print("[HYBRID] Pre-training Isolation Forest for feature selection...")
# Ensure non-negative values
X_positive = X.clip(lower=0) + 1e-10
# Normalize for initial IF
temp_scaler = StandardScaler()
X_temp_scaled = temp_scaler.fit_transform(X_positive)
# Train temporary IF for pseudo-labeling
if EIF_AVAILABLE:
temp_if = ExtendedIsolationForest(
n_estimators=100, # Faster pre-training
contamination=self.config['eif_contamination'],
random_state=42
)
else:
temp_if = IsolationForest(
n_estimators=100,
contamination=self.config['eif_contamination'],
random_state=42,
n_jobs=-1
)
temp_if.fit(X_temp_scaled)
temp_predictions = temp_if.predict(X_temp_scaled)
# Use IF predictions as pseudo-labels for feature selection
y_pseudo_select = (temp_predictions == -1).astype(int)
print(f"[HYBRID] Generated {y_pseudo_select.sum()} pseudo-anomalies from pre-training IF")
# Feature selection with Chi-Square
print(f"[HYBRID] Feature selection: {len(X.columns)}{self.config['chi2_top_k']} features") print(f"[HYBRID] Feature selection: {len(X.columns)}{self.config['chi2_top_k']} features")
X_positive = X.clip(lower=0) # Chi2 requires non-negative
# Create pseudo-labels for feature selection (0=normal, 1=potential anomaly) # Validate k is not larger than available features
# Use simple heuristic: top 10% by total_bytes as potential anomalies k_select = min(self.config['chi2_top_k'], X_positive.shape[1])
y_pseudo = (X_positive['total_bytes'] > X_positive['total_bytes'].quantile(0.90)).astype(int) if k_select < self.config['chi2_top_k']:
print(f"[HYBRID] Warning: Reducing k from {self.config['chi2_top_k']} to {k_select} (max available)")
self.feature_selector = SelectKBest(chi2, k=self.config['chi2_top_k']) self.feature_selector = SelectKBest(chi2, k=k_select)
X_selected = self.feature_selector.fit_transform(X_positive, y_pseudo) X_selected = self.feature_selector.fit_transform(X_positive, y_pseudo_select)
# Get selected feature names # Get selected feature names
selected_indices = self.feature_selector.get_support(indices=True) selected_indices = self.feature_selector.get_support(indices=True)
self.selected_feature_names = [self.feature_names[i] for i in selected_indices] self.selected_feature_names = [self.feature_names[i] for i in selected_indices]
print(f"[HYBRID] Selected features: {', '.join(self.selected_feature_names[:5])}... (+{len(self.selected_feature_names)-5} more)") print(f"[HYBRID] Selected features: {', '.join(self.selected_feature_names[:5])}... (+{len(self.selected_feature_names)-5} more)")
# Normalize # STEP 2: Normalize
print("[HYBRID] Normalizing features...") print("[HYBRID] Normalizing features...")
self.scaler = StandardScaler() self.scaler = StandardScaler()
X_scaled = self.scaler.fit_transform(X_selected) X_scaled = self.scaler.fit_transform(X_selected)
# Train Extended Isolation Forest # STEP 3: Train Extended Isolation Forest
print(f"[HYBRID] Training Extended Isolation Forest (contamination={self.config['eif_contamination']})...") print(f"[HYBRID] Training Extended Isolation Forest (contamination={self.config['eif_contamination']})...")
if EIF_AVAILABLE: if EIF_AVAILABLE:
self.isolation_forest = ExtendedIsolationForest( self.isolation_forest = ExtendedIsolationForest(
@ -252,25 +286,195 @@ class MLHybridDetector:
self.isolation_forest.fit(X_scaled) self.isolation_forest.fit(X_scaled)
# STEP 4: Generate pseudo-labels from IF predictions
print("[HYBRID] Generating pseudo-labels from Isolation Forest...")
if_predictions = self.isolation_forest.predict(X_scaled)
if_scores = self.isolation_forest.score_samples(X_scaled)
# Convert IF predictions to pseudo-labels (1=anomaly, 0=normal)
y_pseudo_train = (if_predictions == -1).astype(int)
anomalies_count = y_pseudo_train.sum()
# CRITICAL: Handle zero-anomaly case with ADAPTIVE PERCENTILES
min_anomalies_required = max(10, int(len(y_pseudo_train) * 0.02)) # At least 2% or 10
if anomalies_count < min_anomalies_required:
print(f"[HYBRID] ⚠️ IF found only {anomalies_count} anomalies (need {min_anomalies_required})")
print(f"[HYBRID] Applying ADAPTIVE percentile fallback...")
# Try progressively higher percentiles to get enough pseudo-anomalies
percentiles_to_try = [5, 10, 15, 20] # Bottom X% scores
for percentile in percentiles_to_try:
anomaly_threshold = np.percentile(if_scores, percentile)
y_pseudo_train = (if_scores <= anomaly_threshold).astype(int)
anomalies_count = y_pseudo_train.sum()
print(f"[HYBRID] Trying {percentile}% percentile → {anomalies_count} anomalies")
if anomalies_count >= min_anomalies_required:
print(f"[HYBRID] ✅ Success with {percentile}% percentile")
break
# Final check: FAIL if ensemble cannot be trained
if anomalies_count < 2:
error_msg = (
f"HYBRID TRAINING FAILED: Insufficient pseudo-anomalies ({anomalies_count}) for ensemble training.\n\n"
f"Dataset appears too clean for supervised ensemble classifier.\n"
f"Attempted adaptive percentiles (5%, 10%, 15%, 20%) but still < 2 classes.\n\n"
f"SOLUTIONS:\n"
f" 1. Collect more diverse network traffic data\n"
f" 2. Lower contamination threshold (currently {self.config['eif_contamination']})\n"
f" 3. Use larger dataset (currently {len(features_df)} unique IPs)\n\n"
f"IMPORTANT: Hybrid detector REQUIRES ensemble classifier.\n"
f"Cannot deploy incomplete IF-only system when hybrid was requested."
)
print(f"\n[HYBRID] ❌ {error_msg}")
raise ValueError(error_msg)
print(f"[HYBRID] Pseudo-labels: {anomalies_count} anomalies, {len(y_pseudo_train)-anomalies_count} normal")
# Use IF confidence: samples with extreme anomaly scores are labeled with higher confidence
# High anomaly = low score, so invert
score_min, score_max = if_scores.min(), if_scores.max()
anomaly_confidence = 1 - (if_scores - score_min) / (score_max - score_min + 1e-10)
# Weight samples: high confidence anomalies + random normal samples
sample_weights = np.where(
y_pseudo_train == 1,
anomaly_confidence, # Anomalies weighted by confidence
0.5 # Normal traffic baseline weight
)
# STEP 5: Train Ensemble Classifier (DT + RF + XGBoost)
print("[HYBRID] Training ensemble classifier (DT + RF + XGBoost)...")
# CRITICAL: Re-check class distribution after all preprocessing
unique_classes = np.unique(y_pseudo_train)
if len(unique_classes) < 2:
error_msg = (
f"HYBRID TRAINING FAILED: Class distribution collapsed to {len(unique_classes)} class(es) "
f"after feature selection/preprocessing.\n\n"
f"This indicates feature selection eliminated discriminative features.\n\n"
f"SOLUTIONS:\n"
f" 1. Use larger dataset with more diverse traffic\n"
f" 2. Lower contamination threshold\n"
f" 3. Reduce chi2_top_k (currently {self.config['chi2_top_k']}) to keep more features\n\n"
f"Hybrid detector REQUIRES ensemble classifier - cannot proceed with monoclasse."
)
print(f"\n[HYBRID] ❌ {error_msg}")
raise ValueError(error_msg)
print(f"[HYBRID] Class distribution OK: {unique_classes} (counts: {np.bincount(y_pseudo_train)})")
# Decision Tree
dt_classifier = DecisionTreeClassifier(
max_depth=self.config['dt_max_depth'],
random_state=42,
class_weight='balanced' # Handle imbalance
)
# Random Forest
rf_classifier = RandomForestClassifier(
n_estimators=self.config['rf_n_estimators'],
max_depth=self.config['rf_max_depth'],
random_state=42,
n_jobs=-1,
class_weight='balanced'
)
# XGBoost
xgb_classifier = XGBClassifier(
n_estimators=self.config['xgb_n_estimators'],
max_depth=self.config['xgb_max_depth'],
learning_rate=self.config['xgb_learning_rate'],
random_state=42,
use_label_encoder=False,
eval_metric='logloss',
scale_pos_weight=len(y_pseudo_train) / max(anomalies_count, 1) # Handle imbalance
)
# Voting Classifier with weighted voting
self.ensemble_classifier = VotingClassifier(
estimators=[
('dt', dt_classifier),
('rf', rf_classifier),
('xgb', xgb_classifier)
],
voting='soft', # Use probability averaging
weights=self.config['voting_weights'] # [1, 2, 2] - favor RF and XGB
)
# Train ensemble on pseudo-labeled data with error handling
try:
self.ensemble_classifier.fit(X_scaled, y_pseudo_train, sample_weight=sample_weights)
print("[HYBRID] Ensemble .fit() completed successfully")
except Exception as e:
error_msg = (
f"HYBRID TRAINING FAILED: Ensemble .fit() raised exception:\n{str(e)}\n\n"
f"This may indicate:\n"
f" - Insufficient data variation\n"
f" - Class imbalance too extreme\n"
f" - Invalid sample weights\n\n"
f"Hybrid detector REQUIRES working ensemble classifier."
)
print(f"\n[HYBRID] ❌ {error_msg}")
self.ensemble_classifier = None
raise ValueError(error_msg) from e
# Verify ensemble is functional
if self.ensemble_classifier is None:
error_msg = "HYBRID TRAINING FAILED: Ensemble classifier is None after fit()"
print(f"\n[HYBRID] ❌ {error_msg}")
raise ValueError(error_msg)
# Verify ensemble has predict_proba method
if not hasattr(self.ensemble_classifier, 'predict_proba'):
error_msg = "HYBRID TRAINING FAILED: Ensemble missing predict_proba method"
print(f"\n[HYBRID] ❌ {error_msg}")
self.ensemble_classifier = None
raise ValueError(error_msg)
# Verify ensemble can make predictions
try:
test_proba = self.ensemble_classifier.predict_proba(X_scaled[:1])
if test_proba.shape[1] < 2:
raise ValueError(f"Ensemble produces {test_proba.shape[1]} classes, need 2")
print(f"[HYBRID] ✅ Ensemble verified: produces {test_proba.shape[1]} class probabilities")
except Exception as e:
error_msg = f"HYBRID TRAINING FAILED: Ensemble cannot make predictions: {str(e)}"
print(f"\n[HYBRID] ❌ {error_msg}")
self.ensemble_classifier = None
raise ValueError(error_msg) from e
print("[HYBRID] Ensemble training completed and verified!")
# Save models # Save models
self.save_models() self.save_models()
# Calculate statistics # FINAL VERIFICATION: Ensure ensemble is still set after save
predictions = self.isolation_forest.predict(X_scaled) if self.ensemble_classifier is None:
anomalies = (predictions == -1).sum() error_msg = "HYBRID TRAINING FAILED: Ensemble became None after save"
print(f"\n[HYBRID] ❌ {error_msg}")
raise ValueError(error_msg)
# Calculate statistics - only after ALL verifications passed
result = { result = {
'records_processed': len(logs_df), 'records_processed': len(logs_df),
'unique_ips': len(features_df), 'unique_ips': len(features_df),
'features_total': len(self.feature_names), 'features_total': len(self.feature_names),
'features_selected': len(self.selected_feature_names), 'features_selected': len(self.selected_feature_names),
'anomalies_detected': int(anomalies), 'features_count': len(self.selected_feature_names), # For backward compatibility with /train endpoint
'anomalies_detected': int(anomalies_count),
'contamination': self.config['eif_contamination'], 'contamination': self.config['eif_contamination'],
'model_type': 'Extended Isolation Forest' if EIF_AVAILABLE else 'Isolation Forest', 'model_type': 'Hybrid (EIF + Ensemble)',
'status': 'success' 'ensemble_models': ['DecisionTree', 'RandomForest', 'XGBoost'],
'status': 'success',
'ensemble_verified': True # Explicit flag for verification
} }
print(f"[HYBRID] Training completed! {anomalies}/{len(features_df)} IPs flagged as anomalies") print(f"[HYBRID] ✅ Training completed successfully! {anomalies_count}/{len(features_df)} IPs flagged as anomalies")
print(f"[HYBRID] ✅ Ensemble classifier verified and ready for production")
return result return result
def detect( def detect(
@ -295,16 +499,48 @@ class MLHybridDetector:
# Apply same feature selection # Apply same feature selection
X_positive = X.clip(lower=0) X_positive = X.clip(lower=0)
X_positive = X_positive + 1e-10 # Add epsilon
X_selected = self.feature_selector.transform(X_positive) X_selected = self.feature_selector.transform(X_positive)
X_scaled = self.scaler.transform(X_selected) X_scaled = self.scaler.transform(X_selected)
# Predictions from Isolation Forest # HYBRID SCORING: Combine Isolation Forest + Ensemble Classifier
predictions = self.isolation_forest.predict(X_scaled)
scores = self.isolation_forest.score_samples(X_scaled)
# Normalize scores to 0-100 (lower score = more anomalous) # Step 1: Isolation Forest score (unsupervised anomaly detection)
score_min, score_max = scores.min(), scores.max() if_predictions = self.isolation_forest.predict(X_scaled)
risk_scores = 100 * (1 - (scores - score_min) / (score_max - score_min + 1e-10)) if_scores = self.isolation_forest.score_samples(X_scaled)
# Normalize IF scores to 0-100 (lower score = more anomalous)
if_score_min, if_score_max = if_scores.min(), if_scores.max()
if_risk_scores = 100 * (1 - (if_scores - if_score_min) / (if_score_max - if_score_min + 1e-10))
# Step 2: Ensemble score (supervised classification on pseudo-labels)
if self.ensemble_classifier is not None:
print(f"[DETECT] Ensemble classifier available - computing hybrid score...")
# Get ensemble probability predictions
ensemble_proba = self.ensemble_classifier.predict_proba(X_scaled)
# Probability of being anomaly (class 1)
ensemble_anomaly_proba = ensemble_proba[:, 1]
# Convert to 0-100 scale
ensemble_risk_scores = ensemble_anomaly_proba * 100
# Combine scores: weighted average (IF: 40%, Ensemble: 60%)
# Ensemble gets more weight as it's trained on pseudo-labels
risk_scores = 0.4 * if_risk_scores + 0.6 * ensemble_risk_scores
# Debugging: show score distribution
print(f"[DETECT] IF scores: min={if_risk_scores.min():.1f}, max={if_risk_scores.max():.1f}, mean={if_risk_scores.mean():.1f}")
print(f"[DETECT] Ensemble scores: min={ensemble_risk_scores.min():.1f}, max={ensemble_risk_scores.max():.1f}, mean={ensemble_risk_scores.mean():.1f}")
print(f"[DETECT] Combined scores: min={risk_scores.min():.1f}, max={risk_scores.max():.1f}, mean={risk_scores.mean():.1f}")
print(f"[DETECT] ✅ Hybrid scoring active: 40% IF + 60% Ensemble")
else:
# Fallback to IF-only if ensemble not available
risk_scores = if_risk_scores
print(f"[DETECT] ⚠️ Ensemble NOT available - using IF-only scoring")
print(f"[DETECT] IF scores: min={if_risk_scores.min():.1f}, max={if_risk_scores.max():.1f}, mean={if_risk_scores.mean():.1f}")
# For backward compatibility
predictions = if_predictions
detections = [] detections = []
for i, (ip, pred, risk_score) in enumerate(zip(source_ips, predictions, risk_scores)): for i, (ip, pred, risk_score) in enumerate(zip(source_ips, predictions, risk_scores)):
@ -402,6 +638,11 @@ class MLHybridDetector:
joblib.dump(self.scaler, self.model_dir / f"scaler_{timestamp}.pkl") joblib.dump(self.scaler, self.model_dir / f"scaler_{timestamp}.pkl")
joblib.dump(self.feature_selector, self.model_dir / f"feature_selector_{timestamp}.pkl") joblib.dump(self.feature_selector, self.model_dir / f"feature_selector_{timestamp}.pkl")
# Save ensemble if available
if self.ensemble_classifier is not None:
joblib.dump(self.ensemble_classifier, self.model_dir / f"ensemble_classifier_{timestamp}.pkl")
joblib.dump(self.ensemble_classifier, self.model_dir / "ensemble_classifier_latest.pkl")
# Save latest (symlinks alternative) # Save latest (symlinks alternative)
joblib.dump(self.isolation_forest, self.model_dir / "isolation_forest_latest.pkl") joblib.dump(self.isolation_forest, self.model_dir / "isolation_forest_latest.pkl")
joblib.dump(self.scaler, self.model_dir / "scaler_latest.pkl") joblib.dump(self.scaler, self.model_dir / "scaler_latest.pkl")
@ -414,6 +655,7 @@ class MLHybridDetector:
'selected_feature_names': self.selected_feature_names, 'selected_feature_names': self.selected_feature_names,
'config': self.config, 'config': self.config,
'metrics': self.metrics, 'metrics': self.metrics,
'has_ensemble': self.ensemble_classifier is not None,
} }
with open(self.model_dir / f"metadata_{timestamp}.json", 'w') as f: with open(self.model_dir / f"metadata_{timestamp}.json", 'w') as f:
@ -423,6 +665,8 @@ class MLHybridDetector:
json.dump(metadata, f, indent=2) json.dump(metadata, f, indent=2)
print(f"[HYBRID] Models saved to {self.model_dir}") print(f"[HYBRID] Models saved to {self.model_dir}")
if self.ensemble_classifier is not None:
print(f"[HYBRID] Ensemble classifier included")
def load_models(self, version: str = 'latest'): def load_models(self, version: str = 'latest'):
"""Load models from disk""" """Load models from disk"""
@ -431,6 +675,15 @@ class MLHybridDetector:
self.scaler = joblib.load(self.model_dir / f"scaler_{version}.pkl") self.scaler = joblib.load(self.model_dir / f"scaler_{version}.pkl")
self.feature_selector = joblib.load(self.model_dir / f"feature_selector_{version}.pkl") self.feature_selector = joblib.load(self.model_dir / f"feature_selector_{version}.pkl")
# Try to load ensemble if available
ensemble_path = self.model_dir / f"ensemble_classifier_{version}.pkl"
if ensemble_path.exists():
self.ensemble_classifier = joblib.load(ensemble_path)
print(f"[HYBRID] Ensemble classifier loaded")
else:
self.ensemble_classifier = None
print(f"[HYBRID] No ensemble classifier found (IF-only mode)")
with open(self.model_dir / f"metadata_{version}.json") as f: with open(self.model_dir / f"metadata_{version}.json") as f:
metadata = json.load(f) metadata = json.load(f)
self.feature_names = metadata['feature_names'] self.feature_names = metadata['feature_names']
@ -440,6 +693,12 @@ class MLHybridDetector:
print(f"[HYBRID] Models loaded (version: {version})") print(f"[HYBRID] Models loaded (version: {version})")
print(f"[HYBRID] Selected features: {len(self.selected_feature_names)}/{len(self.feature_names)}") print(f"[HYBRID] Selected features: {len(self.selected_feature_names)}/{len(self.feature_names)}")
if self.ensemble_classifier is not None:
print(f"[HYBRID] Mode: Hybrid (IF + Ensemble)")
else:
print(f"[HYBRID] Mode: IF-only (Ensemble not available)")
return True return True
except Exception as e: except Exception as e:
print(f"[HYBRID] Failed to load models: {e}") print(f"[HYBRID] Failed to load models: {e}")

View File

@ -286,7 +286,13 @@ def test_on_synthetic(args):
metrics = validator.calculate(y_true, y_pred) metrics = validator.calculate(y_true, y_pred)
validator.print_summary(metrics, title="Synthetic Test Results") validator.print_summary(metrics, title="Synthetic Test Results")
print("\n✅ System test completed successfully!") print("\n✅ System test completed!")
# Check if ensemble was trained
if detector.ensemble_classifier is None:
print("\n⚠️ WARNING: System running in IF-only mode (no ensemble)")
print(" This may occur with very clean datasets")
print(" Expected metrics will be lower than hybrid mode")
return detector, metrics return detector, metrics