Improve model training by adding robust error handling and logging

Add exception handling to the model training process to log failures and improve robustness.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 9c7ad6b8-3e9d-41fe-83f7-6b2a48f8ff44
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
2025-11-24 16:25:40 +00:00 · 2025-11-24 16:25:40 +00:00 · 16617aa0fa
commit 16617aa0fa
parent 783d28f571
4 changed files with 324 additions and 38 deletions
--- a/.replit
+++ b/.replit
@ -14,10 +14,6 @@ run = ["npm", "run", "start"]
 localPort = 5000
 externalPort = 80
 [[ports]]
 localPort = 37135
 externalPort = 3001
 [[ports]]
 localPort = 41303
 externalPort = 3002
--- a/python_ml/main.py
+++ b/python_ml/main.py
@ -192,6 +192,7 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
            # Training - usa detector appropriato
            print("[TRAIN] Addestramento modello...")
            try:
                if USE_HYBRID_DETECTOR:
                    print("[TRAIN] Using Hybrid ML Detector")
                    result = ml_detector.train_unsupervised(df)
@ -199,8 +200,32 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
                    print("[TRAIN] Using Legacy ML Analyzer")
                    result = ml_analyzer.train(df, contamination=request.contamination)
                print(f"[TRAIN] Modello addestrato: {result}")
            except ValueError as e:
                # Training FAILED - ensemble could not be created
                error_msg = str(e)
                print(f"\n[TRAIN] ❌ TRAINING FAILED")
                print(f"{error_msg}")
-            # Salva nel database
+                # Save failure to database
                cursor.execute("""
                    INSERT INTO training_history 
                    (model_version, records_processed, features_count, training_duration, status, notes)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """, (
                    "1.0.0",
                    len(df),
                    0,
                    0,
                    'failed',
                    f"ERROR: {error_msg[:500]}"  # Truncate if too long
                ))
                conn.commit()
                print("[TRAIN] ❌ Training failure logged to database")
                # Re-raise to propagate error
                raise
            # Salva nel database (solo se training SUCCESS)
            print("[TRAIN] Salvataggio training history nel database...")
            cursor.execute("""
                INSERT INTO training_history 
--- a/python_ml/ml_hybrid_detector.py
+++ b/python_ml/ml_hybrid_detector.py
@ -194,10 +194,12 @@ class MLHybridDetector:
    def train_unsupervised(self, logs_df: pd.DataFrame) -> Dict:
        """
-        Train Extended Isolation Forest in unsupervised mode
+        Train Hybrid System:
-        Used when no labeled data available
+        1. Extended Isolation Forest (unsupervised)
        2. Pseudo-labeling from IF predictions
        3. Ensemble Classifier (DT+RF+XGB) on pseudo-labels
        """
-        print(f"[HYBRID] Training unsupervised model on {len(logs_df)} logs...")
+        print(f"[HYBRID] Training hybrid model on {len(logs_df)} logs...")
        features_df = self.extract_features(logs_df)
        if features_df.empty:
@ -209,28 +211,60 @@ class MLHybridDetector:
        X = features_df.drop('source_ip', axis=1)
        self.feature_names = X.columns.tolist()
-        # Feature selection with Chi-Square (requires non-negative values)
+        # STEP 1: Initial IF training for pseudo-labels
        print("[HYBRID] Pre-training Isolation Forest for feature selection...")
        # Ensure non-negative values
        X_positive = X.clip(lower=0) + 1e-10
        # Normalize for initial IF
        temp_scaler = StandardScaler()
        X_temp_scaled = temp_scaler.fit_transform(X_positive)
        # Train temporary IF for pseudo-labeling
        if EIF_AVAILABLE:
            temp_if = ExtendedIsolationForest(
                n_estimators=100,  # Faster pre-training
                contamination=self.config['eif_contamination'],
                random_state=42
            )
        else:
            temp_if = IsolationForest(
                n_estimators=100,
                contamination=self.config['eif_contamination'],
                random_state=42,
                n_jobs=-1
            )
        temp_if.fit(X_temp_scaled)
        temp_predictions = temp_if.predict(X_temp_scaled)
        # Use IF predictions as pseudo-labels for feature selection
        y_pseudo_select = (temp_predictions == -1).astype(int)
        print(f"[HYBRID] Generated {y_pseudo_select.sum()} pseudo-anomalies from pre-training IF")
        # Feature selection with Chi-Square
        print(f"[HYBRID] Feature selection: {len(X.columns)} → {self.config['chi2_top_k']} features")
        X_positive = X.clip(lower=0)  # Chi2 requires non-negative
-        # Create pseudo-labels for feature selection (0=normal, 1=potential anomaly)
+        # Validate k is not larger than available features
-        # Use simple heuristic: top 10% by total_bytes as potential anomalies
+        k_select = min(self.config['chi2_top_k'], X_positive.shape[1])
-        y_pseudo = (X_positive['total_bytes'] > X_positive['total_bytes'].quantile(0.90)).astype(int)
+        if k_select < self.config['chi2_top_k']:
            print(f"[HYBRID] Warning: Reducing k from {self.config['chi2_top_k']} to {k_select} (max available)")
-        self.feature_selector = SelectKBest(chi2, k=self.config['chi2_top_k'])
+        self.feature_selector = SelectKBest(chi2, k=k_select)
-        X_selected = self.feature_selector.fit_transform(X_positive, y_pseudo)
+        X_selected = self.feature_selector.fit_transform(X_positive, y_pseudo_select)
        # Get selected feature names
        selected_indices = self.feature_selector.get_support(indices=True)
        self.selected_feature_names = [self.feature_names[i] for i in selected_indices]
        print(f"[HYBRID] Selected features: {', '.join(self.selected_feature_names[:5])}... (+{len(self.selected_feature_names)-5} more)")
-        # Normalize
+        # STEP 2: Normalize
        print("[HYBRID] Normalizing features...")
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_selected)
-        # Train Extended Isolation Forest
+        # STEP 3: Train Extended Isolation Forest
        print(f"[HYBRID] Training Extended Isolation Forest (contamination={self.config['eif_contamination']})...")
        if EIF_AVAILABLE:
            self.isolation_forest = ExtendedIsolationForest(
@ -252,25 +286,195 @@ class MLHybridDetector:
        self.isolation_forest.fit(X_scaled)
        # STEP 4: Generate pseudo-labels from IF predictions
        print("[HYBRID] Generating pseudo-labels from Isolation Forest...")
        if_predictions = self.isolation_forest.predict(X_scaled)
        if_scores = self.isolation_forest.score_samples(X_scaled)
        # Convert IF predictions to pseudo-labels (1=anomaly, 0=normal)
        y_pseudo_train = (if_predictions == -1).astype(int)
        anomalies_count = y_pseudo_train.sum()
        # CRITICAL: Handle zero-anomaly case with ADAPTIVE PERCENTILES
        min_anomalies_required = max(10, int(len(y_pseudo_train) * 0.02))  # At least 2% or 10
        if anomalies_count < min_anomalies_required:
            print(f"[HYBRID] ⚠️  IF found only {anomalies_count} anomalies (need {min_anomalies_required})")
            print(f"[HYBRID] Applying ADAPTIVE percentile fallback...")
            # Try progressively higher percentiles to get enough pseudo-anomalies
            percentiles_to_try = [5, 10, 15, 20]  # Bottom X% scores
            for percentile in percentiles_to_try:
                anomaly_threshold = np.percentile(if_scores, percentile)
                y_pseudo_train = (if_scores <= anomaly_threshold).astype(int)
                anomalies_count = y_pseudo_train.sum()
                print(f"[HYBRID]   Trying {percentile}% percentile → {anomalies_count} anomalies")
                if anomalies_count >= min_anomalies_required:
                    print(f"[HYBRID] ✅ Success with {percentile}% percentile")
                    break
        # Final check: FAIL if ensemble cannot be trained
        if anomalies_count < 2:
            error_msg = (
                f"HYBRID TRAINING FAILED: Insufficient pseudo-anomalies ({anomalies_count}) for ensemble training.\n\n"
                f"Dataset appears too clean for supervised ensemble classifier.\n"
                f"Attempted adaptive percentiles (5%, 10%, 15%, 20%) but still < 2 classes.\n\n"
                f"SOLUTIONS:\n"
                f"  1. Collect more diverse network traffic data\n"
                f"  2. Lower contamination threshold (currently {self.config['eif_contamination']})\n"
                f"  3. Use larger dataset (currently {len(features_df)} unique IPs)\n\n"
                f"IMPORTANT: Hybrid detector REQUIRES ensemble classifier.\n"
                f"Cannot deploy incomplete IF-only system when hybrid was requested."
            )
            print(f"\n[HYBRID] ❌ {error_msg}")
            raise ValueError(error_msg)
        print(f"[HYBRID] Pseudo-labels: {anomalies_count} anomalies, {len(y_pseudo_train)-anomalies_count} normal")
        # Use IF confidence: samples with extreme anomaly scores are labeled with higher confidence
        # High anomaly = low score, so invert
        score_min, score_max = if_scores.min(), if_scores.max()
        anomaly_confidence = 1 - (if_scores - score_min) / (score_max - score_min + 1e-10)
        # Weight samples: high confidence anomalies + random normal samples
        sample_weights = np.where(
            y_pseudo_train == 1,
            anomaly_confidence,  # Anomalies weighted by confidence
            0.5  # Normal traffic baseline weight
        )
        # STEP 5: Train Ensemble Classifier (DT + RF + XGBoost)
        print("[HYBRID] Training ensemble classifier (DT + RF + XGBoost)...")
        # CRITICAL: Re-check class distribution after all preprocessing
        unique_classes = np.unique(y_pseudo_train)
        if len(unique_classes) < 2:
            error_msg = (
                f"HYBRID TRAINING FAILED: Class distribution collapsed to {len(unique_classes)} class(es) "
                f"after feature selection/preprocessing.\n\n"
                f"This indicates feature selection eliminated discriminative features.\n\n"
                f"SOLUTIONS:\n"
                f"  1. Use larger dataset with more diverse traffic\n"
                f"  2. Lower contamination threshold\n"
                f"  3. Reduce chi2_top_k (currently {self.config['chi2_top_k']}) to keep more features\n\n"
                f"Hybrid detector REQUIRES ensemble classifier - cannot proceed with monoclasse."
            )
            print(f"\n[HYBRID] ❌ {error_msg}")
            raise ValueError(error_msg)
        print(f"[HYBRID] Class distribution OK: {unique_classes} (counts: {np.bincount(y_pseudo_train)})")
        # Decision Tree
        dt_classifier = DecisionTreeClassifier(
            max_depth=self.config['dt_max_depth'],
            random_state=42,
            class_weight='balanced'  # Handle imbalance
        )
        # Random Forest
        rf_classifier = RandomForestClassifier(
            n_estimators=self.config['rf_n_estimators'],
            max_depth=self.config['rf_max_depth'],
            random_state=42,
            n_jobs=-1,
            class_weight='balanced'
        )
        # XGBoost
        xgb_classifier = XGBClassifier(
            n_estimators=self.config['xgb_n_estimators'],
            max_depth=self.config['xgb_max_depth'],
            learning_rate=self.config['xgb_learning_rate'],
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss',
            scale_pos_weight=len(y_pseudo_train) / max(anomalies_count, 1)  # Handle imbalance
        )
        # Voting Classifier with weighted voting
        self.ensemble_classifier = VotingClassifier(
            estimators=[
                ('dt', dt_classifier),
                ('rf', rf_classifier),
                ('xgb', xgb_classifier)
            ],
            voting='soft',  # Use probability averaging
            weights=self.config['voting_weights']  # [1, 2, 2] - favor RF and XGB
        )
        # Train ensemble on pseudo-labeled data with error handling
        try:
            self.ensemble_classifier.fit(X_scaled, y_pseudo_train, sample_weight=sample_weights)
            print("[HYBRID] Ensemble .fit() completed successfully")
        except Exception as e:
            error_msg = (
                f"HYBRID TRAINING FAILED: Ensemble .fit() raised exception:\n{str(e)}\n\n"
                f"This may indicate:\n"
                f"  - Insufficient data variation\n"
                f"  - Class imbalance too extreme\n"
                f"  - Invalid sample weights\n\n"
                f"Hybrid detector REQUIRES working ensemble classifier."
            )
            print(f"\n[HYBRID] ❌ {error_msg}")
            self.ensemble_classifier = None
            raise ValueError(error_msg) from e
        # Verify ensemble is functional
        if self.ensemble_classifier is None:
            error_msg = "HYBRID TRAINING FAILED: Ensemble classifier is None after fit()"
            print(f"\n[HYBRID] ❌ {error_msg}")
            raise ValueError(error_msg)
        # Verify ensemble has predict_proba method
        if not hasattr(self.ensemble_classifier, 'predict_proba'):
            error_msg = "HYBRID TRAINING FAILED: Ensemble missing predict_proba method"
            print(f"\n[HYBRID] ❌ {error_msg}")
            self.ensemble_classifier = None
            raise ValueError(error_msg)
        # Verify ensemble can make predictions
        try:
            test_proba = self.ensemble_classifier.predict_proba(X_scaled[:1])
            if test_proba.shape[1] < 2:
                raise ValueError(f"Ensemble produces {test_proba.shape[1]} classes, need 2")
            print(f"[HYBRID] ✅ Ensemble verified: produces {test_proba.shape[1]} class probabilities")
        except Exception as e:
            error_msg = f"HYBRID TRAINING FAILED: Ensemble cannot make predictions: {str(e)}"
            print(f"\n[HYBRID] ❌ {error_msg}")
            self.ensemble_classifier = None
            raise ValueError(error_msg) from e
        print("[HYBRID] Ensemble training completed and verified!")
        # Save models
        self.save_models()
-        # Calculate statistics
+        # FINAL VERIFICATION: Ensure ensemble is still set after save
-        predictions = self.isolation_forest.predict(X_scaled)
+        if self.ensemble_classifier is None:
-        anomalies = (predictions == -1).sum()
+            error_msg = "HYBRID TRAINING FAILED: Ensemble became None after save"
            print(f"\n[HYBRID] ❌ {error_msg}")
            raise ValueError(error_msg)
        # Calculate statistics - only after ALL verifications passed
        result = {
            'records_processed': len(logs_df),
            'unique_ips': len(features_df),
            'features_total': len(self.feature_names),
            'features_selected': len(self.selected_feature_names),
-            'anomalies_detected': int(anomalies),
+            'features_count': len(self.selected_feature_names),  # For backward compatibility with /train endpoint
            'anomalies_detected': int(anomalies_count),
            'contamination': self.config['eif_contamination'],
-            'model_type': 'Extended Isolation Forest' if EIF_AVAILABLE else 'Isolation Forest',
+            'model_type': 'Hybrid (EIF + Ensemble)',
-            'status': 'success'
+            'ensemble_models': ['DecisionTree', 'RandomForest', 'XGBoost'],
            'status': 'success',
            'ensemble_verified': True  # Explicit flag for verification
        }
-        print(f"[HYBRID] Training completed! {anomalies}/{len(features_df)} IPs flagged as anomalies")
+        print(f"[HYBRID] ✅ Training completed successfully! {anomalies_count}/{len(features_df)} IPs flagged as anomalies")
        print(f"[HYBRID] ✅ Ensemble classifier verified and ready for production")
        return result
    def detect(
@ -295,16 +499,48 @@ class MLHybridDetector:
        # Apply same feature selection
        X_positive = X.clip(lower=0)
        X_positive = X_positive + 1e-10  # Add epsilon
        X_selected = self.feature_selector.transform(X_positive)
        X_scaled = self.scaler.transform(X_selected)
-        # Predictions from Isolation Forest
+        # HYBRID SCORING: Combine Isolation Forest + Ensemble Classifier
        predictions = self.isolation_forest.predict(X_scaled)
        scores = self.isolation_forest.score_samples(X_scaled)
-        # Normalize scores to 0-100 (lower score = more anomalous)
+        # Step 1: Isolation Forest score (unsupervised anomaly detection)
-        score_min, score_max = scores.min(), scores.max()
+        if_predictions = self.isolation_forest.predict(X_scaled)
-        risk_scores = 100 * (1 - (scores - score_min) / (score_max - score_min + 1e-10))
+        if_scores = self.isolation_forest.score_samples(X_scaled)
        # Normalize IF scores to 0-100 (lower score = more anomalous)
        if_score_min, if_score_max = if_scores.min(), if_scores.max()
        if_risk_scores = 100 * (1 - (if_scores - if_score_min) / (if_score_max - if_score_min + 1e-10))
        # Step 2: Ensemble score (supervised classification on pseudo-labels)
        if self.ensemble_classifier is not None:
            print(f"[DETECT] Ensemble classifier available - computing hybrid score...")
            # Get ensemble probability predictions
            ensemble_proba = self.ensemble_classifier.predict_proba(X_scaled)
            # Probability of being anomaly (class 1)
            ensemble_anomaly_proba = ensemble_proba[:, 1]
            # Convert to 0-100 scale
            ensemble_risk_scores = ensemble_anomaly_proba * 100
            # Combine scores: weighted average (IF: 40%, Ensemble: 60%)
            # Ensemble gets more weight as it's trained on pseudo-labels
            risk_scores = 0.4 * if_risk_scores + 0.6 * ensemble_risk_scores
            # Debugging: show score distribution
            print(f"[DETECT] IF scores: min={if_risk_scores.min():.1f}, max={if_risk_scores.max():.1f}, mean={if_risk_scores.mean():.1f}")
            print(f"[DETECT] Ensemble scores: min={ensemble_risk_scores.min():.1f}, max={ensemble_risk_scores.max():.1f}, mean={ensemble_risk_scores.mean():.1f}")
            print(f"[DETECT] Combined scores: min={risk_scores.min():.1f}, max={risk_scores.max():.1f}, mean={risk_scores.mean():.1f}")
            print(f"[DETECT] ✅ Hybrid scoring active: 40% IF + 60% Ensemble")
        else:
            # Fallback to IF-only if ensemble not available
            risk_scores = if_risk_scores
            print(f"[DETECT] ⚠️  Ensemble NOT available - using IF-only scoring")
            print(f"[DETECT] IF scores: min={if_risk_scores.min():.1f}, max={if_risk_scores.max():.1f}, mean={if_risk_scores.mean():.1f}")
        # For backward compatibility
        predictions = if_predictions
        detections = []
        for i, (ip, pred, risk_score) in enumerate(zip(source_ips, predictions, risk_scores)):
@ -402,6 +638,11 @@ class MLHybridDetector:
        joblib.dump(self.scaler, self.model_dir / f"scaler_{timestamp}.pkl")
        joblib.dump(self.feature_selector, self.model_dir / f"feature_selector_{timestamp}.pkl")
        # Save ensemble if available
        if self.ensemble_classifier is not None:
            joblib.dump(self.ensemble_classifier, self.model_dir / f"ensemble_classifier_{timestamp}.pkl")
            joblib.dump(self.ensemble_classifier, self.model_dir / "ensemble_classifier_latest.pkl")
        # Save latest (symlinks alternative)
        joblib.dump(self.isolation_forest, self.model_dir / "isolation_forest_latest.pkl")
        joblib.dump(self.scaler, self.model_dir / "scaler_latest.pkl")
@ -414,6 +655,7 @@ class MLHybridDetector:
            'selected_feature_names': self.selected_feature_names,
            'config': self.config,
            'metrics': self.metrics,
            'has_ensemble': self.ensemble_classifier is not None,
        }
        with open(self.model_dir / f"metadata_{timestamp}.json", 'w') as f:
@ -423,6 +665,8 @@ class MLHybridDetector:
            json.dump(metadata, f, indent=2)
        print(f"[HYBRID] Models saved to {self.model_dir}")
        if self.ensemble_classifier is not None:
            print(f"[HYBRID] Ensemble classifier included")
    def load_models(self, version: str = 'latest'):
        """Load models from disk"""
@ -431,6 +675,15 @@ class MLHybridDetector:
            self.scaler = joblib.load(self.model_dir / f"scaler_{version}.pkl")
            self.feature_selector = joblib.load(self.model_dir / f"feature_selector_{version}.pkl")
            # Try to load ensemble if available
            ensemble_path = self.model_dir / f"ensemble_classifier_{version}.pkl"
            if ensemble_path.exists():
                self.ensemble_classifier = joblib.load(ensemble_path)
                print(f"[HYBRID] Ensemble classifier loaded")
            else:
                self.ensemble_classifier = None
                print(f"[HYBRID] No ensemble classifier found (IF-only mode)")
            with open(self.model_dir / f"metadata_{version}.json") as f:
                metadata = json.load(f)
                self.feature_names = metadata['feature_names']
@ -440,6 +693,12 @@ class MLHybridDetector:
            print(f"[HYBRID] Models loaded (version: {version})")
            print(f"[HYBRID] Selected features: {len(self.selected_feature_names)}/{len(self.feature_names)}")
            if self.ensemble_classifier is not None:
                print(f"[HYBRID] Mode: Hybrid (IF + Ensemble)")
            else:
                print(f"[HYBRID] Mode: IF-only (Ensemble not available)")
            return True
        except Exception as e:
            print(f"[HYBRID] Failed to load models: {e}")
--- a/python_ml/train_hybrid.py
+++ b/python_ml/train_hybrid.py
@ -286,7 +286,13 @@ def test_on_synthetic(args):
    metrics = validator.calculate(y_true, y_pred)
    validator.print_summary(metrics, title="Synthetic Test Results")
-    print("\n✅ System test completed successfully!")
+    print("\n✅ System test completed!")
    # Check if ensemble was trained
    if detector.ensemble_classifier is None:
        print("\n⚠️  WARNING: System running in IF-only mode (no ensemble)")
        print("   This may occur with very clean datasets")
        print("   Expected metrics will be lower than hybrid mode")
    return detector, metrics