Improve model training by adding robust error handling and logging
Add exception handling to the model training process to log failures and improve robustness.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 9c7ad6b8-3e9d-41fe-83f7-6b2a48f8ff44
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
parent 783d28f571
commit 16617aa0fa

.replit (4 changed lines)
@@ -14,10 +14,6 @@ run = ["npm", "run", "start"]
 localPort = 5000
 externalPort = 80
 
-[[ports]]
-localPort = 37135
-externalPort = 3001
-
 [[ports]]
 localPort = 41303
 externalPort = 3002
@@ -192,6 +192,7 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
 
     # Training - use the appropriate detector
    print("[TRAIN] Addestramento modello...")
-    if USE_HYBRID_DETECTOR:
-        print("[TRAIN] Using Hybrid ML Detector")
-        result = ml_detector.train_unsupervised(df)
+    try:
+        if USE_HYBRID_DETECTOR:
+            print("[TRAIN] Using Hybrid ML Detector")
+            result = ml_detector.train_unsupervised(df)
@@ -199,8 +200,32 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
-        print("[TRAIN] Using Legacy ML Analyzer")
-        result = ml_analyzer.train(df, contamination=request.contamination)
-    print(f"[TRAIN] Modello addestrato: {result}")
+            print("[TRAIN] Using Legacy ML Analyzer")
+            result = ml_analyzer.train(df, contamination=request.contamination)
+        print(f"[TRAIN] Modello addestrato: {result}")
+    except ValueError as e:
+        # Training FAILED - ensemble could not be created
+        error_msg = str(e)
+        print(f"\n[TRAIN] ❌ TRAINING FAILED")
+        print(f"{error_msg}")
 
-    # Save to database
+        # Save failure to database
+        cursor.execute("""
+            INSERT INTO training_history
+            (model_version, records_processed, features_count, training_duration, status, notes)
+            VALUES (%s, %s, %s, %s, %s, %s)
+        """, (
+            "1.0.0",
+            len(df),
+            0,
+            0,
+            'failed',
+            f"ERROR: {error_msg[:500]}"  # Truncate if too long
+        ))
+        conn.commit()
+        print("[TRAIN] ❌ Training failure logged to database")
+
+        # Re-raise to propagate error
+        raise
+
+    # Save to database (only if training SUCCESS)
     print("[TRAIN] Salvataggio training history nel database...")
     cursor.execute("""
         INSERT INTO training_history
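The pattern this hunk introduces can be read in isolation as the sketch below, assuming a psycopg2-style connection/cursor and the training_history schema shown in the diff; the helper names are hypothetical and not part of the commit.

def log_training_failure(conn, records: int, error_msg: str) -> None:
    # Record the failed run so training_history reflects it (psycopg2-style API)
    cursor = conn.cursor()
    cursor.execute(
        """
        INSERT INTO training_history
            (model_version, records_processed, features_count,
             training_duration, status, notes)
        VALUES (%s, %s, %s, %s, %s, %s)
        """,
        ("1.0.0", records, 0, 0, "failed", f"ERROR: {error_msg[:500]}"),
    )
    conn.commit()

def train_with_logging(conn, df, trainer):
    try:
        return trainer(df)  # e.g. ml_detector.train_unsupervised
    except ValueError as exc:
        log_training_failure(conn, len(df), str(exc))
        raise  # re-raise so the endpoint still surfaces the error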
@@ -194,10 +194,12 @@ class MLHybridDetector:
 
     def train_unsupervised(self, logs_df: pd.DataFrame) -> Dict:
         """
-        Train Extended Isolation Forest in unsupervised mode
-        Used when no labeled data available
+        Train Hybrid System:
+        1. Extended Isolation Forest (unsupervised)
+        2. Pseudo-labeling from IF predictions
+        3. Ensemble Classifier (DT+RF+XGB) on pseudo-labels
         """
-        print(f"[HYBRID] Training unsupervised model on {len(logs_df)} logs...")
+        print(f"[HYBRID] Training hybrid model on {len(logs_df)} logs...")
 
         features_df = self.extract_features(logs_df)
         if features_df.empty:
@@ -209,28 +211,60 @@ class MLHybridDetector:
         X = features_df.drop('source_ip', axis=1)
         self.feature_names = X.columns.tolist()
 
-        # Feature selection with Chi-Square (requires non-negative values)
+        # STEP 1: Initial IF training for pseudo-labels
+        print("[HYBRID] Pre-training Isolation Forest for feature selection...")
+
+        # Ensure non-negative values
+        X_positive = X.clip(lower=0) + 1e-10
+
+        # Normalize for initial IF
+        temp_scaler = StandardScaler()
+        X_temp_scaled = temp_scaler.fit_transform(X_positive)
+
+        # Train temporary IF for pseudo-labeling
+        if EIF_AVAILABLE:
+            temp_if = ExtendedIsolationForest(
+                n_estimators=100,  # Faster pre-training
+                contamination=self.config['eif_contamination'],
+                random_state=42
+            )
+        else:
+            temp_if = IsolationForest(
+                n_estimators=100,
+                contamination=self.config['eif_contamination'],
+                random_state=42,
+                n_jobs=-1
+            )
+
+        temp_if.fit(X_temp_scaled)
+        temp_predictions = temp_if.predict(X_temp_scaled)
+
+        # Use IF predictions as pseudo-labels for feature selection
+        y_pseudo_select = (temp_predictions == -1).astype(int)
+        print(f"[HYBRID] Generated {y_pseudo_select.sum()} pseudo-anomalies from pre-training IF")
+
+        # Feature selection with Chi-Square
         print(f"[HYBRID] Feature selection: {len(X.columns)} → {self.config['chi2_top_k']} features")
-        X_positive = X.clip(lower=0)  # Chi2 requires non-negative
 
-        # Create pseudo-labels for feature selection (0=normal, 1=potential anomaly)
-        # Use simple heuristic: top 10% by total_bytes as potential anomalies
-        y_pseudo = (X_positive['total_bytes'] > X_positive['total_bytes'].quantile(0.90)).astype(int)
+        # Validate k is not larger than available features
+        k_select = min(self.config['chi2_top_k'], X_positive.shape[1])
+        if k_select < self.config['chi2_top_k']:
+            print(f"[HYBRID] Warning: Reducing k from {self.config['chi2_top_k']} to {k_select} (max available)")
 
-        self.feature_selector = SelectKBest(chi2, k=self.config['chi2_top_k'])
-        X_selected = self.feature_selector.fit_transform(X_positive, y_pseudo)
+        self.feature_selector = SelectKBest(chi2, k=k_select)
+        X_selected = self.feature_selector.fit_transform(X_positive, y_pseudo_select)
 
         # Get selected feature names
         selected_indices = self.feature_selector.get_support(indices=True)
         self.selected_feature_names = [self.feature_names[i] for i in selected_indices]
         print(f"[HYBRID] Selected features: {', '.join(self.selected_feature_names[:5])}... (+{len(self.selected_feature_names)-5} more)")
 
-        # Normalize
+        # STEP 2: Normalize
         print("[HYBRID] Normalizing features...")
         self.scaler = StandardScaler()
         X_scaled = self.scaler.fit_transform(X_selected)
 
-        # Train Extended Isolation Forest
+        # STEP 3: Train Extended Isolation Forest
         print(f"[HYBRID] Training Extended Isolation Forest (contamination={self.config['eif_contamination']})...")
         if EIF_AVAILABLE:
             self.isolation_forest = ExtendedIsolationForest(
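A minimal sketch of the pre-training step above, with synthetic data standing in for the project's feature matrix (array shapes, contamination value, and k are assumptions for illustration):

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
X = rng.exponential(scale=2.0, size=(500, 12))  # non-negative feature matrix

# Pre-train a quick Isolation Forest to obtain pseudo-labels (1 = anomaly)
X_scaled = StandardScaler().fit_transform(X.clip(min=0) + 1e-10)
temp_if = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
y_pseudo = (temp_if.fit_predict(X_scaled) == -1).astype(int)

# chi2 needs non-negative inputs, so select on the unscaled clipped matrix;
# clamp k so SelectKBest never asks for more features than exist
k_select = min(20, X.shape[1])
selector = SelectKBest(chi2, k=k_select)
X_selected = selector.fit_transform(X.clip(min=0) + 1e-10, y_pseudo)
print(X_selected.shape)  # (500, 12): k was clamped from 20 down to 12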
@@ -252,25 +286,195 @@
 
         self.isolation_forest.fit(X_scaled)
 
+        # STEP 4: Generate pseudo-labels from IF predictions
+        print("[HYBRID] Generating pseudo-labels from Isolation Forest...")
+        if_predictions = self.isolation_forest.predict(X_scaled)
+        if_scores = self.isolation_forest.score_samples(X_scaled)
+
+        # Convert IF predictions to pseudo-labels (1=anomaly, 0=normal)
+        y_pseudo_train = (if_predictions == -1).astype(int)
+        anomalies_count = y_pseudo_train.sum()
+
+        # CRITICAL: Handle zero-anomaly case with ADAPTIVE PERCENTILES
+        min_anomalies_required = max(10, int(len(y_pseudo_train) * 0.02))  # At least 2% or 10
+
+        if anomalies_count < min_anomalies_required:
+            print(f"[HYBRID] ⚠️ IF found only {anomalies_count} anomalies (need {min_anomalies_required})")
+            print(f"[HYBRID] Applying ADAPTIVE percentile fallback...")
+
+            # Try progressively higher percentiles to get enough pseudo-anomalies
+            percentiles_to_try = [5, 10, 15, 20]  # Bottom X% scores
+            for percentile in percentiles_to_try:
+                anomaly_threshold = np.percentile(if_scores, percentile)
+                y_pseudo_train = (if_scores <= anomaly_threshold).astype(int)
+                anomalies_count = y_pseudo_train.sum()
+
+                print(f"[HYBRID] Trying {percentile}% percentile → {anomalies_count} anomalies")
+
+                if anomalies_count >= min_anomalies_required:
+                    print(f"[HYBRID] ✅ Success with {percentile}% percentile")
+                    break
+
+        # Final check: FAIL if ensemble cannot be trained
+        if anomalies_count < 2:
+            error_msg = (
+                f"HYBRID TRAINING FAILED: Insufficient pseudo-anomalies ({anomalies_count}) for ensemble training.\n\n"
+                f"Dataset appears too clean for supervised ensemble classifier.\n"
+                f"Attempted adaptive percentiles (5%, 10%, 15%, 20%) but still < 2 classes.\n\n"
+                f"SOLUTIONS:\n"
+                f"  1. Collect more diverse network traffic data\n"
+                f"  2. Lower contamination threshold (currently {self.config['eif_contamination']})\n"
+                f"  3. Use larger dataset (currently {len(features_df)} unique IPs)\n\n"
+                f"IMPORTANT: Hybrid detector REQUIRES ensemble classifier.\n"
+                f"Cannot deploy incomplete IF-only system when hybrid was requested."
+            )
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            raise ValueError(error_msg)
+
+        print(f"[HYBRID] Pseudo-labels: {anomalies_count} anomalies, {len(y_pseudo_train)-anomalies_count} normal")
+
+        # Use IF confidence: samples with extreme anomaly scores are labeled with higher confidence
+        # High anomaly = low score, so invert
+        score_min, score_max = if_scores.min(), if_scores.max()
+        anomaly_confidence = 1 - (if_scores - score_min) / (score_max - score_min + 1e-10)
+
+        # Weight samples: high confidence anomalies + random normal samples
+        sample_weights = np.where(
+            y_pseudo_train == 1,
+            anomaly_confidence,  # Anomalies weighted by confidence
+            0.5  # Normal traffic baseline weight
+        )
+
+        # STEP 5: Train Ensemble Classifier (DT + RF + XGBoost)
+        print("[HYBRID] Training ensemble classifier (DT + RF + XGBoost)...")
+
+        # CRITICAL: Re-check class distribution after all preprocessing
+        unique_classes = np.unique(y_pseudo_train)
+        if len(unique_classes) < 2:
+            error_msg = (
+                f"HYBRID TRAINING FAILED: Class distribution collapsed to {len(unique_classes)} class(es) "
+                f"after feature selection/preprocessing.\n\n"
+                f"This indicates feature selection eliminated discriminative features.\n\n"
+                f"SOLUTIONS:\n"
+                f"  1. Use larger dataset with more diverse traffic\n"
+                f"  2. Lower contamination threshold\n"
+                f"  3. Reduce chi2_top_k (currently {self.config['chi2_top_k']}) to keep more features\n\n"
+                f"Hybrid detector REQUIRES ensemble classifier - cannot proceed with a single class."
+            )
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            raise ValueError(error_msg)
+
+        print(f"[HYBRID] Class distribution OK: {unique_classes} (counts: {np.bincount(y_pseudo_train)})")
+
+        # Decision Tree
+        dt_classifier = DecisionTreeClassifier(
+            max_depth=self.config['dt_max_depth'],
+            random_state=42,
+            class_weight='balanced'  # Handle imbalance
+        )
+
+        # Random Forest
+        rf_classifier = RandomForestClassifier(
+            n_estimators=self.config['rf_n_estimators'],
+            max_depth=self.config['rf_max_depth'],
+            random_state=42,
+            n_jobs=-1,
+            class_weight='balanced'
+        )
+
+        # XGBoost
+        xgb_classifier = XGBClassifier(
+            n_estimators=self.config['xgb_n_estimators'],
+            max_depth=self.config['xgb_max_depth'],
+            learning_rate=self.config['xgb_learning_rate'],
+            random_state=42,
+            use_label_encoder=False,
+            eval_metric='logloss',
+            scale_pos_weight=len(y_pseudo_train) / max(anomalies_count, 1)  # Handle imbalance
+        )
+
+        # Voting Classifier with weighted voting
+        self.ensemble_classifier = VotingClassifier(
+            estimators=[
+                ('dt', dt_classifier),
+                ('rf', rf_classifier),
+                ('xgb', xgb_classifier)
+            ],
+            voting='soft',  # Use probability averaging
+            weights=self.config['voting_weights']  # [1, 2, 2] - favor RF and XGB
+        )
+
+        # Train ensemble on pseudo-labeled data with error handling
+        try:
+            self.ensemble_classifier.fit(X_scaled, y_pseudo_train, sample_weight=sample_weights)
+            print("[HYBRID] Ensemble .fit() completed successfully")
+        except Exception as e:
+            error_msg = (
+                f"HYBRID TRAINING FAILED: Ensemble .fit() raised exception:\n{str(e)}\n\n"
+                f"This may indicate:\n"
+                f"  - Insufficient data variation\n"
+                f"  - Class imbalance too extreme\n"
+                f"  - Invalid sample weights\n\n"
+                f"Hybrid detector REQUIRES working ensemble classifier."
+            )
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            self.ensemble_classifier = None
+            raise ValueError(error_msg) from e
+
+        # Verify ensemble is functional
+        if self.ensemble_classifier is None:
+            error_msg = "HYBRID TRAINING FAILED: Ensemble classifier is None after fit()"
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            raise ValueError(error_msg)
+
+        # Verify ensemble has predict_proba method
+        if not hasattr(self.ensemble_classifier, 'predict_proba'):
+            error_msg = "HYBRID TRAINING FAILED: Ensemble missing predict_proba method"
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            self.ensemble_classifier = None
+            raise ValueError(error_msg)
+
+        # Verify ensemble can make predictions
+        try:
+            test_proba = self.ensemble_classifier.predict_proba(X_scaled[:1])
+            if test_proba.shape[1] < 2:
+                raise ValueError(f"Ensemble produces {test_proba.shape[1]} classes, need 2")
+            print(f"[HYBRID] ✅ Ensemble verified: produces {test_proba.shape[1]} class probabilities")
+        except Exception as e:
+            error_msg = f"HYBRID TRAINING FAILED: Ensemble cannot make predictions: {str(e)}"
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            self.ensemble_classifier = None
+            raise ValueError(error_msg) from e
+
+        print("[HYBRID] Ensemble training completed and verified!")
+
         # Save models
         self.save_models()
 
-        # Calculate statistics
-        predictions = self.isolation_forest.predict(X_scaled)
-        anomalies = (predictions == -1).sum()
+        # FINAL VERIFICATION: Ensure ensemble is still set after save
+        if self.ensemble_classifier is None:
+            error_msg = "HYBRID TRAINING FAILED: Ensemble became None after save"
+            print(f"\n[HYBRID] ❌ {error_msg}")
+            raise ValueError(error_msg)
+
+        # Calculate statistics - only after ALL verifications passed
         result = {
             'records_processed': len(logs_df),
             'unique_ips': len(features_df),
             'features_total': len(self.feature_names),
             'features_selected': len(self.selected_feature_names),
-            'anomalies_detected': int(anomalies),
+            'features_count': len(self.selected_feature_names),  # For backward compatibility with /train endpoint
+            'anomalies_detected': int(anomalies_count),
             'contamination': self.config['eif_contamination'],
-            'model_type': 'Extended Isolation Forest' if EIF_AVAILABLE else 'Isolation Forest',
-            'status': 'success'
+            'model_type': 'Hybrid (EIF + Ensemble)',
+            'ensemble_models': ['DecisionTree', 'RandomForest', 'XGBoost'],
+            'status': 'success',
+            'ensemble_verified': True  # Explicit flag for verification
         }
 
-        print(f"[HYBRID] Training completed! {anomalies}/{len(features_df)} IPs flagged as anomalies")
+        print(f"[HYBRID] ✅ Training completed successfully! {anomalies_count}/{len(features_df)} IPs flagged as anomalies")
+        print(f"[HYBRID] ✅ Ensemble classifier verified and ready for production")
         return result
 
     def detect(
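The adaptive-percentile fallback added in this hunk distills to the standalone sketch below; the function name and default ratio mirror the diff but are otherwise hypothetical:

import numpy as np

def pseudo_labels_with_fallback(if_scores: np.ndarray,
                                if_predictions: np.ndarray,
                                min_ratio: float = 0.02) -> np.ndarray:
    # Start from the Isolation Forest's own labels (-1 = anomaly)
    y = (if_predictions == -1).astype(int)
    required = max(10, int(len(y) * min_ratio))  # at least 2% or 10 samples
    if y.sum() >= required:
        return y
    # Widen the anomalous tail step by step: lowest-scoring X% become anomalies
    for pct in (5, 10, 15, 20):
        threshold = np.percentile(if_scores, pct)
        y = (if_scores <= threshold).astype(int)
        if y.sum() >= required:
            return y
    if y.sum() < 2:
        raise ValueError("insufficient pseudo-anomalies for ensemble training")
    return y  # fewer than required but enough (>= 2) to form two classes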
@@ -295,16 +499,48 @@
 
         # Apply same feature selection
         X_positive = X.clip(lower=0)
+        X_positive = X_positive + 1e-10  # Add epsilon
         X_selected = self.feature_selector.transform(X_positive)
         X_scaled = self.scaler.transform(X_selected)
 
-        # Predictions from Isolation Forest
-        predictions = self.isolation_forest.predict(X_scaled)
-        scores = self.isolation_forest.score_samples(X_scaled)
-
-        # Normalize scores to 0-100 (lower score = more anomalous)
-        score_min, score_max = scores.min(), scores.max()
-        risk_scores = 100 * (1 - (scores - score_min) / (score_max - score_min + 1e-10))
+        # HYBRID SCORING: Combine Isolation Forest + Ensemble Classifier
+
+        # Step 1: Isolation Forest score (unsupervised anomaly detection)
+        if_predictions = self.isolation_forest.predict(X_scaled)
+        if_scores = self.isolation_forest.score_samples(X_scaled)
+
+        # Normalize IF scores to 0-100 (lower score = more anomalous)
+        if_score_min, if_score_max = if_scores.min(), if_scores.max()
+        if_risk_scores = 100 * (1 - (if_scores - if_score_min) / (if_score_max - if_score_min + 1e-10))
+
+        # Step 2: Ensemble score (supervised classification on pseudo-labels)
+        if self.ensemble_classifier is not None:
+            print(f"[DETECT] Ensemble classifier available - computing hybrid score...")
+
+            # Get ensemble probability predictions
+            ensemble_proba = self.ensemble_classifier.predict_proba(X_scaled)
+            # Probability of being anomaly (class 1)
+            ensemble_anomaly_proba = ensemble_proba[:, 1]
+            # Convert to 0-100 scale
+            ensemble_risk_scores = ensemble_anomaly_proba * 100
+
+            # Combine scores: weighted average (IF: 40%, Ensemble: 60%)
+            # Ensemble gets more weight as it's trained on pseudo-labels
+            risk_scores = 0.4 * if_risk_scores + 0.6 * ensemble_risk_scores
+
+            # Debugging: show score distribution
+            print(f"[DETECT] IF scores: min={if_risk_scores.min():.1f}, max={if_risk_scores.max():.1f}, mean={if_risk_scores.mean():.1f}")
+            print(f"[DETECT] Ensemble scores: min={ensemble_risk_scores.min():.1f}, max={ensemble_risk_scores.max():.1f}, mean={ensemble_risk_scores.mean():.1f}")
+            print(f"[DETECT] Combined scores: min={risk_scores.min():.1f}, max={risk_scores.max():.1f}, mean={risk_scores.mean():.1f}")
+            print(f"[DETECT] ✅ Hybrid scoring active: 40% IF + 60% Ensemble")
+        else:
+            # Fallback to IF-only if ensemble not available
+            risk_scores = if_risk_scores
+            print(f"[DETECT] ⚠️ Ensemble NOT available - using IF-only scoring")
+            print(f"[DETECT] IF scores: min={if_risk_scores.min():.1f}, max={if_risk_scores.max():.1f}, mean={if_risk_scores.mean():.1f}")
+
+        # For backward compatibility
+        predictions = if_predictions
 
         detections = []
         for i, (ip, pred, risk_score) in enumerate(zip(source_ips, predictions, risk_scores)):
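The hybrid scoring in detect() reduces to the following sketch, assuming raw IF scores and ensemble anomaly probabilities as inputs (the helper name is hypothetical):

from typing import Optional

import numpy as np

def hybrid_risk_scores(if_scores: np.ndarray,
                       ensemble_anomaly_proba: Optional[np.ndarray]) -> np.ndarray:
    # Lower IF score = more anomalous, so invert after min-max scaling to 0-100
    lo, hi = if_scores.min(), if_scores.max()
    if_risk = 100 * (1 - (if_scores - lo) / (hi - lo + 1e-10))
    if ensemble_anomaly_proba is None:
        return if_risk  # IF-only fallback when no ensemble is loaded
    # Weighted blend: 40% unsupervised IF, 60% ensemble probability
    return 0.4 * if_risk + 0.6 * (ensemble_anomaly_proba * 100)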
@@ -402,6 +638,11 @@ class MLHybridDetector:
         joblib.dump(self.scaler, self.model_dir / f"scaler_{timestamp}.pkl")
         joblib.dump(self.feature_selector, self.model_dir / f"feature_selector_{timestamp}.pkl")
 
+        # Save ensemble if available
+        if self.ensemble_classifier is not None:
+            joblib.dump(self.ensemble_classifier, self.model_dir / f"ensemble_classifier_{timestamp}.pkl")
+            joblib.dump(self.ensemble_classifier, self.model_dir / "ensemble_classifier_latest.pkl")
+
         # Save latest (symlinks alternative)
         joblib.dump(self.isolation_forest, self.model_dir / "isolation_forest_latest.pkl")
         joblib.dump(self.scaler, self.model_dir / "scaler_latest.pkl")
@@ -414,6 +655,7 @@
             'selected_feature_names': self.selected_feature_names,
             'config': self.config,
             'metrics': self.metrics,
+            'has_ensemble': self.ensemble_classifier is not None,
         }
 
         with open(self.model_dir / f"metadata_{timestamp}.json", 'w') as f:
@@ -423,6 +665,8 @@
             json.dump(metadata, f, indent=2)
 
         print(f"[HYBRID] Models saved to {self.model_dir}")
+        if self.ensemble_classifier is not None:
+            print(f"[HYBRID] Ensemble classifier included")
 
     def load_models(self, version: str = 'latest'):
         """Load models from disk"""
@@ -431,6 +675,15 @@
             self.scaler = joblib.load(self.model_dir / f"scaler_{version}.pkl")
             self.feature_selector = joblib.load(self.model_dir / f"feature_selector_{version}.pkl")
 
+            # Try to load ensemble if available
+            ensemble_path = self.model_dir / f"ensemble_classifier_{version}.pkl"
+            if ensemble_path.exists():
+                self.ensemble_classifier = joblib.load(ensemble_path)
+                print(f"[HYBRID] Ensemble classifier loaded")
+            else:
+                self.ensemble_classifier = None
+                print(f"[HYBRID] No ensemble classifier found (IF-only mode)")
+
             with open(self.model_dir / f"metadata_{version}.json") as f:
                 metadata = json.load(f)
             self.feature_names = metadata['feature_names']
@@ -440,6 +693,12 @@
 
             print(f"[HYBRID] Models loaded (version: {version})")
             print(f"[HYBRID] Selected features: {len(self.selected_feature_names)}/{len(self.feature_names)}")
+
+            if self.ensemble_classifier is not None:
+                print(f"[HYBRID] Mode: Hybrid (IF + Ensemble)")
+            else:
+                print(f"[HYBRID] Mode: IF-only (Ensemble not available)")
+
             return True
         except Exception as e:
             print(f"[HYBRID] Failed to load models: {e}")
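The save/load hunks follow an optional-artifact pattern that can be sketched as below, assuming joblib and a pathlib model directory; the helper names are hypothetical:

from pathlib import Path

import joblib

def save_optional(model, model_dir: Path, name: str, timestamp: str) -> None:
    # Persist an optional artifact only when it exists (versioned + latest copy)
    if model is not None:
        joblib.dump(model, model_dir / f"{name}_{timestamp}.pkl")
        joblib.dump(model, model_dir / f"{name}_latest.pkl")

def load_optional(model_dir: Path, name: str, version: str = "latest"):
    # A missing file is not an error: the detector degrades to IF-only mode
    path = model_dir / f"{name}_{version}.pkl"
    return joblib.load(path) if path.exists() else None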
@@ -286,7 +286,13 @@ def test_on_synthetic(args):
     metrics = validator.calculate(y_true, y_pred)
     validator.print_summary(metrics, title="Synthetic Test Results")
 
-    print("\n✅ System test completed successfully!")
+    print("\n✅ System test completed!")
+
+    # Check if ensemble was trained
+    if detector.ensemble_classifier is None:
+        print("\n⚠️ WARNING: System running in IF-only mode (no ensemble)")
+        print("   This may occur with very clean datasets")
+        print("   Expected metrics will be lower than hybrid mode")
 
     return detector, metrics
 