Update system to use hybrid detector and improve validation accuracy

Update main.py endpoints to use the hybrid detector and improve validation logic in train_hybrid.py by mapping detections using source_ip. Also, add synthetic source_ip to dataset_loader.py for both CICIDS2017 and synthetic datasets.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 5c4982f1-3d37-47da-9253-c04888f5ff64
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
2025-11-24 16:02:49 +00:00 · 2025-11-24 16:02:49 +00:00 · 8b16800bb6
commit 8b16800bb6
parent 4bc4bc5a31
4 changed files with 87 additions and 24 deletions
--- a/.replit
+++ b/.replit
@@ -15,7 +15,7 @@ localPort = 5000
 externalPort = 80

 [[ports]]
-localPort = 40719
+localPort = 37135
 externalPort = 3001

 [[ports]]
--- a/python_ml/dataset_loader.py
+++ b/python_ml/dataset_loader.py
@@ -245,6 +245,12 @@ Expected files:
        ids_features['attack_type'] = df['attack_type']
        ids_features['is_attack'] = df['is_attack']
        
+        # Add synthetic source_ip for validation (CICIDS doesn't have this field)
+        # Generate unique IPs: 10.0.x.y format
+        n_samples = len(df)
+        source_ips = [f"10.0.{i//256}.{i%256}" for i in range(n_samples)]
+        ids_features['source_ip'] = source_ips
+        
        ids_df = pd.DataFrame(ids_features)
        
        # Clip negative values
@@ -354,6 +360,10 @@ Expected files:
        data['is_attack'] = is_attack
        data['attack_type'] = attack_types
        
+        # Add synthetic source_ip (simulate real traffic from 100 unique IPs)
+        unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
+        data['source_ip'] = np.random.choice(unique_ips, n_samples)
+        
        df = pd.DataFrame(data)
        
        # Make attacks more extreme
--- a/python_ml/main.py
+++ b/python_ml/main.py
@@ -113,11 +113,21 @@ class UnblockIPRequest(BaseModel):

@app.get("/")
 async def root():
+    # Check which detector is active
+    if USE_HYBRID_DETECTOR:
+        model_loaded = ml_detector.isolation_forest is not None
+        model_type = "hybrid"
+    else:
+        model_loaded = ml_analyzer.model is not None
+        model_type = "legacy"
+    
    return {
        "service": "IDS API",
-        "version": "1.0.0",
+        "version": "2.0.0",
        "status": "running",
-        "model_loaded": ml_analyzer.model is not None
+        "model_type": model_type,
+        "model_loaded": model_loaded,
+        "use_hybrid": USE_HYBRID_DETECTOR
    }

@app.get("/health")
@@ -130,10 +140,19 @@ async def health_check():
    except Exception as e:
        db_status = f"error: {str(e)}"
    
+    # Check model status
+    if USE_HYBRID_DETECTOR:
+        model_status = "loaded" if ml_detector.isolation_forest is not None else "not_loaded"
+        model_type = "hybrid (EIF + Feature Selection)"
+    else:
+        model_status = "loaded" if ml_analyzer.model is not None else "not_loaded"
+        model_type = "legacy (Isolation Forest)"
+    
    return {
        "status": "healthy",
        "database": db_status,
-        "ml_model": "loaded" if ml_analyzer.model is not None else "not_loaded",
+        "ml_model": model_status,
+        "ml_model_type": model_type,
        "timestamp": datetime.now().isoformat()
    }

@@ -171,8 +190,13 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
            # Converti in DataFrame
            df = pd.DataFrame(logs)
            
-            # Training
+            # Training - usa detector appropriato
            print("[TRAIN] Addestramento modello...")
+            if USE_HYBRID_DETECTOR:
+                print("[TRAIN] Using Hybrid ML Detector")
+                result = ml_detector.train_unsupervised(df)
+            else:
+                print("[TRAIN] Using Legacy ML Analyzer")
                result = ml_analyzer.train(df, contamination=request.contamination)
            print(f"[TRAIN] Modello addestrato: {result}")
            
@@ -225,6 +249,16 @@ async def detect_anomalies(request: DetectRequest):
    Rileva anomalie nei log recenti
    Opzionalmente blocca automaticamente IP anomali
    """
+    # Check model loaded
+    if USE_HYBRID_DETECTOR:
+        if ml_detector.isolation_forest is None:
+            # Try to load
+            if not ml_detector.load_models():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello hybrid non addestrato. Esegui /train prima."
+                )
+    else:
        if ml_analyzer.model is None:
            # Prova a caricare modello salvato
            if not ml_analyzer.load_model():
@@ -254,7 +288,16 @@ async def detect_anomalies(request: DetectRequest):
        # Converti in DataFrame
        df = pd.DataFrame(logs)
        
-        # Detection
+        # Detection - usa detector appropriato
+        if USE_HYBRID_DETECTOR:
+            print("[DETECT] Using Hybrid ML Detector")
+            # Hybrid detector returns different format
+            detections = ml_detector.detect(df, mode='confidence')
+            # Convert to legacy format for compatibility
+            for det in detections:
+                det['confidence'] = det['confidence_level']  # Map confidence_level to confidence
+        else:
+            print("[DETECT] Using Legacy ML Analyzer")
            detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
        
        # Geolocation lookup service - BATCH ASYNC per performance
--- a/python_ml/train_hybrid.py
+++ b/python_ml/train_hybrid.py
@@ -170,16 +170,20 @@ def validate_with_cicids(args):
    detections = detector.detect(test_logs, mode='all')
    
    # Convert detections to binary predictions
-    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= 60}
+    # Create set of detected IPs with risk_score >= 60 (configurable threshold)
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
    
-    # Create predictions array
+    print(f"[VALIDATE] Detected {len(detected_ips)} unique IPs above threshold {detection_threshold}")
+    
+    # Create predictions array by mapping source_ip
    y_true = test_df['is_attack'].values
    y_pred = np.zeros(len(test_df), dtype=int)
    
-    # This is approximate - in real scenario each row would have source_ip
-    # For now, mark all as detected if any IP detected
-    if len(detected_ips) > 0:
-        y_pred = np.where(test_df.index.isin(range(int(len(detected_ips) * len(test_df) / test_df['is_attack'].sum()))), 1, 0)
+    # Map detections to test_df rows using source_ip
+    for i, row in test_df.iterrows():
+        if row['source_ip'] in detected_ips:
+            y_pred[i] = 1
    
    # Calculate metrics
    print("\n[VALIDATE] Calculating validation metrics...")
@@ -266,11 +270,17 @@ def test_on_synthetic(args):
        print(f"  {i}. {d['source_ip']}: risk={d['risk_score']:.1f}, "
              f"type={d['anomaly_type']}, confidence={d['confidence_level']}")
    
-    # Simple validation
+    # Validation - map detections to test_df rows using source_ip
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
+    
    y_true = test_df['is_attack'].values
-    detected_indices = test_df.index[:len(detections)]  # Simplified
    y_pred = np.zeros(len(test_df), dtype=int)
-    y_pred[detected_indices] = 1
+    
+    # Map detections to test_df rows
+    for i, row in test_df.iterrows():
+        if row['source_ip'] in detected_ips:
+            y_pred[i] = 1
    
    validator = ValidationMetrics()
    metrics = validator.calculate(y_true, y_pred)