Update system to use hybrid detector and improve validation accuracy
Update main.py endpoints to use the hybrid detector and improve validation logic in train_hybrid.py by mapping detections using source_ip. Also, add synthetic source_ip to dataset_loader.py for both CICIDS2017 and synthetic datasets.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 5c4982f1-3d37-47da-9253-c04888f5ff64
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
parent 4bc4bc5a31
commit 8b16800bb6
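The heart of the change, in miniature: instead of approximating which rows a detection covers, every row now carries a source_ip, and IP-level detections are mapped back to per-row predictions through that key. A minimal sketch of the idea (toy data and values, not repo code):

```python
import pandas as pd

# Toy data: one synthetic source_ip per row, plus ground-truth labels.
df = pd.DataFrame({
    "source_ip": [f"10.0.{i//256}.{i%256}" for i in range(6)],
    "is_attack": [0, 0, 1, 0, 1, 0],
})

# Pretend detector output: per-IP detections with risk scores.
detections = [
    {"source_ip": "10.0.0.2", "risk_score": 91.0},
    {"source_ip": "10.0.0.4", "risk_score": 77.5},
]

# Map IP-level detections back to per-row 0/1 predictions.
detected_ips = {d["source_ip"] for d in detections if d["risk_score"] >= 60}
y_pred = df["source_ip"].isin(detected_ips).astype(int).to_numpy()
print((y_pred == df["is_attack"].to_numpy()).mean())  # 1.0 on this toy example
```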
.replit
@@ -15,7 +15,7 @@ localPort = 5000
 externalPort = 80
 
 [[ports]]
-localPort = 40719
+localPort = 37135
 externalPort = 3001
 
 [[ports]]
dataset_loader.py
@@ -245,6 +245,12 @@ Expected files:
         ids_features['attack_type'] = df['attack_type']
         ids_features['is_attack'] = df['is_attack']
 
+        # Add synthetic source_ip for validation (CICIDS doesn't have this field)
+        # Generate unique IPs: 10.0.x.y format
+        n_samples = len(df)
+        source_ips = [f"10.0.{i//256}.{i%256}" for i in range(n_samples)]
+        ids_features['source_ip'] = source_ips
+
         ids_df = pd.DataFrame(ids_features)
 
         # Clip negative values
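One caveat with the `10.0.{i//256}.{i%256}` scheme: `i//256` exceeds 255 once the dataset passes 65,536 rows, which CICIDS2017 easily does, so some generated strings are not well-formed IPv4 addresses. Uniqueness is all the validation needs, so this is harmless here, but a bit-shift split keeps every address valid; a sketch with a hypothetical helper name:

```python
# Hypothetical alternative, not in the commit: spread the counter across three
# octets so every generated address stays a valid IPv4 string.
def make_unique_ip(i: int) -> str:
    return f"10.{(i >> 16) & 255}.{(i >> 8) & 255}.{i & 255}"

print(make_unique_ip(70_000))  # "10.1.17.112" -- vs invalid "10.0.273.112" with i//256
```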
@@ -354,6 +360,10 @@ Expected files:
     data['is_attack'] = is_attack
     data['attack_type'] = attack_types
 
+    # Add synthetic source_ip (simulate real traffic from 100 unique IPs)
+    unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
+    data['source_ip'] = np.random.choice(unique_ips, n_samples)
+
     df = pd.DataFrame(data)
 
     # Make attacks more extreme
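Since `i` only runs to 99 here, `i//256` is always 0 and the pool is simply `192.168.0.0` through `192.168.0.99`; with `np.random.choice`, each of the `n_samples` rows then shares its address with roughly `n_samples/100` others. A seeded sketch of that sharing (illustrative, not repo code):

```python
import numpy as np

unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]  # all 192.168.0.x
rng = np.random.default_rng(0)  # seeded here for reproducibility
source_ip = rng.choice(unique_ips, 10_000)
_, counts = np.unique(source_ip, return_counts=True)
print(counts.mean())  # ~100 rows share each IP on average
```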
main.py
@@ -113,11 +113,21 @@ class UnblockIPRequest(BaseModel):
 
 @app.get("/")
 async def root():
+    # Check which detector is active
+    if USE_HYBRID_DETECTOR:
+        model_loaded = ml_detector.isolation_forest is not None
+        model_type = "hybrid"
+    else:
+        model_loaded = ml_analyzer.model is not None
+        model_type = "legacy"
+
     return {
         "service": "IDS API",
-        "version": "1.0.0",
+        "version": "2.0.0",
         "status": "running",
-        "model_loaded": ml_analyzer.model is not None
+        "model_type": model_type,
+        "model_loaded": model_loaded,
+        "use_hybrid": USE_HYBRID_DETECTOR
     }
 
 @app.get("/health")
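A quick way to see the new fields (host and port are assumptions taken from `.replit`'s `localPort = 5000`; the field names come from the diff above):

```python
import requests

info = requests.get("http://localhost:5000/").json()
print(info["version"])     # "2.0.0" after this change
print(info["model_type"])  # "hybrid" or "legacy"
print(info["use_hybrid"])  # mirrors USE_HYBRID_DETECTOR
```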
@@ -130,10 +140,19 @@ async def health_check():
     except Exception as e:
         db_status = f"error: {str(e)}"
 
+    # Check model status
+    if USE_HYBRID_DETECTOR:
+        model_status = "loaded" if ml_detector.isolation_forest is not None else "not_loaded"
+        model_type = "hybrid (EIF + Feature Selection)"
+    else:
+        model_status = "loaded" if ml_analyzer.model is not None else "not_loaded"
+        model_type = "legacy (Isolation Forest)"
+
     return {
         "status": "healthy",
         "database": db_status,
-        "ml_model": "loaded" if ml_analyzer.model is not None else "not_loaded",
+        "ml_model": model_status,
+        "ml_model_type": model_type,
         "timestamp": datetime.now().isoformat()
     }
 
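Both `/` and `/health` now repeat the same detector dispatch. A possible follow-up refactor, sketched here rather than part of the commit, would centralize it (the globals `USE_HYBRID_DETECTOR`, `ml_detector`, `ml_analyzer` are those used in the diff):

```python
def active_model_info() -> tuple[bool, str]:
    """Return (loaded, type_label) for whichever detector is active."""
    if USE_HYBRID_DETECTOR:
        return ml_detector.isolation_forest is not None, "hybrid (EIF + Feature Selection)"
    return ml_analyzer.model is not None, "legacy (Isolation Forest)"
```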
@@ -171,9 +190,14 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
     # Convert to DataFrame
     df = pd.DataFrame(logs)
 
-    # Training
+    # Training - use the appropriate detector
     print("[TRAIN] Addestramento modello...")
-    result = ml_analyzer.train(df, contamination=request.contamination)
+    if USE_HYBRID_DETECTOR:
+        print("[TRAIN] Using Hybrid ML Detector")
+        result = ml_detector.train_unsupervised(df)
+    else:
+        print("[TRAIN] Using Legacy ML Analyzer")
+        result = ml_analyzer.train(df, contamination=request.contamination)
     print(f"[TRAIN] Modello addestrato: {result}")
 
     # Save to database
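Calling the endpoint is unchanged; note that `contamination` now only reaches the legacy path, since `train_unsupervised(df)` is invoked without it. An illustrative request (host/port assumed, and any other `TrainRequest` fields are not shown):

```python
import requests

resp = requests.post(
    "http://localhost:5000/train",
    json={"contamination": 0.05},  # used by the legacy analyzer, ignored by the hybrid path
)
print(resp.json())
```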
@@ -225,13 +249,23 @@ async def detect_anomalies(request: DetectRequest):
     Detect anomalies in recent logs
     Optionally auto-block anomalous IPs
     """
-    if ml_analyzer.model is None:
-        # Try to load saved model
-        if not ml_analyzer.load_model():
-            raise HTTPException(
-                status_code=400,
-                detail="Modello non addestrato. Esegui /train prima."
-            )
+    # Check model loaded
+    if USE_HYBRID_DETECTOR:
+        if ml_detector.isolation_forest is None:
+            # Try to load
+            if not ml_detector.load_models():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello hybrid non addestrato. Esegui /train prima."
+                )
+    else:
+        if ml_analyzer.model is None:
+            # Try to load saved model
+            if not ml_analyzer.load_model():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello non addestrato. Esegui /train prima."
+                )
 
     try:
         conn = get_db_connection()
@@ -254,8 +288,17 @@ async def detect_anomalies(request: DetectRequest):
     # Convert to DataFrame
     df = pd.DataFrame(logs)
 
-    # Detection
-    detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
+    # Detection - use the appropriate detector
+    if USE_HYBRID_DETECTOR:
+        print("[DETECT] Using Hybrid ML Detector")
+        # Hybrid detector returns different format
+        detections = ml_detector.detect(df, mode='confidence')
+        # Convert to legacy format for compatibility
+        for det in detections:
+            det['confidence'] = det['confidence_level']  # Map confidence_level to confidence
+    else:
+        print("[DETECT] Using Legacy ML Analyzer")
+        detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
 
     # Geolocation lookup service - BATCH ASYNC for performance
     geo_service = get_geo_service()
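The shim above leaves both keys on each detection, so downstream consumers can read either. Piecing together the fields this commit actually touches (values invented for illustration; real hybrid detections may carry more):

```python
# Keys shown are the ones referenced in this commit (the main.py shim and the
# train_hybrid.py prints); everything else about the record is an assumption.
detection = {
    "source_ip": "192.168.0.42",
    "risk_score": 87.5,
    "anomaly_type": "ddos",      # example value; field printed in train_hybrid.py
    "confidence_level": "high",
}
detection["confidence"] = detection["confidence_level"]  # legacy-compat alias
print(detection["confidence"])
```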
train_hybrid.py
@@ -170,16 +170,20 @@ def validate_with_cicids(args):
     detections = detector.detect(test_logs, mode='all')
 
     # Convert detections to binary predictions
-    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= 60}
+    # Create set of detected IPs with risk_score >= 60 (configurable threshold)
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
 
-    # Create predictions array
+    print(f"[VALIDATE] Detected {len(detected_ips)} unique IPs above threshold {detection_threshold}")
+
+    # Create predictions array by mapping source_ip
     y_true = test_df['is_attack'].values
     y_pred = np.zeros(len(test_df), dtype=int)
 
-    # This is approximate - in real scenario each row would have source_ip
-    # For now, mark all as detected if any IP detected
-    if len(detected_ips) > 0:
-        y_pred = np.where(test_df.index.isin(range(int(len(detected_ips) * len(test_df) / test_df['is_attack'].sum()))), 1, 0)
+    # Map detections to test_df rows using source_ip
+    for i, row in test_df.iterrows():
+        if row['source_ip'] in detected_ips:
+            y_pred[i] = 1
 
     # Calculate metrics
     print("\n[VALIDATE] Calculating validation metrics...")
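The new row-wise loop is correct, but `iterrows()` is slow at CICIDS scale, and `y_pred[i] = 1` quietly assumes the DataFrame index is a clean `0..n-1` range. An equivalent vectorized form, assuming `test_df` and `detected_ips` as defined above:

```python
# One pass over the source_ip column; no reliance on index labels.
y_pred = test_df['source_ip'].isin(detected_ips).to_numpy().astype(int)
```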
@@ -266,11 +270,17 @@ def test_on_synthetic(args):
         print(f"  {i}. {d['source_ip']}: risk={d['risk_score']:.1f}, "
               f"type={d['anomaly_type']}, confidence={d['confidence_level']}")
 
-    # Simple validation
+    # Validation - map detections to test_df rows using source_ip
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
+
     y_true = test_df['is_attack'].values
-    detected_indices = test_df.index[:len(detections)]  # Simplified
     y_pred = np.zeros(len(test_df), dtype=int)
-    y_pred[detected_indices] = 1
+
+    # Map detections to test_df rows
+    for i, row in test_df.iterrows():
+        if row['source_ip'] in detected_ips:
+            y_pred[i] = 1
 
     validator = ValidationMetrics()
     metrics = validator.calculate(y_true, y_pred)
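One caveat specific to the synthetic set: with only 100 shared IPs, flagging an IP flags every row that drew it, so any IP carrying both benign and attack rows produces guaranteed misclassifications. A quick check of how often that happens (a sketch, assuming `test_df` with the `source_ip` and `is_attack` columns used above):

```python
# IPs whose rows carry both labels put a hard ceiling on per-row precision.
g = test_df.groupby("source_ip")["is_attack"].agg(["min", "max"])
mixed = g[g["min"] != g["max"]]
print(f"{len(mixed)} of {len(g)} IPs carry mixed labels")
```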