Add synthetic source_ip columns to the dataset loaders, make the API
endpoints detector-aware (hybrid vs legacy), and validate per source_ip.

Notes on this revision of the patch:
- dataset_loader.py: synthetic CICIDS IPs span 10.x.y.z so that no octet
  exceeds 255 when the frame has more than 65,536 rows.
- train_hybrid.py: y_pred is filled through a positional boolean mask
  rather than by iterrows() index labels.

diff --git a/.replit b/.replit
index bcba2e8..d388bbc 100644
--- a/.replit
+++ b/.replit
@@ -15,7 +15,7 @@ localPort = 5000
 externalPort = 80
 
 [[ports]]
-localPort = 40719
+localPort = 37135
 externalPort = 3001
 
 [[ports]]
diff --git a/python_ml/dataset_loader.py b/python_ml/dataset_loader.py
index ae51a1e..dccdd3f 100644
--- a/python_ml/dataset_loader.py
+++ b/python_ml/dataset_loader.py
@@ -245,6 +245,12 @@ Expected files:
         ids_features['attack_type'] = df['attack_type']
         ids_features['is_attack'] = df['is_attack']
 
+        # Add synthetic source_ip for validation (CICIDS doesn't have this field)
+        # Generate unique IPs 10.x.y.z; every octet stays <= 255 (unique up to 2**24 rows)
+        n_samples = len(df)
+        source_ips = [f"10.{(i >> 16) & 255}.{(i >> 8) & 255}.{i & 255}" for i in range(n_samples)]
+        ids_features['source_ip'] = source_ips
+
         ids_df = pd.DataFrame(ids_features)
 
         # Clip negative values
@@ -354,6 +360,10 @@ Expected files:
         data['is_attack'] = is_attack
         data['attack_type'] = attack_types
 
+        # Add synthetic source_ip (simulate real traffic from 100 unique IPs)
+        unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
+        data['source_ip'] = np.random.choice(unique_ips, n_samples)
+
         df = pd.DataFrame(data)
 
         # Make attacks more extreme
diff --git a/python_ml/main.py b/python_ml/main.py
index c6e4a61..ed87083 100644
--- a/python_ml/main.py
+++ b/python_ml/main.py
@@ -113,11 +113,21 @@ class UnblockIPRequest(BaseModel):
 
 @app.get("/")
 async def root():
+    # Check which detector is active
+    if USE_HYBRID_DETECTOR:
+        model_loaded = ml_detector.isolation_forest is not None
+        model_type = "hybrid"
+    else:
+        model_loaded = ml_analyzer.model is not None
+        model_type = "legacy"
+
     return {
         "service": "IDS API",
-        "version": "1.0.0",
+        "version": "2.0.0",
         "status": "running",
-        "model_loaded": ml_analyzer.model is not None
+        "model_type": model_type,
+        "model_loaded": model_loaded,
+        "use_hybrid": USE_HYBRID_DETECTOR
     }
 
 @app.get("/health")
@@ -130,10 +140,19 @@ async def health_check():
     except Exception as e:
         db_status = f"error: {str(e)}"
 
+    # Check model status
+    if USE_HYBRID_DETECTOR:
+        model_status = "loaded" if ml_detector.isolation_forest is not None else "not_loaded"
+        model_type = "hybrid (EIF + Feature Selection)"
+    else:
+        model_status = "loaded" if ml_analyzer.model is not None else "not_loaded"
+        model_type = "legacy (Isolation Forest)"
+
     return {
         "status": "healthy",
         "database": db_status,
-        "ml_model": "loaded" if ml_analyzer.model is not None else "not_loaded",
+        "ml_model": model_status,
+        "ml_model_type": model_type,
         "timestamp": datetime.now().isoformat()
     }
 
@@ -171,9 +190,14 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
         # Converti in DataFrame
         df = pd.DataFrame(logs)
 
-        # Training
+        # Training - usa detector appropriato
         print("[TRAIN] Addestramento modello...")
-        result = ml_analyzer.train(df, contamination=request.contamination)
+        if USE_HYBRID_DETECTOR:
+            print("[TRAIN] Using Hybrid ML Detector")
+            result = ml_detector.train_unsupervised(df)
+        else:
+            print("[TRAIN] Using Legacy ML Analyzer")
+            result = ml_analyzer.train(df, contamination=request.contamination)
         print(f"[TRAIN] Modello addestrato: {result}")
 
         # Salva nel database
@@ -225,13 +249,23 @@ async def detect_anomalies(request: DetectRequest):
     Rileva anomalie nei log recenti
     Opzionalmente blocca automaticamente IP anomali
     """
-    if ml_analyzer.model is None:
-        # Prova a caricare modello salvato
-        if not ml_analyzer.load_model():
-            raise HTTPException(
-                status_code=400,
-                detail="Modello non addestrato. Esegui /train prima."
-            )
+    # Check model loaded
+    if USE_HYBRID_DETECTOR:
+        if ml_detector.isolation_forest is None:
+            # Try to load
+            if not ml_detector.load_models():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello hybrid non addestrato. Esegui /train prima."
+                )
+    else:
+        if ml_analyzer.model is None:
+            # Prova a caricare modello salvato
+            if not ml_analyzer.load_model():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello non addestrato. Esegui /train prima."
+                )
 
     try:
         conn = get_db_connection()
@@ -254,8 +288,17 @@ async def detect_anomalies(request: DetectRequest):
         # Converti in DataFrame
         df = pd.DataFrame(logs)
 
-        # Detection
-        detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
+        # Detection - usa detector appropriato
+        if USE_HYBRID_DETECTOR:
+            print("[DETECT] Using Hybrid ML Detector")
+            # Hybrid detector returns different format
+            detections = ml_detector.detect(df, mode='confidence')
+            # Convert to legacy format for compatibility
+            for det in detections:
+                det['confidence'] = det['confidence_level'] # Map confidence_level to confidence
+        else:
+            print("[DETECT] Using Legacy ML Analyzer")
+            detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
 
         # Geolocation lookup service - BATCH ASYNC per performance
         geo_service = get_geo_service()
diff --git a/python_ml/train_hybrid.py b/python_ml/train_hybrid.py
index 367e3c9..e676f94 100644
--- a/python_ml/train_hybrid.py
+++ b/python_ml/train_hybrid.py
@@ -170,16 +170,20 @@ def validate_with_cicids(args):
     detections = detector.detect(test_logs, mode='all')
 
     # Convert detections to binary predictions
-    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= 60}
+    # Create set of detected IPs with risk_score >= 60 (configurable threshold)
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
 
-    # Create predictions array
+    print(f"[VALIDATE] Detected {len(detected_ips)} unique IPs above threshold {detection_threshold}")
+
+    # Create predictions array by mapping source_ip
     y_true = test_df['is_attack'].values
     y_pred = np.zeros(len(test_df), dtype=int)
 
-    # This is approximate - in real scenario each row would have source_ip
-    # For now, mark all as detected if any IP detected
-    if len(detected_ips) > 0:
-        y_pred = np.where(test_df.index.isin(range(int(len(detected_ips) * len(test_df) / test_df['is_attack'].sum()))), 1, 0)
+    # Map detections to test_df rows using source_ip. Use a positional boolean
+    # mask: iterrows() yields index labels, which need not be 0..n-1 positions
+    # (e.g. after a train/test split), so y_pred[label] could hit wrong rows.
+    y_pred[test_df['source_ip'].isin(detected_ips).values] = 1
 
     # Calculate metrics
     print("\n[VALIDATE] Calculating validation metrics...")
@@ -266,11 +270,17 @@ def test_on_synthetic(args):
         print(f" {i}. {d['source_ip']}: risk={d['risk_score']:.1f}, "
               f"type={d['anomaly_type']}, confidence={d['confidence_level']}")
 
-    # Simple validation
+    # Validation - map detections to test_df rows using source_ip
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
+
     y_true = test_df['is_attack'].values
-    detected_indices = test_df.index[:len(detections)] # Simplified
     y_pred = np.zeros(len(test_df), dtype=int)
-    y_pred[detected_indices] = 1
+
+    # Map detections to test_df rows with a positional boolean mask, so the
+    # result does not depend on test_df having a 0..n-1 index (iterrows()
+    # yields index labels, not positions).
+    y_pred[test_df['source_ip'].isin(detected_ips).values] = 1
 
     validator = ValidationMetrics()
     metrics = validator.calculate(y_true, y_pred)