Update system to use hybrid detector and improve validation accuracy

Update the main.py endpoints to use the hybrid detector, and improve the validation logic in train_hybrid.py by mapping detections back to test rows via source_ip. Also add a synthetic source_ip column in dataset_loader.py for both the CICIDS2017 and synthetic datasets.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 5c4982f1-3d37-47da-9253-c04888f5ff64
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
This commit is contained in:
marco370 2025-11-24 16:02:49 +00:00
parent 4bc4bc5a31
commit 8b16800bb6
4 changed files with 87 additions and 24 deletions

View File

@ -15,7 +15,7 @@ localPort = 5000
externalPort = 80
[[ports]]
localPort = 40719
localPort = 37135
externalPort = 3001
[[ports]]

View File

@ -245,6 +245,12 @@ Expected files:
ids_features['attack_type'] = df['attack_type']
ids_features['is_attack'] = df['is_attack']
# Add synthetic source_ip for validation (CICIDS doesn't have this field)
# Generate unique IP-style labels 10.0.{i//256}.{i%256}; the third octet exceeds 255 (no longer a valid IPv4 address) beyond 65,536 rows
n_samples = len(df)
source_ips = [f"10.0.{i//256}.{i%256}" for i in range(n_samples)]
ids_features['source_ip'] = source_ips
ids_df = pd.DataFrame(ids_features)
# Clip negative values
@ -354,6 +360,10 @@ Expected files:
data['is_attack'] = is_attack
data['attack_type'] = attack_types
# Add synthetic source_ip: each row is randomly assigned one of 100 addresses (192.168.0.0-192.168.0.99) to simulate real traffic
unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
data['source_ip'] = np.random.choice(unique_ips, n_samples)
df = pd.DataFrame(data)
# Make attacks more extreme

View File

@ -113,11 +113,21 @@ class UnblockIPRequest(BaseModel):
@app.get("/")
async def root():
# Check which detector is active
if USE_HYBRID_DETECTOR:
model_loaded = ml_detector.isolation_forest is not None
model_type = "hybrid"
else:
model_loaded = ml_analyzer.model is not None
model_type = "legacy"
return {
"service": "IDS API",
"version": "1.0.0",
"version": "2.0.0",
"status": "running",
"model_loaded": ml_analyzer.model is not None
"model_type": model_type,
"model_loaded": model_loaded,
"use_hybrid": USE_HYBRID_DETECTOR
}
@app.get("/health")
@ -130,10 +140,19 @@ async def health_check():
except Exception as e:
db_status = f"error: {str(e)}"
# Check model status
if USE_HYBRID_DETECTOR:
model_status = "loaded" if ml_detector.isolation_forest is not None else "not_loaded"
model_type = "hybrid (EIF + Feature Selection)"
else:
model_status = "loaded" if ml_analyzer.model is not None else "not_loaded"
model_type = "legacy (Isolation Forest)"
return {
"status": "healthy",
"database": db_status,
"ml_model": "loaded" if ml_analyzer.model is not None else "not_loaded",
"ml_model": model_status,
"ml_model_type": model_type,
"timestamp": datetime.now().isoformat()
}
@ -171,8 +190,13 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
# Converti in DataFrame
df = pd.DataFrame(logs)
# Training
# Training - usa detector appropriato
print("[TRAIN] Addestramento modello...")
if USE_HYBRID_DETECTOR:
print("[TRAIN] Using Hybrid ML Detector")
result = ml_detector.train_unsupervised(df)
else:
print("[TRAIN] Using Legacy ML Analyzer")
result = ml_analyzer.train(df, contamination=request.contamination)
print(f"[TRAIN] Modello addestrato: {result}")
@ -225,6 +249,16 @@ async def detect_anomalies(request: DetectRequest):
Rileva anomalie nei log recenti
Opzionalmente blocca automaticamente IP anomali
"""
# Check model loaded
if USE_HYBRID_DETECTOR:
if ml_detector.isolation_forest is None:
# Try to load
if not ml_detector.load_models():
raise HTTPException(
status_code=400,
detail="Modello hybrid non addestrato. Esegui /train prima."
)
else:
if ml_analyzer.model is None:
# Prova a caricare modello salvato
if not ml_analyzer.load_model():
@ -254,7 +288,16 @@ async def detect_anomalies(request: DetectRequest):
# Converti in DataFrame
df = pd.DataFrame(logs)
# Detection
# Detection - usa detector appropriato
if USE_HYBRID_DETECTOR:
print("[DETECT] Using Hybrid ML Detector")
# Hybrid detector returns different format
detections = ml_detector.detect(df, mode='confidence')
# Convert to legacy format for compatibility
for det in detections:
det['confidence'] = det['confidence_level'] # Map confidence_level to confidence
else:
print("[DETECT] Using Legacy ML Analyzer")
detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
# Geolocation lookup service - BATCH ASYNC per performance

View File

@ -170,16 +170,20 @@ def validate_with_cicids(args):
detections = detector.detect(test_logs, mode='all')
# Convert detections to binary predictions
detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= 60}
# Create set of detected IPs with risk_score >= detection_threshold (hard-coded to 60 below)
detection_threshold = 60
detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
# Create predictions array
print(f"[VALIDATE] Detected {len(detected_ips)} unique IPs above threshold {detection_threshold}")
# Create predictions array by mapping source_ip
y_true = test_df['is_attack'].values
y_pred = np.zeros(len(test_df), dtype=int)
# This is approximate - in real scenario each row would have source_ip
# For now, mark all as detected if any IP detected
if len(detected_ips) > 0:
y_pred = np.where(test_df.index.isin(range(int(len(detected_ips) * len(test_df) / test_df['is_attack'].sum()))), 1, 0)
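# Map detections to test_df rows using source_ip; y_pred is indexed with the iterrows() label, so this relies on test_df having a default 0..n-1 index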
for i, row in test_df.iterrows():
if row['source_ip'] in detected_ips:
y_pred[i] = 1
# Calculate metrics
print("\n[VALIDATE] Calculating validation metrics...")
@ -266,11 +270,17 @@ def test_on_synthetic(args):
print(f" {i}. {d['source_ip']}: risk={d['risk_score']:.1f}, "
f"type={d['anomaly_type']}, confidence={d['confidence_level']}")
# Simple validation
# Validation - map detections to test_df rows using source_ip
detection_threshold = 60
detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
y_true = test_df['is_attack'].values
detected_indices = test_df.index[:len(detections)] # Simplified
y_pred = np.zeros(len(test_df), dtype=int)
y_pred[detected_indices] = 1
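# Map detections to test_df rows using source_ip; y_pred is indexed with the iterrows() label, so this relies on test_df having a default 0..n-1 index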
for i, row in test_df.iterrows():
if row['source_ip'] in detected_ips:
y_pred[i] = 1
validator = ValidationMetrics()
metrics = validator.calculate(y_true, y_pred)