Update system to use hybrid detector and improve validation accuracy
Update main.py endpoints to use the hybrid detector and improve validation logic in train_hybrid.py by mapping detections using source_ip. Also, add synthetic source_ip to dataset_loader.py for both CICIDS2017 and synthetic datasets.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 5c4982f1-3d37-47da-9253-c04888f5ff64
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
parent 4bc4bc5a31
commit 8b16800bb6
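The heart of the change, in miniature: instead of approximating which rows a detection covers, every row now carries a source_ip, and IP-level detections are mapped back to per-row predictions through that key. A minimal sketch of the idea (toy data and values, not repo code):

```python
import pandas as pd

# Toy data: one synthetic source_ip per row, plus ground-truth labels.
df = pd.DataFrame({
    "source_ip": [f"10.0.{i//256}.{i%256}" for i in range(6)],
    "is_attack": [0, 0, 1, 0, 1, 0],
})

# Pretend detector output: per-IP detections with risk scores.
detections = [
    {"source_ip": "10.0.0.2", "risk_score": 91.0},
    {"source_ip": "10.0.0.4", "risk_score": 77.5},
]

# Map IP-level detections back to per-row 0/1 predictions.
detected_ips = {d["source_ip"] for d in detections if d["risk_score"] >= 60}
y_pred = df["source_ip"].isin(detected_ips).astype(int).to_numpy()
print((y_pred == df["is_attack"].to_numpy()).mean())  # 1.0 on this toy example
```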
.replit
@@ -15,7 +15,7 @@ localPort = 5000
 externalPort = 80
 
 [[ports]]
-localPort = 40719
+localPort = 37135
 externalPort = 3001
 
 [[ports]]
dataset_loader.py
@@ -245,6 +245,12 @@ Expected files:
         ids_features['attack_type'] = df['attack_type']
         ids_features['is_attack'] = df['is_attack']
 
+        # Add synthetic source_ip for validation (CICIDS doesn't have this field)
+        # Generate unique IPs: 10.0.x.y format
+        n_samples = len(df)
+        source_ips = [f"10.0.{i//256}.{i%256}" for i in range(n_samples)]
+        ids_features['source_ip'] = source_ips
+
         ids_df = pd.DataFrame(ids_features)
 
         # Clip negative values
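One caveat with the `10.0.{i//256}.{i%256}` scheme: `i//256` exceeds 255 once the dataset passes 65,536 rows, which CICIDS2017 easily does, so some generated strings are not well-formed IPv4 addresses. Uniqueness is all the validation needs, so this is harmless here, but a bit-shift split keeps every address valid; a sketch with a hypothetical helper name:

```python
# Hypothetical alternative, not in the commit: spread the counter across three
# octets so every generated address stays a valid IPv4 string.
def make_unique_ip(i: int) -> str:
    return f"10.{(i >> 16) & 255}.{(i >> 8) & 255}.{i & 255}"

print(make_unique_ip(70_000))  # "10.1.17.112" -- vs invalid "10.0.273.112" with i//256
```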
@@ -354,6 +360,10 @@ Expected files:
     data['is_attack'] = is_attack
     data['attack_type'] = attack_types
 
+    # Add synthetic source_ip (simulate real traffic from 100 unique IPs)
+    unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
+    data['source_ip'] = np.random.choice(unique_ips, n_samples)
+
     df = pd.DataFrame(data)
 
     # Make attacks more extreme
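Since `i` only runs to 99 here, `i//256` is always 0 and the pool is simply `192.168.0.0` through `192.168.0.99`; with `np.random.choice`, each of the `n_samples` rows then shares its address with roughly `n_samples/100` others. A seeded sketch of that sharing (illustrative, not repo code):

```python
import numpy as np

unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]  # all 192.168.0.x
rng = np.random.default_rng(0)  # seeded here for reproducibility
source_ip = rng.choice(unique_ips, 10_000)
_, counts = np.unique(source_ip, return_counts=True)
print(counts.mean())  # ~100 rows share each IP on average
```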
main.py
@@ -113,11 +113,21 @@ class UnblockIPRequest(BaseModel):
 
 @app.get("/")
 async def root():
+    # Check which detector is active
+    if USE_HYBRID_DETECTOR:
+        model_loaded = ml_detector.isolation_forest is not None
+        model_type = "hybrid"
+    else:
+        model_loaded = ml_analyzer.model is not None
+        model_type = "legacy"
+
     return {
         "service": "IDS API",
-        "version": "1.0.0",
+        "version": "2.0.0",
         "status": "running",
-        "model_loaded": ml_analyzer.model is not None
+        "model_type": model_type,
+        "model_loaded": model_loaded,
+        "use_hybrid": USE_HYBRID_DETECTOR
     }
 
 @app.get("/health")
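A quick way to see the new fields (host and port are assumptions taken from `.replit`'s `localPort = 5000`; the field names come from the diff above):

```python
import requests

info = requests.get("http://localhost:5000/").json()
print(info["version"])     # "2.0.0" after this change
print(info["model_type"])  # "hybrid" or "legacy"
print(info["use_hybrid"])  # mirrors USE_HYBRID_DETECTOR
```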
@@ -130,10 +140,19 @@ async def health_check():
     except Exception as e:
         db_status = f"error: {str(e)}"
 
+    # Check model status
+    if USE_HYBRID_DETECTOR:
+        model_status = "loaded" if ml_detector.isolation_forest is not None else "not_loaded"
+        model_type = "hybrid (EIF + Feature Selection)"
+    else:
+        model_status = "loaded" if ml_analyzer.model is not None else "not_loaded"
+        model_type = "legacy (Isolation Forest)"
+
     return {
         "status": "healthy",
         "database": db_status,
-        "ml_model": "loaded" if ml_analyzer.model is not None else "not_loaded",
+        "ml_model": model_status,
+        "ml_model_type": model_type,
         "timestamp": datetime.now().isoformat()
     }
 
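Both `/` and `/health` now repeat the same detector dispatch. A possible follow-up refactor, sketched here rather than part of the commit, would centralize it (the globals `USE_HYBRID_DETECTOR`, `ml_detector`, `ml_analyzer` are those used in the diff):

```python
def active_model_info() -> tuple[bool, str]:
    """Return (loaded, type_label) for whichever detector is active."""
    if USE_HYBRID_DETECTOR:
        return ml_detector.isolation_forest is not None, "hybrid (EIF + Feature Selection)"
    return ml_analyzer.model is not None, "legacy (Isolation Forest)"
```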
@@ -171,9 +190,14 @@ async def train_model(request: TrainRequest, background_tasks: BackgroundTasks):
     # Convert to DataFrame
     df = pd.DataFrame(logs)
 
-    # Training
+    # Training - use the appropriate detector
     print("[TRAIN] Addestramento modello...")
-    result = ml_analyzer.train(df, contamination=request.contamination)
+    if USE_HYBRID_DETECTOR:
+        print("[TRAIN] Using Hybrid ML Detector")
+        result = ml_detector.train_unsupervised(df)
+    else:
+        print("[TRAIN] Using Legacy ML Analyzer")
+        result = ml_analyzer.train(df, contamination=request.contamination)
     print(f"[TRAIN] Modello addestrato: {result}")
 
     # Save to database
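Calling the endpoint is unchanged; note that `contamination` now only reaches the legacy path, since `train_unsupervised(df)` is invoked without it. An illustrative request (host/port assumed, and any other `TrainRequest` fields are not shown):

```python
import requests

resp = requests.post(
    "http://localhost:5000/train",
    json={"contamination": 0.05},  # used by the legacy analyzer, ignored by the hybrid path
)
print(resp.json())
```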
@@ -225,13 +249,23 @@ async def detect_anomalies(request: DetectRequest):
     Detect anomalies in recent logs
     Optionally auto-block anomalous IPs
     """
-    if ml_analyzer.model is None:
-        # Try to load saved model
-        if not ml_analyzer.load_model():
-            raise HTTPException(
-                status_code=400,
-                detail="Modello non addestrato. Esegui /train prima."
-            )
+    # Check model loaded
+    if USE_HYBRID_DETECTOR:
+        if ml_detector.isolation_forest is None:
+            # Try to load
+            if not ml_detector.load_models():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello hybrid non addestrato. Esegui /train prima."
+                )
+    else:
+        if ml_analyzer.model is None:
+            # Try to load saved model
+            if not ml_analyzer.load_model():
+                raise HTTPException(
+                    status_code=400,
+                    detail="Modello non addestrato. Esegui /train prima."
+                )
 
     try:
         conn = get_db_connection()
@@ -254,8 +288,17 @@ async def detect_anomalies(request: DetectRequest):
     # Convert to DataFrame
     df = pd.DataFrame(logs)
 
-    # Detection
-    detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
+    # Detection - use the appropriate detector
+    if USE_HYBRID_DETECTOR:
+        print("[DETECT] Using Hybrid ML Detector")
+        # Hybrid detector returns different format
+        detections = ml_detector.detect(df, mode='confidence')
+        # Convert to legacy format for compatibility
+        for det in detections:
+            det['confidence'] = det['confidence_level']  # Map confidence_level to confidence
+    else:
+        print("[DETECT] Using Legacy ML Analyzer")
+        detections = ml_analyzer.detect(df, risk_threshold=request.risk_threshold)
 
     # Geolocation lookup service - BATCH ASYNC for performance
     geo_service = get_geo_service()
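The shim above leaves both keys on each detection, so downstream consumers can read either. Piecing together the fields this commit actually touches (values invented for illustration; real hybrid detections may carry more):

```python
# Keys shown are the ones referenced in this commit (the main.py shim and the
# train_hybrid.py prints); everything else about the record is an assumption.
detection = {
    "source_ip": "192.168.0.42",
    "risk_score": 87.5,
    "anomaly_type": "ddos",      # example value; field printed in train_hybrid.py
    "confidence_level": "high",
}
detection["confidence"] = detection["confidence_level"]  # legacy-compat alias
print(detection["confidence"])
```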
train_hybrid.py
@@ -170,16 +170,20 @@ def validate_with_cicids(args):
     detections = detector.detect(test_logs, mode='all')
 
     # Convert detections to binary predictions
-    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= 60}
+    # Create set of detected IPs with risk_score >= 60 (configurable threshold)
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
 
-    # Create predictions array
+    print(f"[VALIDATE] Detected {len(detected_ips)} unique IPs above threshold {detection_threshold}")
+
+    # Create predictions array by mapping source_ip
     y_true = test_df['is_attack'].values
     y_pred = np.zeros(len(test_df), dtype=int)
 
-    # This is approximate - in real scenario each row would have source_ip
-    # For now, mark all as detected if any IP detected
-    if len(detected_ips) > 0:
-        y_pred = np.where(test_df.index.isin(range(int(len(detected_ips) * len(test_df) / test_df['is_attack'].sum()))), 1, 0)
+    # Map detections to test_df rows using source_ip
+    for i, row in test_df.iterrows():
+        if row['source_ip'] in detected_ips:
+            y_pred[i] = 1
 
     # Calculate metrics
     print("\n[VALIDATE] Calculating validation metrics...")
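The new row-wise loop is correct, but `iterrows()` is slow at CICIDS scale, and `y_pred[i] = 1` quietly assumes the DataFrame index is a clean `0..n-1` range. An equivalent vectorized form, assuming `test_df` and `detected_ips` as defined above:

```python
# One pass over the source_ip column; no reliance on index labels.
y_pred = test_df['source_ip'].isin(detected_ips).to_numpy().astype(int)
```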
@@ -266,11 +270,17 @@ def test_on_synthetic(args):
         print(f"  {i}. {d['source_ip']}: risk={d['risk_score']:.1f}, "
               f"type={d['anomaly_type']}, confidence={d['confidence_level']}")
 
-    # Simple validation
+    # Validation - map detections to test_df rows using source_ip
+    detection_threshold = 60
+    detected_ips = {d['source_ip'] for d in detections if d['risk_score'] >= detection_threshold}
+
     y_true = test_df['is_attack'].values
-    detected_indices = test_df.index[:len(detections)]  # Simplified
     y_pred = np.zeros(len(test_df), dtype=int)
-    y_pred[detected_indices] = 1
+
+    # Map detections to test_df rows
+    for i, row in test_df.iterrows():
+        if row['source_ip'] in detected_ips:
+            y_pred[i] = 1
 
     validator = ValidationMetrics()
     metrics = validator.calculate(y_true, y_pred)
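One caveat specific to the synthetic set: with only 100 shared IPs, flagging an IP flags every row that drew it, so any IP carrying both benign and attack rows produces guaranteed misclassifications. A quick check of how often that happens (a sketch, assuming `test_df` with the `source_ip` and `is_attack` columns used above):

```python
# IPs whose rows carry both labels put a hard ceiling on per-row precision.
g = test_df.groupby("source_ip")["is_attack"].agg(["min", "max"])
mixed = g[g["min"] != g["max"]]
print(f"{len(mixed)} of {len(g)} IPs carry mixed labels")
```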