From 7ba039a547e51fb21a809d206645b199601e96fb Mon Sep 17 00:00:00 2001 From: marco370 <48531002-marco370@users.noreply.replit.com> Date: Mon, 24 Nov 2025 17:57:22 +0000 Subject: [PATCH] Fix index out of bounds error during synthetic data testing Corrected an indexing error in `train_hybrid.py` by using `enumerate` to ensure accurate mapping of detections to the test dataset, resolving an `IndexError` when processing synthetic data. Replit-Commit-Author: Agent Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528 Replit-Commit-Checkpoint-Type: full_checkpoint Replit-Commit-Event-Id: d05c3dd2-6349-426d-be9c-ec80a07ea78f Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2 --- ...andard-IF--1764006995649_1764006995649.txt | 75 +++++++++++++++++++ python_ml/train_hybrid.py | 6 +- 2 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 attached_assets/Pasted--python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006995649_1764006995649.txt diff --git a/attached_assets/Pasted--python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006995649_1764006995649.txt b/attached_assets/Pasted--python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006995649_1764006995649.txt new file mode 100644 index 0000000..23b7049 --- /dev/null +++ b/attached_assets/Pasted--python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006995649_1764006995649.txt @@ -0,0 +1,75 @@ + python train_hybrid.py --test +[WARNING] Extended Isolation Forest not available, using standard IF + +====================================================================== + IDS HYBRID ML TEST - SYNTHETIC DATA +====================================================================== +INFO:dataset_loader:Creating sample dataset (10000 samples)... +INFO:dataset_loader:Sample dataset created: 10000 rows +INFO:dataset_loader:Attack distribution: +attack_type +normal 8981 +brute_force 273 +suspicious 258 +ddos 257 +port_scan 231 +Name: count, dtype: int64 + +[TEST] Created synthetic dataset: 10000 samples + Normal: 8,981 (89.8%) + Attacks: 1,019 (10.2%) + +[TEST] Training on 6,281 normal samples... +[HYBRID] Training hybrid model on 6281 logs... +[HYBRID] Extracted features for 100 unique IPs +[HYBRID] Pre-training Isolation Forest for feature selection... +[HYBRID] Generated 3 pseudo-anomalies from pre-training IF +[HYBRID] Feature selection: 25 → 18 features +[HYBRID] Selected features: total_packets, conn_count, time_span_seconds, conn_per_second, hour_of_day... (+13 more) +[HYBRID] Normalizing features... +[HYBRID] Training Extended Isolation Forest (contamination=0.03)... +/opt/ids/python_ml/venv/lib64/python3.11/site-packages/sklearn/ensemble/_iforest.py:307: UserWarning: max_samples (256) is greater than the total number of samples (100). max_samples will be set to n_samples for estimation. + warn( +[HYBRID] Generating pseudo-labels from Isolation Forest... +[HYBRID] ⚠ IF found only 3 anomalies (need 10) +[HYBRID] Applying ADAPTIVE percentile fallback... +[HYBRID] Trying 5% percentile → 5 anomalies +[HYBRID] Trying 10% percentile → 10 anomalies +[HYBRID] ✅ Success with 10% percentile +[HYBRID] Pseudo-labels: 10 anomalies, 90 normal +[HYBRID] Training ensemble classifier (DT + RF + XGBoost)... +[HYBRID] Class distribution OK: [0 1] (counts: [90 10]) +[HYBRID] Ensemble .fit() completed successfully +[HYBRID] ✅ Ensemble verified: produces 2 class probabilities +[HYBRID] Ensemble training completed and verified! +[HYBRID] Models saved to models +[HYBRID] Ensemble classifier included +[HYBRID] ✅ Training completed successfully! 10/100 IPs flagged as anomalies +[HYBRID] ✅ Ensemble classifier verified and ready for production +[DETECT] Ensemble classifier available - computing hybrid score... +[DETECT] IF scores: min=0.0, max=100.0, mean=57.6 +[DETECT] Ensemble scores: min=86.9, max=97.2, mean=92.1 +[DETECT] Combined scores: min=54.3, max=93.1, mean=78.3 +[DETECT] ✅ Hybrid scoring active: 40% IF + 60% Ensemble + +[TEST] Detection results: + Total detections: 100 + High confidence: 0 + Medium confidence: 85 + Low confidence: 15 + +[TEST] Top 5 detections: + 1. 192.168.0.24: risk=93.1, type=suspicious, confidence=medium + 2. 192.168.0.27: risk=92.7, type=suspicious, confidence=medium + 3. 192.168.0.88: risk=92.5, type=suspicious, confidence=medium + 4. 192.168.0.70: risk=92.3, type=suspicious, confidence=medium + 5. 192.168.0.4: risk=91.4, type=suspicious, confidence=medium + +❌ Error: index 7000 is out of bounds for axis 0 with size 3000 +Traceback (most recent call last): + File "/opt/ids/python_ml/train_hybrid.py", line 361, in main + test_on_synthetic(args) + File "/opt/ids/python_ml/train_hybrid.py", line 283, in test_on_synthetic + y_pred[i] = 1 + ~~~~~~^^^ +IndexError: index 7000 is out of bounds for axis 0 with size 3000 \ No newline at end of file diff --git a/python_ml/train_hybrid.py b/python_ml/train_hybrid.py index 9950bdd..912fca6 100644 --- a/python_ml/train_hybrid.py +++ b/python_ml/train_hybrid.py @@ -277,10 +277,10 @@ def test_on_synthetic(args): y_true = test_df['is_attack'].values y_pred = np.zeros(len(test_df), dtype=int) - # Map detections to test_df rows - for i, row in test_df.iterrows(): + # Map detections to test_df rows (use enumerate for correct indexing) + for idx, (_, row) in enumerate(test_df.iterrows()): if row['source_ip'] in detected_ips: - y_pred[i] = 1 + y_pred[idx] = 1 validator = ValidationMetrics() metrics = validator.calculate(y_true, y_pred)