From 9fe25322179d2d51421e6c4e68ac078ab6bcc497 Mon Sep 17 00:00:00 2001 From: marco370 <48531002-marco370@users.noreply.replit.com> Date: Mon, 24 Nov 2025 17:52:16 +0000 Subject: [PATCH] Add timestamp to synthetic data for accurate model testing Add a 'timestamp' column to the synthetic dataset generation in `python_ml/dataset_loader.py` to resolve a `KeyError` during model training and testing. Replit-Commit-Author: Agent Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528 Replit-Commit-Checkpoint-Type: intermediate_checkpoint Replit-Commit-Event-Id: 276a3bd4-aaee-40c9-acb7-027f23274a9f Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2 --- ...andard-IF--1764006677142_1764006677142.txt | 54 +++++++++++++++++++ python_ml/dataset_loader.py | 11 ++++ 2 files changed, 65 insertions(+) create mode 100644 attached_assets/Pasted-python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006677142_1764006677142.txt diff --git a/attached_assets/Pasted-python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006677142_1764006677142.txt b/attached_assets/Pasted-python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006677142_1764006677142.txt new file mode 100644 index 0000000..980f3ae --- /dev/null +++ b/attached_assets/Pasted-python-train-hybrid-py-test-WARNING-Extended-Isolation-Forest-not-available-using-standard-IF--1764006677142_1764006677142.txt @@ -0,0 +1,54 @@ +python train_hybrid.py --test +[WARNING] Extended Isolation Forest not available, using standard IF + +====================================================================== + IDS HYBRID ML TEST - SYNTHETIC DATA +====================================================================== +INFO:dataset_loader:Creating sample dataset (10000 samples)... +INFO:dataset_loader:Sample dataset created: 10000 rows +INFO:dataset_loader:Attack distribution: +attack_type +normal 8981 +brute_force 273 +suspicious 258 +ddos 257 +port_scan 231 +Name: count, dtype: int64 + +[TEST] Created synthetic dataset: 10000 samples + Normal: 8,981 (89.8%) + Attacks: 1,019 (10.2%) + +[TEST] Training on 6,281 normal samples... +[HYBRID] Training hybrid model on 6281 logs... + +❌ Error: 'timestamp' +Traceback (most recent call last): + File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/pandas/core/indexes/base.py", line 3790, in get_loc + return self._engine.get_loc(casted_key) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "index.pyx", line 152, in pandas._libs.index.IndexEngine.get_loc + File "index.pyx", line 181, in pandas._libs.index.IndexEngine.get_loc + File "pandas/_libs/hashtable_class_helper.pxi", line 7080, in pandas._libs.hashtable.PyObjectHashTable.get_item + File "pandas/_libs/hashtable_class_helper.pxi", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item +KeyError: 'timestamp' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/opt/ids/python_ml/train_hybrid.py", line 361, in main + test_on_synthetic(args) + File "/opt/ids/python_ml/train_hybrid.py", line 249, in test_on_synthetic + detector.train_unsupervised(normal_train) + File "/opt/ids/python_ml/ml_hybrid_detector.py", line 204, in train_unsupervised + features_df = self.extract_features(logs_df) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/ids/python_ml/ml_hybrid_detector.py", line 98, in extract_features + logs_df['timestamp'] = pd.to_datetime(logs_df['timestamp']) + ~~~~~~~^^^^^^^^^^^^^ + File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/pandas/core/frame.py", line 3893, in __getitem__ + indexer = self.columns.get_loc(key) + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/pandas/core/indexes/base.py", line 3797, in get_loc + raise KeyError(key) from err +KeyError: 'timestamp' \ No newline at end of file diff --git a/python_ml/dataset_loader.py b/python_ml/dataset_loader.py index dccdd3f..a233a28 100644 --- a/python_ml/dataset_loader.py +++ b/python_ml/dataset_loader.py @@ -364,6 +364,17 @@ Expected files: unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)] data['source_ip'] = np.random.choice(unique_ips, n_samples) + # Add timestamp column (simulate last 7 days of traffic) + from datetime import datetime, timedelta + now = datetime.now() + start_time = now - timedelta(days=7) + + # Generate timestamps randomly distributed over last 7 days + time_range_seconds = 7 * 24 * 3600 # 7 days in seconds + random_offsets = np.random.uniform(0, time_range_seconds, n_samples) + timestamps = [start_time + timedelta(seconds=offset) for offset in random_offsets] + data['timestamp'] = timestamps + df = pd.DataFrame(data) # Make attacks more extreme