Add timestamp to synthetic data for accurate model testing
Add a 'timestamp' column to the synthetic dataset generation in `python_ml/dataset_loader.py` to resolve a `KeyError` during model training and testing.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 276a3bd4-aaee-40c9-acb7-027f23274a9f
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
This commit is contained in:
parent
db54fc3235
commit
9fe2532217
@ -0,0 +1,54 @@
|
|||||||
|
python train_hybrid.py --test
|
||||||
|
[WARNING] Extended Isolation Forest not available, using standard IF
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
IDS HYBRID ML TEST - SYNTHETIC DATA
|
||||||
|
======================================================================
|
||||||
|
INFO:dataset_loader:Creating sample dataset (10000 samples)...
|
||||||
|
INFO:dataset_loader:Sample dataset created: 10000 rows
|
||||||
|
INFO:dataset_loader:Attack distribution:
|
||||||
|
attack_type
|
||||||
|
normal 8981
|
||||||
|
brute_force 273
|
||||||
|
suspicious 258
|
||||||
|
ddos 257
|
||||||
|
port_scan 231
|
||||||
|
Name: count, dtype: int64
|
||||||
|
|
||||||
|
[TEST] Created synthetic dataset: 10000 samples
|
||||||
|
Normal: 8,981 (89.8%)
|
||||||
|
Attacks: 1,019 (10.2%)
|
||||||
|
|
||||||
|
[TEST] Training on 6,281 normal samples...
|
||||||
|
[HYBRID] Training hybrid model on 6281 logs...
|
||||||
|
|
||||||
|
❌ Error: 'timestamp'
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/pandas/core/indexes/base.py", line 3790, in get_loc
|
||||||
|
return self._engine.get_loc(casted_key)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "index.pyx", line 152, in pandas._libs.index.IndexEngine.get_loc
|
||||||
|
File "index.pyx", line 181, in pandas._libs.index.IndexEngine.get_loc
|
||||||
|
File "pandas/_libs/hashtable_class_helper.pxi", line 7080, in pandas._libs.hashtable.PyObjectHashTable.get_item
|
||||||
|
File "pandas/_libs/hashtable_class_helper.pxi", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item
|
||||||
|
KeyError: 'timestamp'
|
||||||
|
|
||||||
|
The above exception was the direct cause of the following exception:
|
||||||
|
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/opt/ids/python_ml/train_hybrid.py", line 361, in main
|
||||||
|
test_on_synthetic(args)
|
||||||
|
File "/opt/ids/python_ml/train_hybrid.py", line 249, in test_on_synthetic
|
||||||
|
detector.train_unsupervised(normal_train)
|
||||||
|
File "/opt/ids/python_ml/ml_hybrid_detector.py", line 204, in train_unsupervised
|
||||||
|
features_df = self.extract_features(logs_df)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/opt/ids/python_ml/ml_hybrid_detector.py", line 98, in extract_features
|
||||||
|
logs_df['timestamp'] = pd.to_datetime(logs_df['timestamp'])
|
||||||
|
~~~~~~~^^^^^^^^^^^^^
|
||||||
|
File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/pandas/core/frame.py", line 3893, in __getitem__
|
||||||
|
indexer = self.columns.get_loc(key)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/opt/ids/python_ml/venv/lib64/python3.11/site-packages/pandas/core/indexes/base.py", line 3797, in get_loc
|
||||||
|
raise KeyError(key) from err
|
||||||
|
KeyError: 'timestamp'
|
||||||
@ -364,6 +364,17 @@ Expected files:
|
|||||||
unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
|
unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
|
||||||
data['source_ip'] = np.random.choice(unique_ips, n_samples)
|
data['source_ip'] = np.random.choice(unique_ips, n_samples)
|
||||||
|
|
||||||
|
# Add timestamp column (simulate last 7 days of traffic)
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
now = datetime.now()
|
||||||
|
start_time = now - timedelta(days=7)
|
||||||
|
|
||||||
|
# Generate timestamps randomly distributed over last 7 days
|
||||||
|
time_range_seconds = 7 * 24 * 3600 # 7 days in seconds
|
||||||
|
random_offsets = np.random.uniform(0, time_range_seconds, n_samples)
|
||||||
|
timestamps = [start_time + timedelta(seconds=offset) for offset in random_offsets]
|
||||||
|
data['timestamp'] = timestamps
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
df = pd.DataFrame(data)
|
||||||
|
|
||||||
# Make attacks more extreme
|
# Make attacks more extreme
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user