ids.alfacom.it/python_ml/dataset_loader.py
marco370 8b16800bb6 Update system to use hybrid detector and improve validation accuracy
Update main.py endpoints to use the hybrid detector and improve validation logic in train_hybrid.py by mapping detections using source_ip. Also, add synthetic source_ip to dataset_loader.py for both CICIDS2017 and synthetic datasets.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 7a657272-55ba-4a79-9a2e-f1ed9bc7a528
Replit-Commit-Checkpoint-Type: intermediate_checkpoint
Replit-Commit-Event-Id: 5c4982f1-3d37-47da-9253-c04888f5ff64
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/449cf7c4-c97a-45ae-8234-e5c5b8d6a84f/7a657272-55ba-4a79-9a2e-f1ed9bc7a528/2lUhxO2
2025-11-24 16:02:49 +00:00

"""
CICIDS2017 Dataset Loader and Preprocessor
Downloads, cleans, and maps CICIDS2017 features to IDS feature space
"""
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, Tuple, Optional
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CICIDS2017Loader:
    """
    Loads and preprocesses CICIDS2017 dataset
    Maps 80 CIC features to 25 IDS features
    """

    DATASET_INFO = {
        'name': 'CICIDS2017',
        'source': 'Canadian Institute for Cybersecurity',
        'url': 'https://www.unb.ca/cic/datasets/ids-2017.html',
        'size_gb': 7.8,
        'files': [
            'Monday-WorkingHours.pcap_ISCX.csv',
            'Tuesday-WorkingHours.pcap_ISCX.csv',
            'Wednesday-workingHours.pcap_ISCX.csv',
            'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
            'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
            'Friday-WorkingHours-Morning.pcap_ISCX.csv',
            'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
            'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
        ]
    }

    # Mapping CIC feature names → IDS feature names
    FEATURE_MAPPING = {
        # Volume features
        'Total Fwd Packets': 'total_packets',
        'Total Backward Packets': 'total_packets',  # Combined
        'Total Length of Fwd Packets': 'total_bytes',
        'Total Length of Bwd Packets': 'total_bytes',  # Combined
        'Flow Duration': 'time_span_seconds',
        # Temporal features
        'Flow Packets/s': 'conn_per_second',
        'Flow Bytes/s': 'bytes_per_second',
        'Fwd Packets/s': 'packets_per_conn',
        # Protocol diversity
        'Protocol': 'unique_protocols',
        'Destination Port': 'unique_dest_ports',
        # Port scanning
        'Fwd PSH Flags': 'port_scan_score',
        'Fwd URG Flags': 'port_scan_score',
        # Behavioral
        'Fwd Packet Length Mean': 'avg_packet_size',
        'Fwd Packet Length Std': 'packet_size_variance',
        'Bwd Packet Length Mean': 'avg_packet_size',
        'Bwd Packet Length Std': 'packet_size_variance',
        # Burst patterns
        'Subflow Fwd Packets': 'max_burst',
        'Subflow Fwd Bytes': 'burst_variance',
    }
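    # Note (added for clarity): FEATURE_MAPPING documents the intended
    # correspondence between CIC and IDS feature names; map_to_ids_features()
    # below applies the mapping explicitly (combining fwd/bwd counterparts)
    # rather than consuming this dict.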

    # Attack type mapping
    ATTACK_LABELS = {
        'BENIGN': 'normal',
        'DoS Hulk': 'ddos',
        'DoS GoldenEye': 'ddos',
        'DoS slowloris': 'ddos',
        'DoS Slowhttptest': 'ddos',
        'DDoS': 'ddos',
        'PortScan': 'port_scan',
        'FTP-Patator': 'brute_force',
        'SSH-Patator': 'brute_force',
        'Bot': 'botnet',
        'Web Attack Brute Force': 'brute_force',
        'Web Attack XSS': 'suspicious',
        'Web Attack Sql Injection': 'suspicious',
        'Infiltration': 'suspicious',
        'Heartbleed': 'suspicious',
    }

    def __init__(self, data_dir: str = "datasets/cicids2017"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def download_instructions(self) -> str:
        """Return download instructions for CICIDS2017"""
        instructions = f"""
╔══════════════════════════════════════════════════════════════════╗
║             CICIDS2017 Dataset Download Instructions             ║
╚══════════════════════════════════════════════════════════════════╝
Dataset: {self.DATASET_INFO['name']}
Source: {self.DATASET_INFO['source']}
Size: {self.DATASET_INFO['size_gb']} GB
URL: {self.DATASET_INFO['url']}

MANUAL DOWNLOAD (Recommended):
1. Visit: {self.DATASET_INFO['url']}
2. Register/Login (free account required)
3. Download CSV files for all days (Monday-Friday)
4. Extract to: {self.data_dir.absolute()}

Expected files:
"""
        for i, fname in enumerate(self.DATASET_INFO['files'], 1):
            instructions += f"  {i}. {fname}\n"
        instructions += "\nAfter download, run: python_ml/train_hybrid.py --validate\n"
        instructions += "=" * 66
        return instructions

    def check_dataset_exists(self) -> Tuple[bool, list]:
        """Check if dataset files exist"""
        missing_files = []
        for fname in self.DATASET_INFO['files']:
            fpath = self.data_dir / fname
            if not fpath.exists():
                missing_files.append(fname)
        exists = len(missing_files) == 0
        return exists, missing_files

    def load_day(self, day_file: str, sample_frac: float = 1.0) -> pd.DataFrame:
        """
        Load single day CSV file
        sample_frac: fraction to sample (0.1 = 10% for testing)
        """
        fpath = self.data_dir / day_file
        if not fpath.exists():
            raise FileNotFoundError(f"Dataset file not found: {fpath}")
        logger.info(f"Loading {day_file}...")
        # CICIDS2017 has known issues: extra space before column names, inf values
        df = pd.read_csv(fpath, skipinitialspace=True)
        # Strip whitespace from column names
        df.columns = df.columns.str.strip()
        # Sample if requested
        if sample_frac < 1.0:
            df = df.sample(frac=sample_frac, random_state=42)
            logger.info(f"Sampled {len(df)} rows ({sample_frac*100:.0f}%)")
        return df

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Clean and preprocess CICIDS2017 data
        - Remove NaN and Inf values
        - Fix data types
        - Map labels
        """
        logger.info(f"Preprocessing {len(df)} rows...")
        # Replace inf with NaN, then drop
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()
        # Map attack labels
        if ' Label' in df.columns:
            df['attack_type'] = df[' Label'].map(self.ATTACK_LABELS)
            df['is_attack'] = (df['attack_type'] != 'normal').astype(int)
        elif 'Label' in df.columns:
            df['attack_type'] = df['Label'].map(self.ATTACK_LABELS)
            df['is_attack'] = (df['attack_type'] != 'normal').astype(int)
        else:
            logger.warning("No label column found, assuming all BENIGN")
            df['attack_type'] = 'normal'
            df['is_attack'] = 0
        # Remove unknown attack types
        df = df[df['attack_type'].notna()]
        logger.info(f"After preprocessing: {len(df)} rows")
        logger.info(f"Attack distribution:\n{df['attack_type'].value_counts()}")
        return df

    def map_to_ids_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Map 80 CICIDS2017 features → 25 IDS features
        This is an approximate mapping for validation purposes
        """
        logger.info("Mapping CICIDS features to IDS feature space...")
        ids_features = {}
        # Volume features (combine fwd+bwd)
        ids_features['total_packets'] = (
            df.get('Total Fwd Packets', 0) +
            df.get('Total Backward Packets', 0)
        )
        ids_features['total_bytes'] = (
            df.get('Total Length of Fwd Packets', 0) +
            df.get('Total Length of Bwd Packets', 0)
        )
        ids_features['conn_count'] = 1  # Each row = 1 flow
        ids_features['avg_packet_size'] = df.get('Fwd Packet Length Mean', 0)
        ids_features['bytes_per_second'] = df.get('Flow Bytes/s', 0)
        # Temporal features
        ids_features['time_span_seconds'] = df.get('Flow Duration', 0) / 1_000_000  # Microseconds to seconds
        ids_features['conn_per_second'] = df.get('Flow Packets/s', 0)
        ids_features['hour_of_day'] = 12  # Unknown, use midday
        ids_features['day_of_week'] = 3  # Unknown, use Wednesday
        # Burst detection (approximate)
        ids_features['max_burst'] = df.get('Subflow Fwd Packets', 0)
        ids_features['avg_burst'] = df.get('Subflow Fwd Packets', 0)
        # Use a Series fallback so the chained ops below still work when a
        # column is absent (a scalar default would break .clip/.astype)
        zero = pd.Series(0, index=df.index)
        ids_features['burst_variance'] = df.get('Subflow Fwd Bytes', zero).clip(lower=0)
        ids_features['avg_interval'] = 1.0  # Unknown
        # Protocol diversity
        ids_features['unique_protocols'] = 1  # Each row = single protocol
        ids_features['unique_dest_ports'] = 1
        ids_features['unique_dest_ips'] = 1
        ids_features['protocol_entropy'] = 0
        protocol = df.get('Protocol', zero)
        ids_features['tcp_ratio'] = (protocol == 6).astype(int)
        ids_features['udp_ratio'] = (protocol == 17).astype(int)
        # Port scanning detection
        ids_features['unique_ports_contacted'] = (df.get('Destination Port', zero) > 0).astype(int)
        ids_features['port_scan_score'] = (df.get('Fwd PSH Flags', 0) + df.get('Fwd URG Flags', 0)) / 2
        ids_features['sequential_ports'] = 0
        # Behavioral anomalies
        ids_features['packets_per_conn'] = ids_features['total_packets']
        ids_features['packet_size_variance'] = df.get('Fwd Packet Length Std', 0)
        ids_features['blocked_ratio'] = 0
        # Add labels
        ids_features['attack_type'] = df['attack_type']
        ids_features['is_attack'] = df['is_attack']
        # Add synthetic source_ip for validation (CICIDS doesn't have this field)
        # Generate unique IPs: 10.0.x.y format (beyond ~65k rows the third octet
        # exceeds 255; the strings remain unique, which is all validation needs)
        n_samples = len(df)
        source_ips = [f"10.0.{i//256}.{i%256}" for i in range(n_samples)]
        ids_features['source_ip'] = source_ips
        ids_df = pd.DataFrame(ids_features)
        # Clip negative values
        numeric_cols = ids_df.select_dtypes(include=[np.number]).columns
        ids_df[numeric_cols] = ids_df[numeric_cols].clip(lower=0)
        logger.info(f"Mapped to {len(ids_df.columns)} IDS features")
        return ids_df

    def load_and_process_all(
        self,
        sample_frac: float = 1.0,
        train_ratio: float = 0.7,
        val_ratio: float = 0.15
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load all days, preprocess, map to IDS features, and split
        Returns: train_df, val_df, test_df
        """
        exists, missing = self.check_dataset_exists()
        if not exists:
            raise FileNotFoundError(
                f"Missing dataset files: {missing}\n\n"
                f"{self.download_instructions()}"
            )
        all_data = []
        for fname in self.DATASET_INFO['files']:
            try:
                df = self.load_day(fname, sample_frac=sample_frac)
                df = self.preprocess(df)
                df_ids = self.map_to_ids_features(df)
                all_data.append(df_ids)
            except Exception as e:
                logger.error(f"Failed to load {fname}: {e}")
                continue
        if not all_data:
            raise ValueError("No data loaded successfully")
        # Combine all days
        combined = pd.concat(all_data, ignore_index=True)
        logger.info(f"Combined dataset: {len(combined)} rows")
        # Shuffle
        combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)
        # Split train/val/test
        n = len(combined)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)
        train_df = combined.iloc[:n_train]
        val_df = combined.iloc[n_train:n_train+n_val]
        test_df = combined.iloc[n_train+n_val:]
        logger.info(f"Split: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")
        return train_df, val_df, test_df
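
    # Usage sketch (illustrative; sample_frac=0.1 is just an example value):
    #   loader = CICIDS2017Loader("datasets/cicids2017")
    #   train_df, val_df, test_df = loader.load_and_process_all(sample_frac=0.1)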

    def create_sample_dataset(self, n_samples: int = 10000) -> pd.DataFrame:
        """
        Create synthetic sample dataset for testing
        Mimics CICIDS2017 structure
        """
        logger.info(f"Creating sample dataset ({n_samples} samples)...")
        np.random.seed(42)
        # Generate synthetic features
        data = {
            'total_packets': np.random.lognormal(3, 1.5, n_samples).astype(int),
            'total_bytes': np.random.lognormal(8, 2, n_samples).astype(int),
            'conn_count': np.ones(n_samples, dtype=int),
            'avg_packet_size': np.random.normal(500, 200, n_samples),
            'bytes_per_second': np.random.lognormal(6, 2, n_samples),
            'time_span_seconds': np.random.exponential(10, n_samples),
            'conn_per_second': np.random.exponential(5, n_samples),
            'hour_of_day': np.random.randint(0, 24, n_samples),
            'day_of_week': np.random.randint(0, 7, n_samples),
            'max_burst': np.random.poisson(20, n_samples),
            'avg_burst': np.random.poisson(15, n_samples),
            'burst_variance': np.random.exponential(5, n_samples),
            'avg_interval': np.random.exponential(0.1, n_samples),
            'unique_protocols': np.ones(n_samples, dtype=int),
            'unique_dest_ports': np.ones(n_samples, dtype=int),
            'unique_dest_ips': np.ones(n_samples, dtype=int),
            'protocol_entropy': np.zeros(n_samples),
            'tcp_ratio': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
            'udp_ratio': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
            'unique_ports_contacted': np.ones(n_samples, dtype=int),
            'port_scan_score': np.random.beta(1, 10, n_samples),
            'sequential_ports': np.zeros(n_samples, dtype=int),
            'packets_per_conn': np.random.lognormal(3, 1.5, n_samples),
            'packet_size_variance': np.random.exponential(100, n_samples),
            'blocked_ratio': np.zeros(n_samples),
        }
        # Generate labels: 90% normal, 10% attacks
        is_attack = np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
        attack_types = np.where(
            is_attack == 1,
            np.random.choice(['ddos', 'port_scan', 'brute_force', 'suspicious'], n_samples),
            'normal'
        )
        data['is_attack'] = is_attack
        data['attack_type'] = attack_types
        # Add synthetic source_ip (simulate real traffic from 100 unique IPs)
        unique_ips = [f"192.168.{i//256}.{i%256}" for i in range(100)]
        data['source_ip'] = np.random.choice(unique_ips, n_samples)
        df = pd.DataFrame(data)
        # Make attacks more extreme
        attack_mask = df['is_attack'] == 1
        df.loc[attack_mask, 'total_packets'] *= 10
        df.loc[attack_mask, 'total_bytes'] *= 15
        df.loc[attack_mask, 'conn_per_second'] *= 20
        logger.info(f"Sample dataset created: {len(df)} rows")
        logger.info(f"Attack distribution:\n{df['attack_type'].value_counts()}")
        return df


# Utility function
def get_cicids2017_loader(data_dir: str = "datasets/cicids2017") -> CICIDS2017Loader:
    """Factory function to get loader instance"""
    return CICIDS2017Loader(data_dir)
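

# Minimal self-check sketch (not part of the original module): exercises the
# loader end-to-end, falling back to the synthetic sample when the CICIDS2017
# CSVs have not been downloaded. sample_frac and n_samples are illustrative
# assumptions, not values mandated by the training pipeline.
if __name__ == "__main__":
    loader = get_cicids2017_loader()
    exists, missing = loader.check_dataset_exists()
    if exists:
        # Real dataset present: load a 10% sample of every day and split it
        train_df, val_df, test_df = loader.load_and_process_all(sample_frac=0.1)
        logger.info(f"CICIDS2017 split sizes: {len(train_df)}/{len(val_df)}/{len(test_df)}")
    else:
        # Dataset missing: print download help and fall back to synthetic data
        print(loader.download_instructions())
        sample_df = loader.create_sample_dataset(n_samples=5000)
        logger.info(f"Synthetic fallback dataset: {len(sample_df)} rows")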