""" CICIDS2017 Dataset Loader and Preprocessor Downloads, cleans, and maps CICIDS2017 features to IDS feature space """ import pandas as pd import numpy as np from pathlib import Path from typing import Dict, Tuple, Optional import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class CICIDS2017Loader: """ Loads and preprocesses CICIDS2017 dataset Maps 80 CIC features to 25 IDS features """ DATASET_INFO = { 'name': 'CICIDS2017', 'source': 'Canadian Institute for Cybersecurity', 'url': 'https://www.unb.ca/cic/datasets/ids-2017.html', 'size_gb': 7.8, 'files': [ 'Monday-WorkingHours.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', ] } # Mapping CIC feature names → IDS feature names FEATURE_MAPPING = { # Volume features 'Total Fwd Packets': 'total_packets', 'Total Backward Packets': 'total_packets', # Combined 'Total Length of Fwd Packets': 'total_bytes', 'Total Length of Bwd Packets': 'total_bytes', # Combined 'Flow Duration': 'time_span_seconds', # Temporal features 'Flow Packets/s': 'conn_per_second', 'Flow Bytes/s': 'bytes_per_second', 'Fwd Packets/s': 'packets_per_conn', # Protocol diversity 'Protocol': 'unique_protocols', 'Destination Port': 'unique_dest_ports', # Port scanning 'Fwd PSH Flags': 'port_scan_score', 'Fwd URG Flags': 'port_scan_score', # Behavioral 'Fwd Packet Length Mean': 'avg_packet_size', 'Fwd Packet Length Std': 'packet_size_variance', 'Bwd Packet Length Mean': 'avg_packet_size', 'Bwd Packet Length Std': 'packet_size_variance', # Burst patterns 'Subflow Fwd Packets': 'max_burst', 'Subflow Fwd Bytes': 'burst_variance', } # Attack type mapping ATTACK_LABELS = { 'BENIGN': 'normal', 'DoS Hulk': 'ddos', 'DoS GoldenEye': 'ddos', 'DoS slowloris': 'ddos', 'DoS Slowhttptest': 'ddos', 'DDoS': 'ddos', 'PortScan': 'port_scan', 'FTP-Patator': 'brute_force', 'SSH-Patator': 'brute_force', 'Bot': 'botnet', 'Web Attack – Brute Force': 'brute_force', 'Web Attack – XSS': 'suspicious', 'Web Attack – Sql Injection': 'suspicious', 'Infiltration': 'suspicious', 'Heartbleed': 'suspicious', } def __init__(self, data_dir: str = "datasets/cicids2017"): self.data_dir = Path(data_dir) self.data_dir.mkdir(parents=True, exist_ok=True) def download_instructions(self) -> str: """Return download instructions for CICIDS2017""" instructions = f""" ╔══════════════════════════════════════════════════════════════════╗ ║ CICIDS2017 Dataset Download Instructions ║ ╚══════════════════════════════════════════════════════════════════╝ Dataset: {self.DATASET_INFO['name']} Source: {self.DATASET_INFO['source']} Size: {self.DATASET_INFO['size_gb']} GB URL: {self.DATASET_INFO['url']} MANUAL DOWNLOAD (Recommended): 1. Visit: {self.DATASET_INFO['url']} 2. Register/Login (free account required) 3. Download CSV files for all days (Monday-Friday) 4. Extract to: {self.data_dir.absolute()} Expected files: """ for i, fname in enumerate(self.DATASET_INFO['files'], 1): instructions += f" {i}. 
{fname}\n" instructions += f"\nAfter download, run: python_ml/train_hybrid.py --validate\n" instructions += "=" * 66 return instructions def check_dataset_exists(self) -> Tuple[bool, list]: """Check if dataset files exist""" missing_files = [] for fname in self.DATASET_INFO['files']: fpath = self.data_dir / fname if not fpath.exists(): missing_files.append(fname) exists = len(missing_files) == 0 return exists, missing_files def load_day(self, day_file: str, sample_frac: float = 1.0) -> pd.DataFrame: """ Load single day CSV file sample_frac: fraction to sample (0.1 = 10% for testing) """ fpath = self.data_dir / day_file if not fpath.exists(): raise FileNotFoundError(f"Dataset file not found: {fpath}") logger.info(f"Loading {day_file}...") # CICIDS2017 has known issues: extra space before column names, inf values df = pd.read_csv(fpath, skipinitialspace=True) # Strip whitespace from column names df.columns = df.columns.str.strip() # Sample if requested if sample_frac < 1.0: df = df.sample(frac=sample_frac, random_state=42) logger.info(f"Sampled {len(df)} rows ({sample_frac*100:.0f}%)") return df def preprocess(self, df: pd.DataFrame) -> pd.DataFrame: """ Clean and preprocess CICIDS2017 data - Remove NaN and Inf values - Fix data types - Map labels """ logger.info(f"Preprocessing {len(df)} rows...") # Replace inf with NaN, then drop df = df.replace([np.inf, -np.inf], np.nan) df = df.dropna() # Map attack labels if ' Label' in df.columns: df['attack_type'] = df[' Label'].map(self.ATTACK_LABELS) df['is_attack'] = (df['attack_type'] != 'normal').astype(int) elif 'Label' in df.columns: df['attack_type'] = df['Label'].map(self.ATTACK_LABELS) df['is_attack'] = (df['attack_type'] != 'normal').astype(int) else: logger.warning("No label column found, assuming all BENIGN") df['attack_type'] = 'normal' df['is_attack'] = 0 # Remove unknown attack types df = df[df['attack_type'].notna()] logger.info(f"After preprocessing: {len(df)} rows") logger.info(f"Attack distribution:\n{df['attack_type'].value_counts()}") return df def map_to_ids_features(self, df: pd.DataFrame) -> pd.DataFrame: """ Map 80 CICIDS2017 features → 25 IDS features This is approximate mapping for validation purposes """ logger.info("Mapping CICIDS features to IDS feature space...") ids_features = {} # Volume features (combine fwd+bwd) ids_features['total_packets'] = ( df.get('Total Fwd Packets', 0) + df.get('Total Backward Packets', 0) ) ids_features['total_bytes'] = ( df.get('Total Length of Fwd Packets', 0) + df.get('Total Length of Bwd Packets', 0) ) ids_features['conn_count'] = 1 # Each row = 1 flow ids_features['avg_packet_size'] = df.get('Fwd Packet Length Mean', 0) ids_features['bytes_per_second'] = df.get('Flow Bytes/s', 0) # Temporal features ids_features['time_span_seconds'] = df.get('Flow Duration', 0) / 1_000_000 # Microseconds to seconds ids_features['conn_per_second'] = df.get('Flow Packets/s', 0) ids_features['hour_of_day'] = 12 # Unknown, use midday ids_features['day_of_week'] = 3 # Unknown, use Wednesday # Burst detection (approximate) ids_features['max_burst'] = df.get('Subflow Fwd Packets', 0) ids_features['avg_burst'] = df.get('Subflow Fwd Packets', 0) ids_features['burst_variance'] = df.get('Subflow Fwd Bytes', 0).apply(lambda x: max(0, x)) ids_features['avg_interval'] = 1.0 # Unknown # Protocol diversity ids_features['unique_protocols'] = 1 # Each row = single protocol ids_features['unique_dest_ports'] = 1 ids_features['unique_dest_ips'] = 1 ids_features['protocol_entropy'] = 0 ids_features['tcp_ratio'] = 
        ids_features['tcp_ratio'] = (df.get('Protocol', 6) == 6).astype(int)
        ids_features['udp_ratio'] = (df.get('Protocol', 17) == 17).astype(int)

        # Port scanning detection
        ids_features['unique_ports_contacted'] = (df.get('Destination Port', 0) > 0).astype(int)
        ids_features['port_scan_score'] = (
            df.get('Fwd PSH Flags', 0) + df.get('Fwd URG Flags', 0)
        ) / 2
        ids_features['sequential_ports'] = 0

        # Behavioral anomalies
        ids_features['packets_per_conn'] = ids_features['total_packets']
        ids_features['packet_size_variance'] = df.get('Fwd Packet Length Std', 0)
        ids_features['blocked_ratio'] = 0

        # Add labels
        ids_features['attack_type'] = df['attack_type']
        ids_features['is_attack'] = df['is_attack']

        # Add synthetic source_ip for validation (these CICIDS CSVs lack the field).
        # Derive octets from the row index so IPs stay valid and unique for up to
        # ~16M rows (the 10.x.y.z form)
        n_samples = len(df)
        source_ips = [
            f"10.{(i >> 16) & 255}.{(i >> 8) & 255}.{i & 255}"
            for i in range(n_samples)
        ]
        ids_features['source_ip'] = source_ips

        ids_df = pd.DataFrame(ids_features)

        # Clip negative values
        numeric_cols = ids_df.select_dtypes(include=[np.number]).columns
        ids_df[numeric_cols] = ids_df[numeric_cols].clip(lower=0)

        logger.info(
            f"Mapped to {len(ids_df.columns) - 3} IDS features "
            f"(plus attack_type, is_attack, source_ip)"
        )
        return ids_df

    def load_and_process_all(
        self,
        sample_frac: float = 1.0,
        train_ratio: float = 0.7,
        val_ratio: float = 0.15
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load all days, preprocess, map to IDS features, and split.
        Returns: train_df, val_df, test_df
        """
        exists, missing = self.check_dataset_exists()
        if not exists:
            raise FileNotFoundError(
                f"Missing dataset files: {missing}\n\n"
                f"{self.download_instructions()}"
            )

        all_data = []
        for fname in self.DATASET_INFO['files']:
            try:
                df = self.load_day(fname, sample_frac=sample_frac)
                df = self.preprocess(df)
                df_ids = self.map_to_ids_features(df)
                all_data.append(df_ids)
            except Exception as e:
                logger.error(f"Failed to load {fname}: {e}")
                continue

        if not all_data:
            raise ValueError("No data loaded successfully")

        # Combine all days
        combined = pd.concat(all_data, ignore_index=True)
        logger.info(f"Combined dataset: {len(combined)} rows")

        # Shuffle
        combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)
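        # Note: this is a random split over shuffled rows, not a temporal one;
        # train/val/test are i.i.d. samples rather than chronologically separated days.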
        # Split train/val/test
        n = len(combined)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)

        train_df = combined.iloc[:n_train]
        val_df = combined.iloc[n_train:n_train + n_val]
        test_df = combined.iloc[n_train + n_val:]

        logger.info(f"Split: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

        return train_df, val_df, test_df

    def create_sample_dataset(self, n_samples: int = 10000) -> pd.DataFrame:
        """
        Create a synthetic sample dataset for testing.
        Mimics the mapped IDS feature structure derived from CICIDS2017.
        """
        logger.info(f"Creating sample dataset ({n_samples} samples)...")

        np.random.seed(42)

        # Generate synthetic features
        data = {
            'total_packets': np.random.lognormal(3, 1.5, n_samples).astype(int),
            'total_bytes': np.random.lognormal(8, 2, n_samples).astype(int),
            'conn_count': np.ones(n_samples, dtype=int),
            'avg_packet_size': np.random.normal(500, 200, n_samples),
            'bytes_per_second': np.random.lognormal(6, 2, n_samples),
            'time_span_seconds': np.random.exponential(10, n_samples),
            'conn_per_second': np.random.exponential(5, n_samples),
            'hour_of_day': np.random.randint(0, 24, n_samples),
            'day_of_week': np.random.randint(0, 7, n_samples),
            'max_burst': np.random.poisson(20, n_samples),
            'avg_burst': np.random.poisson(15, n_samples),
            'burst_variance': np.random.exponential(5, n_samples),
            'avg_interval': np.random.exponential(0.1, n_samples),
            'unique_protocols': np.ones(n_samples, dtype=int),
            'unique_dest_ports': np.ones(n_samples, dtype=int),
            'unique_dest_ips': np.ones(n_samples, dtype=int),
            'protocol_entropy': np.zeros(n_samples),
            'tcp_ratio': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
            'udp_ratio': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
            'unique_ports_contacted': np.ones(n_samples, dtype=int),
            'port_scan_score': np.random.beta(1, 10, n_samples),
            'sequential_ports': np.zeros(n_samples, dtype=int),
            'packets_per_conn': np.random.lognormal(3, 1.5, n_samples),
            'packet_size_variance': np.random.exponential(100, n_samples),
            'blocked_ratio': np.zeros(n_samples),
        }

        # Generate labels: 90% normal, 10% attacks
        is_attack = np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
        attack_types = np.where(
            is_attack == 1,
            np.random.choice(['ddos', 'port_scan', 'brute_force', 'suspicious'], n_samples),
            'normal'
        )
        data['is_attack'] = is_attack
        data['attack_type'] = attack_types

        # Add synthetic source_ip (simulate traffic from 100 unique IPs)
        unique_ips = [f"192.168.0.{i}" for i in range(100)]
        data['source_ip'] = np.random.choice(unique_ips, n_samples)

        df = pd.DataFrame(data)

        # Make attacks more extreme
        attack_mask = df['is_attack'] == 1
        df.loc[attack_mask, 'total_packets'] *= 10
        df.loc[attack_mask, 'total_bytes'] *= 15
        df.loc[attack_mask, 'conn_per_second'] *= 20

        logger.info(f"Sample dataset created: {len(df)} rows")
        logger.info(f"Attack distribution:\n{df['attack_type'].value_counts()}")

        return df


# Utility function
def get_cicids2017_loader(data_dir: str = "datasets/cicids2017") -> CICIDS2017Loader:
    """Factory function to get a loader instance"""
    return CICIDS2017Loader(data_dir)
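
# A minimal usage sketch (not part of the original API surface): exercises the
# loader end-to-end, falling back to the synthetic sample dataset when the real
# CSVs are absent. Assumes the default datasets/cicids2017 layout relative to
# the working directory.
if __name__ == "__main__":
    loader = get_cicids2017_loader()
    exists, missing = loader.check_dataset_exists()
    if exists:
        # A 10% sample keeps the full-week load manageable for a smoke test
        train_df, val_df, test_df = loader.load_and_process_all(sample_frac=0.1)
    else:
        print(loader.download_instructions())
        # Synthetic stand-in so downstream code can run without the dataset
        sample_df = loader.create_sample_dataset(n_samples=5000)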