#!/bin/bash
# =========================================================================
# DIAGNOSI COMPLETA TESLA M60 + TENSORFLOW
# Full diagnostic: identifies exactly what breaks GPU support and which
# fix to apply (driver, CUDA toolkit, cuDNN, TensorFlow build).
# =========================================================================
set -e

echo "🔍 DIAGNOSI COMPLETA TESLA M60 + TENSORFLOW"
echo "==========================================="

# Fail early with a clear message if the NVIDIA driver tooling is missing;
# otherwise 'set -e' would abort on the first nvidia-smi call with no context.
if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "❌ nvidia-smi non trovato: driver NVIDIA non installato"
    exit 1
fi

#######################################
# List files matching a glob (up to a line limit) or print a "not found"
# message. Needed because the 'ls … | head || echo' idiom checks head's
# exit status (always 0), so the fallback message could never fire.
# Arguments: $1 - glob pattern, $2 - max lines (default 5), $3 - label
#######################################
list_libs() {
    local pattern=$1 max=${2:-5} label=$3
    if compgen -G "$pattern" >/dev/null; then
        # shellcheck disable=SC2086 — intentional glob expansion of $pattern
        ls -la $pattern 2>/dev/null | head -n "$max"
    else
        echo "❌ $label non trovato"
    fi
}

# 1. VERIFICA DRIVER NVIDIA
echo "🎮 VERIFICA DRIVER NVIDIA..."
nvidia-smi --query-gpu=name,driver_version,compute_cap,memory.total --format=csv,noheader

echo -e "\n🔧 Verifica librerie driver..."
list_libs '/usr/lib64/libcuda*' 5 "libcuda"
list_libs '/usr/lib64/libnvidia*' 5 "libnvidia"

# 2. VERIFICA INSTALLAZIONI CUDA
echo -e "\n📦 VERIFICA INSTALLAZIONI CUDA..."
echo "CUDA 12.2:"
list_libs '/usr/local/cuda-12.2/lib64/libcuda*' 5 "CUDA 12.2 libcuda"
echo "CUDA 11.8:"
list_libs '/usr/local/cuda-11.8/lib64/libcuda*' 5 "CUDA 11.8 libcuda"

# 3. VERIFICA cuDNN
echo -e "\n📚 VERIFICA cuDNN..."
echo "Sistema cuDNN:"
list_libs '/usr/lib64/libcudnn*' 3 "cuDNN di sistema"
echo "CUDA 11.8 cuDNN:"
list_libs '/usr/local/cuda-11.8/lib64/libcudnn*' 3 "CUDA 11.8 cuDNN"

# 4. TEST TENSORFLOW DETTAGLIATO
echo -e "\n🧪 TEST TENSORFLOW DETTAGLIATO..."
# Abilita logging massimo per debug export TF_CPP_MIN_LOG_LEVEL=0 export CUDA_VISIBLE_DEVICES=0 export CUDA_HOME=/usr/local/cuda-11.8 export LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64:/usr/lib64:$LD_LIBRARY_PATH python3 -c " import os import sys os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' # Massimo logging os.environ['CUDA_VISIBLE_DEVICES'] = '0' os.environ['CUDA_HOME'] = '/usr/local/cuda-11.8' print('=== TENSORFLOW DEBUG DETTAGLIATO ===') print('Python version:', sys.version) print('CUDA_HOME:', os.environ.get('CUDA_HOME')) print('LD_LIBRARY_PATH:', os.environ.get('LD_LIBRARY_PATH')) try: import tensorflow as tf print('\\n✅ TensorFlow importato:', tf.__version__) # Verifica build info try: build_info = tf.sysconfig.get_build_info() print('\\n📋 TensorFlow Build Info:') for key, value in build_info.items(): if 'cuda' in key.lower() or 'gpu' in key.lower(): print(f' {key}: {value}') except Exception as e: print('⚠️ Build info error:', e) # Test CUDA availability print('\\n🔍 CUDA Tests:') print('is_built_with_cuda():', tf.test.is_built_with_cuda()) print('is_gpu_available():', tf.test.is_gpu_available()) print('is_built_with_gpu_support():', tf.test.is_built_with_gpu_support()) # Lista device fisici physical_devices = tf.config.list_physical_devices() print('\\n📱 All Physical Devices:') for device in physical_devices: print(f' {device}') # Test specifico GPU gpu_devices = tf.config.list_physical_devices('GPU') print(f'\\n🎮 GPU Devices: {len(gpu_devices)}') if gpu_devices: for i, gpu in enumerate(gpu_devices): print(f' GPU {i}: {gpu}') try: details = tf.config.experimental.get_device_details(gpu) print(f' Details: {details}') except Exception as e: print(f' Details error: {e}') except ImportError as e: print('❌ TensorFlow import failed:', e) except Exception as e: print('❌ TensorFlow error:', e) import traceback traceback.print_exc() " # 5. TEST LIBRERIE DLOPEN echo -e "\n🔗 TEST DLOPEN LIBRERIE..." 
# Probe each critical CUDA/cuDNN shared object with dlopen() via ctypes so
# we see exactly which library fails to load (and why), independent of
# TensorFlow's own lazy loading.
python3 -c "
import ctypes
import os

# NOTE(review): setting LD_LIBRARY_PATH inside a running process does not
# affect the dynamic loader; harmless here since absolute paths are used.
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-11.8/lib64:/usr/lib64'

print('=== TEST DLOPEN LIBRERIE ===')

# Librerie critiche per il supporto GPU di TensorFlow
libraries = [
    '/usr/lib64/libcuda.so.1',
    '/usr/local/cuda-11.8/lib64/libcudart.so',
    '/usr/local/cuda-11.8/lib64/libcublas.so',
    '/usr/local/cuda-11.8/lib64/libcufft.so',
    '/usr/local/cuda-11.8/lib64/libcudnn.so'
]

for lib in libraries:
    try:
        handle = ctypes.CDLL(lib)
        print(f'✅ {lib}: OK')
    except Exception as e:
        print(f'❌ {lib}: {e}')
"

# 6. VERIFICA COMPATIBILITÀ VERSIONI
echo -e "\n🔄 VERIFICA COMPATIBILITÀ VERSIONI..."
echo "Compute Capability Tesla M60:"
nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits

echo -e "\nVersioni installate:"
echo "Driver NVIDIA: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits)"
# 2>/dev/null suppresses the noisy 'command not found' when nvcc is absent;
# the '|| echo N/A' fallback still reports it cleanly.
echo "CUDA Runtime: $(nvcc --version 2>/dev/null | grep 'release' || echo 'N/A')"

# Import moved INSIDE the try and bare 'except:' narrowed to Exception:
# if TensorFlow itself is broken (the very failure this script diagnoses),
# this probe must report it instead of exiting non-zero and killing the
# whole script under 'set -e'.
python3 -c "
try:
    import tensorflow as tf
    info = tf.sysconfig.get_build_info()
    print(f'TensorFlow CUDA: {info.get(\"cuda_version\", \"N/A\")}')
    print(f'TensorFlow cuDNN: {info.get(\"cudnn_version\", \"N/A\")}')
except Exception:
    print('TensorFlow build info non disponibile')
"

# 7. POSSIBILI SOLUZIONI
echo -e "\n💡 POSSIBILI SOLUZIONI..."
# Quoted heredoc: static text, no expansion wanted.
cat <<'EOF'
Basandoci sui risultati della diagnosi:

OPZIONE 1: TensorFlow più vecchio (2.10.x)
  • pip3 install tensorflow==2.10.1
  • Supporto garantito Tesla M60 CC 5.2

OPZIONE 2: TensorFlow con conda (librerie embedded)
  • conda install tensorflow-gpu=2.12
  • Include librerie CUDA ottimizzate

OPZIONE 3: CPU-only con ottimizzazioni
  • Uso esclusivo CPU con parallelizzazione
  • Performance comunque accettabili per DDoS Detection

OPZIONE 4: Build TensorFlow custom
  • Compilazione specifica per Tesla M60
  • Tempo richiesto: 2-3 ore
EOF

# 8. RACCOMANDAZIONE FINALE
echo -e "\n🎯 RACCOMANDAZIONE..."
# Count GPUs visible to TensorFlow; fall back to "0" when TF (or the GPU
# stack) is broken so the recommendation branch still runs under 'set -e'.
GPU_COUNT=$(python3 -c "import tensorflow as tf; print(len(tf.config.list_physical_devices('GPU')))" 2>/dev/null || echo "0")

if [[ "$GPU_COUNT" != "0" ]]; then
    echo "✅ GPU rilevata - sistema funzionante!"
else
    # Static recommendation text: quoted heredoc, no expansion.
    cat <<'EOF'
❌ GPU non rilevata - RACCOMANDAZIONE:

🚀 SOLUZIONE IMMEDIATA: TensorFlow 2.10.1
   pip3 uninstall tensorflow
   pip3 install tensorflow==2.10.1
   # TF 2.10.1 ha miglior supporto hardware legacy

📊 ALTERNATIVE:
   • CPU-only: Performance 75K record/sec (accettabile)
   • Upgrade hardware: GPU moderna (RTX/Tesla P100+)
   • Cloud GPU: AWS/GCP Tesla V100/A100
EOF
fi

printf '\n%s\n' "🔚 DIAGNOSI COMPLETATA"