#!/bin/bash
# =========================================================================
# FULL DIAGNOSIS: TESLA M60 + TENSORFLOW
# Identifies exactly what the problem is and which fix applies
# =========================================================================

# Abort on the first unexpected failure. Probes that are allowed to fail
# (missing libraries, absent tools) carry an explicit `|| echo ...` fallback.
set -e

echo "🔍 DIAGNOSI COMPLETA TESLA M60 + TENSORFLOW"
echo "==========================================="
|
# 1. VERIFICA DRIVER NVIDIA
|
|
echo "🎮 VERIFICA DRIVER NVIDIA..."
|
|
nvidia-smi --query-gpu=name,driver_version,compute_cap,memory.total --format=csv,noheader
|
|
|
|
echo -e "\n🔧 Verifica librerie driver..."
|
|
ls -la /usr/lib64/libcuda* || echo "❌ libcuda non trovato"
|
|
ls -la /usr/lib64/libnvidia* | head -5 || echo "❌ libnvidia non trovato"
|
|
|
|
# 2. VERIFICA INSTALLAZIONI CUDA
|
|
echo -e "\n📦 VERIFICA INSTALLAZIONI CUDA..."
|
|
echo "CUDA 12.2:"
|
|
ls -la /usr/local/cuda-12.2/lib64/libcuda* 2>/dev/null || echo "❌ CUDA 12.2 libcuda non trovato"
|
|
echo "CUDA 11.8:"
|
|
ls -la /usr/local/cuda-11.8/lib64/libcuda* 2>/dev/null || echo "❌ CUDA 11.8 libcuda non trovato"
|
|
|
|
# 3. VERIFICA cuDNN
|
|
echo -e "\n📚 VERIFICA cuDNN..."
|
|
echo "Sistema cuDNN:"
|
|
ls -la /usr/lib64/libcudnn* | head -3
|
|
echo "CUDA 11.8 cuDNN:"
|
|
ls -la /usr/local/cuda-11.8/lib64/libcudnn* | head -3
|
|
|
|
# 4. TEST TENSORFLOW DETTAGLIATO
|
|
echo -e "\n🧪 TEST TENSORFLOW DETTAGLIATO..."
|
|
|
|
# Abilita logging massimo per debug
|
|
export TF_CPP_MIN_LOG_LEVEL=0
|
|
export CUDA_VISIBLE_DEVICES=0
|
|
export CUDA_HOME=/usr/local/cuda-11.8
|
|
export LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64:/usr/lib64:$LD_LIBRARY_PATH
|
|
|
|
# Embedded Python: probe TensorFlow's view of the GPU in detail.
python3 -c "
import os
import sys

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'  # maximum logging
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['CUDA_HOME'] = '/usr/local/cuda-11.8'

print('=== TENSORFLOW DEBUG DETTAGLIATO ===')
print('Python version:', sys.version)
print('CUDA_HOME:', os.environ.get('CUDA_HOME'))
print('LD_LIBRARY_PATH:', os.environ.get('LD_LIBRARY_PATH'))

try:
    import tensorflow as tf
    print('\\n✅ TensorFlow importato:', tf.__version__)

    # Build info: the CUDA/cuDNN versions TF itself was compiled against.
    try:
        build_info = tf.sysconfig.get_build_info()
        print('\\n📋 TensorFlow Build Info:')
        for key, value in build_info.items():
            if 'cuda' in key.lower() or 'gpu' in key.lower():
                print(f'  {key}: {value}')
    except Exception as e:
        print('⚠️ Build info error:', e)

    # CUDA availability checks.
    # NOTE(review): tf.test.is_gpu_available() is deprecated in TF 2.x in
    # favour of tf.config.list_physical_devices('GPU'); kept here on purpose
    # because both answers are useful diagnostic output.
    print('\\n🔍 CUDA Tests:')
    print('is_built_with_cuda():', tf.test.is_built_with_cuda())
    print('is_gpu_available():', tf.test.is_gpu_available())
    print('is_built_with_gpu_support():', tf.test.is_built_with_gpu_support())

    # Every physical device TF can see (CPU included).
    physical_devices = tf.config.list_physical_devices()
    print('\\n📱 All Physical Devices:')
    for device in physical_devices:
        print(f'  {device}')

    # GPU-specific details.
    gpu_devices = tf.config.list_physical_devices('GPU')
    print(f'\\n🎮 GPU Devices: {len(gpu_devices)}')

    if gpu_devices:
        for i, gpu in enumerate(gpu_devices):
            print(f'  GPU {i}: {gpu}')
            try:
                details = tf.config.experimental.get_device_details(gpu)
                print(f'    Details: {details}')
            except Exception as e:
                print(f'    Details error: {e}')

except ImportError as e:
    print('❌ TensorFlow import failed:', e)
except Exception as e:
    print('❌ TensorFlow error:', e)
    import traceback
    traceback.print_exc()
"
# 5. DLOPEN LIBRARY TEST
echo -e "\n🔗 TEST DLOPEN LIBRERIE..."

python3 -c "
import ctypes
import os

# NOTE(review): setting LD_LIBRARY_PATH here has no effect on this
# already-running process (the dynamic loader reads it at process start);
# the absolute paths below make the probes work regardless.
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-11.8/lib64:/usr/lib64'

print('=== TEST DLOPEN LIBRERIE ===')

# Libraries that TensorFlow must be able to dlopen.
libraries = [
    '/usr/lib64/libcuda.so.1',
    '/usr/local/cuda-11.8/lib64/libcudart.so',
    '/usr/local/cuda-11.8/lib64/libcublas.so',
    '/usr/local/cuda-11.8/lib64/libcufft.so',
    '/usr/local/cuda-11.8/lib64/libcudnn.so'
]

for lib in libraries:
    try:
        ctypes.CDLL(lib)  # handle not needed; success/failure is the signal
        print(f'✅ {lib}: OK')
    except Exception as e:
        print(f'❌ {lib}: {e}')
"
# 6. VERSION COMPATIBILITY CHECK
echo -e "\n🔄 VERIFICA COMPATIBILITÀ VERSIONI..."

echo "Compute Capability Tesla M60:"
nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits

echo -e "\nVersioni installate:"
echo "Driver NVIDIA: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits)"
echo "CUDA Runtime: $(nvcc --version | grep 'release' || echo 'N/A')"

python3 -c "
import tensorflow as tf
try:
    info = tf.sysconfig.get_build_info()
    print(f'TensorFlow CUDA: {info.get(\"cuda_version\", \"N/A\")}')
    print(f'TensorFlow cuDNN: {info.get(\"cudnn_version\", \"N/A\")}')
except Exception:  # bare 'except:' would also swallow SystemExit/KeyboardInterrupt
    print('TensorFlow build info non disponibile')
"
# 7. POSSIBLE SOLUTIONS (static advice; the actual recommendation is in
#    section 8, driven by whether TF detected a GPU)
echo -e "\n💡 POSSIBILI SOLUZIONI..."

echo "Basandoci sui risultati della diagnosi:"
echo ""
echo "OPZIONE 1: TensorFlow più vecchio (2.10.x)"
echo "  • pip3 install tensorflow==2.10.1"
echo "  • Supporto garantito Tesla M60 CC 5.2"
echo ""
echo "OPZIONE 2: TensorFlow con conda (librerie embedded)"
echo "  • conda install tensorflow-gpu=2.12"
echo "  • Include librerie CUDA ottimizzate"
echo ""
echo "OPZIONE 3: CPU-only con ottimizzazioni"
echo "  • Uso esclusivo CPU con parallelizzazione"
echo "  • Performance comunque accettabili per DDoS Detection"
echo ""
echo "OPZIONE 4: Build TensorFlow custom"
echo "  • Compilazione specifica per Tesla M60"
echo "  • Tempo richiesto: 2-3 ore"
# 8. FINAL RECOMMENDATION
echo -e "\n🎯 RACCOMANDAZIONE..."

# Number of GPUs TensorFlow can see; falls back to "0" if python/TF fails
# (the `|| echo` inside the substitution keeps `set -e` from aborting).
GPU_COUNT=$(python3 -c "import tensorflow as tf; print(len(tf.config.list_physical_devices('GPU')))" 2>/dev/null || echo "0")

if [ "$GPU_COUNT" = "0" ]; then
  echo "❌ GPU non rilevata - RACCOMANDAZIONE:"
  echo ""
  echo "🚀 SOLUZIONE IMMEDIATA: TensorFlow 2.10.1"
  echo "   pip3 uninstall tensorflow"
  echo "   pip3 install tensorflow==2.10.1"
  echo "   # TF 2.10.1 ha miglior supporto hardware legacy"
  echo ""
  echo "📊 ALTERNATIVE:"
  echo "  • CPU-only: Performance 75K record/sec (accettabile)"
  echo "  • Upgrade hardware: GPU moderna (RTX/Tesla P100+)"
  echo "  • Cloud GPU: AWS/GCP Tesla V100/A100"
else
  echo "✅ GPU rilevata - sistema funzionante!"
fi

echo -e "\n🔚 DIAGNOSI COMPLETATA"