PyTorch/TensorFlow完整环境配置与优化指南
本手册介绍如何在Linux服务器上搭建完整的深度学习开发环境,包括NVIDIA驱动、CUDA、cuDNN、Python环境以及主流深度学习框架(PyTorch和TensorFlow)的安装配置。
# Update the package index and upgrade installed packages
sudo apt update && sudo apt upgrade -y
# Build toolchain, DKMS and kernel headers — required to compile the
# NVIDIA kernel modules against the currently running kernel.
# Quote the command substitution so the package name is a single word.
sudo apt install -y build-essential dkms "linux-headers-$(uname -r)"
# Common command-line utilities used throughout this guide
sudo apt install -y wget curl git vim htop tree
# Disable the open-source nouveau driver so it cannot conflict with the
# proprietary NVIDIA driver. One here-doc writes both lines (equivalent
# to the usual pair of echo | tee / tee -a commands).
sudo tee /etc/modprobe.d/blacklist-nouveau.conf <<'EOF'
blacklist nouveau
options nouveau modeset=0
EOF
# Rebuild the initramfs so the blacklist is honored at early boot
sudo update-initramfs -u
# Reboot for the change to take effect
sudo reboot
lsmod | grep nouveau  # 重启后验证:此命令应无任何输出,说明nouveau已被成功禁用。
# List the driver versions Ubuntu recommends for the detected GPU
ubuntu-drivers devices
# Install the recommended driver automatically
sudo ubuntu-drivers autoinstall
# Alternatively, install a specific version manually (535 shown here).
# The version is factored into variables so only one line changes when
# targeting a different release.
nv_ver="535.154.05"
nv_run="NVIDIA-Linux-x86_64-${nv_ver}.run"
wget "https://us.download.nvidia.com/XFree86/Linux-x86_64/${nv_ver}/${nv_run}"
chmod +x "$nv_run"
sudo "./${nv_run}"
# Show GPU status: driver version, reported CUDA version, temperature,
# memory usage and utilization. If this prints the table below, the
# driver is installed and loaded correctly.
nvidia-smi
# Example output
# +---------------------------------------------------------------------------------------+
# | NVIDIA-SMI 535.154.05 Driver Version: 535.154.05 CUDA Version: 12.2 |
# |-----------------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |=========================================+======================+======================|
# | 0 NVIDIA GeForce RTX 3090 Off | 00000000:01:00.0 Off | N/A |
# | 0% 35C P8 25W / 350W | 0MiB / 24576MiB | 0% Default |
# +-----------------------------------------+----------------------+----------------------+
# Download CUDA 12.1 (recommended version for the PyTorch builds below)
wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
# Run the installer (deselect the bundled driver if one is already installed)
sudo sh cuda_12.1.0_530.30.02_linux.run
# Add CUDA to the environment system-wide via /etc/profile.d — note these
# commands write a profile.d script (applies to all login shells), not ~/.bashrc
echo 'export PATH=/usr/local/cuda/bin:$PATH' | sudo tee /etc/profile.d/cuda.sh
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' | sudo tee -a /etc/profile.d/cuda.sh
# Apply in the current shell
source /etc/profile.d/cuda.sh
# Verify the toolkit is on PATH
nvcc --version
# Expected: Cuda compilation tools, release 12.1, V12.1.105
从NVIDIA官网下载对应CUDA版本的cuDNN(需要开发者账号):
# Download cuDNN 8.9 for CUDA 12.x (requires an NVIDIA developer account)
# File: cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz
# Extract the archive
tar -xvf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz
# Copy headers and libraries into the CUDA installation.
# Guard the cd: if it fails, the cp commands below would otherwise copy
# the wrong files (or fail confusingly) from the current directory.
cd cudnn-linux-x86_64-8.9.7.29_cuda12-archive || exit 1
sudo cp include/cudnn*.h /usr/local/cuda/include/
sudo cp lib/libcudnn* /usr/local/cuda/lib64/
sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*
# Check the installed cuDNN version — grep reads the file directly,
# no cat pipeline needed
grep -A 2 CUDNN_MAJOR /usr/local/cuda/include/cudnn_version.h
# Example output
# #define CUDNN_MAJOR 8
# #define CUDNN_MINOR 9
# #define CUDNN_PATCHLEVEL 7
# Fetch the Miniconda installer
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# Run the installer interactively
bash Miniconda3-latest-Linux-x86_64.sh
# Hook conda into the bash startup files and reload them
~/miniconda3/bin/conda init bash
source ~/.bashrc
# Register the Tsinghua mirrors (for users in mainland China). The loop
# adds them in the same order as before — conda gives the most recently
# added channel the highest priority, so ordering matters.
for channel in \
  https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ \
  https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ \
  https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/; do
  conda config --add channels "$channel"
done
conda config --set show_channel_urls yes
# Bring conda itself up to date
conda update -n base -c defaults conda
# Create a dedicated Python 3.10 environment for PyTorch
conda create -n pytorch python=3.10 -y
# Activate it
conda activate pytorch
# Install PyTorch built against CUDA 12.1 — conda route
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
# Alternative pip route (pick ONE of the two install commands)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Verify the PyTorch install: version, CUDA availability, GPU name/memory.
# The indented lines under `if` are required — without the indentation the
# snippet is a Python SyntaxError.
python -c "
import torch
print(f'PyTorch版本: {torch.__version__}')
print(f'CUDA可用: {torch.cuda.is_available()}')
print(f'CUDA版本: {torch.version.cuda}')
print(f'GPU数量: {torch.cuda.device_count()}')
if torch.cuda.is_available():
    print(f'GPU型号: {torch.cuda.get_device_name(0)}')
    print(f'GPU显存: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB')
"
# Example output
# PyTorch版本: 2.1.0+cu121
# CUDA可用: True
# CUDA版本: 12.1
# GPU数量: 1
# GPU型号: NVIDIA GeForce RTX 3090
# GPU显存: 24.00 GB
# Quick GPU benchmark: time one large matmul and estimate TFLOPS.
# The loop body must be indented — the original snippet had lost its
# indentation and would not run.
python -c "
import torch
import time
# Allocate input matrices directly on the GPU
a = torch.randn(10000, 10000).cuda()
b = torch.randn(10000, 10000).cuda()
# Warm-up: triggers cuBLAS init and kernel selection
for _ in range(10):
    c = torch.matmul(a, b)
torch.cuda.synchronize()
# Timed run — synchronize before and after because CUDA kernels launch
# asynchronously and time.time() alone would measure only the launch
torch.cuda.synchronize()
start = time.time()
c = torch.matmul(a, b)
torch.cuda.synchronize()
elapsed = time.time() - start
print(f'矩阵乘法耗时: {elapsed:.4f}秒')
# An n x n matmul costs about 2*n^3 floating-point operations
print(f'计算速度: {2 * 10000**3 / elapsed / 1e12:.2f} TFLOPS')
"
# Example output
# 矩阵乘法耗时: 0.5234秒
# 计算速度: 3.82 TFLOPS
# Create a separate environment for TensorFlow (keeps it isolated from PyTorch)
conda create -n tensorflow python=3.10 -y
conda activate tensorflow
# Install the GPU-enabled TensorFlow build (pip bundles the CUDA libraries)
pip install tensorflow[and-cuda]
# Alternative: conda-forge build (pick ONE of the two install commands)
conda install -c conda-forge tensorflow-gpu
# Verify the TensorFlow install. tf.test.is_gpu_available() has been
# deprecated since TF 2.1 — query tf.config.list_physical_devices('GPU')
# instead; its truthiness gives the same True/False answer.
python -c "
import tensorflow as tf
print(f'TensorFlow版本: {tf.__version__}')
print(f'CUDA可用: {tf.test.is_built_with_cuda()}')
print(f'GPU可用: {bool(tf.config.list_physical_devices(\"GPU\"))}')
print(f'GPU列表: {tf.config.list_physical_devices(\"GPU\")}')
"
# Example output
# TensorFlow版本: 2.15.0
# CUDA可用: True
# GPU可用: True
# GPU列表: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
# Install JupyterLab
conda install -c conda-forge jupyterlab
# Set an access password
jupyter lab password
# Generate the config file
jupyter lab --generate-config
# Edit the config file
vim ~/.jupyter/jupyter_lab_config.py
# Add the settings below.
# NOTE: jupyter_lab_config.py (JupyterLab 3+) is read by jupyter-server,
# so the section is c.ServerApp — c.NotebookApp.* belongs to the legacy
# jupyter_notebook_config.py and is silently ignored here.
c.ServerApp.ip = '0.0.0.0'
c.ServerApp.port = 8888
c.ServerApp.open_browser = False
c.ServerApp.allow_root = True
# Data science stack
pip install numpy pandas scipy scikit-learn matplotlib seaborn
# Deep-learning helpers (Hugging Face ecosystem)
pip install transformers datasets accelerate
# Computer vision
pip install opencv-python pillow scikit-image
# Natural language processing
pip install nltk spacy
# Model deployment
# NOTE(review): the `tensorrt` pip package requires a matching NVIDIA
# CUDA setup — verify against NVIDIA's support matrix before relying on it
pip install onnx onnxruntime tensorrt
# Install VS Code Server (code-server)
# SECURITY: this pipes a remote script straight into sh; on sensitive
# hosts download and review the script before executing it
curl -fsSL https://code-server.dev/install.sh | sh
# Start the server: password auth, listening on all interfaces (port 8080)
code-server --bind-addr 0.0.0.0:8080 --auth password
# Enable TF32 (Ampere and newer GPUs): faster matmul/conv kernels at
# slightly reduced precision
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Enable cuDNN autotuning — best when input shapes are fixed
torch.backends.cudnn.benchmark = True
# Mixed-precision training.
# torch.cuda.amp.autocast/GradScaler are deprecated (PyTorch 2.4+); the
# device-generic torch.amp API with an explicit 'cuda' argument is the
# supported replacement and behaves identically on GPU.
from torch.amp import autocast, GradScaler
scaler = GradScaler('cuda')
for data, target in dataloader:
    optimizer.zero_grad()
    with autocast('cuda'):
        output = model(data)
        loss = criterion(output, target)
    # Scale the loss so fp16 gradients do not underflow, then unscale
    # inside step(); update() adapts the scale factor
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
# Gradient accumulation: emulate a larger batch size by stepping the
# optimizer only every `accumulation_steps` mini-batches.
accumulation_steps = 4
optimizer.zero_grad()
for i, (data, target) in enumerate(dataloader):
    output = model(data)
    # Divide so the accumulated gradient equals the average over the
    # virtual batch
    loss = criterion(output, target) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Flush the final partial accumulation when len(dataloader) is not a
# multiple of accumulation_steps (otherwise those gradients are dropped)
if (i + 1) % accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad()

# Gradient checkpointing: trade compute for memory — activations are
# recomputed during backward instead of being stored.
from torch.utils.checkpoint import checkpoint
class CheckpointedModel(nn.Module):
    def forward(self, x):
        # use_reentrant=False is the recommended (and future default)
        # checkpointing implementation
        return checkpoint(self.layer, x, use_reentrant=False)
# Check which CUDA version PyTorch was compiled against
python -c "import torch; print(torch.version.cuda)"
# Check the system CUDA toolkit version
nvcc --version
# If the two differ, reinstall a PyTorch build matching the system CUDA —
# for example, with system CUDA 11.8:
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
# Release cached blocks held by PyTorch's CUDA caching allocator (run
# inside Python; does NOT free memory still referenced by live tensors)
import torch
torch.cuda.empty_cache()
# Inspect current GPU memory usage
print(f'已分配: {torch.cuda.memory_allocated() / 1024**3:.2f} GB')
print(f'保留: {torch.cuda.memory_reserved() / 1024**3:.2f} GB')
# Fused AdamW: performs the optimizer step in fused CUDA kernels.
# NOTE(review): fused=True requires the model parameters to be CUDA
# tensors — confirm the model is on GPU before using it
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5, fused=True)
# DataParallel: single-process multi-GPU — simple to enable but slower
# (replicates the model each step and is bottlenecked by one process)
model = nn.DataParallel(model)
# DistributedDataParallel (recommended): one process per GPU
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
# Initialize the process group ('nccl' backend for NVIDIA GPUs)
dist.init_process_group(backend='nccl')
# Wrap the model.
# NOTE(review): local_rank is not defined in this snippet — it must come
# from the launcher (e.g. torchrun sets the LOCAL_RANK env var)
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
# Export the full conda environment specification (for reproducing it)
conda env export > environment.yml
# Export pip-installed packages with pinned versions
pip freeze > requirements.txt
# Recreate the environment on another machine from the conda file
conda env create -f environment.yml
# Install pinned packages from the pip file
pip install -r requirements.txt