#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load the environment modules required for execution.
# This block sets up the compilers and the deep learning libraries the job depends on.
source /etc/profile.d/modules.sh
module load gcc/11.2.0          # GCC compiler, version 11.2.0
module load openmpi/4.1.3       # OpenMPI for distributed computing
module load cuda/11.5/11.5.2    # CUDA 11.5 for GPU acceleration
module load cudnn/8.3/8.3.3     # cuDNN for deep neural network primitives
module load nccl/2.11/2.11.4-1  # NCCL for multi-GPU communication
module load python/3.10/3.10.4  # Python 3.10
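
# Optional sanity check (an addition, not part of the original script): confirm
# that the toolchain the modules should provide is actually on PATH before the
# long training run starts. The expected version strings are assumptions.
command -v nvcc   >/dev/null || { echo "CUDA toolchain not found" >&2; exit 1; }
command -v mpirun >/dev/null || { echo "OpenMPI not found" >&2; exit 1; }
python3 --version  # expected to report Python 3.10.x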

# Activate the Python virtual environment that provides the required
# packages (e.g., PyTorch 1.11 and Horovod).
source ~/venv/pytorch1.11+horovod/bin/activate
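
# Optional verification (an addition, not in the original script): confirm the
# activated environment really provides PyTorch and Horovod before training.
python -c "import torch, horovod.torch" \
    || { echo "PyTorch/Horovod not importable in this venv" >&2; exit 1; }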

# Create a clean log directory for this job.
# All training logs for this specific job are stored here.
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf "${LOG_PATH}"    # Remove any pre-existing log directory
mkdir -p "${LOG_PATH}"  # Create a fresh log directory
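
# Optional (an assumption, not in the original script): mirror this script's
# own stdout/stderr into the job's log directory alongside the training logs.
exec > >(tee -a "${LOG_PATH}/job_${JOB_ID}.log") 2>&1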

# Stage the dataset in node-local storage for faster I/O during training.
DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}"
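
# Optional check (an addition, not in the original script): make sure the
# dataset actually landed on the node-local disk before training starts; the
# "data" subdirectory name matches the cp above.
[ -d "${DATA_PATH}/data" ] || { echo "Dataset copy failed" >&2; exit 1; }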

# Change to the working directory that contains the federated training scripts.
cd EdgeFLite || exit 1

# Execute the federated training process with the specified configuration.
# The flags are gathered in an array so each one can keep its inline comment;
# bash does not allow comments inside a backslash-continued command line.
TRAIN_ARGS=(
    --is_fed=1                    # Enable federated learning mode
    --fixed_cluster=0             # Re-form clusters dynamically during training
    --split_factor=1              # Data split factor for federated learning
    --num_clusters=20             # Number of clusters for federated training
    --num_selected=20             # Number of devices selected per round
    --arch="wide_resnetsl50_2"    # Model architecture (Wide ResNet-50-2 variant)
    --dataset="pill_base"         # Dataset used for training
    --num_classes=98              # Number of classes in the dataset
    --is_single_branch=0          # Use the multi-branch model
    --is_amp=0                    # Disable automatic mixed precision
    --num_rounds=350              # Total number of federated communication rounds
    --fed_epochs=1                # Number of local epochs per device
    --batch_size=32               # Batch size for training
    --crop_size=224               # Crop size for image preprocessing
    --spid="FGKT_W502_20c_350r"   # Unique identifier for this training experiment
    --data="${DATA_PATH}"         # Path to the dataset staged above
)
python run_gkt.py "${TRAIN_ARGS[@]}"
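
# Optional post-run step (an addition, not in the original script): record the
# exit status in the job's records so a failed run is easy to spot.
status=$?
echo "run_gkt.py exited with status ${status}" | tee "${LOG_PATH}/exit_status.txt"
exit "${status}"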