# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load the environment-modules shell integration, then every toolchain module
# the job depends on. Abort early on failure so the job never continues with
# an incomplete software stack.
source /etc/profile.d/modules.sh || {
  echo "error: cannot source /etc/profile.d/modules.sh" >&2
  exit 1
}

required_modules=(
  gcc/11.2.0          # GCC compiler 11.2.0
  openmpi/4.1.3       # OpenMPI 4.1.3 for distributed computing
  cuda/11.5/11.5.2    # CUDA 11.5.2 for GPU acceleration
  cudnn/8.3/8.3.3     # cuDNN 8.3.3 for deep learning libraries
  nccl/2.11/2.11.4-1  # NCCL 2.11.4 for multi-GPU communication
  python/3.10/3.10.4  # Python 3.10.4
)

for module_name in "${required_modules[@]}"; do
  module load "${module_name}" || {
    echo "error: failed to load module ${module_name}" >&2
    exit 1
  }
done

# Activate the Python virtual environment (named for PyTorch 1.11 + Horovod).
# Abort if activation fails so later steps do not run under a different
# interpreter than intended.
source ~/venv/pytorch1.11+horovod/bin/activate || {
  echo "error: failed to activate ~/venv/pytorch1.11+horovod" >&2
  exit 1
}

# Define the per-job log directory (keyed by JOB_NAME and JOB_ID), discard any
# records left over from a previous run with the same key, and recreate it.
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"

# Quote the path and use `--` so unusual characters in JOB_NAME/JOB_ID cannot
# cause word-splitting, globbing, or option injection in a recursive delete.
rm -rf -- "${LOG_PATH}"
mkdir -p -- "${LOG_PATH}"

# Set up the local data directory and copy the dataset into it.
# SGE_LOCALDIR must be set: with it empty the destination would collapse to
# "//" and the recursive copy would target the filesystem root.
: "${SGE_LOCALDIR:?SGE_LOCALDIR must be set to the local data directory}"
DATA_STORAGE="${SGE_LOCALDIR}/${JOB_ID}/"

# NOTE(review): the source path is under cifar100/ while the training command
# in this script requests --dataset="cifar10" — confirm this directory holds
# the dataset the run expects.
cp -r -- ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_STORAGE}" || {
  echo "error: failed to copy dataset into ${DATA_STORAGE}" >&2
  exit 1
}

# Change into the EdgeFLite project directory, where the training script is
# expected to live. Abort if the directory is missing rather than running the
# following commands from the wrong location.
cd EdgeFLite || {
  echo "error: cannot change directory to EdgeFLite" >&2
  exit 1
}

# Execute the training script with the federated-learning parameters.
#
# The arguments are collected in an array so each one can carry its own
# comment. A trailing `# comment` after a backslash-continued argument ends
# the command at that line, and the remaining flags would then be run as
# separate commands instead of being passed to python.
#
# Flag descriptions are carried over from the original inline comments;
# confirm exact semantics against run_gkt.py's argument parser.
train_args=(
  --is_fed=1                         # Enable federated learning mode
  --fixed_cluster=0                  # Allow dynamic cluster selection
  --split_factor=1                   # Split factor for cluster selection
  --num_clusters=20                  # Number of clusters
  --num_selected=20                  # Number of clusters selected each round
  --arch="wide_resnet16_8"           # Wide ResNet16_8 architecture
  --dataset="cifar10"                # Dataset: CIFAR-10
  --num_classes=10                   # Number of classification classes
  --is_single_branch=0               # Use multiple branches (not single branch)
  --is_amp=0                         # Disable automatic mixed precision (AMP)
  --num_rounds=300                   # Number of federated learning rounds
  --fed_epochs=1                     # Epochs per federated round
  --cifar10_non_iid="quantity_skew"  # Non-IID split with quantity skew
  --spid="FGKT_W168_20c_skew"        # Run identifier used for tracking
  "--data=${DATA_STORAGE}"           # Local data storage path
)

python run_gkt.py "${train_args[@]}"