#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load the environment modules system required for execution
source /etc/profile.d/modules.sh

# Load the GCC compiler, version 11.2.0
module load gcc/11.2.0
# Load OpenMPI 4.1.3 for distributed computing
module load openmpi/4.1.3
# Load CUDA 11.5 (build 11.5.2) for GPU acceleration
module load cuda/11.5/11.5.2
# Load cuDNN 8.3 (build 8.3.3) for deep learning primitives
module load cudnn/8.3/8.3.3
# Load NCCL 2.11 (build 2.11.4-1) for multi-GPU communication
module load nccl/2.11/2.11.4-1
# Load Python 3.10 (build 3.10.4)
module load python/3.10/3.10.4

# Activate the Python virtual environment for PyTorch and Horovod
source ~/venv/pytorch1.11+horovod/bin/activate

# Create a clean log directory for this job
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
# Remove any existing log directory to avoid conflicts
rm -rf "${LOG_PATH}"
# Create a fresh log directory for the current job
mkdir -p "${LOG_PATH}"

# Prepare the local dataset storage
DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"
# Copy the dataset to node-local storage to improve I/O performance
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}"

# Change to the working directory containing the federated training scripts
cd EdgeFLite

# Execute the federated training process with the specified configuration:
#   --is_fed=1              enable federated training mode
#   --fixed_cluster=0       do not fix cluster assignments
#   --split_factor=1        set the model split factor to 1
#   --num_clusters=20       number of clusters used in federated training
#   --num_selected=20       number of clusters selected per round
#   --arch                  use the Wide ResNet-50-2 architecture
#   --dataset               dataset to use (Pill Base)
#   --num_classes=98        number of classes in the dataset
#   --is_single_branch=0    enable multi-branch training
#   --is_amp=0              disable automatic mixed precision training
#   --num_rounds=350        number of federated training rounds
#   --fed_epochs=1          number of local epochs per federated round
#   --batch_size=32         training batch size
#   --crop_size=224         image crop size
#   --spid                  unique session ID used for logging
#   --data                  path to the dataset
python run_gkt.py \
  --is_fed=1 \
  --fixed_cluster=0 \
  --split_factor=1 \
  --num_clusters=20 \
  --num_selected=20 \
  --arch="wide_resnetsl50_2" \
  --dataset="pill_base" \
  --num_classes=98 \
  --is_single_branch=0 \
  --is_amp=0 \
  --num_rounds=350 \
  --fed_epochs=1 \
  --batch_size=32 \
  --crop_size=224 \
  --spid="FGKT_W502_20c_350r" \
  --data="${DATA_PATH}"
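
# Usage sketch (an assumption, not part of the original script): the JOB_NAME, JOB_ID,
# and SGE_LOCALDIR variables referenced above are populated by the Grid Engine scheduler,
# so this script is intended to be submitted through qsub rather than run directly.
# The filename "train_edgeflite_gkt.sh" below is a hypothetical placeholder for this file;
# substitute the actual script name and any resource options required by your cluster.
#
#   qsub -cwd -N edgeflite_gkt train_edgeflite_gkt.sh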