# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load necessary modules and dependencies
source /etc/profile.d/modules.sh
# Load GCC 11.2.0
module load gcc/11.2.0
# Load OpenMPI 4.1.3 for distributed computing
module load openmpi/4.1.3
# Load CUDA 11.5 (11.5.2) for GPU acceleration
module load cuda/11.5/11.5.2
# Load cuDNN 8.3 (8.3.3) for deep learning operations
module load cudnn/8.3/8.3.3
# Load NCCL 2.11 (2.11.4-1) for multi-GPU communication
module load nccl/2.11/2.11.4-1
# Load Python 3.10 (3.10.4)
module load python/3.10/3.10.4

# Activate the Python virtual environment for PyTorch 1.11 + Horovod
source ~/venv/pytorch1.11+horovod/bin/activate

# Configure the output log directory and clean up any existing records
OUTPUT_LOG_DIR="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
# Remove any previous log files for this job
rm -rf "${OUTPUT_LOG_DIR}"
# Create a fresh directory for storing logs
mkdir -p "${OUTPUT_LOG_DIR}"

# Stage the dataset on node-local storage for faster access during training
LOCAL_DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"
# Copy the dataset files from the performance test directory to the local directory
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_PATH}"

# Switch to the working directory containing the EdgeFLite training scripts
cd EdgeFLite

# Run the federated learning training script with the specified settings:
#   --is_fed=1            enable federated learning
#   --fixed_cluster=0     disable fixed clusters
#   --split_factor=1      data split factor
#   --num_clusters=20     number of client clusters
#   --num_selected=20     number of clients selected per round
#   --arch                Wide ResNet 16-8 backbone
#   --dataset/--num_classes  CIFAR-10 with 10 classes
#   --is_single_branch=0  multi-branch training
#   --is_amp=0            disable automatic mixed precision (AMP)
#   --num_rounds=300      number of training rounds
#   --fed_epochs=1        federated learning epochs per round
#   --spid                session ID for this run
#   --data                path to the locally staged dataset
python run_gkt.py \
    --is_fed=1 \
    --fixed_cluster=0 \
    --split_factor=1 \
    --num_clusters=20 \
    --num_selected=20 \
    --arch="wide_resnet16_8" \
    --dataset="cifar10" \
    --num_classes=10 \
    --is_single_branch=0 \
    --is_amp=0 \
    --num_rounds=300 \
    --fed_epochs=1 \
    --spid="fedgkt_wrn168_split1_cifar10_20clients_20choose_300rounds" \
    --data="${LOCAL_DATA_PATH}"
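
# ---------------------------------------------------------------------------
# Example submission (a hedged sketch, not part of the original script):
# the script relies on SGE-provided variables (JOB_NAME, JOB_ID, SGE_LOCALDIR),
# so it is assumed to be submitted through an SGE-style scheduler such as
# qsub. The group name, walltime, and script filename below are placeholders,
# not values taken from this repository; adjust them to your cluster's policy.
#
#   qsub -g <your_group> -l h_rt=24:00:00 run_gkt_cifar10.sh
# ---------------------------------------------------------------------------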