#!/usr/bin/env bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
#
# SGE job script: federated training with EdgeFLite (GKT).
# Expects the scheduler to provide: JOB_NAME, JOB_ID, SGE_LOCALDIR.

set -euo pipefail

# Load environment modules required for execution
# (compilers and deep-learning libraries).
source /etc/profile.d/modules.sh
module load gcc/11.2.0          # GCC compiler version 11.2.0
module load openmpi/4.1.3       # OpenMPI for distributed computing
module load cuda/11.5/11.5.2    # CUDA 11.5 for GPU acceleration
module load cudnn/8.3/8.3.3     # cuDNN for deep neural network operations
module load nccl/2.11/2.11.4-1  # NCCL for multi-GPU communication
module load python/3.10/3.10.4  # Python 3.10

# Activate the Python virtual environment (PyTorch 1.11 + Horovod).
source ~/venv/pytorch1.11+horovod/bin/activate

# Create and clean the log directory for this job.
# ${VAR:?} aborts if JOB_NAME/JOB_ID are unset, preventing rm -rf on a
# truncated path like ".../records/_".
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME:?}_${JOB_ID:?}"
rm -rf -- "${LOG_PATH}"   # Remove any pre-existing log directory
mkdir -p -- "${LOG_PATH}" # Create a fresh log directory

# Copy the dataset to node-local storage for faster access during training.
DATA_PATH="${SGE_LOCALDIR:?}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}"

# Change to the working directory containing the training scripts.
cd EdgeFLite

# Execute the federated training run.
# NOTE: comments must not follow a backslash line continuation — the
# original inline comments silently truncated the argument list, so the
# options are documented here instead:
#   --is_fed=1            enable federated learning mode
#   --fixed_cluster=0     dynamic clusters during training
#   --split_factor=1      data split factor for federated learning
#   --num_clusters=20     number of clusters for federated training
#   --num_selected=20     number of selected devices per round
#   --arch                model architecture (Wide ResNet variant)
#   --dataset=pill_base   dataset used for training
#   --num_classes=98      number of classes in the dataset
#   --is_single_branch=0  multi-branch model
#   --is_amp=0            disable automatic mixed precision
#   --num_rounds=350      total communication rounds
#   --fed_epochs=1        local epochs per device per round
#   --batch_size=32       training batch size
#   --crop_size=224       image crop size for preprocessing
#   --spid                unique identifier for this experiment
#   --data                path to the local dataset copy
python run_gkt.py \
  --is_fed=1 \
  --fixed_cluster=0 \
  --split_factor=1 \
  --num_clusters=20 \
  --num_selected=20 \
  --arch="wide_resnetsl50_2" \
  --dataset="pill_base" \
  --num_classes=98 \
  --is_single_branch=0 \
  --is_amp=0 \
  --num_rounds=350 \
  --fed_epochs=1 \
  --batch_size=32 \
  --crop_size=224 \
  --spid="FGKT_W502_20c_350r" \
  --data="${DATA_PATH}"