#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load the environment modules required for execution.
# This block sets up the compilers and the deep learning libraries the job depends on.
source /etc/profile.d/modules.sh
module load gcc/11.2.0          # GCC compiler, version 11.2.0
module load openmpi/4.1.3       # OpenMPI for distributed computing
module load cuda/11.5/11.5.2    # CUDA 11.5 for GPU acceleration
module load cudnn/8.3/8.3.3     # cuDNN for deep neural network primitives
module load nccl/2.11/2.11.4-1  # NCCL for multi-GPU communication
module load python/3.10/3.10.4  # Python 3.10
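
# Optional sanity check (an addition, not part of the original script): confirm
# that the toolchain the modules should provide is actually on PATH before the
# long training run starts. The expected version strings are assumptions.
command -v nvcc   >/dev/null || { echo "CUDA toolchain not found" >&2; exit 1; }
command -v mpirun >/dev/null || { echo "OpenMPI not found" >&2; exit 1; }
python3 --version  # expected to report Python 3.10.x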

# Activate the Python virtual environment that provides the required
# packages (e.g., PyTorch 1.11 and Horovod).
source ~/venv/pytorch1.11+horovod/bin/activate
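
# Optional verification (an addition, not in the original script): confirm the
# activated environment really provides PyTorch and Horovod before training.
python -c "import torch, horovod.torch" \
    || { echo "PyTorch/Horovod not importable in this venv" >&2; exit 1; }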

# Create a clean log directory for this job.
# All training logs for this specific job are stored here.
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf "${LOG_PATH}"    # Remove any pre-existing log directory
mkdir -p "${LOG_PATH}"  # Create a fresh log directory
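
# Optional (an assumption, not in the original script): mirror this script's
# own stdout/stderr into the job's log directory alongside the training logs.
exec > >(tee -a "${LOG_PATH}/job_${JOB_ID}.log") 2>&1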

# Stage the dataset in node-local storage for faster I/O during training.
DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}"
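
# Optional check (an addition, not in the original script): make sure the
# dataset actually landed on the node-local disk before training starts; the
# "data" subdirectory name matches the cp above.
[ -d "${DATA_PATH}/data" ] || { echo "Dataset copy failed" >&2; exit 1; }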

# Change to the working directory that contains the federated training scripts.
cd EdgeFLite || exit 1

# Execute the federated training process with the specified configuration.
# The flags are gathered in an array so each one can keep its inline comment;
# bash does not allow comments inside a backslash-continued command line.
TRAIN_ARGS=(
    --is_fed=1                    # Enable federated learning mode
    --fixed_cluster=0             # Re-form clusters dynamically during training
    --split_factor=1              # Data split factor for federated learning
    --num_clusters=20             # Number of clusters for federated training
    --num_selected=20             # Number of devices selected per round
    --arch="wide_resnetsl50_2"    # Model architecture (Wide ResNet-50-2 variant)
    --dataset="pill_base"         # Dataset used for training
    --num_classes=98              # Number of classes in the dataset
    --is_single_branch=0          # Use the multi-branch model
    --is_amp=0                    # Disable automatic mixed precision
    --num_rounds=350              # Total number of federated communication rounds
    --fed_epochs=1                # Number of local epochs per device
    --batch_size=32               # Batch size for training
    --crop_size=224               # Crop size for image preprocessing
    --spid="FGKT_W502_20c_350r"   # Unique identifier for this training experiment
    --data="${DATA_PATH}"         # Path to the dataset staged above
)
python run_gkt.py "${TRAIN_ARGS[@]}"
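
# Optional post-run step (an addition, not in the original script): record the
# exit status in the job's records so a failed run is easy to spot.
status=$?
echo "run_gkt.py exited with status ${status}" | tee "${LOG_PATH}/exit_status.txt"
exit "${status}"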