#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load necessary modules and dependencies
source /etc/profile.d/modules.sh
module load gcc/11.2.0
module load openmpi/4.1.3
module load cuda/11.5/11.5.2
module load cudnn/8.3/8.3.3
module load nccl/2.11/2.11.4-1
module load python/3.10/3.10.4

# Activate the Python environment
source ~/venv/pytorch1.11+horovod/bin/activate

# Configure the log directory and clean up any existing records
OUTPUT_LOG_DIR="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf "${OUTPUT_LOG_DIR}"
mkdir -p "${OUTPUT_LOG_DIR}"

# Copy the dataset to node-local storage for processing
LOCAL_DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_PATH}"

# Switch to the working directory containing the training scripts
cd EdgeFLite || exit 1

# Run the training script with the specified federated learning settings.
# Note: bash does not allow trailing comments after a line-continuation
# backslash, so the per-flag notes are kept here instead:
#   --is_fed=1            enable federated learning mode
#   --fixed_cluster=0     use dynamic clustering
#   --split_factor=1      split factor for distributed computation
#   --num_clusters=20     number of clusters to create
#   --num_selected=20     number of selected clients per round
#   --arch                model architecture (Wide ResNet-16-8)
#   --dataset             dataset to use (CIFAR-10)
#   --num_classes=10      number of classes in the dataset
#   --is_single_branch=0  disable single-branch training mode
#   --is_amp=0            disable automatic mixed precision
#   --num_rounds=300      number of communication rounds
#   --fed_epochs=1        local epochs per client per round
#   --spid                unique ID for the experiment
#   --data                local path to the dataset
python run_gkt.py \
    --is_fed=1 \
    --fixed_cluster=0 \
    --split_factor=1 \
    --num_clusters=20 \
    --num_selected=20 \
    --arch="wide_resnet16_8" \
    --dataset="cifar10" \
    --num_classes=10 \
    --is_single_branch=0 \
    --is_amp=0 \
    --num_rounds=300 \
    --fed_epochs=1 \
    --spid="fedgkt_wrn168_split1_cifar10_20clients_20choose_300rounds" \
    --data="${LOCAL_DATA_PATH}"
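
# Optional post-run guard (a sketch added here, not part of the original
# script): surface a non-zero exit status in the scheduler logs instead of
# letting the batch job end as an apparent success. This assumes run_gkt.py
# follows the usual convention of exiting non-zero on failure.
status=$?
if [ "${status}" -ne 0 ]; then
    echo "run_gkt.py failed with exit status ${status}" >&2
    exit "${status}"
fi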
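
# Example submission (hypothetical): the script relies on scheduler-provided
# variables such as JOB_NAME, JOB_ID, and SGE_LOCALDIR, so it should be run
# through SGE rather than executed directly. The script name and flags below
# are placeholders; adjust them to your site's queue and resource policy:
#   qsub -cwd -N edgeflite_gkt run_edgeflite_gkt.sh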