#!/usr/bin/env bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
#
# Batch job script: prepares the software environment, stages the CIFAR-100
# data on node-local storage, and launches run_gkt.py.
#
# Required environment variables (presumably provided by the job scheduler --
# confirm for your cluster):
#   JOB_NAME      - used to name the log directory
#   JOB_ID        - used to name the log directory and the local data directory
#   SGE_LOCALDIR  - root of the node-local scratch storage
#
# The relative paths below (../summit2024/... and EdgeFLite) are resolved
# against the directory the job is started from.

# Print an error message to stderr and abort the job.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Fail fast when a required variable is missing: with empty values the
# original paths would collapse to ".../records/_" and "//".
: "${JOB_NAME:?JOB_NAME must be set}"
: "${JOB_ID:?JOB_ID must be set}"
: "${SGE_LOCALDIR:?SGE_LOCALDIR must be set}"

# Load necessary system modules for the environment.
# Strict mode (set -eu) is deliberately not enabled globally because the
# sourced third-party scripts are not guaranteed to be compatible with it;
# critical steps are checked explicitly instead.
source /etc/profile.d/modules.sh || die "cannot source /etc/profile.d/modules.sh"

required_modules=(
  gcc/11.2.0          # GCC compiler
  openmpi/4.1.3       # OpenMPI
  cuda/11.5/11.5.2    # CUDA
  cudnn/8.3/8.3.3     # cuDNN
  nccl/2.11/2.11.4-1  # NCCL
  python/3.10/3.10.4  # Python
)
for module_name in "${required_modules[@]}"; do
  module load "${module_name}" || die "cannot load module ${module_name}"
done

# Activate the virtual environment for PyTorch and Horovod.
source ~/venv/pytorch1.11+horovod/bin/activate \
  || die "cannot activate the Python virtual environment"

# Set up the log directory and remove any previous log records.
LOG_OUTPUT="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf -- "${LOG_OUTPUT:?}"   # ':?' guards against ever expanding to an empty path
mkdir -p -- "${LOG_OUTPUT}" || die "cannot create log directory ${LOG_OUTPUT}"

# Prepare local storage for the dataset.
# NOTE(review): the destination is not created beforehand, so when it does not
# exist yet cp creates it as a copy of the data directory -- confirm that this
# is the layout run_gkt.py expects.
LOCAL_DATA_DIR="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r -- ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_DIR}" \
  || die "cannot copy the CIFAR-100 data to ${LOCAL_DATA_DIR}"

# Move to the project directory.
cd EdgeFLite || die "cannot change directory to EdgeFLite"

# Arguments for the experiment. They are collected in an array because a
# trailing "\ # comment" does not continue a command line: the backslash
# escapes the space, and the comment then terminates the command. An array
# allows one documented argument per line. The exact meaning of each flag is
# defined by run_gkt.py.
run_args=(
  --is_fed=1                          # Enable federated learning
  --fixed_cluster=0                   # Do not use fixed clusters
  --split_factor=1                    # Split factor
  --num_clusters=20                   # Number of clusters in the federation
  --num_selected=20                   # Number of selected clients per round
  --arch=resnet_model_110sl           # Model architecture identifier
  --dataset=cifar100                  # Dataset: CIFAR-100
  --num_classes=100                   # Number of classes in the dataset
  --is_single_branch=0                # Single-branch mode switched off
  --is_amp=0                          # Automatic mixed precision switched off
  --num_rounds=650                    # Total number of federated learning rounds
  --fed_epochs=1                      # Number of local epochs per round
  --cifar100_non_iid="quantity_skew"  # Non-IID scenario: quantity skew
  --spid="FGKT_R110_20c_skew"         # Experiment identifier
  --data="${LOCAL_DATA_DIR}"          # Path to the local dataset
)

# Run the federated learning experiment with the specified parameters.
python run_gkt.py "${run_args[@]}"