# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Initialise the environment-modules system so the `module` command exists.
. /etc/profile.d/modules.sh

# Load the toolchain one module at a time, in this fixed order:
# GCC 11.2.0, OpenMPI 4.1.3, CUDA 11.5.2, cuDNN 8.3.3, NCCL 2.11.4-1,
# and Python 3.10.4.
for required_module in \
    gcc/11.2.0 \
    openmpi/4.1.3 \
    cuda/11.5/11.5.2 \
    cudnn/8.3/8.3.3 \
    nccl/2.11/2.11.4-1 \
    python/3.10/3.10.4
do
  module load "${required_module}"
done
unset required_module

# Switch into the virtual environment used for PyTorch and Horovod.
. ~/venv/pytorch1.11+horovod/bin/activate

# Stage the per-job log directory and a copy of the dataset in local storage,
# then move into the project directory.
#
# Required environment (presumably exported by the job scheduler - confirm):
#   JOB_NAME, JOB_ID - name the per-job log directory
#   SGE_LOCALDIR     - base directory for the local dataset copy
# Abort early when any of them is missing: otherwise the paths built below
# silently collapse to unintended locations (".../records/_" or "//").
: "${JOB_NAME:?JOB_NAME must be set}"
: "${JOB_ID:?JOB_ID must be set}"
: "${SGE_LOCALDIR:?SGE_LOCALDIR must be set}"

# Start from an empty log directory for this job. The ":?" guard makes the
# recursive delete refuse to run on an empty path.
LOG_OUTPUT="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf -- "${LOG_OUTPUT:?}"
mkdir -p -- "${LOG_OUTPUT}"

# Copy the CIFAR-100 data into local storage. The source path is relative to
# the directory the job starts in. Training must not start without its data,
# so a failed copy is fatal.
LOCAL_DATA_DIR="${SGE_LOCALDIR}/${JOB_ID}/"
if ! cp -r -- ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_DIR}"; then
  printf 'error: failed to copy dataset to %s\n' "${LOCAL_DATA_DIR}" >&2
  exit 1
fi

# Enter the EdgeFLite project directory; stop rather than launch the
# experiment from the wrong location.
if ! cd EdgeFLite; then
  printf 'error: cannot change directory to EdgeFLite\n' >&2
  exit 1
fi

# Run the federated learning experiment with the specified parameters.
#
# The arguments are collected in an array so that each flag can carry its own
# comment. A comment placed after a trailing backslash ("\  # ...") does not
# continue the line: the backslash escapes the following space instead of the
# newline, so the command would end after its first flag and every later flag
# line would be executed as a separate (failing) command.
train_args=(
  --is_fed=1                          # Enable federated learning
  --fixed_cluster=0                   # Disable fixed cluster settings
  --split_factor=1                    # Use split factor of 1
  --num_clusters=20                   # Set the number of clusters to 20
  --num_selected=20                   # Select 20 clients for each round
  --arch=resnet_model_110sl           # Model architecture identifier
  --dataset=cifar100                  # Use CIFAR-100 dataset
  --num_classes=100                   # Set the number of classes to 100
  --is_single_branch=0                # Single-branch flag turned off
  --is_amp=0                          # Disable automatic mixed precision
  --num_rounds=650                    # Number of communication rounds
  --fed_epochs=1                      # Number of federated epochs
  --cifar100_non_iid="quantity_skew"  # Non-IID partitioning (quantity skew)
  --spid="FGKT_R110_20c_skew"         # Experiment ID
  --data="${LOCAL_DATA_DIR}"          # Dataset path in local storage
)

python run_gkt.py "${train_args[@]}"