Weisen Pan 4ec0a23e73 Edge Federated Learning for Improved Training Efficiency
Change-Id: Ic4e43992e1674946cb69e0221659b0261259196c
2024-09-18 18:39:43 -07:00

69 lines
2.4 KiB
CSV

import pandas as pd
import os
from glob import glob
from PIL import Image
import torch
from sklearn.model_selection import train_test_split
import pickle
from torch.utils.data import Dataset, DataLoader
from torch import nn
# torchvision ships its image transforms as the `transforms` submodule; it is
# imported under the alias `apply_transformations` so that name stays bound
# for any code that refers to it.
from torchvision import transforms as apply_transformations
# Read the skin_dataset metadata table from CSV and preview its first rows
info_mapdata = pd.read_csv(
    filepath_or_buffer='dataset_hub/skin_dataset/data/skin_info_map.csv'
)
print(info_mapdata.head(n=5))
# Lookup table: lesion abbreviation (as used in the `dx` column) -> full name
lesion_labels = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)
# Index every .jpg found one directory level below the data folder by its
# file name without extension, so images from all sub-folders share one dict
image_paths = {
    os.path.splitext(os.path.split(jpg_file)[1])[0]: jpg_file
    for jpg_file in glob(
        os.path.join("dataset_hub/skin_dataset/data", '*', '*.jpg')
    )
}
# Mapping the image paths and cell types to the DataFrame.
# `dict.get` is used as the mapper, so an image_id or dx value that is absent
# from its lookup dict produces a missing value instead of raising.
info_mapdata['image_path'] = info_mapdata['image_id'].map(image_paths.get)
info_mapdata['cell_type'] = info_mapdata['dx'].map(lesion_labels.get)
# Encode each cell type as an integer class label. `Categorical.codes` holds
# the integer position of every value within the categories (-1 for missing).
info_mapdata['label'] = pd.Categorical(info_mapdata['cell_type']).codes
# Display the count of each cell type and their encoded labels
print(info_mapdata['cell_type'].value_counts())
print(info_mapdata['label'].value_counts())
# Custom Dataset class for PyTorch
class SkinDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a table.

    Args:
        dataframe: Table with an ``image_path`` column (path of an image file
            readable by PIL) and a ``label`` column (value convertible to a
            ``torch.long`` tensor).
        apply_transformation: Optional callable applied to the resized PIL
            image before it is returned.
    """

    def __init__(self, dataframe, apply_transformation=None):
        self.dataframe = dataframe
        self.apply_transformation = apply_transformation

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the 64x64 image and the label tensor at position ``idx``."""
        # Positional (`iloc`) lookup keeps `idx` consistent with `__len__`
        # even when the frame's index is not the default 0..n-1 range, e.g.
        # a split that was not passed through `reset_index`.
        image_file = self.dataframe['image_path'].iloc[idx]
        img = Image.open(image_file).resize((64, 64))
        label = torch.tensor(self.dataframe['label'].iloc[idx], dtype=torch.long)
        if self.apply_transformation:
            img = self.apply_transformation(img)
        return img, label
# Hold out 20% of the rows as the test set (fixed seed for repeatability) and
# give each resulting part a fresh 0-based index
train_data, test_data = (
    subset.reset_index(drop=True)
    for subset in train_test_split(info_mapdata, test_size=0.2, random_state=42)
)
# Persist both splits to disk as pickle files, train first, then test
for output_path, frame in (
    ("skin_dataset_train.pkl", train_data),
    ("skin_dataset_test.pkl", test_data),
):
    with open(output_path, "wb") as pkl_file:
        pickle.dump(frame, pkl_file)