"""Prepare the skin-lesion dataset for training.

Reads the metadata CSV, attaches image paths and integer class labels,
defines a PyTorch ``Dataset`` over the resulting table, and writes an
80/20 train/test split to pickle files.
"""

import os
import pickle
from glob import glob

import pandas as pd
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
# NOTE(review): torchvision exposes `transforms`, not `apply_transformations`;
# the original import raised ImportError. Aliased to the old name so any
# existing reference to `apply_transformations` keeps resolving — confirm
# that `transforms` is the intended module.
from torchvision import transforms as apply_transformations

# Load the metadata table for the skin dataset.
info_mapdata = pd.read_csv('dataset_hub/skin_dataset/data/skin_info_map.csv')
print(info_mapdata.head())

# Map lesion abbreviations (the `dx` column) to their full names.
lesion_labels = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma',
}

# Index every JPEG found one directory level below the data folder by its
# file name without extension, so it can be joined on `image_id`.
image_paths = {
    os.path.splitext(os.path.basename(img))[0]: img
    for img in glob(os.path.join("dataset_hub/skin_dataset/data", '*', '*.jpg'))
}

# Attach the image path and the readable cell type to each metadata row.
# Ids or abbreviations without a match end up as missing values because
# `dict.get` returns None for unknown keys.
info_mapdata['image_path'] = info_mapdata['image_id'].map(image_paths.get)
info_mapdata['cell_type'] = info_mapdata['dx'].map(lesion_labels.get)
# `.codes` holds the integer code of each category (missing values -> -1).
info_mapdata['label'] = pd.Categorical(info_mapdata['cell_type']).codes

# Display the count of each cell type and of each encoded label.
print(info_mapdata['cell_type'].value_counts())
print(info_mapdata['label'].value_counts())


class SkinDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs from a metadata table.

    Args:
        dataframe: DataFrame with an ``image_path`` column (path of the
            image file) and a ``label`` column (integer class code).
        apply_transformation: Optional callable applied to the resized PIL
            image; when given, its return value replaces the image.
        image_size: Size passed to ``PIL.Image.Image.resize``
            (``(width, height)``); defaults to ``(64, 64)``.
    """

    def __init__(self, dataframe, apply_transformation=None, image_size=(64, 64)):
        self.dataframe = dataframe
        self.apply_transformation = apply_transformation
        self.image_size = image_size

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the resized (and optionally transformed) image and its label.

        Args:
            idx: Zero-based row position, ``0 <= idx < len(self)``.

        Returns:
            Tuple ``(img, label)`` where ``label`` is a ``torch.long`` scalar
            tensor and ``img`` is the resized PIL image, or whatever
            ``apply_transformation`` returns for it.
        """
        # Positional access: works for any DataFrame index, not only a
        # freshly reset 0..n-1 one, and matches what __len__ reports.
        image_path = self.dataframe['image_path'].iloc[idx]
        label_code = self.dataframe['label'].iloc[idx]

        # `resize` loads the pixel data and returns a new image, so the
        # source file can be closed as soon as the block exits.
        with Image.open(image_path) as source_image:
            img = source_image.resize(self.image_size)

        label = torch.tensor(label_code, dtype=torch.long)

        if self.apply_transformation:
            img = self.apply_transformation(img)

        return img, label


# Split the data into train and test sets (fixed seed for reproducibility).
train_data, test_data = train_test_split(info_mapdata, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Save the train and test data to pickle files.
with open("skin_dataset_train.pkl", "wb") as train_file:
    pickle.dump(train_data, train_file)

with open("skin_dataset_test.pkl", "wb") as test_file:
    pickle.dump(test_data, test_file)