Source code for MEDiml.learning.DesignExperiment

import platform
import re
from itertools import combinations, product
from pathlib import Path
from typing import Dict, List

import pandas as pd
import yaml

from ..utils.get_institutions_from_ids import get_institutions_from_ids
from ..utils.json_utils import load_json, posix_to_string, save_json
from .ml_utils import cross_validation_split, get_stratified_splits


class DesignExperiment:
    """
    Designs a machine learning experiment from a single YAML master configuration.

    The class reads the master configuration, resolves the paths of the feature
    tables of every variable, saves the resulting options dictionary in the study
    folder and creates one folder per train/test run, each holding the json files
    describing that run.
    """

    def __init__(self, path_study: Path, path_settings: Path, experiment_label: str) -> None:
        """
        Constructor of the class DesignExperiment.

        Args:
            path_study (Path): Path to the main study folder where the outcomes,
                learning patients and holdout patients dictionaries are found.
            path_settings (Path): Path to the settings file (YAML master configuration).
            experiment_label (str): String specifying the label to attach to a given learning
                experiment. This label is attached to the saved options json file as well as
                to the learn__$experiment_label$ folder. It is used to keep track of different
                experiments with different settings (e.g. radiomics, scans, machine learning
                algorithms, etc.).

        Returns:
            None
        """
        self.path_study = Path(path_study)
        self.path_settings = Path(path_settings)
        self.experiment_label = str(experiment_label)
        # Set by generate_experiment() once the options dictionary has been saved
        self.path_ml_object = None

    def __create_folder_and_content(
            self,
            path_learn: Path,
            run_name: str,
            patients_train: List,
            patients_test: List,
            ml_path: List
    ) -> List:
        """
        Creates the json files needed for a given run.

        Args:
            path_learn (Path): Path to the main learning folder containing information
                about the training and test sets.
            run_name (str): Name of the run (used as the name of the run folder).
            patients_train (List): List of patients in the training set.
            patients_test (List): List of patients in the test set.
            ml_path (List): List of the ``paths_ml.json`` files created so far. The path of
                the file created for this run is appended to it (in place).

        Returns:
            List: The same ``ml_path`` list, with the path of this run appended.
        """
        path_run = path_learn / run_name
        path_run.mkdir(exist_ok=True)

        # Train/test patients are saved sorted
        path_train = path_run / 'patientsTrain.json'
        path_test = path_run / 'patientsTest.json'
        save_json(path_train, sorted(patients_train))
        save_json(path_test, sorted(patients_test))

        # Index of every file related to this run
        paths_ml = {
            'patientsTrain': path_train,
            'patientsTest': path_test,
            'outcomes': self.path_study / 'outcomes.csv',
            'ml': self.path_ml_object,
            'results': path_run / 'run_results.json',
        }
        path_file = path_run / 'paths_ml.json'
        ml_path.append(path_file)
        save_json(path_file, posix_to_string(paths_ml))

        return ml_path

    def __load_config(self) -> Dict:
        """
        Loads the YAML master configuration file.

        Returns:
            Dict: Parsed content of the YAML file (``None`` if the file is empty).
        """
        # Explicit encoding so that parsing does not depend on the platform locale
        with open(self.path_settings, 'r', encoding='utf-8') as file:
            return yaml.safe_load(file)

    def __get_learning_dict(self) -> Dict:
        """
        Generates a dictionary containing all settings for the learning experiment
        using a single YAML master configuration.

        Returns:
            Dict: Dictionary of the experiment options (run metadata plus the sections
                of the master configuration).

        Raises:
            FileNotFoundError: If the master configuration file does not exist.
            ValueError: If the experiment label is empty or if the master configuration
                is not a mapping (e.g. empty file).
        """
        # Safety check: verify that the master config exists
        if not self.path_settings.exists():
            raise FileNotFoundError(
                f"Master configuration file not found at: {self.path_settings}. "
                "Please ensure the consolidated YAML is in the settings folder."
            )

        # Experiment label safety check, done before any parsing work
        if not self.experiment_label:
            raise ValueError("Experiment label is empty. Class was not initialized properly.")

        # Load the master config
        config = self.__load_config()
        if not isinstance(config, dict):
            raise ValueError(
                f"Master configuration file {self.path_settings} is empty or is not a mapping."
            )

        # Assemble the experiment metadata: run-specific info plus the sections
        # of the YAML mapped to the keys used downstream
        return {
            'os': platform.system(),
            'experiment_label': self.experiment_label,
            'config_source': str(self.path_settings),
            'design': config.get('design'),
            'variables': config.get('variables'),
            'datacleaning': config.get('data_cleaning'),
            'fSetReduction': config.get('feature_reduction'),
            'normalization': config.get('normalization'),
            'modeling': config.get('modeling'),
            'study_metadata': config.get('study_metadata')
        }

    def __get_radiomics_paths(self, var_struct: Dict) -> Dict:
        """
        Builds the dictionary of paths to the radiomics tables of a radiomics variable.

        Args:
            var_struct (Dict): Options of the radiomics variable.

        Returns:
            Dict: Dictionary mapping 'radTab1', 'radTab2', ... to the 'csv', 'txt' and
                'type' paths of each (scan, roi, image space) combination.

        Raises:
            ValueError: If the workspace path keyword is malformed.
            FileNotFoundError: If a csv or txt radiomics table does not exist.
        """
        path_setting = var_struct['path']
        if 'settofeatures' in path_setting.lower():
            # Radiomics features are in a folder of the workspace (study folder)
            workspace_match = re.match(r"setTo(.*)inWorkspace", path_setting)
            if workspace_match is None:
                raise ValueError(
                    f"Cannot parse the features folder from '{path_setting}'. "
                    "Expected the form 'setTo<folder>inWorkspace'."
                )
            path_features = self.path_study / workspace_match.group(1)
        else:
            # Path provided by the user: a string in the YAML, so it must be converted
            path_features = Path(path_setting)

        rois = var_struct['rois']  # list of roi labels
        if var_struct.get('use_combinations', False):
            # Each scan comes with its own list of image spaces
            all_combinations = []
            for scan, im_spaces in var_struct['combinations'].items():
                all_combinations += list(product([scan], rois, list(im_spaces)))
        else:
            # Every scan is combined with every image space (filtered and original)
            all_combinations = list(product(var_struct['scans'], rois, var_struct['imSpaces']))

        # Paths to all the radiomics features (csv and txt files)
        paths = {}
        for tab_number, (scan, roi, im_space) in enumerate(all_combinations, start=1):
            table_type = f"{scan}({roi})__{im_space}"
            table_name = 'radiomics__' + table_type
            rad_tab = {
                'csv': path_features / (table_name + '.csv'),
                'txt': path_features / (table_name + '.txt'),
                'type': path_features / table_type,
            }
            for extension in ('csv', 'txt'):
                if not rad_tab[extension].exists():
                    raise FileNotFoundError(f"File {rad_tab[extension]} does not exist.")
            paths[f'radTab{tab_number}'] = rad_tab

        return paths

    def __fill_learner_dict(self) -> Path:
        """
        Fills the main experiment dictionary from the settings of the master configuration
        and saves it in the study folder. This main dictionary holds all the settings for
        the data processing and learning experiment.

        Returns:
            Path: Path to the saved options json file.
        """
        # The experiment label is validated in __get_learning_dict()
        ml = self.__get_learning_dict()

        # Machine learning variables (the section may be absent from the YAML)
        var_options = ml.get('variables')
        if var_options:
            var_names = [name for name in var_options if re.match(r"^var[0-9]{1,}$", name)]

            # For each variable, organize the options in the ML dictionary
            for var_name in var_names:
                var_struct = var_options[var_name]

                if 'radiomics' in var_struct['nameType'].lower():
                    # Radiomics variables: one entry per radiomics table
                    var_struct['path'] = self.__get_radiomics_paths(var_struct)
                elif not var_struct.get('path'):
                    # Clinical or other variables (e.g. volume): path to the csv file of features
                    if var_options['pathCSV'] == 'setToCSVinWorkspace':
                        path_csv = self.path_study / 'CSV'
                    else:
                        path_csv = Path(var_options['pathCSV'])
                    var_struct['path'] = path_csv / var_struct['nameFile']

            # Combinations of variables
            if var_options.get('combinations') == ['all']:
                combs = [
                    comb
                    for size in range(1, len(var_names) + 1)
                    for comb in combinations(var_names, size)
                ]
                var_options['combinations'] = ['_'.join(comb) for comb in combs]

        # Save the ML dictionary (all paths converted to strings)
        path_ml_object = self.path_study / f'ml_test__{self.experiment_label}.json'
        save_json(path_ml_object, posix_to_string(ml))

        return path_ml_object

    @staticmethod
    def __label_from_ml_path(path_ml_object: Path) -> str:
        """
        Extracts the experiment label from the name of the options json file.

        The label is what follows the first double underscore of the file stem, so both
        ``ml__$label$.json`` and ``ml_test__$label$.json`` give ``$label$``.

        Args:
            path_ml_object (Path): Path to the options json file.

        Returns:
            str: The experiment label.
        """
        return Path(path_ml_object).stem.split('__', 1)[-1]

    @staticmethod
    def __as_method_list(active_method) -> List:
        """
        Returns the active test set types as a list.

        Args:
            active_method (Union[str, List]): A single type or a list of types.

        Returns:
            List: List of test set types (a lone string is wrapped, not iterated
                character by character).
        """
        if isinstance(active_method, str):
            return [active_method]
        return list(active_method)

    def __create_numbered_runs(
            self,
            path_learn: Path,
            patients_train,
            patients_test,
            n_runs: int,
            ml_path: List
    ) -> List:
        """
        Creates one ``test__001``, ``test__002``, ... folder per split.

        Args:
            path_learn (Path): Path to the main learning folder.
            patients_train: Training sets, one per split.
            patients_test: Test sets, one per split.
            n_runs (int): Number of splits.
            ml_path (List): List of the ``paths_ml.json`` files created so far.

        Returns:
            List: ``ml_path`` with the paths of the new runs appended.
        """
        # If the splits are not in a sequence, wrap them
        if not hasattr(patients_train, "__len__"):
            patients_train = [patients_train]
            patients_test = [patients_test]

        for i in range(n_runs):
            run_name = "test__{0:03}".format(i + 1)
            ml_path = self.__create_folder_and_content(
                path_learn, run_name, patients_train[i], patients_test[i], ml_path
            )
        return ml_path

    def create_experiment(self) -> Dict:
        """
        Creates the machine learning experiment dictionary and organizes each test/split
        information in a separate folder.

        The options are read from the json file referenced by ``self.path_ml_object``.

        Returns:
            Dict: Dictionary mapping 'run1', 'run2', ... to the ``paths_ml.json`` file
                of each run.

        Raises:
            ValueError: If ``self.path_ml_object`` is not set, if a type of test set is
                not recognized or if the random split method is not supported.
        """
        if self.path_ml_object is None:
            raise ValueError(
                "The path to the ML options file is not set. Use generate_experiment() "
                "or set 'path_ml_object' before calling create_experiment()."
            )

        # Initialization
        ml_path = []
        ml = load_json(self.path_ml_object)

        # Learning set (a set makes the membership tests below O(1))
        patients_learn = set(load_json(self.path_study / 'patientsLearn.json'))

        # Outcomes table: keep only the patients that are also in the learning set
        outcomes_table = pd.read_csv(self.path_study / 'outcomes.csv', index_col=0)
        patients_to_keep = [
            patient for patient in outcomes_table.index.values.tolist()
            if patient in patients_learn
        ]
        outcomes_table = outcomes_table.loc[patients_to_keep]

        # Get the experiment label (from the name of the options file if not set)
        experiment_label = self.experiment_label or self.__label_from_ml_path(self.path_ml_object)

        # Create the folder for the training and testing sets (machine learning) information
        path_learn = self.path_study / ('learn__' + experiment_label)
        path_learn.mkdir(exist_ok=True)

        # Creating the sets for the different machine learning runs
        # NOTE(review): 'random' and 'cv' runs share the test__NNN folder names, so
        # activating both in one experiment makes them write to the same folders.
        design = ml['design']
        for type_set in self.__as_method_list(design['active_method']):
            set_kind = type_set.lower()

            # Random splits
            if set_kind == 'random':
                random_info = design[type_set]
                method = random_info['method']
                n_splits = random_info['nSplits']
                stratify_institutions = random_info['stratifyInstitutions']
                test_proportion = random_info['testProportion']
                seed = random_info['seed']

                if method != 'SubSampling':
                    raise ValueError(
                        f"Random split method '{method}' is not supported. Must be 'SubSampling'."
                    )

                patients_train, patients_test = get_stratified_splits(
                    outcomes_table, n_splits, test_proportion, seed, stratify_institutions
                )
                ml_path = self.__create_numbered_runs(
                    path_learn, patients_train, patients_test, n_splits, ml_path
                )

            # Institutions-based splits
            elif set_kind == 'institutions':
                patient_ids = pd.Series(outcomes_table.index)
                institution_cat_vector = get_institutions_from_ids(patient_ids)
                # Sorted so that the order (and numbering) of the runs is reproducible
                institution_cats = sorted(set(institution_cat_vector), key=str)
                n_institution = len(institution_cats)

                # The 'institutions' type only makes sense with more than one institution:
                # each institution is used in turn as the test set
                if n_institution > 1:
                    for cat in institution_cats:
                        patients_train = [elt for elt in patient_ids if cat not in elt]
                        patients_test = [elt for elt in patient_ids if cat in elt]
                        ml_path = self.__create_folder_and_content(
                            path_learn, f"test__{cat}", patients_train, patients_test, ml_path
                        )

                # Extra run: train on the largest institution, test on all the others
                if n_institution > 2:
                    size_inst = [
                        sum(1 for elt in institution_cat_vector if cat in elt)
                        for cat in institution_cats
                    ]
                    ind_max = size_inst.index(max(size_inst))
                    largest_cat = institution_cats[ind_max]
                    str_test = [cat for i, cat in enumerate(institution_cats) if i != ind_max]
                    patients_train = [elt for elt in patient_ids if largest_cat in elt]
                    patients_test = [elt for elt in patient_ids if largest_cat not in elt]
                    ml_path = self.__create_folder_and_content(
                        path_learn,
                        f"test__{'_'.join(str_test)}",
                        patients_train,
                        patients_test,
                        ml_path
                    )

            # Cross-validation splits
            elif set_kind == 'cv':
                cv_info = design[type_set]
                n_folds = cv_info['nFolds']
                seed = cv_info['seed']

                patients_train, patients_test = cross_validation_split(
                    outcomes_table, n_folds, seed=seed
                )
                ml_path = self.__create_numbered_runs(
                    path_learn, patients_train, patients_test, n_folds, ml_path
                )

            else:
                raise ValueError(
                    "The type of test set is not recognized. "
                    "Must be 'random', 'institutions' or 'cv'."
                )

        # Make ml_path a dictionary to easily save it in json
        return {f"run{idx + 1}": value for idx, value in enumerate(ml_path)}

    def generate_experiment(self) -> Path:
        """
        Generates the json files containing all the options of the experiment.
        The json files will then be used in machine learning.

        Returns:
            Path: Path to the json file listing the ``paths_ml.json`` file of every run.
        """
        # Fill and save the ml options dictionary
        self.path_ml_object = self.__fill_learner_dict()

        # Generate the experiment dictionary (one entry per run)
        experiment_dict = self.create_experiment()

        # Save the final experiment dictionary (all paths converted to strings)
        path_file = self.path_study / f'path_file_ml_paths__{self.experiment_label}.json'
        save_json(path_file, posix_to_string(experiment_dict))

        return path_file