Source code for MEDiml.learning.DesignExperiment

import platform
import re
from itertools import combinations, product
from pathlib import Path
from typing import Dict, List

import pandas as pd
import yaml

from ..utils.get_institutions_from_ids import get_institutions_from_ids
from ..utils.json_utils import load_json, posix_to_string, save_json
from .ml_utils import cross_validation_split, get_stratified_splits



[docs]
class DesignExperiment:

[docs]
    def __init__(self, path_study: Path, path_settings: Path, experiment_label: str) -> None:
        """
        Constructor of the class DesignExperiment.

        Args:
            path_study (Path): Path to the main study folder where the outcomes, 
                learning patients and holdout patients dictionaries are found.
            path_settings (Path): Path to the settings file.
            experiment_label (str): String specifying the label to attach to a given learning experiment in 
                "path_experiments". This label will be attached to the ml__$experiments_label$.json file as well
                as the learn__$experiment_label$ folder. This label is used to keep track of different experiments 
                with different settings (e.g. radiomics, scans, machine learning algorithms, etc.).
        
        Returns:
            None
        """
        self.path_study = Path(path_study)
        self.path_settings = Path(path_settings)
        self.experiment_label = str(experiment_label)
        self.path_ml_object = None


    def __create_folder_and_content(
            self,
            path_learn: Path,
            run_name: str,
            patients_train: List,
            patients_test: List,
            ml_path: List
        ) -> List:
        """
        Creates the folder of a given run and the json files it needs.

        The run folder receives ``patientsTrain.json`` and ``patientsTest.json`` (sorted patient
        lists) and ``paths_ml.json``, which records where the train/test lists, the outcomes table,
        the ML options file and the future results file of the run are located.

        Args:
            path_learn (Path): path to the main learning folder containing information about the training and test set.
            run_name (str): name for a given run. Used as the name of the run folder.
            patients_train (List): list of patients in the training set.
            patients_test (List): list of patients in the test set.
            ml_path (List): list of the ``paths_ml.json`` paths of the runs created so far.
                It is extended in place.

        Returns:
            List: the same ``ml_path`` list, with the path to this run's ``paths_ml.json`` appended.
        """
        path_run = path_learn / run_name
        # parents=True: do not fail if the learning folder itself has not been created yet
        path_run.mkdir(parents=True, exist_ok=True)

        # Patient lists of the run (sorted so the files are stable whatever the input order)
        path_train = path_run / 'patientsTrain.json'
        path_test = path_run / 'patientsTest.json'
        save_json(path_train, sorted(patients_train))
        save_json(path_test, sorted(patients_test))

        # Every location needed to execute this run
        paths_ml = {
            'patientsTrain': path_train,
            'patientsTest': path_test,
            'outcomes': self.path_study / 'outcomes.csv',
            'ml': self.path_ml_object,
            'results': path_run / 'run_results.json',
        }
        path_file = path_run / 'paths_ml.json'

        # Paths are converted to strings before being written to json
        paths_ml = posix_to_string(paths_ml)
        ml_path.append(path_file)
        save_json(path_file, paths_ml)

        return ml_path
    
    def __load_config(self) -> Dict:
        """
        Loads the YAML master configuration file.

        Returns:
            Dict: Parsed content of ``self.path_settings``. Note that ``yaml.safe_load`` returns
                ``None`` for an empty document, so callers should validate the result.
        """
        # Open in binary mode so that PyYAML detects the encoding itself (BOM, UTF-8 otherwise)
        # instead of relying on the platform default text encoding.
        with open(self.path_settings, 'rb') as file:
            return yaml.safe_load(file)

    def __get_learning_dict(self) -> Path:
        """
        Generates a dictionary containing all settings for the learning experiment
        using a single YAML master configuration.
        
        Returns:
            Path: Path to the saved experiment-specific YAML options.
        """
        # Safety check: Verify master config exists
        if not self.path_settings.exists():
            raise FileNotFoundError(
                f"Master configuration file not found at: {self.path_settings}. "
                "Please ensure the consolidated YAML is in the settings folder."
            )

        # Load the Master Config
        config = self.__load_config()

        # Assemble Experiment Metadata
        # We maintain the structure while adding run-specific info
        ml_options = {
            'os': platform.system(),
            'experiment_label': self.experiment_label,
            'config_source': str(self.path_settings),
            # Directly map the sections from the YAML for downstream use
            'design': config.get('design'),
            'variables': config.get('variables'),
            'datacleaning': config.get('data_cleaning'),
            'fSetReduction': config.get('feature_reduction'),
            'normalization': config.get('normalization'),
            'modeling': config.get('modeling'),
            'study_metadata': config.get('study_metadata')
        }

        # Experiment Label Safety Check
        if not self.experiment_label:
            raise ValueError("Experiment label is empty. Class was not initialized properly.")

        return ml_options

    def __fill_learner_dict(self) -> Path:
        """
        Fills the main experiment dictionary from the master configuration and saves it as json.
        This main dictionary holds all the settings for the data processing and learning experiment.

        For every ``varN`` entry of the ``variables`` section the ``path`` option is resolved:
        radiomics variables get a dictionary of feature tables (one ``radTabN`` entry per
        scan/roi/image-space combination, each holding ``csv``, ``txt`` and ``type`` paths), while
        other variables get the path to their csv file when no path was given. When the
        ``combinations`` option of the section is ``['all']``, it is replaced by every
        combination of the variable names, joined with underscores.

        Returns:
            Path: Path to the saved ML options file (``ml_test__<experiment_label>.json``
                in the study folder).

        Raises:
            FileNotFoundError: If a radiomics csv or txt table does not exist.
            ValueError: If the experiment label is empty, or if a radiomics path uses the
                workspace keyword but does not follow the ``setTo<Folder>inWorkspace`` form.
        """
        ml = self.__get_learning_dict()

        # Machine learning variables. The 'variables' key is always present in the learning
        # dict but holds None when the YAML has no such section, so test the value.
        var_options = ml.get('variables')
        if var_options:
            # Only var1, var2, ... are variables; the other keys are section-level options
            var_names = [name for name in var_options if re.match(r"^var[0-9]{1,}$", name)]

            # For each variable, organize the option in the ML dictionary
            for var in var_names:
                var_struct = var_options[var]

                # Radiomics variables
                if 'radiomics' in var_struct['nameType'].lower():
                    # Get radiomics features in workspace
                    if 'settofeatures' in var_struct['path'].lower():
                        keyword = re.match(r"setTo(.*)inWorkspace", var_struct['path'])
                        if keyword is None:
                            raise ValueError(
                                f"Cannot extract the features folder from path option "
                                f"'{var_struct['path']}' of variable '{var}'. "
                                "Expected the form 'setTo<Folder>inWorkspace'."
                            )
                        path_features = self.path_study / keyword.group(1)
                    # Get radiomics features in path provided by the user. The value comes
                    # from YAML as a string, so it must be converted before using '/'.
                    else:
                        path_features = Path(var_struct['path'])

                    scans = var_struct['scans']  # list of imaging sequences
                    rois = var_struct['rois']  # list of roi labels
                    im_spaces = var_struct['imSpaces']  # list of image spaces (filtered and original)

                    # Either the full scans x rois x image spaces product, or image spaces
                    # listed explicitly per scan in the 'combinations' option
                    if var_struct.get('use_combinations', False):
                        all_combinations = []
                        for scan, scan_spaces in var_struct['combinations'].items():
                            all_combinations += list(product([scan], rois, list(scan_spaces)))
                    else:
                        all_combinations = list(product(scans, rois, im_spaces))

                    # Dict holding all paths to radiomics features (csv and txt files)
                    tables = dict()
                    for tab_number, (scan, roi, im_space) in enumerate(all_combinations, start=1):
                        table_type = scan + '(' + roi + ')__' + im_space
                        table_name = 'radiomics__' + table_type
                        rad_tab = {
                            'csv': path_features / (table_name + '.csv'),
                            'txt': path_features / (table_name + '.txt'),
                            'type': path_features / table_type,
                        }

                        # Check that both files exist
                        if not rad_tab['csv'].exists():
                            raise FileNotFoundError(f"File {rad_tab['csv']} does not exist.")
                        if not rad_tab['txt'].exists():
                            raise FileNotFoundError(f"File {rad_tab['txt']} does not exist.")

                        tables['radTab' + str(tab_number)] = rad_tab

                    # var_struct is the entry stored in ml['variables'], so this updates the ML dict
                    var_struct['path'] = tables

                # Clinical or other variables (For ex: Volume): get path to csv file of features
                elif not var_struct['path']:
                    if var_options['pathCSV'] == 'setToCSVinWorkspace':
                        path_csv = self.path_study / 'CSV'
                    else:
                        # String coming from YAML: convert before using '/'
                        path_csv = Path(var_options['pathCSV'])
                    var_struct['path'] = path_csv / var_struct['nameFile']

            # Combinations of variables
            if 'combinations' in var_options:
                if var_options['combinations'] == ['all']:  # Combine all variables
                    combs = [
                        comb
                        for size in range(1, len(var_names) + 1)
                        for comb in combinations(var_names, size)
                    ]
                    ml['variables']['combinations'] = ['_'.join(comb) for comb in combs]

        # Save the ML dictionary
        if self.experiment_label == "":
            raise ValueError("Experiment label is empty. Class was not initialized properly.")
        path_ml_object = self.path_study / f'ml_test__{self.experiment_label}.json'
        ml = posix_to_string(ml)    # Convert all paths to string
        save_json(path_ml_object, ml)

        return path_ml_object


[docs]
    def create_experiment(self) -> Dict:
        """
        Creates the machine learning experiment: organizes each test/split information in a
        separate folder of ``learn__<experiment_label>`` inside the study folder.

        The ML options file (``self.path_ml_object``) is read, the outcomes table is restricted
        to the patients of ``patientsLearn.json``, and one run folder is created per split for
        every test set type listed in ``design.active_method``:

        - ``random``: ``nSplits`` stratified sub-sampling splits (only for method ``SubSampling``),
          stored in ``test__001``, ``test__002``, ...
        - ``institutions``: one run per institution used as test set (``test__<institution>``)
          and, with more than two institutions, an extra run trained on the largest institution
          and tested on all the others.
        - ``cv``: ``nFolds`` cross-validation folds, stored in ``test__001``, ``test__002``, ...

        NOTE: ``random`` and ``cv`` use the same folder names, so listing both in the same
        experiment makes them write to the same run folders.

        Returns:
            Dict: ``{"run1": path, "run2": path, ...}`` where each value is the path to the
                ``paths_ml.json`` file of a run, in creation order.

        Raises:
            ValueError: If the ML options file has not been generated yet, or if a test set
                type is not recognized.
        """
        if self.path_ml_object is None:
            raise ValueError(
                "The ML options file has not been generated. "
                "Use generate_experiment() or set 'path_ml_object' first."
            )

        # Initialization
        ml_path = list()
        ml = load_json(self.path_ml_object)

        # Learning set
        patients_learn = load_json(self.path_study / 'patientsLearn.json')

        # Outcomes table
        outcomes_table = pd.read_csv(self.path_study / 'outcomes.csv', index_col=0)

        # Keep only patients in learn set and outcomes table (table order is preserved)
        learn_set = set(patients_learn)
        patients_to_keep = [
            patient for patient in outcomes_table.index.values.tolist() if patient in learn_set
        ]
        outcomes_table = outcomes_table.loc[patients_to_keep]

        # Get the "experiment label", falling back on the name of the ML options file
        if self.experiment_label:
            experiment_label = self.experiment_label
        else:
            file_name = Path(self.path_ml_object).name
            # Works for both 'ml__<label>.json' and 'ml_test__<label>.json'
            _, separator, label = Path(file_name).stem.partition('__')
            experiment_label = label if separator else file_name[4:-5]

        # Create the folder for the training and testing sets (machine learning) information
        name_learn = 'learn__' + experiment_label
        path_learn = Path(self.path_study) / name_learn
        Path.mkdir(path_learn, exist_ok=True)

        # Getting the type of test_sets. A single type given as a plain string is accepted:
        # iterating over it directly would loop over its characters.
        test_sets_types = ml['design']['active_method']
        if isinstance(test_sets_types, str):
            test_sets_types = [test_sets_types]

        # Creating the sets for the different machine learning runs
        for type_set in test_sets_types:
            kind = type_set.lower()

            # Random splits
            if kind == 'random':
                # Get the experiment options for the sets
                random_info = ml['design'][type_set]
                method = random_info['method']
                n_splits = random_info['nSplits']
                stratify_institutions = random_info['stratifyInstitutions']
                test_proportion = random_info['testProportion']
                seed = random_info['seed']
                if method == 'SubSampling':
                    # Get the training and testing sets
                    patients_train, patients_test = get_stratified_splits(
                        outcomes_table, n_splits,
                        test_proportion, seed,
                        stratify_institutions
                    )
                    ml_path = self.__create_numbered_runs(
                        path_learn, patients_train, patients_test, n_splits, ml_path
                    )

            # Institutions-based splits
            elif kind == 'institutions':
                # Get institutions run info
                patient_ids = pd.Series(outcomes_table.index)
                institution_cat_vector = get_institutions_from_ids(patient_ids)
                # Sorted so that the order of the runs is the same from one execution to the next
                institution_cats = sorted(set(institution_cat_vector))
                n_institution = len(institution_cats)

                # The 'Institutions' argument only make sense if n_institutions > 1
                if n_institution > 1:
                    # Leave one institution out: test on it, train on the others
                    for cat in institution_cats:
                        patients_train = [elt for elt in patient_ids if cat not in elt]
                        patients_test = [elt for elt in patient_ids if cat in elt]
                        ml_path = self.__create_folder_and_content(
                            path_learn,
                            f"test__{cat}",
                            patients_train,
                            patients_test,
                            ml_path
                        )

                    if n_institution > 2:
                        # Extra run: train on the largest institution, test on all the others
                        size_inst = [
                            sum(1 for elt in institution_cat_vector if cat in elt)
                            for cat in institution_cats
                        ]
                        ind_max = size_inst.index(max(size_inst))
                        largest = institution_cats[ind_max]
                        str_test = [
                            cat for i, cat in enumerate(institution_cats) if i != ind_max
                        ]
                        patients_train = [elt for elt in patient_ids if largest in elt]
                        patients_test = [elt for elt in patient_ids if largest not in elt]
                        ml_path = self.__create_folder_and_content(
                            path_learn,
                            f"test__{'_'.join(str_test)}",
                            patients_train,
                            patients_test,
                            ml_path
                        )

            # Cross-validation folds
            elif kind == 'cv':
                # Get the experiment options for the sets
                cv_info = ml['design'][type_set]
                n_folds = cv_info['nFolds']
                seed = cv_info['seed']

                # Get the training and testing sets
                patients_train, patients_test = cross_validation_split(
                    outcomes_table,
                    n_folds,
                    seed=seed
                )
                ml_path = self.__create_numbered_runs(
                    path_learn, patients_train, patients_test, n_folds, ml_path
                )

            else:
                raise ValueError(
                    "The type of test set is not recognized. "
                    "Must be 'random', 'institutions' or 'cv'."
                )

        # Make ml_path a dictionary to easily save it in json
        return {f"run{idx+1}": value for idx, value in enumerate(ml_path)}

    def __create_numbered_runs(
            self,
            path_learn: Path,
            patients_train,
            patients_test,
            n_runs: int,
            ml_path: List
        ) -> List:
        """
        Creates the ``test__001``, ``test__002``, ... run folders, one per split.

        Args:
            path_learn (Path): path to the main learning folder.
            patients_train: training patients of each split, indexable by split number.
            patients_test: test patients of each split, indexable by split number.
            n_runs (int): number of splits to create.
            ml_path (List): list of ``paths_ml.json`` paths created so far.

        Returns:
            List: ``ml_path`` extended with the ``paths_ml.json`` path of every created run.
        """
        # If the splits are not in a sequence (presumably a single split returned as a bare
        # object -- confirm against the split helpers), wrap them so they can be indexed
        if not hasattr(patients_train, "__len__"):
            patients_train = [patients_train]
            patients_test = [patients_test]

        for i in range(n_runs):
            # Create a folder for each split/run
            ml_path = self.__create_folder_and_content(
                path_learn,
                "test__{0:03}".format(i+1),
                patients_train[i],
                patients_test[i],
                ml_path
            )
        return ml_path



[docs]
    def generate_experiment(self):
        """
        Build the whole experiment and write its json files.

        The ML options file is generated first and its location is stored in
        ``self.path_ml_object``. The run folders are then created, and the resulting
        run-to-path dictionary is saved in the study folder as
        ``path_file_ml_paths__<experiment_label>.json``.

        Returns:
            The path to the saved ``path_file_ml_paths__<experiment_label>.json`` file.
        """
        # Step 1: ML options dictionary (saved to disk, only its path is kept)
        self.path_ml_object = self.__fill_learner_dict()

        # Step 2: one folder per run; the result maps run names to their paths file
        runs = self.create_experiment()

        # Step 3: save the run dictionary, with all paths converted to strings
        destination = self.path_study / f'path_file_ml_paths__{self.experiment_label}.json'
        save_json(destination, posix_to_string(runs))

        return destination