Source code for MEDiml.wrangling.DataManager

import json
import logging
import os
import pickle
import re
from dataclasses import dataclass
from pathlib import Path
from time import time
from typing import List, Union

import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import pydicom
import pydicom.misc
import ray
from nilearn import image
from numpyencoder import NumpyEncoder
from tqdm import tqdm, trange

from ..MEDscan import MEDscan
from ..processing.PETSUVConverter import PETSUVConverter
from ..processing.segmentation import get_roi_from_indexes
from ..utils.get_file_paths import get_file_paths
from ..utils.get_patient_names import get_patient_names
from ..utils.imref import imref3d
from ..utils.json_utils import load_json, save_json
from ..utils.save_MEDscan import save_MEDscan
from .ProcessDICOM import ProcessDICOM



[docs]
class DataManager(object):
    """Reads all the raw data (DICOM, NIfTI) content and organizes it in instances of the MEDscan class."""



[docs]
    @dataclass
    class DICOM(object):
        """DICOM data management class that will organize data during the conversion to MEDscan class process"""
        stack_series_rs: List
        stack_path_rs: List
        stack_frame_rs: List
        cell_series_id: List
        cell_path_rs: List
        cell_path_images: List
        cell_frame_rs: List
        cell_frame_id: List




[docs]
    @dataclass
    class NIfTI(object):
        """NIfTI data management class that will organize data during the conversion to MEDscan class process"""
        stack_path_images: List
        stack_path_roi: List
        stack_path_all: List




[docs]
    @dataclass
    class Paths(object):
        """Paths management class that will organize the paths used in the processing"""
        _path_to_dicoms: List
        _path_to_niftis: List
        _path_csv: Union[Path, str]
        _path_save: Union[Path, str]
        _path_save_checks: Union[Path, str]
        _path_pre_checks_settings: Union[Path, str]



[docs]
    def __init__(
            self, 
            path_to_dicoms: List = [],
            path_to_niftis: List = [],
            path_csv: Union[Path, str] = None,
            path_save: Union[Path, str] = None,
            path_save_checks: Union[Path, str] = None,
            path_pre_checks_settings: Union[Path, str] = None,
            save: bool = True,
            n_batch: int = 2
    ) -> None:
        """Constructor of the class DataManager.

        Args:
            path_to_dicoms (Union[Path, str], optional): Full path to the starting directory
                where the DICOM data is located.
            path_to_niftis (Union[Path, str], optional): Full path to the starting directory
                where the NIfTI is located.
            path_csv (Union[Path, str], optional): Full path to the CSV file containing the scans info list.
            path_save (Union[Path, str], optional): Full path to the directory where to save all the MEDscan classes.
            path_save_checks(Union[Path, str], optional): Full path to the directory where to save all 
                the pre-radiomics checks analysis results.
            path_pre_checks_settings(Union[Path, str], optional): Full path to the JSON file of the pre-checks analysis
                parameters.
            save (bool, optional): True to save the MEDscan classes in `path_save`.
            n_batch (int, optional): Numerical value specifying the number of batch to use in the
                parallel computations (use 0 for serial computation).
        
        Returns:
            None
        """
        # Convert all paths to Pathlib.Path
        if path_to_dicoms:
            path_to_dicoms = Path(path_to_dicoms)
        if path_to_niftis:
            path_to_niftis = Path(path_to_niftis)
        if path_csv:
            path_csv = Path(path_csv)
        if path_save:
            path_save = Path(path_save)
        if path_save_checks:
            path_save_checks = Path(path_save_checks)
        if path_pre_checks_settings:
            path_pre_checks_settings = Path(path_pre_checks_settings)

        self.paths = self.Paths(
                path_to_dicoms,
                path_to_niftis,
                path_csv,
                path_save,
                path_save_checks,
                path_pre_checks_settings,
        )
        self.save = save
        self.n_batch = n_batch
        self.__dicom = self.DICOM(
                stack_series_rs=[],
                stack_path_rs=[],
                stack_frame_rs=[],
                cell_series_id=[],
                cell_path_rs=[],
                cell_path_images=[],
                cell_frame_rs=[],
                cell_frame_id=[]
        )
        self.__nifti = self.NIfTI(
                stack_path_images=[],
                stack_path_roi=[],
                stack_path_all=[]
        )
        self.path_to_objects = []
        self.summary = {}
        self.csv_data = None
        self.__studies = []
        self.__institutions = []
        self.__scans = []


    def __find_uid_cell_index(self, uid: Union[str, List[str]], cell: List[str]) -> List: 
        """Finds the cell with the same `uid`. If not is present in `cell`, creates a new position
        in the `cell` for the new `uid`.

        Args:
            uid (Union[str, List[str]]):  Unique identifier of the Series to find.
            cell (List[str]): List of Unique identifiers of the Series.

        Returns:
            Union[List[str], str]: List or string of the uid  
        """
        return [len(cell)] if uid not in cell else[i for i, e in enumerate(cell) if e == uid]

    def __get_list_of_files(self, dir_name: str) -> List:
        """Gets all files in the given directory

        Args:
            dir_name (str): directory name

        Returns:
            List: List of all files in the directory
        """
        list_of_file = os.listdir(dir_name)
        all_files = list()
        for entry in list_of_file:
            full_path = os.path.join(dir_name, entry)
            if os.path.isdir(full_path):
                all_files = all_files + self.__get_list_of_files(full_path)
            else:
                all_files.append(full_path)

        return all_files

    def __get_MEDscan_name_save(self, medscan: MEDscan) -> str:
        """Returns the name that will be used to save the MEDscan instance, based on the values of the attributes.

        Args:
            medscan(MEDscan): A MEDscan class instance.
        
        Returns:
            str: String of the name save.
        """
        series_description = medscan.series_description.translate({ord(ch): '-' for ch in '/\\ ()&:*'})
        name_id = medscan.patientID.translate({ord(ch): '-' for ch in '/\\ ()&:*'})
        # final saving name
        name_complete = name_id + '__' + series_description + '.' + medscan.type + '.npy'
        return name_complete

    def __associate_rt_stuct(self) -> None:
        """Associates the imaging volumes to their mask using UIDs

        Returns:
            None
        """
        print('--> Associating all RT objects to imaging volumes')
        n_rs = len(self.__dicom.stack_path_rs)
        self.__dicom.stack_series_rs = list(dict.fromkeys(self.__dicom.stack_series_rs))
        if n_rs:
            for i in trange(0, n_rs):
                try: 
                    # PUT ALL THE DICOM PATHS WITH THE SAME UID IN THE SAME PATH LIST
                    ind_series_id = self.__find_uid_cell_index(
                                                        self.__dicom.stack_series_rs[i], 
                                                        self.__dicom.cell_series_id)
                    for n in range(len(ind_series_id)):
                        if ind_series_id[n] < len(self.__dicom.cell_path_rs):
                            self.__dicom.cell_path_rs[ind_series_id[n]] += [self.__dicom.stack_path_rs[i]]
                except:
                    ind_series_id = self.__find_uid_cell_index(
                                                        self.__dicom.stack_frame_rs[i], 
                                                        self.__dicom.cell_frame_id)
                    for n in range(len(ind_series_id)):
                        if ind_series_id[n] < len(self.__dicom.cell_path_rs):
                            self.__dicom.cell_path_rs[ind_series_id[n]] += [self.__dicom.stack_path_rs[i]]
        print('DONE')

    def __read_all_dicoms(self) -> None:
        """Reads all the dicom files in the all the paths of the attribute `_path_to_dicoms`

        Returns:
            None
        """
        # SCANNING ALL FOLDERS IN INITIAL DIRECTORY
        print('\n--> Scanning all folders in initial directory...', end='')
        p = Path(self.paths._path_to_dicoms)
        e_rglob = '*.dcm'

        # EXTRACT ALL FILES IN THE PATH TO DICOMS
        if self.paths._path_to_dicoms.is_dir():
            stack_folder_temp = list(p.rglob(e_rglob))
            stack_folder = [x for x in stack_folder_temp if not x.is_dir()]
        elif str(self.paths._path_to_dicoms).find('json') != -1:
            with open(self.paths._path_to_dicoms) as f:
                data = json.load(f)
                for value in data.values():
                    stack_folder_temp = value
            directory_name = str(stack_folder_temp).replace("'", '').replace('[', '').replace(']', '')
            stack_folder = self.__get_list_of_files(directory_name)
        else:
            raise ValueError("The given dicom folder path either doesn't exist or not a folder.")
        # READ ALL DICOM FILES AND UPDATE ATTRIBUTES FOR FURTHER PROCESSING
        for file in tqdm(stack_folder):
            if pydicom.misc.is_dicom(file):
                try:
                    info = pydicom.dcmread(str(file))
                    if info.Modality in ['MR', 'PT', 'CT']:
                        ind_series_id = self.__find_uid_cell_index(
                                                        info.SeriesInstanceUID, 
                                                        self.__dicom.cell_series_id)[0]
                        if ind_series_id == len(self.__dicom.cell_series_id):  # New volume
                            self.__dicom.cell_series_id = self.__dicom.cell_series_id + [info.SeriesInstanceUID]
                            self.__dicom.cell_frame_id += [info.FrameOfReferenceUID]
                            self.__dicom.cell_path_images += [[]]
                            self.__dicom.cell_path_rs = self.__dicom.cell_path_rs + [[]]
                        self.__dicom.cell_path_images[ind_series_id] += [file]
                    elif info.Modality == 'RTSTRUCT':
                        self.__dicom.stack_path_rs += [file]
                        try:
                            series_uid = info.ReferencedFrameOfReferenceSequence[
                                        0].RTReferencedStudySequence[
                                        0].RTReferencedSeriesSequence[
                                        0].SeriesInstanceUID
                        except:
                            series_uid = 'NotFound'
                        self.__dicom.stack_series_rs += [series_uid]
                        try:
                            frame_uid = info.ReferencedFrameOfReferenceSequence[0].FrameOfReferenceUID
                        except:
                            frame_uid = info.FrameOfReferenceUID
                        self.__dicom.stack_frame_rs += [frame_uid]
                    else:
                        print("Modality not supported: ", info.Modality)

                except Exception as e:
                    print(f'Error while reading: {file}, error: {e}\n')
                    continue
        print('DONE')

        # ASSOCIATE ALL VOLUMES TO THEIR MASK
        self.__associate_rt_stuct()


[docs]
    def process_all_dicoms(self) -> Union[List[MEDscan], None]:
        """This function reads the DICOM content of all the sub-folder tree of a starting directory defined by
        `path_to_dicoms`. It then organizes the data (files throughout the starting directory are associated by
        'SeriesInstanceUID') in the MEDscan class including the region of  interest (ROI) defined by an
        associated RTstruct. All MEDscan classes hereby created are saved in `path_save` with a name
        varying with every scan.

        Returns:
            List[MEDscan]: List of MEDscan instances.
        """
        # Initialize ray
        if ray.is_initialized():
            ray.shutdown()
        ray.init(local_mode=True, include_dashboard=False)

        print('--> Reading all DICOM objects to create MEDscan classes')
        self.__read_all_dicoms()

        print('--> Processing DICOMs and creating MEDscan objects')
        n_scans = len(self.__dicom.cell_path_images)
        if self.n_batch is None:
            n_batch = 1
        elif n_scans < self.n_batch:
            n_batch = n_scans
        else:
            n_batch = self.n_batch

        # Distribute the first tasks to all workers
        pds = [ProcessDICOM(
                        self.__dicom.cell_path_images[i], 
                        self.__dicom.cell_path_rs[i], 
                        self.paths._path_save,
                        self.save)
            for i in range(n_batch)]
        
        ids = [pd.process_files() for pd in pds]

        # Update the path to the created instances
        if self.save:
            for name_save in ray.get(ids):
                if self.paths._path_save:
                    self.path_to_objects.append(str(self.paths._path_save / name_save))
                # Update processing summary
                if name_save.split('_')[0].count('-') >= 2:
                    scan_type = name_save[name_save.find('__')+2 : name_save.find('.')]
                    if name_save.split('-')[0] not in self.__studies:
                        self.__studies.append(name_save.split('-')[0])  # add new study
                    if name_save.split('-')[1] not in self.__institutions:
                        self.__institutions.append(name_save.split('-')[1])  # add new study
                    if name_save.split('-')[0] not in self.summary:
                        self.summary[name_save.split('-')[0]] = {}
                    if name_save.split('-')[1] not  in self.summary[name_save.split('-')[0]]:
                        self.summary[name_save.split('-')[0]][name_save.split('-')[1]] = {}  # add new institution
                    if scan_type not in self.__scans:
                        self.__scans.append(scan_type)
                    if scan_type not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]]:
                        self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type] = []
                    if name_save not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type]:
                        self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type].append(name_save)
                else:
                    logging.warning(f"The patient ID of the following file: {name_save} does not respect the MEDiml "\
                        "naming convention 'study-institution-id' (Ex: Glioma-TCGA-001)")

        nb_job_left = n_scans - n_batch

        return ray.get(ids) if not self.save else None

        # Distribute the remaining tasks
        for _ in trange(n_scans):
            _, ids = ray.wait(ids, num_returns=1)
            if nb_job_left > 0:
                idx = n_scans - nb_job_left
                pd = ProcessDICOM(
                        self.__dicom.cell_path_images[idx], 
                        self.__dicom.cell_path_rs[idx], 
                        self.paths._path_save,
                        self.save)
                ids.extend([pd.process_files()])
                nb_job_left -= 1

            # Update the path to the created instances
            if self.save:
                for name_save in ray.get(ids):
                    if self.paths._path_save:
                        self.path_to_objects.extend(str(self.paths._path_save / name_save))
                    # Update processing summary
                    if name_save.split('_')[0].count('-') >= 2:
                        scan_type = name_save[name_save.find('__')+2 : name_save.find('.')]
                        if name_save.split('-')[0] not in self.__studies:
                            self.__studies.append(name_save.split('-')[0])  # add new study
                        if name_save.split('-')[1] not in self.__institutions:
                            self.__institutions.append(name_save.split('-')[1])  # add new study
                        if name_save.split('-')[0] not in self.summary:
                            self.summary[name_save.split('-')[0]] = {}
                        if name_save.split('-')[1] not  in self.summary[name_save.split('-')[0]]:
                            self.summary[name_save.split('-')[0]][name_save.split('-')[1]] = {}  # add new institution
                        if scan_type not in self.__scans:
                            self.__scans.append(scan_type)
                        if scan_type not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]]:
                            self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type] = []
                        if name_save not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type]:
                            self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type].append(name_save)
                    else:
                        logging.warning(f"The patient ID of the following file: {name_save} does not respect the MEDiml "\
                            "naming convention 'study-institution-id' (Ex: Glioma-TCGA-001)")
            else:
                return ray.get(ids)
        print('DONE')


    def __read_all_niftis(self) -> None:
        """Reads all files in the initial path and organizes other path to images and roi
        in the class attributes.

        Returns:
            None.
        """
        print('\n--> Scanning all folders in initial directory')
        if not self.paths._path_to_niftis:
            raise ValueError("The path to the niftis is not defined")
        p = Path(self.paths._path_to_niftis)
        e_rglob1 = '*.nii'
        e_rglob2 = '*.nii.gz'

        # EXTRACT ALL FILES IN THE PATH TO DICOMS
        if p.is_dir():
            self.__nifti.stack_path_all = list(p.rglob(e_rglob1))
            self.__nifti.stack_path_all.extend(list(p.rglob(e_rglob2)))
        else:
            raise TypeError(f"{p} must be a path to a directory")

        all_niftis = list(self.__nifti.stack_path_all)
        for i in trange(0, len(all_niftis)):
            if 'ROI' in all_niftis[i].name.split("."):
                self.__nifti.stack_path_roi.append(all_niftis[i])
            else:
                self.__nifti.stack_path_images.append(all_niftis[i])
        print('DONE')

    def __associate_roi_to_image(
            self,
            image_file: Union[Path, str], 
            medscan: MEDscan, 
            nifti: nib.Nifti1Image,
            path_roi_data: Path = None
        ) -> MEDscan:
        """Extracts all ROI data from the given path for the given patient ID and updates all class attributes with
        the new extracted data.

        Args:
            image_file(Union[Path, str]): Path to the ROI data.
            medscan (MEDscan): MEDscan class instance that will hold the data. 

        Returns:
            MEDscan: Returns a MEDscan instance with updated roi attributes.
        """
        def load_mask(_id, file, medscan):
            # Load the patient's ROI nifti files:
            if file.name.startswith(_id) and 'ROI' in file.name.split("."):
                roi = nib.load(file)
                roi = image.resample_to_img(roi, nifti, interpolation='nearest')
                roi_data = roi.get_fdata()
                roi_name = file.name[file.name.find("(") + 1 : file.name.find(")")]
                name_set = file.name[file.name.find("_") + 2 : file.name.find("(")]
                medscan.data.ROI.update_indexes(key=roi_index, indexes=np.nonzero(roi_data.flatten()))
                medscan.data.ROI.update_name_set(key=roi_index, name_set=name_set)
                medscan.data.ROI.update_roi_name(key=roi_index, roi_name=roi_name)  
            else:
                raise ValueError(f"The ROI file for patient ID: {_id} "
                    f"was not found in the given path: {file} or was not correctly named.")   
        
        image_file = Path(image_file)
        roi_index = 0
        if not path_roi_data:
            if not self.paths._path_to_niftis:
                raise ValueError("The path to the niftis is not defined")
            else:
                path_roi_data = self.paths._path_to_niftis

            for file in self.__nifti.stack_path_roi:
                _id = file.name.split("(")[0] if ("(") in file.name else file.name # id is PatientID__ImagingScanName
                load_mask(_id, file, medscan)
                roi_index += 1
        else:
            _id = image_file.name.split("(")[0] if ("(") in image_file.name else image_file.name # id is PatientID__ImagingScanName
            load_mask(_id, path_roi_data, medscan)

        return medscan

    def __associate_spatialRef(self, nifti_file: Union[Path, str], medscan: MEDscan) -> MEDscan:
        """Computes the imref3d spatialRef using a NIFTI file and updates the spatialRef attribute.

        Args:
            nifti_file(Union[Path, str]): Path to the nifti data.
            medscan (MEDscan): MEDscan class instance that will hold the data.

        Returns:
            MEDscan: Returns a MEDscan instance with updated spatialRef attribute.
        """
        # Loading the nifti file :
        nifti = nib.load(nifti_file)
        nifti_data = medscan.data.volume.array

        # spatialRef Creation
        pixel_x = abs(nifti.affine[0, 0])
        pixel_y = abs(nifti.affine[1, 1])
        slices = abs(nifti.affine[2, 2])
        min_grid = nifti.affine[:3, 3] * [-1.0, -1.0, 1.0] # x and y are flipped
        min_x_grid = min_grid[0]
        min_y_grid = min_grid[1]
        min_z_grid = min_grid[2]
        size_image = np.shape(nifti_data)
        spatialRef = imref3d(size_image, abs(pixel_x), abs(pixel_y), abs(slices))
        spatialRef.XWorldLimits = (np.array(spatialRef.XWorldLimits) -
                                (spatialRef.XWorldLimits[0] -
                                    (min_x_grid-pixel_x/2))
                                ).tolist()
        spatialRef.YWorldLimits = (np.array(spatialRef.YWorldLimits) -
                                (spatialRef.YWorldLimits[0] -
                                    (min_y_grid-pixel_y/2))
                                ).tolist()
        spatialRef.ZWorldLimits = (np.array(spatialRef.ZWorldLimits) -
                                (spatialRef.ZWorldLimits[0] -
                                    (min_z_grid-slices/2))
                                ).tolist()

        # Converting the results into lists
        spatialRef.ImageSize = spatialRef.ImageSize.tolist()
        spatialRef.XIntrinsicLimits = spatialRef.XIntrinsicLimits.tolist()
        spatialRef.YIntrinsicLimits = spatialRef.YIntrinsicLimits.tolist()
        spatialRef.ZIntrinsicLimits = spatialRef.ZIntrinsicLimits.tolist()

        # update spatialRef in the volume sub-class
        medscan.data.volume.update_spatialRef(spatialRef)

        return medscan

    def __process_one_nifti(self, nifti_file: Union[Path, str], path_data) -> MEDscan:
        """
        Processes one NIfTI file to create a MEDscan class instance.

        Args:
            nifti_file (Union[Path, str]): Path to the NIfTI file.
            path_data (Union[Path, str]): Path to the data.
        
        Returns:
            MEDscan: MEDscan class instance.
        """
        medscan = MEDscan()
        medscan.patientID = os.path.basename(nifti_file).split("_")[0]
        medscan.type = os.path.basename(nifti_file).split(".")[-3]
        medscan.series_description = nifti_file.name[nifti_file.name.find('__') + 2: nifti_file.name.find('(')]
        medscan.format = "nifti"
        medscan.data.set_orientation(orientation="Axial")
        medscan.data.set_patient_position(patient_position="HFS")
        medscan.data.volume.array = nib.load(nifti_file).get_fdata()
        medscan.data.volume.scan_rot = None
        
        # Update spatialRef
        self.__associate_spatialRef(nifti_file, medscan)
        
        # Assiocate ROI
        medscan = self.__associate_roi_to_image(nifti_file, medscan, nib.load(nifti_file), path_data)

        return medscan
    

[docs]
    def process_all(self) -> None:
        """Processes both DICOM & NIfTI content to create MEDscan classes
        """
        self.process_all_dicoms()
        self.process_all_niftis()



[docs]
    def process_all_niftis(self) -> List[MEDscan]:
        """This function reads the NIfTI content of all the sub-folder tree of a starting directory. 
        It then organizes the data in the MEDscan class including the region of  interest (ROI)
        defined by an associated mask file. All MEDscan classes hereby created are saved in a specific path
        with a name specific name varying with every scan.

        Args:
            None.

        Returns:
            List[MEDscan]: List of MEDscan instances.
        """

        # Reading all NIfTI files
        self.__read_all_niftis()

        # Create the MEDscan instances
        print('--> Reading all NIfTI objects (imaging volumes & masks) to create MEDscan classes')
        list_instances = []
        for file in tqdm(self.__nifti.stack_path_images):
            # Assert the list of instances does not exceed the a size of 10
            if len(list_instances) >= 10:
                print('The number of MEDscan instances exceeds 10, please consider saving the instances')
                break
            # INITIALIZE MEDscan INSTANCE AND UPDATE ATTRIBUTES
            medscan = MEDscan()
            medscan.patientID = os.path.basename(file).split("_")[0]
            medscan.type = os.path.basename(file).split(".")[-3]
            medscan.series_description = file.name[file.name.find('__') + 2: file.name.find('(')] if '__' in file.name else ""
            medscan.format = "nifti"
            medscan.data.set_orientation(orientation="Axial")
            medscan.data.set_patient_position(patient_position="HFS")
            medscan.data.volume.array = nib.load(file).get_fdata()
            
            # RAS to LPS
            #medscan.data.volume.convert_to_LPS()
            medscan.data.volume.scan_rot = None
            
            # Update spatialRef
            medscan = self.__associate_spatialRef(file, medscan)
            
            # Get ROI
            medscan = self.__associate_roi_to_image(file, medscan, nib.load(file))

            # SAVE MEDscan INSTANCE
            if self.save and self.paths._path_save:
                save_MEDscan(medscan, self.paths._path_save)
            else:
                list_instances.append(medscan)
            
            # Update the path to the created instances
            name_save = self.__get_MEDscan_name_save(medscan)

            # Clear memory
            del medscan

            # Update the path to the created instances
            if self.paths._path_save:
                self.path_to_objects.append(str(self.paths._path_save / name_save))
            
            # Update processing summary
            if name_save.split('_')[0].count('-') >= 2:
                scan_type = name_save[name_save.find('__')+2 : name_save.find('.')]
                if name_save.split('-')[0] not in self.__studies:
                    self.__studies.append(name_save.split('-')[0])  # add new study
                if name_save.split('-')[1] not in self.__institutions:
                    self.__institutions.append(name_save.split('-')[1])  # add new institution
                if name_save.split('-')[0] not in self.summary:
                    self.summary[name_save.split('-')[0]] = {}  # add new study to summary
                if name_save.split('-')[1] not  in self.summary[name_save.split('-')[0]]:
                    self.summary[name_save.split('-')[0]][name_save.split('-')[1]] = {}  # add new institution
                if scan_type not in self.__scans:
                    self.__scans.append(scan_type)
                if scan_type not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]]:
                    self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type] = []
                if name_save not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type]:
                    self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type].append(name_save)
            else:
                if self.save:
                    logging.warning(f"The patient ID of the following file: {name_save} does not respect the MEDiml "\
                        "naming convention 'study-institution-id' (Ex: Glioma-TCGA-001)")
        print('DONE')

        if list_instances:
            return list_instances



[docs]
    def process_one_nifti(self, path_image: Union[Path, str], path_mask: Union[Path, str]) -> MEDscan:
        """Processes one NIfTI file to create a MEDscan class instance.

        Args:
            nifti_file (Union[Path, str]): Path to the NIfTI file.
            path_data (Union[Path, str]): Path to the data.
        
        Returns:
            MEDscan: MEDscan class instance.
        """
        medscan = self.__process_one_nifti(path_image, path_mask)

        # SAVE MEDscan INSTANCE
        if self.save and self.paths._path_save:
            save_MEDscan(medscan, self.paths._path_save)

        return medscan

    

[docs]
    def update_from_csv(self, path_csv: Union[str, Path] = None) -> None:
        """Updates the class from a given CSV and summarizes the processed scans again according to it.

        Args:
            path_csv(optional, Union[str, Path]): Path to a csv file, if not given, will check
                for csv info in the class attributes.
        
        Returns:
            None
        """
        if not (path_csv or self.paths._path_csv):
            print('No csv provided, no updates will be made')
        else:
            if path_csv:
                self.paths._path_csv = path_csv
            # Extract roi type label from csv file name
            name_csv = self.paths._path_csv.name
            roi_type_label = name_csv[name_csv.find('_')+1 : name_csv.find('.')]

            # Create a dictionary
            csv_data = {}
            csv_data[roi_type_label] = pd.read_csv(self.paths._path_csv)
            self.csv_data = csv_data
            self.summarize()



[docs]
    def summarize(self, retrun_summary: bool = False) -> None:
        """Creates and shows a summary of processed scans organized by study, institution, scan type and roi type

        Args:
            retrun_summary (bool, optional): If True, will return the summary as a dictionary.
        
        Returns:
            None
        """
        def count_scans(summary):
            count = 0
            if type(summary) == dict:
                for study in summary:
                    if type(summary[study]) == dict:
                        for institution in summary[study]:
                            if type(summary[study][institution]) == dict:
                                for scan in self.summary[study][institution]:
                                    count += len(summary[study][institution][scan])
                            else:
                                count += len(summary[study][institution])
                    else:
                        count += len(summary[study])
            elif type(summary) == list:
                count = len(summary)
            return count

        summary_df = pd.DataFrame(columns=['study', 'institution', 'scan_type', 'roi_type', 'count'])

        for study in self.summary:
            summary_df = summary_df.append({
                                        'study': study,
                                        'institution': "",
                                        'scan_type': "",
                                        'roi_type': "",
                                        'count' : count_scans(self.summary)
                                        }, ignore_index=True)
            for institution in self.summary[study]:
                summary_df = summary_df.append({
                                            'study': study,
                                            'institution': institution,
                                            'scan_type': "",
                                            'roi_type': "",
                                            'count' : count_scans(self.summary[study][institution])
                                            }, ignore_index=True)
                for scan in self.summary[study][institution]:
                    summary_df = summary_df.append({
                                                'study': study,
                                                'institution': institution,
                                                'scan_type': scan,
                                                'roi_type': "",
                                                'count' : count_scans(self.summary[study][institution][scan])
                                                }, ignore_index=True)
                    if self.csv_data:
                        roi_count = 0
                        for roi_type in self.csv_data:
                            csv_table = pd.DataFrame(self.csv_data[roi_type])
                            csv_table['under'] = '_'
                            csv_table['dot'] = '.'
                            csv_table['npy'] = '.npy'
                            name_patients = (pd.Series(
                                csv_table[['PatientID', 'under', 'under',
                                        'ImagingScanName',
                                        'dot',
                                        'ImagingModality',
                                        'npy']].fillna('').values.tolist()).str.join('')).tolist()
                            for patient_id in self.summary[study][institution][scan]:
                                if patient_id in name_patients:
                                    roi_count += 1
                            summary_df = summary_df.append({
                                                'study': study,
                                                'institution': institution,
                                                'scan_type': scan,
                                                'roi_type': roi_type,
                                                'count' : roi_count
                                                }, ignore_index=True)
        print(summary_df.to_markdown(index=False))

        if retrun_summary:
            return summary_df


    def __pre_radiomics_checks_dimensions(
        self,
        path_data: Union[Path, str] = None,
        wildcards_dimensions: List[str] = [],
        min_percentile: float = 0.05,
        max_percentile: float = 0.95,
        save: bool = False
        ) -> None:
        """Finds proper voxels dimension options for radiomics analyses for a group of scans

        Args:
            path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the 
                inner-class ``Paths`` in the current instance.
            wildcards_dimensions(List[str], optional): List of wildcards that determines the scans 
                that will be analyzed. You can learn more about wildcards in
                :ref:`this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`.
            min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05.
            max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95.
            save (bool, optional): If True, will save the results in a json file. Defaults to False.
        
        Returns:
            None.
        """
        xy_dim = {
            "data": [],
            "mean": [],
            "median": [],
            "std": [],
            "min": [],
            "max": [],
            f"p{min_percentile}": [],
            f"p{max_percentile}": []
        }
        z_dim = {
            "data": [],
            "mean": [],
            "median": [],
            "std": [],
            "min": [],
            "max": [],
            f"p{min_percentile}": [],
            f"p{max_percentile}": []
        }
        if type(wildcards_dimensions) is str:
            wildcards_dimensions = [wildcards_dimensions]

        if len(wildcards_dimensions) == 0:
            print("Wildcard is empty, the pre-checks will be aborted")
            return

        # Updating plotting params
        plt.rcParams["figure.figsize"] = (20,20)
        plt.rcParams.update({'font.size': 22})

        # TODO: seperate by studies and scan type (MRscan, CTscan...)
        # TODO: Two summaries (df, list of names saves) -> 
        # name_save = name_save(ROI) : Glioma-Huashan-001__T1.MRscan.npy({GTV})
        file_paths = list()
        for w in range(len(wildcards_dimensions)):
            wildcard = wildcards_dimensions[w]
            if path_data:
                file_paths = get_file_paths(path_data, wildcard)
            elif self.paths._path_save:
                file_paths = get_file_paths(self.paths._path_save, wildcard)
            else:
                raise ValueError("Path data is invalid.")
            n_files = len(file_paths)
            xy_dim["data"] = np.zeros((n_files, 1))
            xy_dim["data"] = np.multiply(xy_dim["data"], np.nan)
            z_dim["data"] = np.zeros((n_files, 1))
            z_dim["data"] = np.multiply(z_dim["data"], np.nan)
            for f in tqdm(range(len(file_paths))):
                try:
                    if file_paths[f].name.endswith("nii.gz") or file_paths[f].name.endswith("nii"):
                        with open(file_paths[f], 'rb') as file:
                            medscan = pickle.load(file)
                        xy_dim["data"][f] = medscan.header.get_zooms()[0]
                        z_dim["data"][f]  = medscan.header.get_zooms()[2]
                    else:
                        with open(file_paths[f], 'rb') as file:
                            medscan = pickle.load(file)
                        xy_dim["data"][f] = medscan.data.volume.spatialRef.PixelExtentInWorldX
                        z_dim["data"][f]  = medscan.data.volume.spatialRef.PixelExtentInWorldZ
                except Exception as e:
                    print(e)

            # Running analysis
            xy_dim["data"] = np.concatenate(xy_dim["data"])
            xy_dim["mean"] = np.mean(xy_dim["data"][~np.isnan(xy_dim["data"])])
            xy_dim["median"] = np.median(xy_dim["data"][~np.isnan(xy_dim["data"])])
            xy_dim["std"] = np.std(xy_dim["data"][~np.isnan(xy_dim["data"])])
            xy_dim["min"] = np.min(xy_dim["data"][~np.isnan(xy_dim["data"])])
            xy_dim["max"] = np.max(xy_dim["data"][~np.isnan(xy_dim["data"])])
            xy_dim[f"p{min_percentile}"] = np.percentile(xy_dim["data"][~np.isnan(xy_dim["data"])], 
                                                        min_percentile)
            xy_dim[f"p{max_percentile}"] = np.percentile(xy_dim["data"][~np.isnan(xy_dim["data"])], 
                                                        max_percentile)
            z_dim["mean"] = np.mean(z_dim["data"][~np.isnan(z_dim["data"])])
            z_dim["median"] = np.median(z_dim["data"][~np.isnan(z_dim["data"])])
            z_dim["std"] = np.std(z_dim["data"][~np.isnan(z_dim["data"])])
            z_dim["min"] = np.min(z_dim["data"][~np.isnan(z_dim["data"])])
            z_dim["max"] = np.max(z_dim["data"][~np.isnan(z_dim["data"])])
            z_dim[f"p{min_percentile}"] = np.percentile(z_dim["data"][~np.isnan(z_dim["data"])], 
                                                        min_percentile)
            z_dim[f"p{max_percentile}"] = np.percentile(z_dim["data"][~np.isnan(z_dim["data"])], max_percentile)
            xy_dim["data"] = xy_dim["data"].tolist()
            z_dim["data"] = z_dim["data"].tolist()
            
            # Plotting xy-spacing data histogram
            df_xy = pd.DataFrame(xy_dim["data"], columns=['data'])
            del xy_dim["data"]  # no interest in keeping data (we only need statistics)
            ax = df_xy.hist(column='data')
            min_quant, max_quant, median = df_xy.quantile(min_percentile), df_xy.quantile(max_percentile), df_xy.median()
            for x in ax[0]:
                x.axvline(min_quant.data, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.data, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}")
                x.axvline(median.data, linestyle='solid', color='gold', label=f"Median: {float(median.data):.3f}")
                x.grid(False)
                plt.title(f"Voxels xy-spacing checks for {wildcard}")
                plt.legend()
                # Save the plot
                if save:
                    plt.savefig(self.paths._path_save_checks / ('Voxels_xy_check.png'))
                else:
                    plt.show()
            
            # Plotting z-spacing data histogram
            df_z = pd.DataFrame(z_dim["data"], columns=['data'])
            del z_dim["data"]  # no interest in keeping data (we only need statistics)
            ax = df_z.hist(column='data')
            min_quant, max_quant, median = df_z.quantile(min_percentile), df_z.quantile(max_percentile), df_z.median()
            for x in ax[0]:
                x.axvline(min_quant.data, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.data, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}")
                x.axvline(median.data, linestyle='solid', color='gold', label=f"Median: {float(median.data):.3f}")
                x.grid(False)
                plt.title(f"Voxels z-spacing checks for {wildcard}")
                plt.legend()
                # Save the plot
                if save:
                    plt.savefig(self.paths._path_save_checks / ('Voxels_z_check.png'))
                else:
                    plt.show()
                
            # Saving files using wildcard for name
            if save:
                wildcard = str(wildcard).replace('*', '').replace('.npy', '.json')
                save_json(self.paths._path_save_checks / ('xyDim_' + wildcard), xy_dim, cls=NumpyEncoder)
                save_json(self.paths._path_save_checks / ('zDim_' + wildcard), z_dim, cls=NumpyEncoder)

    def __pre_radiomics_checks_window(
        self,
        path_data: Union[str, Path] = None,
        wildcards_window: List = [], 
        path_csv: Union[str, Path] = None,
        min_percentile: float = 0.05,
        max_percentile: float = 0.95,
        bin_width: int = 0,
        hist_range: list = [],
        nifti: bool = True,
        save: bool = False
        ) -> None:
        """Finds proper re-segmentation ranges options for radiomics analyses for a group of scans

        Args:
            path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the 
                inner-class ``Paths`` in the current instance.
            wildcards_window(List[str], optional): List of wildcards that determines the scans 
                that will be analyzed. You can learn more about wildcards in
                :ref:`this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`.
            path_csv(Union[str, Path], optional): Path to a csv file containing a list of the scans that will be
                analyzed (a CSV file for a single ROI type).
            min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05.
            max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95.
            bin_width(int, optional): Width of the bins for the histograms. If not provided, will use the 
                default number of bins in the method 
                :ref:`pandas.DataFrame.hist <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html>`: 10 bins.
            hist_range(list, optional): Range of the histograms. If empty, will use the minimum and maximum values.
            nifti(bool, optional): If True, will use the NIfTI files, otherwise will use the numpy files.
            save (bool, optional): If True, will save the results in a json file. Defaults to False.
        
        Returns:
            None.
        """
        # Updating plotting params
        plt.rcParams["figure.figsize"] = (20,20)
        plt.rcParams.update({'font.size': 22})
        
        if type(wildcards_window) is str:
            wildcards_window = [wildcards_window]

        if len(wildcards_window) == 0:
            print("Wilcards is empty")
            return
        if path_csv:
            self.paths._path_csv = Path(path_csv)
        roi_table = pd.read_csv(self.paths._path_csv)
        if nifti:
            roi_table['under'] = '_'
            roi_table['dot'] = '.'
            roi_table['roi_label'] = 'GTV'
            roi_table['oparenthesis'] = '('
            roi_table['cparenthesis'] = ')'
            roi_table['ext'] = '.nii.gz'
            patient_names = (pd.Series(
                roi_table[['PatientID', 'under', 'under',
                        'ImagingScanName',
                        'oparenthesis',
                        'roi_label',
                        'cparenthesis',
                        'dot',
                        'ImagingModality',
                        'ext']].fillna('').values.tolist()).str.join('')).tolist()
        else:
            roi_names = [[], [], []]
            roi_names[0] = roi_table['PatientID']
            roi_names[1] = roi_table['ImagingScanName']
            roi_names[2] = roi_table['ImagingModality']
            patient_names = get_patient_names(roi_names)
        for w in range(len(wildcards_window)):
            temp_val = []
            temp = []
            file_paths = []
            roi_data= {
                "data": [],
                "mean": [],
                "median": [],
                "std": [],
                "min": [],
                "max": [],
                f"p{min_percentile}": [],
                f"p{max_percentile}": []
            }
            wildcard = wildcards_window[w]
            if path_data:
                file_paths = get_file_paths(path_data, wildcard)
            elif self.paths._path_save:
                path_data = self.paths._path_save
                file_paths = get_file_paths(self.paths._path_save, wildcard)
            else:
                raise ValueError("Path data is invalid.")
            n_files = len(file_paths)
            i = 0
            for f in tqdm(range(n_files)):
                file = file_paths[f]
                _, filename = os.path.split(file)
                filename, ext = os.path.splitext(filename)
                patient_name = filename + ext
                try:
                    if file.name.endswith('nii.gz') or file.name.endswith('nii'):
                        medscan = self.__process_one_nifti(file, path_data)
                    else:
                        with open(file, 'rb') as file:
                            medscan = pickle.load(file)
                        if re.search('PTscan', wildcard) and medscan.format != 'nifti':
                            suv_converter = PETSUVConverter(medscan.dicomH)
                            medscan.data.volume.array = suv_converter.compute(np.double(medscan.data.volume.array))
                    patient_names = pd.Index(patient_names)
                    ind_roi = patient_names.get_loc(patient_name)
                    name_roi = roi_table.loc[ind_roi][3]
                    vol_obj_init, roi_obj_init = get_roi_from_indexes(medscan, name_roi, 'box')
                    temp = vol_obj_init.data[roi_obj_init.data == 1]
                    temp_val.append(len(temp))
                    roi_data["data"].append(np.zeros(shape=(n_files, temp_val[i])))
                    roi_data["data"][i] = temp
                    i+=1
                    del medscan
                    del vol_obj_init
                    del roi_obj_init
                except Exception as e:
                    print(f"Problem with patient {patient_name}, error: {e}")
            
            roi_data["data"] = np.concatenate(roi_data["data"])
            roi_data["mean"] = np.mean(roi_data["data"][~np.isnan(roi_data["data"])])
            roi_data["median"] = np.median(roi_data["data"][~np.isnan(roi_data["data"])])
            roi_data["std"] = np.std(roi_data["data"][~np.isnan(roi_data["data"])])
            roi_data["min"] = np.min(roi_data["data"][~np.isnan(roi_data["data"])])
            roi_data["max"] = np.max(roi_data["data"][~np.isnan(roi_data["data"])])
            roi_data[f"p{min_percentile}"] = np.percentile(roi_data["data"][~np.isnan(roi_data["data"])], 
                                                        min_percentile)
            roi_data[f"p{max_percentile}"] = np.percentile(roi_data["data"][~np.isnan(roi_data["data"])], 
                                                        max_percentile)
            
            # Set bin width if not provided
            if bin_width != 0:
                if hist_range:
                    nb_bins = (round(hist_range[1]) - round(hist_range[0])) // bin_width
                else:
                    nb_bins = (round(roi_data["max"]) - round(roi_data["min"])) // bin_width
            else:
                nb_bins = 10
                if hist_range:
                    bin_width = int((round(hist_range[1]) - round(hist_range[0])) // nb_bins)
                else:
                    bin_width = int((round(roi_data["max"]) - round(roi_data["min"])) // nb_bins)
            nb_bins = int(nb_bins)

            # Set histogram range if not provided
            if not hist_range:
                hist_range = (roi_data["min"], roi_data["max"])

           # re-segment data according to histogram range
            roi_data["data"] = roi_data["data"][(roi_data["data"] > hist_range[0]) & (roi_data["data"] < hist_range[1])]
            df_data = pd.DataFrame(roi_data["data"], columns=['data'])
            del roi_data["data"]  # no interest in keeping data (we only need statistics)

            # Plot histogram
            ax = df_data.hist(column='data', bins=nb_bins, range=(hist_range[0], hist_range[1]), edgecolor='black')
            min_quant, max_quant= df_data.quantile(min_percentile), df_data.quantile(max_percentile)
            for x in ax[0]:
                x.axvline(min_quant.data, linestyle=':', color='r', label=f"{min_percentile*100}% Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.data, linestyle=':', color='g', label=f"{max_percentile*100}% Percentile: {float(max_quant):.3f}")
                x.grid(False)
                x.xaxis.set_ticks(np.arange(hist_range[0], hist_range[1], bin_width, dtype=int))
                x.set_xticklabels(x.get_xticks(), rotation=45)
                x.xaxis.set_tick_params(pad=15)
                plt.title(f"Intensity range checks for {wildcard}, bw={bin_width}")
                plt.legend()
                # Save the plot
                if save:
                    plt.savefig(self.paths._path_save_checks / ('Intensity_range_check_' + f'bw_{bin_width}.png'))
                else:
                    plt.show()
            
            # save final checks
            if save:
                wildcard = str(wildcard).replace('*', '').replace('.npy', '.json')
                save_json(self.paths._path_save_checks / ('roi_data_' + wildcard), roi_data, cls=NumpyEncoder)


[docs]
    def pre_radiomics_checks(self,
                            path_data: Union[str, Path] = None,
                            wildcards_dimensions: List = [],
                            wildcards_window: List = [],
                            path_csv: Union[str, Path] = None,
                            min_percentile: float = 0.05,
                            max_percentile: float = 0.95,
                            bin_width: int = 0,
                            hist_range: list = [],
                            nifti: bool = False,
                            save: bool = False) -> None:
        """Finds proper dimension and re-segmentation ranges options for radiomics analyses. 

        The resulting files from this method can then be analyzed and used to set up radiomics 
        parameters options in computation methods.

        Args:
            path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the 
                inner-class ``Paths`` in the current instance.
            wildcards_dimensions(List[str], optional): List of wildcards that determines the scans 
                that will be analyzed. You can learn more about wildcards in
                `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_.
            wildcards_window(List[str], optional): List of wildcards that determines the scans 
                that will be analyzed. You can learn more about wildcards in
                `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_.
            path_csv(Union[str, Path], optional): Path to a csv file containing a list of the scans that will be
                analyzed (a CSV file for a single ROI type).
            min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05.
            max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95.
            bin_width(int, optional): Width of the bins for the histograms. If not provided, will use the 
                default number of bins in the method 
                :ref:`pandas.DataFrame.hist <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html>`: 10 bins.
            hist_range(list, optional): Range of the histograms. If empty, will use the minimum and maximum values.
            nifti (bool, optional): Set to True if the scans are nifti files. Defaults to False.
            save (bool, optional): If True, will save the results in a json file. Defaults to False.

        Returns:
            None
        """
        # Initialization
        path_study = Path.cwd()

        # Load params
        if not self.paths._path_pre_checks_settings:
            if not wildcards_dimensions or not wildcards_window:
                raise ValueError("path to pre-checks settings is None.\
                    wildcards_dimensions and wildcards_window need to be specified")
        else:
            settings = self.paths._path_pre_checks_settings
            settings = load_json(settings)
            settings = settings['pre_radiomics_checks']  

            # Setting up paths
            if 'path_save_checks' in settings and settings['path_save_checks']:
                self.paths._path_save_checks = Path(settings['path_save_checks']) 
            if 'path_csv' in settings and settings['path_csv']:
                self.paths._path_csv = Path(settings['path_csv']) 

            # Wildcards of groups of files to analyze for dimensions in path_data.
            # See for example: https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html
            # Keep the cell empty if no dimension checks are to be performed.
            if not wildcards_dimensions:
                wildcards_dimensions = []
                for i in range(len(settings['wildcards_dimensions'])):
                    wildcards_dimensions.append(settings['wildcards_dimensions'][i])

            # ROI intensity window checks params
            if not wildcards_window:
                wildcards_window = []
                for i in range(len(settings['wildcards_window'])):
                    wildcards_window.append(settings['wildcards_window'][i])

        # PRE-RADIOMICS CHECKS
        if not self.paths._path_save_checks:
            if (path_study / 'checks').exists():
                self.paths._path_save_checks = Path(path_study / 'checks')
            else:
                os.mkdir(path_study / 'checks')
                self.paths._path_save_checks = Path(path_study / 'checks')
        else:
            if self.paths._path_save_checks.name != 'checks':
                if (self.paths._path_save_checks / 'checks').exists():
                    self.paths._path_save_checks /= 'checks'
                else:
                    os.mkdir(self.paths._path_save_checks / 'checks')
                    self.paths._path_save_checks = Path(self.paths._path_save_checks / 'checks')

        # Initializing plotting params
        plt.rcParams["figure.figsize"] = (20,20)
        plt.rcParams.update({'font.size': 22})
        
        start = time()
        print('\n\n************************* PRE-RADIOMICS CHECKS *************************', end='')

        # 1. PRE-RADIOMICS CHECKS -- DIMENSIONS
        start1 = time()
        print('\n--> PRE-RADIOMICS CHECKS -- DIMENSIONS ... ', end='')
        self.__pre_radiomics_checks_dimensions(
                                        path_data, 
                                        wildcards_dimensions, 
                                        min_percentile, 
                                        max_percentile,
                                        save)
        print('DONE', end='')
        time1 = f"{time() - start1:.2f}"
        print(f'\nElapsed time: {time1} sec', end='')

        # 2. PRE-RADIOMICS CHECKS - WINDOW
        start2 = time()
        print('\n\n--> PRE-RADIOMICS CHECKS -- WINDOW ... \n', end='')
        self.__pre_radiomics_checks_window(
                                        path_data, 
                                        wildcards_window, 
                                        path_csv,
                                        min_percentile, 
                                        max_percentile,
                                        bin_width,
                                        hist_range,
                                        nifti,
                                        save)
        print('DONE', end='')
        time2 = f"{time() - start2:.2f}"
        print(f'\nElapsed time: {time2} sec', end='')

        time_elapsed = f"{time() - start:.2f}"
        print(f'\n\n--> TOTAL TIME FOR PRE-RADIOMICS CHECKS: {time_elapsed} seconds')
        print('-------------------------------------------------------------------------------------')



[docs]
    def perform_mr_imaging_summary(self, 
                                wildcards_scans: List[str],
                                path_data: Path = None,
                                path_save_checks: Path = None,
                                min_percentile: float = 0.05,
                                max_percentile: float = 0.95
                                ) -> None:
        """
        Summarizes MRI imaging acquisition parameters. Plots summary histograms
        for different dimensions and saves all acquisition parameters locally in JSON files.

        Args:
            wildcards_scans (List[str]): List of wildcards that determines the scans 
                that will be analyzed (Only MRI scans will be analyzed). You can learn more about wildcards in
                `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_.
                For example: ``[\"STS*.MRscan.npy\"]``.
            path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the 
                inner-class ``Paths`` in the current instance.
            path_save_checks (Path, optional): Path where to save the checks, if not specified will use the one 
                in the current instance.
            min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05.
            max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95.
        
        Returns:
            None.
        """
        # Initializing data structures
        class param:
            dates = []
            manufacturer = []
            scanning_sequence = []
            class years:
                data = []

            class fieldStrength:
                data = []

            class repetitionTime:
                data = []

            class echoTime:
                data = []

            class inversionTime:
                data = []

            class echoTrainLength:
                data = []

            class flipAngle:
                data = []

            class numberAverages:
                data = []

            class xyDim:
                data = []

            class zDim:
                data = []
        
        if len(wildcards_scans) == 0:
            print('wildcards_scans is empty')

        # wildcards checks:
        no_mr_scan = True
        for wildcard in wildcards_scans:
            if 'MRscan' in wildcard:
                no_mr_scan = False
        if no_mr_scan:
            raise ValueError(f"wildcards: {wildcards_scans} does not include MR scans. (Only MR scans are supported)")
        
        # Initialization
        if path_data is None:
            if self.paths._path_save:
                path_data = Path(self.paths._path_save)
            else:
                print("No path to data was given and path save is None.")
                return 0
        
        if not path_save_checks:
            if self.paths._path_save_checks:
                path_save_checks = Path(self.paths._path_save_checks)
            else:
                if (Path(os.getcwd()) / "checks").exists():
                    path_save_checks = Path(os.getcwd()) / "checks"
                else:
                    path_save_checks = (Path(os.getcwd()) / "checks").mkdir()
        # Looping through all the different wildcards
        for i in tqdm(range(len(wildcards_scans))):
            wildcard = wildcards_scans[i]
            file_paths = get_file_paths(path_data, wildcard)
            n_files = len(file_paths)
            param.dates = np.zeros(n_files)
            param.years.data = np.zeros((n_files, 1))
            param.years.data = np.multiply(param.years.data, np.NaN)
            param.manufacturer = [None] * n_files
            param.scanning_sequence = [None] * n_files
            param.fieldStrength.data = np.zeros((n_files, 1))
            param.fieldStrength.data = np.multiply(param.fieldStrength.data, np.NaN)
            param.repetitionTime.data = np.zeros((n_files, 1))
            param.repetitionTime.data = np.multiply(param.repetitionTime.data, np.NaN)
            param.echoTime.data = np.zeros((n_files, 1))
            param.echoTime.data = np.multiply(param.echoTime.data, np.NaN)
            param.inversionTime.data = np.zeros((n_files, 1))
            param.inversionTime.data = np.multiply(param.inversionTime.data, np.NaN)
            param.echoTrainLength.data = np.zeros((n_files, 1))
            param.echoTrainLength.data = np.multiply(param.echoTrainLength.data, np.NaN)
            param.flipAngle.data = np.zeros((n_files, 1))
            param.flipAngle.data = np.multiply(param.flipAngle.data, np.NaN)
            param.numberAverages.data = np.zeros((n_files, 1))
            param.numberAverages.data = np.multiply(param.numberAverages.data, np.NaN)
            param.xyDim.data = np.zeros((n_files, 1))
            param.xyDim.data = np.multiply(param.xyDim.data, np.NaN)
            param.zDim.data = np.zeros((n_files, 1))
            param.zDim.data = np.multiply(param.zDim.data, np.NaN)
            
            # Loading and recording data
            for f in tqdm(range(n_files)):
                file = file_paths[f]

                #Open file for warning
                try:
                    warn_file = open(path_save_checks / 'imaging_summary_mr_warnings.txt', 'a')
                except IOError:
                    print("Could not open warning file")

                # Loading Data
                try:
                    print(f'\nCurrently working on: {file}', file = warn_file)
                    with open(path_data / file, 'rb') as fe: medscan = pickle.load(fe)

                    # Example of DICOM header
                    info = medscan.dicomH[1]
                    # Recording dates (info.AcquistionDates)
                    try:
                        param.dates[f] = info.AcquisitionDate
                    except AttributeError:
                        param.dates[f] = info.StudyDate
                    # Recording years
                    try:
                        y = str(param.dates[f])  # Only the first four characters represent the years
                        param.years.data[f] = y[0:4]
                    except Exception as e:
                        print(f'Cannot read years of: {file}. Error: {e}', file = warn_file)
                    # Recording manufacturers
                    try:
                        param.manufacturer[f] = info.Manufacturer
                    except Exception as e:
                        print(f'Cannot read manufacturer of: {file}. Error: {e}', file = warn_file)
                    # Recording scanning sequence
                    try:
                        param.scanning_sequence[f] = info.scanning_sequence
                    except Exception as e:
                        print(f'Cannot read scanning sequence of: {file}. Error: {e}', file = warn_file)
                    # Recording field strength
                    try:
                        param.fieldStrength.data[f] = info.MagneticFieldStrength
                    except Exception as e:
                        print(f'Cannot read field strength of: {file}. Error: {e}', file = warn_file)
                    # Recording repetition time
                    try:
                        param.repetitionTime.data[f] = info.RepetitionTime
                    except Exception as e:
                        print(f'Cannot read repetition time of: {file}. Error: {e}', file = warn_file)
                    # Recording echo time
                    try:
                        param.echoTime.data[f] = info.EchoTime
                    except Exception as e:
                        print(f'Cannot read echo time of: {file}. Error: {e}', file = warn_file)
                    # Recording inversion time
                    try:
                        param.inversionTime.data[f] = info.InversionTime
                    except Exception as e:
                        print(f'Cannot read inversion time of: {file}. Error: {e}', file = warn_file)
                    # Recording echo train length
                    try:
                        param.echoTrainLength.data[f] = info.EchoTrainLength
                    except Exception as e:
                        print(f'Cannot read echo train length of: {file}. Error: {e}', file = warn_file)
                    # Recording flip angle
                    try:
                        param.flipAngle.data[f] = info.FlipAngle
                    except Exception as e:
                        print(f'Cannot read flip angle of: {file}. Error: {e}', file = warn_file)
                    # Recording number of averages
                    try:
                        param.numberAverages.data[f] = info.NumberOfAverages
                    except Exception as e:
                        print(f'Cannot read number averages of: {file}. Error: {e}', file = warn_file)
                    # Recording xy spacing
                    try:
                        param.xyDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldX
                    except Exception as e:
                        print(f'Cannot read x spacing of: {file}. Error: {e}', file = warn_file)
                    # Recording z spacing
                    try:
                        param.zDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldZ
                    except Exception as e:
                        print(f'Cannot read z spacing of: {file}', file = warn_file)
                except Exception as e:
                    print(f'Cannot read file: {file}. Error: {e}', file = warn_file)

                warn_file.close()

            # Summarize data
                # Summarizing years
            df_years = pd.DataFrame(param.years.data, 
                                    columns=['years']).describe(percentiles=[min_percentile, max_percentile], 
                                    include='all')
                # Summarizing field strength
            df_fs = pd.DataFrame(param.fieldStrength.data, 
                                columns=['fieldStrength']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing  repetition time
            df_rt = pd.DataFrame(param.repetitionTime.data, 
                                columns=['repetitionTime']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing echo time
            df_et = pd.DataFrame(param.echoTime.data, 
                                columns=['echoTime']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing inversion time
            df_it = pd.DataFrame(param.inversionTime.data, 
                                columns=['inversionTime']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing echo train length
            df_etl = pd.DataFrame(param.echoTrainLength.data, 
                                columns=['echoTrainLength']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing flip  angle
            df_fa = pd.DataFrame(param.flipAngle.data, 
                                columns=['flipAngle']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing number of  averages
            df_na = pd.DataFrame(param.numberAverages.data, 
                                columns=['numberAverages']).describe(percentiles=[min_percentile, max_percentile], 
                                include='all')
                # Summarizing xy-spacing
            df_xy = pd.DataFrame(param.xyDim.data, 
                                columns=['xyDim'])
                # Summarizing z-spacing
            df_z = pd.DataFrame(param.zDim.data, 
                                columns=['zDim'])

            # Plotting xy-spacing histogram
            ax = df_xy.hist(column='xyDim')
            min_quant, max_quant, average = df_xy.quantile(min_percentile), df_xy.quantile(max_percentile), param.xyDim.data.mean()
            for x in ax[0]:
                x.axvline(min_quant.xyDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.xyDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}")
                x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}")
                x.grid(False)
                plt.title(f"MR xy-spacing imaging summary for {wildcard}")
                plt.legend()
                plt.show()
            
            # Plotting z-spacing histogram
            ax = df_z.hist(column='zDim')
            min_quant, max_quant, average = df_z.quantile(min_percentile), df_z.quantile(max_percentile), param.zDim.data.mean()
            for x in ax[0]:
                x.axvline(min_quant.zDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.zDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}")
                x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}")
                x.grid(False)
                plt.title(f"MR z-spacing imaging summary for {wildcard}")
                plt.legend()
                plt.show()
            
            # Summarizing xy-spacing
            df_xy = df_xy.describe(percentiles=[min_percentile, max_percentile], include='all')
            # Summarizing  z-spacing
            df_z = df_z.describe(percentiles=[min_percentile, max_percentile], include='all')
            
            # Saving data
            name_save = wildcard.replace('*', '').replace('.npy', '')
            save_name = 'imagingSummary__' + name_save + ".json"
            df_all = [df_years, df_fs, df_rt, df_et, df_it, df_etl, df_fa, df_na, df_xy, df_z]
            df_all = df_all[0].join(df_all[1:])
            df_all.to_json(path_save_checks / save_name, orient='columns', indent=4)



[docs]
    def perform_ct_imaging_summary(self, 
                                wildcards_scans: List[str],
                                path_data: Path = None,
                                path_save_checks: Path = None,
                                min_percentile: float = 0.05,
                                max_percentile: float = 0.95
                                ) -> None:
        """
        Summarizes CT imaging acquisition parameters. Plots summary histograms
        for different dimensions and saves all acquisition parameters locally in JSON files.

        Args:
            wildcards_scans (List[str]): List of wildcards that determines the scans 
                that will be analyzed (Only MRI scans will be analyzed). You can learn more about wildcards in
                `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_.
                For example: ``[\"STS*.CTscan.npy\"]``.
            path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the 
                inner-class ``Paths`` in the current instance.
            path_save_checks (Path, optional): Path where to save the checks, if not specified will use the one 
                in the current instance.
            min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05.
            max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95.
        
        Returns:
            None.
        """

        class param:
            manufacturer = []
            dates = []
            kernel = []

            class years:
                data = []
            class voltage:
                data = []
            class exposure:
                data = []
            class xyDim:
                data = []
            class zDim:
                data = []

        if len(wildcards_scans) == 0:
            print('wildcards_scans is empty')

        # wildcards checks:
        no_mr_scan = True
        for wildcard in wildcards_scans:
            if 'CTscan' in wildcard:
                no_mr_scan = False
        if no_mr_scan:
            raise ValueError(f"wildcards: {wildcards_scans} does not include CT scans. (Only CT scans are supported)")

        # Initialization
        if path_data is None:
            if self.paths._path_save:
                path_data = Path(self.paths._path_save)
            else:
                print("No path to data was given and path save is None.")
                return 0
        
        if not path_save_checks:
            if self.paths._path_save_checks:
                path_save_checks = Path(self.paths._path_save_checks)
            else:
                if (Path(os.getcwd()) / "checks").exists():
                    path_save_checks = Path(os.getcwd()) / "checks"
                else:
                    path_save_checks = (Path(os.getcwd()) / "checks").mkdir()

        # Looping through all the different wildcards
        for i in tqdm(range(len(wildcards_scans))):
            wildcard = wildcards_scans[i]
            file_paths = get_file_paths(path_data, wildcard)
            n_files = len(file_paths)
            param.dates = np.zeros(n_files)
            param.years.data = np.zeros(n_files)
            param.years.data = np.multiply(param.years.data, np.NaN)
            param.manufacturer = [None] * n_files
            param.voltage.data = np.zeros(n_files)
            param.voltage.data = np.multiply(param.voltage.data, np.NaN)
            param.exposure.data = np.zeros(n_files)
            param.exposure.data = np.multiply(param.exposure.data, np.NaN)
            param.kernel = [None] * n_files
            param.xyDim.data = np.zeros(n_files)
            param.xyDim.data = np.multiply(param.xyDim.data, np.NaN)
            param.zDim.data = np.zeros(n_files)
            param.zDim.data = np.multiply(param.zDim.data, np.NaN)
        
            # Loading and recording data
            for f in tqdm(range(n_files)):
                file = file_paths[f]

                # Open file for warning
                try:
                    warn_file = open(path_save_checks / 'imaging_summary_ct_warnings.txt', 'a')
                except IOError:
                    print("Could not open file")

                # Loading Data
                try:
                    with open(path_data / file, 'rb') as fe: medscan = pickle.load(fe)
                    print(f'Currently working on: {file}', file=warn_file)
                    
                    # DICOM header
                    info = medscan.dicomH[1]

                    # Recording dates
                    try:
                        param.dates[f] = info.AcquisitionDate
                    except AttributeError:
                        param.dates[f] = info.StudyDate
                        # Recording years
                    try:
                        years = str(param.dates[f])  # Only the first four characters represent the years
                        param.years.data[f] = years[0:4]
                    except Exception as e:
                        print(f'Cannot read dates of : {file}. Error: {e}', file=warn_file)
                        # Recording manufacturers
                    try:
                        param.manufacturer[f] = info.Manufacturer
                    except Exception as e:
                        print(f'Cannot read Manufacturer of: {file}. Error: {e}', file=warn_file)
                        # Recording voltage
                    try:
                        param.voltage.data[f] = info.KVP
                    except Exception as e:
                        print(f'Cannot read voltage of: {file}. Error: {e}', file=warn_file)
                        # Recording exposure
                    try:
                        param.exposure.data[f] = info.Exposure
                    except Exception as e:
                        print(f'Cannot read exposure of: {file}. Error: {e}', file=warn_file)
                        # Recording reconstruction kernel
                    try:
                        param.kernel[f] = info.ConvolutionKernel
                    except Exception as e:
                        print(f'Cannot read Kernel of: {file}. Error: {e}', file=warn_file)
                        # Recording xy spacing
                    try:
                        param.xyDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldX
                    except Exception as e:
                        print(f'Cannot read x spacing of: {file}. Error: {e}', file=warn_file)
                        # Recording z spacing
                    try:
                        param.zDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldZ
                    except Exception as e:
                        print(f'Cannot read z spacing of: {file}. Error: {e}', file=warn_file)
                except Exception as e:
                    print(f'Cannot load file: {file}', file=warn_file)

                warn_file.close()

            # Summarize data
                # Summarizing years
            df_years = pd.DataFrame(param.years.data, columns=['years']).describe(percentiles=[min_percentile, max_percentile], include='all')
                # Summarizing voltage
            df_voltage = pd.DataFrame(param.voltage.data, columns=['voltage']).describe(percentiles=[min_percentile, max_percentile], include='all')
                # Summarizing exposure
            df_exposure = pd.DataFrame(param.exposure.data, columns=['exposure']).describe(percentiles=[min_percentile, max_percentile], include='all')
                # Summarizing kernel
            df_kernel = pd.DataFrame(param.kernel, columns=['kernel']).describe(percentiles=[min_percentile, max_percentile], include='all')
                # Summarize xy spacing
            df_xy = pd.DataFrame(param.xyDim.data, columns=['xyDim']).describe(percentiles=[min_percentile, max_percentile], include='all')
                # Summarize z spacing
            df_z = pd.DataFrame(param.zDim.data, columns=['zDim']).describe(percentiles=[min_percentile, max_percentile], include='all')
                # Summarizing xy-spacing
            df_xy = pd.DataFrame(param.xyDim.data, columns=['xyDim'])
                # Summarizing z-spacing
            df_z = pd.DataFrame(param.zDim.data, columns=['zDim'])

            # Plotting xy-spacing histogram
            ax = df_xy.hist(column='xyDim')
            min_quant, max_quant, average = df_xy.quantile(min_percentile), df_xy.quantile(max_percentile), param.xyDim.data.mean()
            for x in ax[0]:
                x.axvline(min_quant.xyDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.xyDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}")
                x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}")
                x.grid(False)
                plt.title(f"CT xy-spacing imaging summary for {wildcard}")
                plt.legend()
                plt.show()
            
            # Plotting z-spacing histogram
            ax = df_z.hist(column='zDim')
            min_quant, max_quant, average = df_z.quantile(min_percentile), df_z.quantile(max_percentile), param.zDim.data.mean()
            for x in ax[0]:
                x.axvline(min_quant.zDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}")
                x.axvline(max_quant.zDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}")
                x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}")
                x.grid(False)
                plt.title(f"CT z-spacing imaging summary for {wildcard}")
                plt.legend()
                plt.show()
            
            # Summarizing xy-spacing
            df_xy = df_xy.describe(percentiles=[min_percentile, max_percentile], include='all')
            # Summarizing  z-spacing
            df_z = df_z.describe(percentiles=[min_percentile, max_percentile], include='all')

            # Saving data
            name_save = wildcard.replace('*', '').replace('.npy', '')
            save_name = 'imagingSummary__' + name_save + ".json"
            df_all = [df_years, df_voltage, df_exposure, df_kernel, df_xy, df_z]
            df_all = df_all[0].join(df_all[1:])
            df_all.to_json(path_save_checks / save_name, orient='columns', indent=4)



[docs]
    def perform_imaging_summary(self, 
                                wildcards_scans: List[str],
                                path_data: Path = None,
                                path_save_checks: Path = None,
                                min_percentile: float = 0.05,
                                max_percentile: float = 0.95
                                ) -> None:
        """
        Summarizes CT and MR imaging acquisition parameters. Plots summary histograms
        for different dimensions and saves all acquisition parameters locally in JSON files.

        Args:
            wildcards_scans (List[str]): List of wildcards that determines the scans 
                that will be analyzed (CT and MRI scans will be analyzed). You can learn more about wildcards in
                `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_.
                For example: ``[\"STS*.CTscan.npy\", \"STS*.MRscan.npy\"]``.
            path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the 
                inner-class ``Paths`` in the current instance.
            path_save_checks (Path, optional): Path where to save the checks, if not specified will use the one 
                in the current instance.
            min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05.
            max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95.
        
        Returns:
            None.
        """
        # MR imaging summary
        wildcards_scans_mr = [wildcard for wildcard in wildcards_scans if 'MRscan' in wildcard]
        if len(wildcards_scans_mr) == 0:
            print("Cannot perform imaging summary for MR, no MR scan wildcard was given! ")
        else:
            self.perform_mr_imaging_summary(
                                wildcards_scans_mr, 
                                path_data, 
                                path_save_checks, 
                                min_percentile, 
                                max_percentile)
        # CT imaging summary
        wildcards_scans_ct = [wildcard for wildcard in wildcards_scans if 'CTscan' in wildcard]
        if len(wildcards_scans_ct) == 0:
            print("Cannot perform imaging summary for CT, no CT scan wildcard was given! ")
        else:
            self.perform_ct_imaging_summary(
                                wildcards_scans_ct, 
                                path_data, 
                                path_save_checks, 
                                min_percentile, 
                                max_percentile)