Source code for MEDiml.wrangling.DataManager

import json
import logging
import os
import pickle
import re
from dataclasses import dataclass
from pathlib import Path
from time import time
from typing import List, Union

import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import pydicom
import pydicom.misc
import ray
from nilearn import image
from numpyencoder import NumpyEncoder
from tqdm import tqdm, trange

from ..MEDscan import MEDscan
from ..processing.PETSUVConverter import PETSUVConverter
from ..processing.segmentation import get_roi_from_indexes
from ..utils.get_file_paths import get_file_paths
from ..utils.get_patient_names import get_patient_names
from ..utils.imref import imref3d
from ..utils.json_utils import load_json, save_json
from ..utils.save_MEDscan import save_MEDscan
from .ProcessDICOM import ProcessDICOM


[docs] class DataManager(object): """Reads all the raw data (DICOM, NIfTI) content and organizes it in instances of the MEDscan class."""
[docs] @dataclass class DICOM(object): """DICOM data management class that will organize data during the conversion to MEDscan class process""" stack_series_rs: List stack_path_rs: List stack_frame_rs: List cell_series_id: List cell_path_rs: List cell_path_images: List cell_frame_rs: List cell_frame_id: List
[docs] @dataclass class NIfTI(object): """NIfTI data management class that will organize data during the conversion to MEDscan class process""" stack_path_images: List stack_path_roi: List stack_path_all: List
[docs] @dataclass class Paths(object): """Paths management class that will organize the paths used in the processing""" _path_to_dicoms: List _path_to_niftis: List _path_csv: Union[Path, str] _path_save: Union[Path, str] _path_save_checks: Union[Path, str] _path_pre_checks_settings: Union[Path, str]
[docs] def __init__( self, path_to_dicoms: List = [], path_to_niftis: List = [], path_csv: Union[Path, str] = None, path_save: Union[Path, str] = None, path_save_checks: Union[Path, str] = None, path_pre_checks_settings: Union[Path, str] = None, save: bool = True, n_batch: int = 2 ) -> None: """Constructor of the class DataManager. Args: path_to_dicoms (Union[Path, str], optional): Full path to the starting directory where the DICOM data is located. path_to_niftis (Union[Path, str], optional): Full path to the starting directory where the NIfTI is located. path_csv (Union[Path, str], optional): Full path to the CSV file containing the scans info list. path_save (Union[Path, str], optional): Full path to the directory where to save all the MEDscan classes. path_save_checks(Union[Path, str], optional): Full path to the directory where to save all the pre-radiomics checks analysis results. path_pre_checks_settings(Union[Path, str], optional): Full path to the JSON file of the pre-checks analysis parameters. save (bool, optional): True to save the MEDscan classes in `path_save`. n_batch (int, optional): Numerical value specifying the number of batch to use in the parallel computations (use 0 for serial computation). Returns: None """ # Convert all paths to Pathlib.Path if path_to_dicoms: path_to_dicoms = Path(path_to_dicoms) if path_to_niftis: path_to_niftis = Path(path_to_niftis) if path_csv: path_csv = Path(path_csv) if path_save: path_save = Path(path_save) if path_save_checks: path_save_checks = Path(path_save_checks) if path_pre_checks_settings: path_pre_checks_settings = Path(path_pre_checks_settings) self.paths = self.Paths( path_to_dicoms, path_to_niftis, path_csv, path_save, path_save_checks, path_pre_checks_settings, ) self.save = save self.n_batch = n_batch self.__dicom = self.DICOM( stack_series_rs=[], stack_path_rs=[], stack_frame_rs=[], cell_series_id=[], cell_path_rs=[], cell_path_images=[], cell_frame_rs=[], cell_frame_id=[] ) self.__nifti = self.NIfTI( stack_path_images=[], stack_path_roi=[], stack_path_all=[] ) self.path_to_objects = [] self.summary = {} self.csv_data = None self.__studies = [] self.__institutions = [] self.__scans = []
def __find_uid_cell_index(self, uid: Union[str, List[str]], cell: List[str]) -> List: """Finds the cell with the same `uid`. If not is present in `cell`, creates a new position in the `cell` for the new `uid`. Args: uid (Union[str, List[str]]): Unique identifier of the Series to find. cell (List[str]): List of Unique identifiers of the Series. Returns: Union[List[str], str]: List or string of the uid """ return [len(cell)] if uid not in cell else[i for i, e in enumerate(cell) if e == uid] def __get_list_of_files(self, dir_name: str) -> List: """Gets all files in the given directory Args: dir_name (str): directory name Returns: List: List of all files in the directory """ list_of_file = os.listdir(dir_name) all_files = list() for entry in list_of_file: full_path = os.path.join(dir_name, entry) if os.path.isdir(full_path): all_files = all_files + self.__get_list_of_files(full_path) else: all_files.append(full_path) return all_files def __get_MEDscan_name_save(self, medscan: MEDscan) -> str: """Returns the name that will be used to save the MEDscan instance, based on the values of the attributes. Args: medscan(MEDscan): A MEDscan class instance. Returns: str: String of the name save. """ series_description = medscan.series_description.translate({ord(ch): '-' for ch in '/\\ ()&:*'}) name_id = medscan.patientID.translate({ord(ch): '-' for ch in '/\\ ()&:*'}) # final saving name name_complete = name_id + '__' + series_description + '.' + medscan.type + '.npy' return name_complete def __associate_rt_stuct(self) -> None: """Associates the imaging volumes to their mask using UIDs Returns: None """ print('--> Associating all RT objects to imaging volumes') n_rs = len(self.__dicom.stack_path_rs) self.__dicom.stack_series_rs = list(dict.fromkeys(self.__dicom.stack_series_rs)) if n_rs: for i in trange(0, n_rs): try: # PUT ALL THE DICOM PATHS WITH THE SAME UID IN THE SAME PATH LIST ind_series_id = self.__find_uid_cell_index( self.__dicom.stack_series_rs[i], self.__dicom.cell_series_id) for n in range(len(ind_series_id)): if ind_series_id[n] < len(self.__dicom.cell_path_rs): self.__dicom.cell_path_rs[ind_series_id[n]] += [self.__dicom.stack_path_rs[i]] except: ind_series_id = self.__find_uid_cell_index( self.__dicom.stack_frame_rs[i], self.__dicom.cell_frame_id) for n in range(len(ind_series_id)): if ind_series_id[n] < len(self.__dicom.cell_path_rs): self.__dicom.cell_path_rs[ind_series_id[n]] += [self.__dicom.stack_path_rs[i]] print('DONE') def __read_all_dicoms(self) -> None: """Reads all the dicom files in the all the paths of the attribute `_path_to_dicoms` Returns: None """ # SCANNING ALL FOLDERS IN INITIAL DIRECTORY print('\n--> Scanning all folders in initial directory...', end='') p = Path(self.paths._path_to_dicoms) e_rglob = '*.dcm' # EXTRACT ALL FILES IN THE PATH TO DICOMS if self.paths._path_to_dicoms.is_dir(): stack_folder_temp = list(p.rglob(e_rglob)) stack_folder = [x for x in stack_folder_temp if not x.is_dir()] elif str(self.paths._path_to_dicoms).find('json') != -1: with open(self.paths._path_to_dicoms) as f: data = json.load(f) for value in data.values(): stack_folder_temp = value directory_name = str(stack_folder_temp).replace("'", '').replace('[', '').replace(']', '') stack_folder = self.__get_list_of_files(directory_name) else: raise ValueError("The given dicom folder path either doesn't exist or not a folder.") # READ ALL DICOM FILES AND UPDATE ATTRIBUTES FOR FURTHER PROCESSING for file in tqdm(stack_folder): if pydicom.misc.is_dicom(file): try: info = pydicom.dcmread(str(file)) if info.Modality in ['MR', 'PT', 'CT']: ind_series_id = self.__find_uid_cell_index( info.SeriesInstanceUID, self.__dicom.cell_series_id)[0] if ind_series_id == len(self.__dicom.cell_series_id): # New volume self.__dicom.cell_series_id = self.__dicom.cell_series_id + [info.SeriesInstanceUID] self.__dicom.cell_frame_id += [info.FrameOfReferenceUID] self.__dicom.cell_path_images += [[]] self.__dicom.cell_path_rs = self.__dicom.cell_path_rs + [[]] self.__dicom.cell_path_images[ind_series_id] += [file] elif info.Modality == 'RTSTRUCT': self.__dicom.stack_path_rs += [file] try: series_uid = info.ReferencedFrameOfReferenceSequence[ 0].RTReferencedStudySequence[ 0].RTReferencedSeriesSequence[ 0].SeriesInstanceUID except: series_uid = 'NotFound' self.__dicom.stack_series_rs += [series_uid] try: frame_uid = info.ReferencedFrameOfReferenceSequence[0].FrameOfReferenceUID except: frame_uid = info.FrameOfReferenceUID self.__dicom.stack_frame_rs += [frame_uid] else: print("Modality not supported: ", info.Modality) except Exception as e: print(f'Error while reading: {file}, error: {e}\n') continue print('DONE') # ASSOCIATE ALL VOLUMES TO THEIR MASK self.__associate_rt_stuct()
[docs] def process_all_dicoms(self) -> Union[List[MEDscan], None]: """This function reads the DICOM content of all the sub-folder tree of a starting directory defined by `path_to_dicoms`. It then organizes the data (files throughout the starting directory are associated by 'SeriesInstanceUID') in the MEDscan class including the region of interest (ROI) defined by an associated RTstruct. All MEDscan classes hereby created are saved in `path_save` with a name varying with every scan. Returns: List[MEDscan]: List of MEDscan instances. """ # Initialize ray if ray.is_initialized(): ray.shutdown() ray.init(local_mode=True, include_dashboard=False) print('--> Reading all DICOM objects to create MEDscan classes') self.__read_all_dicoms() print('--> Processing DICOMs and creating MEDscan objects') n_scans = len(self.__dicom.cell_path_images) if self.n_batch is None: n_batch = 1 elif n_scans < self.n_batch: n_batch = n_scans else: n_batch = self.n_batch # Distribute the first tasks to all workers pds = [ProcessDICOM( self.__dicom.cell_path_images[i], self.__dicom.cell_path_rs[i], self.paths._path_save, self.save) for i in range(n_batch)] ids = [pd.process_files() for pd in pds] # Update the path to the created instances if self.save: for name_save in ray.get(ids): if self.paths._path_save: self.path_to_objects.append(str(self.paths._path_save / name_save)) # Update processing summary if name_save.split('_')[0].count('-') >= 2: scan_type = name_save[name_save.find('__')+2 : name_save.find('.')] if name_save.split('-')[0] not in self.__studies: self.__studies.append(name_save.split('-')[0]) # add new study if name_save.split('-')[1] not in self.__institutions: self.__institutions.append(name_save.split('-')[1]) # add new study if name_save.split('-')[0] not in self.summary: self.summary[name_save.split('-')[0]] = {} if name_save.split('-')[1] not in self.summary[name_save.split('-')[0]]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]] = {} # add new institution if scan_type not in self.__scans: self.__scans.append(scan_type) if scan_type not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type] = [] if name_save not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type].append(name_save) else: logging.warning(f"The patient ID of the following file: {name_save} does not respect the MEDiml "\ "naming convention 'study-institution-id' (Ex: Glioma-TCGA-001)") nb_job_left = n_scans - n_batch return ray.get(ids) if not self.save else None # Distribute the remaining tasks for _ in trange(n_scans): _, ids = ray.wait(ids, num_returns=1) if nb_job_left > 0: idx = n_scans - nb_job_left pd = ProcessDICOM( self.__dicom.cell_path_images[idx], self.__dicom.cell_path_rs[idx], self.paths._path_save, self.save) ids.extend([pd.process_files()]) nb_job_left -= 1 # Update the path to the created instances if self.save: for name_save in ray.get(ids): if self.paths._path_save: self.path_to_objects.extend(str(self.paths._path_save / name_save)) # Update processing summary if name_save.split('_')[0].count('-') >= 2: scan_type = name_save[name_save.find('__')+2 : name_save.find('.')] if name_save.split('-')[0] not in self.__studies: self.__studies.append(name_save.split('-')[0]) # add new study if name_save.split('-')[1] not in self.__institutions: self.__institutions.append(name_save.split('-')[1]) # add new study if name_save.split('-')[0] not in self.summary: self.summary[name_save.split('-')[0]] = {} if name_save.split('-')[1] not in self.summary[name_save.split('-')[0]]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]] = {} # add new institution if scan_type not in self.__scans: self.__scans.append(scan_type) if scan_type not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type] = [] if name_save not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type].append(name_save) else: logging.warning(f"The patient ID of the following file: {name_save} does not respect the MEDiml "\ "naming convention 'study-institution-id' (Ex: Glioma-TCGA-001)") else: return ray.get(ids) print('DONE')
def __read_all_niftis(self) -> None: """Reads all files in the initial path and organizes other path to images and roi in the class attributes. Returns: None. """ print('\n--> Scanning all folders in initial directory') if not self.paths._path_to_niftis: raise ValueError("The path to the niftis is not defined") p = Path(self.paths._path_to_niftis) e_rglob1 = '*.nii' e_rglob2 = '*.nii.gz' # EXTRACT ALL FILES IN THE PATH TO DICOMS if p.is_dir(): self.__nifti.stack_path_all = list(p.rglob(e_rglob1)) self.__nifti.stack_path_all.extend(list(p.rglob(e_rglob2))) else: raise TypeError(f"{p} must be a path to a directory") all_niftis = list(self.__nifti.stack_path_all) for i in trange(0, len(all_niftis)): if 'ROI' in all_niftis[i].name.split("."): self.__nifti.stack_path_roi.append(all_niftis[i]) else: self.__nifti.stack_path_images.append(all_niftis[i]) print('DONE') def __associate_roi_to_image( self, image_file: Union[Path, str], medscan: MEDscan, nifti: nib.Nifti1Image, path_roi_data: Path = None ) -> MEDscan: """Extracts all ROI data from the given path for the given patient ID and updates all class attributes with the new extracted data. Args: image_file(Union[Path, str]): Path to the ROI data. medscan (MEDscan): MEDscan class instance that will hold the data. Returns: MEDscan: Returns a MEDscan instance with updated roi attributes. """ def load_mask(_id, file, medscan): # Load the patient's ROI nifti files: if file.name.startswith(_id) and 'ROI' in file.name.split("."): roi = nib.load(file) roi = image.resample_to_img(roi, nifti, interpolation='nearest') roi_data = roi.get_fdata() roi_name = file.name[file.name.find("(") + 1 : file.name.find(")")] name_set = file.name[file.name.find("_") + 2 : file.name.find("(")] medscan.data.ROI.update_indexes(key=roi_index, indexes=np.nonzero(roi_data.flatten())) medscan.data.ROI.update_name_set(key=roi_index, name_set=name_set) medscan.data.ROI.update_roi_name(key=roi_index, roi_name=roi_name) else: raise ValueError(f"The ROI file for patient ID: {_id} " f"was not found in the given path: {file} or was not correctly named.") image_file = Path(image_file) roi_index = 0 if not path_roi_data: if not self.paths._path_to_niftis: raise ValueError("The path to the niftis is not defined") else: path_roi_data = self.paths._path_to_niftis for file in self.__nifti.stack_path_roi: _id = file.name.split("(")[0] if ("(") in file.name else file.name # id is PatientID__ImagingScanName load_mask(_id, file, medscan) roi_index += 1 else: _id = image_file.name.split("(")[0] if ("(") in image_file.name else image_file.name # id is PatientID__ImagingScanName load_mask(_id, path_roi_data, medscan) return medscan def __associate_spatialRef(self, nifti_file: Union[Path, str], medscan: MEDscan) -> MEDscan: """Computes the imref3d spatialRef using a NIFTI file and updates the spatialRef attribute. Args: nifti_file(Union[Path, str]): Path to the nifti data. medscan (MEDscan): MEDscan class instance that will hold the data. Returns: MEDscan: Returns a MEDscan instance with updated spatialRef attribute. """ # Loading the nifti file : nifti = nib.load(nifti_file) nifti_data = medscan.data.volume.array # spatialRef Creation pixel_x = abs(nifti.affine[0, 0]) pixel_y = abs(nifti.affine[1, 1]) slices = abs(nifti.affine[2, 2]) min_grid = nifti.affine[:3, 3] * [-1.0, -1.0, 1.0] # x and y are flipped min_x_grid = min_grid[0] min_y_grid = min_grid[1] min_z_grid = min_grid[2] size_image = np.shape(nifti_data) spatialRef = imref3d(size_image, abs(pixel_x), abs(pixel_y), abs(slices)) spatialRef.XWorldLimits = (np.array(spatialRef.XWorldLimits) - (spatialRef.XWorldLimits[0] - (min_x_grid-pixel_x/2)) ).tolist() spatialRef.YWorldLimits = (np.array(spatialRef.YWorldLimits) - (spatialRef.YWorldLimits[0] - (min_y_grid-pixel_y/2)) ).tolist() spatialRef.ZWorldLimits = (np.array(spatialRef.ZWorldLimits) - (spatialRef.ZWorldLimits[0] - (min_z_grid-slices/2)) ).tolist() # Converting the results into lists spatialRef.ImageSize = spatialRef.ImageSize.tolist() spatialRef.XIntrinsicLimits = spatialRef.XIntrinsicLimits.tolist() spatialRef.YIntrinsicLimits = spatialRef.YIntrinsicLimits.tolist() spatialRef.ZIntrinsicLimits = spatialRef.ZIntrinsicLimits.tolist() # update spatialRef in the volume sub-class medscan.data.volume.update_spatialRef(spatialRef) return medscan def __process_one_nifti(self, nifti_file: Union[Path, str], path_data) -> MEDscan: """ Processes one NIfTI file to create a MEDscan class instance. Args: nifti_file (Union[Path, str]): Path to the NIfTI file. path_data (Union[Path, str]): Path to the data. Returns: MEDscan: MEDscan class instance. """ medscan = MEDscan() medscan.patientID = os.path.basename(nifti_file).split("_")[0] medscan.type = os.path.basename(nifti_file).split(".")[-3] medscan.series_description = nifti_file.name[nifti_file.name.find('__') + 2: nifti_file.name.find('(')] medscan.format = "nifti" medscan.data.set_orientation(orientation="Axial") medscan.data.set_patient_position(patient_position="HFS") medscan.data.volume.array = nib.load(nifti_file).get_fdata() medscan.data.volume.scan_rot = None # Update spatialRef self.__associate_spatialRef(nifti_file, medscan) # Assiocate ROI medscan = self.__associate_roi_to_image(nifti_file, medscan, nib.load(nifti_file), path_data) return medscan
[docs] def process_all(self) -> None: """Processes both DICOM & NIfTI content to create MEDscan classes """ self.process_all_dicoms() self.process_all_niftis()
[docs] def process_all_niftis(self) -> List[MEDscan]: """This function reads the NIfTI content of all the sub-folder tree of a starting directory. It then organizes the data in the MEDscan class including the region of interest (ROI) defined by an associated mask file. All MEDscan classes hereby created are saved in a specific path with a name specific name varying with every scan. Args: None. Returns: List[MEDscan]: List of MEDscan instances. """ # Reading all NIfTI files self.__read_all_niftis() # Create the MEDscan instances print('--> Reading all NIfTI objects (imaging volumes & masks) to create MEDscan classes') list_instances = [] for file in tqdm(self.__nifti.stack_path_images): # Assert the list of instances does not exceed the a size of 10 if len(list_instances) >= 10: print('The number of MEDscan instances exceeds 10, please consider saving the instances') break # INITIALIZE MEDscan INSTANCE AND UPDATE ATTRIBUTES medscan = MEDscan() medscan.patientID = os.path.basename(file).split("_")[0] medscan.type = os.path.basename(file).split(".")[-3] medscan.series_description = file.name[file.name.find('__') + 2: file.name.find('(')] if '__' in file.name else "" medscan.format = "nifti" medscan.data.set_orientation(orientation="Axial") medscan.data.set_patient_position(patient_position="HFS") medscan.data.volume.array = nib.load(file).get_fdata() # RAS to LPS #medscan.data.volume.convert_to_LPS() medscan.data.volume.scan_rot = None # Update spatialRef medscan = self.__associate_spatialRef(file, medscan) # Get ROI medscan = self.__associate_roi_to_image(file, medscan, nib.load(file)) # SAVE MEDscan INSTANCE if self.save and self.paths._path_save: save_MEDscan(medscan, self.paths._path_save) else: list_instances.append(medscan) # Update the path to the created instances name_save = self.__get_MEDscan_name_save(medscan) # Clear memory del medscan # Update the path to the created instances if self.paths._path_save: self.path_to_objects.append(str(self.paths._path_save / name_save)) # Update processing summary if name_save.split('_')[0].count('-') >= 2: scan_type = name_save[name_save.find('__')+2 : name_save.find('.')] if name_save.split('-')[0] not in self.__studies: self.__studies.append(name_save.split('-')[0]) # add new study if name_save.split('-')[1] not in self.__institutions: self.__institutions.append(name_save.split('-')[1]) # add new institution if name_save.split('-')[0] not in self.summary: self.summary[name_save.split('-')[0]] = {} # add new study to summary if name_save.split('-')[1] not in self.summary[name_save.split('-')[0]]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]] = {} # add new institution if scan_type not in self.__scans: self.__scans.append(scan_type) if scan_type not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type] = [] if name_save not in self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type]: self.summary[name_save.split('-')[0]][name_save.split('-')[1]][scan_type].append(name_save) else: if self.save: logging.warning(f"The patient ID of the following file: {name_save} does not respect the MEDiml "\ "naming convention 'study-institution-id' (Ex: Glioma-TCGA-001)") print('DONE') if list_instances: return list_instances
[docs] def process_one_nifti(self, path_image: Union[Path, str], path_mask: Union[Path, str]) -> MEDscan: """Processes one NIfTI file to create a MEDscan class instance. Args: nifti_file (Union[Path, str]): Path to the NIfTI file. path_data (Union[Path, str]): Path to the data. Returns: MEDscan: MEDscan class instance. """ medscan = self.__process_one_nifti(path_image, path_mask) # SAVE MEDscan INSTANCE if self.save and self.paths._path_save: save_MEDscan(medscan, self.paths._path_save) return medscan
[docs] def update_from_csv(self, path_csv: Union[str, Path] = None) -> None: """Updates the class from a given CSV and summarizes the processed scans again according to it. Args: path_csv(optional, Union[str, Path]): Path to a csv file, if not given, will check for csv info in the class attributes. Returns: None """ if not (path_csv or self.paths._path_csv): print('No csv provided, no updates will be made') else: if path_csv: self.paths._path_csv = path_csv # Extract roi type label from csv file name name_csv = self.paths._path_csv.name roi_type_label = name_csv[name_csv.find('_')+1 : name_csv.find('.')] # Create a dictionary csv_data = {} csv_data[roi_type_label] = pd.read_csv(self.paths._path_csv) self.csv_data = csv_data self.summarize()
[docs] def summarize(self, retrun_summary: bool = False) -> None: """Creates and shows a summary of processed scans organized by study, institution, scan type and roi type Args: retrun_summary (bool, optional): If True, will return the summary as a dictionary. Returns: None """ def count_scans(summary): count = 0 if type(summary) == dict: for study in summary: if type(summary[study]) == dict: for institution in summary[study]: if type(summary[study][institution]) == dict: for scan in self.summary[study][institution]: count += len(summary[study][institution][scan]) else: count += len(summary[study][institution]) else: count += len(summary[study]) elif type(summary) == list: count = len(summary) return count summary_df = pd.DataFrame(columns=['study', 'institution', 'scan_type', 'roi_type', 'count']) for study in self.summary: summary_df = summary_df.append({ 'study': study, 'institution': "", 'scan_type': "", 'roi_type': "", 'count' : count_scans(self.summary) }, ignore_index=True) for institution in self.summary[study]: summary_df = summary_df.append({ 'study': study, 'institution': institution, 'scan_type': "", 'roi_type': "", 'count' : count_scans(self.summary[study][institution]) }, ignore_index=True) for scan in self.summary[study][institution]: summary_df = summary_df.append({ 'study': study, 'institution': institution, 'scan_type': scan, 'roi_type': "", 'count' : count_scans(self.summary[study][institution][scan]) }, ignore_index=True) if self.csv_data: roi_count = 0 for roi_type in self.csv_data: csv_table = pd.DataFrame(self.csv_data[roi_type]) csv_table['under'] = '_' csv_table['dot'] = '.' csv_table['npy'] = '.npy' name_patients = (pd.Series( csv_table[['PatientID', 'under', 'under', 'ImagingScanName', 'dot', 'ImagingModality', 'npy']].fillna('').values.tolist()).str.join('')).tolist() for patient_id in self.summary[study][institution][scan]: if patient_id in name_patients: roi_count += 1 summary_df = summary_df.append({ 'study': study, 'institution': institution, 'scan_type': scan, 'roi_type': roi_type, 'count' : roi_count }, ignore_index=True) print(summary_df.to_markdown(index=False)) if retrun_summary: return summary_df
def __pre_radiomics_checks_dimensions( self, path_data: Union[Path, str] = None, wildcards_dimensions: List[str] = [], min_percentile: float = 0.05, max_percentile: float = 0.95, save: bool = False ) -> None: """Finds proper voxels dimension options for radiomics analyses for a group of scans Args: path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the inner-class ``Paths`` in the current instance. wildcards_dimensions(List[str], optional): List of wildcards that determines the scans that will be analyzed. You can learn more about wildcards in :ref:`this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`. min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05. max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95. save (bool, optional): If True, will save the results in a json file. Defaults to False. Returns: None. """ xy_dim = { "data": [], "mean": [], "median": [], "std": [], "min": [], "max": [], f"p{min_percentile}": [], f"p{max_percentile}": [] } z_dim = { "data": [], "mean": [], "median": [], "std": [], "min": [], "max": [], f"p{min_percentile}": [], f"p{max_percentile}": [] } if type(wildcards_dimensions) is str: wildcards_dimensions = [wildcards_dimensions] if len(wildcards_dimensions) == 0: print("Wildcard is empty, the pre-checks will be aborted") return # Updating plotting params plt.rcParams["figure.figsize"] = (20,20) plt.rcParams.update({'font.size': 22}) # TODO: seperate by studies and scan type (MRscan, CTscan...) # TODO: Two summaries (df, list of names saves) -> # name_save = name_save(ROI) : Glioma-Huashan-001__T1.MRscan.npy({GTV}) file_paths = list() for w in range(len(wildcards_dimensions)): wildcard = wildcards_dimensions[w] if path_data: file_paths = get_file_paths(path_data, wildcard) elif self.paths._path_save: file_paths = get_file_paths(self.paths._path_save, wildcard) else: raise ValueError("Path data is invalid.") n_files = len(file_paths) xy_dim["data"] = np.zeros((n_files, 1)) xy_dim["data"] = np.multiply(xy_dim["data"], np.nan) z_dim["data"] = np.zeros((n_files, 1)) z_dim["data"] = np.multiply(z_dim["data"], np.nan) for f in tqdm(range(len(file_paths))): try: if file_paths[f].name.endswith("nii.gz") or file_paths[f].name.endswith("nii"): with open(file_paths[f], 'rb') as file: medscan = pickle.load(file) xy_dim["data"][f] = medscan.header.get_zooms()[0] z_dim["data"][f] = medscan.header.get_zooms()[2] else: with open(file_paths[f], 'rb') as file: medscan = pickle.load(file) xy_dim["data"][f] = medscan.data.volume.spatialRef.PixelExtentInWorldX z_dim["data"][f] = medscan.data.volume.spatialRef.PixelExtentInWorldZ except Exception as e: print(e) # Running analysis xy_dim["data"] = np.concatenate(xy_dim["data"]) xy_dim["mean"] = np.mean(xy_dim["data"][~np.isnan(xy_dim["data"])]) xy_dim["median"] = np.median(xy_dim["data"][~np.isnan(xy_dim["data"])]) xy_dim["std"] = np.std(xy_dim["data"][~np.isnan(xy_dim["data"])]) xy_dim["min"] = np.min(xy_dim["data"][~np.isnan(xy_dim["data"])]) xy_dim["max"] = np.max(xy_dim["data"][~np.isnan(xy_dim["data"])]) xy_dim[f"p{min_percentile}"] = np.percentile(xy_dim["data"][~np.isnan(xy_dim["data"])], min_percentile) xy_dim[f"p{max_percentile}"] = np.percentile(xy_dim["data"][~np.isnan(xy_dim["data"])], max_percentile) z_dim["mean"] = np.mean(z_dim["data"][~np.isnan(z_dim["data"])]) z_dim["median"] = np.median(z_dim["data"][~np.isnan(z_dim["data"])]) z_dim["std"] = np.std(z_dim["data"][~np.isnan(z_dim["data"])]) z_dim["min"] = np.min(z_dim["data"][~np.isnan(z_dim["data"])]) z_dim["max"] = np.max(z_dim["data"][~np.isnan(z_dim["data"])]) z_dim[f"p{min_percentile}"] = np.percentile(z_dim["data"][~np.isnan(z_dim["data"])], min_percentile) z_dim[f"p{max_percentile}"] = np.percentile(z_dim["data"][~np.isnan(z_dim["data"])], max_percentile) xy_dim["data"] = xy_dim["data"].tolist() z_dim["data"] = z_dim["data"].tolist() # Plotting xy-spacing data histogram df_xy = pd.DataFrame(xy_dim["data"], columns=['data']) del xy_dim["data"] # no interest in keeping data (we only need statistics) ax = df_xy.hist(column='data') min_quant, max_quant, median = df_xy.quantile(min_percentile), df_xy.quantile(max_percentile), df_xy.median() for x in ax[0]: x.axvline(min_quant.data, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}") x.axvline(max_quant.data, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}") x.axvline(median.data, linestyle='solid', color='gold', label=f"Median: {float(median.data):.3f}") x.grid(False) plt.title(f"Voxels xy-spacing checks for {wildcard}") plt.legend() # Save the plot if save: plt.savefig(self.paths._path_save_checks / ('Voxels_xy_check.png')) else: plt.show() # Plotting z-spacing data histogram df_z = pd.DataFrame(z_dim["data"], columns=['data']) del z_dim["data"] # no interest in keeping data (we only need statistics) ax = df_z.hist(column='data') min_quant, max_quant, median = df_z.quantile(min_percentile), df_z.quantile(max_percentile), df_z.median() for x in ax[0]: x.axvline(min_quant.data, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}") x.axvline(max_quant.data, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}") x.axvline(median.data, linestyle='solid', color='gold', label=f"Median: {float(median.data):.3f}") x.grid(False) plt.title(f"Voxels z-spacing checks for {wildcard}") plt.legend() # Save the plot if save: plt.savefig(self.paths._path_save_checks / ('Voxels_z_check.png')) else: plt.show() # Saving files using wildcard for name if save: wildcard = str(wildcard).replace('*', '').replace('.npy', '.json') save_json(self.paths._path_save_checks / ('xyDim_' + wildcard), xy_dim, cls=NumpyEncoder) save_json(self.paths._path_save_checks / ('zDim_' + wildcard), z_dim, cls=NumpyEncoder) def __pre_radiomics_checks_window( self, path_data: Union[str, Path] = None, wildcards_window: List = [], path_csv: Union[str, Path] = None, min_percentile: float = 0.05, max_percentile: float = 0.95, bin_width: int = 0, hist_range: list = [], nifti: bool = True, save: bool = False ) -> None: """Finds proper re-segmentation ranges options for radiomics analyses for a group of scans Args: path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the inner-class ``Paths`` in the current instance. wildcards_window(List[str], optional): List of wildcards that determines the scans that will be analyzed. You can learn more about wildcards in :ref:`this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`. path_csv(Union[str, Path], optional): Path to a csv file containing a list of the scans that will be analyzed (a CSV file for a single ROI type). min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05. max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95. bin_width(int, optional): Width of the bins for the histograms. If not provided, will use the default number of bins in the method :ref:`pandas.DataFrame.hist <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html>`: 10 bins. hist_range(list, optional): Range of the histograms. If empty, will use the minimum and maximum values. nifti(bool, optional): If True, will use the NIfTI files, otherwise will use the numpy files. save (bool, optional): If True, will save the results in a json file. Defaults to False. Returns: None. """ # Updating plotting params plt.rcParams["figure.figsize"] = (20,20) plt.rcParams.update({'font.size': 22}) if type(wildcards_window) is str: wildcards_window = [wildcards_window] if len(wildcards_window) == 0: print("Wilcards is empty") return if path_csv: self.paths._path_csv = Path(path_csv) roi_table = pd.read_csv(self.paths._path_csv) if nifti: roi_table['under'] = '_' roi_table['dot'] = '.' roi_table['roi_label'] = 'GTV' roi_table['oparenthesis'] = '(' roi_table['cparenthesis'] = ')' roi_table['ext'] = '.nii.gz' patient_names = (pd.Series( roi_table[['PatientID', 'under', 'under', 'ImagingScanName', 'oparenthesis', 'roi_label', 'cparenthesis', 'dot', 'ImagingModality', 'ext']].fillna('').values.tolist()).str.join('')).tolist() else: roi_names = [[], [], []] roi_names[0] = roi_table['PatientID'] roi_names[1] = roi_table['ImagingScanName'] roi_names[2] = roi_table['ImagingModality'] patient_names = get_patient_names(roi_names) for w in range(len(wildcards_window)): temp_val = [] temp = [] file_paths = [] roi_data= { "data": [], "mean": [], "median": [], "std": [], "min": [], "max": [], f"p{min_percentile}": [], f"p{max_percentile}": [] } wildcard = wildcards_window[w] if path_data: file_paths = get_file_paths(path_data, wildcard) elif self.paths._path_save: path_data = self.paths._path_save file_paths = get_file_paths(self.paths._path_save, wildcard) else: raise ValueError("Path data is invalid.") n_files = len(file_paths) i = 0 for f in tqdm(range(n_files)): file = file_paths[f] _, filename = os.path.split(file) filename, ext = os.path.splitext(filename) patient_name = filename + ext try: if file.name.endswith('nii.gz') or file.name.endswith('nii'): medscan = self.__process_one_nifti(file, path_data) else: with open(file, 'rb') as file: medscan = pickle.load(file) if re.search('PTscan', wildcard) and medscan.format != 'nifti': suv_converter = PETSUVConverter(medscan.dicomH) medscan.data.volume.array = suv_converter.compute(np.double(medscan.data.volume.array)) patient_names = pd.Index(patient_names) ind_roi = patient_names.get_loc(patient_name) name_roi = roi_table.loc[ind_roi][3] vol_obj_init, roi_obj_init = get_roi_from_indexes(medscan, name_roi, 'box') temp = vol_obj_init.data[roi_obj_init.data == 1] temp_val.append(len(temp)) roi_data["data"].append(np.zeros(shape=(n_files, temp_val[i]))) roi_data["data"][i] = temp i+=1 del medscan del vol_obj_init del roi_obj_init except Exception as e: print(f"Problem with patient {patient_name}, error: {e}") roi_data["data"] = np.concatenate(roi_data["data"]) roi_data["mean"] = np.mean(roi_data["data"][~np.isnan(roi_data["data"])]) roi_data["median"] = np.median(roi_data["data"][~np.isnan(roi_data["data"])]) roi_data["std"] = np.std(roi_data["data"][~np.isnan(roi_data["data"])]) roi_data["min"] = np.min(roi_data["data"][~np.isnan(roi_data["data"])]) roi_data["max"] = np.max(roi_data["data"][~np.isnan(roi_data["data"])]) roi_data[f"p{min_percentile}"] = np.percentile(roi_data["data"][~np.isnan(roi_data["data"])], min_percentile) roi_data[f"p{max_percentile}"] = np.percentile(roi_data["data"][~np.isnan(roi_data["data"])], max_percentile) # Set bin width if not provided if bin_width != 0: if hist_range: nb_bins = (round(hist_range[1]) - round(hist_range[0])) // bin_width else: nb_bins = (round(roi_data["max"]) - round(roi_data["min"])) // bin_width else: nb_bins = 10 if hist_range: bin_width = int((round(hist_range[1]) - round(hist_range[0])) // nb_bins) else: bin_width = int((round(roi_data["max"]) - round(roi_data["min"])) // nb_bins) nb_bins = int(nb_bins) # Set histogram range if not provided if not hist_range: hist_range = (roi_data["min"], roi_data["max"]) # re-segment data according to histogram range roi_data["data"] = roi_data["data"][(roi_data["data"] > hist_range[0]) & (roi_data["data"] < hist_range[1])] df_data = pd.DataFrame(roi_data["data"], columns=['data']) del roi_data["data"] # no interest in keeping data (we only need statistics) # Plot histogram ax = df_data.hist(column='data', bins=nb_bins, range=(hist_range[0], hist_range[1]), edgecolor='black') min_quant, max_quant= df_data.quantile(min_percentile), df_data.quantile(max_percentile) for x in ax[0]: x.axvline(min_quant.data, linestyle=':', color='r', label=f"{min_percentile*100}% Percentile: {float(min_quant):.3f}") x.axvline(max_quant.data, linestyle=':', color='g', label=f"{max_percentile*100}% Percentile: {float(max_quant):.3f}") x.grid(False) x.xaxis.set_ticks(np.arange(hist_range[0], hist_range[1], bin_width, dtype=int)) x.set_xticklabels(x.get_xticks(), rotation=45) x.xaxis.set_tick_params(pad=15) plt.title(f"Intensity range checks for {wildcard}, bw={bin_width}") plt.legend() # Save the plot if save: plt.savefig(self.paths._path_save_checks / ('Intensity_range_check_' + f'bw_{bin_width}.png')) else: plt.show() # save final checks if save: wildcard = str(wildcard).replace('*', '').replace('.npy', '.json') save_json(self.paths._path_save_checks / ('roi_data_' + wildcard), roi_data, cls=NumpyEncoder)
[docs] def pre_radiomics_checks(self, path_data: Union[str, Path] = None, wildcards_dimensions: List = [], wildcards_window: List = [], path_csv: Union[str, Path] = None, min_percentile: float = 0.05, max_percentile: float = 0.95, bin_width: int = 0, hist_range: list = [], nifti: bool = False, save: bool = False) -> None: """Finds proper dimension and re-segmentation ranges options for radiomics analyses. The resulting files from this method can then be analyzed and used to set up radiomics parameters options in computation methods. Args: path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the inner-class ``Paths`` in the current instance. wildcards_dimensions(List[str], optional): List of wildcards that determines the scans that will be analyzed. You can learn more about wildcards in `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_. wildcards_window(List[str], optional): List of wildcards that determines the scans that will be analyzed. You can learn more about wildcards in `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_. path_csv(Union[str, Path], optional): Path to a csv file containing a list of the scans that will be analyzed (a CSV file for a single ROI type). min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05. max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95. bin_width(int, optional): Width of the bins for the histograms. If not provided, will use the default number of bins in the method :ref:`pandas.DataFrame.hist <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html>`: 10 bins. hist_range(list, optional): Range of the histograms. If empty, will use the minimum and maximum values. nifti (bool, optional): Set to True if the scans are nifti files. Defaults to False. save (bool, optional): If True, will save the results in a json file. Defaults to False. Returns: None """ # Initialization path_study = Path.cwd() # Load params if not self.paths._path_pre_checks_settings: if not wildcards_dimensions or not wildcards_window: raise ValueError("path to pre-checks settings is None.\ wildcards_dimensions and wildcards_window need to be specified") else: settings = self.paths._path_pre_checks_settings settings = load_json(settings) settings = settings['pre_radiomics_checks'] # Setting up paths if 'path_save_checks' in settings and settings['path_save_checks']: self.paths._path_save_checks = Path(settings['path_save_checks']) if 'path_csv' in settings and settings['path_csv']: self.paths._path_csv = Path(settings['path_csv']) # Wildcards of groups of files to analyze for dimensions in path_data. # See for example: https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html # Keep the cell empty if no dimension checks are to be performed. if not wildcards_dimensions: wildcards_dimensions = [] for i in range(len(settings['wildcards_dimensions'])): wildcards_dimensions.append(settings['wildcards_dimensions'][i]) # ROI intensity window checks params if not wildcards_window: wildcards_window = [] for i in range(len(settings['wildcards_window'])): wildcards_window.append(settings['wildcards_window'][i]) # PRE-RADIOMICS CHECKS if not self.paths._path_save_checks: if (path_study / 'checks').exists(): self.paths._path_save_checks = Path(path_study / 'checks') else: os.mkdir(path_study / 'checks') self.paths._path_save_checks = Path(path_study / 'checks') else: if self.paths._path_save_checks.name != 'checks': if (self.paths._path_save_checks / 'checks').exists(): self.paths._path_save_checks /= 'checks' else: os.mkdir(self.paths._path_save_checks / 'checks') self.paths._path_save_checks = Path(self.paths._path_save_checks / 'checks') # Initializing plotting params plt.rcParams["figure.figsize"] = (20,20) plt.rcParams.update({'font.size': 22}) start = time() print('\n\n************************* PRE-RADIOMICS CHECKS *************************', end='') # 1. PRE-RADIOMICS CHECKS -- DIMENSIONS start1 = time() print('\n--> PRE-RADIOMICS CHECKS -- DIMENSIONS ... ', end='') self.__pre_radiomics_checks_dimensions( path_data, wildcards_dimensions, min_percentile, max_percentile, save) print('DONE', end='') time1 = f"{time() - start1:.2f}" print(f'\nElapsed time: {time1} sec', end='') # 2. PRE-RADIOMICS CHECKS - WINDOW start2 = time() print('\n\n--> PRE-RADIOMICS CHECKS -- WINDOW ... \n', end='') self.__pre_radiomics_checks_window( path_data, wildcards_window, path_csv, min_percentile, max_percentile, bin_width, hist_range, nifti, save) print('DONE', end='') time2 = f"{time() - start2:.2f}" print(f'\nElapsed time: {time2} sec', end='') time_elapsed = f"{time() - start:.2f}" print(f'\n\n--> TOTAL TIME FOR PRE-RADIOMICS CHECKS: {time_elapsed} seconds') print('-------------------------------------------------------------------------------------')
[docs] def perform_mr_imaging_summary(self, wildcards_scans: List[str], path_data: Path = None, path_save_checks: Path = None, min_percentile: float = 0.05, max_percentile: float = 0.95 ) -> None: """ Summarizes MRI imaging acquisition parameters. Plots summary histograms for different dimensions and saves all acquisition parameters locally in JSON files. Args: wildcards_scans (List[str]): List of wildcards that determines the scans that will be analyzed (Only MRI scans will be analyzed). You can learn more about wildcards in `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_. For example: ``[\"STS*.MRscan.npy\"]``. path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the inner-class ``Paths`` in the current instance. path_save_checks (Path, optional): Path where to save the checks, if not specified will use the one in the current instance. min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05. max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95. Returns: None. """ # Initializing data structures class param: dates = [] manufacturer = [] scanning_sequence = [] class years: data = [] class fieldStrength: data = [] class repetitionTime: data = [] class echoTime: data = [] class inversionTime: data = [] class echoTrainLength: data = [] class flipAngle: data = [] class numberAverages: data = [] class xyDim: data = [] class zDim: data = [] if len(wildcards_scans) == 0: print('wildcards_scans is empty') # wildcards checks: no_mr_scan = True for wildcard in wildcards_scans: if 'MRscan' in wildcard: no_mr_scan = False if no_mr_scan: raise ValueError(f"wildcards: {wildcards_scans} does not include MR scans. (Only MR scans are supported)") # Initialization if path_data is None: if self.paths._path_save: path_data = Path(self.paths._path_save) else: print("No path to data was given and path save is None.") return 0 if not path_save_checks: if self.paths._path_save_checks: path_save_checks = Path(self.paths._path_save_checks) else: if (Path(os.getcwd()) / "checks").exists(): path_save_checks = Path(os.getcwd()) / "checks" else: path_save_checks = (Path(os.getcwd()) / "checks").mkdir() # Looping through all the different wildcards for i in tqdm(range(len(wildcards_scans))): wildcard = wildcards_scans[i] file_paths = get_file_paths(path_data, wildcard) n_files = len(file_paths) param.dates = np.zeros(n_files) param.years.data = np.zeros((n_files, 1)) param.years.data = np.multiply(param.years.data, np.NaN) param.manufacturer = [None] * n_files param.scanning_sequence = [None] * n_files param.fieldStrength.data = np.zeros((n_files, 1)) param.fieldStrength.data = np.multiply(param.fieldStrength.data, np.NaN) param.repetitionTime.data = np.zeros((n_files, 1)) param.repetitionTime.data = np.multiply(param.repetitionTime.data, np.NaN) param.echoTime.data = np.zeros((n_files, 1)) param.echoTime.data = np.multiply(param.echoTime.data, np.NaN) param.inversionTime.data = np.zeros((n_files, 1)) param.inversionTime.data = np.multiply(param.inversionTime.data, np.NaN) param.echoTrainLength.data = np.zeros((n_files, 1)) param.echoTrainLength.data = np.multiply(param.echoTrainLength.data, np.NaN) param.flipAngle.data = np.zeros((n_files, 1)) param.flipAngle.data = np.multiply(param.flipAngle.data, np.NaN) param.numberAverages.data = np.zeros((n_files, 1)) param.numberAverages.data = np.multiply(param.numberAverages.data, np.NaN) param.xyDim.data = np.zeros((n_files, 1)) param.xyDim.data = np.multiply(param.xyDim.data, np.NaN) param.zDim.data = np.zeros((n_files, 1)) param.zDim.data = np.multiply(param.zDim.data, np.NaN) # Loading and recording data for f in tqdm(range(n_files)): file = file_paths[f] #Open file for warning try: warn_file = open(path_save_checks / 'imaging_summary_mr_warnings.txt', 'a') except IOError: print("Could not open warning file") # Loading Data try: print(f'\nCurrently working on: {file}', file = warn_file) with open(path_data / file, 'rb') as fe: medscan = pickle.load(fe) # Example of DICOM header info = medscan.dicomH[1] # Recording dates (info.AcquistionDates) try: param.dates[f] = info.AcquisitionDate except AttributeError: param.dates[f] = info.StudyDate # Recording years try: y = str(param.dates[f]) # Only the first four characters represent the years param.years.data[f] = y[0:4] except Exception as e: print(f'Cannot read years of: {file}. Error: {e}', file = warn_file) # Recording manufacturers try: param.manufacturer[f] = info.Manufacturer except Exception as e: print(f'Cannot read manufacturer of: {file}. Error: {e}', file = warn_file) # Recording scanning sequence try: param.scanning_sequence[f] = info.scanning_sequence except Exception as e: print(f'Cannot read scanning sequence of: {file}. Error: {e}', file = warn_file) # Recording field strength try: param.fieldStrength.data[f] = info.MagneticFieldStrength except Exception as e: print(f'Cannot read field strength of: {file}. Error: {e}', file = warn_file) # Recording repetition time try: param.repetitionTime.data[f] = info.RepetitionTime except Exception as e: print(f'Cannot read repetition time of: {file}. Error: {e}', file = warn_file) # Recording echo time try: param.echoTime.data[f] = info.EchoTime except Exception as e: print(f'Cannot read echo time of: {file}. Error: {e}', file = warn_file) # Recording inversion time try: param.inversionTime.data[f] = info.InversionTime except Exception as e: print(f'Cannot read inversion time of: {file}. Error: {e}', file = warn_file) # Recording echo train length try: param.echoTrainLength.data[f] = info.EchoTrainLength except Exception as e: print(f'Cannot read echo train length of: {file}. Error: {e}', file = warn_file) # Recording flip angle try: param.flipAngle.data[f] = info.FlipAngle except Exception as e: print(f'Cannot read flip angle of: {file}. Error: {e}', file = warn_file) # Recording number of averages try: param.numberAverages.data[f] = info.NumberOfAverages except Exception as e: print(f'Cannot read number averages of: {file}. Error: {e}', file = warn_file) # Recording xy spacing try: param.xyDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldX except Exception as e: print(f'Cannot read x spacing of: {file}. Error: {e}', file = warn_file) # Recording z spacing try: param.zDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldZ except Exception as e: print(f'Cannot read z spacing of: {file}', file = warn_file) except Exception as e: print(f'Cannot read file: {file}. Error: {e}', file = warn_file) warn_file.close() # Summarize data # Summarizing years df_years = pd.DataFrame(param.years.data, columns=['years']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing field strength df_fs = pd.DataFrame(param.fieldStrength.data, columns=['fieldStrength']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing repetition time df_rt = pd.DataFrame(param.repetitionTime.data, columns=['repetitionTime']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing echo time df_et = pd.DataFrame(param.echoTime.data, columns=['echoTime']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing inversion time df_it = pd.DataFrame(param.inversionTime.data, columns=['inversionTime']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing echo train length df_etl = pd.DataFrame(param.echoTrainLength.data, columns=['echoTrainLength']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing flip angle df_fa = pd.DataFrame(param.flipAngle.data, columns=['flipAngle']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing number of averages df_na = pd.DataFrame(param.numberAverages.data, columns=['numberAverages']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing xy-spacing df_xy = pd.DataFrame(param.xyDim.data, columns=['xyDim']) # Summarizing z-spacing df_z = pd.DataFrame(param.zDim.data, columns=['zDim']) # Plotting xy-spacing histogram ax = df_xy.hist(column='xyDim') min_quant, max_quant, average = df_xy.quantile(min_percentile), df_xy.quantile(max_percentile), param.xyDim.data.mean() for x in ax[0]: x.axvline(min_quant.xyDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}") x.axvline(max_quant.xyDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}") x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}") x.grid(False) plt.title(f"MR xy-spacing imaging summary for {wildcard}") plt.legend() plt.show() # Plotting z-spacing histogram ax = df_z.hist(column='zDim') min_quant, max_quant, average = df_z.quantile(min_percentile), df_z.quantile(max_percentile), param.zDim.data.mean() for x in ax[0]: x.axvline(min_quant.zDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}") x.axvline(max_quant.zDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}") x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}") x.grid(False) plt.title(f"MR z-spacing imaging summary for {wildcard}") plt.legend() plt.show() # Summarizing xy-spacing df_xy = df_xy.describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing z-spacing df_z = df_z.describe(percentiles=[min_percentile, max_percentile], include='all') # Saving data name_save = wildcard.replace('*', '').replace('.npy', '') save_name = 'imagingSummary__' + name_save + ".json" df_all = [df_years, df_fs, df_rt, df_et, df_it, df_etl, df_fa, df_na, df_xy, df_z] df_all = df_all[0].join(df_all[1:]) df_all.to_json(path_save_checks / save_name, orient='columns', indent=4)
[docs] def perform_ct_imaging_summary(self, wildcards_scans: List[str], path_data: Path = None, path_save_checks: Path = None, min_percentile: float = 0.05, max_percentile: float = 0.95 ) -> None: """ Summarizes CT imaging acquisition parameters. Plots summary histograms for different dimensions and saves all acquisition parameters locally in JSON files. Args: wildcards_scans (List[str]): List of wildcards that determines the scans that will be analyzed (Only MRI scans will be analyzed). You can learn more about wildcards in `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_. For example: ``[\"STS*.CTscan.npy\"]``. path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the inner-class ``Paths`` in the current instance. path_save_checks (Path, optional): Path where to save the checks, if not specified will use the one in the current instance. min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05. max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95. Returns: None. """ class param: manufacturer = [] dates = [] kernel = [] class years: data = [] class voltage: data = [] class exposure: data = [] class xyDim: data = [] class zDim: data = [] if len(wildcards_scans) == 0: print('wildcards_scans is empty') # wildcards checks: no_mr_scan = True for wildcard in wildcards_scans: if 'CTscan' in wildcard: no_mr_scan = False if no_mr_scan: raise ValueError(f"wildcards: {wildcards_scans} does not include CT scans. (Only CT scans are supported)") # Initialization if path_data is None: if self.paths._path_save: path_data = Path(self.paths._path_save) else: print("No path to data was given and path save is None.") return 0 if not path_save_checks: if self.paths._path_save_checks: path_save_checks = Path(self.paths._path_save_checks) else: if (Path(os.getcwd()) / "checks").exists(): path_save_checks = Path(os.getcwd()) / "checks" else: path_save_checks = (Path(os.getcwd()) / "checks").mkdir() # Looping through all the different wildcards for i in tqdm(range(len(wildcards_scans))): wildcard = wildcards_scans[i] file_paths = get_file_paths(path_data, wildcard) n_files = len(file_paths) param.dates = np.zeros(n_files) param.years.data = np.zeros(n_files) param.years.data = np.multiply(param.years.data, np.NaN) param.manufacturer = [None] * n_files param.voltage.data = np.zeros(n_files) param.voltage.data = np.multiply(param.voltage.data, np.NaN) param.exposure.data = np.zeros(n_files) param.exposure.data = np.multiply(param.exposure.data, np.NaN) param.kernel = [None] * n_files param.xyDim.data = np.zeros(n_files) param.xyDim.data = np.multiply(param.xyDim.data, np.NaN) param.zDim.data = np.zeros(n_files) param.zDim.data = np.multiply(param.zDim.data, np.NaN) # Loading and recording data for f in tqdm(range(n_files)): file = file_paths[f] # Open file for warning try: warn_file = open(path_save_checks / 'imaging_summary_ct_warnings.txt', 'a') except IOError: print("Could not open file") # Loading Data try: with open(path_data / file, 'rb') as fe: medscan = pickle.load(fe) print(f'Currently working on: {file}', file=warn_file) # DICOM header info = medscan.dicomH[1] # Recording dates try: param.dates[f] = info.AcquisitionDate except AttributeError: param.dates[f] = info.StudyDate # Recording years try: years = str(param.dates[f]) # Only the first four characters represent the years param.years.data[f] = years[0:4] except Exception as e: print(f'Cannot read dates of : {file}. Error: {e}', file=warn_file) # Recording manufacturers try: param.manufacturer[f] = info.Manufacturer except Exception as e: print(f'Cannot read Manufacturer of: {file}. Error: {e}', file=warn_file) # Recording voltage try: param.voltage.data[f] = info.KVP except Exception as e: print(f'Cannot read voltage of: {file}. Error: {e}', file=warn_file) # Recording exposure try: param.exposure.data[f] = info.Exposure except Exception as e: print(f'Cannot read exposure of: {file}. Error: {e}', file=warn_file) # Recording reconstruction kernel try: param.kernel[f] = info.ConvolutionKernel except Exception as e: print(f'Cannot read Kernel of: {file}. Error: {e}', file=warn_file) # Recording xy spacing try: param.xyDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldX except Exception as e: print(f'Cannot read x spacing of: {file}. Error: {e}', file=warn_file) # Recording z spacing try: param.zDim.data[f] = medscan.data.volume.spatialRef.PixelExtentInWorldZ except Exception as e: print(f'Cannot read z spacing of: {file}. Error: {e}', file=warn_file) except Exception as e: print(f'Cannot load file: {file}', file=warn_file) warn_file.close() # Summarize data # Summarizing years df_years = pd.DataFrame(param.years.data, columns=['years']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing voltage df_voltage = pd.DataFrame(param.voltage.data, columns=['voltage']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing exposure df_exposure = pd.DataFrame(param.exposure.data, columns=['exposure']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing kernel df_kernel = pd.DataFrame(param.kernel, columns=['kernel']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarize xy spacing df_xy = pd.DataFrame(param.xyDim.data, columns=['xyDim']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarize z spacing df_z = pd.DataFrame(param.zDim.data, columns=['zDim']).describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing xy-spacing df_xy = pd.DataFrame(param.xyDim.data, columns=['xyDim']) # Summarizing z-spacing df_z = pd.DataFrame(param.zDim.data, columns=['zDim']) # Plotting xy-spacing histogram ax = df_xy.hist(column='xyDim') min_quant, max_quant, average = df_xy.quantile(min_percentile), df_xy.quantile(max_percentile), param.xyDim.data.mean() for x in ax[0]: x.axvline(min_quant.xyDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}") x.axvline(max_quant.xyDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}") x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}") x.grid(False) plt.title(f"CT xy-spacing imaging summary for {wildcard}") plt.legend() plt.show() # Plotting z-spacing histogram ax = df_z.hist(column='zDim') min_quant, max_quant, average = df_z.quantile(min_percentile), df_z.quantile(max_percentile), param.zDim.data.mean() for x in ax[0]: x.axvline(min_quant.zDim, linestyle=':', color='r', label=f"Min Percentile: {float(min_quant):.3f}") x.axvline(max_quant.zDim, linestyle=':', color='g', label=f"Max Percentile: {float(max_quant):.3f}") x.axvline(average, linestyle='solid', color='gold', label=f"Average: {float(average):.3f}") x.grid(False) plt.title(f"CT z-spacing imaging summary for {wildcard}") plt.legend() plt.show() # Summarizing xy-spacing df_xy = df_xy.describe(percentiles=[min_percentile, max_percentile], include='all') # Summarizing z-spacing df_z = df_z.describe(percentiles=[min_percentile, max_percentile], include='all') # Saving data name_save = wildcard.replace('*', '').replace('.npy', '') save_name = 'imagingSummary__' + name_save + ".json" df_all = [df_years, df_voltage, df_exposure, df_kernel, df_xy, df_z] df_all = df_all[0].join(df_all[1:]) df_all.to_json(path_save_checks / save_name, orient='columns', indent=4)
[docs] def perform_imaging_summary(self, wildcards_scans: List[str], path_data: Path = None, path_save_checks: Path = None, min_percentile: float = 0.05, max_percentile: float = 0.95 ) -> None: """ Summarizes CT and MR imaging acquisition parameters. Plots summary histograms for different dimensions and saves all acquisition parameters locally in JSON files. Args: wildcards_scans (List[str]): List of wildcards that determines the scans that will be analyzed (CT and MRI scans will be analyzed). You can learn more about wildcards in `this link <https://www.linuxtechtips.com/2013/11/how-wildcards-work-in-linux-and-unix.html>`_. For example: ``[\"STS*.CTscan.npy\", \"STS*.MRscan.npy\"]``. path_data (Path, optional): Path to the MEDscan objects, if not specified will use ``path_save`` from the inner-class ``Paths`` in the current instance. path_save_checks (Path, optional): Path where to save the checks, if not specified will use the one in the current instance. min_percentile (float, optional): Minimum percentile to use for the histograms. Defaults to 0.05. max_percentile (float, optional): Maximum percentile to use for the histograms. Defaults to 0.95. Returns: None. """ # MR imaging summary wildcards_scans_mr = [wildcard for wildcard in wildcards_scans if 'MRscan' in wildcard] if len(wildcards_scans_mr) == 0: print("Cannot perform imaging summary for MR, no MR scan wildcard was given! ") else: self.perform_mr_imaging_summary( wildcards_scans_mr, path_data, path_save_checks, min_percentile, max_percentile) # CT imaging summary wildcards_scans_ct = [wildcard for wildcard in wildcards_scans if 'CTscan' in wildcard] if len(wildcards_scans_ct) == 0: print("Cannot perform imaging summary for CT, no CT scan wildcard was given! ") else: self.perform_ct_imaging_summary( wildcards_scans_ct, path_data, path_save_checks, min_percentile, max_percentile)