Source code for MEDiml.learning.Normalization

import numpy as np
import pandas as pd
from neuroCombat import neuroCombat
from sklearn.base import BaseEstimator, TransformerMixin

from ..utils.get_institutions_from_ids import get_institutions_from_ids


class CombatNormalization(BaseEstimator, TransformerMixin):
    """
    Sklearn-compatible Transformer for ComBat Normalization.

    This transformer assumes the input X (DataFrame) contains both the
    features to be normalized and, optionally, the column identifying the
    institution/batch. When that column is absent, the institutions are
    derived from the DataFrame index via ``get_institutions_from_ids``.
    """

    def __init__(
        self,
        institution_col: str = None,
        covariates: list = None,
        drop_institution: bool = True
    ):
        """
        Args:
            institution_col (str): Name of the column in X containing the
                institution/batch IDs. If None (or not present in X), the
                institutions are derived from the index of X.
            covariates (list): List of column names in X to treat as
                covariates (biological retention). None means no covariates.
            drop_institution (bool): If True, removes the institution column
                from the output.
        """
        self.institution_col = institution_col
        self.covariates = covariates if covariates is not None else []
        self.drop_institution = drop_institution

    def fit(self, X, y=None):
        """
        No-op fit.

        ComBat parameters are estimated on the data handed to ``transform``,
        so nothing is learned here.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            CombatNormalization: ``self``.
        """
        return self

    def transform(self, X):
        """
        Applies ComBat Normalization.

        The institution column and the covariate columns are never part of
        the matrix that gets harmonized; they are passed to ComBat as batch /
        covariate information and re-attached unchanged afterwards.

        Args:
            X (pd.DataFrame): Samples as rows. May contain the institution
                column and the covariate columns next to the features.

        Returns:
            pd.DataFrame: Harmonized features (original order), the
            institution column at its original position unless
            ``drop_institution`` is True, followed by the covariates found
            in X. If fewer than two institutions are present, or ComBat
            raises, a copy of X is returned un-harmonized (with the
            institution column removed when ``drop_institution`` is True).

        Raises:
            ValueError: If X is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input X must be a pandas DataFrame.")

        # Tolerate covariates=None (e.g. after set_params) as "no covariates".
        covariate_names = list(self.covariates) if self.covariates else []

        # Work on a copy so the caller's frame is never mutated.
        features = X.copy()

        # 1. Identify institutions
        inst_col = self.institution_col
        has_inst_col = bool(inst_col) and inst_col in features.columns
        if has_inst_col:
            raw_institutions = features[inst_col]
            # The batch label is not a feature: always keep it out of the
            # matrix handed to ComBat, whether or not it is dropped from the
            # output.
            features = features.drop(columns=[inst_col])
        else:
            # Fall back to deriving the institutions from the sample IDs.
            raw_institutions = get_institutions_from_ids(pd.Series(features.index))

        # Encode institutions as integers (1, 2, 3, ...).
        institutions = self._process_institutions(raw_institutions)

        # ComBat needs at least two batches; otherwise pass the data through.
        if len(np.unique(institutions)) < 2:
            print("Warning: Less than 2 institutions detected. Skipping ComBat.")
            return self._passthrough(X)

        # 2. Prepare covariates: move the ones present in X out of the
        # feature matrix and into the covariate frame.
        covars_df = pd.DataFrame(
            {'institution': institutions.flatten()}, index=features.index
        )
        found_covariates = []
        for cov in covariate_names:
            if cov in features.columns:
                covars_df[cov] = features[cov]
                features = features.drop(columns=[cov])
                found_covariates.append(cov)
        # NOTE(review): neuroCombat appears to model only the covariates that
        # are named in its `categorical_cols` / `continuous_cols` arguments.
        # None are passed below, so the covariates may not influence the fit.
        # Confirm against the neuroCombat documentation.

        # 3. Single-feature edge case: pad with a constant column so the
        # matrix handed to ComBat has more than one feature.
        padded = features.shape[1] == 1
        if padded:
            features['temp_ones'] = 1

        # 4. Run neuroCombat, which takes features as rows and samples as
        # columns, hence the transposes.
        try:
            results = neuroCombat(
                dat=features.T,
                covars=covars_df,
                batch_col='institution'
            )
            harmonized_data = pd.DataFrame(
                results['data'].T,
                index=features.index,
                columns=features.columns
            )
        except Exception as e:
            # Deliberate best-effort: never fail the pipeline on ComBat errors.
            print(f"ComBat failed: {e}. Returning original data.")
            return self._passthrough(X)

        # 5. Reassemble the output frame.
        if padded:
            harmonized_data = harmonized_data.drop(columns=['temp_ones'])

        if has_inst_col and not self.drop_institution:
            # Put the untouched institution column back where it was,
            # relative to the remaining feature columns.
            position = [
                col for col in X.columns if col not in found_covariates
            ].index(inst_col)
            harmonized_data.insert(position, inst_col, raw_institutions)

        # Only covariates that were actually taken from X can be restored.
        for cov in found_covariates:
            harmonized_data[cov] = covars_df[cov]

        return harmonized_data

    def _passthrough(self, X):
        """Return an un-harmonized copy of X, honoring ``drop_institution``."""
        output = X.copy()
        inst_col = self.institution_col
        if self.drop_institution and inst_col and inst_col in output.columns:
            output = output.drop(columns=[inst_col])
        return output

    def _process_institutions(self, institutions):
        """
        Helper to map institution labels to integers.

        Labels are numbered 1, 2, 3, ... in order of first appearance.

        Args:
            institutions: Sequence of institution labels, one per sample.

        Returns:
            np.ndarray: Column vector of shape (n_samples, 1) with the codes.
        """
        institutions = pd.Series(institutions)
        unique_inst = institutions.unique()
        mapping = {inst: i + 1 for i, inst in enumerate(unique_inst)}
        return institutions.map(mapping).values.reshape(-1, 1)