import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import (train_test_split)
from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier

from sklearn.multioutput import MultiOutputClassifier

import xgboost as xgb

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

df = pd.read_csv("../pool-datasets/clf/oncotypedx_subset_transpose.csv", index_col=0)

df.shape

(7244, 25)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7244 entries, MBC-MBCProject_wAiri7fp-Tumor-SM-AZ5DH to F3265repl
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BIRC5   7244 non-null   float64
 1   CCNB1   7244 non-null   float64
 2   MYBL2   7244 non-null   float64
 3   MMP11   7244 non-null   float64
 4   GRB7    7244 non-null   float64
 5   PGR     7244 non-null   float64
 6   BCL2    7244 non-null   float64
 7   SCUBE2  7244 non-null   float64
 8   GSTM1   7244 non-null   float64
 9   BAG1    7244 non-null   float64
 10  CD68    7244 non-null   float64
 11  ACTB    7244 non-null   float64
 12  GAPDH   7244 non-null   float64
 13  RPLP0   7244 non-null   float64
 14  TFRC    7244 non-null   float64
 15  AURKA   7244 non-null   float64
 16  CTSV    7244 non-null   float64
 17  MKI67   7244 non-null   float64
 18  ERBB2   7244 non-null   float64
 19  GUSB    7244 non-null   float64
 20  ESR1    7244 non-null   float64
 21  IQGAP1  7244 non-null   float64
 22  IQGAP2  7244 non-null   float64
 23  FRG1    7244 non-null   float64
 24  EEF1A2  6144 non-null   float64
dtypes: float64(25)
memory usage: 1.4+ MB

df.describe(include="all")

df.fillna(df.mean(), inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7244 entries, MBC-MBCProject_wAiri7fp-Tumor-SM-AZ5DH to F3265repl
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BIRC5   7244 non-null   float64
 1   CCNB1   7244 non-null   float64
 2   MYBL2   7244 non-null   float64
 3   MMP11   7244 non-null   float64
 4   GRB7    7244 non-null   float64
 5   PGR     7244 non-null   float64
 6   BCL2    7244 non-null   float64
 7   SCUBE2  7244 non-null   float64
 8   GSTM1   7244 non-null   float64
 9   BAG1    7244 non-null   float64
 10  CD68    7244 non-null   float64
 11  ACTB    7244 non-null   float64
 12  GAPDH   7244 non-null   float64
 13  RPLP0   7244 non-null   float64
 14  TFRC    7244 non-null   float64
 15  AURKA   7244 non-null   float64
 16  CTSV    7244 non-null   float64
 17  MKI67   7244 non-null   float64
 18  ERBB2   7244 non-null   float64
 19  GUSB    7244 non-null   float64
 20  ESR1    7244 non-null   float64
 21  IQGAP1  7244 non-null   float64
 22  IQGAP2  7244 non-null   float64
 23  FRG1    7244 non-null   float64
 24  EEF1A2  7244 non-null   float64
dtypes: float64(25)
memory usage: 1.4+ MB

train, test = train_test_split(df,test_size=0.20,random_state=SEED)
y_train, X_train = train.drop(['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2'],axis=1).copy(), train[['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2']].copy()
y_test, X_test = test.drop(['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2'],axis=1).copy(), test[['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2']].copy()
del(train)
del(test)

len(X_train), len(X_test), len(y_train), len(y_test)
print(f'Train Samples: {len(X_train), len(y_train)} and Test Samples: {len(X_test), len(y_test)}')

Train Samples: (5795, 5795) and Test Samples: (1449, 1449)

X_train.plot(kind="scatter", x="IQGAP1",y="FRG1", grid=True)
plt.show() #BEFORE removing 0 values

X_train.plot(kind="scatter", x="EEF1A2",y="IQGAP2", grid=True)

<Axes: xlabel='EEF1A2', ylabel='IQGAP2'>

X_train.plot(kind="scatter", x="EEF1A2",y="IQGAP1", grid=True)

<Axes: xlabel='EEF1A2', ylabel='IQGAP1'>

X_train.plot(kind="scatter", x="FRG1",y="IQGAP2", grid=True)

<Axes: xlabel='FRG1', ylabel='IQGAP2'>

import matplotlib.pyplot as plt

X_train.hist(bins=50, figsize=(12,8))
plt.show() #clearly victim age has some 0 values so we will remove that

y_train.hist(bins=50, figsize=(12,8))
plt.show()

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Simple preprocessor placeholder used by some pipelines below
preprocessor = 'passthrough'

# Define a function to create a pipeline with a given model
def custom_f1score(y_val, y_pred, model_name, f1_score): # take (samples, outputs)
  name_col = None
  n_col = []
  row_tup = tuple()
  row_tup = row_tup + (model_name,)  # Add model name to the tuple
  for idx, col in enumerate(y_val.columns):
      macro_acc = f1_score(y_val[col].values, y_pred[:, idx], average='macro')
      row_tup = row_tup + (round(macro_acc, 4),)
      n_col.append(col+'macro_score')
      micro_acc = f1_score(y_val[col].values, y_pred[:, idx], average='micro')
      row_tup = row_tup + (round(micro_acc, 4),)
      n_col.append(col+'micro_score')
  name_col = n_col
  return row_tup, name_col

from sklearn.utils import all_estimators
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification


# Get all sklearn classifiers
all_classifiers = all_estimators(type_filter="classifier")

results = []
name_col = None
for name, ClfClass in all_classifiers:
    try:
        clf = MultiOutputClassifier(ClfClass())
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        row_tup, name_col = custom_f1score(y_val, y_pred, name, f1_score)
        # Append the results
        results.append(row_tup)
        del row_tup  # Delete to save the memory!
    except Exception:
        pass  # Ignore models that fail

# Sort and show
results_df = pd.DataFrame(results, columns=["Model"] + name_col)
results_df = results_df.sort_values(by="ESR1micro_score", ascending=False)

from IPython.display import display
display(results_df.style.set_table_attributes("style='display:inline'").set_table_styles(
  [{'selector':'table', 'props': [('overflow', 'scroll'), ('display', 'block')]}]
))

results_df.shape

(32, 43)

results_df['x'] = np.linspace(0, 1, 32)

results_df.head()

# results_df.drop(columns=['x'], inplace=True)

results_df.to_csv('../results/model/multiclassclf_results.csv', index=False)

import plotly.io as pio

pio.renderers.default = "vscode"

import pandas as pd
import plotly.express as px

# Load results
file_path = "../results/model/multiclassclf_results.csv"   # update path if needed
df = pd.read_csv(file_path)

# Reshape into long format
df_long = df.melt(id_vars=["Model"], var_name="Gene_Metric", value_name="Score")

# Split "Gene_Metric" into "Gene" and "Metric"
df_long[["Gene", "Metric"]] = df_long["Gene_Metric"].str.extract(r"(.+?)(macro_score|micro_score)")

# Interactive line chart (first 6 genes to keep it readable)
genes_to_plot = df_long["Gene"].unique()
subset = df_long[df_long["Gene"].isin(genes_to_plot)]

fig = px.line(
    subset,
    x="Model",
    y="Score",
    color="Gene",
    line_dash="Metric",   # dashed line for macro/micro
    markers=True,
    title="Gene-wise Macro and Micro Scores Across Models (first 6 genes)",
    hover_data={"Score": ":.4f", "Model": True, "Gene": True, "Metric": True},
)

fig.update_layout(
    xaxis_tickangle=45,
    legend_title_text="Gene - Metric",
    width=1000,
    height=600
)

fig.show()

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

baseline_pipeline = Pipeline(steps=[
    ('classifier', MultiOutputClassifier(LogisticRegression(max_iter=5000)))
])

baseline_pipeline.fit(X_train, y_train)

y_pred = baseline_pipeline.predict(X_val)
from pprint import pprint
# Calculate the f1_score. Micro and Macro
row_tup, name_col = custom_f1score(y_val, y_pred, "Baseline Logistic Regression", f1_score)
print(name_col)
print(row_tup)

['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Logistic Regression', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7546, 0.7558, 0.6496, 0.7558, 0.7645, 0.7662, 0.6713, 0.679, 0.4225, 0.6695, 0.8384, 0.8387, 0.444, 0.7748, 0.4821, 0.8999, 0.4989, 0.9957, 0.5599, 0.8576, 0.7917, 0.8059, 0.4985, 0.994, 0.4679, 0.5815, 0.604, 0.8041, 0.7427, 0.7731, 0.7303, 0.7817, 0.6865, 0.8154)

# Logistic Regression with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold

logistic_pipeline = Pipeline(steps=[
  ('classifier', MultiOutputClassifier(LogisticRegression(max_iter=5000))) # Increased max_iter
])

# Note: Parameters of the underlying estimator inside MultiOutputClassifier
# are accessed via 'classifier__estimator__<param>'
param_grid_logistic = {
    'classifier__estimator__C': np.logspace(-4, 4, 20),
    'classifier__estimator__solver': ['liblinear', 'lbfgs']
}

# Use KFold (not StratifiedKFold) for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

random_search_logistic = RandomizedSearchCV(
    logistic_pipeline,
    param_distributions=param_grid_logistic,
    n_iter=10,
    cv=cv_reduced,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search_logistic.fit(X_train, y_train)

y_val_pred = random_search_logistic.predict(X_val)

row_tup, name_col = custom_f1score(y_val, y_val_pred, "Logistic Regression", f1_score)
print(name_col)
print(row_tup)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Logistic Regression', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7555, 0.7567, 0.6499, 0.7558, 0.7645, 0.7662, 0.6713, 0.679, 0.4225, 0.6695, 0.8384, 0.8387, 0.444, 0.7748, 0.4818, 0.8991, 0.4989, 0.9957, 0.5585, 0.8568, 0.7925, 0.8067, 0.4985, 0.994, 0.4711, 0.5833, 0.6143, 0.805, 0.7419, 0.7722, 0.7295, 0.7808, 0.6886, 0.8162)

# Baseline model: RandomForest with basic settings
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

random_forest_pipeline.fit(X_train, y_train)

y_val_pred = random_forest_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)

['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5965, 0.9862, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7922, 0.7929, 0.8522, 0.8723, 0.8131, 0.8145, 0.813, 0.7817, 0.5961, 0.6997, 0.8521, 0.8525, 0.6331, 0.7929, 0.708, 0.9241, 0.4989, 0.9957, 0.7811, 0.9292, 0.9019, 0.8783, 0.4985, 0.994, 0.563, 0.585, 0.7228, 0.8352, 0.7193, 0.7532, 0.7495, 0.7877, 0.7418, 0.8395)

# Random Forest with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid_random_forest = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use KFold for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

random_search_random_forest = RandomizedSearchCV(random_forest_pipeline, param_distributions=param_grid_random_forest, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_random_forest.fit(X_train, y_train)

y_val_pred = random_search_random_forest.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.782, 0.7826, 0.8251, 0.8499, 0.7996, 0.8016, 0.8045, 0.7765, 0.5181, 0.6782, 0.847, 0.8473, 0.5719, 0.7826, 0.6658, 0.9146, 0.4989, 0.9957, 0.7658, 0.9301, 0.8959, 0.8714, 0.4985, 0.994, 0.5951, 0.6135, 0.6957, 0.8231, 0.7394, 0.7688, 0.7511, 0.7903, 0.7034, 0.824)

# Baseline model: XGBClassifier wrapped for multi-output
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
])

xgboost_pipeline.fit(X_train, y_train)

y_val_pred = xgboost_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[33], line 7
      1 # Baseline model: XGBClassifier wrapped for multi-output
      2 xgboost_pipeline = Pipeline(steps=[
      3     ('preprocessor', preprocessor),
      4     ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
      5 ])
----> 7 xgboost_pipeline.fit(X_train, y_train)
      9 y_val_pred = xgboost_pipeline.predict(X_val)
     10 row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1358     estimator._validate_params()
   1360 with config_context(
   1361     skip_parameter_validation=(
   1362         prefer_skip_nested_validation or global_skip_validation
   1363     )
   1364 ):
-> 1365     return fit_method(estimator, *args, **kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/pipeline.py:663, in Pipeline.fit(self, X, y, **params)
    657     if self._final_estimator != "passthrough":
    658         last_step_params = self._get_metadata_for_step(
    659             step_idx=len(self) - 1,
    660             step_params=routed_params[self.steps[-1][0]],
    661             all_params=params,
    662         )
--> 663         self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    665 return self

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:547, in MultiOutputClassifier.fit(self, X, Y, sample_weight, **fit_params)
    521 def fit(self, X, Y, sample_weight=None, **fit_params):
    522     """Fit the model to data matrix X and targets Y.
    523 
    524     Parameters
   (...)    545         Returns a fitted instance.
    546     """
--> 547     super().fit(X, Y, sample_weight=sample_weight, **fit_params)
    548     self.classes_ = [estimator.classes_ for estimator in self.estimators_]
    549     return self

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1358     estimator._validate_params()
   1360 with config_context(
   1361     skip_parameter_validation=(
   1362         prefer_skip_nested_validation or global_skip_validation
   1363     )
   1364 ):
-> 1365     return fit_method(estimator, *args, **kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:278, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)
    275     if sample_weight is not None:
    276         routed_params.estimator.fit["sample_weight"] = sample_weight
--> 278 self.estimators_ = Parallel(n_jobs=self.n_jobs)(
    279     delayed(_fit_estimator)(
    280         self.estimator, X, y[:, i], **routed_params.estimator.fit
    281     )
    282     for i in range(y.shape[1])
    283 )
    285 if hasattr(self.estimators_[0], "n_features_in_"):
    286     self.n_features_in_ = self.estimators_[0].n_features_in_

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/utils/parallel.py:82, in Parallel.__call__(self, iterable)
     73 warning_filters = warnings.filters
     74 iterable_with_config_and_warning_filters = (
     75     (
     76         _with_config_and_warning_filters(delayed_func, config, warning_filters),
   (...)     80     for delayed_func, args, kwargs in iterable
     81 )
---> 82 return super().__call__(iterable_with_config_and_warning_filters)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/joblib/parallel.py:1986, in Parallel.__call__(self, iterable)
   1984     output = self._get_sequential_output(iterable)
   1985     next(output)
-> 1986     return output if self.return_generator else list(output)
   1988 # Let's create an ID that uniquely identifies the current call. If the
   1989 # call is interrupted early and that the same instance is immediately
   1990 # reused, this id will be used to prevent workers that were
   1991 # concurrently finalizing a task from the previous call to run the
   1992 # callback.
   1993 with self._lock:

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/joblib/parallel.py:1914, in Parallel._get_sequential_output(self, iterable)
   1912 self.n_dispatched_batches += 1
   1913 self.n_dispatched_tasks += 1
-> 1914 res = func(*args, **kwargs)
   1915 self.n_completed_tasks += 1
   1916 self.print_progress()

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/utils/parallel.py:147, in _FuncWrapper.__call__(self, *args, **kwargs)
    145 with config_context(**config), warnings.catch_warnings():
    146     warnings.filters = warning_filters
--> 147     return self.function(*args, **kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:67, in _fit_estimator(estimator, X, y, sample_weight, **fit_params)
     65     estimator.fit(X, y, sample_weight=sample_weight, **fit_params)
     66 else:
---> 67     estimator.fit(X, y, **fit_params)
     68 return estimator

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/xgboost/core.py:705, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    703 for k, arg in zip(sig.parameters, args):
    704     kwargs[k] = arg
--> 705 return func(**kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/xgboost/sklearn.py:1640, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1635     expected_classes = self.classes_
   1636 if (
   1637     classes.shape != expected_classes.shape
   1638     or not (classes == expected_classes).all()
   1639 ):
-> 1640     raise ValueError(
   1641         f"Invalid classes inferred from unique values of `y`.  "
   1642         f"Expected: {expected_classes}, got {classes}"
   1643     )
   1645 params = self.get_xgb_params()
   1647 if callable(self.objective):

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [-1.  1.]

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -------------------------
# 1. Generate toy dataset
# -------------------------
X = np.random.rand(20, 5)  # 20 samples, 5 features
y = np.random.choice([-1, 1], size=20)  # labels in {-1, 1}

# -------------------------
# 2. Map labels -1 → 0, 1 → 1
# -------------------------
y_mapped = (y == 1).astype(int)  # {-1,1} → {0,1}

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, random_state=42)

# -------------------------
# 3. Train XGBoost model
# -------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "binary:logistic",  # binary classification
    "eval_metric": "logloss"
}
model = xgb.train(params, dtrain, num_boost_round=20)

# -------------------------
# 4. Make predictions
# -------------------------
y_pred_prob = model.predict(dtest)             # probabilities for class 1
y_pred = (y_pred_prob > 0.5).astype(int)       # threshold at 0.5
y_pred_original = np.where(y_pred == 1, 1, -1) # map back {0,1} → {-1,1}

# -------------------------
# 5. Evaluate
# -------------------------
y_test_original = np.where(y_test == 1, 1, -1)
acc = accuracy_score(y_test_original, y_pred_original)

print("True labels:     ", y_test_original)
print("Predicted labels:", y_pred_original)
print(f"Accuracy: {acc:.4f}")

True labels:      [ 1 -1 -1  1  1 -1]
Predicted labels: [ 1  1 -1  1 -1  1]
Accuracy: 0.5000

param_grid_xgboost = {
    'classifier__estimator__n_estimators': [100, 200, 300],
    'classifier__estimator__max_depth': [3, 6, 9, 12],
    'classifier__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__estimator__subsample': [0.6, 0.8, 1.0],
    'classifier__estimator__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__estimator__gamma': [0, 0.1, 0.2, 0.3],
    'classifier__estimator__min_child_weight': [1, 3, 5]
}

# Use KFold for multi-output targets
from sklearn.model_selection import KFold
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

# Rebuild the pipeline to ensure wrapped estimator for search
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
])

random_search_xgboost = RandomizedSearchCV(xgboost_pipeline, param_distributions=param_grid_xgboost, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_xgboost.fit(X_train, y_train)

y_val_pred = random_search_xgboost.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
print(f"Best Parameters: {random_search_xgboost.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7728, 0.7731, 0.8088, 0.8378, 0.7944, 0.7955, 0.7875, 0.755, 0.4492, 0.6756, 0.846, 0.8464, 0.576, 0.7852, 0.659, 0.9129, 0.6418, 0.9957, 0.7343, 0.9189, 0.8965, 0.8714, 0.4985, 0.994, 0.619, 0.635, 0.6903, 0.8214, 0.74, 0.7705, 0.7584, 0.7955, 0.7144, 0.8283)
Best Parameters: {'classifier__estimator__subsample': 0.8, 'classifier__estimator__n_estimators': 200, 'classifier__estimator__min_child_weight': 5, 'classifier__estimator__max_depth': 3, 'classifier__estimator__learning_rate': 0.05, 'classifier__estimator__gamma': 0, 'classifier__estimator__colsample_bytree': 0.6}

#  Predict on the validation set using the best model
y_val_pred = random_search_xgboost.best_estimator_.predict(X_val)

# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred)

conf_matrices = []
if y_true_np.ndim == 2:
    for j in range(y_true_np.shape[1]):
        conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
    conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))

# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for output {idx}:")
    print(cm)

Confusion Matrix for output 0:
[[   0   18]
 [   0 1141]]
Confusion Matrix for output 1:
[[   0    1]
 [   0 1158]]
Confusion Matrix for output 2:
[[   0   48]
 [   0 1111]]
Confusion Matrix for output 3:
[[   0    4]
 [   0 1155]]
Confusion Matrix for output 4:
[[470 113]
 [150 426]]
Confusion Matrix for output 5:
[[679   0  56]
 [  0 178   0]
 [132   0 114]]
Confusion Matrix for output 6:
[[503  89]
 [148 419]]
Confusion Matrix for output 7:
[[200   0 192]
 [  0 178   0]
 [ 92   0 497]]
Confusion Matrix for output 8:
[[763  16]
 [360  20]]
Confusion Matrix for output 9:
[[462  45]
 [133 519]]
Confusion Matrix for output 10:
[[ 48 215]
 [ 34 862]]
Confusion Matrix for output 11:
[[  29   86]
 [  15 1029]]
Confusion Matrix for output 12:
[[   1    4]
 [   1 1153]]
Confusion Matrix for output 13:
[[ 16   0  80]
 [  0 178   0]
 [ 14   0 871]]
Confusion Matrix for output 14:
[[488   0  33]
 [  0 178   0]
 [116   0 344]]
Confusion Matrix for output 15:
[[   0    7]
 [   0 1152]]
Confusion Matrix for output 16:
[[288  32  76]
 [ 28 124  77]
 [126  84 324]]
Confusion Matrix for output 17:
[[ 99 143]
 [ 64 853]]
Confusion Matrix for output 18:
[[248 135]
 [131 645]]
Confusion Matrix for output 19:
[[234 134]
 [103 688]]
Confusion Matrix for output 20:
[[114 163]
 [ 36 846]]

# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
    ax = plt.subplot(rows, cols, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()

# Baseline model: DecisionTree
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=1))
])

decision_tree_pipeline.fit(X_train, y_train)

y_val_pred = decision_tree_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)

['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5532, 0.9638, 0.4985, 0.994, 0.518, 0.9189, 0.4983, 0.9931, 0.6908, 0.6911, 0.8017, 0.811, 0.7477, 0.7481, 0.7831, 0.7386, 0.5768, 0.6221, 0.788, 0.7903, 0.6062, 0.7248, 0.6614, 0.887, 0.6418, 0.9957, 0.7549, 0.8913, 0.8539, 0.8162, 0.4978, 0.9914, 0.5006, 0.5229, 0.6698, 0.7834, 0.7101, 0.7489, 0.7003, 0.7455, 0.6703, 0.7593)

# Decision Tree with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

param_grid_decision_tree = {
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use KFold for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

random_search_decision_tree = RandomizedSearchCV(decision_tree_pipeline, param_distributions=param_grid_decision_tree, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_decision_tree.fit(X_train, y_train)

y_val_pred = random_search_decision_tree.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5435, 0.9836, 0.4998, 0.9991, 0.5321, 0.9482, 0.4991, 0.9965, 0.6981, 0.6989, 0.7674, 0.8041, 0.7617, 0.7636, 0.7766, 0.7291, 0.5408, 0.6393, 0.7955, 0.7964, 0.5817, 0.7092, 0.6732, 0.9016, 0.6096, 0.994, 0.7485, 0.9008, 0.8729, 0.843, 0.4985, 0.994, 0.5602, 0.5764, 0.692, 0.8024, 0.7099, 0.7455, 0.7349, 0.7722, 0.6885, 0.7748)

#  Predict on the validation set using the best model
y_val_pred = random_search_decision_tree.best_estimator_.predict(X_val)

# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred)

conf_matrices = []
if y_true_np.ndim == 2:
    for j in range(y_true_np.shape[1]):
        conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
    conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))

# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for output {idx}:")
    print(cm)

Confusion Matrix for output 0:
[[   1   17]
 [   2 1139]]
Confusion Matrix for output 1:
[[   0    1]
 [   0 1158]]
Confusion Matrix for output 2:
[[   3   45]
 [  15 1096]]
Confusion Matrix for output 3:
[[   0    4]
 [   0 1155]]
Confusion Matrix for output 4:
[[435 148]
 [201 375]]
Confusion Matrix for output 5:
[[660   1  74]
 [  0 178   0]
 [151   1  94]]
Confusion Matrix for output 6:
[[494  98]
 [176 391]]
Confusion Matrix for output 7:
[[237   1 154]
 [  0 178   0]
 [158   1 430]]
Confusion Matrix for output 8:
[[639 140]
 [278 102]]
Confusion Matrix for output 9:
[[424  83]
 [153 499]]
Confusion Matrix for output 10:
[[ 91 172]
 [165 731]]
Confusion Matrix for output 11:
[[  38   77]
 [  37 1007]]
Confusion Matrix for output 12:
[[   1    4]
 [   3 1151]]
Confusion Matrix for output 13:
[[ 26   0  70]
 [  0 178   0]
 [ 43   2 840]]
Confusion Matrix for output 14:
[[463   1  57]
 [  0 178   0]
 [123   1 336]]
Confusion Matrix for output 15:
[[   0    7]
 [   0 1152]]
Confusion Matrix for output 16:
[[253  38 105]
 [ 39 115  75]
 [113 121 300]]
Confusion Matrix for output 17:
[[118 124]
 [105 812]]
Confusion Matrix for output 18:
[[229 154]
 [141 635]]
Confusion Matrix for output 19:
[[230 138]
 [126 665]]
Confusion Matrix for output 20:
[[144 133]
 [128 754]]

# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
    ax = plt.subplot(rows, cols, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()

# Baseline model: GradientBoosting (single-output estimator applied to multi-output via independent fits is not supported directly)
# Keep pipeline for reference; compute accuracy if it runs, otherwise suggest using MultiOutputClassifier wrapping another estimator.
gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=1))
])

# Attempt fit/predict per output if y is multi-output
import numpy as np

y_train_np = np.asarray(y_train)
y_val_np = np.asarray(y_val)

if y_train_np.ndim == 2:
    # Fit one-vs-output manually
    preds = []
    for j in range(y_train_np.shape[1]):
        gb = Pipeline(steps=[('preprocessor', preprocessor), ("classifier", GradientBoostingClassifier(random_state=1))])
        gb.fit(X_train, y_train_np[:, j])
        preds.append(gb.predict(X_val))
    y_val_pred_np = np.column_stack(preds)
else:
    gradient_boosting_pipeline.fit(X_train, y_train)
    y_val_pred_np = gradient_boosting_pipeline.predict(X_val)

row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)

['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5435, 0.9836, 0.4998, 0.9991, 0.5321, 0.9482, 0.4991, 0.9965, 0.6981, 0.6989, 0.7674, 0.8041, 0.7617, 0.7636, 0.7766, 0.7291, 0.5408, 0.6393, 0.7955, 0.7964, 0.5817, 0.7092, 0.6732, 0.9016, 0.6096, 0.994, 0.7485, 0.9008, 0.8729, 0.843, 0.4985, 0.994, 0.5602, 0.5764, 0.692, 0.8024, 0.7099, 0.7455, 0.7349, 0.7722, 0.6885, 0.7748)

gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

param_grid_gradient_boosting = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__max_features': ['auto', 'sqrt', 'log2']
}

# Manual per-output tuning since GradientBoostingClassifier is single-output
from sklearn.model_selection import KFold
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

import numpy as np

y_train_np = np.asarray(y_train)
y_val_np = np.asarray(y_val)

best_models = []
for j in range(y_train_np.shape[1] if y_train_np.ndim == 2 else 1):
    rs = RandomizedSearchCV(gradient_boosting_pipeline, param_distributions=param_grid_gradient_boosting, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
    y_target = y_train_np[:, j] if y_train_np.ndim == 2 else y_train_np
    rs.fit(X_train, y_target)
    best_models.append(rs.best_estimator_)

# Predict using per-output best models
if y_val_np.ndim == 2:
    preds = [model.predict(X_val) for model in best_models]
    y_val_pred_np = np.column_stack(preds)
else:
    y_val_pred_np = best_models[0].predict(X_val)

# Compute micro/macro
row_tup, name_col = custom_f1score(y_val, y_val_pred_np, "Baseline Gradient Boosting", f1_score)
print(name_col)
print(row_tup)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Gradient Boosting', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7752, 0.7757, 0.8274, 0.8473, 0.7975, 0.799, 0.7879, 0.7575, 0.4129, 0.6747, 0.8444, 0.8447, 0.5752, 0.7947, 0.6755, 0.918, 0.7851, 0.9974, 0.7444, 0.9249, 0.8925, 0.8671, 0.4985, 0.994, 0.6179, 0.6359, 0.6978, 0.824, 0.7425, 0.7739, 0.7383, 0.786, 0.7204, 0.8283)

#  Predict on the validation set using the best per-output GB models
# (y_val_pred_np already computed in previous cell)

# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred_np)

conf_matrices = []
if y_true_np.ndim == 2:
    for j in range(y_true_np.shape[1]):
        conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
    conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))

# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for output {idx}:")
    print(cm)

Confusion Matrix for output 0:
[[   0   18]
 [   0 1141]]
Confusion Matrix for output 1:
[[   0    1]
 [   0 1158]]
Confusion Matrix for output 2:
[[   0   48]
 [   0 1111]]
Confusion Matrix for output 3:
[[   0    4]
 [   0 1155]]
Confusion Matrix for output 4:
[[475 108]
 [152 424]]
Confusion Matrix for output 5:
[[672   0  63]
 [  0 178   0]
 [114   0 132]]
Confusion Matrix for output 6:
[[512  80]
 [153 414]]
Confusion Matrix for output 7:
[[195   0 197]
 [  0 178   0]
 [ 84   0 505]]
Confusion Matrix for output 8:
[[778   1]
 [376   4]]
Confusion Matrix for output 9:
[[464  43]
 [137 515]]
Confusion Matrix for output 10:
[[ 44 219]
 [ 19 877]]
Confusion Matrix for output 11:
[[  31   84]
 [  11 1033]]
Confusion Matrix for output 12:
[[   2    3]
 [   0 1154]]
Confusion Matrix for output 13:
[[ 17   0  79]
 [  0 177   1]
 [  7   0 878]]
Confusion Matrix for output 14:
[[487   0  34]
 [  1 177   0]
 [119   0 341]]
Confusion Matrix for output 15:
[[   0    7]
 [   0 1152]]
Confusion Matrix for output 16:
[[290  31  75]
 [ 34 122  73]
 [119  90 325]]
Confusion Matrix for output 17:
[[103 139]
 [ 65 852]]
Confusion Matrix for output 18:
[[246 137]
 [125 651]]
Confusion Matrix for output 19:
[[208 160]
 [ 88 703]]
Confusion Matrix for output 20:
[[120 157]
 [ 42 840]]

# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
    ax = plt.subplot(rows, cols, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()

models = [random_search_logistic, random_search_random_forest, random_search_xgboost, random_search_decision_tree, gradient_boosting_pipeline]
model_names = ['LogReg', 'RF', 'XGB', 'DT', 'GB']

# Compute ROC curves per output (treating each output as binary/multiclass-ovo not covered here)
# We'll assume binary outputs for ROC; if multiclass per output, switch to one-vs-rest ROC per class or use macro-averaged AUC.
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import numpy as np

# Limit to first output for quick visualization if outputs > 1
output_index = 0

plt.figure(figsize=(10, 8))
for i, model in enumerate(models):
    # Try to get probabilities for the selected output
    est = model.best_estimator_ if hasattr(model, 'best_estimator_') else model
    # Handle MultiOutputClassifier vs single estimator
    try:
        # Multi-output wrapped estimators: est.named_steps['classifier'].estimators_
        clf = est.named_steps['classifier']
        if hasattr(clf, 'estimators_'):
            # Get proba for the selected output from the wrapped estimator
            proba = clf.estimators_[output_index].predict_proba(X_val)
        else:
            proba = est.predict_proba(X_val)
    except Exception:
        # Fallback: skip if proba not available
        continue

    # If predict_proba returned a list (multi-output from single estimator), pick the selected output
    if isinstance(proba, list):
        if len(proba) <= output_index:
            continue
        proba_sel = proba[output_index]
    else:
        proba_sel = proba

    y_true = np.asarray(y_val)[:, output_index] if np.asarray(y_val).ndim == 2 else np.asarray(y_val)

    # If multiclass for this output, binarize one class (class 1) for demonstration
    classes = np.unique(y_true)
    if len(classes) > 2:
        y_true_bin = (y_true == classes[1]).astype(int)
        # Map the chosen class index into the columns of proba_sel
        class_idx = list(classes).index(classes[1]) if proba_sel.shape[1] == len(classes) else 1
        y_score = proba_sel[:, class_idx]
    else:
        # binary
        y_true_bin = (y_true == classes[-1]).astype(int)
        y_score = proba_sel[:, 1] if proba_sel.shape[1] > 1 else proba_sel.ravel()

    fpr, tpr, _ = roc_curve(y_true_bin, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{model_names[i]} (AUC = {roc_auc:.2f})')

# Plot ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) Curve - Output {output_index}')
plt.legend(loc='lower right')
plt.show()

# Save test predictions for multi-output to CSV (per-label columns, inverse-transformed)
# Prefer tuned XGBoost model if available
import numpy as np
import pandas as pd

# Choose the best available XGBoost-based model
if 'random_search_xgboost' in globals():
    best_xgb = random_search_xgboost.best_estimator_ if hasattr(random_search_xgboost, 'best_estimator_') else random_search_xgboost
elif 'xgboost_pipeline' in globals():
    best_xgb = xgboost_pipeline
else:
    raise RuntimeError("No XGBoost model found. Run the XGBoost training cell before exporting predictions.")

# Predict on the hold-out X_test
test_predictions = best_xgb.predict(X_test)

# Normalize to 2D numpy array
if isinstance(test_predictions, list):
    test_predictions = np.column_stack(test_predictions)

test_pred_np = np.asarray(test_predictions)

# Build DataFrame and inverse-transform using saved encoders if present
if isinstance(y_train, pd.DataFrame):
    pred_df = pd.DataFrame(test_pred_np, columns=y_train.columns, index=X_test.index)
    if 'encoders' in globals() and isinstance(encoders, dict):
        for col in pred_df.columns:
            le = encoders.get(col)
            if le is not None:
                pred_df[col] = le.inverse_transform(pred_df[col].astype(int))
else:
    pred_df = pd.DataFrame({'prediction': test_pred_np.ravel()}, index=X_test.index)
    if 'encoders' in globals() and isinstance(encoders, dict):
        le = encoders.get('_single')
        if le is not None:
            pred_df['prediction'] = le.inverse_transform(pred_df['prediction'].astype(int))

# Save to CSV with index (IDs)
output_path = '../pool-datasets/clf/submission_multioutput.csv'
pred_df.to_csv(output_path, index=True)
print(f"Saved multi-output predictions to {output_path} with shape {pred_df.shape}")
print(pred_df.head())

Saved multi-output predictions to ../pool-datasets/clf/submission_multioutput.csv with shape (1449, 21)
                 BIRC5  CCNB1  MYBL2  MMP11  GRB7  PGR  BCL2  SCUBE2  GSTM1  \
TCGA-EW-A6S9-01    1.0    1.0    1.0    1.0   1.0  0.0   1.0     0.0   -1.0   
TCGA-GM-A3XG-01    1.0    1.0    1.0    1.0   1.0  0.0   1.0     0.0    1.0   
F2500              1.0    1.0    1.0    1.0  -1.0 -1.0  -1.0     1.0   -1.0   
F1775              1.0    1.0    1.0    1.0  -1.0 -1.0  -1.0    -1.0   -1.0   
TCGA-E9-A1NE-01    1.0    1.0    1.0    1.0   1.0  0.0   1.0     0.0   -1.0   

                 BAG1  ...  ACTB  GAPDH  RPLP0  TFRC  AURKA  CTSV  MKI67  \
TCGA-EW-A6S9-01   1.0  ...   1.0    1.0    0.0   0.0    1.0   1.0    1.0   
TCGA-GM-A3XG-01   1.0  ...   1.0    1.0    0.0   0.0    1.0   1.0    1.0   
F2500            -1.0  ...   1.0    1.0    1.0  -1.0    1.0  -1.0    1.0   
F1775            -1.0  ...   1.0    1.0    1.0  -1.0    1.0  -1.0    1.0   
TCGA-E9-A1NE-01   1.0  ...   1.0    1.0    0.0   0.0    1.0   1.0    1.0   

                 ERBB2  GUSB  ESR1  
TCGA-EW-A6S9-01    1.0   1.0   1.0  
TCGA-GM-A3XG-01    1.0   1.0   1.0  
F2500             -1.0   1.0   1.0  
F1775             -1.0   1.0   1.0  
TCGA-E9-A1NE-01    1.0   1.0   1.0  

[5 rows x 21 columns]

y_test.loc['TCGA-EW-A6S9-01']

BIRC5     1.0
CCNB1     1.0
MYBL2     1.0
MMP11     1.0
GRB7      1.0
PGR       0.0
BCL2      1.0
SCUBE2    0.0
GSTM1     1.0
BAG1      1.0
CD68      1.0
ACTB      1.0
GAPDH     1.0
RPLP0     0.0
TFRC      0.0
AURKA     1.0
CTSV      1.0
MKI67     1.0
ERBB2     1.0
GUSB      1.0
ESR1      1.0
Name: TCGA-EW-A6S9-01, dtype: float64

pred_df.loc['TCGA-EW-A6S9-01']

BIRC5     1.0
CCNB1     1.0
MYBL2     1.0
MMP11     1.0
GRB7      1.0
PGR       0.0
BCL2      1.0
SCUBE2    0.0
GSTM1    -1.0
BAG1      1.0
CD68      1.0
ACTB      1.0
GAPDH     1.0
RPLP0     0.0
TFRC      0.0
AURKA     1.0
CTSV      1.0
MKI67     1.0
ERBB2     1.0
GUSB      1.0
ESR1      1.0
Name: TCGA-EW-A6S9-01, dtype: float64

y_val_np = np.asarray(y_test)
y_val_pred_np = np.asarray(pred_df)

rf_micro = (y_val_pred_np == y_val_np).mean()
if y_val_np.ndim == 2:
    rf_macro = (y_val_pred_np == y_val_np).mean(axis=0).mean()
else:
    from sklearn.metrics import accuracy_score
    rf_macro = accuracy_score(y_val_np, y_val_pred_np)

print(f"Best Model Test Accuracy (micro): {rf_micro:.4f}")
print(f"Best Model Test Accuracy (macro): {rf_macro:.4f}")
print(f"Best Parameters: {random_search_logistic.best_params_}")

Best Model Test Accuracy (micro): 0.8518
Best Model Test Accuracy (macro): 0.8518
Best Parameters: {'classifier__estimator__solver': 'lbfgs', 'classifier__estimator__C': np.float64(3792.690190732246)}

	BIRC5	CCNB1	MYBL2	MMP11	GRB7	PGR	BCL2	SCUBE2	GSTM1	BAG1	...	AURKA	CTSV	MKI67	ERBB2	GUSB	ESR1	IQGAP1	IQGAP2	FRG1	EEF1A2
count	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	...	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	7244.000000	6144.000000
mean	0.962452	0.994754	0.926560	0.993374	-0.041137	-0.442021	-0.066538	0.180839	-0.342076	0.088349	...	0.987300	0.110850	0.573716	0.370514	0.384594	0.523744	1.199757	1.453168	1.739550	1.276638
std	0.271472	0.102300	0.376173	0.114936	0.999222	0.807996	0.997853	0.903084	0.939737	0.996158	...	0.158879	0.884617	0.819111	0.928891	0.923149	0.851935	0.835877	1.014409	0.499565	1.126547
min	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	...	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	0.082729	0.017385	0.004461	0.000000
25%	1.000000	1.000000	1.000000	1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	...	1.000000	-1.000000	1.000000	-1.000000	-1.000000	1.000000	0.501858	0.692744	1.425864	0.277508
50%	1.000000	1.000000	1.000000	1.000000	-1.000000	-1.000000	-1.000000	1.000000	-1.000000	1.000000	...	1.000000	0.000000	1.000000	1.000000	1.000000	1.000000	0.731611	1.196585	1.632566	1.028403
75%	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	1.000000	1.000000	1.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.882118	1.984385	1.964976	2.004722
max	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	4.338948	6.246560	5.467065	7.079042

Gene	Role in Cancer
IQGAP1	Predominantly oncogene, with rare, context-specific exceptions
IQGAP2	Clear tumor suppressor across multiple cancer types
FRG1	Tumor suppressor, particularly in breast and prostate cancer
EEF1A2	Clearly functions as an oncogene

	Model	BIRC5macro_score	BIRC5micro_score	CCNB1macro_score	CCNB1micro_score	MYBL2macro_score	MYBL2micro_score	MMP11macro_score	MMP11micro_score	GRB7macro_score	GRB7micro_score	PGRmacro_score	PGRmicro_score	BCL2macro_score	BCL2micro_score	SCUBE2macro_score	SCUBE2micro_score	GSTM1macro_score	GSTM1micro_score	BAG1macro_score	BAG1micro_score	CD68macro_score	CD68micro_score	ACTBmacro_score	ACTBmicro_score	GAPDHmacro_score	GAPDHmicro_score	RPLP0macro_score	RPLP0micro_score	TFRCmacro_score	TFRCmicro_score	AURKAmacro_score	AURKAmicro_score	CTSVmacro_score	CTSVmicro_score	MKI67macro_score	MKI67micro_score	ERBB2macro_score	ERBB2micro_score	GUSBmacro_score	GUSBmicro_score	ESR1macro_score	ESR1micro_score
9	ExtraTreesClassifier	0.596500	0.986200	0.499800	0.999100	0.489200	0.957700	0.499100	0.996500	0.788700	0.789500	0.863300	0.882700	0.823600	0.824800	0.816800	0.786000	0.607500	0.697200	0.852100	0.852500	0.656300	0.799800	0.709600	0.923200	0.665800	0.996500	0.819900	0.934400	0.900800	0.878300	0.498500	0.994000	0.524800	0.559100	0.729700	0.836100	0.713400	0.746300	0.748300	0.786900	0.758900	0.843800
27	RandomForestClassifier	0.596500	0.986200	0.499800	0.999100	0.488800	0.956000	0.499100	0.996500	0.786300	0.786900	0.852200	0.872300	0.814700	0.816200	0.817100	0.785200	0.576100	0.676400	0.849500	0.849900	0.628600	0.780000	0.709600	0.921500	0.498900	0.995700	0.770200	0.923200	0.900300	0.876600	0.498500	0.994000	0.560600	0.585000	0.721900	0.834300	0.731100	0.763600	0.756400	0.792900	0.752000	0.840400
13	HistGradientBoostingClassifier	0.591500	0.985300	0.499800	0.999100	0.507600	0.956000	0.499100	0.996500	0.770800	0.771400	0.825000	0.847300	0.797700	0.799000	0.792000	0.756700	0.535100	0.661800	0.850300	0.850700	0.604500	0.780000	0.697000	0.916300	0.498700	0.994800	0.774900	0.915400	0.888900	0.861900	0.498300	0.993100	0.555900	0.576400	0.699700	0.816200	0.732300	0.765300	0.736300	0.779100	0.732500	0.831800
1	BaggingClassifier	0.579000	0.982700	0.499800	0.999100	0.520200	0.950800	0.499100	0.996500	0.753200	0.755000	0.819100	0.848100	0.793000	0.794700	0.784700	0.742000	0.549300	0.649700	0.824500	0.824800	0.610700	0.741200	0.693400	0.906800	0.498500	0.994000	0.800300	0.918900	0.895400	0.870600	0.497800	0.991400	0.550500	0.569500	0.725100	0.823100	0.727500	0.754100	0.739600	0.773900	0.742000	0.827400
21	MLPClassifier	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.755700	0.756700	0.791700	0.821400	0.787000	0.788600	0.778600	0.751500	0.410200	0.667800	0.841800	0.842100	0.557700	0.783400	0.629300	0.910300	0.498900	0.995700	0.698500	0.909400	0.877000	0.857600	0.498500	0.994000	0.609800	0.629900	0.712400	0.833500	0.744400	0.767000	0.736800	0.791200	0.706500	0.824800
0	AdaBoostClassifier	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.769400	0.769600	0.738900	0.780000	0.762600	0.764500	0.720700	0.705800	0.402000	0.672100	0.839200	0.839500	0.571900	0.785200	0.505800	0.899900	0.498700	0.994800	0.659200	0.895600	0.876100	0.858500	0.498300	0.993100	0.574600	0.616900	0.664800	0.812800	0.717800	0.763600	0.730900	0.768800	0.712800	0.824800
12	GradientBoostingClassifier	0.538900	0.981900	0.498900	0.995700	0.488300	0.954300	0.499100	0.996500	0.779600	0.780000	0.815300	0.839500	0.787000	0.788600	0.788100	0.755800	0.457800	0.676400	0.843500	0.843800	0.576700	0.783400	0.692000	0.917200	0.609600	0.994000	0.757300	0.923200	0.893100	0.867100	0.497000	0.987900	0.623900	0.638500	0.699300	0.826600	0.742700	0.773100	0.747000	0.787700	0.706700	0.824000
11	GaussianProcessClassifier	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.762600	0.763600	0.767500	0.816200	0.794000	0.795500	0.740500	0.727400	0.434400	0.671300	0.845200	0.845600	0.534300	0.787700	0.590600	0.907700	0.498900	0.995700	0.677100	0.897300	0.850200	0.840400	0.498500	0.994000	0.605100	0.628100	0.669300	0.814500	0.743600	0.772200	0.761900	0.800700	0.706800	0.823100
31	SVC	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.763600	0.764500	0.765400	0.813600	0.783300	0.786000	0.732500	0.723900	0.402000	0.672100	0.842000	0.842100	0.436000	0.773100	0.551800	0.905100	0.498900	0.995700	0.654200	0.897300	0.854200	0.843800	0.498500	0.994000	0.597500	0.629900	0.662600	0.820500	0.741100	0.771400	0.756900	0.799800	0.681000	0.817100
19	LogisticRegression	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.754600	0.755800	0.649600	0.755800	0.764500	0.766200	0.671300	0.679000	0.422500	0.669500	0.838400	0.838700	0.444000	0.774800	0.482100	0.899900	0.498900	0.995700	0.559900	0.857600	0.791700	0.805900	0.498500	0.994000	0.467900	0.581500	0.604000	0.804100	0.742700	0.773100	0.730300	0.781700	0.686500	0.815400
20	LogisticRegressionCV	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.754600	0.755800	0.643900	0.755000	0.766200	0.767900	0.669000	0.683300	0.402000	0.672100	0.838400	0.838700	0.444000	0.774800	0.473900	0.900800	0.498900	0.995700	0.559400	0.857600	0.789400	0.804100	0.498500	0.994000	0.429600	0.578100	0.602500	0.809300	0.748500	0.780000	0.730300	0.781700	0.684500	0.814500
3	CalibratedClassifierCV	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.753700	0.755000	0.616300	0.748100	0.763600	0.765300	0.675700	0.683300	0.419900	0.668700	0.839300	0.839500	0.440000	0.773900	0.481200	0.897300	0.498900	0.995700	0.561400	0.859400	0.768800	0.788600	0.498500	0.994000	0.446100	0.572000	0.615500	0.808500	0.745200	0.775700	0.732400	0.783400	0.679100	0.812800
17	LinearDiscriminantAnalysis	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.750000	0.751500	0.682600	0.760100	0.764000	0.766200	0.684200	0.688500	0.422500	0.669500	0.833400	0.833500	0.436000	0.773100	0.481800	0.899100	0.498900	0.995700	0.577500	0.858500	0.784700	0.799000	0.498500	0.994000	0.465900	0.579800	0.453000	0.789500	0.739300	0.767900	0.719100	0.776500	0.668500	0.811000
14	KNeighborsClassifier	0.495900	0.983600	0.499800	0.999100	0.488300	0.954300	0.499100	0.996500	0.737800	0.738600	0.765200	0.812800	0.781300	0.782600	0.762200	0.738600	0.528500	0.611700	0.815500	0.816200	0.587600	0.748100	0.637000	0.909400	0.665800	0.996500	0.705100	0.884400	0.849300	0.835200	0.498500	0.994000	0.555500	0.572900	0.690300	0.810200	0.714200	0.749800	0.702300	0.747200	0.703900	0.806700
18	LinearSVC	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.750000	0.751500	0.598900	0.747200	0.765000	0.767000	0.669300	0.679000	0.418300	0.670400	0.836800	0.836900	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.561400	0.859400	0.766200	0.786000	0.498500	0.994000	0.443100	0.578100	0.564900	0.805000	0.748600	0.774800	0.731300	0.786000	0.633400	0.803300
10	GaussianNB	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.498700	0.994800	0.736600	0.739400	0.803200	0.814500	0.757300	0.760100	0.788400	0.748100	0.493600	0.658300	0.816200	0.816200	0.562200	0.643700	0.480000	0.893900	0.498700	0.994800	0.756200	0.875800	0.891300	0.865400	0.498500	0.994000	0.473800	0.564300	0.661400	0.692800	0.762700	0.767900	0.721500	0.729100	0.684800	0.802400
26	QuadraticDiscriminantAnalysis	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.498700	0.994800	0.746600	0.748900	0.782300	0.810200	0.750500	0.753200	0.771800	0.745500	0.424400	0.668700	0.821300	0.821400	0.481500	0.769600	0.545300	0.891300	0.630900	0.990500	0.775200	0.902500	0.889300	0.862800	0.498500	0.994000	0.510400	0.591000	0.701700	0.742000	0.762800	0.769600	0.739600	0.753200	0.677100	0.799000
30	SGDClassifier	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.752700	0.754100	0.615200	0.754100	0.765700	0.767000	0.611800	0.659200	0.402000	0.672100	0.830000	0.830000	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.555400	0.851600	0.625500	0.733400	0.498500	0.994000	0.431200	0.568600	0.457800	0.792100	0.722000	0.766200	0.733300	0.764500	0.604400	0.798100
28	RidgeClassifier	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.750000	0.751500	0.596800	0.748100	0.764000	0.766200	0.663500	0.674700	0.417900	0.669500	0.833400	0.833500	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.565000	0.861100	0.700500	0.743700	0.498500	0.994000	0.436300	0.575500	0.441700	0.791200	0.725000	0.767000	0.696000	0.773900	0.595000	0.793800
29	RidgeClassifierCV	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.750000	0.751500	0.597300	0.748100	0.763100	0.765300	0.664900	0.675600	0.418300	0.670400	0.833400	0.833500	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.562900	0.859400	0.700500	0.743700	0.498500	0.994000	0.436300	0.575500	0.441700	0.791200	0.724600	0.767000	0.694400	0.773100	0.589700	0.792100
4	CategoricalNB	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.742600	0.744600	0.745200	0.783400	0.759400	0.761900	0.735700	0.717900	0.500400	0.647100	0.839300	0.839500	0.513200	0.771400	0.489900	0.899100	0.498900	0.995700	0.686800	0.864500	0.838700	0.831800	0.498500	0.994000	0.552200	0.591900	0.645600	0.743700	0.759400	0.770500	0.734500	0.751500	0.648100	0.771400
8	ExtraTreeClassifier	0.598700	0.974100	0.499100	0.996500	0.537500	0.941300	0.499100	0.996500	0.680600	0.680800	0.792500	0.815400	0.752300	0.753200	0.730300	0.685900	0.561100	0.610000	0.777600	0.780000	0.581600	0.691100	0.633400	0.874000	0.771600	0.995700	0.728100	0.866300	0.824700	0.797200	0.496100	0.984500	0.487200	0.516000	0.649800	0.769600	0.657100	0.703200	0.689400	0.737700	0.687600	0.769600
6	DecisionTreeClassifier	0.561600	0.968100	0.499100	0.996500	0.500100	0.920600	0.499100	0.996500	0.695400	0.695400	0.805800	0.815400	0.743300	0.743700	0.751700	0.706600	0.522200	0.574600	0.756400	0.761000	0.595000	0.713500	0.669700	0.887000	0.598300	0.993100	0.788800	0.896500	0.851400	0.815400	0.496700	0.987100	0.497700	0.523700	0.685800	0.792100	0.680100	0.717900	0.706100	0.749800	0.689000	0.769600
2	BernoulliNB	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.340400	0.505600	0.258700	0.634200	0.338100	0.510800	0.228000	0.509100	0.402000	0.672100	0.360000	0.562600	0.435200	0.770500	0.491000	0.901600	0.498900	0.995700	0.309300	0.766200	0.211400	0.452100	0.498500	0.994000	0.210300	0.460700	0.441700	0.791200	0.401000	0.669500	0.405600	0.682500	0.439400	0.761900
22	MultinomialNB	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.679300	0.684200	0.262500	0.634200	0.693100	0.704900	0.236000	0.511600	0.402000	0.672100	0.638300	0.653100	0.439700	0.773100	0.473900	0.900800	0.498900	0.995700	0.292500	0.764500	0.447500	0.638500	0.498500	0.994000	0.330600	0.496100	0.441700	0.791200	0.472000	0.671300	0.441200	0.685900	0.432100	0.761000
15	LabelPropagation	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.332000	0.497000	0.431900	0.349400	0.328500	0.489200	0.552800	0.644500	0.246900	0.327900	0.360000	0.562600	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.619400	0.899100	0.519000	0.531500	0.498500	0.994000	0.316900	0.439200	0.441700	0.791200	0.401000	0.669500	0.405600	0.682500	0.432100	0.761000
7	DummyClassifier	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.334700	0.503000	0.258700	0.634200	0.338100	0.510800	0.224600	0.508200	0.402000	0.672100	0.360000	0.562600	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.288600	0.763600	0.206700	0.449500	0.498500	0.994000	0.210300	0.460700	0.441700	0.791200	0.401000	0.669500	0.405600	0.682500	0.432100	0.761000
16	LabelSpreading	0.496100	0.984500	0.499800	0.999100	0.489400	0.958600	0.499100	0.996500	0.332000	0.497000	0.433900	0.350300	0.328500	0.489200	0.553900	0.645400	0.246900	0.327900	0.360000	0.562600	0.436000	0.773100	0.473900	0.900800	0.498900	0.995700	0.619400	0.899100	0.517800	0.530600	0.498500	0.994000	0.319500	0.440000	0.441700	0.791200	0.401000	0.669500	0.405600	0.682500	0.432100	0.761000
23	NearestCentroid	0.378200	0.550500	0.379600	0.611700	0.412800	0.551300	0.422200	0.698900	0.744500	0.746300	0.625900	0.704100	0.771900	0.773900	0.540700	0.528900	0.534700	0.561700	0.807600	0.807600	0.545600	0.567700	0.407800	0.476300	0.444600	0.757500	0.577800	0.718700	0.682700	0.720400	0.382200	0.589300	0.527500	0.573800	0.654200	0.679900	0.748700	0.753200	0.699500	0.705800	0.638500	0.678200
5	ComplementNB	0.379500	0.553100	0.377600	0.601400	0.410900	0.543600	0.402000	0.647100	0.629000	0.629000	0.399400	0.541000	0.673700	0.675600	0.441900	0.477100	0.503100	0.527200	0.696300	0.698000	0.558500	0.614300	0.419300	0.497000	0.415600	0.675600	0.376300	0.591900	0.429600	0.485800	0.357200	0.532400	0.467500	0.556500	0.628900	0.675600	0.634200	0.656600	0.644700	0.660100	0.576500	0.601400
24	PassiveAggressiveClassifier	0.496100	0.984500	0.499800	0.999100	0.519300	0.871400	0.499100	0.996500	0.716600	0.722200	0.524800	0.736800	0.746300	0.749800	0.573400	0.641900	0.478500	0.478900	0.805000	0.805000	0.484400	0.769600	0.503400	0.895600	0.498900	0.995700	0.607400	0.791200	0.669900	0.707500	0.498500	0.994000	0.450000	0.535800	0.479200	0.777400	0.764900	0.778300	0.419400	0.685100	0.495200	0.504700
25	Perceptron	0.496100	0.984500	0.499800	0.999100	0.504900	0.951700	0.499100	0.996500	0.457700	0.558200	0.525700	0.723000	0.686000	0.702300	0.614500	0.644500	0.411900	0.672100	0.734000	0.750600	0.185000	0.226900	0.473900	0.900800	0.498900	0.995700	0.581100	0.816200	0.763600	0.775700	0.498100	0.992200	0.327900	0.422800	0.445200	0.789500	0.456100	0.685100	0.534900	0.535800	0.500300	0.504700

IMPORTS¶

COMMON SET UP¶

DATA LOADING¶

Data Exploration¶

DATA PREPROCESSING¶

TRAIN, TEST, DEV SPLITS¶

CUSTOM FUNC FOR F1 MICRO AND MACRO¶

RUNNING ALL AVAILABEL ESTIMATORS ON SCIKIT-LEARN¶

SELECT MODEL AND FINETUNING¶

RANDOM_SEARCH_LOGISTIC¶

RANDOMFORESTCLASSIFIER MODEL FOR CLASSIFICATION¶

RANDOM_SEARCH_RANDOM_FOREST¶

XGBCLASSIFIER MODEL FOR CLASSIFICATION¶

DECISIONTREECLASSIFIER MODEL FOR CLASSIFICATION¶

GRADIENTBOOSTINGCLASSIFIER MODEL FOR CLASSIFICATION¶

PERFORMANCE ON THE TEST SET¶

	Model	BIRC5macro_score	BIRC5micro_score	CCNB1macro_score	CCNB1micro_score	MYBL2macro_score	MYBL2micro_score	MMP11macro_score	MMP11micro_score	GRB7macro_score	...	CTSVmicro_score	MKI67macro_score	MKI67micro_score	ERBB2macro_score	ERBB2micro_score	GUSBmacro_score	GUSBmicro_score	ESR1macro_score	ESR1micro_score	x
9	ExtraTreesClassifier	0.5965	0.9862	0.4998	0.9991	0.4892	0.9577	0.4991	0.9965	0.7887	...	0.5591	0.7297	0.8361	0.7134	0.7463	0.7483	0.7869	0.7589	0.8438	0.000000
27	RandomForestClassifier	0.5965	0.9862	0.4998	0.9991	0.4888	0.9560	0.4991	0.9965	0.7863	...	0.5850	0.7219	0.8343	0.7311	0.7636	0.7564	0.7929	0.7520	0.8404	0.032258
13	HistGradientBoostingClassifier	0.5915	0.9853	0.4998	0.9991	0.5076	0.9560	0.4991	0.9965	0.7708	...	0.5764	0.6997	0.8162	0.7323	0.7653	0.7363	0.7791	0.7325	0.8318	0.064516
1	BaggingClassifier	0.5790	0.9827	0.4998	0.9991	0.5202	0.9508	0.4991	0.9965	0.7532	...	0.5695	0.7251	0.8231	0.7275	0.7541	0.7396	0.7739	0.7420	0.8274	0.096774
21	MLPClassifier	0.4961	0.9845	0.4998	0.9991	0.4894	0.9586	0.4991	0.9965	0.7557	...	0.6299	0.7124	0.8335	0.7444	0.7670	0.7368	0.7912	0.7065	0.8248	0.129032