In [1]:
import warnings
warnings.filterwarnings('ignore')

IMPORTS¶

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import (train_test_split)
from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier

from sklearn.multioutput import MultiOutputClassifier

import xgboost as xgb

COMMON SET UP¶

In [3]:
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

DATA LOADING¶

In [4]:
df = pd.read_csv("../pool-datasets/clf/oncotypedx_subset_transpose.csv", index_col=0)
In [5]:
df.shape
Out[5]:
(7244, 25)
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7244 entries, MBC-MBCProject_wAiri7fp-Tumor-SM-AZ5DH to F3265repl
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BIRC5   7244 non-null   float64
 1   CCNB1   7244 non-null   float64
 2   MYBL2   7244 non-null   float64
 3   MMP11   7244 non-null   float64
 4   GRB7    7244 non-null   float64
 5   PGR     7244 non-null   float64
 6   BCL2    7244 non-null   float64
 7   SCUBE2  7244 non-null   float64
 8   GSTM1   7244 non-null   float64
 9   BAG1    7244 non-null   float64
 10  CD68    7244 non-null   float64
 11  ACTB    7244 non-null   float64
 12  GAPDH   7244 non-null   float64
 13  RPLP0   7244 non-null   float64
 14  TFRC    7244 non-null   float64
 15  AURKA   7244 non-null   float64
 16  CTSV    7244 non-null   float64
 17  MKI67   7244 non-null   float64
 18  ERBB2   7244 non-null   float64
 19  GUSB    7244 non-null   float64
 20  ESR1    7244 non-null   float64
 21  IQGAP1  7244 non-null   float64
 22  IQGAP2  7244 non-null   float64
 23  FRG1    7244 non-null   float64
 24  EEF1A2  6144 non-null   float64
dtypes: float64(25)
memory usage: 1.4+ MB
In [7]:
df.describe(include="all")
Out[7]:
BIRC5 CCNB1 MYBL2 MMP11 GRB7 PGR BCL2 SCUBE2 GSTM1 BAG1 ... AURKA CTSV MKI67 ERBB2 GUSB ESR1 IQGAP1 IQGAP2 FRG1 EEF1A2
count 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 ... 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 7244.000000 6144.000000
mean 0.962452 0.994754 0.926560 0.993374 -0.041137 -0.442021 -0.066538 0.180839 -0.342076 0.088349 ... 0.987300 0.110850 0.573716 0.370514 0.384594 0.523744 1.199757 1.453168 1.739550 1.276638
std 0.271472 0.102300 0.376173 0.114936 0.999222 0.807996 0.997853 0.903084 0.939737 0.996158 ... 0.158879 0.884617 0.819111 0.928891 0.923149 0.851935 0.835877 1.014409 0.499565 1.126547
min -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 0.082729 0.017385 0.004461 0.000000
25% 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 1.000000 -1.000000 -1.000000 1.000000 0.501858 0.692744 1.425864 0.277508
50% 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 ... 1.000000 0.000000 1.000000 1.000000 1.000000 1.000000 0.731611 1.196585 1.632566 1.028403
75% 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.882118 1.984385 1.964976 2.004722
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 4.338948 6.246560 5.467065 7.079042

8 rows × 25 columns

In [8]:
df.fillna(df.mean(), inplace=True)
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7244 entries, MBC-MBCProject_wAiri7fp-Tumor-SM-AZ5DH to F3265repl
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BIRC5   7244 non-null   float64
 1   CCNB1   7244 non-null   float64
 2   MYBL2   7244 non-null   float64
 3   MMP11   7244 non-null   float64
 4   GRB7    7244 non-null   float64
 5   PGR     7244 non-null   float64
 6   BCL2    7244 non-null   float64
 7   SCUBE2  7244 non-null   float64
 8   GSTM1   7244 non-null   float64
 9   BAG1    7244 non-null   float64
 10  CD68    7244 non-null   float64
 11  ACTB    7244 non-null   float64
 12  GAPDH   7244 non-null   float64
 13  RPLP0   7244 non-null   float64
 14  TFRC    7244 non-null   float64
 15  AURKA   7244 non-null   float64
 16  CTSV    7244 non-null   float64
 17  MKI67   7244 non-null   float64
 18  ERBB2   7244 non-null   float64
 19  GUSB    7244 non-null   float64
 20  ESR1    7244 non-null   float64
 21  IQGAP1  7244 non-null   float64
 22  IQGAP2  7244 non-null   float64
 23  FRG1    7244 non-null   float64
 24  EEF1A2  7244 non-null   float64
dtypes: float64(25)
memory usage: 1.4+ MB
In [10]:
train, test = train_test_split(df,test_size=0.20,random_state=SEED)
y_train, X_train = train.drop(['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2'],axis=1).copy(), train[['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2']].copy()
y_test, X_test = test.drop(['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2'],axis=1).copy(), test[['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2']].copy()
del(train)
del(test)
In [11]:
len(X_train), len(X_test), len(y_train), len(y_test)
print(f'Train Samples: {len(X_train), len(y_train)} and Test Samples: {len(X_test), len(y_test)}')
Train Samples: (5795, 5795) and Test Samples: (1449, 1449)

IQGAP1 and EEF1A2 are oncogene IQGAP2 and FRG1 are tumor suppressor

Gene Role in Cancer
IQGAP1 Predominantly oncogene, with rare, context-specific exceptions
IQGAP2 Clear tumor suppressor across multiple cancer types
FRG1 Tumor suppressor, particularly in breast and prostate cancer
EEF1A2 Clearly functions as an oncogene

Data Exploration¶

In [12]:
X_train.plot(kind="scatter", x="IQGAP1",y="FRG1", grid=True)
plt.show() #BEFORE removing 0 values
No description has been provided for this image
In [13]:
X_train.plot(kind="scatter", x="EEF1A2",y="IQGAP2", grid=True)
Out[13]:
<Axes: xlabel='EEF1A2', ylabel='IQGAP2'>
No description has been provided for this image
In [14]:
X_train.plot(kind="scatter", x="EEF1A2",y="IQGAP1", grid=True)
Out[14]:
<Axes: xlabel='EEF1A2', ylabel='IQGAP1'>
No description has been provided for this image
In [15]:
X_train.plot(kind="scatter", x="FRG1",y="IQGAP2", grid=True)
Out[15]:
<Axes: xlabel='FRG1', ylabel='IQGAP2'>
No description has been provided for this image

In the SE crimes are dominated by part 2 and violent crimes, in NW crimes are more general

In [16]:
import matplotlib.pyplot as plt

X_train.hist(bins=50, figsize=(12,8))
plt.show() #clearly victim age has some 0 values so we will remove that
No description has been provided for this image
In [17]:
y_train.hist(bins=50, figsize=(12,8))
plt.show() 
No description has been provided for this image

NOTE: Dataset is imbalanced

DATA PREPROCESSING¶

No, preprocessing as of now.

TRAIN, TEST, DEV SPLITS¶

In [18]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Simple preprocessor placeholder used by some pipelines below
preprocessor = 'passthrough'

CUSTOM FUNC FOR F1 MICRO AND MACRO¶

In [19]:
# Define a function to create a pipeline with a given model
def custom_f1score(y_val, y_pred, model_name, f1_score): # take (samples, outputs)
  name_col = None
  n_col = []
  row_tup = tuple()
  row_tup = row_tup + (model_name,)  # Add model name to the tuple
  for idx, col in enumerate(y_val.columns):
      macro_acc = f1_score(y_val[col].values, y_pred[:, idx], average='macro')
      row_tup = row_tup + (round(macro_acc, 4),)
      n_col.append(col+'macro_score')
      micro_acc = f1_score(y_val[col].values, y_pred[:, idx], average='micro')
      row_tup = row_tup + (round(micro_acc, 4),)
      n_col.append(col+'micro_score')
  name_col = n_col
  return row_tup, name_col

RUNNING ALL AVAILABEL ESTIMATORS ON SCIKIT-LEARN¶

In [20]:
from sklearn.utils import all_estimators
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification


# Get all sklearn classifiers
all_classifiers = all_estimators(type_filter="classifier")

results = []
name_col = None
for name, ClfClass in all_classifiers:
    try:
        clf = MultiOutputClassifier(ClfClass())
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        row_tup, name_col = custom_f1score(y_val, y_pred, name, f1_score)
        # Append the results
        results.append(row_tup)
        del row_tup  # Delete to save the memory!
    except Exception:
        pass  # Ignore models that fail

# Sort and show
results_df = pd.DataFrame(results, columns=["Model"] + name_col)
results_df = results_df.sort_values(by="ESR1micro_score", ascending=False)
In [21]:
from IPython.display import display
display(results_df.style.set_table_attributes("style='display:inline'").set_table_styles(
  [{'selector':'table', 'props': [('overflow', 'scroll'), ('display', 'block')]}]
))
  Model BIRC5macro_score BIRC5micro_score CCNB1macro_score CCNB1micro_score MYBL2macro_score MYBL2micro_score MMP11macro_score MMP11micro_score GRB7macro_score GRB7micro_score PGRmacro_score PGRmicro_score BCL2macro_score BCL2micro_score SCUBE2macro_score SCUBE2micro_score GSTM1macro_score GSTM1micro_score BAG1macro_score BAG1micro_score CD68macro_score CD68micro_score ACTBmacro_score ACTBmicro_score GAPDHmacro_score GAPDHmicro_score RPLP0macro_score RPLP0micro_score TFRCmacro_score TFRCmicro_score AURKAmacro_score AURKAmicro_score CTSVmacro_score CTSVmicro_score MKI67macro_score MKI67micro_score ERBB2macro_score ERBB2micro_score GUSBmacro_score GUSBmicro_score ESR1macro_score ESR1micro_score
9 ExtraTreesClassifier 0.596500 0.986200 0.499800 0.999100 0.489200 0.957700 0.499100 0.996500 0.788700 0.789500 0.863300 0.882700 0.823600 0.824800 0.816800 0.786000 0.607500 0.697200 0.852100 0.852500 0.656300 0.799800 0.709600 0.923200 0.665800 0.996500 0.819900 0.934400 0.900800 0.878300 0.498500 0.994000 0.524800 0.559100 0.729700 0.836100 0.713400 0.746300 0.748300 0.786900 0.758900 0.843800
27 RandomForestClassifier 0.596500 0.986200 0.499800 0.999100 0.488800 0.956000 0.499100 0.996500 0.786300 0.786900 0.852200 0.872300 0.814700 0.816200 0.817100 0.785200 0.576100 0.676400 0.849500 0.849900 0.628600 0.780000 0.709600 0.921500 0.498900 0.995700 0.770200 0.923200 0.900300 0.876600 0.498500 0.994000 0.560600 0.585000 0.721900 0.834300 0.731100 0.763600 0.756400 0.792900 0.752000 0.840400
13 HistGradientBoostingClassifier 0.591500 0.985300 0.499800 0.999100 0.507600 0.956000 0.499100 0.996500 0.770800 0.771400 0.825000 0.847300 0.797700 0.799000 0.792000 0.756700 0.535100 0.661800 0.850300 0.850700 0.604500 0.780000 0.697000 0.916300 0.498700 0.994800 0.774900 0.915400 0.888900 0.861900 0.498300 0.993100 0.555900 0.576400 0.699700 0.816200 0.732300 0.765300 0.736300 0.779100 0.732500 0.831800
1 BaggingClassifier 0.579000 0.982700 0.499800 0.999100 0.520200 0.950800 0.499100 0.996500 0.753200 0.755000 0.819100 0.848100 0.793000 0.794700 0.784700 0.742000 0.549300 0.649700 0.824500 0.824800 0.610700 0.741200 0.693400 0.906800 0.498500 0.994000 0.800300 0.918900 0.895400 0.870600 0.497800 0.991400 0.550500 0.569500 0.725100 0.823100 0.727500 0.754100 0.739600 0.773900 0.742000 0.827400
21 MLPClassifier 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.755700 0.756700 0.791700 0.821400 0.787000 0.788600 0.778600 0.751500 0.410200 0.667800 0.841800 0.842100 0.557700 0.783400 0.629300 0.910300 0.498900 0.995700 0.698500 0.909400 0.877000 0.857600 0.498500 0.994000 0.609800 0.629900 0.712400 0.833500 0.744400 0.767000 0.736800 0.791200 0.706500 0.824800
0 AdaBoostClassifier 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.769400 0.769600 0.738900 0.780000 0.762600 0.764500 0.720700 0.705800 0.402000 0.672100 0.839200 0.839500 0.571900 0.785200 0.505800 0.899900 0.498700 0.994800 0.659200 0.895600 0.876100 0.858500 0.498300 0.993100 0.574600 0.616900 0.664800 0.812800 0.717800 0.763600 0.730900 0.768800 0.712800 0.824800
12 GradientBoostingClassifier 0.538900 0.981900 0.498900 0.995700 0.488300 0.954300 0.499100 0.996500 0.779600 0.780000 0.815300 0.839500 0.787000 0.788600 0.788100 0.755800 0.457800 0.676400 0.843500 0.843800 0.576700 0.783400 0.692000 0.917200 0.609600 0.994000 0.757300 0.923200 0.893100 0.867100 0.497000 0.987900 0.623900 0.638500 0.699300 0.826600 0.742700 0.773100 0.747000 0.787700 0.706700 0.824000
11 GaussianProcessClassifier 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.762600 0.763600 0.767500 0.816200 0.794000 0.795500 0.740500 0.727400 0.434400 0.671300 0.845200 0.845600 0.534300 0.787700 0.590600 0.907700 0.498900 0.995700 0.677100 0.897300 0.850200 0.840400 0.498500 0.994000 0.605100 0.628100 0.669300 0.814500 0.743600 0.772200 0.761900 0.800700 0.706800 0.823100
31 SVC 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.763600 0.764500 0.765400 0.813600 0.783300 0.786000 0.732500 0.723900 0.402000 0.672100 0.842000 0.842100 0.436000 0.773100 0.551800 0.905100 0.498900 0.995700 0.654200 0.897300 0.854200 0.843800 0.498500 0.994000 0.597500 0.629900 0.662600 0.820500 0.741100 0.771400 0.756900 0.799800 0.681000 0.817100
19 LogisticRegression 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.754600 0.755800 0.649600 0.755800 0.764500 0.766200 0.671300 0.679000 0.422500 0.669500 0.838400 0.838700 0.444000 0.774800 0.482100 0.899900 0.498900 0.995700 0.559900 0.857600 0.791700 0.805900 0.498500 0.994000 0.467900 0.581500 0.604000 0.804100 0.742700 0.773100 0.730300 0.781700 0.686500 0.815400
20 LogisticRegressionCV 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.754600 0.755800 0.643900 0.755000 0.766200 0.767900 0.669000 0.683300 0.402000 0.672100 0.838400 0.838700 0.444000 0.774800 0.473900 0.900800 0.498900 0.995700 0.559400 0.857600 0.789400 0.804100 0.498500 0.994000 0.429600 0.578100 0.602500 0.809300 0.748500 0.780000 0.730300 0.781700 0.684500 0.814500
3 CalibratedClassifierCV 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.753700 0.755000 0.616300 0.748100 0.763600 0.765300 0.675700 0.683300 0.419900 0.668700 0.839300 0.839500 0.440000 0.773900 0.481200 0.897300 0.498900 0.995700 0.561400 0.859400 0.768800 0.788600 0.498500 0.994000 0.446100 0.572000 0.615500 0.808500 0.745200 0.775700 0.732400 0.783400 0.679100 0.812800
17 LinearDiscriminantAnalysis 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.750000 0.751500 0.682600 0.760100 0.764000 0.766200 0.684200 0.688500 0.422500 0.669500 0.833400 0.833500 0.436000 0.773100 0.481800 0.899100 0.498900 0.995700 0.577500 0.858500 0.784700 0.799000 0.498500 0.994000 0.465900 0.579800 0.453000 0.789500 0.739300 0.767900 0.719100 0.776500 0.668500 0.811000
14 KNeighborsClassifier 0.495900 0.983600 0.499800 0.999100 0.488300 0.954300 0.499100 0.996500 0.737800 0.738600 0.765200 0.812800 0.781300 0.782600 0.762200 0.738600 0.528500 0.611700 0.815500 0.816200 0.587600 0.748100 0.637000 0.909400 0.665800 0.996500 0.705100 0.884400 0.849300 0.835200 0.498500 0.994000 0.555500 0.572900 0.690300 0.810200 0.714200 0.749800 0.702300 0.747200 0.703900 0.806700
18 LinearSVC 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.750000 0.751500 0.598900 0.747200 0.765000 0.767000 0.669300 0.679000 0.418300 0.670400 0.836800 0.836900 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.561400 0.859400 0.766200 0.786000 0.498500 0.994000 0.443100 0.578100 0.564900 0.805000 0.748600 0.774800 0.731300 0.786000 0.633400 0.803300
10 GaussianNB 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.498700 0.994800 0.736600 0.739400 0.803200 0.814500 0.757300 0.760100 0.788400 0.748100 0.493600 0.658300 0.816200 0.816200 0.562200 0.643700 0.480000 0.893900 0.498700 0.994800 0.756200 0.875800 0.891300 0.865400 0.498500 0.994000 0.473800 0.564300 0.661400 0.692800 0.762700 0.767900 0.721500 0.729100 0.684800 0.802400
26 QuadraticDiscriminantAnalysis 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.498700 0.994800 0.746600 0.748900 0.782300 0.810200 0.750500 0.753200 0.771800 0.745500 0.424400 0.668700 0.821300 0.821400 0.481500 0.769600 0.545300 0.891300 0.630900 0.990500 0.775200 0.902500 0.889300 0.862800 0.498500 0.994000 0.510400 0.591000 0.701700 0.742000 0.762800 0.769600 0.739600 0.753200 0.677100 0.799000
30 SGDClassifier 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.752700 0.754100 0.615200 0.754100 0.765700 0.767000 0.611800 0.659200 0.402000 0.672100 0.830000 0.830000 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.555400 0.851600 0.625500 0.733400 0.498500 0.994000 0.431200 0.568600 0.457800 0.792100 0.722000 0.766200 0.733300 0.764500 0.604400 0.798100
28 RidgeClassifier 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.750000 0.751500 0.596800 0.748100 0.764000 0.766200 0.663500 0.674700 0.417900 0.669500 0.833400 0.833500 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.565000 0.861100 0.700500 0.743700 0.498500 0.994000 0.436300 0.575500 0.441700 0.791200 0.725000 0.767000 0.696000 0.773900 0.595000 0.793800
29 RidgeClassifierCV 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.750000 0.751500 0.597300 0.748100 0.763100 0.765300 0.664900 0.675600 0.418300 0.670400 0.833400 0.833500 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.562900 0.859400 0.700500 0.743700 0.498500 0.994000 0.436300 0.575500 0.441700 0.791200 0.724600 0.767000 0.694400 0.773100 0.589700 0.792100
4 CategoricalNB 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.742600 0.744600 0.745200 0.783400 0.759400 0.761900 0.735700 0.717900 0.500400 0.647100 0.839300 0.839500 0.513200 0.771400 0.489900 0.899100 0.498900 0.995700 0.686800 0.864500 0.838700 0.831800 0.498500 0.994000 0.552200 0.591900 0.645600 0.743700 0.759400 0.770500 0.734500 0.751500 0.648100 0.771400
8 ExtraTreeClassifier 0.598700 0.974100 0.499100 0.996500 0.537500 0.941300 0.499100 0.996500 0.680600 0.680800 0.792500 0.815400 0.752300 0.753200 0.730300 0.685900 0.561100 0.610000 0.777600 0.780000 0.581600 0.691100 0.633400 0.874000 0.771600 0.995700 0.728100 0.866300 0.824700 0.797200 0.496100 0.984500 0.487200 0.516000 0.649800 0.769600 0.657100 0.703200 0.689400 0.737700 0.687600 0.769600
6 DecisionTreeClassifier 0.561600 0.968100 0.499100 0.996500 0.500100 0.920600 0.499100 0.996500 0.695400 0.695400 0.805800 0.815400 0.743300 0.743700 0.751700 0.706600 0.522200 0.574600 0.756400 0.761000 0.595000 0.713500 0.669700 0.887000 0.598300 0.993100 0.788800 0.896500 0.851400 0.815400 0.496700 0.987100 0.497700 0.523700 0.685800 0.792100 0.680100 0.717900 0.706100 0.749800 0.689000 0.769600
2 BernoulliNB 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.340400 0.505600 0.258700 0.634200 0.338100 0.510800 0.228000 0.509100 0.402000 0.672100 0.360000 0.562600 0.435200 0.770500 0.491000 0.901600 0.498900 0.995700 0.309300 0.766200 0.211400 0.452100 0.498500 0.994000 0.210300 0.460700 0.441700 0.791200 0.401000 0.669500 0.405600 0.682500 0.439400 0.761900
22 MultinomialNB 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.679300 0.684200 0.262500 0.634200 0.693100 0.704900 0.236000 0.511600 0.402000 0.672100 0.638300 0.653100 0.439700 0.773100 0.473900 0.900800 0.498900 0.995700 0.292500 0.764500 0.447500 0.638500 0.498500 0.994000 0.330600 0.496100 0.441700 0.791200 0.472000 0.671300 0.441200 0.685900 0.432100 0.761000
15 LabelPropagation 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.332000 0.497000 0.431900 0.349400 0.328500 0.489200 0.552800 0.644500 0.246900 0.327900 0.360000 0.562600 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.619400 0.899100 0.519000 0.531500 0.498500 0.994000 0.316900 0.439200 0.441700 0.791200 0.401000 0.669500 0.405600 0.682500 0.432100 0.761000
7 DummyClassifier 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.334700 0.503000 0.258700 0.634200 0.338100 0.510800 0.224600 0.508200 0.402000 0.672100 0.360000 0.562600 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.288600 0.763600 0.206700 0.449500 0.498500 0.994000 0.210300 0.460700 0.441700 0.791200 0.401000 0.669500 0.405600 0.682500 0.432100 0.761000
16 LabelSpreading 0.496100 0.984500 0.499800 0.999100 0.489400 0.958600 0.499100 0.996500 0.332000 0.497000 0.433900 0.350300 0.328500 0.489200 0.553900 0.645400 0.246900 0.327900 0.360000 0.562600 0.436000 0.773100 0.473900 0.900800 0.498900 0.995700 0.619400 0.899100 0.517800 0.530600 0.498500 0.994000 0.319500 0.440000 0.441700 0.791200 0.401000 0.669500 0.405600 0.682500 0.432100 0.761000
23 NearestCentroid 0.378200 0.550500 0.379600 0.611700 0.412800 0.551300 0.422200 0.698900 0.744500 0.746300 0.625900 0.704100 0.771900 0.773900 0.540700 0.528900 0.534700 0.561700 0.807600 0.807600 0.545600 0.567700 0.407800 0.476300 0.444600 0.757500 0.577800 0.718700 0.682700 0.720400 0.382200 0.589300 0.527500 0.573800 0.654200 0.679900 0.748700 0.753200 0.699500 0.705800 0.638500 0.678200
5 ComplementNB 0.379500 0.553100 0.377600 0.601400 0.410900 0.543600 0.402000 0.647100 0.629000 0.629000 0.399400 0.541000 0.673700 0.675600 0.441900 0.477100 0.503100 0.527200 0.696300 0.698000 0.558500 0.614300 0.419300 0.497000 0.415600 0.675600 0.376300 0.591900 0.429600 0.485800 0.357200 0.532400 0.467500 0.556500 0.628900 0.675600 0.634200 0.656600 0.644700 0.660100 0.576500 0.601400
24 PassiveAggressiveClassifier 0.496100 0.984500 0.499800 0.999100 0.519300 0.871400 0.499100 0.996500 0.716600 0.722200 0.524800 0.736800 0.746300 0.749800 0.573400 0.641900 0.478500 0.478900 0.805000 0.805000 0.484400 0.769600 0.503400 0.895600 0.498900 0.995700 0.607400 0.791200 0.669900 0.707500 0.498500 0.994000 0.450000 0.535800 0.479200 0.777400 0.764900 0.778300 0.419400 0.685100 0.495200 0.504700
25 Perceptron 0.496100 0.984500 0.499800 0.999100 0.504900 0.951700 0.499100 0.996500 0.457700 0.558200 0.525700 0.723000 0.686000 0.702300 0.614500 0.644500 0.411900 0.672100 0.734000 0.750600 0.185000 0.226900 0.473900 0.900800 0.498900 0.995700 0.581100 0.816200 0.763600 0.775700 0.498100 0.992200 0.327900 0.422800 0.445200 0.789500 0.456100 0.685100 0.534900 0.535800 0.500300 0.504700
In [22]:
results_df.shape
Out[22]:
(32, 43)
In [23]:
results_df['x'] = np.linspace(0, 1, 32)
In [24]:
results_df.head()
Out[24]:
Model BIRC5macro_score BIRC5micro_score CCNB1macro_score CCNB1micro_score MYBL2macro_score MYBL2micro_score MMP11macro_score MMP11micro_score GRB7macro_score ... CTSVmicro_score MKI67macro_score MKI67micro_score ERBB2macro_score ERBB2micro_score GUSBmacro_score GUSBmicro_score ESR1macro_score ESR1micro_score x
9 ExtraTreesClassifier 0.5965 0.9862 0.4998 0.9991 0.4892 0.9577 0.4991 0.9965 0.7887 ... 0.5591 0.7297 0.8361 0.7134 0.7463 0.7483 0.7869 0.7589 0.8438 0.000000
27 RandomForestClassifier 0.5965 0.9862 0.4998 0.9991 0.4888 0.9560 0.4991 0.9965 0.7863 ... 0.5850 0.7219 0.8343 0.7311 0.7636 0.7564 0.7929 0.7520 0.8404 0.032258
13 HistGradientBoostingClassifier 0.5915 0.9853 0.4998 0.9991 0.5076 0.9560 0.4991 0.9965 0.7708 ... 0.5764 0.6997 0.8162 0.7323 0.7653 0.7363 0.7791 0.7325 0.8318 0.064516
1 BaggingClassifier 0.5790 0.9827 0.4998 0.9991 0.5202 0.9508 0.4991 0.9965 0.7532 ... 0.5695 0.7251 0.8231 0.7275 0.7541 0.7396 0.7739 0.7420 0.8274 0.096774
21 MLPClassifier 0.4961 0.9845 0.4998 0.9991 0.4894 0.9586 0.4991 0.9965 0.7557 ... 0.6299 0.7124 0.8335 0.7444 0.7670 0.7368 0.7912 0.7065 0.8248 0.129032

5 rows × 44 columns

In [25]:
# results_df.drop(columns=['x'], inplace=True)
In [26]:
results_df.to_csv('../results/model/multiclassclf_results.csv', index=False)
In [27]:
import plotly.io as pio

pio.renderers.default = "vscode"
In [28]:
import pandas as pd
import plotly.express as px

# Load results
file_path = "../results/model/multiclassclf_results.csv"   # update path if needed
df = pd.read_csv(file_path)

# Reshape into long format
df_long = df.melt(id_vars=["Model"], var_name="Gene_Metric", value_name="Score")

# Split "Gene_Metric" into "Gene" and "Metric"
df_long[["Gene", "Metric"]] = df_long["Gene_Metric"].str.extract(r"(.+?)(macro_score|micro_score)")

# Interactive line chart (first 6 genes to keep it readable)
genes_to_plot = df_long["Gene"].unique()
subset = df_long[df_long["Gene"].isin(genes_to_plot)]

fig = px.line(
    subset,
    x="Model",
    y="Score",
    color="Gene",
    line_dash="Metric",   # dashed line for macro/micro
    markers=True,
    title="Gene-wise Macro and Micro Scores Across Models (first 6 genes)",
    hover_data={"Score": ":.4f", "Model": True, "Gene": True, "Metric": True},
)

fig.update_layout(
    xaxis_tickangle=45,
    legend_title_text="Gene - Metric",
    width=1000,
    height=600
)

fig.show()

SELECT MODEL AND FINETUNING¶

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

baseline_pipeline = Pipeline(steps=[
    ('classifier', MultiOutputClassifier(LogisticRegression(max_iter=5000)))
])

baseline_pipeline.fit(X_train, y_train)

y_pred = baseline_pipeline.predict(X_val)
from pprint import pprint
# Calculate the f1_score. Micro and Macro
row_tup, name_col = custom_f1score(y_val, y_pred, "Baseline Logistic Regression", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Logistic Regression', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7546, 0.7558, 0.6496, 0.7558, 0.7645, 0.7662, 0.6713, 0.679, 0.4225, 0.6695, 0.8384, 0.8387, 0.444, 0.7748, 0.4821, 0.8999, 0.4989, 0.9957, 0.5599, 0.8576, 0.7917, 0.8059, 0.4985, 0.994, 0.4679, 0.5815, 0.604, 0.8041, 0.7427, 0.7731, 0.7303, 0.7817, 0.6865, 0.8154)

RANDOM_SEARCH_LOGISTIC¶

In [30]:
# Logistic Regression with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold

logistic_pipeline = Pipeline(steps=[
  ('classifier', MultiOutputClassifier(LogisticRegression(max_iter=5000))) # Increased max_iter
])

# Note: Parameters of the underlying estimator inside MultiOutputClassifier
# are accessed via 'classifier__estimator__<param>'
param_grid_logistic = {
    'classifier__estimator__C': np.logspace(-4, 4, 20),
    'classifier__estimator__solver': ['liblinear', 'lbfgs']
}

# Use KFold (not StratifiedKFold) for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

random_search_logistic = RandomizedSearchCV(
    logistic_pipeline,
    param_distributions=param_grid_logistic,
    n_iter=10,
    cv=cv_reduced,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search_logistic.fit(X_train, y_train)

y_val_pred = random_search_logistic.predict(X_val)

row_tup, name_col = custom_f1score(y_val, y_val_pred, "Logistic Regression", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Logistic Regression', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7555, 0.7567, 0.6499, 0.7558, 0.7645, 0.7662, 0.6713, 0.679, 0.4225, 0.6695, 0.8384, 0.8387, 0.444, 0.7748, 0.4818, 0.8991, 0.4989, 0.9957, 0.5585, 0.8568, 0.7925, 0.8067, 0.4985, 0.994, 0.4711, 0.5833, 0.6143, 0.805, 0.7419, 0.7722, 0.7295, 0.7808, 0.6886, 0.8162)

RANDOMFORESTCLASSIFIER MODEL FOR CLASSIFICATION¶

In [31]:
# Baseline model: RandomForest with basic settings
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

random_forest_pipeline.fit(X_train, y_train)

y_val_pred = random_forest_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5965, 0.9862, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7922, 0.7929, 0.8522, 0.8723, 0.8131, 0.8145, 0.813, 0.7817, 0.5961, 0.6997, 0.8521, 0.8525, 0.6331, 0.7929, 0.708, 0.9241, 0.4989, 0.9957, 0.7811, 0.9292, 0.9019, 0.8783, 0.4985, 0.994, 0.563, 0.585, 0.7228, 0.8352, 0.7193, 0.7532, 0.7495, 0.7877, 0.7418, 0.8395)

RANDOM_SEARCH_RANDOM_FOREST¶

In [32]:
# Random Forest with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid_random_forest = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use KFold for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

random_search_random_forest = RandomizedSearchCV(random_forest_pipeline, param_distributions=param_grid_random_forest, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_random_forest.fit(X_train, y_train)

y_val_pred = random_search_random_forest.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.782, 0.7826, 0.8251, 0.8499, 0.7996, 0.8016, 0.8045, 0.7765, 0.5181, 0.6782, 0.847, 0.8473, 0.5719, 0.7826, 0.6658, 0.9146, 0.4989, 0.9957, 0.7658, 0.9301, 0.8959, 0.8714, 0.4985, 0.994, 0.5951, 0.6135, 0.6957, 0.8231, 0.7394, 0.7688, 0.7511, 0.7903, 0.7034, 0.824)

XGBCLASSIFIER MODEL FOR CLASSIFICATION¶

In [33]:
# Baseline model: XGBClassifier wrapped for multi-output
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
])

xgboost_pipeline.fit(X_train, y_train)

y_val_pred = xgboost_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[33], line 7
      1 # Baseline model: XGBClassifier wrapped for multi-output
      2 xgboost_pipeline = Pipeline(steps=[
      3     ('preprocessor', preprocessor),
      4     ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
      5 ])
----> 7 xgboost_pipeline.fit(X_train, y_train)
      9 y_val_pred = xgboost_pipeline.predict(X_val)
     10 row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1358     estimator._validate_params()
   1360 with config_context(
   1361     skip_parameter_validation=(
   1362         prefer_skip_nested_validation or global_skip_validation
   1363     )
   1364 ):
-> 1365     return fit_method(estimator, *args, **kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/pipeline.py:663, in Pipeline.fit(self, X, y, **params)
    657     if self._final_estimator != "passthrough":
    658         last_step_params = self._get_metadata_for_step(
    659             step_idx=len(self) - 1,
    660             step_params=routed_params[self.steps[-1][0]],
    661             all_params=params,
    662         )
--> 663         self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    665 return self

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:547, in MultiOutputClassifier.fit(self, X, Y, sample_weight, **fit_params)
    521 def fit(self, X, Y, sample_weight=None, **fit_params):
    522     """Fit the model to data matrix X and targets Y.
    523 
    524     Parameters
   (...)    545         Returns a fitted instance.
    546     """
--> 547     super().fit(X, Y, sample_weight=sample_weight, **fit_params)
    548     self.classes_ = [estimator.classes_ for estimator in self.estimators_]
    549     return self

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1358     estimator._validate_params()
   1360 with config_context(
   1361     skip_parameter_validation=(
   1362         prefer_skip_nested_validation or global_skip_validation
   1363     )
   1364 ):
-> 1365     return fit_method(estimator, *args, **kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:278, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)
    275     if sample_weight is not None:
    276         routed_params.estimator.fit["sample_weight"] = sample_weight
--> 278 self.estimators_ = Parallel(n_jobs=self.n_jobs)(
    279     delayed(_fit_estimator)(
    280         self.estimator, X, y[:, i], **routed_params.estimator.fit
    281     )
    282     for i in range(y.shape[1])
    283 )
    285 if hasattr(self.estimators_[0], "n_features_in_"):
    286     self.n_features_in_ = self.estimators_[0].n_features_in_

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/utils/parallel.py:82, in Parallel.__call__(self, iterable)
     73 warning_filters = warnings.filters
     74 iterable_with_config_and_warning_filters = (
     75     (
     76         _with_config_and_warning_filters(delayed_func, config, warning_filters),
   (...)     80     for delayed_func, args, kwargs in iterable
     81 )
---> 82 return super().__call__(iterable_with_config_and_warning_filters)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/joblib/parallel.py:1986, in Parallel.__call__(self, iterable)
   1984     output = self._get_sequential_output(iterable)
   1985     next(output)
-> 1986     return output if self.return_generator else list(output)
   1988 # Let's create an ID that uniquely identifies the current call. If the
   1989 # call is interrupted early and that the same instance is immediately
   1990 # reused, this id will be used to prevent workers that were
   1991 # concurrently finalizing a task from the previous call to run the
   1992 # callback.
   1993 with self._lock:

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/joblib/parallel.py:1914, in Parallel._get_sequential_output(self, iterable)
   1912 self.n_dispatched_batches += 1
   1913 self.n_dispatched_tasks += 1
-> 1914 res = func(*args, **kwargs)
   1915 self.n_completed_tasks += 1
   1916 self.print_progress()

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/utils/parallel.py:147, in _FuncWrapper.__call__(self, *args, **kwargs)
    145 with config_context(**config), warnings.catch_warnings():
    146     warnings.filters = warning_filters
--> 147     return self.function(*args, **kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:67, in _fit_estimator(estimator, X, y, sample_weight, **fit_params)
     65     estimator.fit(X, y, sample_weight=sample_weight, **fit_params)
     66 else:
---> 67     estimator.fit(X, y, **fit_params)
     68 return estimator

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/xgboost/core.py:705, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    703 for k, arg in zip(sig.parameters, args):
    704     kwargs[k] = arg
--> 705 return func(**kwargs)

File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/xgboost/sklearn.py:1640, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1635     expected_classes = self.classes_
   1636 if (
   1637     classes.shape != expected_classes.shape
   1638     or not (classes == expected_classes).all()
   1639 ):
-> 1640     raise ValueError(
   1641         f"Invalid classes inferred from unique values of `y`.  "
   1642         f"Expected: {expected_classes}, got {classes}"
   1643     )
   1645 params = self.get_xgb_params()
   1647 if callable(self.objective):

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [-1.  1.]
In [35]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -------------------------
# 1. Generate toy dataset
# -------------------------
X = np.random.rand(20, 5)  # 20 samples, 5 features
y = np.random.choice([-1, 1], size=20)  # labels in {-1, 1}

# -------------------------
# 2. Map labels -1 → 0, 1 → 1
# -------------------------
y_mapped = (y == 1).astype(int)  # {-1,1} → {0,1}

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, random_state=42)

# -------------------------
# 3. Train XGBoost model
# -------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "binary:logistic",  # binary classification
    "eval_metric": "logloss"
}
model = xgb.train(params, dtrain, num_boost_round=20)

# -------------------------
# 4. Make predictions
# -------------------------
y_pred_prob = model.predict(dtest)             # probabilities for class 1
y_pred = (y_pred_prob > 0.5).astype(int)       # threshold at 0.5
y_pred_original = np.where(y_pred == 1, 1, -1) # map back {0,1} → {-1,1}

# -------------------------
# 5. Evaluate
# -------------------------
y_test_original = np.where(y_test == 1, 1, -1)
acc = accuracy_score(y_test_original, y_pred_original)

print("True labels:     ", y_test_original)
print("Predicted labels:", y_pred_original)
print(f"Accuracy: {acc:.4f}")
True labels:      [ 1 -1 -1  1  1 -1]
Predicted labels: [ 1  1 -1  1 -1  1]
Accuracy: 0.5000
In [ ]:
param_grid_xgboost = {
    'classifier__estimator__n_estimators': [100, 200, 300],
    'classifier__estimator__max_depth': [3, 6, 9, 12],
    'classifier__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__estimator__subsample': [0.6, 0.8, 1.0],
    'classifier__estimator__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__estimator__gamma': [0, 0.1, 0.2, 0.3],
    'classifier__estimator__min_child_weight': [1, 3, 5]
}

# Use KFold for multi-output targets
from sklearn.model_selection import KFold
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

# Rebuild the pipeline to ensure wrapped estimator for search
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
])

random_search_xgboost = RandomizedSearchCV(xgboost_pipeline, param_distributions=param_grid_xgboost, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_xgboost.fit(X_train, y_train)

y_val_pred = random_search_xgboost.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
print(f"Best Parameters: {random_search_xgboost.best_params_}")
Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7728, 0.7731, 0.8088, 0.8378, 0.7944, 0.7955, 0.7875, 0.755, 0.4492, 0.6756, 0.846, 0.8464, 0.576, 0.7852, 0.659, 0.9129, 0.6418, 0.9957, 0.7343, 0.9189, 0.8965, 0.8714, 0.4985, 0.994, 0.619, 0.635, 0.6903, 0.8214, 0.74, 0.7705, 0.7584, 0.7955, 0.7144, 0.8283)
Best Parameters: {'classifier__estimator__subsample': 0.8, 'classifier__estimator__n_estimators': 200, 'classifier__estimator__min_child_weight': 5, 'classifier__estimator__max_depth': 3, 'classifier__estimator__learning_rate': 0.05, 'classifier__estimator__gamma': 0, 'classifier__estimator__colsample_bytree': 0.6}
In [ ]:
#  Predict on the validation set using the best model
y_val_pred = random_search_xgboost.best_estimator_.predict(X_val)

# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred)

conf_matrices = []
if y_true_np.ndim == 2:
    for j in range(y_true_np.shape[1]):
        conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
    conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))

# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for output {idx}:")
    print(cm)
Confusion Matrix for output 0:
[[   0   18]
 [   0 1141]]
Confusion Matrix for output 1:
[[   0    1]
 [   0 1158]]
Confusion Matrix for output 2:
[[   0   48]
 [   0 1111]]
Confusion Matrix for output 3:
[[   0    4]
 [   0 1155]]
Confusion Matrix for output 4:
[[470 113]
 [150 426]]
Confusion Matrix for output 5:
[[679   0  56]
 [  0 178   0]
 [132   0 114]]
Confusion Matrix for output 6:
[[503  89]
 [148 419]]
Confusion Matrix for output 7:
[[200   0 192]
 [  0 178   0]
 [ 92   0 497]]
Confusion Matrix for output 8:
[[763  16]
 [360  20]]
Confusion Matrix for output 9:
[[462  45]
 [133 519]]
Confusion Matrix for output 10:
[[ 48 215]
 [ 34 862]]
Confusion Matrix for output 11:
[[  29   86]
 [  15 1029]]
Confusion Matrix for output 12:
[[   1    4]
 [   1 1153]]
Confusion Matrix for output 13:
[[ 16   0  80]
 [  0 178   0]
 [ 14   0 871]]
Confusion Matrix for output 14:
[[488   0  33]
 [  0 178   0]
 [116   0 344]]
Confusion Matrix for output 15:
[[   0    7]
 [   0 1152]]
Confusion Matrix for output 16:
[[288  32  76]
 [ 28 124  77]
 [126  84 324]]
Confusion Matrix for output 17:
[[ 99 143]
 [ 64 853]]
Confusion Matrix for output 18:
[[248 135]
 [131 645]]
Confusion Matrix for output 19:
[[234 134]
 [103 688]]
Confusion Matrix for output 20:
[[114 163]
 [ 36 846]]
In [ ]:
# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
    ax = plt.subplot(rows, cols, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()
No description has been provided for this image

DECISIONTREECLASSIFIER MODEL FOR CLASSIFICATION¶

In [ ]:
# Baseline model: DecisionTree
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=1))
])

decision_tree_pipeline.fit(X_train, y_train)

y_val_pred = decision_tree_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5532, 0.9638, 0.4985, 0.994, 0.518, 0.9189, 0.4983, 0.9931, 0.6908, 0.6911, 0.8017, 0.811, 0.7477, 0.7481, 0.7831, 0.7386, 0.5768, 0.6221, 0.788, 0.7903, 0.6062, 0.7248, 0.6614, 0.887, 0.6418, 0.9957, 0.7549, 0.8913, 0.8539, 0.8162, 0.4978, 0.9914, 0.5006, 0.5229, 0.6698, 0.7834, 0.7101, 0.7489, 0.7003, 0.7455, 0.6703, 0.7593)
In [ ]:
# Decision Tree with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

param_grid_decision_tree = {
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use KFold for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

random_search_decision_tree = RandomizedSearchCV(decision_tree_pipeline, param_distributions=param_grid_decision_tree, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_decision_tree.fit(X_train, y_train)

y_val_pred = random_search_decision_tree.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5435, 0.9836, 0.4998, 0.9991, 0.5321, 0.9482, 0.4991, 0.9965, 0.6981, 0.6989, 0.7674, 0.8041, 0.7617, 0.7636, 0.7766, 0.7291, 0.5408, 0.6393, 0.7955, 0.7964, 0.5817, 0.7092, 0.6732, 0.9016, 0.6096, 0.994, 0.7485, 0.9008, 0.8729, 0.843, 0.4985, 0.994, 0.5602, 0.5764, 0.692, 0.8024, 0.7099, 0.7455, 0.7349, 0.7722, 0.6885, 0.7748)
In [ ]:
#  Predict on the validation set using the best model
y_val_pred = random_search_decision_tree.best_estimator_.predict(X_val)

# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred)

conf_matrices = []
if y_true_np.ndim == 2:
    for j in range(y_true_np.shape[1]):
        conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
    conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))

# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for output {idx}:")
    print(cm)
Confusion Matrix for output 0:
[[   1   17]
 [   2 1139]]
Confusion Matrix for output 1:
[[   0    1]
 [   0 1158]]
Confusion Matrix for output 2:
[[   3   45]
 [  15 1096]]
Confusion Matrix for output 3:
[[   0    4]
 [   0 1155]]
Confusion Matrix for output 4:
[[435 148]
 [201 375]]
Confusion Matrix for output 5:
[[660   1  74]
 [  0 178   0]
 [151   1  94]]
Confusion Matrix for output 6:
[[494  98]
 [176 391]]
Confusion Matrix for output 7:
[[237   1 154]
 [  0 178   0]
 [158   1 430]]
Confusion Matrix for output 8:
[[639 140]
 [278 102]]
Confusion Matrix for output 9:
[[424  83]
 [153 499]]
Confusion Matrix for output 10:
[[ 91 172]
 [165 731]]
Confusion Matrix for output 11:
[[  38   77]
 [  37 1007]]
Confusion Matrix for output 12:
[[   1    4]
 [   3 1151]]
Confusion Matrix for output 13:
[[ 26   0  70]
 [  0 178   0]
 [ 43   2 840]]
Confusion Matrix for output 14:
[[463   1  57]
 [  0 178   0]
 [123   1 336]]
Confusion Matrix for output 15:
[[   0    7]
 [   0 1152]]
Confusion Matrix for output 16:
[[253  38 105]
 [ 39 115  75]
 [113 121 300]]
Confusion Matrix for output 17:
[[118 124]
 [105 812]]
Confusion Matrix for output 18:
[[229 154]
 [141 635]]
Confusion Matrix for output 19:
[[230 138]
 [126 665]]
Confusion Matrix for output 20:
[[144 133]
 [128 754]]
In [ ]:
# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
    ax = plt.subplot(rows, cols, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()
No description has been provided for this image

GRADIENTBOOSTINGCLASSIFIER MODEL FOR CLASSIFICATION¶

In [ ]:
# Baseline model: GradientBoosting (single-output estimator applied to multi-output via independent fits is not supported directly)
# Keep pipeline for reference; compute accuracy if it runs, otherwise suggest using MultiOutputClassifier wrapping another estimator.
gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=1))
])

# Attempt fit/predict per output if y is multi-output
import numpy as np

y_train_np = np.asarray(y_train)
y_val_np = np.asarray(y_val)

if y_train_np.ndim == 2:
    # Fit one-vs-output manually
    preds = []
    for j in range(y_train_np.shape[1]):
        gb = Pipeline(steps=[('preprocessor', preprocessor), ("classifier", GradientBoostingClassifier(random_state=1))])
        gb.fit(X_train, y_train_np[:, j])
        preds.append(gb.predict(X_val))
    y_val_pred_np = np.column_stack(preds)
else:
    gradient_boosting_pipeline.fit(X_train, y_train)
    y_val_pred_np = gradient_boosting_pipeline.predict(X_val)

row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Random Forest', 0.5435, 0.9836, 0.4998, 0.9991, 0.5321, 0.9482, 0.4991, 0.9965, 0.6981, 0.6989, 0.7674, 0.8041, 0.7617, 0.7636, 0.7766, 0.7291, 0.5408, 0.6393, 0.7955, 0.7964, 0.5817, 0.7092, 0.6732, 0.9016, 0.6096, 0.994, 0.7485, 0.9008, 0.8729, 0.843, 0.4985, 0.994, 0.5602, 0.5764, 0.692, 0.8024, 0.7099, 0.7455, 0.7349, 0.7722, 0.6885, 0.7748)
In [ ]:
gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

param_grid_gradient_boosting = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__max_features': ['auto', 'sqrt', 'log2']
}

# Manual per-output tuning since GradientBoostingClassifier is single-output
from sklearn.model_selection import KFold
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)

import numpy as np

y_train_np = np.asarray(y_train)
y_val_np = np.asarray(y_val)

best_models = []
for j in range(y_train_np.shape[1] if y_train_np.ndim == 2 else 1):
    rs = RandomizedSearchCV(gradient_boosting_pipeline, param_distributions=param_grid_gradient_boosting, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
    y_target = y_train_np[:, j] if y_train_np.ndim == 2 else y_train_np
    rs.fit(X_train, y_target)
    best_models.append(rs.best_estimator_)

# Predict using per-output best models
if y_val_np.ndim == 2:
    preds = [model.predict(X_val) for model in best_models]
    y_val_pred_np = np.column_stack(preds)
else:
    y_val_pred_np = best_models[0].predict(X_val)

# Compute micro/macro
row_tup, name_col = custom_f1score(y_val, y_val_pred_np, "Baseline Gradient Boosting", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score']
('Baseline Gradient Boosting', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7752, 0.7757, 0.8274, 0.8473, 0.7975, 0.799, 0.7879, 0.7575, 0.4129, 0.6747, 0.8444, 0.8447, 0.5752, 0.7947, 0.6755, 0.918, 0.7851, 0.9974, 0.7444, 0.9249, 0.8925, 0.8671, 0.4985, 0.994, 0.6179, 0.6359, 0.6978, 0.824, 0.7425, 0.7739, 0.7383, 0.786, 0.7204, 0.8283)
In [ ]:
#  Predict on the validation set using the best per-output GB models
# (y_val_pred_np already computed in previous cell)

# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred_np)

conf_matrices = []
if y_true_np.ndim == 2:
    for j in range(y_true_np.shape[1]):
        conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
    conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))

# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for output {idx}:")
    print(cm)
Confusion Matrix for output 0:
[[   0   18]
 [   0 1141]]
Confusion Matrix for output 1:
[[   0    1]
 [   0 1158]]
Confusion Matrix for output 2:
[[   0   48]
 [   0 1111]]
Confusion Matrix for output 3:
[[   0    4]
 [   0 1155]]
Confusion Matrix for output 4:
[[475 108]
 [152 424]]
Confusion Matrix for output 5:
[[672   0  63]
 [  0 178   0]
 [114   0 132]]
Confusion Matrix for output 6:
[[512  80]
 [153 414]]
Confusion Matrix for output 7:
[[195   0 197]
 [  0 178   0]
 [ 84   0 505]]
Confusion Matrix for output 8:
[[778   1]
 [376   4]]
Confusion Matrix for output 9:
[[464  43]
 [137 515]]
Confusion Matrix for output 10:
[[ 44 219]
 [ 19 877]]
Confusion Matrix for output 11:
[[  31   84]
 [  11 1033]]
Confusion Matrix for output 12:
[[   2    3]
 [   0 1154]]
Confusion Matrix for output 13:
[[ 17   0  79]
 [  0 177   1]
 [  7   0 878]]
Confusion Matrix for output 14:
[[487   0  34]
 [  1 177   0]
 [119   0 341]]
Confusion Matrix for output 15:
[[   0    7]
 [   0 1152]]
Confusion Matrix for output 16:
[[290  31  75]
 [ 34 122  73]
 [119  90 325]]
Confusion Matrix for output 17:
[[103 139]
 [ 65 852]]
Confusion Matrix for output 18:
[[246 137]
 [125 651]]
Confusion Matrix for output 19:
[[208 160]
 [ 88 703]]
Confusion Matrix for output 20:
[[120 157]
 [ 42 840]]
In [ ]:
# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
    ax = plt.subplot(rows, cols, i+1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()
No description has been provided for this image

PERFORMANCE ON THE TEST SET¶

In [ ]:
models = [random_search_logistic, random_search_random_forest, random_search_xgboost, random_search_decision_tree, gradient_boosting_pipeline]
model_names = ['LogReg', 'RF', 'XGB', 'DT', 'GB']

# Compute ROC curves per output (treating each output as binary/multiclass-ovo not covered here)
# We'll assume binary outputs for ROC; if multiclass per output, switch to one-vs-rest ROC per class or use macro-averaged AUC.
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import numpy as np

# Limit to first output for quick visualization if outputs > 1
output_index = 0

plt.figure(figsize=(10, 8))
for i, model in enumerate(models):
    # Try to get probabilities for the selected output
    est = model.best_estimator_ if hasattr(model, 'best_estimator_') else model
    # Handle MultiOutputClassifier vs single estimator
    try:
        # Multi-output wrapped estimators: est.named_steps['classifier'].estimators_
        clf = est.named_steps['classifier']
        if hasattr(clf, 'estimators_'):
            # Get proba for the selected output from the wrapped estimator
            proba = clf.estimators_[output_index].predict_proba(X_val)
        else:
            proba = est.predict_proba(X_val)
    except Exception:
        # Fallback: skip if proba not available
        continue

    # If predict_proba returned a list (multi-output from single estimator), pick the selected output
    if isinstance(proba, list):
        if len(proba) <= output_index:
            continue
        proba_sel = proba[output_index]
    else:
        proba_sel = proba

    y_true = np.asarray(y_val)[:, output_index] if np.asarray(y_val).ndim == 2 else np.asarray(y_val)

    # If multiclass for this output, binarize one class (class 1) for demonstration
    classes = np.unique(y_true)
    if len(classes) > 2:
        y_true_bin = (y_true == classes[1]).astype(int)
        # Map the chosen class index into the columns of proba_sel
        class_idx = list(classes).index(classes[1]) if proba_sel.shape[1] == len(classes) else 1
        y_score = proba_sel[:, class_idx]
    else:
        # binary
        y_true_bin = (y_true == classes[-1]).astype(int)
        y_score = proba_sel[:, 1] if proba_sel.shape[1] > 1 else proba_sel.ravel()

    fpr, tpr, _ = roc_curve(y_true_bin, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{model_names[i]} (AUC = {roc_auc:.2f})')

# Plot ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) Curve - Output {output_index}')
plt.legend(loc='lower right')
plt.show()
No description has been provided for this image
In [ ]:
# Save test predictions for multi-output to CSV (per-label columns, inverse-transformed)
# Prefer tuned XGBoost model if available
import numpy as np
import pandas as pd

# Choose the best available XGBoost-based model
if 'random_search_xgboost' in globals():
    best_xgb = random_search_xgboost.best_estimator_ if hasattr(random_search_xgboost, 'best_estimator_') else random_search_xgboost
elif 'xgboost_pipeline' in globals():
    best_xgb = xgboost_pipeline
else:
    raise RuntimeError("No XGBoost model found. Run the XGBoost training cell before exporting predictions.")

# Predict on the hold-out X_test
test_predictions = best_xgb.predict(X_test)

# Normalize to 2D numpy array
if isinstance(test_predictions, list):
    test_predictions = np.column_stack(test_predictions)

test_pred_np = np.asarray(test_predictions)

# Build DataFrame and inverse-transform using saved encoders if present
if isinstance(y_train, pd.DataFrame):
    pred_df = pd.DataFrame(test_pred_np, columns=y_train.columns, index=X_test.index)
    if 'encoders' in globals() and isinstance(encoders, dict):
        for col in pred_df.columns:
            le = encoders.get(col)
            if le is not None:
                pred_df[col] = le.inverse_transform(pred_df[col].astype(int))
else:
    pred_df = pd.DataFrame({'prediction': test_pred_np.ravel()}, index=X_test.index)
    if 'encoders' in globals() and isinstance(encoders, dict):
        le = encoders.get('_single')
        if le is not None:
            pred_df['prediction'] = le.inverse_transform(pred_df['prediction'].astype(int))

# Save to CSV with index (IDs)
output_path = '../pool-datasets/clf/submission_multioutput.csv'
pred_df.to_csv(output_path, index=True)
print(f"Saved multi-output predictions to {output_path} with shape {pred_df.shape}")
print(pred_df.head())
Saved multi-output predictions to ../pool-datasets/clf/submission_multioutput.csv with shape (1449, 21)
                 BIRC5  CCNB1  MYBL2  MMP11  GRB7  PGR  BCL2  SCUBE2  GSTM1  \
TCGA-EW-A6S9-01    1.0    1.0    1.0    1.0   1.0  0.0   1.0     0.0   -1.0   
TCGA-GM-A3XG-01    1.0    1.0    1.0    1.0   1.0  0.0   1.0     0.0    1.0   
F2500              1.0    1.0    1.0    1.0  -1.0 -1.0  -1.0     1.0   -1.0   
F1775              1.0    1.0    1.0    1.0  -1.0 -1.0  -1.0    -1.0   -1.0   
TCGA-E9-A1NE-01    1.0    1.0    1.0    1.0   1.0  0.0   1.0     0.0   -1.0   

                 BAG1  ...  ACTB  GAPDH  RPLP0  TFRC  AURKA  CTSV  MKI67  \
TCGA-EW-A6S9-01   1.0  ...   1.0    1.0    0.0   0.0    1.0   1.0    1.0   
TCGA-GM-A3XG-01   1.0  ...   1.0    1.0    0.0   0.0    1.0   1.0    1.0   
F2500            -1.0  ...   1.0    1.0    1.0  -1.0    1.0  -1.0    1.0   
F1775            -1.0  ...   1.0    1.0    1.0  -1.0    1.0  -1.0    1.0   
TCGA-E9-A1NE-01   1.0  ...   1.0    1.0    0.0   0.0    1.0   1.0    1.0   

                 ERBB2  GUSB  ESR1  
TCGA-EW-A6S9-01    1.0   1.0   1.0  
TCGA-GM-A3XG-01    1.0   1.0   1.0  
F2500             -1.0   1.0   1.0  
F1775             -1.0   1.0   1.0  
TCGA-E9-A1NE-01    1.0   1.0   1.0  

[5 rows x 21 columns]
In [ ]:
y_test.loc['TCGA-EW-A6S9-01']
Out[ ]:
BIRC5     1.0
CCNB1     1.0
MYBL2     1.0
MMP11     1.0
GRB7      1.0
PGR       0.0
BCL2      1.0
SCUBE2    0.0
GSTM1     1.0
BAG1      1.0
CD68      1.0
ACTB      1.0
GAPDH     1.0
RPLP0     0.0
TFRC      0.0
AURKA     1.0
CTSV      1.0
MKI67     1.0
ERBB2     1.0
GUSB      1.0
ESR1      1.0
Name: TCGA-EW-A6S9-01, dtype: float64
In [ ]:
pred_df.loc['TCGA-EW-A6S9-01']
Out[ ]:
BIRC5     1.0
CCNB1     1.0
MYBL2     1.0
MMP11     1.0
GRB7      1.0
PGR       0.0
BCL2      1.0
SCUBE2    0.0
GSTM1    -1.0
BAG1      1.0
CD68      1.0
ACTB      1.0
GAPDH     1.0
RPLP0     0.0
TFRC      0.0
AURKA     1.0
CTSV      1.0
MKI67     1.0
ERBB2     1.0
GUSB      1.0
ESR1      1.0
Name: TCGA-EW-A6S9-01, dtype: float64
In [ ]:
y_val_np = np.asarray(y_test)
y_val_pred_np = np.asarray(pred_df)

rf_micro = (y_val_pred_np == y_val_np).mean()
if y_val_np.ndim == 2:
    rf_macro = (y_val_pred_np == y_val_np).mean(axis=0).mean()
else:
    from sklearn.metrics import accuracy_score
    rf_macro = accuracy_score(y_val_np, y_val_pred_np)

print(f"Best Model Test Accuracy (micro): {rf_micro:.4f}")
print(f"Best Model Test Accuracy (macro): {rf_macro:.4f}")
print(f"Best Parameters: {random_search_logistic.best_params_}")
Best Model Test Accuracy (micro): 0.8518
Best Model Test Accuracy (macro): 0.8518
Best Parameters: {'classifier__estimator__solver': 'lbfgs', 'classifier__estimator__C': np.float64(3792.690190732246)}