In [1]:
import warnings
warnings.filterwarnings('ignore')
IMPORTS¶
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import (train_test_split)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
COMMON SET UP¶
In [3]:
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
DATA LOADING¶
In [4]:
df = pd.read_csv("../pool-datasets/clf/oncotypedx_subset_transpose.csv", index_col=0)
In [5]:
df.shape
Out[5]:
(7244, 25)
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 7244 entries, MBC-MBCProject_wAiri7fp-Tumor-SM-AZ5DH to F3265repl Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BIRC5 7244 non-null float64 1 CCNB1 7244 non-null float64 2 MYBL2 7244 non-null float64 3 MMP11 7244 non-null float64 4 GRB7 7244 non-null float64 5 PGR 7244 non-null float64 6 BCL2 7244 non-null float64 7 SCUBE2 7244 non-null float64 8 GSTM1 7244 non-null float64 9 BAG1 7244 non-null float64 10 CD68 7244 non-null float64 11 ACTB 7244 non-null float64 12 GAPDH 7244 non-null float64 13 RPLP0 7244 non-null float64 14 TFRC 7244 non-null float64 15 AURKA 7244 non-null float64 16 CTSV 7244 non-null float64 17 MKI67 7244 non-null float64 18 ERBB2 7244 non-null float64 19 GUSB 7244 non-null float64 20 ESR1 7244 non-null float64 21 IQGAP1 7244 non-null float64 22 IQGAP2 7244 non-null float64 23 FRG1 7244 non-null float64 24 EEF1A2 6144 non-null float64 dtypes: float64(25) memory usage: 1.4+ MB
In [7]:
df.describe(include="all")
Out[7]:
BIRC5 | CCNB1 | MYBL2 | MMP11 | GRB7 | PGR | BCL2 | SCUBE2 | GSTM1 | BAG1 | ... | AURKA | CTSV | MKI67 | ERBB2 | GUSB | ESR1 | IQGAP1 | IQGAP2 | FRG1 | EEF1A2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | ... | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 7244.000000 | 6144.000000 |
mean | 0.962452 | 0.994754 | 0.926560 | 0.993374 | -0.041137 | -0.442021 | -0.066538 | 0.180839 | -0.342076 | 0.088349 | ... | 0.987300 | 0.110850 | 0.573716 | 0.370514 | 0.384594 | 0.523744 | 1.199757 | 1.453168 | 1.739550 | 1.276638 |
std | 0.271472 | 0.102300 | 0.376173 | 0.114936 | 0.999222 | 0.807996 | 0.997853 | 0.903084 | 0.939737 | 0.996158 | ... | 0.158879 | 0.884617 | 0.819111 | 0.928891 | 0.923149 | 0.851935 | 0.835877 | 1.014409 | 0.499565 | 1.126547 |
min | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0.082729 | 0.017385 | 0.004461 | 0.000000 |
25% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | 1.000000 | -1.000000 | 1.000000 | -1.000000 | -1.000000 | 1.000000 | 0.501858 | 0.692744 | 1.425864 | 0.277508 |
50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -1.000000 | -1.000000 | -1.000000 | 1.000000 | -1.000000 | 1.000000 | ... | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.731611 | 1.196585 | 1.632566 | 1.028403 |
75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.882118 | 1.984385 | 1.964976 | 2.004722 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 4.338948 | 6.246560 | 5.467065 | 7.079042 |
8 rows × 25 columns
In [8]:
df.fillna(df.mean(), inplace=True)
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 7244 entries, MBC-MBCProject_wAiri7fp-Tumor-SM-AZ5DH to F3265repl Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BIRC5 7244 non-null float64 1 CCNB1 7244 non-null float64 2 MYBL2 7244 non-null float64 3 MMP11 7244 non-null float64 4 GRB7 7244 non-null float64 5 PGR 7244 non-null float64 6 BCL2 7244 non-null float64 7 SCUBE2 7244 non-null float64 8 GSTM1 7244 non-null float64 9 BAG1 7244 non-null float64 10 CD68 7244 non-null float64 11 ACTB 7244 non-null float64 12 GAPDH 7244 non-null float64 13 RPLP0 7244 non-null float64 14 TFRC 7244 non-null float64 15 AURKA 7244 non-null float64 16 CTSV 7244 non-null float64 17 MKI67 7244 non-null float64 18 ERBB2 7244 non-null float64 19 GUSB 7244 non-null float64 20 ESR1 7244 non-null float64 21 IQGAP1 7244 non-null float64 22 IQGAP2 7244 non-null float64 23 FRG1 7244 non-null float64 24 EEF1A2 7244 non-null float64 dtypes: float64(25) memory usage: 1.4+ MB
In [10]:
train, test = train_test_split(df,test_size=0.20,random_state=SEED)
y_train, X_train = train.drop(['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2'],axis=1).copy(), train[['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2']].copy()
y_test, X_test = test.drop(['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2'],axis=1).copy(), test[['IQGAP1', 'IQGAP2', 'FRG1', 'EEF1A2']].copy()
del(train)
del(test)
In [11]:
len(X_train), len(X_test), len(y_train), len(y_test)
print(f'Train Samples: {len(X_train), len(y_train)} and Test Samples: {len(X_test), len(y_test)}')
Train Samples: (5795, 5795) and Test Samples: (1449, 1449)
IQGAP1
andEEF1A2
are oncogeneIQGAP2
andFRG1
are tumor suppressor
Gene | Role in Cancer |
---|---|
IQGAP1 | Predominantly oncogene, with rare, context-specific exceptions |
IQGAP2 | Clear tumor suppressor across multiple cancer types |
FRG1 | Tumor suppressor, particularly in breast and prostate cancer |
EEF1A2 | Clearly functions as an oncogene |
Data Exploration¶
In [12]:
X_train.plot(kind="scatter", x="IQGAP1",y="FRG1", grid=True)
plt.show() #BEFORE removing 0 values
In [13]:
X_train.plot(kind="scatter", x="EEF1A2",y="IQGAP2", grid=True)
Out[13]:
<Axes: xlabel='EEF1A2', ylabel='IQGAP2'>
In [14]:
X_train.plot(kind="scatter", x="EEF1A2",y="IQGAP1", grid=True)
Out[14]:
<Axes: xlabel='EEF1A2', ylabel='IQGAP1'>
In [15]:
X_train.plot(kind="scatter", x="FRG1",y="IQGAP2", grid=True)
Out[15]:
<Axes: xlabel='FRG1', ylabel='IQGAP2'>
In the SE crimes are dominated by part 2 and violent crimes, in NW crimes are more general
In [16]:
import matplotlib.pyplot as plt
X_train.hist(bins=50, figsize=(12,8))
plt.show() #clearly victim age has some 0 values so we will remove that
In [17]:
y_train.hist(bins=50, figsize=(12,8))
plt.show()
NOTE: Dataset is imbalanced
DATA PREPROCESSING¶
No, preprocessing as of now.
TRAIN, TEST, DEV SPLITS¶
In [18]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Simple preprocessor placeholder used by some pipelines below
preprocessor = 'passthrough'
CUSTOM FUNC FOR F1 MICRO AND MACRO¶
In [19]:
# Define a function to create a pipeline with a given model
def custom_f1score(y_val, y_pred, model_name, f1_score): # take (samples, outputs)
name_col = None
n_col = []
row_tup = tuple()
row_tup = row_tup + (model_name,) # Add model name to the tuple
for idx, col in enumerate(y_val.columns):
macro_acc = f1_score(y_val[col].values, y_pred[:, idx], average='macro')
row_tup = row_tup + (round(macro_acc, 4),)
n_col.append(col+'macro_score')
micro_acc = f1_score(y_val[col].values, y_pred[:, idx], average='micro')
row_tup = row_tup + (round(micro_acc, 4),)
n_col.append(col+'micro_score')
name_col = n_col
return row_tup, name_col
RUNNING ALL AVAILABEL ESTIMATORS ON SCIKIT-LEARN¶
In [20]:
from sklearn.utils import all_estimators
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
# Get all sklearn classifiers
all_classifiers = all_estimators(type_filter="classifier")
results = []
name_col = None
for name, ClfClass in all_classifiers:
try:
clf = MultiOutputClassifier(ClfClass())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_pred, name, f1_score)
# Append the results
results.append(row_tup)
del row_tup # Delete to save the memory!
except Exception:
pass # Ignore models that fail
# Sort and show
results_df = pd.DataFrame(results, columns=["Model"] + name_col)
results_df = results_df.sort_values(by="ESR1micro_score", ascending=False)
In [21]:
from IPython.display import display
display(results_df.style.set_table_attributes("style='display:inline'").set_table_styles(
[{'selector':'table', 'props': [('overflow', 'scroll'), ('display', 'block')]}]
))
Model | BIRC5macro_score | BIRC5micro_score | CCNB1macro_score | CCNB1micro_score | MYBL2macro_score | MYBL2micro_score | MMP11macro_score | MMP11micro_score | GRB7macro_score | GRB7micro_score | PGRmacro_score | PGRmicro_score | BCL2macro_score | BCL2micro_score | SCUBE2macro_score | SCUBE2micro_score | GSTM1macro_score | GSTM1micro_score | BAG1macro_score | BAG1micro_score | CD68macro_score | CD68micro_score | ACTBmacro_score | ACTBmicro_score | GAPDHmacro_score | GAPDHmicro_score | RPLP0macro_score | RPLP0micro_score | TFRCmacro_score | TFRCmicro_score | AURKAmacro_score | AURKAmicro_score | CTSVmacro_score | CTSVmicro_score | MKI67macro_score | MKI67micro_score | ERBB2macro_score | ERBB2micro_score | GUSBmacro_score | GUSBmicro_score | ESR1macro_score | ESR1micro_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | ExtraTreesClassifier | 0.596500 | 0.986200 | 0.499800 | 0.999100 | 0.489200 | 0.957700 | 0.499100 | 0.996500 | 0.788700 | 0.789500 | 0.863300 | 0.882700 | 0.823600 | 0.824800 | 0.816800 | 0.786000 | 0.607500 | 0.697200 | 0.852100 | 0.852500 | 0.656300 | 0.799800 | 0.709600 | 0.923200 | 0.665800 | 0.996500 | 0.819900 | 0.934400 | 0.900800 | 0.878300 | 0.498500 | 0.994000 | 0.524800 | 0.559100 | 0.729700 | 0.836100 | 0.713400 | 0.746300 | 0.748300 | 0.786900 | 0.758900 | 0.843800 |
27 | RandomForestClassifier | 0.596500 | 0.986200 | 0.499800 | 0.999100 | 0.488800 | 0.956000 | 0.499100 | 0.996500 | 0.786300 | 0.786900 | 0.852200 | 0.872300 | 0.814700 | 0.816200 | 0.817100 | 0.785200 | 0.576100 | 0.676400 | 0.849500 | 0.849900 | 0.628600 | 0.780000 | 0.709600 | 0.921500 | 0.498900 | 0.995700 | 0.770200 | 0.923200 | 0.900300 | 0.876600 | 0.498500 | 0.994000 | 0.560600 | 0.585000 | 0.721900 | 0.834300 | 0.731100 | 0.763600 | 0.756400 | 0.792900 | 0.752000 | 0.840400 |
13 | HistGradientBoostingClassifier | 0.591500 | 0.985300 | 0.499800 | 0.999100 | 0.507600 | 0.956000 | 0.499100 | 0.996500 | 0.770800 | 0.771400 | 0.825000 | 0.847300 | 0.797700 | 0.799000 | 0.792000 | 0.756700 | 0.535100 | 0.661800 | 0.850300 | 0.850700 | 0.604500 | 0.780000 | 0.697000 | 0.916300 | 0.498700 | 0.994800 | 0.774900 | 0.915400 | 0.888900 | 0.861900 | 0.498300 | 0.993100 | 0.555900 | 0.576400 | 0.699700 | 0.816200 | 0.732300 | 0.765300 | 0.736300 | 0.779100 | 0.732500 | 0.831800 |
1 | BaggingClassifier | 0.579000 | 0.982700 | 0.499800 | 0.999100 | 0.520200 | 0.950800 | 0.499100 | 0.996500 | 0.753200 | 0.755000 | 0.819100 | 0.848100 | 0.793000 | 0.794700 | 0.784700 | 0.742000 | 0.549300 | 0.649700 | 0.824500 | 0.824800 | 0.610700 | 0.741200 | 0.693400 | 0.906800 | 0.498500 | 0.994000 | 0.800300 | 0.918900 | 0.895400 | 0.870600 | 0.497800 | 0.991400 | 0.550500 | 0.569500 | 0.725100 | 0.823100 | 0.727500 | 0.754100 | 0.739600 | 0.773900 | 0.742000 | 0.827400 |
21 | MLPClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.755700 | 0.756700 | 0.791700 | 0.821400 | 0.787000 | 0.788600 | 0.778600 | 0.751500 | 0.410200 | 0.667800 | 0.841800 | 0.842100 | 0.557700 | 0.783400 | 0.629300 | 0.910300 | 0.498900 | 0.995700 | 0.698500 | 0.909400 | 0.877000 | 0.857600 | 0.498500 | 0.994000 | 0.609800 | 0.629900 | 0.712400 | 0.833500 | 0.744400 | 0.767000 | 0.736800 | 0.791200 | 0.706500 | 0.824800 |
0 | AdaBoostClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.769400 | 0.769600 | 0.738900 | 0.780000 | 0.762600 | 0.764500 | 0.720700 | 0.705800 | 0.402000 | 0.672100 | 0.839200 | 0.839500 | 0.571900 | 0.785200 | 0.505800 | 0.899900 | 0.498700 | 0.994800 | 0.659200 | 0.895600 | 0.876100 | 0.858500 | 0.498300 | 0.993100 | 0.574600 | 0.616900 | 0.664800 | 0.812800 | 0.717800 | 0.763600 | 0.730900 | 0.768800 | 0.712800 | 0.824800 |
12 | GradientBoostingClassifier | 0.538900 | 0.981900 | 0.498900 | 0.995700 | 0.488300 | 0.954300 | 0.499100 | 0.996500 | 0.779600 | 0.780000 | 0.815300 | 0.839500 | 0.787000 | 0.788600 | 0.788100 | 0.755800 | 0.457800 | 0.676400 | 0.843500 | 0.843800 | 0.576700 | 0.783400 | 0.692000 | 0.917200 | 0.609600 | 0.994000 | 0.757300 | 0.923200 | 0.893100 | 0.867100 | 0.497000 | 0.987900 | 0.623900 | 0.638500 | 0.699300 | 0.826600 | 0.742700 | 0.773100 | 0.747000 | 0.787700 | 0.706700 | 0.824000 |
11 | GaussianProcessClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.762600 | 0.763600 | 0.767500 | 0.816200 | 0.794000 | 0.795500 | 0.740500 | 0.727400 | 0.434400 | 0.671300 | 0.845200 | 0.845600 | 0.534300 | 0.787700 | 0.590600 | 0.907700 | 0.498900 | 0.995700 | 0.677100 | 0.897300 | 0.850200 | 0.840400 | 0.498500 | 0.994000 | 0.605100 | 0.628100 | 0.669300 | 0.814500 | 0.743600 | 0.772200 | 0.761900 | 0.800700 | 0.706800 | 0.823100 |
31 | SVC | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.763600 | 0.764500 | 0.765400 | 0.813600 | 0.783300 | 0.786000 | 0.732500 | 0.723900 | 0.402000 | 0.672100 | 0.842000 | 0.842100 | 0.436000 | 0.773100 | 0.551800 | 0.905100 | 0.498900 | 0.995700 | 0.654200 | 0.897300 | 0.854200 | 0.843800 | 0.498500 | 0.994000 | 0.597500 | 0.629900 | 0.662600 | 0.820500 | 0.741100 | 0.771400 | 0.756900 | 0.799800 | 0.681000 | 0.817100 |
19 | LogisticRegression | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.754600 | 0.755800 | 0.649600 | 0.755800 | 0.764500 | 0.766200 | 0.671300 | 0.679000 | 0.422500 | 0.669500 | 0.838400 | 0.838700 | 0.444000 | 0.774800 | 0.482100 | 0.899900 | 0.498900 | 0.995700 | 0.559900 | 0.857600 | 0.791700 | 0.805900 | 0.498500 | 0.994000 | 0.467900 | 0.581500 | 0.604000 | 0.804100 | 0.742700 | 0.773100 | 0.730300 | 0.781700 | 0.686500 | 0.815400 |
20 | LogisticRegressionCV | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.754600 | 0.755800 | 0.643900 | 0.755000 | 0.766200 | 0.767900 | 0.669000 | 0.683300 | 0.402000 | 0.672100 | 0.838400 | 0.838700 | 0.444000 | 0.774800 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.559400 | 0.857600 | 0.789400 | 0.804100 | 0.498500 | 0.994000 | 0.429600 | 0.578100 | 0.602500 | 0.809300 | 0.748500 | 0.780000 | 0.730300 | 0.781700 | 0.684500 | 0.814500 |
3 | CalibratedClassifierCV | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.753700 | 0.755000 | 0.616300 | 0.748100 | 0.763600 | 0.765300 | 0.675700 | 0.683300 | 0.419900 | 0.668700 | 0.839300 | 0.839500 | 0.440000 | 0.773900 | 0.481200 | 0.897300 | 0.498900 | 0.995700 | 0.561400 | 0.859400 | 0.768800 | 0.788600 | 0.498500 | 0.994000 | 0.446100 | 0.572000 | 0.615500 | 0.808500 | 0.745200 | 0.775700 | 0.732400 | 0.783400 | 0.679100 | 0.812800 |
17 | LinearDiscriminantAnalysis | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.750000 | 0.751500 | 0.682600 | 0.760100 | 0.764000 | 0.766200 | 0.684200 | 0.688500 | 0.422500 | 0.669500 | 0.833400 | 0.833500 | 0.436000 | 0.773100 | 0.481800 | 0.899100 | 0.498900 | 0.995700 | 0.577500 | 0.858500 | 0.784700 | 0.799000 | 0.498500 | 0.994000 | 0.465900 | 0.579800 | 0.453000 | 0.789500 | 0.739300 | 0.767900 | 0.719100 | 0.776500 | 0.668500 | 0.811000 |
14 | KNeighborsClassifier | 0.495900 | 0.983600 | 0.499800 | 0.999100 | 0.488300 | 0.954300 | 0.499100 | 0.996500 | 0.737800 | 0.738600 | 0.765200 | 0.812800 | 0.781300 | 0.782600 | 0.762200 | 0.738600 | 0.528500 | 0.611700 | 0.815500 | 0.816200 | 0.587600 | 0.748100 | 0.637000 | 0.909400 | 0.665800 | 0.996500 | 0.705100 | 0.884400 | 0.849300 | 0.835200 | 0.498500 | 0.994000 | 0.555500 | 0.572900 | 0.690300 | 0.810200 | 0.714200 | 0.749800 | 0.702300 | 0.747200 | 0.703900 | 0.806700 |
18 | LinearSVC | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.750000 | 0.751500 | 0.598900 | 0.747200 | 0.765000 | 0.767000 | 0.669300 | 0.679000 | 0.418300 | 0.670400 | 0.836800 | 0.836900 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.561400 | 0.859400 | 0.766200 | 0.786000 | 0.498500 | 0.994000 | 0.443100 | 0.578100 | 0.564900 | 0.805000 | 0.748600 | 0.774800 | 0.731300 | 0.786000 | 0.633400 | 0.803300 |
10 | GaussianNB | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.498700 | 0.994800 | 0.736600 | 0.739400 | 0.803200 | 0.814500 | 0.757300 | 0.760100 | 0.788400 | 0.748100 | 0.493600 | 0.658300 | 0.816200 | 0.816200 | 0.562200 | 0.643700 | 0.480000 | 0.893900 | 0.498700 | 0.994800 | 0.756200 | 0.875800 | 0.891300 | 0.865400 | 0.498500 | 0.994000 | 0.473800 | 0.564300 | 0.661400 | 0.692800 | 0.762700 | 0.767900 | 0.721500 | 0.729100 | 0.684800 | 0.802400 |
26 | QuadraticDiscriminantAnalysis | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.498700 | 0.994800 | 0.746600 | 0.748900 | 0.782300 | 0.810200 | 0.750500 | 0.753200 | 0.771800 | 0.745500 | 0.424400 | 0.668700 | 0.821300 | 0.821400 | 0.481500 | 0.769600 | 0.545300 | 0.891300 | 0.630900 | 0.990500 | 0.775200 | 0.902500 | 0.889300 | 0.862800 | 0.498500 | 0.994000 | 0.510400 | 0.591000 | 0.701700 | 0.742000 | 0.762800 | 0.769600 | 0.739600 | 0.753200 | 0.677100 | 0.799000 |
30 | SGDClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.752700 | 0.754100 | 0.615200 | 0.754100 | 0.765700 | 0.767000 | 0.611800 | 0.659200 | 0.402000 | 0.672100 | 0.830000 | 0.830000 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.555400 | 0.851600 | 0.625500 | 0.733400 | 0.498500 | 0.994000 | 0.431200 | 0.568600 | 0.457800 | 0.792100 | 0.722000 | 0.766200 | 0.733300 | 0.764500 | 0.604400 | 0.798100 |
28 | RidgeClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.750000 | 0.751500 | 0.596800 | 0.748100 | 0.764000 | 0.766200 | 0.663500 | 0.674700 | 0.417900 | 0.669500 | 0.833400 | 0.833500 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.565000 | 0.861100 | 0.700500 | 0.743700 | 0.498500 | 0.994000 | 0.436300 | 0.575500 | 0.441700 | 0.791200 | 0.725000 | 0.767000 | 0.696000 | 0.773900 | 0.595000 | 0.793800 |
29 | RidgeClassifierCV | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.750000 | 0.751500 | 0.597300 | 0.748100 | 0.763100 | 0.765300 | 0.664900 | 0.675600 | 0.418300 | 0.670400 | 0.833400 | 0.833500 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.562900 | 0.859400 | 0.700500 | 0.743700 | 0.498500 | 0.994000 | 0.436300 | 0.575500 | 0.441700 | 0.791200 | 0.724600 | 0.767000 | 0.694400 | 0.773100 | 0.589700 | 0.792100 |
4 | CategoricalNB | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.742600 | 0.744600 | 0.745200 | 0.783400 | 0.759400 | 0.761900 | 0.735700 | 0.717900 | 0.500400 | 0.647100 | 0.839300 | 0.839500 | 0.513200 | 0.771400 | 0.489900 | 0.899100 | 0.498900 | 0.995700 | 0.686800 | 0.864500 | 0.838700 | 0.831800 | 0.498500 | 0.994000 | 0.552200 | 0.591900 | 0.645600 | 0.743700 | 0.759400 | 0.770500 | 0.734500 | 0.751500 | 0.648100 | 0.771400 |
8 | ExtraTreeClassifier | 0.598700 | 0.974100 | 0.499100 | 0.996500 | 0.537500 | 0.941300 | 0.499100 | 0.996500 | 0.680600 | 0.680800 | 0.792500 | 0.815400 | 0.752300 | 0.753200 | 0.730300 | 0.685900 | 0.561100 | 0.610000 | 0.777600 | 0.780000 | 0.581600 | 0.691100 | 0.633400 | 0.874000 | 0.771600 | 0.995700 | 0.728100 | 0.866300 | 0.824700 | 0.797200 | 0.496100 | 0.984500 | 0.487200 | 0.516000 | 0.649800 | 0.769600 | 0.657100 | 0.703200 | 0.689400 | 0.737700 | 0.687600 | 0.769600 |
6 | DecisionTreeClassifier | 0.561600 | 0.968100 | 0.499100 | 0.996500 | 0.500100 | 0.920600 | 0.499100 | 0.996500 | 0.695400 | 0.695400 | 0.805800 | 0.815400 | 0.743300 | 0.743700 | 0.751700 | 0.706600 | 0.522200 | 0.574600 | 0.756400 | 0.761000 | 0.595000 | 0.713500 | 0.669700 | 0.887000 | 0.598300 | 0.993100 | 0.788800 | 0.896500 | 0.851400 | 0.815400 | 0.496700 | 0.987100 | 0.497700 | 0.523700 | 0.685800 | 0.792100 | 0.680100 | 0.717900 | 0.706100 | 0.749800 | 0.689000 | 0.769600 |
2 | BernoulliNB | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.340400 | 0.505600 | 0.258700 | 0.634200 | 0.338100 | 0.510800 | 0.228000 | 0.509100 | 0.402000 | 0.672100 | 0.360000 | 0.562600 | 0.435200 | 0.770500 | 0.491000 | 0.901600 | 0.498900 | 0.995700 | 0.309300 | 0.766200 | 0.211400 | 0.452100 | 0.498500 | 0.994000 | 0.210300 | 0.460700 | 0.441700 | 0.791200 | 0.401000 | 0.669500 | 0.405600 | 0.682500 | 0.439400 | 0.761900 |
22 | MultinomialNB | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.679300 | 0.684200 | 0.262500 | 0.634200 | 0.693100 | 0.704900 | 0.236000 | 0.511600 | 0.402000 | 0.672100 | 0.638300 | 0.653100 | 0.439700 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.292500 | 0.764500 | 0.447500 | 0.638500 | 0.498500 | 0.994000 | 0.330600 | 0.496100 | 0.441700 | 0.791200 | 0.472000 | 0.671300 | 0.441200 | 0.685900 | 0.432100 | 0.761000 |
15 | LabelPropagation | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.332000 | 0.497000 | 0.431900 | 0.349400 | 0.328500 | 0.489200 | 0.552800 | 0.644500 | 0.246900 | 0.327900 | 0.360000 | 0.562600 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.619400 | 0.899100 | 0.519000 | 0.531500 | 0.498500 | 0.994000 | 0.316900 | 0.439200 | 0.441700 | 0.791200 | 0.401000 | 0.669500 | 0.405600 | 0.682500 | 0.432100 | 0.761000 |
7 | DummyClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.334700 | 0.503000 | 0.258700 | 0.634200 | 0.338100 | 0.510800 | 0.224600 | 0.508200 | 0.402000 | 0.672100 | 0.360000 | 0.562600 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.288600 | 0.763600 | 0.206700 | 0.449500 | 0.498500 | 0.994000 | 0.210300 | 0.460700 | 0.441700 | 0.791200 | 0.401000 | 0.669500 | 0.405600 | 0.682500 | 0.432100 | 0.761000 |
16 | LabelSpreading | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.489400 | 0.958600 | 0.499100 | 0.996500 | 0.332000 | 0.497000 | 0.433900 | 0.350300 | 0.328500 | 0.489200 | 0.553900 | 0.645400 | 0.246900 | 0.327900 | 0.360000 | 0.562600 | 0.436000 | 0.773100 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.619400 | 0.899100 | 0.517800 | 0.530600 | 0.498500 | 0.994000 | 0.319500 | 0.440000 | 0.441700 | 0.791200 | 0.401000 | 0.669500 | 0.405600 | 0.682500 | 0.432100 | 0.761000 |
23 | NearestCentroid | 0.378200 | 0.550500 | 0.379600 | 0.611700 | 0.412800 | 0.551300 | 0.422200 | 0.698900 | 0.744500 | 0.746300 | 0.625900 | 0.704100 | 0.771900 | 0.773900 | 0.540700 | 0.528900 | 0.534700 | 0.561700 | 0.807600 | 0.807600 | 0.545600 | 0.567700 | 0.407800 | 0.476300 | 0.444600 | 0.757500 | 0.577800 | 0.718700 | 0.682700 | 0.720400 | 0.382200 | 0.589300 | 0.527500 | 0.573800 | 0.654200 | 0.679900 | 0.748700 | 0.753200 | 0.699500 | 0.705800 | 0.638500 | 0.678200 |
5 | ComplementNB | 0.379500 | 0.553100 | 0.377600 | 0.601400 | 0.410900 | 0.543600 | 0.402000 | 0.647100 | 0.629000 | 0.629000 | 0.399400 | 0.541000 | 0.673700 | 0.675600 | 0.441900 | 0.477100 | 0.503100 | 0.527200 | 0.696300 | 0.698000 | 0.558500 | 0.614300 | 0.419300 | 0.497000 | 0.415600 | 0.675600 | 0.376300 | 0.591900 | 0.429600 | 0.485800 | 0.357200 | 0.532400 | 0.467500 | 0.556500 | 0.628900 | 0.675600 | 0.634200 | 0.656600 | 0.644700 | 0.660100 | 0.576500 | 0.601400 |
24 | PassiveAggressiveClassifier | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.519300 | 0.871400 | 0.499100 | 0.996500 | 0.716600 | 0.722200 | 0.524800 | 0.736800 | 0.746300 | 0.749800 | 0.573400 | 0.641900 | 0.478500 | 0.478900 | 0.805000 | 0.805000 | 0.484400 | 0.769600 | 0.503400 | 0.895600 | 0.498900 | 0.995700 | 0.607400 | 0.791200 | 0.669900 | 0.707500 | 0.498500 | 0.994000 | 0.450000 | 0.535800 | 0.479200 | 0.777400 | 0.764900 | 0.778300 | 0.419400 | 0.685100 | 0.495200 | 0.504700 |
25 | Perceptron | 0.496100 | 0.984500 | 0.499800 | 0.999100 | 0.504900 | 0.951700 | 0.499100 | 0.996500 | 0.457700 | 0.558200 | 0.525700 | 0.723000 | 0.686000 | 0.702300 | 0.614500 | 0.644500 | 0.411900 | 0.672100 | 0.734000 | 0.750600 | 0.185000 | 0.226900 | 0.473900 | 0.900800 | 0.498900 | 0.995700 | 0.581100 | 0.816200 | 0.763600 | 0.775700 | 0.498100 | 0.992200 | 0.327900 | 0.422800 | 0.445200 | 0.789500 | 0.456100 | 0.685100 | 0.534900 | 0.535800 | 0.500300 | 0.504700 |
In [22]:
results_df.shape
Out[22]:
(32, 43)
In [23]:
results_df['x'] = np.linspace(0, 1, 32)
In [24]:
results_df.head()
Out[24]:
Model | BIRC5macro_score | BIRC5micro_score | CCNB1macro_score | CCNB1micro_score | MYBL2macro_score | MYBL2micro_score | MMP11macro_score | MMP11micro_score | GRB7macro_score | ... | CTSVmicro_score | MKI67macro_score | MKI67micro_score | ERBB2macro_score | ERBB2micro_score | GUSBmacro_score | GUSBmicro_score | ESR1macro_score | ESR1micro_score | x | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | ExtraTreesClassifier | 0.5965 | 0.9862 | 0.4998 | 0.9991 | 0.4892 | 0.9577 | 0.4991 | 0.9965 | 0.7887 | ... | 0.5591 | 0.7297 | 0.8361 | 0.7134 | 0.7463 | 0.7483 | 0.7869 | 0.7589 | 0.8438 | 0.000000 |
27 | RandomForestClassifier | 0.5965 | 0.9862 | 0.4998 | 0.9991 | 0.4888 | 0.9560 | 0.4991 | 0.9965 | 0.7863 | ... | 0.5850 | 0.7219 | 0.8343 | 0.7311 | 0.7636 | 0.7564 | 0.7929 | 0.7520 | 0.8404 | 0.032258 |
13 | HistGradientBoostingClassifier | 0.5915 | 0.9853 | 0.4998 | 0.9991 | 0.5076 | 0.9560 | 0.4991 | 0.9965 | 0.7708 | ... | 0.5764 | 0.6997 | 0.8162 | 0.7323 | 0.7653 | 0.7363 | 0.7791 | 0.7325 | 0.8318 | 0.064516 |
1 | BaggingClassifier | 0.5790 | 0.9827 | 0.4998 | 0.9991 | 0.5202 | 0.9508 | 0.4991 | 0.9965 | 0.7532 | ... | 0.5695 | 0.7251 | 0.8231 | 0.7275 | 0.7541 | 0.7396 | 0.7739 | 0.7420 | 0.8274 | 0.096774 |
21 | MLPClassifier | 0.4961 | 0.9845 | 0.4998 | 0.9991 | 0.4894 | 0.9586 | 0.4991 | 0.9965 | 0.7557 | ... | 0.6299 | 0.7124 | 0.8335 | 0.7444 | 0.7670 | 0.7368 | 0.7912 | 0.7065 | 0.8248 | 0.129032 |
5 rows × 44 columns
In [25]:
# results_df.drop(columns=['x'], inplace=True)
In [26]:
results_df.to_csv('../results/model/multiclassclf_results.csv', index=False)
In [27]:
import plotly.io as pio
pio.renderers.default = "vscode"
In [28]:
import pandas as pd
import plotly.express as px
# Load results
file_path = "../results/model/multiclassclf_results.csv" # update path if needed
df = pd.read_csv(file_path)
# Reshape into long format
df_long = df.melt(id_vars=["Model"], var_name="Gene_Metric", value_name="Score")
# Split "Gene_Metric" into "Gene" and "Metric"
df_long[["Gene", "Metric"]] = df_long["Gene_Metric"].str.extract(r"(.+?)(macro_score|micro_score)")
# Interactive line chart (first 6 genes to keep it readable)
genes_to_plot = df_long["Gene"].unique()
subset = df_long[df_long["Gene"].isin(genes_to_plot)]
fig = px.line(
subset,
x="Model",
y="Score",
color="Gene",
line_dash="Metric", # dashed line for macro/micro
markers=True,
title="Gene-wise Macro and Micro Scores Across Models (first 6 genes)",
hover_data={"Score": ":.4f", "Model": True, "Gene": True, "Metric": True},
)
fig.update_layout(
xaxis_tickangle=45,
legend_title_text="Gene - Metric",
width=1000,
height=600
)
fig.show()
SELECT MODEL AND FINETUNING¶
In [29]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
baseline_pipeline = Pipeline(steps=[
('classifier', MultiOutputClassifier(LogisticRegression(max_iter=5000)))
])
baseline_pipeline.fit(X_train, y_train)
y_pred = baseline_pipeline.predict(X_val)
from pprint import pprint
# Calculate the f1_score. Micro and Macro
row_tup, name_col = custom_f1score(y_val, y_pred, "Baseline Logistic Regression", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Logistic Regression', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7546, 0.7558, 0.6496, 0.7558, 0.7645, 0.7662, 0.6713, 0.679, 0.4225, 0.6695, 0.8384, 0.8387, 0.444, 0.7748, 0.4821, 0.8999, 0.4989, 0.9957, 0.5599, 0.8576, 0.7917, 0.8059, 0.4985, 0.994, 0.4679, 0.5815, 0.604, 0.8041, 0.7427, 0.7731, 0.7303, 0.7817, 0.6865, 0.8154)
RANDOM_SEARCH_LOGISTIC¶
In [30]:
# Logistic Regression with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
logistic_pipeline = Pipeline(steps=[
('classifier', MultiOutputClassifier(LogisticRegression(max_iter=5000))) # Increased max_iter
])
# Note: Parameters of the underlying estimator inside MultiOutputClassifier
# are accessed via 'classifier__estimator__<param>'
param_grid_logistic = {
'classifier__estimator__C': np.logspace(-4, 4, 20),
'classifier__estimator__solver': ['liblinear', 'lbfgs']
}
# Use KFold (not StratifiedKFold) for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)
random_search_logistic = RandomizedSearchCV(
logistic_pipeline,
param_distributions=param_grid_logistic,
n_iter=10,
cv=cv_reduced,
verbose=1,
random_state=42,
n_jobs=-1
)
random_search_logistic.fit(X_train, y_train)
y_val_pred = random_search_logistic.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Logistic Regression", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits ['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Logistic Regression', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7555, 0.7567, 0.6499, 0.7558, 0.7645, 0.7662, 0.6713, 0.679, 0.4225, 0.6695, 0.8384, 0.8387, 0.444, 0.7748, 0.4818, 0.8991, 0.4989, 0.9957, 0.5585, 0.8568, 0.7925, 0.8067, 0.4985, 0.994, 0.4711, 0.5833, 0.6143, 0.805, 0.7419, 0.7722, 0.7295, 0.7808, 0.6886, 0.8162)
RANDOMFORESTCLASSIFIER MODEL FOR CLASSIFICATION¶
In [31]:
# Baseline model: RandomForest with basic settings
random_forest_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])
random_forest_pipeline.fit(X_train, y_train)
y_val_pred = random_forest_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Random Forest', 0.5965, 0.9862, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7922, 0.7929, 0.8522, 0.8723, 0.8131, 0.8145, 0.813, 0.7817, 0.5961, 0.6997, 0.8521, 0.8525, 0.6331, 0.7929, 0.708, 0.9241, 0.4989, 0.9957, 0.7811, 0.9292, 0.9019, 0.8783, 0.4985, 0.994, 0.563, 0.585, 0.7228, 0.8352, 0.7193, 0.7532, 0.7495, 0.7877, 0.7418, 0.8395)
RANDOM_SEARCH_RANDOM_FOREST¶
In [32]:
# Random Forest with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
random_forest_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(random_state=42))
])
param_grid_random_forest = {
'classifier__n_estimators': [100, 200, 300],
'classifier__max_features': ['auto', 'sqrt', 'log2'],
'classifier__max_depth': [10, 20, 30, None],
'classifier__min_samples_split': [2, 5, 10],
'classifier__min_samples_leaf': [1, 2, 4]
}
# Use KFold for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)
random_search_random_forest = RandomizedSearchCV(random_forest_pipeline, param_distributions=param_grid_random_forest, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_random_forest.fit(X_train, y_train)
y_val_pred = random_search_random_forest.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits ['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Random Forest', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.782, 0.7826, 0.8251, 0.8499, 0.7996, 0.8016, 0.8045, 0.7765, 0.5181, 0.6782, 0.847, 0.8473, 0.5719, 0.7826, 0.6658, 0.9146, 0.4989, 0.9957, 0.7658, 0.9301, 0.8959, 0.8714, 0.4985, 0.994, 0.5951, 0.6135, 0.6957, 0.8231, 0.7394, 0.7688, 0.7511, 0.7903, 0.7034, 0.824)
XGBCLASSIFIER MODEL FOR CLASSIFICATION¶
In [33]:
# Baseline model: XGBClassifier wrapped for multi-output
xgboost_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
])
xgboost_pipeline.fit(X_train, y_train)
y_val_pred = xgboost_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[33], line 7 1 # Baseline model: XGBClassifier wrapped for multi-output 2 xgboost_pipeline = Pipeline(steps=[ 3 ('preprocessor', preprocessor), 4 ("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False))) 5 ]) ----> 7 xgboost_pipeline.fit(X_train, y_train) 9 y_val_pred = xgboost_pipeline.predict(X_val) 10 row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score) File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs) 1358 estimator._validate_params() 1360 with config_context( 1361 skip_parameter_validation=( 1362 prefer_skip_nested_validation or global_skip_validation 1363 ) 1364 ): -> 1365 return fit_method(estimator, *args, **kwargs) File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/pipeline.py:663, in Pipeline.fit(self, X, y, **params) 657 if self._final_estimator != "passthrough": 658 last_step_params = self._get_metadata_for_step( 659 step_idx=len(self) - 1, 660 step_params=routed_params[self.steps[-1][0]], 661 all_params=params, 662 ) --> 663 self._final_estimator.fit(Xt, y, **last_step_params["fit"]) 665 return self File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:547, in MultiOutputClassifier.fit(self, X, Y, sample_weight, **fit_params) 521 def fit(self, X, Y, sample_weight=None, **fit_params): 522 """Fit the model to data matrix X and targets Y. 523 524 Parameters (...) 545 Returns a fitted instance. 546 """ --> 547 super().fit(X, Y, sample_weight=sample_weight, **fit_params) 548 self.classes_ = [estimator.classes_ for estimator in self.estimators_] 549 return self File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs) 1358 estimator._validate_params() 1360 with config_context( 1361 skip_parameter_validation=( 1362 prefer_skip_nested_validation or global_skip_validation 1363 ) 1364 ): -> 1365 return fit_method(estimator, *args, **kwargs) File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:278, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params) 275 if sample_weight is not None: 276 routed_params.estimator.fit["sample_weight"] = sample_weight --> 278 self.estimators_ = Parallel(n_jobs=self.n_jobs)( 279 delayed(_fit_estimator)( 280 self.estimator, X, y[:, i], **routed_params.estimator.fit 281 ) 282 for i in range(y.shape[1]) 283 ) 285 if hasattr(self.estimators_[0], "n_features_in_"): 286 self.n_features_in_ = self.estimators_[0].n_features_in_ File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/utils/parallel.py:82, in Parallel.__call__(self, iterable) 73 warning_filters = warnings.filters 74 iterable_with_config_and_warning_filters = ( 75 ( 76 _with_config_and_warning_filters(delayed_func, config, warning_filters), (...) 80 for delayed_func, args, kwargs in iterable 81 ) ---> 82 return super().__call__(iterable_with_config_and_warning_filters) File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/joblib/parallel.py:1986, in Parallel.__call__(self, iterable) 1984 output = self._get_sequential_output(iterable) 1985 next(output) -> 1986 return output if self.return_generator else list(output) 1988 # Let's create an ID that uniquely identifies the current call. If the 1989 # call is interrupted early and that the same instance is immediately 1990 # reused, this id will be used to prevent workers that were 1991 # concurrently finalizing a task from the previous call to run the 1992 # callback. 1993 with self._lock: File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/joblib/parallel.py:1914, in Parallel._get_sequential_output(self, iterable) 1912 self.n_dispatched_batches += 1 1913 self.n_dispatched_tasks += 1 -> 1914 res = func(*args, **kwargs) 1915 self.n_completed_tasks += 1 1916 self.print_progress() File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/utils/parallel.py:147, in _FuncWrapper.__call__(self, *args, **kwargs) 145 with config_context(**config), warnings.catch_warnings(): 146 warnings.filters = warning_filters --> 147 return self.function(*args, **kwargs) File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/sklearn/multioutput.py:67, in _fit_estimator(estimator, X, y, sample_weight, **fit_params) 65 estimator.fit(X, y, sample_weight=sample_weight, **fit_params) 66 else: ---> 67 estimator.fit(X, y, **fit_params) 68 return estimator File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/xgboost/core.py:705, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs) 703 for k, arg in zip(sig.parameters, args): 704 kwargs[k] = arg --> 705 return func(**kwargs) File ~/miniconda3/envs/sklearn_env/lib/python3.13/site-packages/xgboost/sklearn.py:1640, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights) 1635 expected_classes = self.classes_ 1636 if ( 1637 classes.shape != expected_classes.shape 1638 or not (classes == expected_classes).all() 1639 ): -> 1640 raise ValueError( 1641 f"Invalid classes inferred from unique values of `y`. " 1642 f"Expected: {expected_classes}, got {classes}" 1643 ) 1645 params = self.get_xgb_params() 1647 if callable(self.objective): ValueError: Invalid classes inferred from unique values of `y`. Expected: [0 1], got [-1. 1.]
In [35]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# -------------------------
# 1. Generate toy dataset
# -------------------------
X = np.random.rand(20, 5) # 20 samples, 5 features
y = np.random.choice([-1, 1], size=20) # labels in {-1, 1}
# -------------------------
# 2. Map labels -1 → 0, 1 → 1
# -------------------------
y_mapped = (y == 1).astype(int) # {-1,1} → {0,1}
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, random_state=42)
# -------------------------
# 3. Train XGBoost model
# -------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
"objective": "binary:logistic", # binary classification
"eval_metric": "logloss"
}
model = xgb.train(params, dtrain, num_boost_round=20)
# -------------------------
# 4. Make predictions
# -------------------------
y_pred_prob = model.predict(dtest) # probabilities for class 1
y_pred = (y_pred_prob > 0.5).astype(int) # threshold at 0.5
y_pred_original = np.where(y_pred == 1, 1, -1) # map back {0,1} → {-1,1}
# -------------------------
# 5. Evaluate
# -------------------------
y_test_original = np.where(y_test == 1, 1, -1)
acc = accuracy_score(y_test_original, y_pred_original)
print("True labels: ", y_test_original)
print("Predicted labels:", y_pred_original)
print(f"Accuracy: {acc:.4f}")
True labels: [ 1 -1 -1 1 1 -1] Predicted labels: [ 1 1 -1 1 -1 1] Accuracy: 0.5000
In [ ]:
param_grid_xgboost = {
'classifier__estimator__n_estimators': [100, 200, 300],
'classifier__estimator__max_depth': [3, 6, 9, 12],
'classifier__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
'classifier__estimator__subsample': [0.6, 0.8, 1.0],
'classifier__estimator__colsample_bytree': [0.6, 0.8, 1.0],
'classifier__estimator__gamma': [0, 0.1, 0.2, 0.3],
'classifier__estimator__min_child_weight': [1, 3, 5]
}
# Use KFold for multi-output targets
from sklearn.model_selection import KFold
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)
# Rebuild the pipeline to ensure wrapped estimator for search
xgboost_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
("classifier", MultiOutputClassifier(xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train.values)), use_label_encoder=False)))
])
random_search_xgboost = RandomizedSearchCV(xgboost_pipeline, param_distributions=param_grid_xgboost, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_xgboost.fit(X_train, y_train)
y_val_pred = random_search_xgboost.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
print(f"Best Parameters: {random_search_xgboost.best_params_}")
Fitting 3 folds for each of 10 candidates, totalling 30 fits ['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Random Forest', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7728, 0.7731, 0.8088, 0.8378, 0.7944, 0.7955, 0.7875, 0.755, 0.4492, 0.6756, 0.846, 0.8464, 0.576, 0.7852, 0.659, 0.9129, 0.6418, 0.9957, 0.7343, 0.9189, 0.8965, 0.8714, 0.4985, 0.994, 0.619, 0.635, 0.6903, 0.8214, 0.74, 0.7705, 0.7584, 0.7955, 0.7144, 0.8283) Best Parameters: {'classifier__estimator__subsample': 0.8, 'classifier__estimator__n_estimators': 200, 'classifier__estimator__min_child_weight': 5, 'classifier__estimator__max_depth': 3, 'classifier__estimator__learning_rate': 0.05, 'classifier__estimator__gamma': 0, 'classifier__estimator__colsample_bytree': 0.6}
In [ ]:
# Predict on the validation set using the best model
y_val_pred = random_search_xgboost.best_estimator_.predict(X_val)
# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix
y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred)
conf_matrices = []
if y_true_np.ndim == 2:
for j in range(y_true_np.shape[1]):
conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))
# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
print(f"Confusion Matrix for output {idx}:")
print(cm)
Confusion Matrix for output 0: [[ 0 18] [ 0 1141]] Confusion Matrix for output 1: [[ 0 1] [ 0 1158]] Confusion Matrix for output 2: [[ 0 48] [ 0 1111]] Confusion Matrix for output 3: [[ 0 4] [ 0 1155]] Confusion Matrix for output 4: [[470 113] [150 426]] Confusion Matrix for output 5: [[679 0 56] [ 0 178 0] [132 0 114]] Confusion Matrix for output 6: [[503 89] [148 419]] Confusion Matrix for output 7: [[200 0 192] [ 0 178 0] [ 92 0 497]] Confusion Matrix for output 8: [[763 16] [360 20]] Confusion Matrix for output 9: [[462 45] [133 519]] Confusion Matrix for output 10: [[ 48 215] [ 34 862]] Confusion Matrix for output 11: [[ 29 86] [ 15 1029]] Confusion Matrix for output 12: [[ 1 4] [ 1 1153]] Confusion Matrix for output 13: [[ 16 0 80] [ 0 178 0] [ 14 0 871]] Confusion Matrix for output 14: [[488 0 33] [ 0 178 0] [116 0 344]] Confusion Matrix for output 15: [[ 0 7] [ 0 1152]] Confusion Matrix for output 16: [[288 32 76] [ 28 124 77] [126 84 324]] Confusion Matrix for output 17: [[ 99 143] [ 64 853]] Confusion Matrix for output 18: [[248 135] [131 645]] Confusion Matrix for output 19: [[234 134] [103 688]] Confusion Matrix for output 20: [[114 163] [ 36 846]]
In [ ]:
# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
ax = plt.subplot(rows, cols, i+1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()
DECISIONTREECLASSIFIER MODEL FOR CLASSIFICATION¶
In [ ]:
# Baseline model: DecisionTree
decision_tree_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
("classifier", DecisionTreeClassifier(random_state=1))
])
decision_tree_pipeline.fit(X_train, y_train)
y_val_pred = decision_tree_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Random Forest', 0.5532, 0.9638, 0.4985, 0.994, 0.518, 0.9189, 0.4983, 0.9931, 0.6908, 0.6911, 0.8017, 0.811, 0.7477, 0.7481, 0.7831, 0.7386, 0.5768, 0.6221, 0.788, 0.7903, 0.6062, 0.7248, 0.6614, 0.887, 0.6418, 0.9957, 0.7549, 0.8913, 0.8539, 0.8162, 0.4978, 0.9914, 0.5006, 0.5229, 0.6698, 0.7834, 0.7101, 0.7489, 0.7003, 0.7455, 0.6703, 0.7593)
In [ ]:
# Decision Tree with cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold
decision_tree_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', DecisionTreeClassifier(random_state=42))
])
param_grid_decision_tree = {
'classifier__max_depth': [10, 20, 30, None],
'classifier__min_samples_split': [2, 5, 10],
'classifier__min_samples_leaf': [1, 2, 4]
}
# Use KFold for multi-output targets
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)
random_search_decision_tree = RandomizedSearchCV(decision_tree_pipeline, param_distributions=param_grid_decision_tree, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
random_search_decision_tree.fit(X_train, y_train)
y_val_pred = random_search_decision_tree.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits ['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Random Forest', 0.5435, 0.9836, 0.4998, 0.9991, 0.5321, 0.9482, 0.4991, 0.9965, 0.6981, 0.6989, 0.7674, 0.8041, 0.7617, 0.7636, 0.7766, 0.7291, 0.5408, 0.6393, 0.7955, 0.7964, 0.5817, 0.7092, 0.6732, 0.9016, 0.6096, 0.994, 0.7485, 0.9008, 0.8729, 0.843, 0.4985, 0.994, 0.5602, 0.5764, 0.692, 0.8024, 0.7099, 0.7455, 0.7349, 0.7722, 0.6885, 0.7748)
In [ ]:
# Predict on the validation set using the best model
y_val_pred = random_search_decision_tree.best_estimator_.predict(X_val)
# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix
y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred)
conf_matrices = []
if y_true_np.ndim == 2:
for j in range(y_true_np.shape[1]):
conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))
# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
print(f"Confusion Matrix for output {idx}:")
print(cm)
Confusion Matrix for output 0: [[ 1 17] [ 2 1139]] Confusion Matrix for output 1: [[ 0 1] [ 0 1158]] Confusion Matrix for output 2: [[ 3 45] [ 15 1096]] Confusion Matrix for output 3: [[ 0 4] [ 0 1155]] Confusion Matrix for output 4: [[435 148] [201 375]] Confusion Matrix for output 5: [[660 1 74] [ 0 178 0] [151 1 94]] Confusion Matrix for output 6: [[494 98] [176 391]] Confusion Matrix for output 7: [[237 1 154] [ 0 178 0] [158 1 430]] Confusion Matrix for output 8: [[639 140] [278 102]] Confusion Matrix for output 9: [[424 83] [153 499]] Confusion Matrix for output 10: [[ 91 172] [165 731]] Confusion Matrix for output 11: [[ 38 77] [ 37 1007]] Confusion Matrix for output 12: [[ 1 4] [ 3 1151]] Confusion Matrix for output 13: [[ 26 0 70] [ 0 178 0] [ 43 2 840]] Confusion Matrix for output 14: [[463 1 57] [ 0 178 0] [123 1 336]] Confusion Matrix for output 15: [[ 0 7] [ 0 1152]] Confusion Matrix for output 16: [[253 38 105] [ 39 115 75] [113 121 300]] Confusion Matrix for output 17: [[118 124] [105 812]] Confusion Matrix for output 18: [[229 154] [141 635]] Confusion Matrix for output 19: [[230 138] [126 665]] Confusion Matrix for output 20: [[144 133] [128 754]]
In [ ]:
# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
ax = plt.subplot(rows, cols, i+1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()
GRADIENTBOOSTINGCLASSIFIER MODEL FOR CLASSIFICATION¶
In [ ]:
# Baseline model: GradientBoosting (single-output estimator applied to multi-output via independent fits is not supported directly)
# Keep pipeline for reference; compute accuracy if it runs, otherwise suggest using MultiOutputClassifier wrapping another estimator.
gradient_boosting_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
("classifier", GradientBoostingClassifier(random_state=1))
])
# Attempt fit/predict per output if y is multi-output
import numpy as np
y_train_np = np.asarray(y_train)
y_val_np = np.asarray(y_val)
if y_train_np.ndim == 2:
# Fit one-vs-output manually
preds = []
for j in range(y_train_np.shape[1]):
gb = Pipeline(steps=[('preprocessor', preprocessor), ("classifier", GradientBoostingClassifier(random_state=1))])
gb.fit(X_train, y_train_np[:, j])
preds.append(gb.predict(X_val))
y_val_pred_np = np.column_stack(preds)
else:
gradient_boosting_pipeline.fit(X_train, y_train)
y_val_pred_np = gradient_boosting_pipeline.predict(X_val)
row_tup, name_col = custom_f1score(y_val, y_val_pred, "Baseline Random Forest", f1_score)
print(name_col)
print(row_tup)
['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Random Forest', 0.5435, 0.9836, 0.4998, 0.9991, 0.5321, 0.9482, 0.4991, 0.9965, 0.6981, 0.6989, 0.7674, 0.8041, 0.7617, 0.7636, 0.7766, 0.7291, 0.5408, 0.6393, 0.7955, 0.7964, 0.5817, 0.7092, 0.6732, 0.9016, 0.6096, 0.994, 0.7485, 0.9008, 0.8729, 0.843, 0.4985, 0.994, 0.5602, 0.5764, 0.692, 0.8024, 0.7099, 0.7455, 0.7349, 0.7722, 0.6885, 0.7748)
In [ ]:
gradient_boosting_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', GradientBoostingClassifier(random_state=42))
])
param_grid_gradient_boosting = {
'classifier__n_estimators': [100, 200, 300],
'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
'classifier__max_depth': [3, 4, 5, 6],
'classifier__min_samples_split': [2, 5, 10],
'classifier__min_samples_leaf': [1, 2, 4],
'classifier__subsample': [0.8, 0.9, 1.0],
'classifier__max_features': ['auto', 'sqrt', 'log2']
}
# Manual per-output tuning since GradientBoostingClassifier is single-output
from sklearn.model_selection import KFold
cv_reduced = KFold(n_splits=3, shuffle=True, random_state=42)
import numpy as np
y_train_np = np.asarray(y_train)
y_val_np = np.asarray(y_val)
best_models = []
for j in range(y_train_np.shape[1] if y_train_np.ndim == 2 else 1):
rs = RandomizedSearchCV(gradient_boosting_pipeline, param_distributions=param_grid_gradient_boosting, n_iter=10, cv=cv_reduced, verbose=1, random_state=42, n_jobs=-1)
y_target = y_train_np[:, j] if y_train_np.ndim == 2 else y_train_np
rs.fit(X_train, y_target)
best_models.append(rs.best_estimator_)
# Predict using per-output best models
if y_val_np.ndim == 2:
preds = [model.predict(X_val) for model in best_models]
y_val_pred_np = np.column_stack(preds)
else:
y_val_pred_np = best_models[0].predict(X_val)
# Compute micro/macro
row_tup, name_col = custom_f1score(y_val, y_val_pred_np, "Baseline Gradient Boosting", f1_score)
print(name_col)
print(row_tup)
Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits Fitting 3 folds for each of 10 candidates, totalling 30 fits ['BIRC5macro_score', 'BIRC5micro_score', 'CCNB1macro_score', 'CCNB1micro_score', 'MYBL2macro_score', 'MYBL2micro_score', 'MMP11macro_score', 'MMP11micro_score', 'GRB7macro_score', 'GRB7micro_score', 'PGRmacro_score', 'PGRmicro_score', 'BCL2macro_score', 'BCL2micro_score', 'SCUBE2macro_score', 'SCUBE2micro_score', 'GSTM1macro_score', 'GSTM1micro_score', 'BAG1macro_score', 'BAG1micro_score', 'CD68macro_score', 'CD68micro_score', 'ACTBmacro_score', 'ACTBmicro_score', 'GAPDHmacro_score', 'GAPDHmicro_score', 'RPLP0macro_score', 'RPLP0micro_score', 'TFRCmacro_score', 'TFRCmicro_score', 'AURKAmacro_score', 'AURKAmicro_score', 'CTSVmacro_score', 'CTSVmicro_score', 'MKI67macro_score', 'MKI67micro_score', 'ERBB2macro_score', 'ERBB2micro_score', 'GUSBmacro_score', 'GUSBmicro_score', 'ESR1macro_score', 'ESR1micro_score'] ('Baseline Gradient Boosting', 0.4961, 0.9845, 0.4998, 0.9991, 0.4894, 0.9586, 0.4991, 0.9965, 0.7752, 0.7757, 0.8274, 0.8473, 0.7975, 0.799, 0.7879, 0.7575, 0.4129, 0.6747, 0.8444, 0.8447, 0.5752, 0.7947, 0.6755, 0.918, 0.7851, 0.9974, 0.7444, 0.9249, 0.8925, 0.8671, 0.4985, 0.994, 0.6179, 0.6359, 0.6978, 0.824, 0.7425, 0.7739, 0.7383, 0.786, 0.7204, 0.8283)
In [ ]:
# Predict on the validation set using the best per-output GB models
# (y_val_pred_np already computed in previous cell)
# Compute confusion matrices per output
import numpy as np
from sklearn.metrics import confusion_matrix
y_true_np = np.asarray(y_val)
y_pred_np = np.asarray(y_val_pred_np)
conf_matrices = []
if y_true_np.ndim == 2:
for j in range(y_true_np.shape[1]):
conf_matrices.append(confusion_matrix(y_true_np[:, j], y_pred_np[:, j]))
else:
conf_matrices.append(confusion_matrix(y_true_np, y_pred_np))
# Print the confusion matrices
for idx, cm in enumerate(conf_matrices):
print(f"Confusion Matrix for output {idx}:")
print(cm)
Confusion Matrix for output 0: [[ 0 18] [ 0 1141]] Confusion Matrix for output 1: [[ 0 1] [ 0 1158]] Confusion Matrix for output 2: [[ 0 48] [ 0 1111]] Confusion Matrix for output 3: [[ 0 4] [ 0 1155]] Confusion Matrix for output 4: [[475 108] [152 424]] Confusion Matrix for output 5: [[672 0 63] [ 0 178 0] [114 0 132]] Confusion Matrix for output 6: [[512 80] [153 414]] Confusion Matrix for output 7: [[195 0 197] [ 0 178 0] [ 84 0 505]] Confusion Matrix for output 8: [[778 1] [376 4]] Confusion Matrix for output 9: [[464 43] [137 515]] Confusion Matrix for output 10: [[ 44 219] [ 19 877]] Confusion Matrix for output 11: [[ 31 84] [ 11 1033]] Confusion Matrix for output 12: [[ 2 3] [ 0 1154]] Confusion Matrix for output 13: [[ 17 0 79] [ 0 177 1] [ 7 0 878]] Confusion Matrix for output 14: [[487 0 34] [ 1 177 0] [119 0 341]] Confusion Matrix for output 15: [[ 0 7] [ 0 1152]] Confusion Matrix for output 16: [[290 31 75] [ 34 122 73] [119 90 325]] Confusion Matrix for output 17: [[103 139] [ 65 852]] Confusion Matrix for output 18: [[246 137] [125 651]] Confusion Matrix for output 19: [[208 160] [ 88 703]] Confusion Matrix for output 20: [[120 157] [ 42 840]]
In [ ]:
# Optionally, visualize confusion matrices per output using heatmaps
import math
num_outputs = len(conf_matrices)
cols = min(2, num_outputs)
rows = math.ceil(num_outputs / cols)
plt.figure(figsize=(6*cols, 4*rows))
for i, cm in enumerate(conf_matrices):
ax = plt.subplot(rows, cols, i+1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix - Output {i}')
plt.tight_layout()
plt.show()
PERFORMANCE ON THE TEST SET¶
In [ ]:
models = [random_search_logistic, random_search_random_forest, random_search_xgboost, random_search_decision_tree, gradient_boosting_pipeline]
model_names = ['LogReg', 'RF', 'XGB', 'DT', 'GB']
# Compute ROC curves per output (treating each output as binary/multiclass-ovo not covered here)
# We'll assume binary outputs for ROC; if multiclass per output, switch to one-vs-rest ROC per class or use macro-averaged AUC.
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import numpy as np
# Limit to first output for quick visualization if outputs > 1
output_index = 0
plt.figure(figsize=(10, 8))
for i, model in enumerate(models):
# Try to get probabilities for the selected output
est = model.best_estimator_ if hasattr(model, 'best_estimator_') else model
# Handle MultiOutputClassifier vs single estimator
try:
# Multi-output wrapped estimators: est.named_steps['classifier'].estimators_
clf = est.named_steps['classifier']
if hasattr(clf, 'estimators_'):
# Get proba for the selected output from the wrapped estimator
proba = clf.estimators_[output_index].predict_proba(X_val)
else:
proba = est.predict_proba(X_val)
except Exception:
# Fallback: skip if proba not available
continue
# If predict_proba returned a list (multi-output from single estimator), pick the selected output
if isinstance(proba, list):
if len(proba) <= output_index:
continue
proba_sel = proba[output_index]
else:
proba_sel = proba
y_true = np.asarray(y_val)[:, output_index] if np.asarray(y_val).ndim == 2 else np.asarray(y_val)
# If multiclass for this output, binarize one class (class 1) for demonstration
classes = np.unique(y_true)
if len(classes) > 2:
y_true_bin = (y_true == classes[1]).astype(int)
# Map the chosen class index into the columns of proba_sel
class_idx = list(classes).index(classes[1]) if proba_sel.shape[1] == len(classes) else 1
y_score = proba_sel[:, class_idx]
else:
# binary
y_true_bin = (y_true == classes[-1]).astype(int)
y_score = proba_sel[:, 1] if proba_sel.shape[1] > 1 else proba_sel.ravel()
fpr, tpr, _ = roc_curve(y_true_bin, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=2, label=f'{model_names[i]} (AUC = {roc_auc:.2f})')
# Plot ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) Curve - Output {output_index}')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Save test predictions for multi-output to CSV (per-label columns, inverse-transformed)
# Prefer tuned XGBoost model if available
import numpy as np
import pandas as pd
# Choose the best available XGBoost-based model
if 'random_search_xgboost' in globals():
best_xgb = random_search_xgboost.best_estimator_ if hasattr(random_search_xgboost, 'best_estimator_') else random_search_xgboost
elif 'xgboost_pipeline' in globals():
best_xgb = xgboost_pipeline
else:
raise RuntimeError("No XGBoost model found. Run the XGBoost training cell before exporting predictions.")
# Predict on the hold-out X_test
test_predictions = best_xgb.predict(X_test)
# Normalize to 2D numpy array
if isinstance(test_predictions, list):
test_predictions = np.column_stack(test_predictions)
test_pred_np = np.asarray(test_predictions)
# Build DataFrame and inverse-transform using saved encoders if present
if isinstance(y_train, pd.DataFrame):
pred_df = pd.DataFrame(test_pred_np, columns=y_train.columns, index=X_test.index)
if 'encoders' in globals() and isinstance(encoders, dict):
for col in pred_df.columns:
le = encoders.get(col)
if le is not None:
pred_df[col] = le.inverse_transform(pred_df[col].astype(int))
else:
pred_df = pd.DataFrame({'prediction': test_pred_np.ravel()}, index=X_test.index)
if 'encoders' in globals() and isinstance(encoders, dict):
le = encoders.get('_single')
if le is not None:
pred_df['prediction'] = le.inverse_transform(pred_df['prediction'].astype(int))
# Save to CSV with index (IDs)
output_path = '../pool-datasets/clf/submission_multioutput.csv'
pred_df.to_csv(output_path, index=True)
print(f"Saved multi-output predictions to {output_path} with shape {pred_df.shape}")
print(pred_df.head())
Saved multi-output predictions to ../pool-datasets/clf/submission_multioutput.csv with shape (1449, 21) BIRC5 CCNB1 MYBL2 MMP11 GRB7 PGR BCL2 SCUBE2 GSTM1 \ TCGA-EW-A6S9-01 1.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 -1.0 TCGA-GM-A3XG-01 1.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 F2500 1.0 1.0 1.0 1.0 -1.0 -1.0 -1.0 1.0 -1.0 F1775 1.0 1.0 1.0 1.0 -1.0 -1.0 -1.0 -1.0 -1.0 TCGA-E9-A1NE-01 1.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 -1.0 BAG1 ... ACTB GAPDH RPLP0 TFRC AURKA CTSV MKI67 \ TCGA-EW-A6S9-01 1.0 ... 1.0 1.0 0.0 0.0 1.0 1.0 1.0 TCGA-GM-A3XG-01 1.0 ... 1.0 1.0 0.0 0.0 1.0 1.0 1.0 F2500 -1.0 ... 1.0 1.0 1.0 -1.0 1.0 -1.0 1.0 F1775 -1.0 ... 1.0 1.0 1.0 -1.0 1.0 -1.0 1.0 TCGA-E9-A1NE-01 1.0 ... 1.0 1.0 0.0 0.0 1.0 1.0 1.0 ERBB2 GUSB ESR1 TCGA-EW-A6S9-01 1.0 1.0 1.0 TCGA-GM-A3XG-01 1.0 1.0 1.0 F2500 -1.0 1.0 1.0 F1775 -1.0 1.0 1.0 TCGA-E9-A1NE-01 1.0 1.0 1.0 [5 rows x 21 columns]
In [ ]:
y_test.loc['TCGA-EW-A6S9-01']
Out[ ]:
BIRC5 1.0 CCNB1 1.0 MYBL2 1.0 MMP11 1.0 GRB7 1.0 PGR 0.0 BCL2 1.0 SCUBE2 0.0 GSTM1 1.0 BAG1 1.0 CD68 1.0 ACTB 1.0 GAPDH 1.0 RPLP0 0.0 TFRC 0.0 AURKA 1.0 CTSV 1.0 MKI67 1.0 ERBB2 1.0 GUSB 1.0 ESR1 1.0 Name: TCGA-EW-A6S9-01, dtype: float64
In [ ]:
pred_df.loc['TCGA-EW-A6S9-01']
Out[ ]:
BIRC5 1.0 CCNB1 1.0 MYBL2 1.0 MMP11 1.0 GRB7 1.0 PGR 0.0 BCL2 1.0 SCUBE2 0.0 GSTM1 -1.0 BAG1 1.0 CD68 1.0 ACTB 1.0 GAPDH 1.0 RPLP0 0.0 TFRC 0.0 AURKA 1.0 CTSV 1.0 MKI67 1.0 ERBB2 1.0 GUSB 1.0 ESR1 1.0 Name: TCGA-EW-A6S9-01, dtype: float64
In [ ]:
y_val_np = np.asarray(y_test)
y_val_pred_np = np.asarray(pred_df)
rf_micro = (y_val_pred_np == y_val_np).mean()
if y_val_np.ndim == 2:
rf_macro = (y_val_pred_np == y_val_np).mean(axis=0).mean()
else:
from sklearn.metrics import accuracy_score
rf_macro = accuracy_score(y_val_np, y_val_pred_np)
print(f"Best Model Test Accuracy (micro): {rf_micro:.4f}")
print(f"Best Model Test Accuracy (macro): {rf_macro:.4f}")
print(f"Best Parameters: {random_search_logistic.best_params_}")
Best Model Test Accuracy (micro): 0.8518 Best Model Test Accuracy (macro): 0.8518 Best Parameters: {'classifier__estimator__solver': 'lbfgs', 'classifier__estimator__C': np.float64(3792.690190732246)}