"""
Clustering Analysis (Unsupervised Learning)
Objective: Group clinical profiles using biomarkers without using diagnostic labels.
DISCLAIMER:
The clinical data used in this script consists of publicly available mean values
and does not contain private or sensitive patient records.
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, Birch, MiniBatchKMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
# 1. Data Loading (generated from the SQL script)
data = pd.read_csv('clean_dataset.csv')

# 2. Preprocessing: standardize all biomarker columns in one pass.
# StandardScaler works column-wise, so a single fit_transform scales each
# feature independently (mean 0, std 1) — identical to fitting per column.
scaler = StandardScaler()
biomarker_cols = ['N', 'Iq1', 'Iq3', 'Copeptin']
norm_cols = [f'{col}_norm' for col in biomarker_cols]
data[norm_cols] = scaler.fit_transform(data[biomarker_cols])

# Engineered feature: interquartile spread relative to Copeptin.
# NOTE: Copeptin_norm is standardized (mean 0), so denominators at or near
# zero produce inf/NaN, which would make every sklearn estimator below raise.
# Sanitize those values so the clustering step cannot crash.
data['cv'] = (data['Iq3_norm'] - data['Iq1_norm']) / data['Copeptin_norm']
data['cv'] = data['cv'].replace([float('inf'), float('-inf')], float('nan')).fillna(0.0)

# Features for clustering (X); the ground truth (y) is used ONLY for external
# validation via Adjusted Rand, never during fitting.
features = data[['cv', 'Iq3_norm', 'Iq1_norm', 'Copeptin_norm', 'N_norm']]
true_labels = data['Diagnosis_Group']

# 3. Models: several clustering algorithms, k=2 where the algorithm takes k.
clustering_models = {
    "KMeans": KMeans(n_clusters=2, random_state=42, n_init=10),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=2),
    "Birch": Birch(n_clusters=2),
    "Agglomerative": AgglomerativeClustering(n_clusters=2)
}

# 4. Training and Evaluation
evaluation_metrics = {}
for name, model in clustering_models.items():
    model.fit(features)
    # DBSCAN/Agglomerative expose labels_ only; fall back to predict otherwise.
    labels = model.labels_ if hasattr(model, 'labels_') else model.predict(features)
    # Both metrics are undefined for a single cluster (DBSCAN may collapse
    # everything into one cluster or pure noise), so skip that case.
    if len(set(labels)) > 1:
        silhouette = silhouette_score(features, labels)
        adjusted_rand = adjusted_rand_score(true_labels, labels)
        evaluation_metrics[name] = {"Silhouette": silhouette, "Adjusted Rand": adjusted_rand}

# 5. Results — guard against the case where no model produced >1 cluster,
# which would otherwise raise KeyError in sort_values on an empty frame.
print("=== Unsupervised Results (Ranked by Adjusted Rand) ===")
if evaluation_metrics:
    metrics_df = pd.DataFrame(evaluation_metrics).T
    print(metrics_df.sort_values(by="Adjusted Rand", ascending=False))
else:
    print("No model found more than one cluster; nothing to rank.")