"""
Clustering Analysis (Unsupervised Learning)
Objective: Group clinical profiles using biomarkers without using diagnostic labels.
DISCLAIMER:
The clinical data used in this script consists of publicly available mean values
and does not contain private or sensitive patient records.
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, Birch, MiniBatchKMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
# 1. Data Loading (generated from the SQL script)
data = pd.read_csv('clean_dataset.csv')

# 2. Preprocessing: standardize all biomarker columns in one pass.
# StandardScaler works column-wise, so a single fit_transform scales each
# feature independently (mean 0, std 1) — identical to fitting per column.
scaler = StandardScaler()
biomarker_cols = ['N', 'Iq1', 'Iq3', 'Copeptin']
norm_cols = [f'{col}_norm' for col in biomarker_cols]
data[norm_cols] = scaler.fit_transform(data[biomarker_cols])

# Engineered feature: interquartile spread relative to Copeptin.
# NOTE: Copeptin_norm is standardized (mean 0), so denominators at or near
# zero produce inf/NaN, which would make every sklearn estimator below raise.
# Sanitize those values so the clustering step cannot crash.
data['cv'] = (data['Iq3_norm'] - data['Iq1_norm']) / data['Copeptin_norm']
data['cv'] = data['cv'].replace([float('inf'), float('-inf')], float('nan')).fillna(0.0)

# Features for clustering (X); the ground truth (y) is used ONLY for external
# validation via Adjusted Rand, never during fitting.
features = data[['cv', 'Iq3_norm', 'Iq1_norm', 'Copeptin_norm', 'N_norm']]
true_labels = data['Diagnosis_Group']

# 3. Models: several clustering algorithms, k=2 where the algorithm takes k.
clustering_models = {
    "KMeans": KMeans(n_clusters=2, random_state=42, n_init=10),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=2),
    "Birch": Birch(n_clusters=2),
    "Agglomerative": AgglomerativeClustering(n_clusters=2)
}

# 4. Training and Evaluation
evaluation_metrics = {}
for name, model in clustering_models.items():
    model.fit(features)
    # DBSCAN/Agglomerative expose labels_ only; fall back to predict otherwise.
    labels = model.labels_ if hasattr(model, 'labels_') else model.predict(features)
    # Both metrics are undefined for a single cluster (DBSCAN may collapse
    # everything into one cluster or pure noise), so skip that case.
    if len(set(labels)) > 1:
        silhouette = silhouette_score(features, labels)
        adjusted_rand = adjusted_rand_score(true_labels, labels)
        evaluation_metrics[name] = {"Silhouette": silhouette, "Adjusted Rand": adjusted_rand}

# 5. Results — guard against the case where no model produced >1 cluster,
# which would otherwise raise KeyError in sort_values on an empty frame.
print("=== Unsupervised Results (Ranked by Adjusted Rand) ===")
if evaluation_metrics:
    metrics_df = pd.DataFrame(evaluation_metrics).T
    print(metrics_df.sort_values(by="Adjusted Rand", ascending=False))
else:
    print("No model found more than one cluster; nothing to rank.")