-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.py
More file actions
119 lines (87 loc) · 3.08 KB
/
run.py
File metadata and controls
119 lines (87 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from __future__ import print_function
import numpy as np
import pandas as pds
from preprocessing import smiles_to_seq, vectorize
import SSVAE
import time
from preprocessing import get_property, canonocalize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
# Start time count
start_time = time.time()

# pre-defined parameters
frac = 0.5           # fraction of the training pool treated as labelled (see nL below)
beta = 10000.        # passed to SSVAE.Model as `beta` (presumably a loss weight -- confirm in SSVAE)
# SMILES vocabulary; note two-character tokens ('Br', 'Cl', 'Si', 'Sn') are single entries
char_set = [' ', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            '-', '#', '(', ')', '[', ']', '+', '=',
            'B', 'Br', 'c', 'C', 'Cl', 'F', 'H', 'I',
            'N', 'n', 'O', 'o', 'P', 'p', 'S', 's', 'Si', 'Sn']
data_uri = './data/ZINC_310k.csv'
save_uri = './zinc_model.ckpt'
ntrn = 300000        # number of molecules used for training
frac_val = 0.05      # fraction of each split held out for validation
ntst = 10000         # number of molecules held out for testing
# data preparation
print('::: data preparation')

# Read the dataset ONCE and slice it, instead of parsing the CSV twice.
# NOTE: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
# compatible replacement (works on both old and new pandas).
data = pds.read_csv(data_uri).values[:ntrn+ntst]
smiles = data[:, 0]                                  # 0: SMILES
Y = np.asarray(data[:, 1:], dtype=np.float32)        # 1: MolWT, 2: LogP, 3: QED

list_seq = smiles_to_seq(smiles, char_set)
Xs, X = vectorize(list_seq, char_set)

# hold out the LAST ntst molecules as the test set
tstX = X[-ntst:]
tstXs = Xs[-ntst:]
tstY = Y[-ntst:]

X = X[:ntrn]
Xs = Xs[:ntrn]
Y = Y[:ntrn]

# split the training pool into labelled (L) / unlabelled (U) parts,
# each further split into train / validation by frac_val
nL = int(len(Y)*frac)
nU = len(Y) - nL
nL_trn = int(nL*(1-frac_val))
nL_val = nL - nL_trn
nU_trn = int(nU*(1-frac_val))
nU_val = nU - nU_trn

# NOTE(review): permutation is unseeded, so splits differ between runs
perm_id = np.random.permutation(len(Y))

trnX_L = X[perm_id[:nL_trn]]
trnXs_L = Xs[perm_id[:nL_trn]]
trnY_L = Y[perm_id[:nL_trn]]

valX_L = X[perm_id[nL_trn:nL_trn+nL_val]]
valXs_L = Xs[perm_id[nL_trn:nL_trn+nL_val]]
valY_L = Y[perm_id[nL_trn:nL_trn+nL_val]]

trnX_U = X[perm_id[nL_trn+nL_val:nL_trn+nL_val+nU_trn]]
trnXs_U = Xs[perm_id[nL_trn+nL_val:nL_trn+nL_val+nU_trn]]

valX_U = X[perm_id[nL_trn+nL_val+nU_trn:]]
valXs_U = Xs[perm_id[nL_trn+nL_val+nU_trn:]]

# standardize the labelled targets; the scaler is fit on the full training Y
# so that test-time inverse_transform (below) uses the same statistics
scaler_Y = StandardScaler()
scaler_Y.fit(Y)
trnY_L = scaler_Y.transform(trnY_L)
valY_L = scaler_Y.transform(valY_L)
## model training
print('::: model training')

# network dimensions: sequence length / one-hot width come from the
# vectorized data, target width from Y; the rest are fixed hyperparameters
seqlen_x = X.shape[1]
dim_x = X.shape[2]
dim_y = Y.shape[1]
dim_z = 100
dim_h = 250
n_hidden = 3
batch_size = 200

model = SSVAE.Model(
    seqlen_x=seqlen_x,
    dim_x=dim_x,
    dim_y=dim_y,
    dim_z=dim_z,
    dim_h=dim_h,
    n_hidden=n_hidden,
    batch_size=batch_size,
    beta=float(beta),
    char_set=char_set,
)

with model.session:
    # semi-supervised training on labelled (L) and unlabelled (U) splits
    model.train(
        trnX_L=trnX_L, trnXs_L=trnXs_L, trnY_L=trnY_L,
        trnX_U=trnX_U, trnXs_U=trnXs_U,
        valX_L=valX_L, valXs_L=valXs_L, valY_L=valY_L,
        valX_U=valX_U, valXs_U=valXs_U,
    )
    # checkpoint the trained weights while the session is still open
    model.saver.save(model.session, save_uri)
## property prediction performance
# predictions come back in standardized space; map them back before scoring
tstY_hat = scaler_Y.inverse_transform(model.predict(tstX))
for col in range(dim_y):
    mae = mean_absolute_error(tstY[:, col], tstY_hat[:, col])
    print([col, mae])

## unconditional generation
for sample_id in range(10):
    sampled = model.sampling_unconditional()
    print([sample_id, sampled, get_property(sampled)])

## conditional generation (e.g. MolWt=250)
yid = 0
ytarget = 250.
# shift/scale the raw target into the standardized space used during training
ytarget_transform = (ytarget - scaler_Y.mean_[yid]) / np.sqrt(scaler_Y.var_[yid])
for sample_id in range(10):
    sampled = model.sampling_conditional(yid, ytarget_transform)
    print([sample_id, sampled, get_property(sampled)])
# End time count
elapsed_time = time.time() - start_time
# Report both the raw seconds and an HH:MM:SS rendering.
# BUG FIX: the original computed time.strftime(...) as a bare expression
# and discarded the result -- it is now printed.
print(elapsed_time)
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))