|
124 | 124 | "\n", |
125 | 125 | "def preprocess_smiles(smi):\n", |
126 | 126 | " \"\"\"Preprocesses the SMILES string\"\"\"\n", |
127 | | - " mol = dm.to_mol(smi, ordered=True, sanitize=False) \n", |
128 | | - " try: \n", |
| 127 | + " mol = dm.to_mol(smi, ordered=True, sanitize=False)\n", |
| 128 | + " try:\n", |
129 | 129 | " mol = dm.sanitize_mol(mol)\n", |
130 | | - " except: # noqa: E722\n", |
| 130 | + " except: # noqa: E722\n", |
131 | 131 | " mol = None\n", |
132 | | - " \n", |
133 | | - " if mol is None: \n", |
| 132 | + "\n", |
| 133 | + " if mol is None:\n", |
134 | 134 | " return\n", |
135 | | - " \n", |
| 135 | + "\n", |
136 | 136 | " mol = dm.standardize_mol(mol, disconnect_metals=True)\n", |
137 | 137 | " remover = SaltRemover.SaltRemover()\n", |
138 | 138 | " mol = remover.StripMol(mol, dontRemoveEverything=True)\n", |
|
144 | 144 | " \"\"\"In line with common practice, we will use the scaffold split to evaluate our models\"\"\"\n", |
145 | 145 | " scaffolds = [dm.to_smiles(dm.to_scaffold_murcko(dm.to_mol(smi))) for smi in smiles]\n", |
146 | 146 | " splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n", |
147 | | - " return next(splitter.split(smiles, groups=scaffolds))\n" |
| 147 | + " return next(splitter.split(smiles, groups=scaffolds))" |
148 | 148 | ] |
149 | 149 | }, |
150 | 150 | { |
|
158 | 158 | "# Setup the featurizers\n", |
159 | 159 | "trans_ecfp = FPVecTransformer(kind=\"ecfp:6\", n_jobs=-1)\n", |
160 | 160 | "trans_mordred = FPVecTransformer(kind=\"mordred\", replace_nan=True, n_jobs=-1)\n", |
161 | | - "trans_chemberta = PretrainedHFTransformer(kind='ChemBERTa-77M-MLM', notation='smiles')" |
| 161 | + "trans_chemberta = PretrainedHFTransformer(kind=\"ChemBERTa-77M-MLM\", notation=\"smiles\")" |
162 | 162 | ] |
163 | 163 | }, |
164 | 164 | { |
|
176 | 176 | "outputs": [], |
177 | 177 | "source": [ |
178 | 178 | "# Prepare the Lipophilicity dataset\n", |
179 | | - "smiles, y_true = load_dataset(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\", \"exp\")\n", |
| 179 | + "smiles, y_true = load_dataset(\n", |
| 180 | + " \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\", \"exp\"\n", |
| 181 | + ")\n", |
180 | 182 | "smiles = np.array([preprocess_smiles(smi) for smi in smiles])\n", |
181 | 183 | "smiles = np.array([smi for smi in smiles if dm.to_mol(smi) is not None])\n", |
182 | 184 | "\n", |
|
232 | 234 | "\n", |
233 | 235 | "lipo_scores = {}\n", |
234 | 236 | "for name, feats in X.items():\n", |
235 | | - " \n", |
236 | 237 | " # Train\n", |
237 | 238 | " automl = autosklearn.regression.AutoSklearnRegressor(\n", |
238 | | - " memory_limit=24576, \n", |
239 | | - " # For practicality’s sake, limit this to 5 minutes! \n", |
| 239 | + " memory_limit=24576,\n", |
| 240 | + " # For practicality’s sake, limit this to 3 minutes!\n", |
240 | 241 | " # (x3 = 9 min in total)\n", |
241 | | - " time_left_for_this_task=180, \n", |
| 242 | + " time_left_for_this_task=180,\n", |
242 | 243 | " n_jobs=1,\n", |
243 | 244 | " seed=1,\n", |
244 | 245 | " )\n", |
245 | 246 | " automl.fit(feats[train_ind], y_true[train_ind])\n", |
246 | | - " \n", |
| 247 | + "\n", |
247 | 248 | " # Predict and evaluate\n", |
248 | 249 | " y_hat = automl.predict(feats[test_ind])\n", |
249 | | - " \n", |
| 250 | + "\n", |
250 | 251 | " # Evaluate\n", |
251 | 252 | " mae = mean_absolute_error(y_true[test_ind], y_hat)\n", |
252 | 253 | " lipo_scores[name] = mae\n", |
|
282 | 283 | ], |
283 | 284 | "source": [ |
284 | 285 | "# Prepare the ClinTox dataset\n", |
285 | | - "smiles, y_true = load_dataset(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz\", \"CT_TOX\")\n", |
| 286 | + "smiles, y_true = load_dataset(\n", |
| 287 | + " \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz\", \"CT_TOX\"\n", |
| 288 | + ")\n", |
286 | 289 | "smiles = np.array([preprocess_smiles(smi) for smi in smiles])\n", |
287 | 290 | "smiles = np.array([smi for smi in smiles if smi is not None])\n", |
288 | 291 | "\n", |
|
333 | 336 | "\n", |
334 | 337 | "clintox_scores = {}\n", |
335 | 338 | "for name, feats in X.items():\n", |
336 | | - " \n", |
337 | 339 | " # Train\n", |
338 | 340 | " automl = autosklearn.classification.AutoSklearnClassifier(\n", |
339 | | - " memory_limit=24576, \n", |
340 | | - " # For practicality’s sake, limit this to 5 minutes! \n", |
| 341 | + " memory_limit=24576,\n", |
| 342 | + " # For practicality’s sake, limit this to 3 minutes!\n", |
341 | 343 | " # (x3 = 9 min in total)\n", |
342 | 344 | " time_left_for_this_task=180,\n", |
343 | 345 | " n_jobs=1,\n", |
344 | 346 | " seed=1,\n", |
345 | 347 | " )\n", |
346 | 348 | " automl.fit(feats[train_ind], y_true[train_ind])\n", |
347 | | - " \n", |
| 349 | + "\n", |
348 | 350 | " # Predict and evaluate\n", |
349 | 351 | " y_hat = automl.predict_proba(feats[test_ind])\n", |
350 | 352 | " y_hat = y_hat[:, 1]\n", |
351 | | - " \n", |
| 353 | + "\n", |
352 | 354 | " # Evaluate\n", |
353 | 355 | " auroc = roc_auc_score(y_true[test_ind], y_hat)\n", |
354 | 356 | " clintox_scores[name] = auroc\n", |
|
395 | 397 | "BASE_CMPD_URI = \"https://github.com/rdkit/benchmarking_platform/raw/master/compounds/DUD/cmp_list_DUD\"\n", |
396 | 398 | "CMPD_EXT = \".dat.gz\"\n", |
397 | 399 | "\n", |
398 | | - "BASE_SPLIT_URI = \"https://github.com/rdkit/benchmarking_platform/raw/master/query_lists/data_sets_I/DUD/training_DUD\"\n", |
| 400 | + "BASE_SPLIT_URI = (\n", |
| 401 | + " \"https://github.com/rdkit/benchmarking_platform/raw/master/query_lists/data_sets_I/DUD/training_DUD\"\n", |
| 402 | + ")\n", |
399 | 403 | "SPLIT_EXT = \".pkl\"\n", |
400 | 404 | "\n", |
401 | 405 | "# Out of practicality, we only use the first 10 targets\n", |
|
519 | 523 | "results = defaultdict(dict)\n", |
520 | 524 | "\n", |
521 | 525 | "for target in tqdm.tqdm(TARGETS, leave=False):\n", |
522 | | - "\n", |
523 | 526 | " # Load the structures (i.e. SMILES)\n", |
524 | 527 | " df = get_compounds_for_target(target)\n", |
525 | 528 | " n_actives = len(df[df[\"subset\"] == \"actives\"])\n", |
|
543 | 546 | " train_decoy_ind = [i + n_actives for i in train_decoy_ind]\n", |
544 | 547 | "\n", |
545 | 548 | " for feat_name, feats in X.items():\n", |
546 | | - "\n", |
547 | 549 | " # Train the model\n", |
548 | 550 | " knn = KNeighborsClassifier()\n", |
549 | 551 | " train_ind = np.concatenate([train_active_ind, train_decoy_ind])\n", |
|
0 commit comments