Skip to content

Commit 6ab7225

Browse files
authored
chore: Use a Cloudflare bucket through a HTTP file system (#115)
* chore: Update the ModelStore to use a HTTP file system
* chore: Remove uv lock file
* Formatting
* Update Mamba setup action to move to new GHA caching service
* Fix mkdocs issue
* fix: Failing tests because wrong cache path was being used
1 parent 0e06ba2 commit 6ab7225

26 files changed

Lines changed: 241 additions & 253 deletions

.github/workflows/code-check.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
runs-on: ubuntu-latest
1515
steps:
1616
- name: Checkout the code
17-
uses: actions/checkout@v3
17+
uses: actions/checkout@v4
1818

1919
- name: Set up Python
2020
uses: actions/setup-python@v4

.github/workflows/doc.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ jobs:
2020

2121
steps:
2222
- name: Checkout the code
23-
uses: actions/checkout@v3
23+
uses: actions/checkout@v4
2424

2525
- name: Setup mamba
26-
uses: mamba-org/setup-micromamba@v1
26+
uses: mamba-org/setup-micromamba@v2
2727
with:
2828
environment-file: env.yml
2929
environment-name: my_env

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ jobs:
2525

2626
steps:
2727
- name: Checkout the code
28-
uses: actions/checkout@v3
28+
uses: actions/checkout@v4
2929

3030
- name: Setup mamba
31-
uses: mamba-org/setup-micromamba@v1
31+
uses: mamba-org/setup-micromamba@v2
3232
with:
3333
environment-file: env.yml
3434
environment-name: my_env

.github/workflows/test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
strategy:
1717
fail-fast: false
1818
matrix:
19-
python-version: ["3.9"]
19+
python-version: ["3.9", "3.10"]
2020
os: ["ubuntu-latest"] #, "macos-latest", "windows-latest"]
2121
pytorch-version: ["1.13"]
2222

@@ -34,10 +34,10 @@ jobs:
3434
3535
steps:
3636
- name: Checkout the code
37-
uses: actions/checkout@v3
37+
uses: actions/checkout@v4
3838

3939
- name: Setup mamba
40-
uses: mamba-org/setup-micromamba@v1
40+
uses: mamba-org/setup-micromamba@v2
4141
with:
4242
environment-file: env.yml
4343
environment-name: my_env

docs/benchmark.ipynb

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -124,15 +124,15 @@
124124
"\n",
125125
"def preprocess_smiles(smi):\n",
126126
" \"\"\"Preprocesses the SMILES string\"\"\"\n",
127-
" mol = dm.to_mol(smi, ordered=True, sanitize=False) \n",
128-
" try: \n",
127+
" mol = dm.to_mol(smi, ordered=True, sanitize=False)\n",
128+
" try:\n",
129129
" mol = dm.sanitize_mol(mol)\n",
130-
" except: # noqa: E722\n",
130+
" except: # noqa: E722\n",
131131
" mol = None\n",
132-
" \n",
133-
" if mol is None: \n",
132+
"\n",
133+
" if mol is None:\n",
134134
" return\n",
135-
" \n",
135+
"\n",
136136
" mol = dm.standardize_mol(mol, disconnect_metals=True)\n",
137137
" remover = SaltRemover.SaltRemover()\n",
138138
" mol = remover.StripMol(mol, dontRemoveEverything=True)\n",
@@ -144,7 +144,7 @@
144144
" \"\"\"In line with common practice, we will use the scaffold split to evaluate our models\"\"\"\n",
145145
" scaffolds = [dm.to_smiles(dm.to_scaffold_murcko(dm.to_mol(smi))) for smi in smiles]\n",
146146
" splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
147-
" return next(splitter.split(smiles, groups=scaffolds))\n"
147+
" return next(splitter.split(smiles, groups=scaffolds))"
148148
]
149149
},
150150
{
@@ -158,7 +158,7 @@
158158
"# Setup the featurizers\n",
159159
"trans_ecfp = FPVecTransformer(kind=\"ecfp:6\", n_jobs=-1)\n",
160160
"trans_mordred = FPVecTransformer(kind=\"mordred\", replace_nan=True, n_jobs=-1)\n",
161-
"trans_chemberta = PretrainedHFTransformer(kind='ChemBERTa-77M-MLM', notation='smiles')"
161+
"trans_chemberta = PretrainedHFTransformer(kind=\"ChemBERTa-77M-MLM\", notation=\"smiles\")"
162162
]
163163
},
164164
{
@@ -176,7 +176,9 @@
176176
"outputs": [],
177177
"source": [
178178
"# Prepare the Lipophilicity dataset\n",
179-
"smiles, y_true = load_dataset(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\", \"exp\")\n",
179+
"smiles, y_true = load_dataset(\n",
180+
" \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\", \"exp\"\n",
181+
")\n",
180182
"smiles = np.array([preprocess_smiles(smi) for smi in smiles])\n",
181183
"smiles = np.array([smi for smi in smiles if dm.to_mol(smi) is not None])\n",
182184
"\n",
@@ -232,21 +234,20 @@
232234
"\n",
233235
"lipo_scores = {}\n",
234236
"for name, feats in X.items():\n",
235-
" \n",
236237
" # Train\n",
237238
" automl = autosklearn.regression.AutoSklearnRegressor(\n",
238-
" memory_limit=24576, \n",
239-
" # For practicality’s sake, limit this to 5 minutes! \n",
239+
" memory_limit=24576,\n",
240+
" # For practicality’s sake, limit this to 5 minutes!\n",
240241
" # (x3 = 15 min in total)\n",
241-
" time_left_for_this_task=180, \n",
242+
" time_left_for_this_task=180,\n",
242243
" n_jobs=1,\n",
243244
" seed=1,\n",
244245
" )\n",
245246
" automl.fit(feats[train_ind], y_true[train_ind])\n",
246-
" \n",
247+
"\n",
247248
" # Predict and evaluate\n",
248249
" y_hat = automl.predict(feats[test_ind])\n",
249-
" \n",
250+
"\n",
250251
" # Evaluate\n",
251252
" mae = mean_absolute_error(y_true[test_ind], y_hat)\n",
252253
" lipo_scores[name] = mae\n",
@@ -282,7 +283,9 @@
282283
],
283284
"source": [
284285
"# Prepare the ClinTox dataset\n",
285-
"smiles, y_true = load_dataset(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz\", \"CT_TOX\")\n",
286+
"smiles, y_true = load_dataset(\n",
287+
" \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz\", \"CT_TOX\"\n",
288+
")\n",
286289
"smiles = np.array([preprocess_smiles(smi) for smi in smiles])\n",
287290
"smiles = np.array([smi for smi in smiles if smi is not None])\n",
288291
"\n",
@@ -333,22 +336,21 @@
333336
"\n",
334337
"clintox_scores = {}\n",
335338
"for name, feats in X.items():\n",
336-
" \n",
337339
" # Train\n",
338340
" automl = autosklearn.classification.AutoSklearnClassifier(\n",
339-
" memory_limit=24576, \n",
340-
" # For practicality’s sake, limit this to 5 minutes! \n",
341+
" memory_limit=24576,\n",
342+
" # For practicality’s sake, limit this to 5 minutes!\n",
341343
" # (x3 = 15 min in total)\n",
342344
" time_left_for_this_task=180,\n",
343345
" n_jobs=1,\n",
344346
" seed=1,\n",
345347
" )\n",
346348
" automl.fit(feats[train_ind], y_true[train_ind])\n",
347-
" \n",
349+
"\n",
348350
" # Predict and evaluate\n",
349351
" y_hat = automl.predict_proba(feats[test_ind])\n",
350352
" y_hat = y_hat[:, 1]\n",
351-
" \n",
353+
"\n",
352354
" # Evaluate\n",
353355
" auroc = roc_auc_score(y_true[test_ind], y_hat)\n",
354356
" clintox_scores[name] = auroc\n",
@@ -395,7 +397,9 @@
395397
"BASE_CMPD_URI = \"https://github.com/rdkit/benchmarking_platform/raw/master/compounds/DUD/cmp_list_DUD\"\n",
396398
"CMPD_EXT = \".dat.gz\"\n",
397399
"\n",
398-
"BASE_SPLIT_URI = \"https://github.com/rdkit/benchmarking_platform/raw/master/query_lists/data_sets_I/DUD/training_DUD\"\n",
400+
"BASE_SPLIT_URI = (\n",
401+
"    \"https://github.com/rdkit/benchmarking_platform/raw/master/query_lists/data_sets_I/DUD/training_DUD\"\n",
402+
")\n",
399403
"SPLIT_EXT = \".pkl\"\n",
400404
"\n",
401405
"# Out of practicality, we only use the first 10 targets\n",
@@ -519,7 +523,6 @@
519523
"results = defaultdict(dict)\n",
520524
"\n",
521525
"for target in tqdm.tqdm(TARGETS, leave=False):\n",
522-
"\n",
523526
" # Load the structures (i.e. SMILES)\n",
524527
" df = get_compounds_for_target(target)\n",
525528
" n_actives = len(df[df[\"subset\"] == \"actives\"])\n",
@@ -543,7 +546,6 @@
543546
" train_decoy_ind = [i + n_actives for i in train_decoy_ind]\n",
544547
"\n",
545548
" for feat_name, feats in X.items():\n",
546-
"\n",
547549
" # Train the model\n",
548550
" knn = KNeighborsClassifier()\n",
549551
" train_ind = np.concatenate([train_active_ind, train_decoy_ind])\n",

docs/tutorials/add_your_own.ipynb

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,15 @@
5252
"from rdkit.Chem.rdMolDescriptors import CalcNumHeteroatoms\n",
5353
"\n",
5454
"smiles = dm.freesolv()[\"smiles\"].iloc[:3]\n",
55+
"\n",
56+
"\n",
5557
"def my_calculator(mol):\n",
5658
" \"\"\"My custom featurizer\"\"\"\n",
5759
" mol = dm.to_mol(mol)\n",
5860
" rng = np.random.default_rng(0)\n",
5961
" return [mol.GetNumAtoms(), mol.GetNumBonds(), CalcNumHeteroatoms(mol), rng.random()]\n",
6062
"\n",
63+
"\n",
6164
"# This directly works with the MoleculeTransformer\n",
6265
"mol_transf = MoleculeTransformer(my_calculator)\n",
6366
"mol_transf(smiles)"
@@ -113,9 +116,7 @@
113116
"attachments": {},
114117
"cell_type": "markdown",
115118
"metadata": {},
116-
"source": [
117-
"If your calculator can perform featurization of a batch of molecules in an efficient way, then you should implement the optional `batch_compute` method which will then be used by `MoleculeTransformer` instead of the default sequential or parallelization process. "
118-
]
119+
"source": "If your calculator can perform featurization of a batch of molecules in an efficient way, then you should implement the optional `batch_compute` method which will then be used by `MoleculeTransformer` instead of the default sequential or parallelization process."
119120
},
120121
{
121122
"cell_type": "code",
@@ -148,6 +149,7 @@
148149
"source": [
149150
"from molfeat.calc import SerializableCalculator\n",
150151
"\n",
152+
"\n",
151153
"class MyBatchableCalculator(SerializableCalculator):\n",
152154
" def __init__(self, random_seed=42, length=10):\n",
153155
" self.random_seed = random_seed\n",
@@ -157,15 +159,16 @@
157159
" def __call__(self, mol):\n",
158160
" print(\"We are in single compute mode!\")\n",
159161
" return self.rng.random(self.length)\n",
160-
" \n",
162+
"\n",
161163
" def __len__(self):\n",
162164
" return self.length\n",
163-
" \n",
165+
"\n",
164166
" def batch_compute(self, mols, **kwargs):\n",
165167
" # note that dm.parallelized information is passed along with the molecules list\n",
166168
" print(\"We are in batch mode!\")\n",
167169
" return self.rng.random((len(mols), self.length))\n",
168170
"\n",
171+
"\n",
169172
"mol_transf = MoleculeTransformer(MyBatchableCalculator())\n",
170173
"mol_transf(smiles)"
171174
]
@@ -179,7 +182,7 @@
179182
"source": [
180183
"## Define your own transformer\n",
181184
"The above example shows that in many cases, there's no direct need to create your own transformer class. You can simply use the `MoleculeTransformer` base class.\n",
182-
"In more complex cases, such as with pretrained models where batching would be advantageous, it is instead preferable to create your own subclass. "
185+
"In more complex cases, such as with pretrained models where batching would be advantageous, it is instead preferable to create your own subclass."
183186
]
184187
},
185188
{
@@ -220,7 +223,7 @@
220223
" \"\"\"Convert the molecule to a format that the model expects\"\"\"\n",
221224
" return self._featurizer(inputs)\n",
222225
"\n",
223-
" def _embed(self, mols: list, **kwargs):\n",
226+
" def _embed(self, mols: list, **kwargs):\n",
224227
" \"\"\"\n",
225228
" Embed the molecules using the pretrained model\n",
226229
" In this dummy example, we simply multiply the features by the importance of the feature\n",
@@ -256,7 +259,7 @@
256259
"cell_type": "markdown",
257260
"metadata": {},
258261
"source": [
259-
"Here is another example that shows how to extend Molfeat with an existing embedding language model for astrochemistry. \n",
262+
"Here is another example that shows how to extend Molfeat with an existing embedding language model for astrochemistry.\n",
260263
"\n",
261264
"```bash\n",
262265
"pip install astrochem_embedding\n",
@@ -286,20 +289,23 @@
286289
"from astrochem_embedding import VICGAE\n",
287290
"from molfeat.trans.pretrained import PretrainedMolTransformer\n",
288291
"\n",
292+
"\n",
289293
"class MyAstroChemFeaturizer(PretrainedMolTransformer):\n",
290294
" \"\"\"\n",
291-
" In this more practical example, we use embeddings from VICGAE a variance-invariance-covariance \n",
295+
" In this more practical example, we use embeddings from VICGAE a variance-invariance-covariance\n",
292296
" regularized GRU autoencoder trained on SELFIES strings.\n",
293297
" \"\"\"\n",
298+
"\n",
294299
" def __init__(self, *args, **kwargs):\n",
295-
" super().__init__(*args, **kwargs) \n",
300+
" super().__init__(*args, **kwargs)\n",
296301
" self.featurizer = VICGAE.from_pretrained()\n",
297-
" \n",
302+
"\n",
298303
" def _embed(self, smiles, **kwargs):\n",
299304
" return [self.featurizer.embed_smiles(x) for x in smiles]\n",
300305
"\n",
306+
"\n",
301307
"transformer = MyAstroChemFeaturizer(dtype=torch.float)\n",
302-
"transformer(dm.freesolv()[\"smiles\"][:10]).shape\n"
308+
"transformer(dm.freesolv()[\"smiles\"][:10]).shape"
303309
]
304310
},
305311
{
@@ -338,7 +344,7 @@
338344
"from molfeat.store import ModelInfo\n",
339345
"\n",
340346
"path = dm.fs.join(platformdirs.user_cache_dir(\"molfeat\"), \"custom_model_store\")\n",
341-
"store = ModelStore(model_store_bucket=path)\n",
347+
"store = ModelStore(model_store_root=path)\n",
342348
"len(store.available_models)"
343349
]
344350
},
@@ -384,18 +390,18 @@
384390
"source": [
385391
"# Let's define our model's info\n",
386392
"info = ModelInfo(\n",
387-
" name = \"my_foundation_model\",\n",
388-
" inputs = \"smiles\",\n",
393+
" name=\"my_foundation_model\",\n",
394+
" inputs=\"smiles\",\n",
389395
" type=\"pretrained\",\n",
390396
" group=\"my_group\",\n",
391397
" version=0,\n",
392398
" submitter=\"Datamol\",\n",
393399
" description=\"Solves chemistry!\",\n",
394400
" representation=\"vector\",\n",
395401
" require_3D=False,\n",
396-
" tags = [\"foundation_model\", \"random_forest\"],\n",
397-
" authors= [\"Datamol\"],\n",
398-
" reference = \"/fake/ref\"\n",
402+
" tags=[\"foundation_model\", \"random_forest\"],\n",
403+
" authors=[\"Datamol\"],\n",
404+
" reference=\"/fake/ref\",\n",
399405
")\n",
400406
"\n",
401407
"store.register(info)\n",

0 commit comments

Comments
 (0)