datamol-io
diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 3 additions & 3 deletions
diff --git a/docs/benchmark.ipynb b/docs/benchmark.ipynb
Lines changed: 26 additions & 24 deletions
diff --git a/docs/tutorials/add_your_own.ipynb b/docs/tutorials/add_your_own.ipynb
Lines changed: 24 additions & 18 deletions
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout the code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Set up Python
         uses: actions/setup-python@v4
 
@@ -20,10 +20,10 @@ jobs:
 
     steps:
       - name: Checkout the code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup mamba
-        uses: mamba-org/setup-micromamba@v1
+        uses: mamba-org/setup-micromamba@v2
         with:
           environment-file: env.yml
           environment-name: my_env
 
@@ -25,10 +25,10 @@ jobs:
 
     steps:
       - name: Checkout the code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup mamba
-        uses: mamba-org/setup-micromamba@v1
+        uses: mamba-org/setup-micromamba@v2
         with:
           environment-file: env.yml
           environment-name: my_env
 
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.9", "3.10"]
         os: ["ubuntu-latest"] #, "macos-latest", "windows-latest"]
         pytorch-version: ["1.13"]
 
@@ -34,10 +34,10 @@ jobs:
 
     steps:
       - name: Checkout the code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup mamba
-        uses: mamba-org/setup-micromamba@v1
+        uses: mamba-org/setup-micromamba@v2
         with:
           environment-file: env.yml
           environment-name: my_env
 
@@ -124,15 +124,15 @@
     "\n",
     "def preprocess_smiles(smi):\n",
     "    \"\"\"Preprocesses the SMILES string\"\"\"\n",
-    "    mol = dm.to_mol(smi, ordered=True, sanitize=False)    \n",
-    "    try: \n",
+    "    mol = dm.to_mol(smi, ordered=True, sanitize=False)\n",
+    "    try:\n",
     "        mol = dm.sanitize_mol(mol)\n",
-    "    except: # noqa: E722\n",
+    "    except:  # noqa: E722\n",
     "        mol = None\n",
-    "            \n",
-    "    if mol is None: \n",
+    "\n",
+    "    if mol is None:\n",
     "        return\n",
-    "        \n",
+    "\n",
     "    mol = dm.standardize_mol(mol, disconnect_metals=True)\n",
     "    remover = SaltRemover.SaltRemover()\n",
     "    mol = remover.StripMol(mol, dontRemoveEverything=True)\n",
@@ -144,7 +144,7 @@
     "    \"\"\"In line with common practice, we will use the scaffold split to evaluate our models\"\"\"\n",
     "    scaffolds = [dm.to_smiles(dm.to_scaffold_murcko(dm.to_mol(smi))) for smi in smiles]\n",
     "    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
-    "    return next(splitter.split(smiles, groups=scaffolds))\n"
+    "    return next(splitter.split(smiles, groups=scaffolds))"
    ]
   },
   {
@@ -158,7 +158,7 @@
     "# Setup the featurizers\n",
     "trans_ecfp = FPVecTransformer(kind=\"ecfp:6\", n_jobs=-1)\n",
     "trans_mordred = FPVecTransformer(kind=\"mordred\", replace_nan=True, n_jobs=-1)\n",
-    "trans_chemberta = PretrainedHFTransformer(kind='ChemBERTa-77M-MLM', notation='smiles')"
+    "trans_chemberta = PretrainedHFTransformer(kind=\"ChemBERTa-77M-MLM\", notation=\"smiles\")"
    ]
   },
   {
@@ -176,7 +176,9 @@
    "outputs": [],
    "source": [
     "# Prepare the Lipophilicity dataset\n",
-    "smiles, y_true = load_dataset(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\", \"exp\")\n",
+    "smiles, y_true = load_dataset(\n",
+    "    \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv\", \"exp\"\n",
+    ")\n",
     "smiles = np.array([preprocess_smiles(smi) for smi in smiles])\n",
     "smiles = np.array([smi for smi in smiles if dm.to_mol(smi) is not None])\n",
     "\n",
@@ -232,21 +234,20 @@
     "\n",
     "lipo_scores = {}\n",
     "for name, feats in X.items():\n",
-    "    \n",
     "    # Train\n",
     "    automl = autosklearn.regression.AutoSklearnRegressor(\n",
-    "        memory_limit=24576, \n",
-    "        # For practicality’s sake, limit this to 5 minutes! \n",
+    "        memory_limit=24576,\n",
+    "        # For practicality’s sake, limit this to 5 minutes!\n",
     "        # (x3 = 15 min in total)\n",
-    "        time_left_for_this_task=180,  \n",
+    "        time_left_for_this_task=180,\n",
     "        n_jobs=1,\n",
     "        seed=1,\n",
     "    )\n",
     "    automl.fit(feats[train_ind], y_true[train_ind])\n",
-    "    \n",
+    "\n",
     "    # Predict and evaluate\n",
     "    y_hat = automl.predict(feats[test_ind])\n",
-    "    \n",
+    "\n",
     "    # Evaluate\n",
     "    mae = mean_absolute_error(y_true[test_ind], y_hat)\n",
     "    lipo_scores[name] = mae\n",
@@ -282,7 +283,9 @@
    ],
    "source": [
     "# Prepare the ClinTox dataset\n",
-    "smiles, y_true = load_dataset(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz\", \"CT_TOX\")\n",
+    "smiles, y_true = load_dataset(\n",
+    "    \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz\", \"CT_TOX\"\n",
+    ")\n",
     "smiles = np.array([preprocess_smiles(smi) for smi in smiles])\n",
     "smiles = np.array([smi for smi in smiles if smi is not None])\n",
     "\n",
@@ -333,22 +336,21 @@
     "\n",
     "clintox_scores = {}\n",
     "for name, feats in X.items():\n",
-    "    \n",
     "    # Train\n",
     "    automl = autosklearn.classification.AutoSklearnClassifier(\n",
-    "        memory_limit=24576, \n",
-    "        # For practicality’s sake, limit this to 5 minutes! \n",
+    "        memory_limit=24576,\n",
+    "        # For practicality’s sake, limit this to 5 minutes!\n",
     "        # (x3 = 15 min in total)\n",
     "        time_left_for_this_task=180,\n",
     "        n_jobs=1,\n",
     "        seed=1,\n",
     "    )\n",
     "    automl.fit(feats[train_ind], y_true[train_ind])\n",
-    "    \n",
+    "\n",
     "    # Predict and evaluate\n",
     "    y_hat = automl.predict_proba(feats[test_ind])\n",
     "    y_hat = y_hat[:, 1]\n",
-    "    \n",
+    "\n",
     "    # Evaluate\n",
     "    auroc = roc_auc_score(y_true[test_ind], y_hat)\n",
     "    clintox_scores[name] = auroc\n",
@@ -395,7 +397,9 @@
     "BASE_CMPD_URI = \"https://github.com/rdkit/benchmarking_platform/raw/master/compounds/DUD/cmp_list_DUD\"\n",
     "CMPD_EXT = \".dat.gz\"\n",
     "\n",
-    "BASE_SPLIT_URI = \"https://github.com/rdkit/benchmarking_platform/raw/master/query_lists/data_sets_I/DUD/training_DUD\"\n",
+    "BASE_SPLIT_URI = (\n",
+    "    \"https://github.com/rdkit/benchmarking_platform/raw/master/query_lists/data_sets_I/DUD/training_DUD\"\n",
+    ")\n",
     "SPLIT_EXT = \".pkl\"\n",
     "\n",
     "# Out of practicality, we only use the first 10 targets\n",
@@ -519,7 +523,6 @@
     "results = defaultdict(dict)\n",
     "\n",
     "for target in tqdm.tqdm(TARGETS, leave=False):\n",
-    "\n",
     "    # Load the structures (i.e. SMILES)\n",
     "    df = get_compounds_for_target(target)\n",
     "    n_actives = len(df[df[\"subset\"] == \"actives\"])\n",
@@ -543,7 +546,6 @@
     "    train_decoy_ind = [i + n_actives for i in train_decoy_ind]\n",
     "\n",
     "    for feat_name, feats in X.items():\n",
-    "\n",
     "        # Train the model\n",
     "        knn = KNeighborsClassifier()\n",
     "        train_ind = np.concatenate([train_active_ind, train_decoy_ind])\n",
 
@@ -52,12 +52,15 @@
     "from rdkit.Chem.rdMolDescriptors import CalcNumHeteroatoms\n",
     "\n",
     "smiles = dm.freesolv()[\"smiles\"].iloc[:3]\n",
+    "\n",
+    "\n",
     "def my_calculator(mol):\n",
     "    \"\"\"My custom featurizer\"\"\"\n",
     "    mol = dm.to_mol(mol)\n",
     "    rng = np.random.default_rng(0)\n",
     "    return [mol.GetNumAtoms(), mol.GetNumBonds(), CalcNumHeteroatoms(mol), rng.random()]\n",
     "\n",
+    "\n",
     "# This directly works with the MoleculeTransformer\n",
     "mol_transf = MoleculeTransformer(my_calculator)\n",
     "mol_transf(smiles)"
@@ -113,9 +116,7 @@
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "If your calculator can perform featurization of a batch of molecules in an efficient way, then you should implement the optional `batch_compute` method which will then be used by `MoleculeTransformer` instead of the default sequential or parallelization process. "
-   ]
+   "source": "If your calculator can perform featurization of a batch of molecules in an efficient way, then you should implement the optional `batch_compute` method which will then be used by `MoleculeTransformer` instead of the default sequential or parallelization process."
   },
   {
    "cell_type": "code",
@@ -148,6 +149,7 @@
    "source": [
     "from molfeat.calc import SerializableCalculator\n",
     "\n",
+    "\n",
     "class MyBatchableCalculator(SerializableCalculator):\n",
     "    def __init__(self, random_seed=42, length=10):\n",
     "        self.random_seed = random_seed\n",
@@ -157,15 +159,16 @@
     "    def __call__(self, mol):\n",
     "        print(\"We are in single compute mode!\")\n",
     "        return self.rng.random(self.length)\n",
-    "    \n",
+    "\n",
     "    def __len__(self):\n",
     "        return self.length\n",
-    "        \n",
+    "\n",
     "    def batch_compute(self, mols, **kwargs):\n",
     "        # note that dm.parallelized information is passed along with the molecules list\n",
     "        print(\"We are in batch mode!\")\n",
     "        return self.rng.random((len(mols), self.length))\n",
     "\n",
+    "\n",
     "mol_transf = MoleculeTransformer(MyBatchableCalculator())\n",
     "mol_transf(smiles)"
    ]
@@ -179,7 +182,7 @@
    "source": [
     "## Define your own transformer\n",
     "The above example shows that in many cases, there's no direct need to create your own transformer class. You can simply use the `MoleculeTransformer` base class.\n",
-    "In more complex cases, such as with pretrained models where batching would be advantageous, it is instead preferable to create your own subclass. "
+    "In more complex cases, such as with pretrained models where batching would be advantageous, it is instead preferable to create your own subclass."
    ]
   },
   {
@@ -220,7 +223,7 @@
     "        \"\"\"Convert the molecule to a format that the model expects\"\"\"\n",
     "        return self._featurizer(inputs)\n",
     "\n",
-    "    def _embed(self,  mols: list, **kwargs):\n",
+    "    def _embed(self, mols: list, **kwargs):\n",
     "        \"\"\"\n",
     "        Embed the molecules using the pretrained model\n",
     "        In this dummy example, we simply multiply the features by the importance of the feature\n",
@@ -256,7 +259,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Here is another example that shows how to extend Molfeat with an existing embedding language model for astrochemistry. \n",
+    "Here is another example that shows how to extend Molfeat with an existing embedding language model for astrochemistry.\n",
     "\n",
     "```bash\n",
     "pip install astrochem_embedding\n",
@@ -286,20 +289,23 @@
     "from astrochem_embedding import VICGAE\n",
     "from molfeat.trans.pretrained import PretrainedMolTransformer\n",
     "\n",
+    "\n",
     "class MyAstroChemFeaturizer(PretrainedMolTransformer):\n",
     "    \"\"\"\n",
-    "    In this more practical example, we use embeddings from VICGAE a variance-invariance-covariance \n",
+    "    In this more practical example, we use embeddings from VICGAE a variance-invariance-covariance\n",
     "    regularized GRU autoencoder trained on SELFIES strings.\n",
     "    \"\"\"\n",
+    "\n",
     "    def __init__(self, *args, **kwargs):\n",
-    "        super().__init__(*args, **kwargs)        \n",
+    "        super().__init__(*args, **kwargs)\n",
     "        self.featurizer = VICGAE.from_pretrained()\n",
-    "    \n",
+    "\n",
     "    def _embed(self, smiles, **kwargs):\n",
     "        return [self.featurizer.embed_smiles(x) for x in smiles]\n",
     "\n",
+    "\n",
     "transformer = MyAstroChemFeaturizer(dtype=torch.float)\n",
-    "transformer(dm.freesolv()[\"smiles\"][:10]).shape\n"
+    "transformer(dm.freesolv()[\"smiles\"][:10]).shape"
    ]
   },
   {
@@ -338,7 +344,7 @@
     "from molfeat.store import ModelInfo\n",
     "\n",
     "path = dm.fs.join(platformdirs.user_cache_dir(\"molfeat\"), \"custom_model_store\")\n",
-    "store = ModelStore(model_store_bucket=path)\n",
+    "store = ModelStore(model_store_root=path)\n",
     "len(store.available_models)"
    ]
   },
@@ -384,18 +390,18 @@
    "source": [
     "# Let's define our model's info\n",
     "info = ModelInfo(\n",
-    "    name = \"my_foundation_model\",\n",
-    "    inputs = \"smiles\",\n",
+    "    name=\"my_foundation_model\",\n",
+    "    inputs=\"smiles\",\n",
     "    type=\"pretrained\",\n",
     "    group=\"my_group\",\n",
     "    version=0,\n",
     "    submitter=\"Datamol\",\n",
     "    description=\"Solves chemistry!\",\n",
     "    representation=\"vector\",\n",
     "    require_3D=False,\n",
-    "    tags = [\"foundation_model\", \"random_forest\"],\n",
-    "    authors= [\"Datamol\"],\n",
-    "    reference = \"/fake/ref\"\n",
+    "    tags=[\"foundation_model\", \"random_forest\"],\n",
+    "    authors=[\"Datamol\"],\n",
+    "    reference=\"/fake/ref\",\n",
     ")\n",
     "\n",
     "store.register(info)\n",