
Commit 70f64e7

Handling NASIS and ontology terms directly
1 parent 026ef1c commit 70f64e7

4 files changed

Lines changed: 218 additions & 34 deletions


script/README_menu_manager.md

Lines changed: 47 additions & 19 deletions
@@ -9,6 +9,8 @@ assembles them into a `schema.yaml` suitable for use with DataHarmonizer.
 
 ## Quick-start workflow
 
+**Note: some terminals invoke Python as `python3`. Also, `menu_manager.py` must be run from inside the folder in which you want `schema.yaml` generated; if its location is not on your shell `PATH`, reference it by a relative path to the DataHarmonizer `script/` folder, e.g. `python ../../../script/menu_manager.py`.**
+
 ```bash
 # 1. Add sources (auto-detects type, downloads, adds to menu_config.yaml)
 python menu_manager.py -a https://example.org/some-valueset.json
@@ -23,6 +25,10 @@ python menu_manager.py -b
 python menu_manager.py -f all -c -b
 ```
 
+One strategy for a new menu-management installation is to start with a few libraries you already know contain enumerations/picklists useful to your project; for a soil project, for example:
+`python menu_manager.py -a https://example.org/some-valueset.json`
+
+
 ---
 
 ## Command reference
@@ -122,6 +128,7 @@ python menu_manager.py -a https://example.org/some-valueset.json
 |---|---|
 | `OntologyAPI` | URL matches `aims.fao.org/aos/agrovoc/{id}` (pre-download) |
 | `OntologyAPI` | URL matches `snomed.info/id/{conceptId}` (pre-download) |
+| `OntologyAPI` | Bare CURIE `ENVO:00010483`, OBO shorthand `ENVO_00010483`, or OBO IRI `http://purl.obolibrary.org/obo/ENVO_00010483` (pre-download; routes to configured API or OLS4) |
 | `NSDBSNT` | URL contains `/snt/` under the NSDB soil domain |
 | `NSDBSLT` | URL contains `/slt/` under the NSDB soil domain |
 | `NSDB` | URL matches `sis.agr.gc.ca/cansis/nsdb/soil` prefix |
@@ -135,6 +142,7 @@ python menu_manager.py -a https://example.org/some-valueset.json
 | `NAPCSCanada` | CSV content with NAPCS-specific column headers |
 | `AgriFoodCA` | GitHub directory URL for `agrifooddatacanada/picklists_for_schemas` (pre-download) |
 | `AgriFoodCA` | CSV first row matches `,title,description,keywords,source` (content-based) |
+| `NASIS` | URL from `nrcs.usda.gov` containing `NASIS` with a `.pdf` extension |
 
 ---
 
@@ -156,6 +164,34 @@ python menu_manager.py -b
 still falls back to OLS4 and auto-detects the `http://snomed.info/id/` IRI base
 from OLS4 ontology metadata — no explicit `iri_base` key required.
 
+### OBO ontology terms (ENVO, GO, UBERON, …)
+
+Pass either a bare CURIE or a full OBO IRI. The prefix is looked up in the
+`apis` block of `menu_config.yaml` to choose the API (defaults to OLS4). The
+term label and description are fetched immediately; hierarchy expansion runs
+separately with `-l`.
+
+```bash
+# OBO shorthand (underscore + numeric ID)
+python menu_manager.py -a ENVO_00010483
+python menu_manager.py -l ENVO_00010483
+
+# CURIE form (colon separator)
+python menu_manager.py -a ENVO:00010483
+python menu_manager.py -l ENVO_00010483
+
+# OBO IRI form
+python menu_manager.py -a http://purl.obolibrary.org/obo/ENVO_00010483
+python menu_manager.py -l ENVO_00010483
+
+# An ontology whose prefix is listed under bioportal in menu_config.yaml
+# is routed to BioPortal automatically
+python menu_manager.py -a MESH:D001234
+python menu_manager.py -l MESH_D001234
+```
+
+The generated source key is always `{PREFIX}_{localID}` (e.g. `ENVO_00010483`).
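The three accepted input forms all normalize to the same `{PREFIX}_{localID}` source key. A minimal standalone sketch of that normalization, with regexes adapted from the `source_ontologyapi.py` changes in this commit (the helper name `source_key` is hypothetical):

```python
import re

# Regexes adapted from the _CURIE_INPUT_PAT / _OBO_SHORTHAND_PAT /
# _OBO_IRI_INPUT_PAT patterns added in source_ontologyapi.py
_CURIE = re.compile(r'^([A-Za-z][A-Za-z0-9]*):(\w+)$')
_SHORTHAND = re.compile(r'^([A-Za-z][A-Za-z0-9]*)_(\d+)$')
_OBO_IRI = re.compile(r'https?://purl\.obolibrary\.org/obo/([A-Za-z][A-Za-z0-9]*)_(\w+)')

def source_key(text):
    """Return the {PREFIX}_{localID} source key for any accepted form, else None."""
    for pat in (_CURIE, _SHORTHAND, _OBO_IRI):
        m = pat.match(text)
        if m:
            return f"{m.group(1).upper()}_{m.group(2)}"
    return None

for form in ("ENVO:00010483", "ENVO_00010483",
             "http://purl.obolibrary.org/obo/ENVO_00010483"):
    print(source_key(form))  # → ENVO_00010483 for all three
```

Anything that matches none of the three patterns falls through to the other detectors in `add_source`.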
 
 ### AGROVOC
 
 ```bash
@@ -225,29 +261,21 @@ NASIS publishes all domain tables (controlled vocabularies for every categorical
 field in the NASIS soil survey database) as a single PDF. Each domain becomes
 one LinkML enum. Requires `pypdf` (`pip install pypdf`).
 
-NASIS is not auto-detected by `-a`; add it manually to `menu_config.yaml`:
-
-```yaml
-sources:
-  NASIS:
-    title: NASIS Database Metadata
-    name: NASIS
-    version: '7.4.3'
-    content_type: NASIS
-    file_format: pdf
-    concise: true
-    reachable_from:
-      source_ontology: https://www.nrcs.usda.gov/sites/default/files/2025-07/NASIS%207.4.3%20Domains.pdf
-    download_date: '2026-04-29'
-    description: The USDA NRCS National Soil Information System (NASIS) domain tables
-      define the controlled vocabularies for all categorical fields in the NASIS
-      soil survey database.
+```bash
+python menu_manager.py -a "https://www.nrcs.usda.gov/sites/default/files/2025-07/NASIS%207.4.3%20Domains.pdf"
+python menu_manager.py -b
 ```
 
-Then download and process:
+The URL is auto-detected as NASIS: the PDF is saved to `sources/NASIS.pdf`, a
+source entry (with `concise: true`) is added to `menu_config.yaml`, and
+`process_nasis_source` runs immediately. To update to a newer release, remove
+the `NASIS` key from `menu_config.yaml` first and re-run `-a` with the new URL.
+
+To process a manually placed PDF (already at `sources/NASIS.pdf`) without
+re-downloading:
 
 ```bash
-python menu_manager.py -c NASIS  # downloads PDF on first run, writes sources/NASIS.yaml
+python menu_manager.py -c NASIS  # writes sources/NASIS.yaml
 python menu_manager.py -b  # adds NASIS enums to schema.yaml
 ```
 
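The auto-detection just described keys off the URL shape and recovers the release number from it. A small self-contained sketch of that check (`detect_nasis` is a hypothetical name; the real detector is `match_nasis` in `source_nasis.py`):

```python
import re
import urllib.parse

def detect_nasis(url):
    """Return the NASIS version if *url* looks like a NASIS Domains PDF, else None."""
    # Percent-decode first so 'NASIS%207.4.3' becomes 'NASIS 7.4.3'
    decoded = urllib.parse.unquote(url)
    if ("nrcs.usda.gov" not in decoded or "NASIS" not in decoded
            or not decoded.lower().endswith(".pdf")):
        return None
    m = re.search(r'NASIS\s+([\d.]+)', decoded)
    return m.group(1) if m else "unknown"

url = "https://www.nrcs.usda.gov/sites/default/files/2025-07/NASIS%207.4.3%20Domains.pdf"
print(detect_nasis(url))  # → 7.4.3
```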

script/menu_manager.py

Lines changed: 6 additions & 1 deletion
@@ -177,6 +177,7 @@
     fetch_api_graph,
     process_skos_source,
     match_snomed,
+    match_ontology_term,
 )
 from source_linkml import (
     apply_sorted_prefixes,
@@ -934,13 +935,17 @@ def add_source(urls, config_file=MENU_CONFIG):
         # Unescape HTML entities (e.g. &amp; → &) so the server receives a valid URL
         url = html.unescape(url)
 
-        # Pre-download detectors: no file download needed for these URL patterns
+        # Pre-download detectors: handle their own download (or need none)
         if match_agrovoc(url, config_file):
            continue
        if match_snomed(url, config_file):
            continue
+       if match_ontology_term(url, config_file):
+           continue
        if match_agrifood_dir(url, config_file):
            continue
+       if match_nasis(url, config_file):
+           continue
 
         print(f"Fetching {url} ...")
         tmp_fd, tmp_path = tempfile.mkstemp()
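The ordering above matters: the detectors run first-match-wins, and only URLs no detector claims fall through to the generic download-and-sniff path. A toy sketch of the dispatch pattern (the detector names here are hypothetical, not the real ones):

```python
def match_pdf(url):
    # Claims PDF URLs
    return url.lower().endswith(".pdf")

def match_csv(url):
    # Claims CSV URLs
    return url.lower().endswith(".csv")

def route(url, detectors=(match_pdf, match_csv)):
    """Return the name of the first detector that claims *url*, else 'download'."""
    for detector in detectors:
        if detector(url):
            return detector.__name__
    return "download"  # fall through to the generic fetch path

print(route("https://example.org/domains.pdf"))  # → match_pdf
print(route("https://example.org/terms.json"))   # → download
```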

script/menu_manager/source_nasis.py

Lines changed: 63 additions & 14 deletions
@@ -10,15 +10,17 @@
 
 import os
 import re
+import subprocess
 import sys
-import urllib.request
+import urllib.parse
 import yaml
 
 from source_utils import (
     add_permissible_value,
     IndentedDumper,
     make_config_schema,
-    BROWSER_HEADERS,
+    make_source_entry,
+    write_config,
     MENU_CONFIG,
 )
 
@@ -124,13 +126,22 @@ def _require_pypdf():
 
 
 def fetch_pdf(url, dest_path):
-    """Download *url* to *dest_path* using browser-like headers."""
-    req = urllib.request.Request(url, headers=BROWSER_HEADERS)
-    with urllib.request.urlopen(req) as resp:
-        data = resp.read()
-    with open(dest_path, "wb") as f:
-        f.write(data)
-    print(f"  Downloaded {url} → {dest_path} ({len(data):,} bytes)")
+    """Download *url* to *dest_path* via curl.
+
+    Uses curl rather than urllib so that HTTP/2 is negotiated automatically.
+    The NRCS server (Akamai CDN) drops urllib's HTTP/1.1 connections but
+    serves curl without issue.
+    """
+    print(f"  Downloading {url} ...")
+    result = subprocess.run(
+        ["curl", "-L", "--silent", "--show-error", "-o", dest_path, url],
+        capture_output=True, text=True,
+    )
+    if result.returncode != 0:
+        print(f"  curl error: {result.stderr.strip()}", file=sys.stderr)
+        sys.exit(1)
+    size = os.path.getsize(dest_path)
+    print(f"  Saved {dest_path} ({size:,} bytes)")
 
 
 # ---------------------------------------------------------------------------
@@ -379,12 +390,50 @@ def process_nasis_source(key, source, locales=None):
     print(f"Updated {yaml_path}")
 
 
-def match_nasis(url, tmp_path, config_file=MENU_CONFIG):
+def match_nasis(url, config_file=MENU_CONFIG):
     """Return True if *url* is a NASIS Domains PDF URL and was handled.
 
-    Placeholder for future -a auto-detection support.
+    Pre-download detector (no tmp_path): downloads the PDF itself via curl
+    so that HTTP/2 is used. urllib's HTTP/1.1 is dropped by the Akamai CDN
+    that serves nrcs.usda.gov.
+
+    Matches URLs like:
+    https://www.nrcs.usda.gov/sites/default/files/2025-07/NASIS%207.4.3%20Domains.pdf
     """
-    if "nrcs.usda.gov" not in url or "NASIS" not in url or not url.lower().endswith(".pdf"):
+    decoded = urllib.parse.unquote(url)
+    if "nrcs.usda.gov" not in decoded or "NASIS" not in decoded or not decoded.lower().endswith(".pdf"):
         return False
-    # TODO: implement -a add-source detection for NASIS PDF URLs
-    return False
+
+    key = "NASIS"
+
+    with open(config_file) as f:
+        config = yaml.safe_load(f) or {}
+    if key in config.get("sources", {}):
+        print(f"  Skipping {url}: source key '{key}' already exists in {config_file}",
+              file=sys.stderr)
+        return True
+
+    version_m = re.search(r'NASIS\s+([\d.]+)', decoded)
+    version = version_m.group(1) if version_m else None
+
+    pdf_path = f"sources/{key}.pdf"
+    fetch_pdf(url, pdf_path)
+
+    entry = make_source_entry(
+        key, url, "NASIS", "pdf",
+        title="NASIS Database Metadata",
+        version=version,
+        description=(
+            "The USDA NRCS National Soil Information System (NASIS) domain tables"
+            " define the controlled vocabularies for all categorical fields in the"
+            " NASIS soil survey database."
+        ),
+    )
+    entry["concise"] = True
+
+    config.setdefault("sources", {})[key] = entry
+    write_config(config, config_file)
+    print(f"Added source '{key}' to {config_file}")
+
+    process_nasis_source(key, entry)
+    return True

script/menu_manager/source_ontologyapi.py

Lines changed: 102 additions & 0 deletions
@@ -11,6 +11,7 @@
 fetch_api_graph(ontology, term_id, apis=None, locales=None)
 process_skos_source(key, source, config_file=None, locales=None)
 match_snomed(url, config_file=MENU_CONFIG)
+match_ontology_term(url, config_file=MENU_CONFIG)
 """
 
 import json
@@ -476,3 +477,104 @@ def match_snomed(url, config_file=MENU_CONFIG):
     print(f"Added source '{key}' (title={title!r}, version={version!r}, "
           f"description={'set' if description else 'not available'}) to {config_file}")
     return True
+
+
+# Matches a bare CURIE like ENVO:00010483 or GO:0008150 (letter-started prefix, colon, local ID)
+_CURIE_INPUT_PAT = re.compile(r'^([A-Za-z][A-Za-z0-9]*):([\w]+)$')
+# Matches OBO shorthand with underscore: ENVO_00010483, GO_0008150 (numeric local part)
+_OBO_SHORTHAND_PAT = re.compile(r'^([A-Za-z][A-Za-z0-9]*)_(\d+)$')
+# Matches an OBO Foundry IRI: http(s)://purl.obolibrary.org/obo/PREFIX_localid
+_OBO_IRI_INPUT_PAT = re.compile(r'https?://purl\.obolibrary\.org/obo/([A-Za-z][A-Za-z0-9]*)_([\w]+)')
+
+
+def _find_api_for_prefix(prefix, apis):
+    """Return (api_name, api_conf) for the first API whose ontologies list contains *prefix*.
+
+    Comparison is case-insensitive. Falls back to ('ols', apis['ols']) when no
+    explicit match is found, since OLS4 accepts any OBO ontology by default.
+    """
+    prefix_upper = prefix.upper()
+    for name, conf in (apis or {}).items():
+        ontologies = [o.upper() for o in (conf.get("ontologies") or [])]
+        if prefix_upper in ontologies:
+            return name, conf
+    return "ols", (apis or {}).get("ols") or {}
+
+
+def match_ontology_term(url, config_file=MENU_CONFIG):
+    """Return True if *url* is an ontology term CURIE, OBO shorthand, or OBO IRI and was handled.
+
+    Accepted forms:
+        ENVO:00010483                                  (bare CURIE, colon separator)
+        ENVO_00010483                                  (OBO shorthand, underscore + numeric ID)
+        http://purl.obolibrary.org/obo/ENVO_00010483   (OBO Foundry IRI)
+
+    Looks up the prefix in the `apis` block of menu_config.yaml to find which
+    configured API handles the ontology (defaults to OLS4 when none claim it).
+    The configured API is written to reachable_from for -l expansion; term
+    label and description are always fetched from OLS4 (which is public and
+    free) regardless of which API will be used for hierarchy expansion.
+    """
+    prefix = term_id = None
+
+    m = _CURIE_INPUT_PAT.match(url)
+    if m:
+        prefix, term_id = m.group(1), m.group(2)
+    else:
+        m = _OBO_SHORTHAND_PAT.match(url)
+        if m:
+            prefix, term_id = m.group(1), m.group(2)
+        else:
+            m = _OBO_IRI_INPUT_PAT.match(url)
+            if m:
+                prefix, term_id = m.group(1).upper(), m.group(2)
+
+    if not prefix:
+        return False
+
+    with open(config_file) as _cf:
+        config = yaml.safe_load(_cf) or {}
+
+    apis = config.get("apis") or {}
+    api_name, api_conf = _find_api_for_prefix(prefix, apis)
+
+    key = f"{prefix}_{term_id}"
+    curie = f"{prefix}:{term_id}"
+
+    if key in config.get("sources", {}):
+        print(f"  Skipping {url}: source key '{key}' already exists in {config_file}",
+              file=sys.stderr)
+        return True
+
+    api_type = "sparql" if _get_type_conf(api_conf, "sparql") else "rest"
+
+    # Always use OLS4 for the initial label/description lookup — it is public
+    # and free, and avoids auth issues with BioPortal or SPARQL endpoints.
+    # The api_name/api_type in the source entry controls -l routing only.
+    ols_conf = apis.get("ols") or {}
+    iri_base = resolve_ols4_iri_base(prefix, ols_conf)
+    concept_iri = iri_base + term_id
+
+    print(f"  Fetching OLS4 ontology metadata for {prefix} ...")
+    meta = _fetch_ols4_ontology_meta(prefix, ols_conf)
+    version = meta.get("version") or None
+
+    print(f"  Fetching OLS4 term info for {concept_iri} ...")
+    term_info = _fetch_ols4_term_info(prefix, concept_iri, ols_conf)
+    title = term_info["label"] or key
+    description = term_info["description"] or None
+
+    entry = make_source_entry(key, concept_iri, "OntologyAPI", "json",
+                              title=title, version=version, description=description)
+    entry["prefixes"] = {prefix: iri_base}
+    entry["reachable_from"] = {
+        "api": {api_name: {"type": api_type}},
+        "source_nodes": [curie],
+        "include_self": True,
+    }
+
+    config.setdefault("sources", {})[key] = entry
+    write_config(config, config_file)
+    print(f"Added source '{key}' (api={api_name}, title={title!r}) to {config_file}")
+    print(f"  Run: menu_manager.py -l {key} to expand the hierarchy via {api_name}")
+    return True
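The prefix-to-API routing added here is a simple case-insensitive membership test with an OLS4 fallback. A self-contained restatement of that lookup (the shape of the `apis` mapping below is an assumption for illustration, inferred from the docstrings in this commit):

```python
def find_api_for_prefix(prefix, apis):
    """Return (api_name, api_conf); fall back to OLS4 when no API claims the prefix."""
    prefix_upper = prefix.upper()
    for name, conf in (apis or {}).items():
        # An API "claims" a prefix by listing it under its ontologies key
        ontologies = [o.upper() for o in (conf.get("ontologies") or [])]
        if prefix_upper in ontologies:
            return name, conf
    return "ols", (apis or {}).get("ols") or {}

# Hypothetical apis block for illustration only
apis = {
    "ols": {"url": "https://www.ebi.ac.uk/ols4/api/"},
    "bioportal": {"url": "https://data.bioontology.org/", "ontologies": ["MESH"]},
}
print(find_api_for_prefix("MESH", apis)[0])  # → bioportal
print(find_api_for_prefix("ENVO", apis)[0])  # → ols
```

Because the fallback is unconditional, a prefix unknown to every configured API still resolves to OLS4 rather than failing.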
