gdcc · 4tikhonov · May 29, 2025 · May 29, 2025 · May 29, 2025 · May 29, 2025
diff --git a/.env b/.env
@@ -1 +1,2 @@
 CACHEDIR=./cache
+DATAVERSES=https://iqss.github.io/dataverse-installations/data/data.json
diff --git a/Dockerfile b/Dockerfile
@@ -15,8 +15,6 @@ COPY requirements.txt .
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install uv
-RUN pip install nemo_toolkit['asr']
-#RUN pip install git+https://github.com/Dans-labs/pyDataverse@development#egg=pyDataverse
 
 # Copy application code
 COPY app /app/app

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -7,9 +7,10 @@ services:
     env_file:
       - .env
     ports:
-      - "8000:8000"
+      - "8100:8000"
     environment:
       - PYTHONUNBUFFERED=1
+      - "DATAVERSES"
       - MCP_BASE_URL=http://localhost:8000/mcp
       - "CACHEDIR"
       - "KAGGLE_USERNAME"

diff --git a/semantic_croissant/server.py b/semantic_croissant/server.py
@@ -20,9 +20,10 @@
 from mcp.server.session import ServerSession
 import mcp.types as types
 from pyDataverse.Croissant import Croissant
+import requests
 #from mcp.server.lowlevel import TextContent
 #from mcp.schema import TextContent
-from utils.MultiMedia import MultiMedia
+#from utils.MultiMedia import MultiMedia
 import pydoi
 
 from utils.dataframe import CroissantRecipe
@@ -187,7 +188,7 @@ async def list_tools() -> list[types.Tool]:
             types.Tool(
                 name="get_croissant_record",
                 endpoint="/get_croissant_record",
-                description="Convert a dataset to Croissant ML format with get_croissant_record tool",
+                description="Convert a dataset to Croissant ML format with get_croissant_record tool and explore the dataset with DOI or handle.",
                 inputSchema={
                     "type": "object",
                     "required": ["doi"],
@@ -209,6 +210,53 @@ async def list_tools() -> list[types.Tool]:
                     },
                 },
             ),
+            types.Tool(
+                name="overview",
+                endpoint="/overview",
+                description="Get an overview of the Dataverse installations around the world sorted by country. Entrance point for the overview tools if no hosts are provided.",
+                inputSchema={
+                    "type": "object",
+                    "required": [],
+                    "properties": {},
+                },
+            ),
+            types.Tool(
+                name="overview_datasets",
+                endpoint="/overview/datasets",
+                description="Get an overview of the Dataverse datasets statistics by host",
+                inputSchema={
+                    "type": "object",
+                    "required": ["host"],
+                    "properties": {
+                        "host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"}
+                    },
+                },
+            ),
+            types.Tool(
+                name="overview_files",
+                endpoint="/overview/files",
+                description="Get an overview of the Dataverse files statistics by host",
+                inputSchema={
+                    "type": "object",
+                    "required": ["host"],
+                    "properties": {
+                        "host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"}
+                    },
+                },
+            ),
+            types.Tool(
+                name="search_datasets",
+                endpoint="/search/datasets",
+                description="Search for datasets in a Dataverse installation",
+                inputSchema={
+                    "type": "object",
+                    "required": ["host", "query"],
+                    "properties": {
+                        "host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"},
+                        "query": {"type": "string", "description": "Query to search for datasets"}
+                    },
+                },
+            ),
         ]
         return tools
 
@@ -345,6 +393,59 @@ async def run_fetch_website(request: Request):
             #return JSONResponse(content=serialized_result)
             return Response(content=result, media_type="text/html")
 
+        async def run_get_overview(request: Request):
+            """Return the installations listed in the DATAVERSES JSON document."""
+            # Bound the wait so an unreachable registry cannot hang the endpoint.
+            data = requests.get(os.environ.get("DATAVERSES"), timeout=30)
+            return JSONResponse(content={"installations": data.json()['installations']})
+
+        async def run_get_overview_datasets(request: Request):
+            """List datasets on the requested Dataverse host (match-all search)."""
+            source = request.query_params if request.method == "GET" else await request.json()
+            host = source.get("host")
+            if not host:
+                # "host" is required; answer 400 rather than crash downstream.
+                return JSONResponse(content={"error": "host is required"}, status_code=400)
+            return search_datasets(host, "")
+
+        async def run_search_datasets(request: Request):
+            """Search datasets on a Dataverse host using the caller's query."""
+            # GET carries arguments in the query string, anything else as JSON.
+            source = request.query_params if request.method == "GET" else await request.json()
+            host = source.get("host")
+            if not host:
+                # "host" is required; answer 400 rather than crash downstream.
+                return JSONResponse(content={"error": "host is required"}, status_code=400)
+            return search_datasets(host, source.get("query"))
+
+        def search_datasets(host: str, query: str):
+            """Search datasets on a Dataverse host; a falsy query matches all."""
+            # Let requests URL-encode the query so '&' or '#' in user input
+            # cannot inject or truncate search parameters.
+            params = {"q": query or "*", "type": "dataset"}
+            if not host.startswith(("http://", "https://")):
+                host = f"https://{host}"
+            # Bound the wait so an unresponsive host cannot hang the endpoint.
+            data = requests.get(f"{host}/api/search", params=params, timeout=30)
+            data.raise_for_status()
+            datasets = data.json()['data']
+            return JSONResponse(content={"datasets": datasets})
+
+        async def run_get_overview_files(request: Request):
+            """Query a Dataverse host's search API for files; return its "data"."""
+            source = request.query_params if request.method == "GET" else await request.json()
+            host = source.get("host")
+            if not host:
+                # "host" is required; answer 400 rather than crash on None below.
+                return JSONResponse(content={"error": "host is required"}, status_code=400)
+            # Test the prefix: a substring check misfires on e.g. "httpd.org".
+            if not host.startswith(("http://", "https://")):
+                host = f"https://{host}"
+            # Bound the wait so an unresponsive host cannot hang the endpoint.
+            data = requests.get(f"{host}/api/search?q=*&type=file&per_page=0", timeout=30)
+            return JSONResponse(content={"files": data.json()['data']})
+
+
         starlette_app = Starlette(
             debug=True,
             routes=[
@@ -366,7 +467,11 @@ async def run_fetch_website(request: Request):
                 Route("/mcp", endpoint=get_mcp, methods=["GET", "POST"]),
                 Route("/mcp/list_tools", endpoint=get_mcp, methods=["GET", "POST"]),
                 Route("/get_croissant_record", endpoint=mcp_croissant_record_endpoint, methods=["GET", "POST"]),
-                Route("/fetch", endpoint=run_fetch_website, methods=["GET", "POST"])
+                Route("/fetch", endpoint=run_fetch_website, methods=["GET", "POST"]),
+                Route("/overview", endpoint=run_get_overview, methods=["GET", "POST"]),
+                Route("/overview/datasets", endpoint=run_get_overview_datasets, methods=["GET", "POST"]),
+                Route("/search/datasets", endpoint=run_search_datasets, methods=["GET", "POST"]),
+                Route("/overview/files", endpoint=run_get_overview_files, methods=["GET", "POST"]),
             ],
         )
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		CACHEDIR=./cache
		DATAVERSES=https://iqss.github.io/dataverse-installations/data/data.json