Skip to content

Commit f6b8611

Browse files
authored
Merge pull request #6 from gdcc/navigator
Navigator
2 parents 184e8fe + 2cbcedd commit f6b8611

4 files changed

Lines changed: 111 additions & 6 deletions

File tree

.env

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
CACHEDIR=./cache
2+
DATAVERSES=https://iqss.github.io/dataverse-installations/data/data.json

Dockerfile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ COPY requirements.txt .
1515
# Install Python dependencies
1616
RUN pip install --no-cache-dir -r requirements.txt
1717
RUN pip install uv
18-
RUN pip install nemo_toolkit['asr']
19-
#RUN pip install git+https://github.com/Dans-labs/pyDataverse@development#egg=pyDataverse
2018

2119
# Copy application code
2220
COPY app /app/app

docker-compose.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ services:
77
env_file:
88
- .env
99
ports:
10-
- "8000:8000"
10+
- "8100:8000"
1111
environment:
1212
- PYTHONUNBUFFERED=1
13+
- "DATAVERSES"
1314
- MCP_BASE_URL=http://localhost:8000/mcp
1415
- "CACHEDIR"
1516
- "KAGGLE_USERNAME"

semantic_croissant/server.py

Lines changed: 108 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
from mcp.server.session import ServerSession
2121
import mcp.types as types
2222
from pyDataverse.Croissant import Croissant
23+
import requests
2324
#from mcp.server.lowlevel import TextContent
2425
#from mcp.schema import TextContent
25-
from utils.MultiMedia import MultiMedia
26+
#from utils.MultiMedia import MultiMedia
2627
import pydoi
2728

2829
from utils.dataframe import CroissantRecipe
@@ -187,7 +188,7 @@ async def list_tools() -> list[types.Tool]:
187188
types.Tool(
188189
name="get_croissant_record",
189190
endpoint="/get_croissant_record",
190-
description="Convert a dataset to Croissant ML format with get_croissant_record tool",
191+
description="Convert a dataset to Croissant ML format with get_croissant_record tool and explore the dataset with DOI or handle.",
191192
inputSchema={
192193
"type": "object",
193194
"required": ["doi"],
@@ -209,6 +210,53 @@ async def list_tools() -> list[types.Tool]:
209210
},
210211
},
211212
),
213+
types.Tool(
214+
name="overview",
215+
endpoint="/overview",
216+
description="Get an overview of the Dataverse installations around the world sorted by country. Entrance point for the overview tools if no hosts are provided.",
217+
inputSchema={
218+
"type": "object",
219+
"required": [],
220+
"properties": {},
221+
},
222+
),
223+
types.Tool(
224+
name="overview_datasets",
225+
endpoint="/overview/datasets",
226+
description="Get an overview of the Dataverse datasets statistics by host",
227+
inputSchema={
228+
"type": "object",
229+
"required": ["host"],
230+
"properties": {
231+
"host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"}
232+
},
233+
},
234+
),
235+
types.Tool(
236+
name="overview_files",
237+
endpoint="/overview/files",
238+
description="Get an overview of the Dataverse files statistics by host",
239+
inputSchema={
240+
"type": "object",
241+
"required": ["host"],
242+
"properties": {
243+
"host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"}
244+
},
245+
},
246+
),
247+
types.Tool(
248+
name="search_datasets",
249+
endpoint="/search/datasets",
250+
description="Search for datasets in a Dataverse installation",
251+
inputSchema={
252+
"type": "object",
253+
"required": ["host", "query"],
254+
"properties": {
255+
"host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"},
256+
"query": {"type": "string", "description": "Query to search for datasets"}
257+
},
258+
},
259+
),
212260
]
213261
return tools
214262

@@ -345,6 +393,59 @@ async def run_fetch_website(request: Request):
345393
#return JSONResponse(content=serialized_result)
346394
return Response(content=result, media_type="text/html")
347395

396+
async def run_get_overview(request: Request):
    """Return the registry of Dataverse installations as JSON.

    The registry URL is read from the ``DATAVERSES`` environment variable; the
    ``installations`` entry of the downloaded JSON document is passed through
    under the same key.

    Args:
        request: Incoming HTTP request (not inspected).

    Returns:
        JSONResponse: ``{"installations": [...]}`` on success,
        ``{"error": ...}`` with status 500 when ``DATAVERSES`` is not
        configured, or status 502 when the registry cannot be fetched/parsed.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import asyncio

    url = os.environ.get("DATAVERSES")
    if not url:
        # requests.get(None) would raise MissingSchema; report it explicitly.
        return JSONResponse(
            content={"error": "DATAVERSES environment variable is not set"},
            status_code=500,
        )

    try:
        # requests is blocking: run it in a worker thread so the event loop
        # keeps serving other requests, and bound the wait with a timeout.
        response = await asyncio.to_thread(requests.get, url, timeout=30)
        response.raise_for_status()
        installations = response.json()["installations"]
    except (requests.RequestException, ValueError, KeyError) as exc:
        return JSONResponse(
            content={"error": f"Failed to load Dataverse installations: {exc}"},
            status_code=502,
        )

    return JSONResponse(content={"installations": installations})
401+
402+
async def run_get_overview_datasets(request: Request):
403+
if request.method == "GET":
404+
host = request.query_params.get("host")
405+
else:
406+
body = await request.json()
407+
host = body.get("host")
408+
409+
return search_datasets(host, False)
410+
411+
async def run_search_datasets(request: Request):
412+
if request.method == "GET":
413+
host = request.query_params.get("host")
414+
query = request.query_params.get("query")
415+
else:
416+
body = await request.json()
417+
host = body.get("host")
418+
query = body.get("query")
419+
return search_datasets(host, query)
420+
421+
def search_datasets(host: str, query: str):
    """Query the search API of a Dataverse host for datasets.

    Args:
        host: Host name or base URL of the installation (e.g.
            ``dataverse.nl``). ``https://`` is prepended when no scheme is
            given.
        query: Search expression. Any falsy value searches for ``*``
            (everything).

    Returns:
        JSONResponse: ``{"datasets": <"data" entry of the API reply>}`` on
        success, ``{"error": ...}`` with status 400 when ``host`` is empty, or
        status 502 when the remote request fails or its reply is malformed.
    """
    if not host:
        return JSONResponse(
            content={"error": "Missing required parameter: host"},
            status_code=400,
        )

    # NOTE(review): ``host`` is caller-supplied, so this endpoint fetches
    # arbitrary URLs on the server's behalf; consider restricting it to the
    # known installations if it is exposed publicly.
    base_url = host.strip().rstrip("/")
    # Test for a real scheme prefix: a substring check would treat hosts such
    # as "myhttp.example.org" as already having one.
    if not base_url.lower().startswith(("http://", "https://")):
        base_url = f"https://{base_url}"

    # Let requests encode the parameters so characters like "&" or "#" in the
    # query cannot inject or truncate URL parameters.
    params = {"q": query if query else "*", "type": "dataset"}

    try:
        response = requests.get(f"{base_url}/api/search", params=params, timeout=30)
        response.raise_for_status()
        datasets = response.json()["data"]
    except (requests.RequestException, ValueError, KeyError) as exc:
        return JSONResponse(
            content={"error": f"Dataset search failed for {base_url}: {exc}"},
            status_code=502,
        )

    return JSONResponse(content={"datasets": datasets})
433+
434+
async def run_get_overview_files(request: Request):
    """Return file statistics for one Dataverse host.

    ``host`` is taken from the query string for GET requests and from the JSON
    body otherwise. The host's search API is queried for all files with
    ``per_page=0`` and the ``data`` entry of the reply is returned.

    Args:
        request: Incoming HTTP request carrying ``host``.

    Returns:
        JSONResponse: ``{"files": <"data" entry of the API reply>}`` on
        success, ``{"error": ...}`` with status 400 when ``host`` is missing,
        or status 502 when the remote request fails or its reply is malformed.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import asyncio

    if request.method == "GET":
        host = request.query_params.get("host")
    else:
        body = await request.json()
        # A non-object JSON body (list, string, ...) has no ``host`` to read.
        host = body.get("host") if isinstance(body, dict) else None

    if not host:
        # Without this guard ``'http' in host`` fails with a TypeError on None.
        return JSONResponse(
            content={"error": "Missing required parameter: host"},
            status_code=400,
        )

    base_url = host.strip().rstrip("/")
    # Test for a real scheme prefix: a substring check would treat hosts such
    # as "myhttp.example.org" as already having one.
    if not base_url.lower().startswith(("http://", "https://")):
        base_url = f"https://{base_url}"

    params = {"q": "*", "type": "file", "per_page": 0}

    try:
        # requests is blocking: run it in a worker thread so the event loop
        # keeps serving other requests, and bound the wait with a timeout.
        response = await asyncio.to_thread(
            requests.get, f"{base_url}/api/search", params=params, timeout=30
        )
        response.raise_for_status()
        files = response.json()["data"]
    except (requests.RequestException, ValueError, KeyError) as exc:
        return JSONResponse(
            content={"error": f"File overview failed for {base_url}: {exc}"},
            status_code=502,
        )

    return JSONResponse(content={"files": files})
447+
448+
348449
starlette_app = Starlette(
349450
debug=True,
350451
routes=[
@@ -366,7 +467,11 @@ async def run_fetch_website(request: Request):
366467
Route("/mcp", endpoint=get_mcp, methods=["GET", "POST"]),
367468
Route("/mcp/list_tools", endpoint=get_mcp, methods=["GET", "POST"]),
368469
Route("/get_croissant_record", endpoint=mcp_croissant_record_endpoint, methods=["GET", "POST"]),
369-
Route("/fetch", endpoint=run_fetch_website, methods=["GET", "POST"])
470+
Route("/fetch", endpoint=run_fetch_website, methods=["GET", "POST"]),
471+
Route("/overview", endpoint=run_get_overview, methods=["GET", "POST"]),
472+
Route("/overview/datasets", endpoint=run_get_overview_datasets, methods=["GET", "POST"]),
473+
Route("/search/datasets", endpoint=run_search_datasets, methods=["GET", "POST"]),
474+
Route("/overview/files", endpoint=run_get_overview_files, methods=["GET", "POST"]),
370475
],
371476
)
372477

0 commit comments

Comments
 (0)