Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
CACHEDIR=./cache
DATAVERSES=https://iqss.github.io/dataverse-installations/data/data.json
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install uv
RUN pip install nemo_toolkit['asr']
#RUN pip install git+https://github.com/Dans-labs/pyDataverse@development#egg=pyDataverse

# Copy application code
COPY app /app/app
Expand Down
3 changes: 2 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ services:
env_file:
- .env
ports:
- "8000:8000"
- "8100:8000"
environment:
- PYTHONUNBUFFERED=1
- "DATAVERSES"
- MCP_BASE_URL=http://localhost:8000/mcp
- "CACHEDIR"
- "KAGGLE_USERNAME"
Expand Down
111 changes: 108 additions & 3 deletions semantic_croissant/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
from mcp.server.session import ServerSession
import mcp.types as types
from pyDataverse.Croissant import Croissant
import requests
#from mcp.server.lowlevel import TextContent
#from mcp.schema import TextContent
from utils.MultiMedia import MultiMedia
#from utils.MultiMedia import MultiMedia
import pydoi

from utils.dataframe import CroissantRecipe
Expand Down Expand Up @@ -187,7 +188,7 @@ async def list_tools() -> list[types.Tool]:
types.Tool(
name="get_croissant_record",
endpoint="/get_croissant_record",
description="Convert a dataset to Croissant ML format with get_croissant_record tool",
description="Convert a dataset to Croissant ML format with get_croissant_record tool and explore the dataset with DOI or handle.",
inputSchema={
"type": "object",
"required": ["doi"],
Expand All @@ -209,6 +210,53 @@ async def list_tools() -> list[types.Tool]:
},
},
),
types.Tool(
name="overview",
endpoint="/overview",
description="Get an overview of the Dataverse installations around the world sorted by country. Entrance point for the overview tools if no hosts are provided.",
inputSchema={
"type": "object",
"required": [],
"properties": {},
},
),
types.Tool(
name="overview_datasets",
endpoint="/overview/datasets",
description="Get an overview of the Dataverse datasets statistics by host",
inputSchema={
"type": "object",
"required": ["host"],
"properties": {
"host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"}
},
},
),
types.Tool(
name="overview_files",
endpoint="/overview/files",
description="Get an overview of the Dataverse files statistics by host",
inputSchema={
"type": "object",
"required": ["host"],
"properties": {
"host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"}
},
},
),
types.Tool(
name="search_datasets",
endpoint="/search/datasets",
description="Search for datasets in a Dataverse installation",
inputSchema={
"type": "object",
"required": ["host", "query"],
"properties": {
"host": {"type": "string", "description": "Host of the Dataverse installation (e.g. dataverse.nl)"},
"query": {"type": "string", "description": "Query to search for datasets"}
},
},
),
]
return tools

Expand Down Expand Up @@ -345,6 +393,59 @@ async def run_fetch_website(request: Request):
#return JSONResponse(content=serialized_result)
return Response(content=result, media_type="text/html")

async def run_get_overview(request: Request):
    """Return the list of known Dataverse installations.

    The registry URL is read from the ``DATAVERSES`` environment variable.
    The JSON document served at that URL is expected to contain an
    ``installations`` key; its value is returned unchanged as
    ``{"installations": ...}``.

    Failures are reported as JSON error responses instead of unhandled
    exceptions:

    * 500 when ``DATAVERSES`` is not configured,
    * 502 when the registry is unreachable, answers with an HTTP error,
      or serves a payload without ``installations``.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    url = os.environ.get("DATAVERSES")
    if not url:
        return JSONResponse(
            content={"error": "DATAVERSES environment variable is not set"},
            status_code=500,
        )

    try:
        # requests is synchronous: run it in a worker thread so the event
        # loop keeps serving other requests, and bound the wait with a timeout.
        response = await asyncio.to_thread(requests.get, url, timeout=30)
        response.raise_for_status()
        installations = response.json()["installations"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        return JSONResponse(
            content={"error": f"Failed to load Dataverse installations: {exc}"},
            status_code=502,
        )

    return JSONResponse(content={"installations": installations})

async def run_get_overview_datasets(request: Request):
    """Return an overview of all datasets of one Dataverse installation.

    The ``host`` parameter is taken from the query string for GET requests
    and from the JSON body otherwise. The work is delegated to
    ``search_datasets`` with an empty query, which that helper turns into a
    match-everything search.

    Returns a 400 JSON error when the body is not a JSON object or when
    ``host`` is missing, instead of failing later with a ``TypeError``.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    if request.method == "GET":
        host = request.query_params.get("host")
    else:
        try:
            body = await request.json()
        except ValueError:
            body = None
        if not isinstance(body, dict):
            return JSONResponse(
                content={"error": "Request body must be a JSON object"},
                status_code=400,
            )
        host = body.get("host")

    if not host:
        return JSONResponse(
            content={"error": "Missing required parameter: host"},
            status_code=400,
        )

    # search_datasets performs a blocking HTTP call; keep it off the event loop.
    # An empty query is falsy, exactly like the previously passed ``False``.
    return await asyncio.to_thread(search_datasets, host, "")

async def run_search_datasets(request: Request):
    """Search the datasets of one Dataverse installation.

    ``host`` and ``query`` are taken from the query string for GET requests
    and from the JSON body otherwise. The search itself is delegated to
    ``search_datasets``; a missing or empty ``query`` is passed through and
    handled there as a match-everything search.

    Returns a 400 JSON error when the body is not a JSON object or when
    ``host`` is missing, instead of failing later with a ``TypeError``.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    if request.method == "GET":
        host = request.query_params.get("host")
        query = request.query_params.get("query")
    else:
        try:
            body = await request.json()
        except ValueError:
            body = None
        if not isinstance(body, dict):
            return JSONResponse(
                content={"error": "Request body must be a JSON object"},
                status_code=400,
            )
        host = body.get("host")
        query = body.get("query")

    if not host:
        return JSONResponse(
            content={"error": "Missing required parameter: host"},
            status_code=400,
        )

    # search_datasets performs a blocking HTTP call; keep it off the event loop.
    return await asyncio.to_thread(search_datasets, host, query)

def search_datasets(host: str, query: str):
    """Query the Dataverse Search API of ``host`` for datasets.

    Args:
        host: Host name or base URL of the installation (for example
            ``dataverse.nl`` or ``https://dataverse.nl``). ``https://`` is
            prepended when no scheme is given.
        query: Search expression. Any falsy value (``""``, ``None``,
            ``False``) means "match everything" and is sent as ``*``.

    Returns:
        ``JSONResponse`` with ``{"datasets": <"data" member of the API
        reply>}`` on success, a 400 JSON error when ``host`` is empty, or a
        502 JSON error when the installation cannot be reached or replies
        with something unexpected.
    """
    if not host:
        return JSONResponse(
            content={"error": "Missing required parameter: host"},
            status_code=400,
        )

    base_url = str(host).strip().rstrip("/")
    # Test for a real scheme prefix: a substring test would treat hosts such
    # as "httpbin.org" as already having one and build an invalid URL.
    if not base_url.lower().startswith(("http://", "https://")):
        base_url = f"https://{base_url}"

    # Let requests encode the parameters so characters like "&", "#" or
    # spaces in the user's query cannot break or inject URL parameters.
    params = {"q": query if query else "*", "type": "dataset"}

    try:
        response = requests.get(f"{base_url}/api/search", params=params, timeout=30)
        response.raise_for_status()
        datasets = response.json()["data"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        return JSONResponse(
            content={"error": f"Dataset search failed for {base_url}: {exc}"},
            status_code=502,
        )

    return JSONResponse(content={"datasets": datasets})

async def run_get_overview_files(request: Request):
    """Return file statistics of one Dataverse installation.

    ``host`` is taken from the query string for GET requests and from the
    JSON body otherwise. The installation's Search API is queried for all
    files with ``per_page=0``, and the ``data`` member of the reply is
    returned as ``{"files": ...}``.

    Returns a 400 JSON error when the body is not a JSON object or when
    ``host`` is missing, and a 502 JSON error when the installation cannot be
    reached or replies with something unexpected.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    if request.method == "GET":
        host = request.query_params.get("host")
    else:
        try:
            body = await request.json()
        except ValueError:
            body = None
        if not isinstance(body, dict):
            return JSONResponse(
                content={"error": "Request body must be a JSON object"},
                status_code=400,
            )
        host = body.get("host")

    if not host:
        return JSONResponse(
            content={"error": "Missing required parameter: host"},
            status_code=400,
        )

    base_url = str(host).strip().rstrip("/")
    # Test for a real scheme prefix: a substring test would treat hosts such
    # as "httpbin.org" as already having one and build an invalid URL.
    if not base_url.lower().startswith(("http://", "https://")):
        base_url = f"https://{base_url}"
    url = f"{base_url}/api/search?q=*&type=file&per_page=0"

    try:
        # requests is synchronous: run it in a worker thread so the event
        # loop keeps serving other requests, and bound the wait with a timeout.
        response = await asyncio.to_thread(requests.get, url, timeout=30)
        response.raise_for_status()
        files = response.json()["data"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        return JSONResponse(
            content={"error": f"File overview failed for {base_url}: {exc}"},
            status_code=502,
        )

    return JSONResponse(content={"files": files})


starlette_app = Starlette(
debug=True,
routes=[
Expand All @@ -366,7 +467,11 @@ async def run_fetch_website(request: Request):
Route("/mcp", endpoint=get_mcp, methods=["GET", "POST"]),
Route("/mcp/list_tools", endpoint=get_mcp, methods=["GET", "POST"]),
Route("/get_croissant_record", endpoint=mcp_croissant_record_endpoint, methods=["GET", "POST"]),
Route("/fetch", endpoint=run_fetch_website, methods=["GET", "POST"])
Route("/fetch", endpoint=run_fetch_website, methods=["GET", "POST"]),
Route("/overview", endpoint=run_get_overview, methods=["GET", "POST"]),
Route("/overview/datasets", endpoint=run_get_overview_datasets, methods=["GET", "POST"]),
Route("/search/datasets", endpoint=run_search_datasets, methods=["GET", "POST"]),
Route("/overview/files", endpoint=run_get_overview_files, methods=["GET", "POST"]),
],
)

Expand Down