superlinear-ai
diff --git a/README.md b/README.md
Lines changed: 22 additions & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 0 deletions
diff --git a/src/raglite/__init__.py b/src/raglite/__init__.py
Lines changed: 4 additions & 1 deletion
diff --git a/src/raglite/_chainlit.py b/src/raglite/_chainlit.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/raglite/_config.py b/src/raglite/_config.py
Lines changed: 22 additions & 0 deletions
diff --git a/src/raglite/_database.py b/src/raglite/_database.py
Lines changed: 4 additions & 1 deletion
diff --git a/src/raglite/_markdown.py b/src/raglite/_markdown.py
Lines changed: 38 additions & 2 deletions
@@ -34,6 +34,7 @@ RAGLite is a Python toolkit for Retrieval-Augmented Generation (RAG) with DuckDB
 - 🔌 A built-in [Model Context Protocol](https://modelcontextprotocol.io) (MCP) server that any MCP client like [Claude desktop](https://claude.ai/download) can connect with
 - 💬 Optional customizable ChatGPT-like frontend for [web](https://docs.chainlit.io/deploy/copilot), [Slack](https://docs.chainlit.io/deploy/slack), and [Teams](https://docs.chainlit.io/deploy/teams) with [Chainlit](https://github.com/Chainlit/chainlit)
 - ✍️ Optional conversion of any input document to Markdown with [Pandoc](https://github.com/jgm/pandoc)
+- 🔎 Optional high-quality document processing with [Mistral OCR](https://docs.mistral.ai/capabilities/document/) for PDFs, images, DOCX, and PPTX files, including automatic image descriptions
 - ✅ Optional evaluation of retrieval and generation performance with [Ragas](https://github.com/explodinggradients/ragas)
 
 ## Installing
@@ -69,6 +70,12 @@ To add support for filetypes other than PDF, use the `pandoc` extra:
 pip install raglite[pandoc]
 ```
 
+To add support for high-quality document processing with [Mistral OCR](https://docs.mistral.ai/capabilities/document/), use the `mistral-ocr` extra:
+
+```sh
+pip install raglite[mistral-ocr]
+```
+
 To add support for evaluation, use the `ragas` extra:
 
 ```sh
@@ -152,6 +159,21 @@ my_config = RAGLiteConfig(
 > [!TIP]
 > ✍️ To insert documents other than PDF, install the `pandoc` extra with `pip install raglite[pandoc]`.
 
+> [!TIP]
+> 🔎 For higher-quality document processing with automatic image descriptions, install the `mistral-ocr` extra with `pip install raglite[mistral-ocr]` and configure it as follows:
+> ```python
+> from raglite import RAGLiteConfig, MistralOCRConfig
+>
+> my_config = RAGLiteConfig(
+>     document_processor=MistralOCRConfig(
+>         include_image_descriptions=True,  # Describe images, charts, and diagrams as text
+>         image_types=frozenset({"chart", "diagram", "photo", "table", "logo", "icon"}),  # Custom image categories
+>         exclude_image_types=frozenset({"logo", "icon"}),  # Filter out specific types from the output
+>     ),
+> )
+> ```
+> The `image_types` parameter defines the categories that Mistral classifies each image into — you can use the defaults or provide your own domain-specific types. Use `exclude_image_types` to filter out any classified types that are not useful for retrieval.
+
 Next, insert some documents into the database. RAGLite will take care of the [conversion to Markdown](src/raglite/_markdown.py), [optimal level 4 semantic chunking](src/raglite/_split_chunks.py), and [multi-vector embedding with late chunking](src/raglite/_embed.py):
 
 ```python
 
@@ -70,6 +70,7 @@ dev = [
   "pytest (>=8.3.4)",
   "pytest-mock (>=3.14.0)",
   "pytest-xdist (>=3.6.1)",
+  "python-dotenv (>=1.0.0)",
   "ruff (>=0.10.0)",
   "typeguard (>=4.4.1)",
 ]
@@ -80,6 +81,7 @@ chainlit = ["chainlit (>=2.0.0)"]
 # Large Language Models:
 llama-cpp-python = ["llama-cpp-python (>=0.3.9)"]
 # Markdown conversion:
+mistral-ocr = ["mistralai (>=1.10.1)"]
 pandoc = ["pypandoc-binary (>=1.13)"]
 # Evaluation:
 ragas = ["pandas (>=2.1.1)", "ragas (>=0.3.3)"]
 
@@ -1,11 +1,12 @@
 """RAGLite."""
 
-from raglite._config import RAGLiteConfig
+from raglite._config import MistralOCRConfig, RAGLiteConfig
 from raglite._database import Document
 from raglite._delete import delete_documents, delete_documents_by_metadata
 from raglite._eval import answer_evals, evaluate, insert_evals
 from raglite._extract import expand_document_metadata
 from raglite._insert import insert_documents
+from raglite._mistral_ocr import MistralOCRError
 from raglite._query_adapter import update_query_adapter
 from raglite._rag import add_context, async_rag, rag, retrieve_context
 from raglite._search import (
@@ -22,6 +23,8 @@
 __all__ = [
     # Config
     "RAGLiteConfig",
+    "MistralOCRConfig",
+    "MistralOCRError",
     # Insert
     "Document",
     "insert_documents",
 
@@ -75,15 +75,15 @@ async def handle_message(user_message: cl.Message) -> None:
     inline_attachments = []
     for file in user_message.elements:
         if file.path:
-            doc_md = document_to_markdown(Path(file.path))
+            doc_md = document_to_markdown(Path(file.path), config=config)
             if len(doc_md) // 3 <= 5 * (config.chunk_max_size // 3):
                 # Document is small enough to attach to the context.
                 inline_attachments.append(f"{Path(file.path).name}:\n\n{doc_md}")
             else:
                 # Document is too large and must be inserted into the database.
                 async with cl.Step(name="insert", type="run") as step:
                     step.input = Path(file.path).name
-                    document = Document.from_path(Path(file.path))
+                    document = Document.from_path(Path(file.path), config=config)
                     await async_insert_documents([document], config=config)
     # Append any inline attachments to the user prompt.
     user_prompt = (
 
@@ -23,6 +23,26 @@
 cache_path = Path(user_data_dir("raglite", ensure_exists=True))
 
 
+DEFAULT_IMAGE_TYPES = frozenset(  # Default value of MistralOCRConfig.image_types.
+    {"graph", "chart", "diagram", "table", "photo", "screenshot", "logo", "icon", "other"}
+)
+
+
+@dataclass(frozen=True)
+class MistralOCRConfig:
+    """Configuration for the Mistral OCR document processor."""
+
+    # API key; if None, falls back to the MISTRAL_API_KEY environment variable.
+    api_key: str | None = None
+    # Whether to use vision to describe the images found in documents.
+    include_image_descriptions: bool = True
+    # Categories that Mistral classifies each image into.
+    image_types: frozenset[str] = DEFAULT_IMAGE_TYPES
+    # Image types to exclude from the output (e.g., {"logo", "icon"}).
+    exclude_image_types: frozenset[str] = frozenset()
+    model: str = "mistral-ocr-latest"  # Model name; presumably the Mistral OCR model (confirm).
+
+
 # Lazily load the default search method to avoid circular imports.
 # TODO: Replace with search_and_rerank_chunk_spans after benchmarking.
 def _vector_search(
@@ -65,6 +85,8 @@ class RAGLiteConfig:
     embedder_normalize: bool = True
     # Chunk config used to partition documents into chunks.
     chunk_max_size: int = 2048  # Max number of characters per chunk.
+    # Document processing config. None = default processor.
+    document_processor: MistralOCRConfig | None = None
     # Vector search config.
     vector_search_distance_metric: Literal["cosine", "dot", "l2"] = "cosine"
     vector_search_multivector: bool = True
 
@@ -108,6 +108,7 @@ def from_path(
         *,
         id: DocumentId | None = None,  # noqa: A002
         url: str | None = None,
+        config: RAGLiteConfig | None = None,
         **kwargs: Any,
     ) -> "Document":
         """Create a document from a file path.
@@ -120,6 +121,8 @@ def from_path(
             The document id to use. If not provided, a hash of the document's content is used.
         url
             The URL of the document, if available.
+        config
+            The RAGLite configuration for document processing.
         kwargs
             Any additional metadata to store.
 
@@ -145,7 +148,7 @@ def from_path(
             filename=doc_path.name,
             url=url,
             metadata_=metadata,
-            content=document_to_markdown(doc_path),
+            content=document_to_markdown(doc_path, config=config),
         )
 
     @staticmethod
 
@@ -1,5 +1,6 @@
 """Convert any document to Markdown."""
 
+import logging
 import re
 from copy import deepcopy
 from pathlib import Path
@@ -9,6 +10,10 @@
 from pdftext.extraction import dictionary_output
 from sklearn.cluster import KMeans
 
+from raglite._config import MistralOCRConfig, RAGLiteConfig
+
+logger = logging.getLogger(__name__)
+
 
 def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0915
     """Convert a PDF parsed with pdftext to Markdown."""
@@ -194,8 +199,8 @@ def _merge_split_headings(match: re.Match[str]) -> str:
     return pages_md
 
 
-def document_to_markdown(doc_path: Path) -> str:
-    """Convert any document to GitHub Flavored Markdown."""
+def _default_document_to_markdown(doc_path: Path) -> str:
+    """Convert any document to GitHub Flavored Markdown using pdftext/pandoc."""
     # Convert the file's content to GitHub Flavored Markdown.
     if doc_path.suffix == ".pdf":
         # Parse the PDF with pdftext and convert it to Markdown.
@@ -219,3 +224,34 @@ def document_to_markdown(doc_path: Path) -> str:
             # File format not supported, fall back to reading the text.
             doc = doc_path.read_text()
     return doc
+
+
+def document_to_markdown(doc_path: Path, *, config: RAGLiteConfig | None = None) -> str:
+    """Convert any document to GitHub Flavored Markdown.
+
+    Parameters
+    ----------
+    doc_path
+        Path to the document file.
+    config
+        Optional RAGLite configuration. If its document_processor is a MistralOCRConfig and the
+        file extension is supported, Mistral OCR is used; otherwise the default processor is used.
+
+    Returns
+    -------
+    str
+        Document content as GitHub Flavored Markdown.
+    """
+    config = config or RAGLiteConfig()  # No config given: use the default configuration.
+
+    if isinstance(config.document_processor, MistralOCRConfig):
+        # Lazy import to avoid requiring mistralai when not using MistralOCR.
+        from raglite._mistral_ocr import SUPPORTED_EXTENSIONS, mistral_ocr_to_markdown
+
+        if doc_path.suffix.lower() in SUPPORTED_EXTENSIONS:  # Case-insensitive extension match.
+            return mistral_ocr_to_markdown(doc_path, processor_config=config.document_processor)
+        logger.debug(
+            "Mistral does not support file type: %s\nFalling back to default processor.", doc_path
+        )
+
+    return _default_document_to_markdown(doc_path)  # No Mistral OCR, or unsupported extension.