Skip to content

Commit 54f4f80

Browse files
authored
feat: add Mistral Document AI for document parsing (#175)
* feat: support Mistral Document AI for document processing
* chore: clean up code
* feat: expose Mistral model explicitly, clarify dependency on import error
* feat: add logging for unsupported file types in document processing
* refactor: streamline MistralOCR tests
* chore: specify type for metadata to satisfy mypy
* test: add NVIDIA report PDF for Mistral OCR testing with enhanced assertions
* chore: add python-dotenv to development dependencies
* refactor: streamline OCR processing logic
* feat: allow custom image type categories in MistralOCRConfig
* docs: add Mistral OCR usage documentation to README
1 parent 8895270 commit 54f4f80

11 files changed

Lines changed: 406 additions & 8 deletions

File tree

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ RAGLite is a Python toolkit for Retrieval-Augmented Generation (RAG) with DuckDB
3434
- 🔌 A built-in [Model Context Protocol](https://modelcontextprotocol.io) (MCP) server that any MCP client like [Claude desktop](https://claude.ai/download) can connect with
3535
- 💬 Optional customizable ChatGPT-like frontend for [web](https://docs.chainlit.io/deploy/copilot), [Slack](https://docs.chainlit.io/deploy/slack), and [Teams](https://docs.chainlit.io/deploy/teams) with [Chainlit](https://github.com/Chainlit/chainlit)
3636
- ✍️ Optional conversion of any input document to Markdown with [Pandoc](https://github.com/jgm/pandoc)
37+
- 🔎 Optional high-quality document processing with [Mistral OCR](https://docs.mistral.ai/capabilities/document/) for PDFs, images, DOCX, and PPTX with automatic image descriptions
3738
- ✅ Optional evaluation of retrieval and generation performance with [Ragas](https://github.com/explodinggradients/ragas)
3839

3940
## Installing
@@ -69,6 +70,12 @@ To add support for filetypes other than PDF, use the `pandoc` extra:
6970
pip install raglite[pandoc]
7071
```
7172

73+
To add support for high-quality document processing with [Mistral OCR](https://docs.mistral.ai/capabilities/document/), use the `mistral-ocr` extra:
74+
75+
```sh
76+
pip install raglite[mistral-ocr]
77+
```
78+
7279
To add support for evaluation, use the `ragas` extra:
7380

7481
```sh
@@ -152,6 +159,21 @@ my_config = RAGLiteConfig(
152159
> [!TIP]
153160
> ✍️ To insert documents other than PDF, install the `pandoc` extra with `pip install raglite[pandoc]`.
154161
162+
> [!TIP]
163+
> 🔎 For higher-quality document processing with automatic image descriptions, install the `mistral-ocr` extra with `pip install raglite[mistral-ocr]` and configure it as follows:
164+
> ```python
165+
> from raglite import RAGLiteConfig, MistralOCRConfig
166+
>
167+
> my_config = RAGLiteConfig(
168+
> document_processor=MistralOCRConfig(
169+
> include_image_descriptions=True, # Describe images, charts, and diagrams as text
170+
> image_types=frozenset({"chart", "diagram", "photo", "table", "logo", "icon"}), # Custom image categories
171+
> exclude_image_types=frozenset({"logo", "icon"}), # Filter out specific types from the output
172+
> ),
173+
> )
174+
> ```
175+
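> The `image_types` parameter defines the categories that Mistral classifies each image into — you can use the defaults or provide your own domain-specific types. Use `exclude_image_types` to filter out any classified types that are not useful for retrieval. The API key is read from the `api_key` parameter, falling back to the `MISTRAL_API_KEY` environment variable when it is not set. Documents whose file type Mistral OCR does not support are converted with the default processor instead.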
176+
155177
Next, insert some documents into the database. RAGLite will take care of the [conversion to Markdown](src/raglite/_markdown.py), [optimal level 4 semantic chunking](src/raglite/_split_chunks.py), and [multi-vector embedding with late chunking](src/raglite/_embed.py):
156178
157179
```python

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ dev = [
7070
"pytest (>=8.3.4)",
7171
"pytest-mock (>=3.14.0)",
7272
"pytest-xdist (>=3.6.1)",
73+
"python-dotenv (>=1.0.0)",
7374
"ruff (>=0.10.0)",
7475
"typeguard (>=4.4.1)",
7576
]
@@ -80,6 +81,7 @@ chainlit = ["chainlit (>=2.0.0)"]
8081
# Large Language Models:
8182
llama-cpp-python = ["llama-cpp-python (>=0.3.9)"]
8283
# Markdown conversion:
84+
mistral-ocr = ["mistralai (>=1.10.1)"]
8385
pandoc = ["pypandoc-binary (>=1.13)"]
8486
# Evaluation:
8587
ragas = ["pandas (>=2.1.1)", "ragas (>=0.3.3)"]

src/raglite/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
"""RAGLite."""
22

3-
from raglite._config import RAGLiteConfig
3+
from raglite._config import MistralOCRConfig, RAGLiteConfig
44
from raglite._database import Document
55
from raglite._delete import delete_documents, delete_documents_by_metadata
66
from raglite._eval import answer_evals, evaluate, insert_evals
77
from raglite._extract import expand_document_metadata
88
from raglite._insert import insert_documents
9+
from raglite._mistral_ocr import MistralOCRError
910
from raglite._query_adapter import update_query_adapter
1011
from raglite._rag import add_context, async_rag, rag, retrieve_context
1112
from raglite._search import (
@@ -22,6 +23,8 @@
2223
__all__ = [
2324
# Config
2425
"RAGLiteConfig",
26+
"MistralOCRConfig",
27+
"MistralOCRError",
2528
# Insert
2629
"Document",
2730
"insert_documents",

src/raglite/_chainlit.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,15 @@ async def handle_message(user_message: cl.Message) -> None:
7575
inline_attachments = []
7676
for file in user_message.elements:
7777
if file.path:
78-
doc_md = document_to_markdown(Path(file.path))
78+
doc_md = document_to_markdown(Path(file.path), config=config)
7979
if len(doc_md) // 3 <= 5 * (config.chunk_max_size // 3):
8080
# Document is small enough to attach to the context.
8181
inline_attachments.append(f"{Path(file.path).name}:\n\n{doc_md}")
8282
else:
8383
# Document is too large and must be inserted into the database.
8484
async with cl.Step(name="insert", type="run") as step:
8585
step.input = Path(file.path).name
86-
document = Document.from_path(Path(file.path))
86+
document = Document.from_path(Path(file.path), config=config)
8787
await async_insert_documents([document], config=config)
8888
# Append any inline attachments to the user prompt.
8989
user_prompt = (

src/raglite/_config.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,26 @@
2323
cache_path = Path(user_data_dir("raglite", ensure_exists=True))
2424

2525

26+
DEFAULT_IMAGE_TYPES = frozenset(
27+
{"graph", "chart", "diagram", "table", "photo", "screenshot", "logo", "icon", "other"}
28+
)
29+
30+
31+
@dataclass(frozen=True)
32+
class MistralOCRConfig:
33+
"""Configuration for MistralOCR document processor."""
34+
35+
# API key - falls back to MISTRAL_API_KEY env var if None.
36+
api_key: str | None = None
37+
# Whether to use vision to describe images in documents.
38+
include_image_descriptions: bool = True
39+
# Image types that Mistral classifies each image into.
40+
image_types: frozenset[str] = DEFAULT_IMAGE_TYPES
41+
# Image types to exclude from the output (e.g., {"logo", "icon"}).
42+
exclude_image_types: frozenset[str] = frozenset()
43+
model: str = "mistral-ocr-latest"
44+
45+
2646
# Lazily load the default search method to avoid circular imports.
2747
# TODO: Replace with search_and_rerank_chunk_spans after benchmarking.
2848
def _vector_search(
@@ -65,6 +85,8 @@ class RAGLiteConfig:
6585
embedder_normalize: bool = True
6686
# Chunk config used to partition documents into chunks.
6787
chunk_max_size: int = 2048 # Max number of characters per chunk.
88+
# Document processing config. None = default processor.
89+
document_processor: MistralOCRConfig | None = None
6890
# Vector search config.
6991
vector_search_distance_metric: Literal["cosine", "dot", "l2"] = "cosine"
7092
vector_search_multivector: bool = True

src/raglite/_database.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def from_path(
108108
*,
109109
id: DocumentId | None = None, # noqa: A002
110110
url: str | None = None,
111+
config: RAGLiteConfig | None = None,
111112
**kwargs: Any,
112113
) -> "Document":
113114
"""Create a document from a file path.
@@ -120,6 +121,8 @@ def from_path(
120121
The document id to use. If not provided, a hash of the document's content is used.
121122
url
122123
The URL of the document, if available.
124+
config
125+
The RAGLite configuration for document processing.
123126
kwargs
124127
Any additional metadata to store.
125128
@@ -145,7 +148,7 @@ def from_path(
145148
filename=doc_path.name,
146149
url=url,
147150
metadata_=metadata,
148-
content=document_to_markdown(doc_path),
151+
content=document_to_markdown(doc_path, config=config),
149152
)
150153

151154
@staticmethod

src/raglite/_markdown.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Convert any document to Markdown."""
22

3+
import logging
34
import re
45
from copy import deepcopy
56
from pathlib import Path
@@ -9,6 +10,10 @@
910
from pdftext.extraction import dictionary_output
1011
from sklearn.cluster import KMeans
1112

13+
from raglite._config import MistralOCRConfig, RAGLiteConfig
14+
15+
logger = logging.getLogger(__name__)
16+
1217

1318
def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0915
1419
"""Convert a PDF parsed with pdftext to Markdown."""
@@ -194,8 +199,8 @@ def _merge_split_headings(match: re.Match[str]) -> str:
194199
return pages_md
195200

196201

197-
def document_to_markdown(doc_path: Path) -> str:
198-
"""Convert any document to GitHub Flavored Markdown."""
202+
def _default_document_to_markdown(doc_path: Path) -> str:
203+
"""Convert any document to GitHub Flavored Markdown using pdftext/pandoc."""
199204
# Convert the file's content to GitHub Flavored Markdown.
200205
if doc_path.suffix == ".pdf":
201206
# Parse the PDF with pdftext and convert it to Markdown.
@@ -219,3 +224,34 @@ def document_to_markdown(doc_path: Path) -> str:
219224
# File format not supported, fall back to reading the text.
220225
doc = doc_path.read_text()
221226
return doc
227+
228+
229+
def document_to_markdown(doc_path: Path, *, config: RAGLiteConfig | None = None) -> str:
    """Convert any document to GitHub Flavored Markdown.

    Parameters
    ----------
    doc_path
        Path to the document file.
    config
        Optional RAGLite configuration. If its ``document_processor`` is a MistralOCRConfig and
        the file's extension (compared case-insensitively) is one that MistralOCR supports, the
        document is converted with MistralOCR. In every other case, including when no config is
        given, the default processor is used.

    Returns
    -------
    str
        Document content as GitHub Flavored Markdown.
    """
    # Read the processor straight off the supplied config. Building a default RAGLiteConfig only
    # to discover that its document_processor is None would be wasted work.
    processor = config.document_processor if config is not None else None

    if isinstance(processor, MistralOCRConfig):
        # Lazy import to avoid requiring mistralai when not using MistralOCR.
        from raglite._mistral_ocr import SUPPORTED_EXTENSIONS, mistral_ocr_to_markdown

        if doc_path.suffix.lower() in SUPPORTED_EXTENSIONS:
            return mistral_ocr_to_markdown(doc_path, processor_config=processor)
        # Report the offending extension and the path separately, in a single-line record.
        logger.debug(
            "Mistral does not support file type %r (%s). Falling back to default processor.",
            doc_path.suffix,
            doc_path,
        )

    return _default_document_to_markdown(doc_path)

0 commit comments

Comments
 (0)