-
Notifications
You must be signed in to change notification settings - Fork 22.4k
Expand file tree
/
Copy pathbase.py
More file actions
347 lines (270 loc) · 10.9 KB
/
base.py
File metadata and controls
347 lines (270 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
"""Base classes for media and documents.
This module contains core abstractions for **data retrieval and processing workflows**:
- `BaseMedia`: Base class providing `id` and `metadata` fields
- `Blob`: Raw data loading (files, binary data) - used by document loaders
- `Document`: Text content for retrieval (RAG, vector stores, semantic search)
!!! note "Not for LLM chat messages"
These classes are for data processing pipelines, not LLM I/O. For multimodal
content in chat messages (images, audio in conversations), see
`langchain.messages` content blocks instead.
"""
from __future__ import annotations
import contextlib
import mimetypes
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Literal, cast
from pydantic import ConfigDict, Field, model_validator
from langchain_core.load.serializable import Serializable
if TYPE_CHECKING:
from collections.abc import Generator
PathLike = str | PurePath
class BaseMedia(Serializable):
    """Common base for content handled in retrieval and data-processing pipelines.

    Supplies the two fields every piece of stored, indexed, or searched
    content shares: an optional `id` and a free-form `metadata` dict.

    !!! note
        For multimodal content in **chat messages** (images, audio sent to/from LLMs),
        use `langchain.messages` content blocks instead.
    """

    # Optional for now; expected to become required in a future major release
    # once enough VectorStore implementations have adopted it.
    id: str | None = Field(default=None, coerce_numbers_to_str=True)
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.
    """

    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""
class Blob(BaseMedia):
    """Raw-data abstraction for document loading and file processing.

    A `Blob` represents raw bytes or text, held either in memory or as a
    reference to a file on disk. Document loaders use it to decouple *loading*
    data from *parsing* it.

    Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)

    ???+ example "Initialize a blob from in-memory data"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data("Hello, world!")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```

    ??? example "Load from memory and specify MIME type and metadata"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data(
            data="Hello, world!",
            mime_type="text/plain",
            metadata={"source": "https://example.com"},
        )
        ```

    ??? example "Load the blob from a file"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_path("path/to/file.txt")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```
    """

    data: bytes | str | None = None
    """Raw data associated with the `Blob`."""

    mimetype: str | None = None
    """MIME type, not to be confused with a file extension."""

    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Uses `utf-8` as default encoding if decoding to string.
    """

    path: PathLike | None = None
    """Location where the original content was found."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        frozen=True,
    )

    @property
    def source(self) -> str | None:
        """The source location of the blob as a string, or `None` if unknown.

        A `'source'` entry in `metadata` takes precedence; otherwise the
        blob's `path` (stringified) is used when one is set.
        """
        meta = self.metadata
        if meta and "source" in meta:
            return cast("str | None", meta["source"])
        if self.path:
            return str(self.path)
        return None

    @model_validator(mode="before")
    @classmethod
    def check_blob_is_valid(cls, values: dict[str, Any]) -> Any:
        """Reject construction when neither data nor path was supplied."""
        if not ("data" in values or "path" in values):
            msg = "Either data or path must be provided"
            raise ValueError(msg)
        return values

    def as_string(self) -> str:
        """Read data as a string.

        Returns:
            The data as a string, decoding bytes (or reading the file at
            `path`) with `self.encoding`.

        Raises:
            ValueError: If the blob cannot be represented as a string.
        """
        if isinstance(self.data, str):
            return self.data
        if isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        if self.data is None and self.path:
            return Path(self.path).read_text(encoding=self.encoding)
        msg = f"Unable to get string for blob {self}"
        raise ValueError(msg)

    def as_bytes(self) -> bytes:
        """Read data as bytes.

        Returns:
            The data as bytes, encoding strings with `self.encoding` or
            reading the file at `path` when no in-memory data is set.

        Raises:
            ValueError: If the blob cannot be represented as bytes.
        """
        if self.data is None and self.path:
            return Path(self.path).read_bytes()
        if isinstance(self.data, str):
            return self.data.encode(self.encoding)
        if isinstance(self.data, bytes):
            return self.data
        msg = f"Unable to get bytes for blob {self}"
        raise ValueError(msg)

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
        """Read data as a byte stream.

        Yields:
            An in-memory `BytesIO` over `data`, or an open binary file
            handle over `path`.

        Raises:
            NotImplementedError: If the blob cannot be represented as a byte stream.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
            return
        if self.data is None and self.path:
            with Path(self.path).open("rb") as handle:
                yield handle
            return
        msg = f"Unable to convert blob {self}"
        raise NotImplementedError(msg)

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        guess_type: bool = True,
        metadata: dict | None = None,
    ) -> Blob:
        """Load the blob from a path-like object.

        Args:
            path: Path-like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            guess_type: If `True`, the MIME type will be guessed from the file
                extension, if a MIME type was not provided
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        # Guess only when the caller gave no explicit MIME type and opted in.
        resolved_mimetype = (
            mimetypes.guess_type(path)[0]
            if mime_type is None and guess_type
            else mime_type
        )
        # Data is loaded lazily: the blob stores only a reference to the file.
        return cls(
            data=None,
            mimetype=resolved_mimetype,
            encoding=encoding,
            path=path,
            metadata={} if metadata is None else metadata,
        )

    @classmethod
    def from_data(
        cls,
        data: str | bytes,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        path: str | None = None,
        metadata: dict | None = None,
    ) -> Blob:
        """Initialize the `Blob` from in-memory data.

        Args:
            data: The in-memory data associated with the `Blob`
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            path: If provided, will be set as the source from which the data came
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata={} if metadata is None else metadata,
        )

    def __repr__(self) -> str:
        """Return the blob representation: its object id and source if known."""
        parts = [f"Blob {id(self)}"]
        if self.source:
            parts.append(self.source)
        return " ".join(parts)
class Document(BaseMedia):
    """A piece of text plus associated metadata, for retrieval workflows.

    !!! note
        `Document` is for **retrieval workflows**, not chat I/O. For sending text
        to an LLM in a conversation, use message types from `langchain.messages`.

    Example:
        ```python
        from langchain_core.documents import Document

        document = Document(
            page_content="Hello, world!", metadata={"source": "https://example.com"}
        )
        ```
    """

    page_content: str
    """String text."""

    type: Literal["Document"] = "Document"

    def __init__(self, page_content: str, **kwargs: Any) -> None:
        """Accept `page_content` as either a positional or a named argument."""
        # Validation itself is delegated to the pydantic base class; the
        # ignore silences mypy's complaint that page_content is not declared
        # on the base class.
        super().__init__(page_content=page_content, **kwargs)  # type: ignore[call-arg,unused-ignore]

    @classmethod
    def is_lc_serializable(cls) -> bool:
        """Return `True` as this class is serializable."""
        return True

    @classmethod
    def get_lc_namespace(cls) -> list[str]:
        """Get the namespace of the LangChain object.

        Returns:
            `["langchain", "schema", "document"]`
        """
        return ["langchain", "schema", "document"]

    def __str__(self) -> str:
        """Render only `page_content` and `metadata`, mirroring pydantic's format.

        Keeping `__str__` restricted to these two fields means user code that
        feeds `Document` objects directly into prompts is unaffected by the
        addition of `id` (or any future fields). Likely to be removed once a
        more general content-formatting solution exists in prompts.
        """
        if not self.metadata:
            return f"page_content='{self.page_content}'"
        return f"page_content='{self.page_content}' metadata={self.metadata}"