-
Notifications
You must be signed in to change notification settings - Fork 22.4k
Expand file tree
/
Copy pathbase.py
More file actions
347 lines (270 loc) · 10.9 KB
/
base.py
File metadata and controls
347 lines (270 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
"""Base classes for media and documents.
This module contains core abstractions for **data retrieval and processing workflows**:
- `BaseMedia`: Base class providing `id` and `metadata` fields
- `Blob`: Raw data loading (files, binary data) - used by document loaders
- `Document`: Text content for retrieval (RAG, vector stores, semantic search)
!!! note "Not for LLM chat messages"
These classes are for data processing pipelines, not LLM I/O. For multimodal
content in chat messages (images, audio in conversations), see
`langchain.messages` content blocks instead.
"""
from __future__ import annotations
import contextlib
import mimetypes
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Literal, cast
from pydantic import ConfigDict, Field, model_validator
from langchain_core.load.serializable import Serializable
if TYPE_CHECKING:
from collections.abc import Generator
PathLike = str | PurePath
class BaseMedia(Serializable):
    """Common base for content handled in retrieval and data-processing pipelines.

    Supplies the two fields every piece of stored, indexed, or searched
    content shares: an optional `id` and a free-form `metadata` dict.

    !!! note
        For multimodal content in **chat messages** (images, audio sent to/from LLMs),
        use `langchain.messages` content blocks instead.
    """

    # Optional for now; expected to become required in a future major release
    # once enough VectorStore implementations have adopted it.
    id: str | None = Field(default=None, coerce_numbers_to_str=True)
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.
    """

    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""
class Blob(BaseMedia):
    """Raw-data abstraction for document loading and file processing.

    A `Blob` represents raw bytes or text, held either in memory or as a
    reference to a file on disk. Document loaders use it to decouple *loading*
    data from *parsing* it.

    Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)

    ???+ example "Initialize a blob from in-memory data"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data("Hello, world!")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```

    ??? example "Load from memory and specify MIME type and metadata"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data(
            data="Hello, world!",
            mime_type="text/plain",
            metadata={"source": "https://example.com"},
        )
        ```

    ??? example "Load the blob from a file"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_path("path/to/file.txt")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```
    """

    data: bytes | str | None = None
    """Raw data associated with the `Blob`."""

    mimetype: str | None = None
    """MIME type, not to be confused with a file extension."""

    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Uses `utf-8` as default encoding if decoding to string.
    """

    path: PathLike | None = None
    """Location where the original content was found."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        frozen=True,
    )

    @property
    def source(self) -> str | None:
        """The source location of the blob as a string, or `None` if unknown.

        A `'source'` entry in `metadata` takes precedence; otherwise the
        blob's `path` (stringified) is used when one is set.
        """
        meta = self.metadata
        if meta and "source" in meta:
            return cast("str | None", meta["source"])
        if self.path:
            return str(self.path)
        return None

    @model_validator(mode="before")
    @classmethod
    def check_blob_is_valid(cls, values: dict[str, Any]) -> Any:
        """Reject construction when neither data nor path was supplied."""
        if not ("data" in values or "path" in values):
            msg = "Either data or path must be provided"
            raise ValueError(msg)
        return values

    def as_string(self) -> str:
        """Read data as a string.

        Returns:
            The data as a string, decoding bytes (or reading the file at
            `path`) with `self.encoding`.

        Raises:
            ValueError: If the blob cannot be represented as a string.
        """
        if isinstance(self.data, str):
            return self.data
        if isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        if self.data is None and self.path:
            return Path(self.path).read_text(encoding=self.encoding)
        msg = f"Unable to get string for blob {self}"
        raise ValueError(msg)

    def as_bytes(self) -> bytes:
        """Read data as bytes.

        Returns:
            The data as bytes, encoding strings with `self.encoding` or
            reading the file at `path` when no in-memory data is set.

        Raises:
            ValueError: If the blob cannot be represented as bytes.
        """
        if self.data is None and self.path:
            return Path(self.path).read_bytes()
        if isinstance(self.data, str):
            return self.data.encode(self.encoding)
        if isinstance(self.data, bytes):
            return self.data
        msg = f"Unable to get bytes for blob {self}"
        raise ValueError(msg)

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
        """Read data as a byte stream.

        Yields:
            An in-memory `BytesIO` over `data`, or an open binary file
            handle over `path`.

        Raises:
            NotImplementedError: If the blob cannot be represented as a byte stream.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
            return
        if self.data is None and self.path:
            with Path(self.path).open("rb") as handle:
                yield handle
            return
        msg = f"Unable to convert blob {self}"
        raise NotImplementedError(msg)

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        guess_type: bool = True,
        metadata: dict | None = None,
    ) -> Blob:
        """Load the blob from a path-like object.

        Args:
            path: Path-like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            guess_type: If `True`, the MIME type will be guessed from the file
                extension, if a MIME type was not provided
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        # Guess only when the caller gave no explicit MIME type and opted in.
        resolved_mimetype = (
            mimetypes.guess_type(path)[0]
            if mime_type is None and guess_type
            else mime_type
        )
        # Data is loaded lazily: the blob stores only a reference to the file.
        return cls(
            data=None,
            mimetype=resolved_mimetype,
            encoding=encoding,
            path=path,
            metadata={} if metadata is None else metadata,
        )

    @classmethod
    def from_data(
        cls,
        data: str | bytes,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        path: str | None = None,
        metadata: dict | None = None,
    ) -> Blob:
        """Initialize the `Blob` from in-memory data.

        Args:
            data: The in-memory data associated with the `Blob`
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            path: If provided, will be set as the source from which the data came
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata={} if metadata is None else metadata,
        )

    def __repr__(self) -> str:
        """Return the blob representation: its object id and source if known."""
        parts = [f"Blob {id(self)}"]
        if self.source:
            parts.append(self.source)
        return " ".join(parts)
class Document(BaseMedia):
    """A piece of text plus associated metadata, for retrieval workflows.

    !!! note
        `Document` is for **retrieval workflows**, not chat I/O. For sending text
        to an LLM in a conversation, use message types from `langchain.messages`.

    Example:
        ```python
        from langchain_core.documents import Document

        document = Document(
            page_content="Hello, world!", metadata={"source": "https://example.com"}
        )
        ```
    """

    page_content: str
    """String text."""

    type: Literal["Document"] = "Document"

    def __init__(self, page_content: str, **kwargs: Any) -> None:
        """Accept `page_content` as either a positional or a named argument."""
        # Validation itself is delegated to the pydantic base class; the
        # ignore silences mypy's complaint that page_content is not declared
        # on the base class.
        super().__init__(page_content=page_content, **kwargs)  # type: ignore[call-arg,unused-ignore]

    @classmethod
    def is_lc_serializable(cls) -> bool:
        """Return `True` as this class is serializable."""
        return True

    @classmethod
    def get_lc_namespace(cls) -> list[str]:
        """Get the namespace of the LangChain object.

        Returns:
            `["langchain", "schema", "document"]`
        """
        return ["langchain", "schema", "document"]

    def __str__(self) -> str:
        """Render only `page_content` and `metadata`, mirroring pydantic's format.

        Keeping `__str__` restricted to these two fields means user code that
        feeds `Document` objects directly into prompts is unaffected by the
        addition of `id` (or any future fields). Likely to be removed once a
        more general content-formatting solution exists in prompts.
        """
        if not self.metadata:
            return f"page_content='{self.page_content}'"
        return f"page_content='{self.page_content}' metadata={self.metadata}"