Skip to content

Commit 2b53b28

Browse files
andreerpre-commit-ci[bot]agronholm
authored
Stack allocate small strings (#270)
* normalize str_errors='error' to 'strict' 'error' was never a valid Python string error handler. Accept it for backwards compatibility but normalize to 'strict' internally. Update error messages to only mention valid options. * fix str_errors handling in C string decoder The C implementation of decode_definite_short_string was not respecting the str_errors setting - it always used PyUnicode_FromStringAndSize which only supports strict mode. Now uses PyUnicode_DecodeUTF8 with the str_errors field passed directly. Store str_errors as const char* (NULL for strict, "replace" for replace) instead of PyObject*. This eliminates a conditional in the hot path since PyUnicode_DecodeUTF8 accepts NULL to mean strict mode. Fixes #255 * use fp_read directly in string decoder Skip creating an intermediate Python bytes object by using fp_read directly into a PyMem_Malloc buffer. This avoids Python allocator and reference counting overhead. * stack allocate small strings in C decoder Use a stack buffer for strings <= 256 bytes to avoid heap allocation overhead. Larger strings continue to use PyMem_Malloc. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Alex Grönholm <alex.gronholm@nextday.fi>
1 parent a7ac10d commit 2b53b28

File tree

5 files changed

+162
-33
lines changed

5 files changed

+162
-33
lines changed

cbor2/_decoder.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,14 @@ class CBORDecoder:
6565

6666
_fp: IO[bytes]
6767
_fp_read: Callable[[int], bytes]
68+
_str_errors: str
6869

6970
def __init__(
7071
self,
7172
fp: IO[bytes],
7273
tag_hook: Callable[[CBORDecoder, CBORTag], Any] | None = None,
7374
object_hook: Callable[[CBORDecoder, dict[Any, Any]], Any] | None = None,
74-
str_errors: Literal["strict", "error", "replace"] = "strict",
75+
str_errors: str = "strict",
7576
read_size: int = 1,
7677
*,
7778
max_depth: int = 400,
@@ -164,17 +165,19 @@ def object_hook(self, value: Callable[[CBORDecoder, dict[Any, Any]], Any] | None
164165
raise ValueError("object_hook must be None or a callable")
165166

166167
@property
167-
def str_errors(self) -> Literal["strict", "error", "replace"]:
168+
def str_errors(self) -> str:
168169
return self._str_errors
169170

170171
@str_errors.setter
171-
def str_errors(self, value: Literal["strict", "error", "replace"]) -> None:
172-
if value in ("strict", "error", "replace"):
172+
def str_errors(self, value: str) -> None:
173+
if value == "error":
174+
self._str_errors = "strict"
175+
elif value in ("strict", "error", "replace", "backslashreplace", "surrogateescape"):
173176
self._str_errors = value
174177
else:
175178
raise ValueError(
176-
f"invalid str_errors value {value!r} (must be one of 'strict', "
177-
"'error', or 'replace')"
179+
f"invalid str_errors value {value!r} (must be 'strict', 'error', 'replace', "
180+
f"'backslashreplace' or 'surrogateescape')"
178181
)
179182

180183
def set_shareable(self, value: T) -> T:

docs/versionhistory.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ This library adheres to `Semantic Versioning 2.0 <http://semver.org/>`_.
2222
(`#287 <https://github.com/agronholm/cbor2/issues/287>`_)
2323
- Fixed two reference/memory leaks in the C extension's long string decoder
2424
(`#290 <https://github.com/agronholm/cbor2/pull/290>`_ PR by @killiancowan82)
25+
- Fixed C decoder ignoring the ``str_errors`` setting when decoding strings, and improved
26+
string decoding performance by using stack allocation for small strings and eliminating
27+
unnecessary conditionals. Benchmarks show 9-17% faster deserialization.
28+
(`#255 <https://github.com/agronholm/cbor2/issues/255>`_; PR by @andreer)
2529

2630
**5.8.0** (2025-12-30)
2731

source/decoder.c

Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434
// copied from cpython/Objects/bytesobject.c for bounds checks
3535
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
3636

37+
// Threshold for using stack allocation vs heap allocation for short strings
38+
#define SMALL_STRING_STACK_THRESHOLD 256
39+
3740
enum DecodeOption {
3841
DECODE_NORMAL = 0,
3942
DECODE_IMMUTABLE = 1,
@@ -106,7 +109,6 @@ CBORDecoder_clear(CBORDecoderObject *self)
106109
Py_CLEAR(self->object_hook);
107110
Py_CLEAR(self->shareables);
108111
Py_CLEAR(self->stringref_namespace);
109-
Py_CLEAR(self->str_errors);
110112
if (self->readahead) {
111113
PyMem_Free(self->readahead);
112114
self->readahead = NULL;
@@ -152,7 +154,7 @@ CBORDecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
152154
self->tag_hook = Py_None;
153155
Py_INCREF(Py_None);
154156
self->object_hook = Py_None;
155-
self->str_errors = PyBytes_FromString("strict");
157+
self->str_errors = NULL; // NULL means strict mode
156158
self->max_depth = CBOR2_DEFAULT_MAX_DEPTH;
157159
self->immutable = false;
158160
self->shared_index = -1;
@@ -362,9 +364,8 @@ _CBORDecoder_set_object_hook(CBORDecoderObject *self, PyObject *value,
362364
static PyObject *
363365
_CBORDecoder_get_str_errors(CBORDecoderObject *self, void *closure)
364366
{
365-
return PyUnicode_DecodeASCII(
366-
PyBytes_AS_STRING(self->str_errors),
367-
PyBytes_GET_SIZE(self->str_errors), "strict");
367+
const char *mode = self->str_errors ? self->str_errors : "strict";
368+
return PyUnicode_FromString(mode);
368369
}
369370

370371

@@ -373,30 +374,46 @@ static int
373374
_CBORDecoder_set_str_errors(CBORDecoderObject *self, PyObject *value,
374375
void *closure)
375376
{
376-
PyObject *tmp, *bytes;
377-
378377
if (!value) {
379378
PyErr_SetString(PyExc_AttributeError,
380379
"cannot delete str_errors attribute");
381380
return -1;
382381
}
383382
if (PyUnicode_Check(value)) {
384-
bytes = PyUnicode_AsASCIIString(value);
383+
PyObject *bytes = PyUnicode_AsASCIIString(value);
385384
if (bytes) {
386-
if (!strcmp(PyBytes_AS_STRING(bytes), "strict") ||
387-
!strcmp(PyBytes_AS_STRING(bytes), "error") ||
388-
!strcmp(PyBytes_AS_STRING(bytes), "replace")) {
389-
tmp = self->str_errors;
390-
self->str_errors = bytes;
391-
Py_DECREF(tmp);
385+
const char *mode = PyBytes_AS_STRING(bytes);
386+
if (!strcmp(mode, "strict") || !strcmp(mode, "error")) {
387+
self->str_errors = NULL;
388+
Py_DECREF(bytes);
389+
return 0;
390+
}
391+
if (!strcmp(mode, "replace")) {
392+
self->str_errors = "replace";
393+
Py_DECREF(bytes);
394+
return 0;
395+
}
396+
if (!strcmp(mode, "ignore")) {
397+
self->str_errors = "ignore";
398+
Py_DECREF(bytes);
399+
return 0;
400+
}
401+
if (!strcmp(mode, "backslashreplace")) {
402+
self->str_errors = "backslashreplace";
403+
Py_DECREF(bytes);
404+
return 0;
405+
}
406+
if (!strcmp(mode, "surrogateescape")) {
407+
self->str_errors = "surrogateescape";
408+
Py_DECREF(bytes);
392409
return 0;
393410
}
394411
Py_DECREF(bytes);
395412
}
396413
}
397414
PyErr_Format(PyExc_ValueError,
398-
"invalid str_errors value %R (must be one of 'strict', "
399-
"'error', or 'replace')", value);
415+
"invalid str_errors value %R (must be 'strict', 'error', 'replace', "
416+
"'backslashreplace' or 'surrogateescape')", value);
400417
return -1;
401418
}
402419

@@ -861,13 +878,22 @@ decode_bytestring(CBORDecoderObject *self, uint8_t subtype)
861878
static PyObject *
862879
decode_definite_short_string(CBORDecoderObject *self, Py_ssize_t length)
863880
{
864-
PyObject *bytes_obj = fp_read_object(self, length);
865-
if (!bytes_obj)
881+
char stack_buf[SMALL_STRING_STACK_THRESHOLD];
882+
char *buf = (length <= SMALL_STRING_STACK_THRESHOLD) ? stack_buf : PyMem_Malloc(length);
883+
if (!buf)
884+
return PyErr_NoMemory();
885+
886+
if (self->fp_read(self, buf, length) == -1) {
887+
if (buf != stack_buf)
888+
PyMem_Free(buf);
866889
return NULL;
890+
}
891+
892+
PyObject *ret = PyUnicode_DecodeUTF8(buf, length, self->str_errors);
893+
894+
if (buf != stack_buf)
895+
PyMem_Free(buf);
867896

868-
const char *bytes = PyBytes_AS_STRING(bytes_obj);
869-
PyObject *ret = PyUnicode_FromStringAndSize(bytes, length);
870-
Py_DECREF(bytes_obj);
871897
if (ret && string_namespace_add(self, ret, length) == -1) {
872898
Py_DECREF(ret);
873899
return NULL;
@@ -925,18 +951,18 @@ decode_definite_long_string(CBORDecoderObject *self, Py_ssize_t length)
925951
}
926952

927953
consumed = chunk_length; // workaround for https://github.com/python/cpython/issues/99612
928-
string = PyUnicode_DecodeUTF8Stateful(source_buffer, chunk_length, NULL, &consumed);
954+
string = PyUnicode_DecodeUTF8Stateful(source_buffer, chunk_length, self->str_errors, &consumed);
929955
if (!string)
930956
goto error;
931957

932958
if (ret) {
933959
// Concatenate the result to the existing result
934960
PyObject *joined = PyUnicode_Concat(ret, string);
961+
Py_DECREF(string);
962+
string = NULL;
935963
if (!joined)
936964
goto error;
937965

938-
Py_DECREF(string);
939-
string = NULL;
940966
Py_DECREF(ret);
941967
ret = joined;
942968
} else {
@@ -967,12 +993,35 @@ decode_definite_long_string(CBORDecoderObject *self, Py_ssize_t length)
967993
chunk = NULL;
968994
}
969995

970-
if (ret && string_namespace_add(self, ret, length) == -1)
971-
goto error;
996+
// Handle any remaining bytes in the buffer (incomplete UTF-8 sequences)
997+
if (buffer_length > 0) {
998+
string = PyUnicode_DecodeUTF8(buffer, buffer_length, self->str_errors);
999+
if (!string)
1000+
goto error;
1001+
1002+
if (ret) {
1003+
PyObject *joined = PyUnicode_Concat(ret, string);
1004+
Py_DECREF(string);
1005+
string = NULL;
1006+
if (!joined)
1007+
goto error;
1008+
1009+
Py_DECREF(ret);
1010+
ret = joined;
1011+
} else {
1012+
ret = string;
1013+
string = NULL;
1014+
}
1015+
}
9721016

9731017
if (buffer)
9741018
PyMem_Free(buffer);
9751019

1020+
if (ret && string_namespace_add(self, ret, length) == -1) {
1021+
Py_DECREF(ret);
1022+
return NULL;
1023+
}
1024+
9761025
return ret;
9771026
error:
9781027
Py_XDECREF(ret);

source/decoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ typedef struct CBORDecoderObject_ {
2121
PyObject *object_hook;
2222
PyObject *shareables;
2323
PyObject *stringref_namespace;
24-
PyObject *str_errors;
24+
const char *str_errors; // NULL for strict, "replace" for replace mode
2525
Py_ssize_t max_depth;
2626
bool immutable;
2727
Py_ssize_t shared_index;

tests/test_decoder.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,3 +1204,76 @@ def tag_hook(decoder, tag):
12041204
assert result[0] == [1, 2, 3]
12051205
assert result[1] == "after"
12061206
assert result[2] == "final"
1207+
1208+
1209+
def test_str_errors_error_alias(impl):
1210+
"""'error' is not a valid Python string error handler, normalize to 'strict'."""
1211+
with BytesIO(b"\x65hello") as stream:
1212+
decoder = impl.CBORDecoder(stream, str_errors="error")
1213+
assert decoder.str_errors == "strict"
1214+
1215+
1216+
def test_str_errors_invalid_mode(impl):
1217+
payload = b"\x65hello"
1218+
with pytest.raises(ValueError, match="invalid str_errors value 'invalid'"):
1219+
impl.loads(payload, str_errors="invalid")
1220+
1221+
1222+
@pytest.mark.parametrize(
1223+
"mode, expected",
1224+
[
1225+
("strict", None), # Should raise exception
1226+
("replace", "hello\ufffdworld"), # Should replace invalid byte with U+FFFD
1227+
],
1228+
ids=["strict_mode", "replace_mode"],
1229+
)
1230+
def test_str_errors_handling(impl, mode, expected):
1231+
invalid_utf8 = b"\x6bhello\xffworld" # \xFF is invalid UTF-8
1232+
1233+
if expected is None:
1234+
with pytest.raises(impl.CBORDecodeValueError, match="error decoding unicode string"):
1235+
impl.loads(invalid_utf8, str_errors=mode)
1236+
else:
1237+
result = impl.loads(invalid_utf8, str_errors=mode)
1238+
assert result == expected
1239+
assert len(result) == 11
1240+
assert result[5] == "\ufffd"
1241+
1242+
1243+
@pytest.mark.parametrize(
1244+
"payload, mode, expected",
1245+
[
1246+
(b"\x66hello\xff", "replace", "hello\ufffd"), # <=256 bytes: stack path
1247+
(
1248+
b"\x79\x01\x05" + b"a" * 260 + b"\xff",
1249+
"replace",
1250+
"a" * 260 + "\ufffd",
1251+
), # >256: heap path
1252+
],
1253+
ids=["short_string", "long_string"],
1254+
)
1255+
def test_str_errors_different_lengths(impl, payload, mode, expected):
1256+
"""Tests both stack (<=256 bytes) and heap (>256 bytes) allocation paths."""
1257+
result = impl.loads(payload, str_errors=mode)
1258+
assert result == expected
1259+
assert result[-1] == "\ufffd"
1260+
1261+
1262+
def test_str_errors_long_string_over_65536_bytes(impl):
1263+
"""Issue #255: str_errors not respected for strings >65536 bytes."""
1264+
# 65537 bytes: 65536 'a' + 1 invalid UTF-8 byte
1265+
payload = unhexlify("7a00010001" + "61" * 65536 + "c3")
1266+
result = impl.loads(payload, str_errors="replace")
1267+
assert len(result) == 65537
1268+
assert result[-1] == "\ufffd"
1269+
1270+
1271+
def test_str_errors_long_string_invalid_middle(impl):
1272+
"""Test str_errors with invalid UTF-8 in the middle of a long string."""
1273+
# 65536 'a' + invalid byte + 65536 'b' = 131073 bytes
1274+
payload = unhexlify("7a00020001" + "61" * 65536 + "c3" + "62" * 65536)
1275+
result = impl.loads(payload, str_errors="replace")
1276+
assert len(result) == 131073
1277+
assert result[65536] == "\ufffd"
1278+
assert result[:65536] == "a" * 65536
1279+
assert result[65537:] == "b" * 65536

0 commit comments

Comments
 (0)