Skip to content

Commit 6ff2340

Browse files
authored
JSON perf improvements (#314)
1 parent 367a69e commit 6ff2340

File tree

12 files changed

+868
-63
lines changed

12 files changed

+868
-63
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@
1919
/crates/fuzz/corpus/
2020
/.worktrees/
2121
/.zed/
22+
/scripts/json_files/

crates/monty/src/bytecode/vm/mod.rs

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use crate::{
3131
heap_data::{Closure, FunctionDefaults},
3232
intern::{FunctionId, Interns, StringId},
3333
io::PrintWriter,
34-
modules::StandardLib,
34+
modules::{StandardLib, json::JsonStringCache},
3535
os::OsFunction,
3636
parse::CodeRange,
3737
resource::ResourceTracker,
@@ -583,6 +583,13 @@ pub struct VM<'h, 'a, T: ResourceTracker> {
583583
/// back to a `NameError`, so the traceback points to the name reference rather than
584584
/// the call expression.
585585
ext_function_load_ip: Option<usize>,
586+
587+
/// Per-run string cache for `json.loads()`.
588+
///
589+
/// Deduplicates heap allocations for repeated strings (especially dict keys)
590+
/// across multiple `json.loads()` calls within a single execution. Lazily
591+
/// initialized on first use, cleaned up in [`cleanup()`](Self::cleanup) and [`snapshot()`](Self::snapshot).
592+
pub(crate) json_string_cache: JsonStringCache,
586593
}
587594

588595
impl<'h, 'a, T: ResourceTracker> VM<'h, 'a, T> {
@@ -605,6 +612,7 @@ impl<'h, 'a, T: ResourceTracker> VM<'h, 'a, T> {
605612
scheduler: Scheduler::new(),
606613
ext_function_load_ip: None, // Set by LoadGlobalCallable/LoadLocalCallable
607614
module_code: None,
615+
json_string_cache: JsonStringCache::default(),
608616
}
609617
}
610618

@@ -666,8 +674,10 @@ impl<'h, 'a, T: ResourceTracker> VM<'h, 'a, T> {
666674
scheduler: snapshot.scheduler,
667675
module_code: Some(module_code),
668676
ext_function_load_ip: None,
677+
json_string_cache: JsonStringCache::default(),
669678
}
670679
}
680+
671681
/// Consumes the VM and creates a snapshot for pause/resume.
672682
///
673683
/// **Ownership transfer:** This method takes `self` by value, consuming the VM.
@@ -676,7 +686,11 @@ impl<'h, 'a, T: ResourceTracker> VM<'h, 'a, T> {
676686
///
677687
/// This is NOT a clone - it's a transfer. After calling this, the original VM
678688
/// is gone and only the snapshot (+ serialized heap/namespaces) represents the state.
679-
pub fn snapshot(self) -> VMSnapshot {
689+
pub fn snapshot(mut self) -> VMSnapshot {
690+
// Drop cached JSON strings before consuming the VM — they are not
691+
// included in the snapshot and their refcounts must be decremented.
692+
self.json_string_cache.drop_all(self.heap);
693+
680694
VMSnapshot {
681695
// Move values directly — no clone, no refcount increment needed
682696
// (the VM owned them, now the snapshot owns them)
@@ -709,6 +723,8 @@ impl<'h, 'a, T: ResourceTracker> VM<'h, 'a, T> {
709723
// Clean up scheduler state (task stacks, pending calls, resolved values, frame cells)
710724
self.scheduler.cleanup(self.heap);
711725
self.globals.drain(..).drop_with_heap(self.heap);
726+
// Release cached JSON string values
727+
self.json_string_cache.drop_all(self.heap);
712728
}
713729

714730
/// Returns the `stack_base` of the current (topmost) call frame.
@@ -1710,9 +1726,14 @@ impl<'h, 'a, T: ResourceTracker> VM<'h, 'a, T> {
17101726
let stack_roots = self.stack.iter().filter_map(Value::ref_id);
17111727
let globals_roots = self.globals.iter().filter_map(Value::ref_id);
17121728
let exc_roots = self.exception_stack.iter().filter_map(Value::ref_id);
1729+
let json_cache_roots = self.json_string_cache.gc_roots();
17131730

17141731
// Collect all roots into a vec to avoid lifetime issues
1715-
let roots: Vec<HeapId> = stack_roots.chain(globals_roots).chain(exc_roots).collect();
1732+
let roots: Vec<HeapId> = stack_roots
1733+
.chain(globals_roots)
1734+
.chain(exc_roots)
1735+
.chain(json_cache_roots)
1736+
.collect();
17161737

17171738
self.heap.collect_garbage(roots);
17181739
}

crates/monty/src/modules/json/dump.rs

Lines changed: 115 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
use std::{
77
cmp::Ordering,
88
fmt::{Display, Write},
9-
mem,
109
};
1110

1211
use crate::{
@@ -17,7 +16,7 @@ use crate::{
1716
heap::{DropWithHeap, HeapData, HeapGuard, HeapId, HeapReadOutput},
1817
intern::StaticStrings,
1918
resource::ResourceTracker,
20-
sorting::sort_indices,
19+
sorting::{apply_permutation, sort_indices},
2120
types::{PyTrait, long_int::check_bigint_str_digits_limit, str::allocate_string},
2221
value::Value,
2322
};
@@ -521,18 +520,7 @@ fn serialize_dict(
521520
vm: &mut VM<'_, '_, impl ResourceTracker>,
522521
) -> RunResult<()> {
523522
if config.skipkeys() {
524-
// Cannot use `retain` here because removed `Value::Ref` entries need
525-
// `drop_with_heap` to decrement reference counts properly.
526-
let mut i = 0;
527-
while i < entries.len() {
528-
if is_json_key_allowed(&entries[i].0, vm) {
529-
i += 1;
530-
} else {
531-
let (key, value) = entries.remove(i);
532-
key.drop_with_heap(vm);
533-
value.drop_with_heap(vm);
534-
}
535-
}
523+
skip_disallowed_dict_keys(entries, vm);
536524
} else if let Some((key, _)) = entries.iter().find(|(key, _)| !is_json_key_allowed(key, vm)) {
537525
return Err(ExcType::json_invalid_key_error(key.py_type(vm)));
538526
}
@@ -575,18 +563,31 @@ fn sort_dict_entries(entries: &mut Vec<(Value, Value)>, vm: &mut VM<'_, '_, impl
575563
let mut compare_values_guard = HeapGuard::new(compare_values, vm);
576564
let (compare_values, vm) = compare_values_guard.as_parts_mut();
577565
sort_indices(&mut indices, compare_values.as_slice(), false, vm)?;
566+
apply_permutation(entries.as_mut_slice(), &mut indices);
567+
Ok(())
568+
}
578569

579-
let mut ordered: Vec<(Value, Value)> = Vec::with_capacity(entries.len());
580-
for index in indices {
581-
ordered.push((
582-
entries[index].0.clone_with_heap(vm),
583-
entries[index].1.clone_with_heap(vm),
584-
));
570+
/// Removes dict entries whose keys are not JSON-serializable, preserving order.
571+
///
572+
/// `skipkeys=True` must drop invalid entries without disturbing the relative
573+
/// order of the retained pairs. A two-pointer compaction avoids the repeated
574+
/// shifting cost of `Vec::remove(i)` while still cleaning up skipped `Value`
575+
/// references with `drop_with_heap`.
576+
fn skip_disallowed_dict_keys(entries: &mut Vec<(Value, Value)>, vm: &mut VM<'_, '_, impl ResourceTracker>) {
577+
let mut write = 0;
578+
for read in 0..entries.len() {
579+
if is_json_key_allowed(&entries[read].0, vm) {
580+
if write != read {
581+
entries.swap(write, read);
582+
}
583+
write += 1;
584+
}
585585
}
586586

587-
let old_entries = mem::replace(entries, ordered);
588-
old_entries.drop_with_heap(vm);
589-
Ok(())
587+
for (key, value) in entries.drain(write..) {
588+
key.drop_with_heap(vm);
589+
value.drop_with_heap(vm);
590+
}
590591
}
591592

592593
/// Returns whether a value is an allowed JSON object key type.
@@ -788,30 +789,105 @@ fn with_entered_container<R>(
788789

789790
/// Writes a Rust string as a JSON string token.
790791
///
791-
/// The writer escapes control characters, quotes, and backslashes in all modes.
792-
/// When `ensure_ascii` is enabled, non-ASCII code points are emitted as `\uXXXX`
793-
/// escapes using surrogate pairs for supplementary-plane characters.
792+
/// Uses a byte-oriented batch strategy inspired by serde_json: a 256-entry
793+
/// lookup table classifies each byte in O(1), and contiguous runs of safe bytes
794+
/// are flushed with a single `push_str` rather than character-by-character.
795+
///
796+
/// When `ensure_ascii` is enabled, DEL (0x7F) and non-ASCII code points (bytes >= 0x80) are
797+
/// emitted as `\uXXXX` escapes using surrogate pairs for supplementary-plane
798+
/// characters.
794799
fn write_json_string(value: &str, out: &mut String, ensure_ascii: bool) {
795800
out.push('"');
796-
for ch in value.chars() {
797-
match ch {
798-
'"' => out.push_str("\\\""),
799-
'\\' => out.push_str("\\\\"),
800-
'\u{08}' => out.push_str("\\b"),
801-
'\u{0C}' => out.push_str("\\f"),
802-
'\n' => out.push_str("\\n"),
803-
'\r' => out.push_str("\\r"),
804-
'\t' => out.push_str("\\t"),
805-
ch if ch <= '\u{1F}' => {
806-
write!(out, "\\u{:04x}", ch as u32).expect("writing to String cannot fail");
801+
let bytes = value.as_bytes();
802+
let mut start = 0;
803+
let mut i = 0;
804+
805+
while i < bytes.len() {
806+
let byte = bytes[i];
807+
808+
if ensure_ascii && byte >= 0x7F {
809+
// Flush the safe ASCII run accumulated so far.
810+
out.push_str(&value[start..i]);
811+
if byte == 0x7F {
812+
// DEL (0x7F) is a control character that CPython escapes.
813+
out.push_str("\\u007f");
814+
i += 1;
815+
} else {
816+
// Decode the full character at this position and emit \uXXXX escapes.
817+
let ch = value[i..].chars().next().expect("valid UTF-8");
818+
write_json_escape_for_non_ascii(ch, out);
819+
i += ch.len_utf8();
820+
}
821+
start = i;
822+
continue;
823+
}
824+
825+
let escape = ESCAPE_TABLE[byte as usize];
826+
if escape == 0 {
827+
// Safe byte — keep scanning.
828+
i += 1;
829+
continue;
830+
}
831+
832+
// Flush the safe run before this byte.
833+
out.push_str(&value[start..i]);
834+
835+
// Write the escape sequence.
836+
match escape {
837+
b'b' => out.push_str("\\b"),
838+
b't' => out.push_str("\\t"),
839+
b'n' => out.push_str("\\n"),
840+
b'f' => out.push_str("\\f"),
841+
b'r' => out.push_str("\\r"),
842+
b'"' => out.push_str("\\\""),
843+
b'\\' => out.push_str("\\\\"),
844+
b'u' => {
845+
write!(out, "\\u{:04x}", u32::from(byte)).expect("writing to String cannot fail");
807846
}
808-
ch if ensure_ascii && (ch as u32) > 0x7E => write_json_escape_for_non_ascii(ch, out),
809-
ch => out.push(ch),
847+
_ => unreachable!(),
810848
}
849+
850+
i += 1;
851+
start = i;
811852
}
853+
854+
// Flush the final safe run.
855+
out.push_str(&value[start..]);
812856
out.push('"');
813857
}
814858

859+
/// Byte lookup table for JSON string escaping.
860+
///
861+
/// Each entry is either 0 (byte is safe, no escaping needed) or a shorthand
862+
/// character that indicates which escape to emit:
863+
/// - `b'"'` → `\"`
864+
/// - `b'\\'` → `\\`
865+
/// - `b'b'` → `\b` (backspace, 0x08)
866+
/// - `b't'` → `\t` (tab, 0x09)
867+
/// - `b'n'` → `\n` (newline, 0x0A)
868+
/// - `b'f'` → `\f` (form feed, 0x0C)
869+
/// - `b'r'` → `\r` (carriage return, 0x0D)
870+
/// - `b'u'` → `\u00XX` (other control characters, 0x00–0x1F)
871+
#[rustfmt::skip]
872+
static ESCAPE_TABLE: [u8; 256] = {
873+
let mut table = [0u8; 256];
874+
// Control characters 0x00–0x1F default to \u00XX escapes.
875+
let mut i = 0;
876+
while i < 0x20 {
877+
table[i] = b'u';
878+
i += 1;
879+
}
880+
// Override the named escapes.
881+
table[0x08] = b'b'; // backspace
882+
table[0x09] = b't'; // tab
883+
table[0x0A] = b'n'; // newline
884+
table[0x0C] = b'f'; // form feed
885+
table[0x0D] = b'r'; // carriage return
886+
table[0x22] = b'"'; // quote
887+
table[0x5C] = b'\\'; // backslash
888+
table
889+
};
890+
815891
/// Writes a non-ASCII character using JSON `\uXXXX` escapes.
816892
///
817893
/// Code points above `U+FFFF` are encoded as UTF-16 surrogate pairs to match

0 commit comments

Comments
 (0)