Skip to content

Commit 2b7a450

Browse files
authored
Merge branch 'main' into set-comp-subquery
2 parents 38f0248 + 6746007 commit 2b7a450

File tree

63 files changed

+2506
-1493
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+2506
-1493
lines changed

.github/actions/setup-builder/action.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,17 @@ runs:
4646
# https://github.com/actions/checkout/issues/766
4747
shell: bash
4848
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
49+
- name: Remove unnecessary preinstalled software
50+
shell: bash
51+
run: |
52+
echo "Disk space before cleanup:"
53+
df -h
54+
apt-get clean
55+
# remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
56+
rm -rf /__t/* || true
57+
# remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
58+
rm -rf /host/usr/local/.ghcup || true
59+
# remove Android library: about 7.8GB (host /usr/local/lib/android)
60+
rm -rf /host/usr/local/lib/android || true
61+
echo "Disk space after cleanup:"
62+
df -h

.github/workflows/rust.yml

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -272,18 +272,6 @@ jobs:
272272
volumes:
273273
- /usr/local:/host/usr/local
274274
steps:
275-
- name: Remove unnecessary preinstalled software
276-
run: |
277-
echo "Disk space before cleanup:"
278-
df -h
279-
# remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
280-
rm -rf /__t/* || true
281-
# remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
282-
rm -rf /host/usr/local/.ghcup || true
283-
# remove Android library: about 7.8GB (host /usr/local/lib/android)
284-
rm -rf /host/usr/local/lib/android || true
285-
echo "Disk space after cleanup:"
286-
df -h
287275
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
288276
with:
289277
submodules: true
@@ -374,19 +362,6 @@ jobs:
374362
with:
375363
save-if: ${{ github.ref_name == 'main' }}
376364
shared-key: "amd-ci-linux-test-example"
377-
- name: Remove unnecessary preinstalled software
378-
run: |
379-
echo "Disk space before cleanup:"
380-
df -h
381-
apt-get clean
382-
rm -rf /__t/CodeQL
383-
rm -rf /__t/PyPy
384-
rm -rf /__t/Java_Temurin-Hotspot_jdk
385-
rm -rf /__t/Python
386-
rm -rf /__t/go
387-
rm -rf /__t/Ruby
388-
echo "Disk space after cleanup:"
389-
df -h
390365
- name: Run examples
391366
run: |
392367
# test datafusion-sql examples

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmarks/src/bin/external_aggr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ use datafusion::datasource::listing::{
3434
use datafusion::datasource::{MemTable, TableProvider};
3535
use datafusion::error::Result;
3636
use datafusion::execution::memory_pool::FairSpillPool;
37-
use datafusion::execution::memory_pool::{human_readable_size, units};
3837
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
3938
use datafusion::execution::SessionStateBuilder;
4039
use datafusion::physical_plan::display::DisplayableExecutionPlan;
@@ -44,6 +43,7 @@ use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt, QueryResult};
4443
use datafusion_common::instant::Instant;
4544
use datafusion_common::utils::get_available_parallelism;
4645
use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION};
46+
use datafusion_common::{human_readable_size, units};
4747

4848
#[derive(Debug, StructOpt)]
4949
#[structopt(

benchmarks/src/util/memory.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
pub fn print_memory_stats() {
2020
#[cfg(all(feature = "mimalloc", feature = "mimalloc_extended"))]
2121
{
22-
use datafusion::execution::memory_pool::human_readable_size;
22+
use datafusion_common::human_readable_size;
2323
let mut peak_rss = 0;
2424
let mut peak_commit = 0;
2525
let mut page_faults = 0;

datafusion/catalog/src/listing_schema.rs

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -127,22 +127,13 @@ impl ListingSchemaProvider {
127127
.factory
128128
.create(
129129
state,
130-
&CreateExternalTable {
131-
schema: Arc::new(DFSchema::empty()),
130+
&CreateExternalTable::builder(
132131
name,
133-
location: table_url,
134-
file_type: self.format.clone(),
135-
table_partition_cols: vec![],
136-
if_not_exists: false,
137-
or_replace: false,
138-
temporary: false,
139-
definition: None,
140-
order_exprs: vec![],
141-
unbounded: false,
142-
options: Default::default(),
143-
constraints: Default::default(),
144-
column_defaults: Default::default(),
145-
},
132+
table_url,
133+
self.format.clone(),
134+
Arc::new(DFSchema::empty()),
135+
)
136+
.build(),
146137
)
147138
.await?;
148139
let _ =
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Helpers for rendering sizes, counts, and durations in human readable form.
19+
20+
/// Byte-size multipliers used when rendering sizes.
///
/// Every unit is a power of two (1024-based), so `KB` is 1024 bytes rather
/// than 1000.
pub mod units {
    /// 2^10 bytes.
    pub const KB: u64 = 1 << 10;
    /// 2^20 bytes.
    pub const MB: u64 = KB << 10;
    /// 2^30 bytes.
    pub const GB: u64 = MB << 10;
    /// 2^40 bytes.
    pub const TB: u64 = GB << 10;
}
27+
28+
/// Present size in human-readable form
29+
pub fn human_readable_size(size: usize) -> String {
30+
use units::*;
31+
32+
let size = size as u64;
33+
let (value, unit) = {
34+
if size >= 2 * TB {
35+
(size as f64 / TB as f64, "TB")
36+
} else if size >= 2 * GB {
37+
(size as f64 / GB as f64, "GB")
38+
} else if size >= 2 * MB {
39+
(size as f64 / MB as f64, "MB")
40+
} else if size >= 2 * KB {
41+
(size as f64 / KB as f64, "KB")
42+
} else {
43+
(size as f64, "B")
44+
}
45+
};
46+
format!("{value:.1} {unit}")
47+
}
48+
49+
/// Renders a count compactly using decimal (power-of-1000) suffixes:
/// ` K` (thousand), ` M` (million), ` B` (billion) and ` T` (trillion).
///
/// Counts below 1000 are printed verbatim with no suffix. Larger counts are
/// scaled and printed with two decimals when the scaled value is below 100
/// (`"10.12 K"`) and one decimal otherwise (`"123.4 K"`).
///
/// The precision is chosen *before* rounding, so values just under a
/// boundary can round up in the output, e.g. `99_999` renders as
/// `"100.00 K"` and `999_999_999_999` as `"1000.0 B"`.
pub fn human_readable_count(count: usize) -> String {
    let n = count as u64;

    let (divisor, suffix) = match n {
        // Small counts need no scaling or suffix.
        0..=999 => return n.to_string(),
        1_000..=999_999 => (1_000.0, " K"),
        1_000_000..=999_999_999 => (1_000_000.0, " M"),
        1_000_000_000..=999_999_999_999 => (1_000_000_000.0, " B"),
        _ => (1_000_000_000_000.0, " T"),
    };

    let scaled = n as f64 / divisor;
    // Fewer decimals for three-digit (or wider) integer parts.
    let decimals: usize = if scaled >= 100.0 { 1 } else { 2 };
    format!("{scaled:.decimals$}{suffix}")
}
75+
76+
/// Renders a duration given in nanoseconds using the largest unit that keeps
/// the value at or above 1: `s`, `ms`, `µs`, or `ns`.
///
/// Seconds, milliseconds and microseconds are printed with two decimal
/// places (`"1.23ms"`); sub-microsecond values are printed as a whole number
/// of nanoseconds (`"999ns"`). Because the unit is picked before rounding,
/// values just under a boundary can print as e.g. `"1000.00µs"`.
pub fn human_readable_duration(nanos: u64) -> String {
    // (nanoseconds per unit, suffix), widest unit first.
    const SCALES: [(u64, &str); 3] = [
        (1_000_000_000, "s"),
        (1_000_000, "ms"),
        (1_000, "µs"),
    ];

    match SCALES.iter().find(|&&(per_unit, _)| nanos >= per_unit) {
        Some(&(per_unit, suffix)) => {
            format!("{:.2}{suffix}", nanos as f64 / per_unit as f64)
        }
        // Below one microsecond: no fractional part to show.
        None => format!("{nanos}ns"),
    }
}
98+
99+
#[cfg(test)]
mod tests {
    use super::{human_readable_count, human_readable_duration};

    /// Checks suffix selection and precision for counts, including values
    /// just below each boundary that round up in the rendered output.
    #[test]
    fn test_human_readable_count() {
        let cases: &[(usize, &str)] = &[
            (0, "0"),
            (1, "1"),
            (999, "999"),
            (1_000, "1.00 K"),
            (10_100, "10.10 K"),
            (1_532, "1.53 K"),
            (99_999, "100.00 K"),
            (1_000_000, "1.00 M"),
            (1_532_000, "1.53 M"),
            (99_000_000, "99.00 M"),
            (123_456_789, "123.5 M"),
            (1_000_000_000, "1.00 B"),
            (1_532_000_000, "1.53 B"),
            (999_999_999_999, "1000.0 B"),
            (1_000_000_000_000, "1.00 T"),
            (42_000_000_000_000, "42.00 T"),
        ];

        for &(count, expected) in cases {
            assert_eq!(human_readable_count(count), expected);
        }
    }

    /// Checks unit selection for durations from nanoseconds up to seconds,
    /// including values just below each unit boundary.
    #[test]
    fn test_human_readable_duration() {
        let cases: &[(u64, &str)] = &[
            (0, "0ns"),
            (1, "1ns"),
            (999, "999ns"),
            (1_000, "1.00µs"),
            (1_234, "1.23µs"),
            (999_999, "1000.00µs"),
            (1_000_000, "1.00ms"),
            (11_295_377, "11.30ms"),
            (1_234_567, "1.23ms"),
            (999_999_999, "1000.00ms"),
            (1_000_000_000, "1.00s"),
            (1_234_567_890, "1.23s"),
            (42_000_000_000, "42.00s"),
        ];

        for &(nanos, expected) in cases {
            assert_eq!(human_readable_duration(nanos), expected);
        }
    }
}

datafusion/common/src/display/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
//! Types for plan display
1919
2020
mod graphviz;
21+
pub mod human_readable;
2122
pub use graphviz::*;
2223

2324
use std::{

datafusion/common/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ pub use dfschema::{
6969
DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, qualified_name,
7070
};
7171
pub use diagnostic::Diagnostic;
72+
pub use display::human_readable::{
73+
human_readable_count, human_readable_duration, human_readable_size, units,
74+
};
7275
pub use error::{
7376
DataFusionError, Result, SchemaError, SharedResult, field_not_found,
7477
unqualified_field_not_found,

datafusion/common/src/stats.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,31 @@ impl Statistics {
317317
}
318318
}
319319

320+
/// Calculates `total_byte_size` based on the schema and `num_rows`.
321+
/// If any of the columns has non-primitive width, `total_byte_size` is set to inexact.
322+
pub fn calculate_total_byte_size(&mut self, schema: &Schema) {
323+
let mut row_size = Some(0);
324+
for field in schema.fields() {
325+
match field.data_type().primitive_width() {
326+
Some(width) => {
327+
row_size = row_size.map(|s| s + width);
328+
}
329+
None => {
330+
row_size = None;
331+
break;
332+
}
333+
}
334+
}
335+
match row_size {
336+
None => {
337+
self.total_byte_size = self.total_byte_size.to_inexact();
338+
}
339+
Some(size) => {
340+
self.total_byte_size = self.num_rows.multiply(&Precision::Exact(size));
341+
}
342+
}
343+
}
344+
320345
/// Returns an unbounded `ColumnStatistics` for each field in the schema.
321346
pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
322347
schema

0 commit comments

Comments
 (0)