Skip to content

Commit 0666b31

Browse files
committed
chore: Add TPCDS benchmarks
1 parent 3926849 commit 0666b31

3 files changed

Lines changed: 23 additions & 113 deletions

File tree

benchmarks/src/tpcds/mod.rs

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,19 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
118
mod run;
219
pub use run::RunOpt;

benchmarks/src/tpcds/run.rs

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -316,12 +316,12 @@ impl RunOpt {
316316
// Obtain a snapshot of the SessionState
317317
let state = ctx.state();
318318
let path = format!("{path}/{table}.parquet");
319-
319+
320320
// Check if the file exists
321321
if !std::path::Path::new(&path).exists() {
322-
eprintln!("Warning: Table file does not exist: {}", path);
322+
eprintln!("Warning registering {table}: Table file does not exist: {path}");
323323
}
324-
324+
325325
let format = ParquetFormat::default()
326326
.with_options(ctx.state().table_options().parquet.clone());
327327

@@ -333,7 +333,9 @@ impl RunOpt {
333333
let schema = options.infer_schema(&state, &table_path).await?;
334334

335335
if self.common.debug {
336-
println!("Inferred schema from {table_path} for table '{table}':\n{schema:#?}\n");
336+
println!(
337+
"Inferred schema from {table_path} for table '{table}':\n{schema:#?}\n"
338+
);
337339
}
338340

339341
let options = if self.sorted {

datafusion/core/tests/parquet/schema.rs

Lines changed: 0 additions & 109 deletions
Original file line number | Diff line number | Diff line change
@@ -231,112 +231,3 @@ fn assert_metadata(batches: &[RecordBatch], expected_metadata: &HashMap<String,
231231
assert_eq!(batch.schema().metadata(), expected_metadata,);
232232
}
233233
}
234-
235-
#[tokio::test]
236-
async fn infer_schema_from_gzip_parquet() {
237-
// Test schema inference from a gzip-compressed parquet file
238-
let file_path = "../../../datafusion-benchmarks/tpcds/data/sf1/web_site.parquet";
239-
240-
// Check if the file exists
241-
if !Path::new(file_path).exists() {
242-
eprintln!("Skipping test: file not found at {}", file_path);
243-
return;
244-
}
245-
246-
let ctx = SessionContext::new();
247-
248-
// Read the parquet file and infer schema
249-
let df = ctx
250-
.read_parquet(file_path, ParquetReadOptions::default())
251-
.await
252-
.expect("Failed to read parquet file");
253-
254-
let schema = df.schema();
255-
256-
// Verify that schema was successfully inferred
257-
assert!(
258-
!schema.fields().is_empty(),
259-
"Schema should have at least one field"
260-
);
261-
262-
// Print schema for debugging
263-
println!("Inferred schema from gzip parquet file:");
264-
for field in schema.fields() {
265-
println!(" - {}: {:?}", field.name(), field.data_type());
266-
}
267-
268-
// Verify we can actually read data from the file
269-
let results = df.collect().await.expect("Failed to collect results");
270-
271-
// Verify we got some data
272-
let total_rows: usize = results.iter().map(|batch| batch.num_rows()).sum();
273-
println!("Total rows read: {}", total_rows);
274-
275-
assert!(
276-
total_rows > 0,
277-
"Should have read at least one row from the file"
278-
);
279-
}
280-
281-
#[tokio::test]
282-
async fn infer_schema_from_gzip_parquet_with_listing_options() {
283-
use datafusion::datasource::file_format::parquet::ParquetFormat;
284-
use datafusion::datasource::listing::{ListingOptions, ListingTableUrl};
285-
use datafusion_common::file_options::file_type::DEFAULT_PARQUET_EXTENSION;
286-
287-
// Test schema inference using ListingOptions and ParquetFormat
288-
let file_path = "../../../datafusion-benchmarks/tpcds/data/sf1/web_site.parquet";
289-
290-
// Check if the file exists
291-
if !Path::new(file_path).exists() {
292-
eprintln!("Skipping test: file not found at {}", file_path);
293-
return;
294-
}
295-
296-
let ctx = SessionContext::new();
297-
let state = ctx.state();
298-
299-
// Create ParquetFormat with options from the session state
300-
let format = ParquetFormat::default()
301-
.with_options(state.table_options().parquet.clone());
302-
303-
// Parse the file path as a ListingTableUrl
304-
let table_path = ListingTableUrl::parse(file_path)
305-
.expect("Failed to parse table path");
306-
307-
// Create ListingOptions with the ParquetFormat
308-
let options = ListingOptions::new(Arc::new(format))
309-
.with_file_extension(DEFAULT_PARQUET_EXTENSION)
310-
.with_target_partitions(state.config().target_partitions())
311-
.with_collect_stat(state.config().collect_statistics());
312-
313-
// Infer schema using the ListingOptions
314-
let schema = options
315-
.infer_schema(&state, &table_path)
316-
.await
317-
.expect("Failed to infer schema");
318-
319-
// Verify that schema was successfully inferred
320-
assert!(
321-
!schema.fields().is_empty(),
322-
"Schema should have at least one field"
323-
);
324-
325-
// Print schema for debugging
326-
println!("Inferred schema using ListingOptions:");
327-
for field in schema.fields() {
328-
println!(" - {}: {:?}", field.name(), field.data_type());
329-
}
330-
331-
// Verify expected number of fields for web_site table
332-
assert_eq!(
333-
schema.fields().len(),
334-
26,
335-
"web_site table should have 26 fields"
336-
);
337-
338-
// Verify some specific fields exist
339-
assert!(schema.field_with_name("web_site_sk").is_ok());
340-
assert!(schema.field_with_name("web_site_id").is_ok());
341-
assert!(schema.field_with_name("web_name").is_ok());
342-
}

0 commit comments

Comments
 (0)