1717
1818use std:: sync:: Arc ;
1919
20+ use crate :: args:: TpchFormat ;
21+ use crate :: config:: AppConfig ;
2022use color_eyre:: { eyre, Result } ;
2123use datafusion:: { arrow:: record_batch:: RecordBatch , datasource:: listing:: ListingTableUrl } ;
2224use datafusion_app:: {
@@ -35,7 +37,12 @@ use tpchgen_arrow::{
3537} ;
3638use url:: Url ;
3739
38- use crate :: config:: AppConfig ;
40+ #[ cfg( feature = "vortex" ) ]
41+ use {
42+ datafusion:: arrow:: compute:: concat_batches,
43+ vortex:: { arrow:: FromArrowArray , stream:: ArrayStreamAdapter , ArrayRef } ,
44+ vortex_file:: VortexWriteOptions ,
45+ } ;
3946
4047enum GeneratorType {
4148 Customer ,
@@ -125,7 +132,75 @@ where
125132 Ok ( ( ) )
126133}
127134
128- pub async fn generate ( config : AppConfig , scale_factor : f64 ) -> Result < ( ) > {
/// Materialize every generated `RecordBatch` for `table_type` into a single
/// Vortex file named `data.vortex` under `table_path`, then upload the
/// encoded bytes through the provided object store.
///
/// Returns an error when the generator produced no batches, or when
/// concatenation, Vortex encoding, or the object-store put fails.
#[cfg(feature = "vortex")]
async fn write_batches_to_vortex<I>(
    batches: std::iter::Peekable<I>,
    table_path: &Url,
    table_type: &str,
    store: Arc<dyn ObjectStore>,
) -> Result<()>
where
    I: Iterator<Item = RecordBatch>,
{
    // Drain the generator up front; Vortex is written as one array, so we
    // need all batches in hand before encoding.
    let collected: Vec<RecordBatch> = batches.collect();
    if collected.is_empty() {
        return Err(eyre::Error::msg(format!(
            "unable to generate {table_type} TPC-H data"
        )));
    }

    let file_url = table_path.join("data.vortex")?;
    info!("...file URL '{file_url}'");

    // Merge all batches into a single RecordBatch; they share the schema of
    // the first batch.
    let schema = collected[0].schema();
    let merged = concat_batches(&schema, &collected)?;

    // Convert the Arrow batch to a Vortex array and wrap it in a
    // one-element stream, since the writer consumes an ArrayStream.
    let vortex_array = ArrayRef::from_arrow(merged, false);
    let dtype = vortex_array.dtype().clone();
    let stream = ArrayStreamAdapter::new(
        dtype,
        futures::stream::iter(std::iter::once(Ok(vortex_array))),
    );

    // Serialize the whole file into an in-memory buffer before uploading.
    let mut encoded: Vec<u8> = Vec::new();
    info!("...writing {table_type} batches to vortex format");
    VortexWriteOptions::default()
        .write(&mut encoded, stream)
        .await
        .map_err(|e| eyre::Error::msg(format!("Failed to write Vortex file: {}", e)))?;

    let file_path = object_store::path::Path::from_url_path(file_url.path())?;
    info!("...putting to file path {}", file_path);
    store.put(&file_path, encoded.into()).await?;
    Ok(())
}
183+
184+ async fn write_batches < I > (
185+ batches : std:: iter:: Peekable < I > ,
186+ table_path : & Url ,
187+ table_type : & str ,
188+ store : Arc < dyn ObjectStore > ,
189+ format : & TpchFormat ,
190+ ) -> Result < ( ) >
191+ where
192+ I : Iterator < Item = RecordBatch > ,
193+ {
194+ match format {
195+ TpchFormat :: Parquet => {
196+ write_batches_to_parquet ( batches, table_path, table_type, store) . await
197+ }
198+ #[ cfg( feature = "vortex" ) ]
199+ TpchFormat :: Vortex => write_batches_to_vortex ( batches, table_path, table_type, store) . await ,
200+ }
201+ }
202+
203+ pub async fn generate ( config : AppConfig , scale_factor : f64 , format : TpchFormat ) -> Result < ( ) > {
129204 let merged_exec_config = merge_configs ( config. shared . clone ( ) , config. cli . execution . clone ( ) ) ;
130205 let session_state_builder = DftSessionStateBuilder :: try_new ( Some ( merged_exec_config. clone ( ) ) ) ?
131206 . with_extensions ( )
@@ -155,96 +230,112 @@ pub async fn generate(config: AppConfig, scale_factor: f64) -> Result<()> {
155230 info ! ( "...generating customers" ) ;
156231 let arrow_generator =
157232 CustomerArrow :: new ( CustomerGenerator :: new ( scale_factor, 1 , 1 ) ) ;
158- write_batches_to_parquet (
233+ write_batches (
159234 arrow_generator. peekable ( ) ,
160235 & table_path,
161236 "Customer" ,
162237 Arc :: clone ( & store) ,
238+ & format,
163239 )
164240 . await ?;
165241 }
166242 GeneratorType :: Order => {
167243 info ! ( "...generating orders" ) ;
168244 let arrow_generator = OrderArrow :: new ( OrderGenerator :: new ( scale_factor, 1 , 1 ) ) ;
169- write_batches_to_parquet (
245+ write_batches (
170246 arrow_generator. peekable ( ) ,
171247 & table_path,
172248 "Order" ,
173249 Arc :: clone ( & store) ,
250+ & format,
174251 )
175252 . await ?;
176253 }
177254 GeneratorType :: LineItem => {
178255 info ! ( "...generating LineItems" ) ;
179256 let arrow_generator =
180257 LineItemArrow :: new ( LineItemGenerator :: new ( scale_factor, 1 , 1 ) ) ;
181- write_batches_to_parquet (
258+ write_batches (
182259 arrow_generator. peekable ( ) ,
183260 & table_path,
184261 "LineItem" ,
185262 Arc :: clone ( & store) ,
263+ & format,
186264 )
187265 . await ?;
188266 }
189267 GeneratorType :: Nation => {
190268 info ! ( "...generating Nations" ) ;
191269 let arrow_generator = NationArrow :: new ( NationGenerator :: new ( scale_factor, 1 , 1 ) ) ;
192- write_batches_to_parquet (
270+ write_batches (
193271 arrow_generator. peekable ( ) ,
194272 & table_path,
195273 "Nation" ,
196274 Arc :: clone ( & store) ,
275+ & format,
197276 )
198277 . await ?;
199278 }
200279 GeneratorType :: Part => {
201280 info ! ( "...generating Parts" ) ;
202281 let arrow_generator = PartArrow :: new ( PartGenerator :: new ( scale_factor, 1 , 1 ) ) ;
203- write_batches_to_parquet (
282+ write_batches (
204283 arrow_generator. peekable ( ) ,
205284 & table_path,
206285 "Part" ,
207286 Arc :: clone ( & store) ,
287+ & format,
208288 )
209289 . await ?;
210290 }
211291 GeneratorType :: PartSupp => {
212292 info ! ( "...generating PartSupps" ) ;
213293 let arrow_generator =
214294 PartSuppArrow :: new ( PartSuppGenerator :: new ( scale_factor, 1 , 1 ) ) ;
215- write_batches_to_parquet (
295+ write_batches (
216296 arrow_generator. peekable ( ) ,
217297 & table_path,
218298 "PartSupp" ,
219299 Arc :: clone ( & store) ,
300+ & format,
220301 )
221302 . await ?;
222303 }
223304 GeneratorType :: Region => {
224305 info ! ( "...generating Regions" ) ;
225306 let arrow_generator = RegionArrow :: new ( RegionGenerator :: new ( scale_factor, 1 , 1 ) ) ;
226- write_batches_to_parquet (
307+ write_batches (
227308 arrow_generator. peekable ( ) ,
228309 & table_path,
229310 "Region" ,
230311 Arc :: clone ( & store) ,
312+ & format,
231313 )
232314 . await ?;
233315 }
234316 GeneratorType :: Supplier => {
235317 info ! ( "...generating Suppliers" ) ;
236318 let arrow_generator =
237319 SupplierArrow :: new ( SupplierGenerator :: new ( scale_factor, 1 , 1 ) ) ;
238- write_batches_to_parquet (
320+ write_batches (
239321 arrow_generator. peekable ( ) ,
240322 & table_path,
241323 "Supplier" ,
242324 Arc :: clone ( & store) ,
325+ & format,
243326 )
244327 . await ?;
245328 }
246329 }
247330 }
248331
332+ let tpch_dir = config
333+ . db
334+ . path
335+ . join ( "tables/" ) ?
336+ . join ( "dft/" ) ?
337+ . join ( "tpch/" ) ?;
338+ println ! ( "TPC-H dataset saved to: {}" , tpch_dir) ;
339+
249340 Ok ( ( ) )
250341}
0 commit comments