@@ -27,7 +27,7 @@ const fs = require('node:fs');
2727const path = require ( 'node:path' ) ;
2828const yargs = require ( 'yargs' ) ;
2929const {
30- has, keys , pickBy, isEmpty,
30+ has, pickBy, isEmpty,
3131} = require ( 'lodash' ) ;
3232
3333// Helper functions.
@@ -82,7 +82,7 @@ function convertAuthorsIntoBibJSON(authors) {
8282 } ) ) ;
8383}
8484
85- function convertCitationsToBibJSON ( citation ) {
85+ function convertCitationsToBibJSON ( citation , issues = [ ] ) {
8686 // Convert a citation from its Regnum representation into the
8787 // BibJSON format (http://okfnlabs.org/bibjson/). We use this rather than
8888 // CSL-JSON (https://github.com/citation-style-language/schema) because it's
@@ -92,7 +92,7 @@ function convertCitationsToBibJSON(citation) {
9292 if ( ! citation ) return [ ] ;
9393 if ( Array . isArray ( citation ) ) {
9494 // If given an array of citation objects, convert each one separately.
95- return citation . map ( c => convertCitationsToBibJSON ( c ) )
95+ return citation . map ( c => convertCitationsToBibJSON ( c , issues ) )
9696 . reduce ( ( acc , val ) => acc . concat ( val ) , [ ] ) ;
9797 }
9898
@@ -163,7 +163,7 @@ function convertCitationsToBibJSON(citation) {
163163 // Since we've moved pages and ISBN into journal, we don't also need it in the main entry.
164164 if ( has ( entry , 'pages' ) ) entry . pages = undefined ;
165165 } else {
166- process . stderr . write ( `Unknown citation type: '${ type } ', using anyway.` ) ;
166+ issues . push ( `Unknown citation type: '${ type } ', using anyway.` ) ;
167167 }
168168
169169 return [ entry ] ;
@@ -198,6 +198,10 @@ const argv = yargs
198198 describe : 'Choose the prefix for the filename being generated' ,
199199 string : true ,
200200 } )
201+ . option ( 'report' , {
202+ describe : 'Path to write a CSV report of all processed phyloreferences' ,
203+ string : true ,
204+ } )
201205 . help ( 'h' )
202206 . alias ( 'h' , 'help' )
203207 . argv ;
@@ -207,18 +211,38 @@ const dump = JSON.parse(fs.readFileSync(argv._[0], 'utf8'));
207211
208212// This dump consists of multiple named phylogenetic clade definitions,
209213// each of which should be written out to a separate file.
210- const phyxProduced = { } ;
211- let countErrors = 0 ;
214+ const phyxProduced = { } ; // keeps phylorefLabel → entry for O(1) duplicate detection
215+ const results = [ ] ;
216+
217+ // Helper to escape a value for CSV output.
218+ function escapeCSV ( field ) {
219+ const str = String ( field == null ? '' : field ) ;
220+ if ( str . includes ( ',' ) || str . includes ( '"' ) || str . includes ( '\n' ) ) {
221+ return `"${ str . replace ( / " / g, '""' ) } "` ;
222+ }
223+ return str ;
224+ }
212225
213226// Loop through all phylorefs in the database dump.
214227dump . forEach ( ( entry , index ) => {
215228 const phylorefLabel = entry . name . trim ( ) ;
229+ const entryIssues = [ ] ;
230+ const result = {
231+ regnumId : entry . id ,
232+ label : phylorefLabel ,
233+ status : 'success' ,
234+ outputFile : null ,
235+ internalSpecifiers : [ ] ,
236+ externalSpecifiers : [ ] ,
237+ issues : entryIssues ,
238+ } ;
216239
217240 // Make sure we don't have multiple phyloreferences with the same label, since
218241 // we name the file after the phyloreference being produced.
219242 if ( has ( phyxProduced , phylorefLabel ) ) {
220- process . stderr . write ( `Duplicate phyloreference label '${ phylorefLabel } ', skipping.\n` ) ;
221- countErrors += 1 ;
243+ entryIssues . push ( `Duplicate phyloreference label '${ phylorefLabel } ', skipping.` ) ;
244+ result . status = 'skipped' ;
245+ results . push ( result ) ;
222246 return ;
223247 }
224248
@@ -230,11 +254,11 @@ dump.forEach((entry, index) => {
230254 // year, we can use it to check whether the "description" citation(s) are
231255 // empty or contain an actual citation. In the latter case, we throw an Error
232256 // so we fail with an error.
233- const descriptionCitations = convertCitationsToBibJSON ( entry . citations . description ) ;
257+ const descriptionCitations = convertCitationsToBibJSON ( entry . citations . description , entryIssues ) ;
234258
235259 if ( descriptionCitations . length > 0 ) {
236260 throw new Error ( `Citation of type 'description' found in entry: ${
237- JSON . stringify ( convertCitationsToBibJSON ( entry . citations . definitional ) , null , 4 )
261+ JSON . stringify ( convertCitationsToBibJSON ( entry . citations . definitional , entryIssues ) , null , 4 )
238262 } `) ;
239263 }
240264 }
@@ -244,19 +268,19 @@ dump.forEach((entry, index) => {
244268 regnumId : entry . id ,
245269 label : phylorefLabel ,
246270 'dwc:scientificNameAuthorship' : ( convertAuthorsIntoStrings ( entry . authors ) ) . join ( ' and ' ) ,
247- 'dwc:namePublishedIn' : convertCitationsToBibJSON ( entry . citations . preexisting ) ,
271+ 'dwc:namePublishedIn' : convertCitationsToBibJSON ( entry . citations . preexisting , entryIssues ) ,
248272 'obo:IAO_0000119' : // IAO:definition source (http://purl.obolibrary.org/obo/IAO_0000119)
249- convertCitationsToBibJSON ( entry . citations . definitional ) ,
273+ convertCitationsToBibJSON ( entry . citations . definitional , entryIssues ) ,
250274 cladeDefinition : ( entry . definition || '' ) . trim ( ) ,
251275 internalSpecifiers : [ ] ,
252276 externalSpecifiers : [ ] ,
253277 } ) ;
254278
255279 // Do we have any phylogenies to save?
256- const primaryPhylogenyCitation = convertCitationsToBibJSON ( entry . citations . primary_phylogeny ) . map (
280+ const primaryPhylogenyCitation = convertCitationsToBibJSON ( entry . citations . primary_phylogeny , entryIssues ) . map (
257281 phylogeny => pickBy ( { primaryPhylogenyCitation : phylogeny } )
258282 ) ;
259- const phylogenyCitation = convertCitationsToBibJSON ( entry . citations . phylogeny ) . map (
283+ const phylogenyCitation = convertCitationsToBibJSON ( entry . citations . phylogeny , entryIssues ) . map (
260284 phylogeny => pickBy ( { phylogenyCitation : phylogeny } )
261285 ) ;
262286 const phylogenies = primaryPhylogenyCitation . concat ( phylogenyCitation ) . filter (
@@ -270,15 +294,16 @@ dump.forEach((entry, index) => {
270294 if ( kind . startsWith ( 'internal' ) ) addTo = phylorefTemplate . internalSpecifiers ;
271295 else if ( kind . startsWith ( 'external' ) ) addTo = phylorefTemplate . externalSpecifiers ;
272296 else if ( specifier . specifier_type === 'apomorphy' ) {
273- process . stderr . write ( 'Apomorphy specifiers are not currently supported.\n' ) ;
297+ entryIssues . push ( 'Apomorphy specifiers are not currently supported.' ) ;
298+ if ( result . status === 'success' ) result . status = 'warning' ;
274299 } else {
275300 if ( specifier . specifier_type === 'crown' ) {
276- process . stderr . write ( 'Crown specifiers are not supported.\n ' ) ;
301+ entryIssues . push ( 'Crown specifiers are not supported.' ) ;
277302 } else {
278- process . stderr . write ( `Odd specifier: ${ JSON . stringify ( specifier , null , 2 ) } \n ` ) ;
279- process . stderr . write ( `Unknown specifier type: '${ kind } ' for phyloreference '${ phylorefLabel } '.\n ` ) ;
303+ entryIssues . push ( `Odd specifier: ${ JSON . stringify ( specifier , null , 2 ) } ` ) ;
304+ entryIssues . push ( `Unknown specifier type: '${ kind } ' for phyloreference '${ phylorefLabel } '.` ) ;
280305 }
281- countErrors += 1 ;
306+ result . status = 'warning' ;
282307 }
283308
284309 // Set up specifier name, authorship and nomenclatural code.
@@ -353,14 +378,67 @@ dump.forEach((entry, index) => {
353378 }
354379 fs . writeFileSync ( phyxFilename , JSON . stringify ( phyxTemplate , null , 4 ) ) ;
355380
381+ // Record output file and specifier labels for the report.
382+ result . outputFile = phyxFilename ;
383+ result . internalSpecifiers = phylorefTemplate . internalSpecifiers . map ( s => s . hasName . label ) ;
384+ result . externalSpecifiers = phylorefTemplate . externalSpecifiers . map ( s => s . hasName . label ) ;
385+
356386 // Save for later use if needed.
357387 phyxProduced [ phylorefLabel ] = entry ;
388+ results . push ( result ) ;
358389} ) ;
359390
360- // If there were any errors, report this and exit with a failure code.
391+ // Print attributed issues to stderr (only entries with problems).
392+ for ( const r of results ) {
393+ if ( r . issues . length === 0 ) continue ;
394+ const id = r . regnumId != null ? ` (regnum ID ${ r . regnumId } )` : '' ;
395+ process . stderr . write ( `${ r . status === 'skipped' ? 'Skipped' : 'Warning in' } '${ r . label } '${ id } :\n` ) ;
396+ for ( const issue of r . issues ) {
397+ process . stderr . write ( ` - ${ issue } \n` ) ;
398+ }
399+ }
400+
401+ // Write CSV report if --report was given.
402+ if ( argv . report ) {
403+ const maxInternal = Math . max ( 0 , ...results . map ( r => r . internalSpecifiers . length ) ) ;
404+ const maxExternal = Math . max ( 0 , ...results . map ( r => r . externalSpecifiers . length ) ) ;
405+
406+ const internalCols = Array . from ( { length : maxInternal } , ( _ , i ) => `internal_specifier_${ i + 1 } ` ) ;
407+ const externalCols = Array . from ( { length : maxExternal } , ( _ , i ) => `external_specifier_${ i + 1 } ` ) ;
408+
409+ const header = [
410+ 'regnum_id' , 'label' , 'status' , 'output_file' ,
411+ 'num_internal_specifiers' , 'num_external_specifiers' ,
412+ ...internalCols , ...externalCols ,
413+ 'issues' ,
414+ ] ;
415+
416+ const rows = results . map ( r => {
417+ const internalFields = Array . from ( { length : maxInternal } , ( _ , i ) => r . internalSpecifiers [ i ] || '' ) ;
418+ const externalFields = Array . from ( { length : maxExternal } , ( _ , i ) => r . externalSpecifiers [ i ] || '' ) ;
419+ return [
420+ r . regnumId , r . label , r . status , r . outputFile || '' ,
421+ r . internalSpecifiers . length , r . externalSpecifiers . length ,
422+ ...internalFields , ...externalFields ,
423+ r . issues . join ( '; ' ) ,
424+ ] . map ( escapeCSV ) . join ( ',' ) ;
425+ } ) ;
426+
427+ fs . writeFileSync ( argv . report , `${ [ header . join ( ',' ) , ...rows ] . join ( '\n' ) } \n` ) ;
428+ }
429+
430+ // Final summary and exit.
431+ const successCount = results . filter ( r => r . status === 'success' ) . length ;
432+ const warningCount = results . filter ( r => r . status === 'warning' ) . length ;
433+ const skippedCount = results . filter ( r => r . status === 'skipped' ) . length ;
434+ const countErrors = warningCount + skippedCount ;
435+
361436if ( countErrors > 0 ) {
437+ process . stderr . write (
438+ `Processed ${ results . length } entries: ${ successCount } written successfully, ${ skippedCount } skipped, ${ warningCount } written with issues.\n` ,
439+ ) ;
362440 process . stderr . write ( `${ countErrors } errors occurred while processing database dump.\n` ) ;
363441 process . exit ( 1 ) ;
364442} else {
365- process . stdout . write ( `${ keys ( phyxProduced ) . length } Phyx files produced successfully.\n` ) ;
443+ process . stdout . write ( `${ successCount } Phyx files produced successfully.\n` ) ;
366444}
0 commit comments