Skip to content

Commit 26f31df

Browse files
gaurav and claude committed
Improve regnum2phyx output and add --report CSV option
- Replace bare per-line stderr writes with a per-entry results array so every warning/skip is attributed to its phyloreference label and regnum ID
- Add --report <path> option that writes a rectangular CSV with columns for regnum_id, label, status, output_file, specifier counts, per-specifier labels, and a semicolon-joined issues field
- Post-loop summary now breaks down counts: written successfully, skipped, written with issues (exit 1 if any errors, same as before)
- Remove unused `keys` import from lodash

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 27a9b83 commit 26f31df

File tree

1 file changed

+99
-21
lines changed

1 file changed

+99
-21
lines changed

regnum2phyx/regnum2phyx.js

Lines changed: 99 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ const fs = require('node:fs');
2727
const path = require('node:path');
2828
const yargs = require('yargs');
2929
const {
30-
has, keys, pickBy, isEmpty,
30+
has, pickBy, isEmpty,
3131
} = require('lodash');
3232

3333
// Helper functions.
@@ -82,7 +82,7 @@ function convertAuthorsIntoBibJSON(authors) {
8282
}));
8383
}
8484

85-
function convertCitationsToBibJSON(citation) {
85+
function convertCitationsToBibJSON(citation, issues = []) {
8686
// Convert a citation from its Regnum representation into the
8787
// BibJSON format (http://okfnlabs.org/bibjson/). We use this rather than
8888
// CSL-JSON (https://github.com/citation-style-language/schema) because it's
@@ -92,7 +92,7 @@ function convertCitationsToBibJSON(citation) {
9292
if (!citation) return [];
9393
if (Array.isArray(citation)) {
9494
// If given an array of citation objects, convert each one separately.
95-
return citation.map(c => convertCitationsToBibJSON(c))
95+
return citation.map(c => convertCitationsToBibJSON(c, issues))
9696
.reduce((acc, val) => acc.concat(val), []);
9797
}
9898

@@ -163,7 +163,7 @@ function convertCitationsToBibJSON(citation) {
163163
// Since we've moved pages and ISBN into journal, we don't also need it in the main entry.
164164
if (has(entry, 'pages')) entry.pages = undefined;
165165
} else {
166-
process.stderr.write(`Unknown citation type: '${type}', using anyway.`);
166+
issues.push(`Unknown citation type: '${type}', using anyway.`);
167167
}
168168

169169
return [entry];
@@ -198,6 +198,10 @@ const argv = yargs
198198
describe: 'Choose the prefix for the filename being generated',
199199
string: true,
200200
})
201+
.option('report', {
202+
describe: 'Path to write a CSV report of all processed phyloreferences',
203+
string: true,
204+
})
201205
.help('h')
202206
.alias('h', 'help')
203207
.argv;
@@ -207,18 +211,38 @@ const dump = JSON.parse(fs.readFileSync(argv._[0], 'utf8'));
207211

208212
// This dump consists of multiple named phylogenetic clade definitions,
209213
// each of which should be written out to a separate file.
210-
const phyxProduced = {};
211-
let countErrors = 0;
214+
const phyxProduced = {}; // keeps phylorefLabel → entry for O(1) duplicate detection
215+
const results = [];
216+
217+
/**
 * Escape a single value for inclusion in a CSV row.
 *
 * `null` and `undefined` become the empty string; every other value is
 * converted with `String()`. The result is wrapped in double quotes (with
 * embedded double quotes doubled) whenever it contains a comma, a double
 * quote, or a line break.
 *
 * @param {*} field - The value to write into one CSV cell.
 * @returns {string} The CSV-safe text for that cell.
 */
function escapeCSV(field) {
  const str = String(field == null ? '' : field);
  // A bare carriage return must trigger quoting too: CSV readers treat CR as
  // a record separator, so an unquoted '\r' would split the row.
  if (/[",\r\n]/.test(str)) {
    return `"${str.replace(/"/g, '""')}"`;
  }
  return str;
}
212225

213226
// Loop through all phylorefs in the database dump.
214227
dump.forEach((entry, index) => {
215228
const phylorefLabel = entry.name.trim();
229+
const entryIssues = [];
230+
const result = {
231+
regnumId: entry.id,
232+
label: phylorefLabel,
233+
status: 'success',
234+
outputFile: null,
235+
internalSpecifiers: [],
236+
externalSpecifiers: [],
237+
issues: entryIssues,
238+
};
216239

217240
// Make sure we don't have multiple phyloreferences with the same label, since
218241
// we name the file after the phyloreference being produced.
219242
if (has(phyxProduced, phylorefLabel)) {
220-
process.stderr.write(`Duplicate phyloreference label '${phylorefLabel}', skipping.\n`);
221-
countErrors += 1;
243+
entryIssues.push(`Duplicate phyloreference label '${phylorefLabel}', skipping.`);
244+
result.status = 'skipped';
245+
results.push(result);
222246
return;
223247
}
224248

@@ -230,11 +254,11 @@ dump.forEach((entry, index) => {
230254
// year, we can use it to check whether the "description" citation(s) are
231255
// empty or contain an actual citation. In the latter case, we throw an Error
232256
// so we fail with an error.
233-
const descriptionCitations = convertCitationsToBibJSON(entry.citations.description);
257+
const descriptionCitations = convertCitationsToBibJSON(entry.citations.description, entryIssues);
234258

235259
if (descriptionCitations.length > 0) {
236260
throw new Error(`Citation of type 'description' found in entry: ${
237-
JSON.stringify(convertCitationsToBibJSON(entry.citations.definitional), null, 4)
261+
JSON.stringify(convertCitationsToBibJSON(entry.citations.definitional, entryIssues), null, 4)
238262
}`);
239263
}
240264
}
@@ -244,19 +268,19 @@ dump.forEach((entry, index) => {
244268
regnumId: entry.id,
245269
label: phylorefLabel,
246270
'dwc:scientificNameAuthorship': (convertAuthorsIntoStrings(entry.authors)).join(' and '),
247-
'dwc:namePublishedIn': convertCitationsToBibJSON(entry.citations.preexisting),
271+
'dwc:namePublishedIn': convertCitationsToBibJSON(entry.citations.preexisting, entryIssues),
248272
'obo:IAO_0000119': // IAO:definition source (http://purl.obolibrary.org/obo/IAO_0000119)
249-
convertCitationsToBibJSON(entry.citations.definitional),
273+
convertCitationsToBibJSON(entry.citations.definitional, entryIssues),
250274
cladeDefinition: (entry.definition || '').trim(),
251275
internalSpecifiers: [],
252276
externalSpecifiers: [],
253277
});
254278

255279
// Do we have any phylogenies to save?
256-
const primaryPhylogenyCitation = convertCitationsToBibJSON(entry.citations.primary_phylogeny).map(
280+
const primaryPhylogenyCitation = convertCitationsToBibJSON(entry.citations.primary_phylogeny, entryIssues).map(
257281
phylogeny => pickBy({ primaryPhylogenyCitation: phylogeny })
258282
);
259-
const phylogenyCitation = convertCitationsToBibJSON(entry.citations.phylogeny).map(
283+
const phylogenyCitation = convertCitationsToBibJSON(entry.citations.phylogeny, entryIssues).map(
260284
phylogeny => pickBy({ phylogenyCitation: phylogeny })
261285
);
262286
const phylogenies = primaryPhylogenyCitation.concat(phylogenyCitation).filter(
@@ -270,15 +294,16 @@ dump.forEach((entry, index) => {
270294
if (kind.startsWith('internal')) addTo = phylorefTemplate.internalSpecifiers;
271295
else if (kind.startsWith('external')) addTo = phylorefTemplate.externalSpecifiers;
272296
else if (specifier.specifier_type === 'apomorphy') {
273-
process.stderr.write('Apomorphy specifiers are not currently supported.\n');
297+
entryIssues.push('Apomorphy specifiers are not currently supported.');
298+
if (result.status === 'success') result.status = 'warning';
274299
} else {
275300
if (specifier.specifier_type === 'crown') {
276-
process.stderr.write('Crown specifiers are not supported.\n');
301+
entryIssues.push('Crown specifiers are not supported.');
277302
} else {
278-
process.stderr.write(`Odd specifier: ${JSON.stringify(specifier, null, 2)}\n`);
279-
process.stderr.write(`Unknown specifier type: '${kind}' for phyloreference '${phylorefLabel}'.\n`);
303+
entryIssues.push(`Odd specifier: ${JSON.stringify(specifier, null, 2)}`);
304+
entryIssues.push(`Unknown specifier type: '${kind}' for phyloreference '${phylorefLabel}'.`);
280305
}
281-
countErrors += 1;
306+
result.status = 'warning';
282307
}
283308

284309
// Set up specifier name, authorship and nomenclatural code.
@@ -353,14 +378,67 @@ dump.forEach((entry, index) => {
353378
}
354379
fs.writeFileSync(phyxFilename, JSON.stringify(phyxTemplate, null, 4));
355380

381+
// Record output file and specifier labels for the report.
382+
result.outputFile = phyxFilename;
383+
result.internalSpecifiers = phylorefTemplate.internalSpecifiers.map(s => s.hasName.label);
384+
result.externalSpecifiers = phylorefTemplate.externalSpecifiers.map(s => s.hasName.label);
385+
356386
// Save for later use if needed.
357387
phyxProduced[phylorefLabel] = entry;
388+
results.push(result);
358389
});
359390

360-
// If there were any errors, report this and exit with a failure code.
391+
// Print attributed issues to stderr (only entries with problems).
392+
for (const r of results) {
393+
if (r.issues.length === 0) continue;
394+
const id = r.regnumId != null ? ` (regnum ID ${r.regnumId})` : '';
395+
process.stderr.write(`${r.status === 'skipped' ? 'Skipped' : 'Warning in'} '${r.label}'${id}:\n`);
396+
for (const issue of r.issues) {
397+
process.stderr.write(` - ${issue}\n`);
398+
}
399+
}
400+
401+
// Write CSV report if --report was given.
402+
if (argv.report) {
403+
const maxInternal = Math.max(0, ...results.map(r => r.internalSpecifiers.length));
404+
const maxExternal = Math.max(0, ...results.map(r => r.externalSpecifiers.length));
405+
406+
const internalCols = Array.from({ length: maxInternal }, (_, i) => `internal_specifier_${i + 1}`);
407+
const externalCols = Array.from({ length: maxExternal }, (_, i) => `external_specifier_${i + 1}`);
408+
409+
const header = [
410+
'regnum_id', 'label', 'status', 'output_file',
411+
'num_internal_specifiers', 'num_external_specifiers',
412+
...internalCols, ...externalCols,
413+
'issues',
414+
];
415+
416+
const rows = results.map(r => {
417+
const internalFields = Array.from({ length: maxInternal }, (_, i) => r.internalSpecifiers[i] || '');
418+
const externalFields = Array.from({ length: maxExternal }, (_, i) => r.externalSpecifiers[i] || '');
419+
return [
420+
r.regnumId, r.label, r.status, r.outputFile || '',
421+
r.internalSpecifiers.length, r.externalSpecifiers.length,
422+
...internalFields, ...externalFields,
423+
r.issues.join('; '),
424+
].map(escapeCSV).join(',');
425+
});
426+
427+
fs.writeFileSync(argv.report, `${[header.join(','), ...rows].join('\n')}\n`);
428+
}
429+
430+
// Final summary and exit.
431+
const successCount = results.filter(r => r.status === 'success').length;
432+
const warningCount = results.filter(r => r.status === 'warning').length;
433+
const skippedCount = results.filter(r => r.status === 'skipped').length;
434+
const countErrors = warningCount + skippedCount;
435+
361436
if (countErrors > 0) {
437+
process.stderr.write(
438+
`Processed ${results.length} entries: ${successCount} written successfully, ${skippedCount} skipped, ${warningCount} written with issues.\n`,
439+
);
362440
process.stderr.write(`${countErrors} errors occurred while processing database dump.\n`);
363441
process.exit(1);
364442
} else {
365-
process.stdout.write(`${keys(phyxProduced).length} Phyx files produced successfully.\n`);
443+
process.stdout.write(`${successCount} Phyx files produced successfully.\n`);
366444
}

0 commit comments

Comments
 (0)