Skip to content

Commit fbbc342

Browse files
authored
Add basic bibliographic data in response (#60)
1 parent 1733eb4 commit fbbc342

5 files changed

Lines changed: 467 additions & 87 deletions

File tree

Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
package org.grobid.core.data;
2+
3+
import com.fasterxml.jackson.core.JsonProcessingException;
4+
import com.fasterxml.jackson.databind.ObjectMapper;
5+
import org.apache.commons.collections4.CollectionUtils;
6+
import org.apache.commons.lang3.StringUtils;
7+
import org.slf4j.Logger;
8+
import org.slf4j.LoggerFactory;
9+
import org.w3c.dom.NodeList;
10+
11+
import javax.xml.namespace.NamespaceContext;
12+
import javax.xml.xpath.XPath;
13+
import javax.xml.xpath.XPathConstants;
14+
import javax.xml.xpath.XPathFactory;
15+
import java.util.ArrayList;
16+
import java.util.List;
17+
import java.util.Optional;
18+
19+
/**
20+
* Data class to hold article metadata (DOI, title, authors) extracted from documents.
21+
* This class provides a clean separation from BiblioComponent which is designed
22+
* for reference components within software mentions.
23+
*/
24+
public class ArticleBiblio {
25+
private static final Logger LOGGER = LoggerFactory.getLogger(ArticleBiblio.class);
26+
27+
private String doi;
28+
private String title;
29+
private String authors;
30+
31+
public ArticleBiblio() {
32+
}
33+
34+
public ArticleBiblio(String doi, String title, String authors) {
35+
this.doi = doi;
36+
this.title = title;
37+
this.authors = authors;
38+
}
39+
// Getters and setters
40+
public String getDoi() {
41+
return doi;
42+
}
43+
44+
public void setDoi(String doi) {
45+
this.doi = doi;
46+
}
47+
48+
public String getTitle() {
49+
return title;
50+
}
51+
52+
public void setTitle(String title) {
53+
this.title = title;
54+
}
55+
56+
public void setAuthors(String authors) {
57+
this.authors = authors;
58+
}
59+
60+
public String getAuthors() {
61+
return this.authors;
62+
}
63+
/**
64+
* Check if this metadata article has any meaningful content
65+
*/
66+
public boolean hasContent() {
67+
return (StringUtils.isNotBlank(doi)) ||
68+
(StringUtils.isNotBlank(title)) ||
69+
(StringUtils.isNotBlank(authors));
70+
}
71+
72+
/**
73+
* Convert this ArticleBiblio to JSON string for API response
74+
*/
75+
public String toJson() {
76+
ObjectMapper mapper = new ObjectMapper();
77+
StringBuilder json = new StringBuilder();
78+
boolean hasField = false;
79+
80+
json.append("\"biblio\": {");
81+
82+
// Add DOI if available
83+
if (StringUtils.isNotBlank(doi)) {
84+
json.append("\"doi\": ");
85+
try {
86+
json.append(mapper.writeValueAsString(doi));
87+
} catch (JsonProcessingException e) {
88+
json.append("\"\"");
89+
}
90+
hasField = true;
91+
}
92+
93+
// Add title if available
94+
if (StringUtils.isNotBlank(title)) {
95+
if (hasField) json.append(", ");
96+
json.append("\"title\": ");
97+
try {
98+
json.append(mapper.writeValueAsString(title));
99+
} catch (JsonProcessingException e) {
100+
json.append("\"\"");
101+
}
102+
hasField = true;
103+
}
104+
105+
// Add authors if available
106+
if (StringUtils.isNotBlank(authors)) {
107+
if (hasField) json.append(", ");
108+
try {
109+
json.append("\"authors\": ").append(mapper.writeValueAsString(authors));
110+
} catch (JsonProcessingException e) {
111+
json.append("\"authors\": \"\"");
112+
}
113+
}
114+
115+
json.append("}");
116+
return json.toString();
117+
}
118+
119+
/**
120+
* Create MetadataArticle from BiblioItem
121+
*/
122+
public static Optional<ArticleBiblio> fromBiblioItem(BiblioItem biblioItem) {
123+
if (biblioItem == null) {
124+
LOGGER.debug("BiblioItem is null, cannot create MetadataArticle");
125+
return Optional.empty();
126+
}
127+
128+
LOGGER.debug("Creating MetadataArticle from BiblioItem");
129+
ArticleBiblio metadata = new ArticleBiblio();
130+
131+
if (biblioItem.getDOI() != null && !biblioItem.getDOI().trim().isEmpty()) {
132+
metadata.setDoi(biblioItem.getDOI());
133+
LOGGER.debug("Extracted DOI: " + biblioItem.getDOI());
134+
}
135+
136+
if (biblioItem.getTitle() != null && !biblioItem.getTitle().trim().isEmpty()) {
137+
metadata.setTitle(biblioItem.getTitle());
138+
LOGGER.debug("Extracted title: " + biblioItem.getTitle());
139+
}
140+
141+
List<Person> authors = biblioItem.getFullAuthors();
142+
if (CollectionUtils.isNotEmpty(authors)) {
143+
String authorsAsList = formatAuthors(authors);
144+
metadata.setAuthors(authorsAsList);
145+
}
146+
147+
return metadata.hasContent() ? Optional.of(metadata) : Optional.empty();
148+
}
149+
150+
/**
151+
* Extract article metadata from TEI XML Document using XPath
152+
*/
153+
public static Optional<ArticleBiblio> fromTeiDocument(org.w3c.dom.Document teiDocument) {
154+
if (teiDocument == null) {
155+
return Optional.empty();
156+
}
157+
158+
try {
159+
XPathFactory xPathFactory = XPathFactory.newInstance();
160+
XPath xpath = xPathFactory.newXPath();
161+
162+
// Set up namespace context for TEI documents
163+
NamespaceContext nsContext = new TEINamespaceContext();
164+
xpath.setNamespaceContext(nsContext);
165+
166+
// Extract metadata using namespace-aware XPath
167+
String title = extractTitle(teiDocument, xpath);
168+
String doi = extractDOI(teiDocument, xpath);
169+
List<Person> authors = extractAuthors(teiDocument, xpath);
170+
171+
LOGGER.debug("Extracted from TEI: title='{}', doi='{}', authors={}", title, doi, authors.size());
172+
173+
ArticleBiblio articleMetadata = new ArticleBiblio();
174+
articleMetadata.setDoi(doi);
175+
articleMetadata.setTitle(title);
176+
177+
if (CollectionUtils.isNotEmpty(authors)) {
178+
articleMetadata.setAuthors(formatAuthors(authors));
179+
}
180+
181+
boolean hasContent = articleMetadata.hasContent();
182+
LOGGER.debug("Article metadata has content: {}, result: {}", hasContent, articleMetadata);
183+
return hasContent ? Optional.of(articleMetadata) : Optional.empty();
184+
} catch (Exception e) {
185+
LOGGER.error("Error extracting article metadata from TEI document", e);
186+
return Optional.empty();
187+
}
188+
}
189+
190+
private static String extractTitle(org.w3c.dom.Document doc, XPath xpath) {
191+
try {
192+
// Try multiple possible title paths with namespace
193+
String[] titlePaths = {
194+
"//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@level='a'][@type='main']/text()",
195+
"//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@level='a']",
196+
"//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='main']/text()",
197+
};
198+
199+
for (String path : titlePaths) {
200+
try {
201+
NodeList titleNodes = (NodeList) xpath.evaluate(path, doc, XPathConstants.NODESET);
202+
if (titleNodes != null && titleNodes.getLength() > 0) {
203+
String title = titleNodes.item(0).getNodeValue().trim();
204+
if (!title.isEmpty()) {
205+
LOGGER.debug("Found title using path '{}': '{}'", path, title);
206+
return title;
207+
}
208+
}
209+
} catch (Exception e) {
210+
LOGGER.debug("Failed to extract title with path '{}': {}", path, e.getMessage());
211+
}
212+
}
213+
} catch (Exception e) {
214+
LOGGER.debug("Error extracting title from TEI document", e);
215+
}
216+
return "";
217+
}
218+
219+
private static String extractDOI(org.w3c.dom.Document doc, XPath xpath) {
220+
String path = "//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:idno[@type='DOI']/text()";
221+
222+
try {
223+
NodeList doiNodes = (NodeList) xpath.evaluate(path, doc, XPathConstants.NODESET);
224+
if (doiNodes != null && doiNodes.getLength() > 0) {
225+
String doi = doiNodes.item(0).getNodeValue().trim();
226+
if (!doi.isEmpty() && (doi.startsWith("10.") || doi.contains("doi.org"))) {
227+
LOGGER.debug("Found DOI using path '{}': '{}'", path, doi);
228+
return doi;
229+
}
230+
}
231+
} catch (Exception e) {
232+
LOGGER.debug("Failed to extract DOI with path '{}': {}", path, e.getMessage());
233+
}
234+
return "";
235+
}
236+
237+
private static List<Person> extractAuthors(org.w3c.dom.Document doc, XPath xpath) {
238+
List<Person> authors = new ArrayList<>();
239+
String authorsPath = "//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:analytic/tei:author/tei:persName";
240+
241+
try {
242+
NodeList authorNodes = (NodeList) xpath.evaluate(authorsPath, doc, XPathConstants.NODESET);
243+
LOGGER.debug("Found {} author nodes using path '{}'", authorNodes.getLength(), authorsPath);
244+
245+
for (int i = 0; i < authorNodes.getLength(); i++) {
246+
Person person = createPersonFromNode(authorNodes.item(i));
247+
if (person != null) {
248+
authors.add(person);
249+
LOGGER.debug("Added author: '{}'", formatPersonName(person));
250+
}
251+
}
252+
} catch (Exception e) {
253+
LOGGER.debug("Failed to extract authors with path '{}': {}", authorsPath, e.getMessage());
254+
}
255+
256+
return authors;
257+
}
258+
259+
/**
260+
* Create a Person object from an XML persName node
261+
*/
262+
private static Person createPersonFromNode(org.w3c.dom.Node node) {
263+
if (node == null) return null;
264+
265+
Person person = new Person();
266+
267+
if (node.getNodeName().equals("persName")) {
268+
NodeList childNodes = node.getChildNodes();
269+
for (int i = 0; i < childNodes.getLength(); i++) {
270+
org.w3c.dom.Node child = childNodes.item(i);
271+
if (child.getNodeName().equals("surname")) {
272+
person.setLastName(child.getTextContent().trim());
273+
} else if (child.getNodeName().equals("forename")) {
274+
person.setFirstName(child.getTextContent().trim());
275+
} else if (child.getNodeName().equals("middlename")) {
276+
person.setMiddleName(child.getTextContent().trim());
277+
}
278+
}
279+
} else {
280+
// If it's not a persName node, use the text content as the full name
281+
String fullName = node.getTextContent().trim();
282+
if (!fullName.isEmpty()) {
283+
person.setLastName(fullName); // Fallback: put full name in last name
284+
}
285+
}
286+
287+
// Return person if it has any name data
288+
return (StringUtils.isNotBlank(person.getLastName()) ||
289+
StringUtils.isNotBlank(person.getFirstName()) ||
290+
StringUtils.isNotBlank(person.getMiddleName())) ? person : null;
291+
}
292+
293+
/**
294+
* Format list of authors as "first middle last, first2 middle2 last2, ..."
295+
*/
296+
private static String formatAuthors(List<Person> authors) {
297+
StringBuilder formattedAuthors = new StringBuilder();
298+
for (int i = 0; i < authors.size(); i++) {
299+
if (i > 0) {
300+
formattedAuthors.append(", ");
301+
}
302+
303+
Person author = authors.get(i);
304+
formattedAuthors.append(formatPersonName(author));
305+
}
306+
return formattedAuthors.toString();
307+
}
308+
309+
/**
310+
* Format a Person's name as "first middle last"
311+
*/
312+
private static String formatPersonName(Person person) {
313+
String lastName = person.getLastName();
314+
String firstName = person.getFirstName();
315+
String middleName = person.getMiddleName();
316+
317+
StringBuilder fullName = new StringBuilder();
318+
319+
if (StringUtils.isNotBlank(firstName)) {
320+
fullName.append(firstName.trim());
321+
}
322+
323+
if (StringUtils.isNotBlank(middleName)) {
324+
if (!fullName.isEmpty()) {
325+
fullName.append(" ");
326+
}
327+
fullName.append(middleName.trim());
328+
}
329+
330+
if (StringUtils.isNotBlank(lastName)) {
331+
if (!fullName.isEmpty()) {
332+
fullName.append(" ");
333+
}
334+
fullName.append(lastName.trim());
335+
}
336+
337+
return fullName.toString();
338+
}
339+
340+
341+
@Override
342+
public String toString() {
343+
return String.format("MetadataArticle{doi='%s', title='%s', authors=%s}",
344+
doi, title, authors);
345+
}
346+
347+
/**
348+
* Simple namespace context for TEI documents
349+
*/
350+
private static class TEINamespaceContext implements NamespaceContext {
351+
@Override
352+
public String getNamespaceURI(String prefix) {
353+
if ("tei".equals(prefix)) {
354+
return "http://www.tei-c.org/ns/1.0";
355+
}
356+
return null;
357+
}
358+
359+
@Override
360+
public String getPrefix(String namespaceURI) {
361+
if ("http://www.tei-c.org/ns/1.0".equals(namespaceURI)) {
362+
return "tei";
363+
}
364+
return null;
365+
}
366+
367+
@Override
368+
public java.util.Iterator<String> getPrefixes(String namespaceURI) {
369+
java.util.Set<String> prefixes = new java.util.HashSet<>();
370+
if ("http://www.tei-c.org/ns/1.0".equals(namespaceURI)) {
371+
prefixes.add("tei");
372+
}
373+
return prefixes.iterator();
374+
}
375+
}
376+
}

0 commit comments

Comments
 (0)