1+ package org .grobid .core .data ;
2+
3+ import com .fasterxml .jackson .core .JsonProcessingException ;
4+ import com .fasterxml .jackson .databind .ObjectMapper ;
5+ import org .apache .commons .collections4 .CollectionUtils ;
6+ import org .apache .commons .lang3 .StringUtils ;
7+ import org .slf4j .Logger ;
8+ import org .slf4j .LoggerFactory ;
9+ import org .w3c .dom .NodeList ;
10+
11+ import javax .xml .namespace .NamespaceContext ;
12+ import javax .xml .xpath .XPath ;
13+ import javax .xml .xpath .XPathConstants ;
14+ import javax .xml .xpath .XPathFactory ;
15+ import java .util .ArrayList ;
16+ import java .util .List ;
17+ import java .util .Optional ;
18+
19+ /**
20+ * Data class to hold article metadata (DOI, title, authors) extracted from documents.
21+ * This class provides a clean separation from BiblioComponent which is designed
22+ * for reference components within software mentions.
23+ */
24+ public class ArticleBiblio {
25+ private static final Logger LOGGER = LoggerFactory .getLogger (ArticleBiblio .class );
26+
27+ private String doi ;
28+ private String title ;
29+ private String authors ;
30+
31+ public ArticleBiblio () {
32+ }
33+
34+ public ArticleBiblio (String doi , String title , String authors ) {
35+ this .doi = doi ;
36+ this .title = title ;
37+ this .authors = authors ;
38+ }
39+ // Getters and setters
40+ public String getDoi () {
41+ return doi ;
42+ }
43+
44+ public void setDoi (String doi ) {
45+ this .doi = doi ;
46+ }
47+
48+ public String getTitle () {
49+ return title ;
50+ }
51+
52+ public void setTitle (String title ) {
53+ this .title = title ;
54+ }
55+
56+ public void setAuthors (String authors ) {
57+ this .authors = authors ;
58+ }
59+
60+ public String getAuthors () {
61+ return this .authors ;
62+ }
63+ /**
64+ * Check if this metadata article has any meaningful content
65+ */
66+ public boolean hasContent () {
67+ return (StringUtils .isNotBlank (doi )) ||
68+ (StringUtils .isNotBlank (title )) ||
69+ (StringUtils .isNotBlank (authors ));
70+ }
71+
72+ /**
73+ * Convert this ArticleBiblio to JSON string for API response
74+ */
75+ public String toJson () {
76+ ObjectMapper mapper = new ObjectMapper ();
77+ StringBuilder json = new StringBuilder ();
78+ boolean hasField = false ;
79+
80+ json .append ("\" biblio\" : {" );
81+
82+ // Add DOI if available
83+ if (StringUtils .isNotBlank (doi )) {
84+ json .append ("\" doi\" : " );
85+ try {
86+ json .append (mapper .writeValueAsString (doi ));
87+ } catch (JsonProcessingException e ) {
88+ json .append ("\" \" " );
89+ }
90+ hasField = true ;
91+ }
92+
93+ // Add title if available
94+ if (StringUtils .isNotBlank (title )) {
95+ if (hasField ) json .append (", " );
96+ json .append ("\" title\" : " );
97+ try {
98+ json .append (mapper .writeValueAsString (title ));
99+ } catch (JsonProcessingException e ) {
100+ json .append ("\" \" " );
101+ }
102+ hasField = true ;
103+ }
104+
105+ // Add authors if available
106+ if (StringUtils .isNotBlank (authors )) {
107+ if (hasField ) json .append (", " );
108+ try {
109+ json .append ("\" authors\" : " ).append (mapper .writeValueAsString (authors ));
110+ } catch (JsonProcessingException e ) {
111+ json .append ("\" authors\" : \" \" " );
112+ }
113+ }
114+
115+ json .append ("}" );
116+ return json .toString ();
117+ }
118+
119+ /**
120+ * Create MetadataArticle from BiblioItem
121+ */
122+ public static Optional <ArticleBiblio > fromBiblioItem (BiblioItem biblioItem ) {
123+ if (biblioItem == null ) {
124+ LOGGER .debug ("BiblioItem is null, cannot create MetadataArticle" );
125+ return Optional .empty ();
126+ }
127+
128+ LOGGER .debug ("Creating MetadataArticle from BiblioItem" );
129+ ArticleBiblio metadata = new ArticleBiblio ();
130+
131+ if (biblioItem .getDOI () != null && !biblioItem .getDOI ().trim ().isEmpty ()) {
132+ metadata .setDoi (biblioItem .getDOI ());
133+ LOGGER .debug ("Extracted DOI: " + biblioItem .getDOI ());
134+ }
135+
136+ if (biblioItem .getTitle () != null && !biblioItem .getTitle ().trim ().isEmpty ()) {
137+ metadata .setTitle (biblioItem .getTitle ());
138+ LOGGER .debug ("Extracted title: " + biblioItem .getTitle ());
139+ }
140+
141+ List <Person > authors = biblioItem .getFullAuthors ();
142+ if (CollectionUtils .isNotEmpty (authors )) {
143+ String authorsAsList = formatAuthors (authors );
144+ metadata .setAuthors (authorsAsList );
145+ }
146+
147+ return metadata .hasContent () ? Optional .of (metadata ) : Optional .empty ();
148+ }
149+
150+ /**
151+ * Extract article metadata from TEI XML Document using XPath
152+ */
153+ public static Optional <ArticleBiblio > fromTeiDocument (org .w3c .dom .Document teiDocument ) {
154+ if (teiDocument == null ) {
155+ return Optional .empty ();
156+ }
157+
158+ try {
159+ XPathFactory xPathFactory = XPathFactory .newInstance ();
160+ XPath xpath = xPathFactory .newXPath ();
161+
162+ // Set up namespace context for TEI documents
163+ NamespaceContext nsContext = new TEINamespaceContext ();
164+ xpath .setNamespaceContext (nsContext );
165+
166+ // Extract metadata using namespace-aware XPath
167+ String title = extractTitle (teiDocument , xpath );
168+ String doi = extractDOI (teiDocument , xpath );
169+ List <Person > authors = extractAuthors (teiDocument , xpath );
170+
171+ LOGGER .debug ("Extracted from TEI: title='{}', doi='{}', authors={}" , title , doi , authors .size ());
172+
173+ ArticleBiblio articleMetadata = new ArticleBiblio ();
174+ articleMetadata .setDoi (doi );
175+ articleMetadata .setTitle (title );
176+
177+ if (CollectionUtils .isNotEmpty (authors )) {
178+ articleMetadata .setAuthors (formatAuthors (authors ));
179+ }
180+
181+ boolean hasContent = articleMetadata .hasContent ();
182+ LOGGER .debug ("Article metadata has content: {}, result: {}" , hasContent , articleMetadata );
183+ return hasContent ? Optional .of (articleMetadata ) : Optional .empty ();
184+ } catch (Exception e ) {
185+ LOGGER .error ("Error extracting article metadata from TEI document" , e );
186+ return Optional .empty ();
187+ }
188+ }
189+
190+ private static String extractTitle (org .w3c .dom .Document doc , XPath xpath ) {
191+ try {
192+ // Try multiple possible title paths with namespace
193+ String [] titlePaths = {
194+ "//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@level='a'][@type='main']/text()" ,
195+ "//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@level='a']" ,
196+ "//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='main']/text()" ,
197+ };
198+
199+ for (String path : titlePaths ) {
200+ try {
201+ NodeList titleNodes = (NodeList ) xpath .evaluate (path , doc , XPathConstants .NODESET );
202+ if (titleNodes != null && titleNodes .getLength () > 0 ) {
203+ String title = titleNodes .item (0 ).getNodeValue ().trim ();
204+ if (!title .isEmpty ()) {
205+ LOGGER .debug ("Found title using path '{}': '{}'" , path , title );
206+ return title ;
207+ }
208+ }
209+ } catch (Exception e ) {
210+ LOGGER .debug ("Failed to extract title with path '{}': {}" , path , e .getMessage ());
211+ }
212+ }
213+ } catch (Exception e ) {
214+ LOGGER .debug ("Error extracting title from TEI document" , e );
215+ }
216+ return "" ;
217+ }
218+
219+ private static String extractDOI (org .w3c .dom .Document doc , XPath xpath ) {
220+ String path = "//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:idno[@type='DOI']/text()" ;
221+
222+ try {
223+ NodeList doiNodes = (NodeList ) xpath .evaluate (path , doc , XPathConstants .NODESET );
224+ if (doiNodes != null && doiNodes .getLength () > 0 ) {
225+ String doi = doiNodes .item (0 ).getNodeValue ().trim ();
226+ if (!doi .isEmpty () && (doi .startsWith ("10." ) || doi .contains ("doi.org" ))) {
227+ LOGGER .debug ("Found DOI using path '{}': '{}'" , path , doi );
228+ return doi ;
229+ }
230+ }
231+ } catch (Exception e ) {
232+ LOGGER .debug ("Failed to extract DOI with path '{}': {}" , path , e .getMessage ());
233+ }
234+ return "" ;
235+ }
236+
237+ private static List <Person > extractAuthors (org .w3c .dom .Document doc , XPath xpath ) {
238+ List <Person > authors = new ArrayList <>();
239+ String authorsPath = "//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:analytic/tei:author/tei:persName" ;
240+
241+ try {
242+ NodeList authorNodes = (NodeList ) xpath .evaluate (authorsPath , doc , XPathConstants .NODESET );
243+ LOGGER .debug ("Found {} author nodes using path '{}'" , authorNodes .getLength (), authorsPath );
244+
245+ for (int i = 0 ; i < authorNodes .getLength (); i ++) {
246+ Person person = createPersonFromNode (authorNodes .item (i ));
247+ if (person != null ) {
248+ authors .add (person );
249+ LOGGER .debug ("Added author: '{}'" , formatPersonName (person ));
250+ }
251+ }
252+ } catch (Exception e ) {
253+ LOGGER .debug ("Failed to extract authors with path '{}': {}" , authorsPath , e .getMessage ());
254+ }
255+
256+ return authors ;
257+ }
258+
259+ /**
260+ * Create a Person object from an XML persName node
261+ */
262+ private static Person createPersonFromNode (org .w3c .dom .Node node ) {
263+ if (node == null ) return null ;
264+
265+ Person person = new Person ();
266+
267+ if (node .getNodeName ().equals ("persName" )) {
268+ NodeList childNodes = node .getChildNodes ();
269+ for (int i = 0 ; i < childNodes .getLength (); i ++) {
270+ org .w3c .dom .Node child = childNodes .item (i );
271+ if (child .getNodeName ().equals ("surname" )) {
272+ person .setLastName (child .getTextContent ().trim ());
273+ } else if (child .getNodeName ().equals ("forename" )) {
274+ person .setFirstName (child .getTextContent ().trim ());
275+ } else if (child .getNodeName ().equals ("middlename" )) {
276+ person .setMiddleName (child .getTextContent ().trim ());
277+ }
278+ }
279+ } else {
280+ // If it's not a persName node, use the text content as the full name
281+ String fullName = node .getTextContent ().trim ();
282+ if (!fullName .isEmpty ()) {
283+ person .setLastName (fullName ); // Fallback: put full name in last name
284+ }
285+ }
286+
287+ // Return person if it has any name data
288+ return (StringUtils .isNotBlank (person .getLastName ()) ||
289+ StringUtils .isNotBlank (person .getFirstName ()) ||
290+ StringUtils .isNotBlank (person .getMiddleName ())) ? person : null ;
291+ }
292+
293+ /**
294+ * Format list of authors as "first middle last, first2 middle2 last2, ..."
295+ */
296+ private static String formatAuthors (List <Person > authors ) {
297+ StringBuilder formattedAuthors = new StringBuilder ();
298+ for (int i = 0 ; i < authors .size (); i ++) {
299+ if (i > 0 ) {
300+ formattedAuthors .append (", " );
301+ }
302+
303+ Person author = authors .get (i );
304+ formattedAuthors .append (formatPersonName (author ));
305+ }
306+ return formattedAuthors .toString ();
307+ }
308+
309+ /**
310+ * Format a Person's name as "first middle last"
311+ */
312+ private static String formatPersonName (Person person ) {
313+ String lastName = person .getLastName ();
314+ String firstName = person .getFirstName ();
315+ String middleName = person .getMiddleName ();
316+
317+ StringBuilder fullName = new StringBuilder ();
318+
319+ if (StringUtils .isNotBlank (firstName )) {
320+ fullName .append (firstName .trim ());
321+ }
322+
323+ if (StringUtils .isNotBlank (middleName )) {
324+ if (!fullName .isEmpty ()) {
325+ fullName .append (" " );
326+ }
327+ fullName .append (middleName .trim ());
328+ }
329+
330+ if (StringUtils .isNotBlank (lastName )) {
331+ if (!fullName .isEmpty ()) {
332+ fullName .append (" " );
333+ }
334+ fullName .append (lastName .trim ());
335+ }
336+
337+ return fullName .toString ();
338+ }
339+
340+
341+ @ Override
342+ public String toString () {
343+ return String .format ("MetadataArticle{doi='%s', title='%s', authors=%s}" ,
344+ doi , title , authors );
345+ }
346+
347+ /**
348+ * Simple namespace context for TEI documents
349+ */
350+ private static class TEINamespaceContext implements NamespaceContext {
351+ @ Override
352+ public String getNamespaceURI (String prefix ) {
353+ if ("tei" .equals (prefix )) {
354+ return "http://www.tei-c.org/ns/1.0" ;
355+ }
356+ return null ;
357+ }
358+
359+ @ Override
360+ public String getPrefix (String namespaceURI ) {
361+ if ("http://www.tei-c.org/ns/1.0" .equals (namespaceURI )) {
362+ return "tei" ;
363+ }
364+ return null ;
365+ }
366+
367+ @ Override
368+ public java .util .Iterator <String > getPrefixes (String namespaceURI ) {
369+ java .util .Set <String > prefixes = new java .util .HashSet <>();
370+ if ("http://www.tei-c.org/ns/1.0" .equals (namespaceURI )) {
371+ prefixes .add ("tei" );
372+ }
373+ return prefixes .iterator ();
374+ }
375+ }
376+ }
0 commit comments