Skip to content

Commit 8b88ab9

Browse files
authored
Merge pull request #53 from lfoppiano/sofair
Contribution and changes from the SoFAIR project
2 parents e0b38c1 + ca77883 commit 8b88ab9

3 files changed

Lines changed: 21 additions & 14 deletions

File tree

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -362,7 +362,7 @@ task downloadModelsGit {
362362
} else {
363363
def grgit = Grgit.clone(
364364
dir: "${grobidHome}/models/softcite",
365-
uri: "https://huggingface.co/sciencialab/software-mentions-models",
365+
uri: "https://huggingface.co/SoFairOA/software-mentions-models",
366366
depth: 1
367367
)
368368
}

localLibs/org/grobid/grobid-service/0.8.2/grobid-service-0.8.2.pom

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -269,7 +269,7 @@
269269
<dependency>
270270
<groupId>org.grobid</groupId>
271271
<artifactId>grobid-core</artifactId>
272-
<version>0.8.3-SNAPSHOT</version>
272+
<version>0.8.2</version>
273273
<scope>runtime</scope>
274274
<exclusions>
275275
<exclusion>
@@ -285,7 +285,7 @@
285285
<dependency>
286286
<groupId>org.grobid</groupId>
287287
<artifactId>grobid-trainer</artifactId>
288-
<version>0.8.3-SNAPSHOT</version>
288+
<version>0.8.2</version>
289289
<scope>runtime</scope>
290290
<exclusions>
291291
<exclusion>

src/main/java/org/grobid/core/engines/SoftwareParser.java

Lines changed: 18 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -879,10 +879,10 @@ private List<SoftwareEntity> processLayoutTokenSequenceMultiple(
879879
*/
880880
private List<SoftwareEntity> processLayoutTokenSequences(
881881
List<LayoutTokenization> layoutTokenizations,
882-
List<SoftwareEntity> entities,
883-
boolean disambiguate,
884-
boolean addParagraphContext,
885-
boolean fromPDF,
882+
List<SoftwareEntity> entities,
883+
boolean disambiguate,
884+
boolean addParagraphContext,
885+
boolean fromPDF,
886886
boolean fromXML,
887887
List<PDFAnnotation> pdfAnnotations
888888
) {
@@ -896,12 +896,14 @@ private List<SoftwareEntity> processLayoutTokenSequences(
896896

897897
// positions for lexical match
898898
List<OffsetPosition> softwareTokenPositions = softwareLexicon.tokenPositionsSoftwareNames(layoutTokens);
899-
List<OffsetPosition> urlPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations).stream()
899+
List<OffsetPosition> urlTokensPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations).stream()
900900
.map(Pair::getLeft)
901901
.collect(Collectors.toList());
902902

903+
urlTokensPositions.stream().forEach(o -> o.end += 1);
904+
903905
// string representation of the feature matrix for sequence labeling lib
904-
String ress = addFeatures(layoutTokens, softwareTokenPositions, urlPositions);
906+
String ress = addFeatures(layoutTokens, softwareTokenPositions, urlTokensPositions);
905907
allRess.append(ress);
906908
allRess.append("\n\n");
907909
}
@@ -990,10 +992,13 @@ private List<SoftwareEntity> processLayoutTokenSequences(
990992
* Process with the software model a set of arbitrary sequence of LayoutTokenization
991993
* from tables and figures, where the content is not structured (yet)
992994
*/
993-
private List<SoftwareEntity> processLayoutTokenSequenceTableFigure(List<LayoutToken> layoutTokens,
994-
List<SoftwareEntity> entities,
995-
boolean disambiguate,
996-
boolean addParagraphContext) {
995+
private List<SoftwareEntity> processLayoutTokenSequenceTableFigure(
996+
List<LayoutToken> layoutTokens,
997+
List<SoftwareEntity> entities,
998+
boolean disambiguate,
999+
boolean addParagraphContext,
1000+
List<PDFAnnotation> pdfAnnotations
1001+
) {
9971002
layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens);
9981003

9991004
int pos = 0;
@@ -1016,7 +1021,9 @@ private List<SoftwareEntity> processLayoutTokenSequenceTableFigure(List<LayoutTo
10161021

10171022
// positions for lexical match
10181023
List<OffsetPosition> softwareTokenPositions = softwareLexicon.tokenPositionsSoftwareNames(localLayoutTokens);
1019-
List<OffsetPosition> urlPositions = Lexicon.getInstance().tokenPositionsUrlPattern(localLayoutTokens);
1024+
List<OffsetPosition> urlPositions = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(layoutTokens, pdfAnnotations).stream()
1025+
.map(Pair::getLeft)
1026+
.collect(Collectors.toList());
10201027

10211028
// string representation of the feature matrix for sequence labeling lib
10221029
String ress = addFeatures(localLayoutTokens, softwareTokenPositions, urlPositions);

0 commit comments

Comments
 (0)