@@ -879,10 +879,10 @@ private List<SoftwareEntity> processLayoutTokenSequenceMultiple(
879879 */
880880 private List <SoftwareEntity > processLayoutTokenSequences (
881881 List <LayoutTokenization > layoutTokenizations ,
882- List <SoftwareEntity > entities ,
883- boolean disambiguate ,
884- boolean addParagraphContext ,
885- boolean fromPDF ,
882+ List <SoftwareEntity > entities ,
883+ boolean disambiguate ,
884+ boolean addParagraphContext ,
885+ boolean fromPDF ,
886886 boolean fromXML ,
887887 List <PDFAnnotation > pdfAnnotations
888888 ) {
@@ -896,12 +896,14 @@ private List<SoftwareEntity> processLayoutTokenSequences(
896896
897897 // positions for lexical match
898898 List <OffsetPosition > softwareTokenPositions = softwareLexicon .tokenPositionsSoftwareNames (layoutTokens );
899- List <OffsetPosition > urlPositions = Lexicon .tokenPositionUrlPatternWithPdfAnnotations (layoutTokens , pdfAnnotations ).stream ()
899+ List <OffsetPosition > urlTokensPositions = Lexicon .tokenPositionUrlPatternWithPdfAnnotations (layoutTokens , pdfAnnotations ).stream ()
900900 .map (Pair ::getLeft )
901901 .collect (Collectors .toList ());
902902
903+ urlTokensPositions .stream ().forEach (o -> o .end += 1 );
904+
903905 // string representation of the feature matrix for sequence labeling lib
904- String ress = addFeatures (layoutTokens , softwareTokenPositions , urlPositions );
906+ String ress = addFeatures (layoutTokens , softwareTokenPositions , urlTokensPositions );
905907 allRess .append (ress );
906908 allRess .append ("\n \n " );
907909 }
@@ -990,10 +992,13 @@ private List<SoftwareEntity> processLayoutTokenSequences(
990992 * Process with the software model a set of arbitrary sequences of LayoutTokenization
991993 * from tables and figures, where the content is not structured (yet)
992994 */
993- private List <SoftwareEntity > processLayoutTokenSequenceTableFigure (List <LayoutToken > layoutTokens ,
994- List <SoftwareEntity > entities ,
995- boolean disambiguate ,
996- boolean addParagraphContext ) {
995+ private List <SoftwareEntity > processLayoutTokenSequenceTableFigure (
996+ List <LayoutToken > layoutTokens ,
997+ List <SoftwareEntity > entities ,
998+ boolean disambiguate ,
999+ boolean addParagraphContext ,
1000+ List <PDFAnnotation > pdfAnnotations
1001+ ) {
9971002 layoutTokens = SoftwareAnalyzer .getInstance ().retokenizeLayoutTokens (layoutTokens );
9981003
9991004 int pos = 0 ;
@@ -1016,7 +1021,9 @@ private List<SoftwareEntity> processLayoutTokenSequenceTableFigure(List<LayoutTo
10161021
10171022 // positions for lexical match
10181023 List <OffsetPosition > softwareTokenPositions = softwareLexicon .tokenPositionsSoftwareNames (localLayoutTokens );
1019- List <OffsetPosition > urlPositions = Lexicon .getInstance ().tokenPositionsUrlPattern (localLayoutTokens );
1024+ List <OffsetPosition > urlPositions = Lexicon .tokenPositionUrlPatternWithPdfAnnotations (layoutTokens , pdfAnnotations ).stream ()
1025+ .map (Pair ::getLeft )
1026+ .collect (Collectors .toList ());
10201027
10211028 // string representation of the feature matrix for sequence labeling lib
10221029 String ress = addFeatures (localLayoutTokens , softwareTokenPositions , urlPositions );
0 commit comments