Skip to content

Commit 7d4dd48

Browse files
authored
Merge pull request #35877 from vespa-engine/arnej/locale-for-linguistics
Improve locale handling in linguistics code
2 parents ae982d5 + fcdd78e commit 7d4dd48

6 files changed

Lines changed: 14 additions & 8 deletions

File tree

linguistics-components/src/main/java/com/yahoo/language/huggingface/ModelInfo.java

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
package com.yahoo.language.huggingface;
44

5+
import com.yahoo.text.Text;
56
import java.util.Arrays;
67

78
/**
@@ -21,7 +22,7 @@ public static TruncationStrategy fromString(String v) {
2122
else if ("false".equals(v)) return DO_NOT_TRUNCATE;
2223
return Arrays.stream(values())
2324
.filter(s -> s.name().equalsIgnoreCase(v))
24-
.findAny().orElseThrow(() -> new IllegalArgumentException("Invalid strategy '%s'".formatted(v)));
25+
.findAny().orElseThrow(() -> new IllegalArgumentException(Text.format("Invalid strategy '%s'", v)));
2526
}
2627
}
2728

@@ -35,7 +36,7 @@ public static PaddingStrategy fromString(String v) {
3536
else if ("false".equals(v)) return DO_NOT_PAD;
3637
return Arrays.stream(values())
3738
.filter(s -> s.name().equalsIgnoreCase(v))
38-
.findAny().orElseThrow(() -> new IllegalArgumentException("Invalid strategy '%s'".formatted(v)));
39+
.findAny().orElseThrow(() -> new IllegalArgumentException(Text.format("Invalid strategy '%s'", v)));
3940
}
4041
}
4142
}

linguistics-components/src/main/java/com/yahoo/language/wordpiece/Model.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
import java.util.Collections;
1919
import java.util.HashMap;
2020
import java.util.List;
21+
import java.util.Locale;
2122
import java.util.Map;
2223
import java.util.NavigableMap;
2324
import java.util.TreeMap;
@@ -67,7 +68,7 @@ class Model {
6768

6869
List<Integer> embed(String text, Tokenizer tokenizer, LinguisticsParameters parameters) {
6970
List<Integer> ids = new ArrayList<>();
70-
text = text.toLowerCase();
71+
text = text.toLowerCase(Locale.ROOT);
7172
for (Token t : tokenizer.tokenize(text, parameters)) {
7273
String originalToken = t.getTokenString();
7374
String candidate = originalToken;

linguistics-components/src/test/java/com/yahoo/language/huggingface/HuggingFaceTokenizerTest.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
package com.yahoo.language.huggingface;
44

55
import com.yahoo.language.tools.EmbedderTester;
6+
import com.yahoo.text.Text;
67
import org.junit.jupiter.api.Test;
78
import org.junit.jupiter.api.io.TempDir;
89

@@ -129,7 +130,7 @@ private static void assertMaxLengthRespected(int maxLength, Encoding encoding) {
129130
}
130131

131132
private static Path decompressModelFile(Path tmp, String model) throws IOException {
132-
var source = Paths.get("src/test/models/huggingface/%s.json.gz".formatted(model));
133+
var source = Paths.get(Text.format("src/test/models/huggingface/%s.json.gz", model));
133134
Path destination = tmp.resolve(source.getFileName().toString().replace(".gz", ""));
134135
try (InputStream in = new GZIPInputStream(Files.newInputStream(source));
135136
OutputStream out = Files.newOutputStream(destination, StandardOpenOption.CREATE)) {

linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import com.yahoo.language.significance.SignificanceModel;
88
import com.yahoo.language.significance.SignificanceModelRegistry;
99
import com.yahoo.search.significance.config.SignificanceConfig;
10+
import com.yahoo.text.Text;
1011
import io.airlift.compress.zstd.ZstdInputStream;
1112

1213
import java.io.IOException;
@@ -61,12 +62,12 @@ public void addModel(Path path) {
6162
for (var pair : file.languages().entrySet()) {
6263

6364
var languagesStr = pair.getKey();
64-
log.fine(() -> "Found model for languages '%s'".formatted(languagesStr));
65+
log.fine(() -> Text.format("Found model for languages '%s'", languagesStr));
6566
String[] languageTags = languagesStr.split(",");
6667

6768
for (var languageTag : languageTags) {
6869
var language = Language.fromLanguageTag(languageTag);
69-
log.fine(() -> "Adding model for language %s with id %s".formatted(language, file.id()));
70+
log.fine(() -> Text.format("Adding model for language %s with id %s", language, file.id()));
7071
this.models.put(language, new DefaultSignificanceModel(pair.getValue(), file.id()));
7172
}
7273
}

linguistics/src/test/java/com/yahoo/language/process/GenWordCharsBitVector.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
package com.yahoo.language.process;
44
import java.nio.charset.StandardCharsets;
5+
import java.util.Locale;
56

67
// program to generate code tables used by Fast_UnicodeUtil::IsWordChar()
78

@@ -36,7 +37,7 @@ static String genTable() {
3637
while (s.length() < nextpos) s.append(' ');
3738
nextpos += 18;
3839
s.append("0x");
39-
String hex = Long.toHexString(val).toUpperCase();
40+
String hex = Long.toHexString(val).toUpperCase(Locale.ROOT);
4041
while (s.length() + hex.length() < nextpos) s.append('0');
4142
s.append(hex);
4243
if (codepoint + 1 < maxCodeBlocks * 0x100) s.append(",");

lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
import java.util.ArrayList;
3636
import java.util.Iterator;
3737
import java.util.List;
38+
import java.util.Locale;
3839
import java.util.Map;
3940
import java.util.Optional;
4041

@@ -386,7 +387,7 @@ protected DuplicateTokenFilter(TokenStream input) {
386387
public boolean incrementToken() throws IOException {
387388
if (emitUppercase) {
388389
restoreState(savedState);
389-
String value = term.toString().toUpperCase();
390+
String value = term.toString().toUpperCase(Locale.ROOT);
390391
term.setEmpty();
391392
term.append(value);
392393
position.setPositionIncrement(0); // same position

0 commit comments

Comments
 (0)