Skip to content

Commit b52e311

Browse files
authored
Enforce consistent encodings throughout when loading templates (#547)
2 parents d0abaa6 + 6c8c042 commit b52e311

23 files changed

+2347
-124
lines changed

src/ontogpt/cli.py

Lines changed: 17 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@
5555
from ontogpt.io.owl_exporter import OWLExporter
5656
from ontogpt.io.rdf_exporter import RDFExporter
5757
from ontogpt.io.template_loader import get_template_details, get_template_path
58+
from ontogpt.io.utils import read_text_with_fallbacks
5859
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
5960
from ontogpt.templates.core import ExtractionResult
6061
from ontogpt.utils.multilingual import multilingual_analysis
@@ -196,7 +197,7 @@ def parse_input(
196197
parse_tabular_input(str(f), selectcols)
197198
if Path(f).suffix in VALID_TABULAR_FORMATS
198199
or Path(f).suffix in VALID_SPREADSHEET_FORMATS
199-
else open(f, "r").read()
200+
else read_text_with_fallbacks(f)
200201
)
201202
for f in inputfiles
202203
if f.is_file()
@@ -207,7 +208,7 @@ def parse_input(
207208
parse_tabular_input(str(f), selectcols)
208209
if Path(f).suffix in VALID_TABULAR_FORMATS
209210
or Path(f).suffix in VALID_SPREADSHEET_FORMATS
210-
else open(f, "r").read()
211+
else read_text_with_fallbacks(f)
211212
)
212213
for f in inputfiles
213214
if f.is_file()
@@ -225,7 +226,7 @@ def parse_input(
225226
text = parse_tabular_input(input, selectcols)
226227
logging.info(f"Input text: {text}")
227228
else:
228-
text = open(input, "rb").read().decode(encoding="utf-8", errors="ignore")
229+
text = read_text_with_fallbacks(Path(input))
229230
logging.info(f"Input text: {text}")
230231
parsedlist = [text]
231232
else:
@@ -1102,11 +1103,11 @@ def recipe_extract(
11021103
ke.client.cache_db_path = settings.cache_db
11031104

11041105
if recipes_urls_file:
1105-
with open(recipes_urls_file, "r") as f:
1106-
urls = [line.strip() for line in f.readlines() if url in line]
1107-
if len(urls) != 1:
1108-
raise ValueError(f"Found {len(urls)} URLs in {recipes_urls_file}")
1109-
url = urls[0]
1106+
content = read_text_with_fallbacks(Path(recipes_urls_file))
1107+
urls = [line.strip() for line in content.splitlines() if url in line]
1108+
if len(urls) != 1:
1109+
raise ValueError(f"Found {len(urls)} URLs in {recipes_urls_file}")
1110+
url = urls[0]
11101111
scraper = scrape_me(url)
11111112

11121113
if dictionary:
@@ -1172,8 +1173,7 @@ def convert(
11721173
)
11731174

11741175
cls = ke.template_pyclass
1175-
with open(input, "r") as f:
1176-
data = yaml.safe_load(f)
1176+
data = yaml.safe_load(read_text_with_fallbacks(Path(input)))
11771177
obj = cls(**data["extracted_object"])
11781178
results = ExtractionResult(extracted_object=obj)
11791179
write_extraction(results, output, output_format, ke, template, cut_input_text)
@@ -1580,7 +1580,7 @@ def diagnose(
15801580
if not model:
15811581
model = DEFAULT_MODEL
15821582

1583-
phenopackets = [json.load(open(f)) for f in phenopacket_files]
1583+
phenopackets = [json.loads(read_text_with_fallbacks(Path(f))) for f in phenopacket_files]
15841584
engine = PhenoEngine(
15851585
model=model,
15861586
temperature=temperature,
@@ -1633,7 +1633,9 @@ def run_multilingual_analysis(
16331633
elif input_data_dir and Path(input_data_dir).is_dir():
16341634
logging.info(f"Input file directory: {input_data_dir}")
16351635
inputfiles = Path(input_data_dir).glob("*.txt")
1636-
inputdict = {str(f.name): open(f, "r").read() for f in inputfiles if f.is_file()}
1636+
inputdict = {
1637+
str(f.name): read_text_with_fallbacks(f) for f in inputfiles if f.is_file()
1638+
}
16371639
logging.info(f"Found {len(inputdict)} input files here.")
16381640

16391641
i = 0
@@ -1698,7 +1700,7 @@ def answer(
16981700
**kwargs,
16991701
):
17001702
"""Answer a set of questions defined in YAML."""
1701-
qc = QuestionCollection(**yaml.safe_load(open(inputfile)))
1703+
qc = QuestionCollection(**yaml.safe_load(read_text_with_fallbacks(Path(inputfile))))
17021704
engine = GenericEngine(
17031705
model=model,
17041706
temperature=temperature,
@@ -1947,7 +1949,7 @@ def complete(
19471949
"""
19481950

19491951
if inputfile:
1950-
text = open(inputfile).read()
1952+
text = read_text_with_fallbacks(Path(inputfile))
19511953
else:
19521954
text = input.strip()
19531955

@@ -2174,7 +2176,7 @@ def _get_templates() -> Dict[str, Tuple[str, str]]:
21742176
template_dir = Path(__file__).parent / "templates"
21752177
template_paths = [f for f in template_dir.glob("*.yaml")]
21762178
for template_path in template_paths:
2177-
with open(template_path, "r") as template_file:
2179+
with open(template_path, "r", encoding="utf-8") as template_file:
21782180
data = yaml.safe_load(template_file)
21792181
if data["id"].startswith(http_prefixes):
21802182
identifier = data["id"].split("/")[-1]

src/ontogpt/engines/generic_engine.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from pydantic import BaseModel
3636

3737
from ontogpt.engines.knowledge_engine import KnowledgeEngine
38+
from ontogpt.io.utils import read_text_with_fallbacks
3839

3940
logger = logging.getLogger(__name__)
4041

@@ -77,9 +78,8 @@ def run(
7778
if isinstance(template_path, str):
7879
# Note: The default is a string, so this will always be true
7980
# create a Jinja2 template object
80-
with open(template_path) as file:
81-
template_txt = file.read()
82-
main_template = Template(template_txt)
81+
template_txt = read_text_with_fallbacks(Path(template_path))
82+
main_template = Template(template_txt)
8383
for question in question_collection.questions:
8484
for instruction in question_collection.instructions:
8585
template = main_template

src/ontogpt/engines/halo_engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
from ontogpt.clients import LLMClient
2525
from ontogpt.engines.knowledge_engine import FIELD, KnowledgeEngine
26+
from ontogpt.io.utils import read_text_with_fallbacks
2627
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
2728
from ontogpt.templates.halo import Ontology, OntologyElement
2829

@@ -96,7 +97,8 @@ def seed_from_file(self, file_path: str) -> Ontology:
9697
:param file_path:
9798
:return:
9899
"""
99-
ontology = Ontology(**yaml.safe_load(open(file_path)))
100+
ontology_text = read_text_with_fallbacks(Path(file_path))
101+
ontology = Ontology(**yaml.safe_load(ontology_text))
100102
self.seed(ontology)
101103
logger.info(f"Seeded with {len(ontology.elements)} elements")
102104
return ontology

src/ontogpt/engines/knowledge_engine.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@
9494
from ontogpt import DEFAULT_MODEL
9595
from ontogpt.clients.llm_client import LLMClient
9696
from ontogpt.templates.core import ExtractionResult, NamedEntity
97+
from ontogpt.io.utils import read_text_with_fallbacks
9798

9899
this_path = Path(__file__).parent
99100
logger = logging.getLogger(__name__)
@@ -271,8 +272,7 @@ def extract_from_file(self, file: Union[str, Path, TextIO]) -> pydantic.BaseMode
271272
if isinstance(file, str):
272273
file = Path(file)
273274
if isinstance(file, Path):
274-
with file.open() as f:
275-
text = f.read()
275+
text = read_text_with_fallbacks(file)
276276
else:
277277
text = file.read()
278278
self.last_text = text
@@ -283,8 +283,8 @@ def extract_from_file(self, file: Union[str, Path, TextIO]) -> pydantic.BaseMode
283283
def load_dictionary(self, path: Union[str, Path, list]):
284284
if not isinstance(path, list):
285285
logger.info(f"Loading dictionary from {path}")
286-
with open(str(path)) as f:
287-
return self.load_dictionary(yaml.safe_load(f))
286+
data = read_text_with_fallbacks(Path(path))
287+
return self.load_dictionary(yaml.safe_load(data))
288288
if self.dictionary is None:
289289
self.dictionary = {}
290290
entries = [(entry["synonym"].lower(), entry["id"]) for entry in path]

src/ontogpt/engines/mapping_engine.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from sssom_schema import Mapping
4848

4949
from ontogpt.engines.knowledge_engine import KnowledgeEngine
50+
from ontogpt.io.utils import read_text_with_fallbacks
5051
from ontogpt.prompts.mapping import DEFAULT_MAPPING_EVAL_PROMPT
5152

5253
logger = logging.getLogger(__name__)
@@ -153,9 +154,8 @@ def categorize_mapping(
153154
template_path = str(template_path)
154155
if isinstance(template_path, str):
155156
# create a Jinja2 template object
156-
with open(template_path) as file:
157-
template_txt = file.read()
158-
template = Template(template_txt)
157+
template_txt = read_text_with_fallbacks(Path(template_path))
158+
template = Template(template_txt)
159159
subject_concept = self._concept(subject, self.subject_adapter)
160160
object_concept = self._concept(object, self.object_adapter)
161161
prompt = template.render(

src/ontogpt/engines/pheno_engine.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pydantic import BaseModel
1414

1515
from ontogpt.engines.knowledge_engine import KnowledgeEngine
16+
from ontogpt.io.utils import read_text_with_fallbacks
1617
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
1718
from ontogpt.prompts.phenopacket import DEFAULT_PHENOPACKET_PROMPT
1819

@@ -58,9 +59,8 @@ def predict_disease(
5859
template_path = str(template_path)
5960
if isinstance(template_path, str):
6061
# create a Jinja2 template object
61-
with open(template_path) as file:
62-
template_txt = file.read()
63-
template = Template(template_txt)
62+
template_txt = read_text_with_fallbacks(Path(template_path))
63+
template = Template(template_txt)
6464
# Account for missing template fields if necessary
6565
# TODO: make this its own function
6666
for subject_key in ["sex", "ageAtCollection"]:

src/ontogpt/engines/reasoner_engine.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from pydantic import BaseModel
3838

3939
from ontogpt.engines.knowledge_engine import KnowledgeEngine
40+
from ontogpt.io.utils import read_text_with_fallbacks
4041
from ontogpt.ontex.extractor import (
4142
Answer,
4243
Axiom,
@@ -177,9 +178,8 @@ def reason(
177178
template_path = str(template_path)
178179
if isinstance(template_path, str):
179180
# create a Jinja2 template object
180-
with open(template_path) as file:
181-
template_txt = file.read()
182-
template = Template(template_txt)
181+
template_txt = read_text_with_fallbacks(Path(template_path))
182+
template = Template(template_txt)
183183
prompt = template.render(
184184
task=task,
185185
ontology=task.ontology,

src/ontogpt/engines/spires_engine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
chunk_text_by_char,
3333
chunk_text_by_sentence,
3434
)
35+
from ontogpt.io.utils import read_text_with_fallbacks
3536
from ontogpt.io.yaml_wrapper import dump_minimal_yaml
3637
from ontogpt.templates.core import ExtractionResult, NamedEntity
3738
from ontogpt.utils.parse_utils import get_span_values, sanitize_text
@@ -212,7 +213,7 @@ def _remove_parenthetical_context(s: str):
212213
cache_path = Path(cache_path)
213214
if cache_path:
214215
if cache_path.exists() and not clear:
215-
db = yaml.safe_load(cache_path.open())
216+
db = yaml.safe_load(read_text_with_fallbacks(cache_path))
216217
if "entities_in_queue" not in db:
217218
db["entities_in_queue"] = []
218219
else:

src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import ontogpt.templates.drug as target_datamodel
2121
from ontogpt.engines.spires_engine import SPIRESEngine
2222
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
23+
from ontogpt.io.utils import read_text_with_fallbacks
2324

2425
THIS_DIR = Path(__file__).parent
2526
DATABASE_DIR = Path(__file__).parent / "database"
@@ -115,16 +116,15 @@ def drug_to_mechanism_text(self) -> Dict[str, str]:
115116
:return:
116117
"""
117118
if self._drug_to_mechanism_text is None:
118-
with open(MOA_TEXTS, "r") as f:
119-
self._drug_to_mechanism_text = {}
120-
for line in f:
121-
toks = line.strip().split("\t")
122-
if len(toks) != 2:
123-
logging.warning(f"Bad line: {line}")
124-
continue
125-
drug, text = toks
126-
if text != "NA":
127-
self._drug_to_mechanism_text[f"drugbank:{drug}"] = text
119+
self._drug_to_mechanism_text = {}
120+
for line in read_text_with_fallbacks(Path(MOA_TEXTS)).splitlines():
121+
toks = line.strip().split("\t")
122+
if len(toks) != 2:
123+
logging.warning(f"Bad line: {line}")
124+
continue
125+
drug, text = toks
126+
if text != "NA":
127+
self._drug_to_mechanism_text[f"drugbank:{drug}"] = text
128128
return self._drug_to_mechanism_text
129129

130130
def load_source_mechanisms_from_path(
@@ -133,9 +133,10 @@ def load_source_mechanisms_from_path(
133133
if isinstance(file, Path):
134134
file = str(file)
135135
if isinstance(file, str):
136-
with open(file, "r") as f:
137-
return self.load_source_mechanisms_from_path(f)
138-
mechanisms = yaml.safe_load(file)
136+
data = read_text_with_fallbacks(Path(file))
137+
mechanisms = yaml.safe_load(data)
138+
else:
139+
mechanisms = yaml.safe_load(file)
139140
print(f"Loading {len(mechanisms)} mechanism objects from yaml; translating...")
140141
return [source_datamodel.Mechanism(**_fix_source_mechanism(m)) for m in mechanisms]
141142

src/ontogpt/evaluation/go/eval_go.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from ontogpt.engines.spires_engine import SPIRESEngine
1515
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
16+
from ontogpt.io.utils import read_text_with_fallbacks
1617
from ontogpt.templates.metabolic_process import MetabolicProcess
1718

1819
TEST_CASES_DIR = Path(__file__).parent / "test_cases"
@@ -82,8 +83,8 @@ def make_term_from_ldef(self, ldef: LogicalDefinitionAxiom) -> MetabolicProcess:
8283
return mp
8384

8485
def valid_test_ids(self) -> List[str]:
85-
with open(TEST_CASES_DIR / "go-ids-2022.txt") as f:
86-
return [x.strip() for x in f.readlines()]
86+
data = read_text_with_fallbacks(TEST_CASES_DIR / "go-ids-2022.txt")
87+
return [x.strip() for x in data.splitlines()]
8788

8889
def ldef_matches(self, ldef: LogicalDefinitionAxiom) -> bool:
8990
"""Check if a logical definition matches the genus and differentia."""

0 commit comments

Comments
 (0)