-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtransform_data.py
More file actions
150 lines (125 loc) · 4.49 KB
/
transform_data.py
File metadata and controls
150 lines (125 loc) · 4.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""Converts a list of PIFs into tabular data for easier manipulation."""
import os
import gzip
import json
import random
from typing import List
from typing import Union
import pandas as pd
from pymatgen.core import Composition
def _read_pifs(json_file: str) -> List[dict]:
    """Load the PIF records stored in a JSON file.

    A path whose extension is exactly ``.gz`` is opened through ``gzip``;
    any other path is opened as a regular file. Progress messages are
    printed before and after loading.

    Args:
        json_file (str): Path to the (optionally gzip-compressed) JSON file.

    Returns:
        List[dict]: The decoded JSON content (expected to be a list of PIFs).
    """
    print(f"Reading PIFs from {json_file}...")
    _, extension = os.path.splitext(json_file)
    # Both openers hand back a binary stream; json.load accepts bytes input.
    opener = gzip.open if extension == ".gz" else open
    with opener(json_file, "rb") as stream:
        records = json.load(stream)
    print(f"Successfully read {len(records)} PIFs")
    return records
def _get_overall_composition(pif: dict) -> Union[dict, None]:
    """Map each element in a PIF's composition to its ideal atomic percent.

    Args:
        pif (dict): A single PIF record. Its "composition" entry, when
            present, is iterated as a sequence of dicts that each provide
            the keys "element" and "idealAtomicPercent".

    Returns:
        Union[dict, None]: ``{element: idealAtomicPercent}``, or None when the
        PIF has no composition entries (key missing, null, or empty).

    Raises:
        KeyError: If a composition entry lacks "element" or
            "idealAtomicPercent".
    """
    # A missing or null "composition" is handled like an empty one, so every
    # "no composition" case yields None instead of only the empty-list case.
    entries = pif.get("composition") or []
    comp = {e["element"]: e["idealAtomicPercent"] for e in entries}
    return comp or None
def _get_prop_value(pif: dict, prop_name: str) -> Union[str, float, None]:
    """Return the scalar value of the PIF property best matching `prop_name`.

    A property matches when `prop_name` occurs in its "name". Because one
    requested name can be contained in another property's name (for example
    "Pauling electronegativity" inside "Local Pauling electronegativity"),
    candidates are ranked: an exact name match wins over a name that starts
    with `prop_name`, which wins over any other substring match. Within the
    same rank the first property in the PIF is used.

    Args:
        pif (dict): A single PIF record; its "properties" entry, when present,
            is iterated as a sequence of dicts with a "name" key.
        prop_name (str): Full or partial name of the property to look up.

    Returns:
        Union[str, float, None]: ``property["scalars"]["value"]`` of the best
        match, or None when no property matches.

    Raises:
        KeyError: If a scanned property has no "name", or the matched
            property has no "scalars"/"value" entry.
    """
    best = None
    best_rank = 3  # lower is better: 0 exact, 1 prefix, 2 substring
    for prop in pif.get("properties") or []:
        name = prop["name"]
        if name == prop_name:
            rank = 0
        elif isinstance(name, str) and name.startswith(prop_name):
            rank = 1
        elif prop_name in name:
            rank = 2
        else:
            continue
        if rank < best_rank:
            best, best_rank = prop, rank
            if rank == 0:
                # nothing can beat an exact match
                break
    if best is None:
        return None
    return best["scalars"]["value"]
def get_input_data_as_df(
    pifs: List[dict],
    /,
    properties: Union[List[str], None] = None,
    *,
    write_to_disk: bool = True,
    fname: str = "data.csv",
) -> pd.DataFrame:
    """
    Transform the input list of PIFs to a pd.DataFrame.

    Each PIF contributes one row holding its name, reduced formula,
    composition and the requested property values. Rows are keyed by PIF
    name, so a later PIF with the same name replaces an earlier one. An empty
    `pifs` list yields an empty dataframe that still has all columns.

    Args:
        pifs (List[dict]): List of PIFs.
        properties (List[str]): List of properties to extract from each PIF.
            Each property becomes a column named after the lower-cased
            property with whitespace replaced by underscores.
            Defaults to the following:
            [
                "Top monolayer formula",
                "Bulk formula",
                "Binding energy of adsorbed",
                "Filling of a d-band",
                "Center of a d-band",
                "Width of a d-band",
                "Skewness of a d-band",
                "Kurtosis of a d-band",
                "Work function",
                "Atomic radius",
                "Spatial extent of d-orbitals",
                "Ionization potential",
                "Electron affinity",
                "Pauling electronegativity",
                "Local Pauling electronegativity",
                "d coupling matrix"
            ]
        write_to_disk (bool, optional): Whether to write the dataframe to disk.
            Defaults to True.
        fname (str, optional): Path to the file to write the dataframe to.
            Defaults to "data.csv".

    Returns:
        pd.DataFrame: Input PIFs data as a dataframe.

    Raises:
        ValueError: If a PIF does not carry exactly one distinct name.
    """
    if properties is None:
        properties = [
            "Top monolayer formula",
            "Bulk formula",
            "Binding energy of adsorbed",
            "Filling of a d-band",
            "Center of a d-band",
            "Width of a d-band",
            "Skewness of a d-band",
            "Kurtosis of a d-band",
            "Work function",
            "Atomic radius",
            "Spatial extent of d-orbitals",
            "Ionization potential",
            "Electron affinity",
            "Pauling electronegativity",
            "Local Pauling electronegativity",
            "d coupling matrix",
        ]
    # materialize once so the properties can be walked for every PIF
    properties = list(properties)
    prop_keys = ["_".join(prop.lower().split()) for prop in properties]
    # Column order is fixed up front (duplicates dropped, first one kept)
    # rather than read back from an arbitrary entry, so it is deterministic
    # and also defined when there are no PIFs at all.
    columns = list(dict.fromkeys(["name", "formula", "comp"] + prop_keys))

    ddict = {}
    for pif in pifs:
        # NOTE(review): presumably pif["name"] is a list of names - confirm
        names = set(pif["name"])
        # only one name (to be used as UID)
        if len(names) != 1:
            raise ValueError(
                f"Expected exactly one unique name per PIF, got {names!r}"
            )
        (name,) = names
        # get alloy composition
        comp = _get_overall_composition(pif)
        # get pymatgen reduced formula for later labeling; a PIF without
        # composition gets no formula instead of crashing in Composition()
        formula = None if comp is None else Composition(comp).reduced_formula
        entry = {"name": name, "formula": formula, "comp": comp}
        # get all properties to use for training, add to data dictionary
        for prop, prop_key in zip(properties, prop_keys):
            entry[prop_key] = _get_prop_value(pif, prop)
        # add this entry to the data dictionary
        ddict[name] = entry
    print(f"# of entries in the parsed input data: {len(ddict)}")

    # convert to df
    data = [[entry[c] for c in columns] for entry in ddict.values()]
    df = pd.DataFrame(data=data, columns=columns)
    print(df)
    if write_to_disk:
        df.to_csv(fname, index=False)
        # report success only after the file has actually been written
        print(f"Input data written to {fname}.")
    return df
if __name__ == "__main__":
    # Script entry point: parse the raw bimetallics PIF dump, tabulate it,
    # write the table to CSV and echo it to stdout.
    raw_pifs = _read_pifs("ma_2015_bimetallics_raw.json.gz")
    table = get_input_data_as_df(raw_pifs, fname="bimetallics_data.csv")
    print(table)