-
Notifications
You must be signed in to change notification settings - Fork 41
Expand file tree
/
Copy pathmultinode_eval_v.py
More file actions
206 lines (148 loc) · 5.65 KB
/
multinode_eval_v.py
File metadata and controls
206 lines (148 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import os
import sys
import subprocess
import shlex
from termcolor import cprint
from omegaconf import DictConfig, ListConfig, OmegaConf
def run_local(cmd: str, check: bool = True) -> None:
    """Run ``cmd`` in a local bash login shell and wait for it to finish.

    The command is handed to ``bash -lc`` as a single argv element, so no
    intermediate ``/bin/sh`` is spawned and no extra quoting layer is needed.

    Args:
        cmd: Shell command line to execute.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and the command
            exits with a non-zero status.
        FileNotFoundError: If ``bash`` cannot be found on ``PATH``.
    """
    subprocess.run(["bash", "-lc", cmd], check=check)
def run_local_async(cmd: str) -> subprocess.Popen:
    """Start ``cmd`` in a local bash login shell without waiting for it.

    The command is passed to ``bash -lc`` as a single argv element, so the
    returned process is the bash instance itself rather than a wrapping
    ``/bin/sh``.

    Args:
        cmd: Shell command line to execute.

    Returns:
        The ``subprocess.Popen`` handle; the caller is responsible for
        calling ``wait()`` on it.

    Raises:
        FileNotFoundError: If ``bash`` cannot be found on ``PATH``.
    """
    return subprocess.Popen(["bash", "-lc", cmd])
def run_remote(host: str, cmd: str, check: bool = True) -> None:
    """Run ``cmd`` in a bash login shell on ``host`` via ``ssh root@host``.

    ssh is invoked with an argv list, so the local machine performs no shell
    parsing at all. The remote command string is ``bash -lc <quoted cmd>``;
    ``shlex.quote`` protects ``cmd`` from the remote shell that ssh starts,
    which means ``$VAR``, backticks and double quotes inside ``cmd`` reach
    the remote bash unchanged instead of being expanded or split locally.

    Args:
        host: Hostname or address of the remote node.
        cmd: Shell command line to execute remotely.
        check: When true, a non-zero ssh exit status raises
            ``subprocess.CalledProcessError``.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and ssh (or the
            remote command) exits with a non-zero status.
        FileNotFoundError: If the ``ssh`` binary cannot be found on ``PATH``.
    """
    remote_cmd = f"bash -lc {shlex.quote(cmd)}"
    subprocess.run(["ssh", f"root@{host}", remote_cmd], check=check)
def run_remote_async(host: str, cmd: str) -> subprocess.Popen:
    """Start ``cmd`` in a bash login shell on ``host`` without waiting.

    ssh is invoked with an argv list, so ``cmd`` is never parsed by a local
    shell; ``shlex.quote`` only has to protect it from the remote shell that
    ssh starts. ``$VAR``, backticks and double quotes inside ``cmd``
    therefore reach the remote bash unchanged.

    Args:
        host: Hostname or address of the remote node.
        cmd: Shell command line to execute remotely.

    Returns:
        The ``subprocess.Popen`` handle of the local ssh client; the caller
        is responsible for calling ``wait()`` on it.

    Raises:
        FileNotFoundError: If the ``ssh`` binary cannot be found on ``PATH``.
    """
    remote_cmd = f"bash -lc {shlex.quote(cmd)}"
    return subprocess.Popen(["ssh", f"root@{host}", remote_cmd])
def get_config():
    """Load the run configuration from a YAML file plus CLI overrides.

    The command line is parsed with ``OmegaConf.from_cli()``; its ``config``
    entry names the YAML file to load. The CLI values are merged on top of
    the YAML values and the merged config is returned.
    """
    overrides = OmegaConf.from_cli()
    base = OmegaConf.load(overrides.config)
    merged = OmegaConf.merge(base, overrides)
    return merged
def begin_with(file_name: str):
    """Create ``file_name`` as an empty file, truncating it if it exists."""
    # Opening in "w" mode is what creates/truncates; nothing is written.
    handle = open(file_name, "w")
    handle.close()
def make_init_bash(cfg) -> str:
    """Build the bash script that prepares a node's shell environment.

    The script appends ``export`` lines for the configured proxies and
    ``HF_HOME`` to ``~/.bashrc``, optionally registers an extra conda
    ``envs_dirs`` entry, and makes ``~/.bash_profile`` source ``~/.bashrc``.
    Settings whose value is ``None`` are skipped.

    Args:
        cfg: Config object whose ``system`` section provides ``HTTP_PROXY``,
            ``HF_HOME`` and ``envs_dir``, and optionally ``HTTPS_PROXY``.

    Returns:
        The script text, one command per line, ending with a newline.
    """
    sc = cfg.system
    http_proxy = sc.HTTP_PROXY
    # Prefer a dedicated HTTPS_PROXY setting. Fall back to HTTP_PROXY so that
    # configs defining only one proxy keep producing the same script.
    https_proxy = getattr(sc, "HTTPS_PROXY", None)
    if https_proxy is None:
        https_proxy = http_proxy
    hf_home = sc.HF_HOME
    envs_dir = sc.envs_dir

    exports = (
        ("HTTP_PROXY", http_proxy),
        ("HTTPS_PROXY", https_proxy),
        ("HF_HOME", hf_home),
    )

    lines = ["set -e"]
    lines.extend(
        f"echo 'export {name}={value}' >> ~/.bashrc"
        for name, value in exports
        if value is not None
    )
    lines.append("")

    if envs_dir is not None:
        # "|| true" keeps "set -e" from aborting when conda is unavailable
        # or the entry is rejected.
        lines.append(f"conda config --append envs_dirs {envs_dir} || true")
    lines.append("")

    lines.append("echo 'source ~/.bashrc' >> ~/.bash_profile")
    lines.append("")

    return "\n".join(lines)
if __name__ == "__main__":
    import time

    # Maps cfg.model_base to the sampling script (without ".py") that lives
    # under {BASE_DIR}/sample.
    SAMPLE_SCRIPTS = {
        "dream": "dream_sample",
        "llada": "mmada_v_sample",
        "mmada": "mmada_v_sample",
        "sdar": "sdar_sample",
        "trado": "trado_sample",
        "lladav": "lladav_sample",
    }

    def init_node(host: str):
        """Run the INIT_BASH setup script on ``host``; failures are ignored."""
        run_remote(host, INIT_BASH, check=False)

    def init_hosts(worker_hosts):
        """Run the setup script on every host in ``worker_hosts``, in order."""
        for h in worker_hosts:
            init_node(h)

    def env_prefix() -> str:
        """Return the shell prefix that sources ~/.bashrc and activates the env."""
        return (
            "source ~/.bashrc && "
            f"source activate {env_name} && "
        )

    def _stage_cmd(subdir, script, cfg, strategy, node_index=None) -> str:
        """Build the full shell command for one pipeline stage.

        ``node_index`` is only added (as ``experiment.node_index``) for the
        stages that are fanned out across nodes.
        """
        parts = [
            f"cd {BASE_DIR}/{subdir} &&",
            f"python {script}",
            f"config=../configs/{cfg.experiment.project}.yaml",
        ]
        if node_index is not None:
            parts.append(f"experiment.node_index={node_index}")
        parts.append(f"model={cfg.model}")
        parts.append(f"rollout.remasking_strategy={strategy}")
        return env_prefix() + " ".join(parts)

    def _run_on_all_nodes(worker_hosts, stage, build_cmd):
        """Launch ``build_cmd(idx)`` on every node in parallel and wait for all.

        The command for index 0 is run locally (this script runs on the first
        node); all other indices are started over SSH. Non-zero exit codes
        are reported but do not abort the run.
        """
        procs = []
        for idx, host in enumerate(worker_hosts):
            cmd = build_cmd(idx)
            if idx == 0:
                procs.append((host, run_local_async(cmd)))
            else:
                procs.append((host, run_remote_async(host, cmd)))
        for host, p in procs:
            rc = p.wait()
            if rc != 0:
                cprint(f"[{stage}] node {host} exited with code {rc}", "red")

    def sample(worker_hosts, cfg, strategy):
        """Run the model-specific sampling script on every node.

        Raises:
            ValueError: If ``cfg.model_base`` has no known sampling script.
        """
        model_base = cfg.model_base
        python_name = SAMPLE_SCRIPTS.get(model_base)
        if python_name is None:
            # Fail before launching anything instead of hitting an unbound
            # variable while building the command.
            raise ValueError(
                f"Unsupported model_base {model_base!r}; "
                f"expected one of {sorted(SAMPLE_SCRIPTS)}"
            )
        _run_on_all_nodes(
            worker_hosts,
            "sample",
            lambda idx: _stage_cmd(
                "sample", f"{python_name}.py", cfg, strategy, node_index=idx
            ),
        )

    def execute(worker_hosts, cfg, strategy):
        """Run reward/execute.py on every node."""
        _run_on_all_nodes(
            worker_hosts,
            "execute",
            lambda idx: _stage_cmd(
                "reward", "execute.py", cfg, strategy, node_index=idx
            ),
        )

    def aggregate(cfg, strategy):
        """Run reward/aggregate_data_v.py locally; raises if it fails."""
        run_local(_stage_cmd("reward", "aggregate_data_v.py", cfg, strategy))

    def reward(cfg, strategy):
        """Run reward/reward_v.py locally; raises if it fails."""
        run_local(_stage_cmd("reward", "reward_v.py", cfg, strategy))

    # use first node to control others
    cfg = get_config()
    INIT_BASH = make_init_bash(cfg)
    BASE_DIR = cfg.system.base_dir
    env_name = cfg.system.env_name

    project = cfg.experiment.project
    num_node = cfg.experiment.num_node
    # Host names come from MLP_WORKER_<i>_HOST; a missing variable raises
    # KeyError here.
    worker_hosts = [os.environ[f"MLP_WORKER_{i}_HOST"] for i in range(num_node)]
    eval_type = cfg.dataset.data_type

    # NOTE(review): presumably gives the other nodes time to come up before
    # the first SSH attempt -- confirm the required delay.
    time.sleep(30)
    init_hosts(worker_hosts)
    # NOTE(review): presumably lets the node setup settle -- confirm.
    time.sleep(10)

    os.makedirs(f"{project}/results", exist_ok=True)

    # Accept either a single strategy or a list of strategies.
    remasking_strategies = cfg.rollout.remasking_strategy
    if not isinstance(remasking_strategies, (list, ListConfig)):
        remasking_strategies = [remasking_strategies]

    for strategy in remasking_strategies:
        cprint(f"--- Running evaluation for strategy: {strategy} ---", "green")
        sample(worker_hosts, cfg, strategy)
        # Only code datasets need the execution stage before aggregation.
        if eval_type == "code":
            execute(worker_hosts, cfg, strategy)
        aggregate(cfg, strategy)
        reward(cfg, strategy)