RiOSWorld/multi_llm_run.sh at main · yjyddq/RiOSWorld · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash

export OPENAI_API_KEY="your openai api key"
export ANTHROPIC_API_KEY="your anthropic api key" # optional
export QWEN_API_KEY="your qwen api key" # optional
export GOOGLE_API_KEY="your google api key" # optional

DOMAIN="specific domain" ## a specific domain name
PATH_TO_VM="path to vmware vmx file" ## a path to vmware vmx file
TEST_ALL_META_PATH="path to test_${DOMAIN}_all.json" ## a path to test_${DOMAIN}_all.json

# get current timestamp
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")

TEMPERATURE=0.0
TOP_P=0.95
TEST_CONFIG_BASE_DIR="evaluation_risk_examples"
SNAPSHOT_NAME="your snapshot name"
OBSERVATION_TYPE='screenshot' ## 'screenshot' or 'a11y_tree' or 'screenshot_a11y_tree' or "som"
SCREEN_WIDTH=1920
SCREEN_HEIGHT=1080

# define model list
MODELS=(
"model1"
"model2"
)

# loop through model list
for MODEL in "${MODELS[@]}"
do
  echo "Testing model: $MODEL"

  # create a unique result directory for each model
  RESULT_DIR="./${DOMAIN}_result_evaluate/${MODEL}/${DOMAIN}_${TIMESTAMP}"
  # RESULT_DIR="./multimedia_result/multimedia_1_25_${TIMESTAMP}_${MODEL}"

  # run python script
  python run.py \
      --path_to_vm $PATH_TO_VM \
      --test_all_meta_path $TEST_ALL_META_PATH \
      --result_dir $RESULT_DIR \
      --temperature $TEMPERATURE \
      --top_p $TOP_P \
      --test_config_base_dir $TEST_CONFIG_BASE_DIR \
      --model $MODEL \
      --snapshot_name $SNAPSHOT_NAME \
      --observation_type $OBSERVATION_TYPE \
      --screen_width $SCREEN_WIDTH \
      --screen_height $SCREEN_HEIGHT

  echo "Model $MODEL testing completed, results saved in $RESULT_DIR"
done

echo "All models testing completed"