|
50 | 50 | " for item in data_list:\n", |
51 | 51 | " assert new_range[0] <= item[lb] <= new_range[1]\n", |
52 | 52 | " item[lb] = (item[lb] - new_range[0]) / max_range * 100\n", |
| 53 | + " return data_list, range_map\n", |
| 54 | + "\n", |
| 55 | + "# solve the problem that some benchmark scores are too high and out of range\n", |
| 56 | + "def log_normalize(raw_data, labels):\n", |
| 57 | + " data_list = cp.deepcopy(raw_data)\n", |
| 58 | + " minimum, maximum, max_range, range_map = {}, {}, 0, {}\n", |
| 59 | + " for lb in labels:\n", |
| 60 | + " minimum[lb] = min([np.log(x[lb]) for x in data_list])\n", |
| 61 | + " maximum[lb] = max([np.log(x[lb]) for x in data_list])\n", |
| 62 | + " max_range = max(max_range, maximum[lb] - minimum[lb])\n", |
| 63 | + " max_range *= 1.005\n", |
| 64 | + " for lb in labels:\n", |
| 65 | + " mid = (minimum[lb] + maximum[lb]) / 2\n", |
| 66 | + " new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n", |
| 67 | + " range_map[lb] = new_range\n", |
| 68 | + " for item in data_list:\n", |
| 69 | + " assert new_range[0] <= np.log(item[lb]) <= new_range[1]\n", |
| 70 | + " item[lb] = (np.log(item[lb]) - new_range[0]) / max_range * 100\n", |
53 | 71 | " return data_list, range_map" |
54 | 72 | ] |
55 | 73 | }, |
|
64 | 82 | "models = list(data)\n", |
65 | 83 | "print(models)\n", |
66 | 84 | "\n", |
| 85 | + "# model2vis = [\n", |
| 86 | + "# 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n", |
| 87 | + "# 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", |
| 88 | + "# 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n", |
| 89 | + "# ]\n", |
| 90 | + "\n", |
67 | 91 | "model2vis = [\n", |
68 | | - " 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n", |
69 | | - " 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", |
| 92 | + " # 'GPT-4v (detail: low)', 'GeminiProVision', 'InternLM-XComposer2-VL', \n", |
| 93 | + " 'GPT-4v (1106, detail-low)', 'Gemini-1.0-Pro', 'Gemini-1.5-Pro', #'Gemini-1.5-Flash', 'Qwen-VL-Plus', \n", |
| 94 | + " 'InternLM-XComposer2', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", |
70 | 95 | " 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n", |
71 | 96 | "]\n", |
| 97 | + "\n", |
72 | 98 | "colors = [\n", |
73 | 99 | " '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', \n", |
74 | 100 | " '#e377c2', '#7f7f7f', '#bcbd22'\n", |
|
81 | 107 | "metadata": {}, |
82 | 108 | "outputs": [], |
83 | 109 | "source": [ |
| 110 | + "from collections import defaultdict\n", |
| 111 | + "\n", |
84 | 112 | "split = 'MMBench_TEST_EN'\n", |
85 | | - "data_sub = {k: v[split] for k, v in data.items()}\n", |
| 113 | + "# data_sub = {k: v[split] for k, v in data.items()}\n", |
| 114 | + "data_sub = {k: defaultdict(int, v)[split] for k, v in data.items()}\n", |
| 115 | + "# solve the problem that some models lack the evaluation of MMBench_TEST_EN\n", |
86 | 116 | "\n", |
87 | 117 | "labels = list(data_sub[model2vis[0]])\n", |
88 | 118 | "labels.remove('Overall')\n", |
|
0 commit comments