-
Notifications
You must be signed in to change notification settings - Fork 111
Expand file tree
/
Copy pathdclm.py
More file actions
68 lines (56 loc) · 2.58 KB
/
dclm.py
File metadata and controls
68 lines (56 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0
import dataclasses
from levanter.data.text import TextLmDatasetFormat
from experiments.defaults import default_tokenize
from experiments.llama import llama3_tokenizer
from experiments.pretraining_datasets.simple import downloads, tokenized
from marin.execution.executor import executor_main
from marin.processing.tokenize import lm_mixture_data_config
# Sampling weights for the full DCLM pretraining blend, proportional to each
# dataset's size. Token counts quoted below are for the neox tokenizer.
DCLM_MIXTURE_WEIGHTS = dict(
    dclm_baseline=3.8,  # 3.8 trillion tokens https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0
    starcoderdata=0.25,  # 250 billion tokens https://huggingface.co/datasets/bigcode/starcoderdata
    proofpile_2=0.055,  # 55 billion tokens https://huggingface.co/datasets/EleutherAI/proof-pile-2
)
# Variant of the DCLM mixture that trains on the baseline corpus alone:
# the code/math components keep their entries but are zero-weighted.
DCLM_BASELINE_ONLY_MIXTURE = dict(
    dclm_baseline=3.8,  # 3.8 trillion tokens https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0
    starcoderdata=0,  # 250 billion tokens https://huggingface.co/datasets/bigcode/starcoderdata
    proofpile_2=0,  # 55 billion tokens https://huggingface.co/datasets/EleutherAI/proof-pile-2
)
# Map each mixture component to its pre-registered tokenized dataset step
# (the correct, post-20250211 data; see the *_wrong variants below for history).
dclm_components_llama3 = {
    component: tokenized[component]
    for component in ("dclm_baseline", "starcoderdata", "proofpile_2")
}
# Training data config that mixes the three components per DCLM_MIXTURE_WEIGHTS.
dclm_mixture_config_llama3 = lm_mixture_data_config(
    components=dclm_components_llama3,
    weights=DCLM_MIXTURE_WEIGHTS,
)
## NOTE: on 20250211, we discovered that the DCLM baseline data in us-central2 was corrupted/partial.
# These are preserved for reproducibility, but future runs should use the correct data.
# YOU SHOULD NOT USE THESE TOKENIZED DATASETS FOR TRAINING
# The corrupted baseline step is pinned (via dataclasses.replace) to its
# historical output path so the original bad run stays addressable.
_dclm_baseline_wrong_step = default_tokenize(
    name="dclm_baseline",
    dataset=downloads["dclm_baseline_wrong"],
    tokenizer=llama3_tokenizer,
)
dclm_components_llama3_wrong = {
    "dclm_baseline": dataclasses.replace(
        _dclm_baseline_wrong_step,
        override_output_path="gs://marin-us-central2/tokenized/dclm_baseline-0206f1_WRONG_20250211/",
    ),
    # starcoderdata keeps its text under the "content" key, not the default "text".
    "starcoderdata": default_tokenize(
        name="starcoderdata",
        dataset=downloads["starcoderdata"],
        tokenizer=llama3_tokenizer,
        format=TextLmDatasetFormat(text_key="content"),
    ),
    "proofpile_2": default_tokenize(
        name="proofpile_2",
        dataset=downloads["proofpile_2"],
        tokenizer=llama3_tokenizer,
    ),
}
# Historical mixture config built from the corrupted tokenized data above;
# kept only so past runs remain reproducible — do not use for new training.
dclm_mixture_config_llama3_wrong = lm_mixture_data_config(
    weights=DCLM_MIXTURE_WEIGHTS, components=dclm_components_llama3_wrong
)
if __name__ == "__main__":
executor_main(steps=list(dclm_components_llama3.values()))