-
Notifications
You must be signed in to change notification settings - Fork 111
Expand file tree
/
Copy pathdclm.py
More file actions
68 lines (56 loc) · 2.58 KB
/
dclm.py
File metadata and controls
68 lines (56 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0
import dataclasses
from levanter.data.text import TextLmDatasetFormat
from experiments.defaults import default_tokenize
from experiments.llama import llama3_tokenizer
from experiments.pretraining_datasets.simple import downloads, tokenized
from marin.execution.executor import executor_main
from marin.processing.tokenize import lm_mixture_data_config
# Sampling weights for the full DCLM pretraining blend, proportional to each
# dataset's size. Token counts quoted below are for the neox tokenizer.
DCLM_MIXTURE_WEIGHTS = dict(
    dclm_baseline=3.8,  # 3.8 trillion tokens https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0
    starcoderdata=0.25,  # 250 billion tokens https://huggingface.co/datasets/bigcode/starcoderdata
    proofpile_2=0.055,  # 55 billion tokens https://huggingface.co/datasets/EleutherAI/proof-pile-2
)
# Variant of the DCLM mixture that trains on the baseline corpus alone:
# the code/math components keep their entries but are zero-weighted.
DCLM_BASELINE_ONLY_MIXTURE = dict(
    dclm_baseline=3.8,  # 3.8 trillion tokens https://huggingface.co/datasets/mlfoundations/dclm-baseline-1.0
    starcoderdata=0,  # 250 billion tokens https://huggingface.co/datasets/bigcode/starcoderdata
    proofpile_2=0,  # 55 billion tokens https://huggingface.co/datasets/EleutherAI/proof-pile-2
)
# Map each mixture component to its pre-registered tokenized dataset step
# (the correct, post-20250211 data; see the *_wrong variants below for history).
dclm_components_llama3 = {
    component: tokenized[component]
    for component in ("dclm_baseline", "starcoderdata", "proofpile_2")
}
# Training data config that mixes the three components per DCLM_MIXTURE_WEIGHTS.
dclm_mixture_config_llama3 = lm_mixture_data_config(
    components=dclm_components_llama3,
    weights=DCLM_MIXTURE_WEIGHTS,
)
## NOTE: on 20250211, we discovered that the DCLM baseline data in us-central2 was corrupted/partial.
# These are preserved for reproducibility, but future runs should use the correct data.
# YOU SHOULD NOT USE THESE TOKENIZED DATASETS FOR TRAINING
# The corrupted baseline step is pinned (via dataclasses.replace) to its
# historical output path so the original bad run stays addressable.
_dclm_baseline_wrong_step = default_tokenize(
    name="dclm_baseline",
    dataset=downloads["dclm_baseline_wrong"],
    tokenizer=llama3_tokenizer,
)
dclm_components_llama3_wrong = {
    "dclm_baseline": dataclasses.replace(
        _dclm_baseline_wrong_step,
        override_output_path="gs://marin-us-central2/tokenized/dclm_baseline-0206f1_WRONG_20250211/",
    ),
    # starcoderdata keeps its text under the "content" key, not the default "text".
    "starcoderdata": default_tokenize(
        name="starcoderdata",
        dataset=downloads["starcoderdata"],
        tokenizer=llama3_tokenizer,
        format=TextLmDatasetFormat(text_key="content"),
    ),
    "proofpile_2": default_tokenize(
        name="proofpile_2",
        dataset=downloads["proofpile_2"],
        tokenizer=llama3_tokenizer,
    ),
}
# Historical mixture config built from the corrupted tokenized data above;
# kept only so past runs remain reproducible — do not use for new training.
dclm_mixture_config_llama3_wrong = lm_mixture_data_config(
    weights=DCLM_MIXTURE_WEIGHTS, components=dclm_components_llama3_wrong
)
if __name__ == "__main__":
executor_main(steps=list(dclm_components_llama3.values()))