torchforge/tests/unit_tests/test_config.py at a168b355097e7b88235679089807090c36f753ac · meta-pytorch/torchforge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from unittest.mock import patch

import pytest

from forge.util.config import resolve_hf_hub_paths
from omegaconf import DictConfig, OmegaConf


# Core functionality tests
@pytest.mark.parametrize(
    "config_data,expected_calls",
    [
        # A single top-level hf:// value
        ({"model": "hf://meta-llama/Llama-2-7b-hf"}, [("meta-llama/Llama-2-7b-hf",)]),
        # hf:// values mixed with plain values at different nesting depths
        (
            {
                "model": {"pretrained": "hf://meta-llama/Llama-2-7b-hf"},
                "tokenizer": "hf://microsoft/DialoGPT-medium",
                "training": {"epochs": 10},
            },
            [("meta-llama/Llama-2-7b-hf",), ("microsoft/DialoGPT-medium",)],
        ),
        # hf:// values inside list and tuple containers, next to local paths
        (
            {
                "models": ["hf://model1", "local/path", "hf://model2"],
                "tuple_data": ("hf://model3", "another/local/path"),
            },
            [("model1",), ("model2",), ("model3",)],
        ),
        # hf:// value several mapping levels down
        ({"level1": {"level2": {"model": "hf://deep/model"}}}, [("deep/model",)]),
    ],
)
@patch("forge.util.config.snapshot_download")
def test_hf_path_resolution(mock_download, config_data, expected_calls):
    """Check that each hf:// entry produces one ``snapshot_download`` call.

    Every case pairs a raw config with the positional-argument tuples the
    patched ``snapshot_download`` is expected to receive.
    """
    mock_download.return_value = "/fake/cache/model"

    resolved = resolve_hf_hub_paths(OmegaConf.create(config_data))

    # The number of download calls must match the number of expected repos.
    assert mock_download.call_count == len(expected_calls)

    # Each expected repo must have been requested with revision="main" and
    # local_files_only=True.
    for positional_args in expected_calls:
        mock_download.assert_any_call(
            *positional_args, revision="main", local_files_only=True
        )

    # The resolver hands back an OmegaConf mapping.
    assert isinstance(resolved, DictConfig)


@pytest.mark.parametrize(
    "config_data",
    [
        {"model": "local/path/to/model"},
        {"model": "/absolute/path", "tokenizer": "relative/path"},
        {"model": "https://example.com/model"},
        {"models": ["local1", "local2"], "other": "value"},
        {},  # No keys at all
    ],
)
def test_non_hf_paths_unchanged(config_data):
    """Configs without any hf:// value must come back with the same content."""
    resolved = resolve_hf_hub_paths(OmegaConf.create(config_data))

    # Compare as plain containers so the check is against the raw input data.
    as_plain_data = OmegaConf.to_container(resolved)
    assert as_plain_data == config_data


# Cache behavior tests
@patch("forge.util.config.snapshot_download")
def test_cache_hit_scenario(mock_download):
    """A successful first lookup yields the returned path after a single call."""
    cached_path = "/fake/cache/model"
    mock_download.return_value = cached_path

    result = resolve_hf_hub_paths(OmegaConf.create({"model": "hf://test/model"}))

    # Exactly one call, made with local_files_only=True.
    mock_download.assert_called_once_with(
        "test/model", revision="main", local_files_only=True
    )
    assert result.model == cached_path


@patch("forge.util.config.snapshot_download")
def test_cache_miss_scenario(mock_download):
    """Test the fallback path when the model is not in the local cache.

    The patched ``snapshot_download`` raises ``LocalEntryNotFoundError`` on
    its first call and returns a path on its second. The resolver is expected
    to try ``local_files_only=True`` first, then retry with
    ``local_files_only=False``, and to place the returned path in the config.
    """
    from huggingface_hub.utils import LocalEntryNotFoundError

    # First call fails (cache miss), second succeeds (download)
    mock_download.side_effect = [
        LocalEntryNotFoundError("Not in cache"),
        "/fake/cache/model",
    ]

    config = OmegaConf.create({"model": "hf://test/model"})
    result = resolve_hf_hub_paths(config)

    # Should call twice: first with local_files_only=True, then False
    assert mock_download.call_count == 2
    mock_download.assert_any_call("test/model", revision="main", local_files_only=True)
    mock_download.assert_any_call("test/model", revision="main", local_files_only=False)

    # assert_any_call ignores ordering, so pin the sequence explicitly: a
    # resolver that tried the network before the cache must not pass here.
    attempts = [c.kwargs["local_files_only"] for c in mock_download.call_args_list]
    assert attempts == [True, False]

    assert result.model == "/fake/cache/model"


# Error handling tests
@pytest.mark.parametrize(
    "invalid_input,expected_error",
    [
        (None, "Configuration cannot be None"),
        ({"model": "hf://test"}, "Input must be an OmegaConf config object"),
    ],
)
def test_input_validation(invalid_input, expected_error):
    """Non-config inputs (None, a plain dict) must be rejected with ValueError."""
    with pytest.raises(ValueError) as raised:
        resolve_hf_hub_paths(invalid_input)

    # The error text must mention the specific validation failure.
    message = str(raised.value)
    assert expected_error in message


@pytest.mark.parametrize(
    "invalid_hf_url,expected_error",
    [
        ("hf://", "Empty repository name"),  # Empty repo name
        ("hf:///invalid", "Failed to resolve HuggingFace model"),  # Invalid repo format
    ],
)
def test_invalid_hf_urls(invalid_hf_url, expected_error):
    """Test that malformed hf:// URLs raise an error with the expected message.

    NOTE(review): ``snapshot_download`` is not patched in this test, so the
    second case presumably relies on the real huggingface_hub rejecting the
    repo id -- confirm it cannot reach the network.
    """
    config = OmegaConf.create({"model": invalid_hf_url})

    # ValueError is a subclass of Exception, so catching Exception alone
    # covers exactly what the previous (ValueError, Exception) tuple covered.
    with pytest.raises(Exception) as exc_info:
        resolve_hf_hub_paths(config)
    assert expected_error in str(exc_info.value)


@patch("forge.util.config.snapshot_download")
def test_download_failure_handling(mock_download):
    """A failing download must surface an error naming the repo and the cause."""
    mock_download.side_effect = Exception("Network error: Repository not found")

    with pytest.raises(Exception) as raised:
        resolve_hf_hub_paths(OmegaConf.create({"model": "hf://invalid/repo"}))

    # The message must carry both the resolver's context and the original error.
    message = str(raised.value)
    assert "Failed to resolve HuggingFace model 'invalid/repo'" in message
    assert "Network error" in message


# Integration test with mixed data types
@patch("forge.util.config.snapshot_download")
def test_complex_real_world_config(mock_download):
    """Resolve a mixed config holding hf:// paths, scalars, and a None value."""
    fake_path = "/fake/cache/model"
    mock_download.return_value = fake_path

    model_section = {
        "pretrained_model": "hf://meta-llama/Llama-2-7b-hf",
        "lora_rank": 64,
        "use_cache": True,
    }
    training_section = {"batch_size": 32, "learning_rate": 0.0001, "epochs": 10}
    raw_config = {
        "model": model_section,
        "tokenizer": "hf://meta-llama/Llama-2-7b-hf",  # Same repo as the model
        "training": training_section,
        "output_dir": "/local/output",
        "resume_from": None,
    }

    result = resolve_hf_hub_paths(OmegaConf.create(raw_config))

    # The repo appears twice in the config, and two download calls are expected.
    assert mock_download.call_count == 2
    mock_download.assert_any_call(
        "meta-llama/Llama-2-7b-hf", revision="main", local_files_only=True
    )

    # Both hf:// entries now hold the mocked path.
    assert result.model.pretrained_model == fake_path
    assert result.tokenizer == fake_path

    # Everything that was not an hf:// string keeps its value.
    assert result.model.lora_rank == 64
    assert result.training.batch_size == 32
    assert result.output_dir == "/local/output"
    assert result.resume_from is None