Commit 5637f59

Move back to models with protobuf
Signed-off-by: SimJeg <sjegou@nvidia.com>
1 parent 2060dfb commit 5637f59

File tree

5 files changed, +30 −25 lines changed


.github/workflows/test.yml

Lines changed: 7 additions & 4 deletions
@@ -10,15 +10,21 @@ jobs:
   test:
     runs-on: linux-amd64-gpu-l4-latest-1
     container:
-      image: nvcr.io/nvidia/pytorch:25.10-py3
+      image: nvidia/cuda:13.0.0-devel-ubuntu24.04
     steps:
       - uses: actions/checkout@v3
 
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.12
+
       - name: Verify environment
         run: |
           nvidia-smi
           nvcc --version
           python3 --version
+          echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
@@ -28,9 +34,6 @@ jobs:
         run: |
           uv sync --all-groups
           uv pip install torch==2.10
-        env:
-          UV_HTTP_TIMEOUT: 300
-
       - run: make test
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
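Since the job now starts from a bare nvidia/cuda devel image and installs Python and torch explicitly (rather than inheriting them from the NGC PyTorch container), a quick check along these lines, a sketch rather than part of the workflow, confirms the pinned torch build can actually see the GPU and the CUDA toolkit:

# Sanity-check sketch (not part of this commit): run inside the container after the
# install step to verify the torch==2.10 wheel pinned in the workflow sees the L4 GPU
# and the CUDA_HOME exported above.
import os

import torch

print(torch.__version__)            # expected: the 2.10 build pinned in the workflow
print(torch.cuda.is_available())    # expected: True on the linux-amd64-gpu-l4 runner
print(os.environ.get("CUDA_HOME"))  # /usr/local/cuda, set by the "Verify environment" step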

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ dependencies = [
     # transformers<4.54 is not supported due to refactoring of the transformers library.
     # transformers 4.54-4.55.2 are not compatible with kvpress due to flash attention bugs in transformers
     "transformers>=4.56,<5.0.0",
+    "sentencepiece>=0.2.0,<0.3",
+    "protobuf>=5.27.2,<6",
     "datasets>=2.21.0,<3",
     "pandas>=2.2.2,<3",
     "accelerate>=1.0.0,<2",

tests/fixtures.py

Lines changed: 7 additions & 7 deletions
@@ -14,38 +14,38 @@ def get_device():
 
 @pytest.fixture(scope="session")
 def unit_test_model():
-    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B").eval()
+    model = AutoModelForCausalLM.from_pretrained("MaxJeblick/llama2-0b-unit-test").eval()
     return model.to(get_device())
 
 
 @pytest.fixture(scope="session")
 def unit_test_model_output_attention():
     model = AutoModelForCausalLM.from_pretrained(
-        "Qwen/Qwen3-0.6B", attn_implementation="eager"
+        "MaxJeblick/llama2-0b-unit-test", attn_implementation="eager"
     ).eval()
     return model.to(get_device())
 
 
 @pytest.fixture(scope="session")
-def qwen3_600m_model():
-    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B").eval()
+def danube_500m_model():
+    model = AutoModelForCausalLM.from_pretrained("h2oai/h2o-danube3-500m-chat").eval()
     return model.to(get_device())
 
 
 @pytest.fixture(scope="session")
 def kv_press_unit_test_pipeline():
     return pipeline(
         "kv-press-text-generation",
-        model="Qwen/Qwen3-0.6B",
+        model="maxjeblick/llama2-0b-unit-test",
         device=get_device(),
     )
 
 
 @pytest.fixture(scope="session")
-def kv_press_qwen3_600m_pipeline():
+def kv_press_danube_pipeline():
     return pipeline(
         "kv-press-text-generation",
-        model="Qwen/Qwen3-0.6B",
+        model="h2oai/h2o-danube3-500m-chat",
         device=get_device(),
     )
 
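For reference, a sketch (not from the repository) of how the renamed session-scoped fixtures are consumed; the import-plus-noqa pattern mirrors tests/test_pipeline.py below, and calling the pipeline without a press assumes press is optional, as test_pipeline_no_press_works suggests:

# Hypothetical test: importing the fixture makes it visible to pytest, which then
# injects the pipeline by argument name (fixture name taken from the diff above).
from tests.fixtures import kv_press_danube_pipeline  # noqa: F401


def test_danube_pipeline_smoke(kv_press_danube_pipeline):  # noqa: F811
    context = "This is a test article. It was written on 2022-01-01."
    answers = kv_press_danube_pipeline(context, questions=["When was this article written?"])["answers"]
    assert len(answers) == 1 and isinstance(answers[0], str)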

tests/test_decoding_compression.py

Lines changed: 7 additions & 7 deletions
@@ -31,7 +31,7 @@ def test_decoding_compression(token_buffer_size):
     """Test that DecodingPress compresses the cache during decoding."""
 
     # Initialize pipeline with a small model
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     # Create a DecodingPress with KnormPress
     press = DecodingPress(
@@ -65,7 +65,7 @@ def test_prefill_decoding_press_calls_both_phases():
     """Test that PrefillDecodingPress calls both prefilling and decoding presses."""
 
     # Initialize pipeline
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     # Create PrefillDecodingPress with both presses
     combined_press = PrefillDecodingPress(
@@ -99,7 +99,7 @@ def test_decoding_press_without_prefill():
     """Test that DecodingPress works correctly when used standalone (no prefill compression)."""
 
     # Initialize pipeline
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     # Create DecodingPress only
     decoding_press = DecodingPress(base_press=KnormPress(compression_ratio=0.4), compression_interval=5, target_size=64)
@@ -129,7 +129,7 @@ def test_prefill_decoding_press_decoding_only():
     """Test PrefillDecodingPress with only decoding press (no prefill compression)."""
 
     # Initialize pipeline
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     # Create PrefillDecodingPress with only decoding press
     combined_press = PrefillDecodingPress(
@@ -167,7 +167,7 @@ def test_decoding_press_equivalence():
     torch.manual_seed(42)
 
     # Initialize pipeline
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     # Create standalone decoding press
     decoding_press = DecodingPress(base_press=KnormPress(compression_ratio=0.5), compression_interval=3, target_size=52)
@@ -222,7 +222,7 @@ def test_all_presses_work_with_decoding_press(press_config):
     """Test that all default presses work as base presses for DecodingPress."""
 
     # Initialize pipeline
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     # Get press class and use the first (easier) configuration
     press_cls = press_config["cls"]
@@ -274,7 +274,7 @@ def test_all_presses_work_with_decoding_press(press_config):
 def test_compression_actually_reduces_memory():
     """Test that compression actually reduces memory usage compared to no compression."""
 
-    pipe = pipeline("kv-press-text-generation", model="Qwen/Qwen3-0.6B", device_map="auto")
+    pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
 
     context = "The quick brown fox jumps over the lazy dog. " * 15  # Long context
     question = "What animal jumps over the dog?"
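Every test above builds the same pipeline on the tiny unit-test model and wraps a base press in DecodingPress; condensed into a standalone sketch below, with argument values copied from the diff (the top-level import path for the presses is an assumption, since the diff only shows call sites):

# Condensed sketch of the setup these tests repeat; importing kvpress is assumed
# to register the "kv-press-text-generation" task with transformers.
from transformers import pipeline

from kvpress import DecodingPress, KnormPress

pipe = pipeline("kv-press-text-generation", model="MaxJeblick/llama2-0b-unit-test", device_map="auto")
press = DecodingPress(base_press=KnormPress(compression_ratio=0.4), compression_interval=5, target_size=64)

context = "The quick brown fox jumps over the lazy dog. " * 15  # long context, as in the memory test
answers = pipe(context, questions=["What animal jumps over the dog?"], press=press)["answers"]
print(answers[0])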

tests/test_pipeline.py

Lines changed: 7 additions & 7 deletions
@@ -11,8 +11,8 @@
 
 from kvpress import ExpectedAttentionPress
 from kvpress.pipeline import KVPressTextGenerationPipeline
-from tests.fixtures import qwen3_600m_model  # noqa: F401
-from tests.fixtures import kv_press_qwen3_600m_pipeline  # noqa: F401
+from tests.fixtures import danube_500m_model  # noqa: F401
+from tests.fixtures import kv_press_danube_pipeline  # noqa: F401
 from tests.fixtures import unit_test_model  # noqa: F401
 from tests.fixtures import kv_press_llama3_2_flash_attn_pipeline, kv_press_unit_test_pipeline  # noqa: F401
 
@@ -94,9 +94,9 @@ def test_pipeline_no_press_works(kv_press_unit_test_pipeline, caplog):  # noqa:
     kv_press_unit_test_pipeline(context, question=question)
 
 
-def test_pipeline_answer_is_correct(qwen3_600m_model, caplog):  # noqa: F811
+def test_pipeline_answer_is_correct(danube_500m_model, caplog):  # noqa: F811
     with caplog.at_level(logging.DEBUG):
-        answers = generate_answer(qwen3_600m_model)
+        answers = generate_answer(danube_500m_model)
 
     for answer in answers:
         assert answer == "This article was written on January 1, 2022."
@@ -107,13 +107,13 @@ def test_pipeline_answer_is_correct(qwen3_600m_model, caplog):  # noqa: F811
 
 
 @pytest.mark.skipif(not is_optimum_quanto_available(), reason="Optimum Quanto is not available")
-def test_pipeline_with_quantized_cache(kv_press_qwen3_600m_pipeline, caplog):  # noqa: F811
+def test_pipeline_with_quantized_cache(kv_press_danube_pipeline, caplog):  # noqa: F811
     with caplog.at_level(logging.DEBUG):
         context = "This is a test article. It was written on 2022-01-01."
         questions = ["When was this article written?"]
         press = ExpectedAttentionPress(compression_ratio=0.4)
-        cache = QuantoQuantizedCache(config=kv_press_qwen3_600m_pipeline.model.config, nbits=4)
-        answers = kv_press_qwen3_600m_pipeline(context, questions=questions, press=press, cache=cache)["answers"]
+        cache = QuantoQuantizedCache(config=kv_press_danube_pipeline.model.config, nbits=4)
+        answers = kv_press_danube_pipeline(context, questions=questions, press=press, cache=cache)["answers"]
 
     assert len(answers) == 1
     assert isinstance(answers[0], str)
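The quantized-cache path exercised above, pulled out of pytest into a standalone sketch (nbits=4, the config= keyword, and the call signature come from the diff; the import location of QuantoQuantizedCache and the need for optimum-quanto are assumptions):

# Standalone sketch of the quantized-cache test path; requires optimum-quanto
# (the test is skipped without it) and assumes QuantoQuantizedCache is exported
# at the transformers top level.
from transformers import QuantoQuantizedCache, pipeline

from kvpress import ExpectedAttentionPress

pipe = pipeline("kv-press-text-generation", model="h2oai/h2o-danube3-500m-chat", device_map="auto")
press = ExpectedAttentionPress(compression_ratio=0.4)
cache = QuantoQuantizedCache(config=pipe.model.config, nbits=4)
answers = pipe(
    "This is a test article. It was written on 2022-01-01.",
    questions=["When was this article written?"],
    press=press,
    cache=cache,
)["answers"]
assert len(answers) == 1 and isinstance(answers[0], str)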
