add generator offline throughput benchmark (#675) (#680)

JenniferWang · web-flow · commit 8fe87426da2c · 2025-12-29T09:58:41.000-05:00
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""TorchForge benchmarking utilities."""
diff --git a/benchmarks/generator/__init__.py b/benchmarks/generator/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Generator throughput benchmarking tools."""
diff --git a/benchmarks/generator/datasets.py b/benchmarks/generator/datasets.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Lightweight dataset utilities for generator throughput benchmarking.
+"""
+
+import random
+import uuid
+from dataclasses import dataclass
+
+from vllm import __version__ as vllm_version
+
+
+try:  # Feature-detect: string-comparing versions misorders "0.9.0" vs "0.13.0".
+    from vllm.tokenizers import TokenizerLike as Tokenizer
+except ImportError:  # vLLM builds where vllm.tokenizers.TokenizerLike is missing.
+    from vllm.transformers_utils.tokenizer import AnyTokenizer as Tokenizer
+
+
+@dataclass
+class BenchmarkRequest:
+    """Single prompt handed to the generator during a benchmark run.
+
+    Attributes:
+        prompt: Text of the prompt to generate from.
+        prompt_len, expected_output_len: Prompt / target output token counts.
+        request_id: Identifier string for this request.
+    """
+
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    request_id: str
+
+
+class RandomDataset:
+    """Benchmark dataset whose prompts are decoded from random token IDs.
+
+    Args:
+        tokenizer: Tokenizer providing ``vocab_size`` and ``decode``.
+        num_requests: How many benchmark requests ``generate`` produces.
+        input_len: Target prompt length, in sampled token IDs.
+        output_len: Target generation length, in tokens.
+        range_ratio: Relative spread applied to both lengths (0.0-1.0).
+            0.0 keeps lengths fixed; 0.2 samples within ±20% of the target.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        num_requests: int,
+        input_len: int,
+        output_len: int,
+        range_ratio: float = 0.0,
+    ):
+        self.tokenizer = tokenizer
+        self.num_requests = num_requests
+        self.input_len = input_len
+        self.output_len = output_len
+        self.range_ratio = range_ratio
+        self.vocab_size = tokenizer.vocab_size
+
+    def _sample_length(self, target_len: int) -> int:
+        """Sample a length around ``target_len`` using ``range_ratio``.
+
+        The lower bound is clamped to 1 (and the upper bound to the lower one)
+        so wide ratios or tiny targets never yield empty or negative lengths.
+        """
+        if self.range_ratio == 0.0:
+            return target_len
+        shortest = max(1, int(target_len * (1 - self.range_ratio)))
+        longest = max(shortest, int(target_len * (1 + self.range_ratio)))
+        return random.randint(shortest, longest)
+
+    def generate(self) -> list[BenchmarkRequest]:
+        """Build ``num_requests`` requests with randomly sampled prompts.
+
+        Returns:
+            List of BenchmarkRequest objects. ``prompt_len`` is the number of
+            token IDs sampled before decoding. NOTE(review): re-encoding the
+            decoded text may give a different count - confirm callers accept it.
+        """
+        requests = []
+        for i in range(self.num_requests):
+            # Draw order (lengths, then token IDs) is preserved for seeded runs.
+            prompt_len = self._sample_length(self.input_len)
+            output_len = self._sample_length(self.output_len)
+            token_ids = [
+                random.randint(0, self.vocab_size - 1) for _ in range(prompt_len)
+            ]
+            requests.append(
+                BenchmarkRequest(
+                    prompt=self.tokenizer.decode(token_ids),
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    request_id=f"random-{i}-{uuid.uuid4().hex[:8]}",
+                )
+            )
+        return requests
+
+
+class FixedDataset:
+    """Dataset that repeats one fixed prompt, for baseline measurements.
+
+    The prompt is encoded once at construction to record its token count.
+
+    Args:
+        tokenizer: Tokenizer whose ``encode`` measures the prompt length.
+        prompt: Prompt text shared by every request.
+        num_requests: How many copies of the prompt to emit.
+        output_len: Target generation length in tokens for each request.
+    """
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        prompt: str,
+        num_requests: int,
+        output_len: int,
+    ):
+        self.tokenizer = tokenizer
+        self.prompt = prompt
+        self.num_requests = num_requests
+        self.output_len = output_len
+        # Measured once here; every generated request reuses this count.
+        self.prompt_len = len(tokenizer.encode(prompt))
+
+    def generate(self) -> list[BenchmarkRequest]:
+        """Produce ``num_requests`` requests that all carry the fixed prompt.
+
+        Returns:
+            List of BenchmarkRequest objects; they differ only in
+            ``request_id``, which combines the index with a random hex suffix.
+        """
+        return [
+            BenchmarkRequest(
+                prompt=self.prompt,
+                prompt_len=self.prompt_len,
+                expected_output_len=self.output_len,
+                request_id=f"fixed-{idx}-{uuid.uuid4().hex[:8]}",
+            )
+            for idx in range(self.num_requests)
+        ]
diff --git a/benchmarks/generator/metrics.py b/benchmarks/generator/metrics.py
@@ -0,0 +1,168 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Metrics collection and reporting for generator throughput benchmarks.
+
+Based on vLLM's throughput benchmark metrics patterns.
+Reference: vllm/benchmarks/throughput.py (lines 762-809)
+"""
+
+import json
+from dataclasses import asdict, dataclass
+
+from forge.data_models.completion import Completion
+
+
+@dataclass
+class ThroughputMetrics:
+    """Aggregate results of one offline-inference throughput benchmark.
+    Reference: https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/throughput.py
+
+    Attributes:
+        elapsed_time: Wall-clock duration of the run, in seconds.
+        num_requests: Count of requests that were processed.
+        num_completions: Count of completions (requests * n samples).
+        total_prompt_tokens: Prompt tokens summed over the run.
+        total_output_tokens: Generated output tokens summed over the run.
+        total_tokens: Prompt tokens plus output tokens.
+        requests_per_second: Request throughput, in requests per second.
+        completions_per_second: Completion throughput, in completions per second.
+        tokens_per_second: Combined token throughput, in tokens per second.
+        output_tokens_per_second: Output token throughput, in tokens per second.
+        model: Model name used for reporting, if provided.
+        config: Benchmark configuration dict, if provided.
+    """
+
+    elapsed_time: float
+    num_requests: int
+    num_completions: int
+    total_prompt_tokens: int
+    total_output_tokens: int
+    total_tokens: int
+    requests_per_second: float
+    completions_per_second: float
+    tokens_per_second: float
+    output_tokens_per_second: float
+    model: str | None = None
+    config: dict | None = None
+
+
+def extract_token_counts(completions: list[list[Completion]]) -> tuple[int, int]:
+    """Sum prompt and output token counts over generator completions.
+
+    Prompt tokens are counted once per request (inner list), as in vLLM's
+    throughput benchmark; counting them once per sample would inflate token
+    throughput when n > 1. NOTE(review): assumes every completion in an inner
+    list shares the same prompt - confirm against Generator.generate().
+
+    Args:
+        completions: One list of Completion objects per Generator.generate()
+            call; prompt_ids / token_ids are 1-D tensors of shape (seq_len,).
+
+    Returns:
+        Tuple of (total_prompt_tokens, total_output_tokens).
+    """
+    total_prompt_tokens = total_output_tokens = 0
+    for completion_list in completions:
+        if completion_list:  # A request may have produced no completions.
+            total_prompt_tokens += completion_list[0].prompt_ids.shape[0]
+        total_output_tokens += sum(c.token_ids.shape[0] for c in completion_list)
+    return total_prompt_tokens, total_output_tokens
+
+
+def calculate_metrics(
+    completions: list[list[Completion]],
+    elapsed_time: float,
+    model: str | None = None,
+    config: dict | None = None,
+) -> ThroughputMetrics:
+    """Derive throughput metrics from collected completions and run time.
+
+    Every per-second rate is reported as 0.0 unless ``elapsed_time`` is
+    strictly positive, so a zero-length run never divides by zero.
+
+    Args:
+        completions: One list of Completion objects per Generator.generate()
+            call; the outer length is taken as the number of requests.
+        elapsed_time: Wall-clock duration of the run, in seconds.
+        model: Model name to attach to the result, if any.
+        config: Benchmark configuration to attach to the result, if any.
+
+    Returns:
+        ThroughputMetrics holding the raw counts and the derived rates.
+    """
+    timed = elapsed_time > 0
+    prompt_tokens, output_tokens = extract_token_counts(completions)
+    all_tokens = prompt_tokens + output_tokens
+    request_count = len(completions)
+    completion_count = sum(map(len, completions))
+    return ThroughputMetrics(
+        elapsed_time=elapsed_time,
+        num_requests=request_count,
+        num_completions=completion_count,
+        total_prompt_tokens=prompt_tokens,
+        total_output_tokens=output_tokens,
+        total_tokens=all_tokens,
+        requests_per_second=request_count / elapsed_time if timed else 0.0,
+        completions_per_second=completion_count / elapsed_time if timed else 0.0,
+        tokens_per_second=all_tokens / elapsed_time if timed else 0.0,
+        output_tokens_per_second=output_tokens / elapsed_time if timed else 0.0,
+        model=model,
+        config=config,
+    )
+
+
+def print_metrics(metrics: ThroughputMetrics) -> None:
+    """Print a formatted throughput report for ``metrics`` to stdout.
+
+    Args:
+        metrics: ThroughputMetrics whose counts and rates are reported; the
+            model line is emitted only when ``metrics.model`` is truthy.
+    """
+    heavy_rule, light_rule = "=" * 55, "-" * 55
+
+    def report_lines():
+        yield heavy_rule
+        yield "Throughput Benchmark Results".center(55)
+        yield heavy_rule
+        if metrics.model:
+            yield f"Model: {metrics.model}"
+        # Average completions per request; 0 when no requests were recorded.
+        per_request = 0
+        if metrics.num_requests > 0:
+            per_request = metrics.num_completions / metrics.num_requests
+        yield f"Requests: {metrics.num_requests}"
+        yield f"Completions: {metrics.num_completions} ({per_request:.1f} per request)"
+        yield f"Elapsed Time: {metrics.elapsed_time:.2f} seconds"
+        yield light_rule
+        yield f"Total Prompt Tokens: {metrics.total_prompt_tokens}"
+        yield f"Total Output Tokens: {metrics.total_output_tokens}"
+        yield f"Total Tokens: {metrics.total_tokens}"
+        yield light_rule
+        yield "Throughput:"
+        yield f"  Requests/sec: {metrics.requests_per_second:.2f}"
+        yield f"  Completions/sec: {metrics.completions_per_second:.2f}"
+        yield f"  Total Tokens/sec: {metrics.tokens_per_second:.2f}"
+        yield f"  Output Tokens/sec: {metrics.output_tokens_per_second:.2f}"
+        yield heavy_rule
+
+    for line in report_lines():
+        print(line)
+
+
+def save_metrics_json(metrics: ThroughputMetrics, output_path: str) -> None:
+    """Save metrics to ``output_path`` as indented JSON and print the path.
+
+    Serialization happens before the file is opened, so a failure leaves it intact.
+
+    Args:
+        metrics: ThroughputMetrics to save; ``config`` must be JSON-serializable.
+        output_path: Path to output JSON file (overwritten if it exists).
+    """
+    payload = json.dumps(asdict(metrics), indent=2)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(payload)
+    print(f"\nMetrics saved to: {output_path}")
diff --git a/benchmarks/generator/throughput.py b/benchmarks/generator/throughput.py