perf: use multiprocessing to speed up recalculate_expanded_sizes.py

dsanders11 · dsanders11 · commit f7405d367456 · 2025-07-03T23:44:04.000-07:00
diff --git a/recalculate_expanded_sizes.py b/recalculate_expanded_sizes.py
@@ -3,20 +3,103 @@
 import argparse
 import csv
 import logging
+import math
+import multiprocessing
 import os
+import queue
 import sys
+import time
+from itertools import batched
 
 from common import FilteredIncludeChangeList, IgnoresConfiguration, IncludeChange
 from filter_include_changes import Change, filter_changes
 from list_transitive_includes import list_transitive_includes
 from include_analysis import IncludeAnalysisOutput, ParseError, parse_raw_include_analysis_output
 from typing import Callable, Dict, Iterator, List, Tuple
-from utils import load_config
+from utils import get_worker_count, load_config
 
 
-def recalculate_expanded_sizes(
+def recalculate_expanded_size(
+    include_analysis: IncludeAnalysisOutput,
+    filename: str,
+    changes: List[Change],
+    ignores: IgnoresConfiguration = None,
+    filter_generated_files=True,
+    filter_mojom_headers=True,
+    filter_third_party=False,
+    header_mappings: Dict[str, str] = None,
+    include_directories: List[str] = None,
+    remove_only=False,
+) -> Tuple[str, int]:  # returns (filename, recalculated expanded size)
+    # Recalculate all of the transitive includes for this file, with the given changes applied
+    includes = list_transitive_includes(
+        include_analysis,
+        filename,
+        metric="file_size",
+        changes=changes,
+        ignores=ignores,
+        filter_generated_files=filter_generated_files,
+        filter_mojom_headers=filter_mojom_headers,
+        filter_third_party=filter_third_party,
+        header_mappings=header_mappings,
+        include_directories=include_directories,
+        apply_changes=True,
+        full=True,
+        remove_only=remove_only,
+    )
+
+    # The expanded size is the sum of its include sizes (counted once per unique entry[1:3] pair) plus its own size
+    expanded_size = sum(map(lambda entry: entry[1], set(map(lambda entry: entry[1:3], includes))))
+    expanded_size += include_analysis["sizes"][filename]
+
+    if expanded_size > include_analysis["tsizes"][filename]:  # the recorded "tsizes" value is treated as an upper bound
+        logging.warning(
+            f"{filename} unexpectedly increased in size from {include_analysis["tsizes"][filename]} to {expanded_size} - ignoring"
+        )
+        expanded_size = include_analysis["tsizes"][filename]  # clamp instead of reporting a larger size
+
+    return (filename, expanded_size)
+
+
+def work_func(
+    filenames: List[str],
+    result_queue: multiprocessing.JoinableQueue,
     include_analysis: IncludeAnalysisOutput,
+    changes: List[Change],
+    ignores: IgnoresConfiguration = None,
+    filter_generated_files=True,
+    filter_mojom_headers=True,
+    filter_third_party=False,
+    header_mappings: Dict[str, str] = None,
+    include_directories: List[str] = None,
+    remove_only=False,
+):  # returns nothing; each result is delivered through result_queue
+    try:
+        for filename in filenames:
+            result = recalculate_expanded_size(
+                include_analysis,
+                filename,
+                changes,
+                ignores=ignores,
+                filter_generated_files=filter_generated_files,
+                filter_mojom_headers=filter_mojom_headers,
+                filter_third_party=filter_third_party,
+                header_mappings=header_mappings,
+                include_directories=include_directories,
+                remove_only=remove_only,
+            )
+
+            result_queue.put_nowait(result)  # one (filename, expanded_size) tuple per file
+
+        # Don't exit until the result queue has been fully consumed: join() blocks until task_done() was called per item
+        result_queue.join()
+    except KeyboardInterrupt:
+        pass  # Don't show the user anything
+
+
+def recalculate_expanded_sizes(
     filenames: List[str],
+    include_analysis: IncludeAnalysisOutput,
     changes: List[Change],
     progress_callback: Callable[[str], None] = None,
     ignores: IgnoresConfiguration = None,
@@ -27,6 +110,8 @@ def recalculate_expanded_sizes(
     include_directories: List[str] = None,
     remove_only=False,
 ) -> Iterator[Tuple[str, int]]:
+    result_queue: multiprocessing.JoinableQueue[Tuple[str, int]] = multiprocessing.JoinableQueue()
+
     include_changes = FilteredIncludeChangeList(
         filter_changes(
             changes,
@@ -38,38 +123,47 @@ def recalculate_expanded_sizes(
         )
     )
 
-    for filename in filenames:
-        # Recalculate all of the transitive includes for this file
-        includes = list_transitive_includes(
-            include_analysis,
-            filename,
-            metric="file_size",
-            changes=include_changes,
-            ignores=ignores,
-            filter_generated_files=filter_generated_files,
-            filter_mojom_headers=filter_mojom_headers,
-            filter_third_party=filter_third_party,
-            header_mappings=header_mappings,
-            include_directories=include_directories,
-            apply_changes=True,
-            full=True,
-            remove_only=remove_only,
-        )
+    worker_count = max(1, min(len(filenames), get_worker_count()))  # never 0, so the division below is safe
+    chunk_size = max(1, math.ceil(float(len(filenames)) / worker_count))  # batched() rejects a size below 1
 
-        # The expanded size for the file is all of its include sizes, and its own size
-        expanded_size = sum(map(lambda entry: entry[1], set(map(lambda entry: entry[1:3], includes))))
-        expanded_size += include_analysis["sizes"][filename]
+    chunked = list(batched(filenames, chunk_size))
 
-        if expanded_size > include_analysis["tsizes"][filename]:
-            logging.warning(
-                f"{filename} unexpectedly increased in size from {include_analysis["tsizes"][filename]} to {expanded_size} - ignoring"
-            )
-            expanded_size = include_analysis["tsizes"][filename]
+    workers = [
+        multiprocessing.Process(
+            target=work_func,
+            args=(chunk, result_queue, include_analysis, include_changes),
+            kwargs={
+                "ignores": ignores,
+                "filter_generated_files": filter_generated_files,
+                "filter_mojom_headers": filter_mojom_headers,
+                "filter_third_party": filter_third_party,
+                "header_mappings": header_mappings,
+                "include_directories": include_directories,
+                "remove_only": remove_only,
+            },
+        )
+        for chunk in chunked  # one worker per chunk; rounding up can leave fewer chunks than worker_count
+    ]
+
+    for worker in workers:
+        worker.start()
 
-        yield (filename, expanded_size)
+    done = False
 
-        if progress_callback:
+    while not done:
+        try:
+            (filename, expanded_size) = result_queue.get_nowait()
+            yield (filename, expanded_size)
-            progress_callback(filename)
+            if progress_callback:  # optional; defaults to None
+                progress_callback(filename)
+            result_queue.task_done()
+        except queue.Empty:
+            pass
+        # If all workers have exited, then there should be no more results
+        if any((worker.is_alive() for worker in workers)):
+            time.sleep(0.01)
+        else:
+            done = True
 
 
 def main():
@@ -141,8 +235,8 @@ def main():
             disable=len(filenames) == 1, total=len(filenames), unit="file"
         ) as progress_output:
             for row in recalculate_expanded_sizes(
-                include_analysis,
                 filenames,
+                include_analysis,
                 list(csv.reader(args.changes_file)),
                 progress_callback=lambda _: progress_output.update(),
                 ignores=ignores,
@@ -151,7 +245,7 @@ def main():
                 filter_third_party=args.filter_third_party,
                 header_mappings=config.headerMappings if config else None,
                 include_directories=config.includeDirs if config else None,
-                remove_only=args.remove_only
+                remove_only=args.remove_only,
             ):
                 csv_writer.writerow(row)