Commit 3115669
[SPARK-54925][PYTHON] Add the capability to dump threads for pyspark
### What changes were proposed in this pull request?
Add an optional capability to dump thread info of *all* pyspark processes. It is intentionally hidden now because it's not fully polished. It can be used as `python -m pyspark.threaddump -p <pid>`. It requires `pystack` and `psutil`. Without these libraries the command will fail.
For now it was only used when test hangs. The result would be like:
```
Thread dump:
Dumping threads for process 1904
Traceback for thread 2175 (python3.12) [] (most recent call last):
(Python) File "/usr/lib/python3.12/threading.py", line 1032, in _bootstrap
self._bootstrap_inner()
(Python) File "/usr/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
self.run()
(Python) File "/usr/lib/python3.12/threading.py", line 1012, in run
self._target(*self._args, **self._kwargs)
(Python) File "/usr/lib/python3.12/socketserver.py", line 240, in serve_forever
self._handle_request_noblock()
(Python) File "/usr/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
self.process_request(request, client_address)
(Python) File "/usr/lib/python3.12/socketserver.py", line 349, in process_request
self.finish_request(request, client_address)
(Python) File "/usr/lib/python3.12/socketserver.py", line 362, in finish_request
self.RequestHandlerClass(request, client_address, self)
(Python) File "/usr/lib/python3.12/socketserver.py", line 766, in __init__
self.handle()
(Python) File "/workspaces/spark/python/pyspark/accumulators.py", line 327, in handle
poll(accum_updates)
(Python) File "/workspaces/spark/python/pyspark/accumulators.py", line 281, in poll
for fd, event in poller.poll(1000):
Traceback for thread 2034 (python3.12) [] (most recent call last):
(Python) File "/usr/lib/python3.12/threading.py", line 1032, in _bootstrap
self._bootstrap_inner()
(Python) File "/usr/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
self.run()
(Python) File "/workspaces/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/clientserver.py", line 58, in run
Traceback for thread 1904 (python3.12) [] (most recent call last):
(Python) File "<frozen runpy>", line 198, in _run_module_as_main
(Python) File "<frozen runpy>", line 88, in _run_code
(Python) File "/workspaces/spark/python/pyspark/sql/tests/test_udf.py", line 1790, in <module>
unittest.main(testRunner=testRunner, verbosity=2)
(Python) File "/workspaces/spark/python/pyspark/testing/__init__.py", line 30, in unittest_main
res = _unittest_main(*args, **kwargs)
(Python) File "/usr/lib/python3.12/unittest/main.py", line 105, in __init__
self.runTests()
(Python) File "/usr/lib/python3.12/unittest/main.py", line 281, in runTests
self.result = testRunner.run(self.test)
(Python) File "/usr/local/lib/python3.12/dist-packages/xmlrunner/runner.py", line 67, in run
test(result)
(Python) File "/usr/lib/python3.12/unittest/suite.py", line 84, in __call__
return self.run(*args, **kwds)
(Python) File "/usr/lib/python3.12/unittest/suite.py", line 122, in run
test(result)
(Python) File "/usr/lib/python3.12/unittest/suite.py", line 84, in __call__
return self.run(*args, **kwds)
(Python) File "/usr/lib/python3.12/unittest/suite.py", line 122, in run
test(result)
(Python) File "/usr/lib/python3.12/unittest/case.py", line 690, in __call__
return self.run(*args, **kwds)
(Python) File "/usr/lib/python3.12/unittest/case.py", line 634, in run
self._callTestMethod(testMethod)
(Python) File "/usr/lib/python3.12/unittest/case.py", line 589, in _callTestMethod
if method() is not None:
(Python) File "/workspaces/spark/python/pyspark/sql/tests/test_udf.py", line 212, in test_chained_udf
[row] = self.spark.sql("SELECT double_int(double_int(1) + 1)").collect()
(Python) File "/workspaces/spark/python/pyspark/sql/classic/dataframe.py", line 469, in collect
sock_info = self._jdf.collectToPython()
(Python) File "/workspaces/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/java_gateway.py", line 1361, in __call__
(Python) File "/workspaces/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/java_gateway.py", line 1038, in send_command
(Python) File "/workspaces/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/clientserver.py", line 535, in send_command
(Python) File "/usr/lib/python3.12/socket.py", line 720, in readinto
return self._sock.recv_into(b)
Dumping threads for process 2191
Traceback for thread 2191 (python3.12) [] (most recent call last):
(Python) File "<frozen runpy>", line 198, in _run_module_as_main
(Python) File "<frozen runpy>", line 88, in _run_code
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 287, in <module>
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 180, in manager
Dumping threads for process 2198
Traceback for thread 2198 (python3.12) [] (most recent call last):
(Python) File "<frozen runpy>", line 198, in _run_module_as_main
(Python) File "<frozen runpy>", line 88, in _run_code
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 287, in <module>
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 259, in manager
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 88, in worker
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/util.py", line 981, in wrapper
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/worker.py", line 3439, in main
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 583, in read_int
Dumping threads for process 2257
Traceback for thread 2257 (python3.12) [] (most recent call last):
(Python) File "<frozen runpy>", line 198, in _run_module_as_main
(Python) File "<frozen runpy>", line 88, in _run_code
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 287, in <module>
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 259, in manager
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 88, in worker
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/util.py", line 981, in wrapper
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/worker.py", line 3439, in main
(Python) File "/workspaces/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 583, in read_int
```
Notice that it has not only the driver process, but also daemon and worker process. The plan is to incorporate this into our existing debug framework so `threaddump` button will return both JVM executor threads and python worker threads.
### Why are the changes needed?
We need insights into python worker/daemon.
We have some thread dump capability in our test, but that's not stable. `SIGTERM` sometimes is hooked and `faulthandler` can't work properly. Also it can't dump the subprocesses.
### Does this PR introduce _any_ user-facing change?
Yes, but it's hidden for now. A new command entry is introduced.
### How was this patch tested?
Locally it works.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #53705 from gaogaotiantian/python-threaddump.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>1 parent 3dc641b commit 3115669
File tree
4 files changed
+79
-2
lines changed- dev
- spark-test-image/python-311
- python
- pyspark
4 files changed
+79
-2
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
77 | 77 | | |
78 | 78 | | |
79 | 79 | | |
| 80 | + | |
| 81 | + | |
80 | 82 | | |
81 | 83 | | |
82 | 84 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
68 | 68 | | |
69 | 69 | | |
70 | 70 | | |
71 | | - | |
| 71 | + | |
72 | 72 | | |
73 | 73 | | |
74 | 74 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
| 1 | + | |
| 2 | + | |
| 3 | + | |
| 4 | + | |
| 5 | + | |
| 6 | + | |
| 7 | + | |
| 8 | + | |
| 9 | + | |
| 10 | + | |
| 11 | + | |
| 12 | + | |
| 13 | + | |
| 14 | + | |
| 15 | + | |
| 16 | + | |
| 17 | + | |
| 18 | + | |
| 19 | + | |
| 20 | + | |
| 21 | + | |
| 22 | + | |
| 23 | + | |
| 24 | + | |
| 25 | + | |
| 26 | + | |
| 27 | + | |
| 28 | + | |
| 29 | + | |
| 30 | + | |
| 31 | + | |
| 32 | + | |
| 33 | + | |
| 34 | + | |
| 35 | + | |
| 36 | + | |
| 37 | + | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | + | |
| 42 | + | |
| 43 | + | |
| 44 | + | |
| 45 | + | |
| 46 | + | |
| 47 | + | |
| 48 | + | |
| 49 | + | |
| 50 | + | |
| 51 | + | |
| 52 | + | |
| 53 | + | |
| 54 | + | |
| 55 | + | |
| 56 | + | |
| 57 | + | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| 61 | + | |
| 62 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
113 | 113 | | |
114 | 114 | | |
115 | 115 | | |
116 | | - | |
| 116 | + | |
117 | 117 | | |
118 | 118 | | |
119 | 119 | | |
| |||
204 | 204 | | |
205 | 205 | | |
206 | 206 | | |
| 207 | + | |
| 208 | + | |
207 | 209 | | |
208 | 210 | | |
209 | 211 | | |
| 212 | + | |
| 213 | + | |
| 214 | + | |
| 215 | + | |
| 216 | + | |
| 217 | + | |
| 218 | + | |
| 219 | + | |
| 220 | + | |
| 221 | + | |
| 222 | + | |
210 | 223 | | |
211 | 224 | | |
212 | 225 | | |
| |||
0 commit comments