=================================== FAILURES ===================================
___________________ TestSeed.test_multi_worker_deterministic ___________________

self = <test_seed.TestSeed object at 0x7f1b971e14b0>
distributed_environment = <function DistributedEnvironment.bind.<locals>.init at 0x7f1b96fd0160>

    def test_multi_worker_deterministic(self, distributed_environment):
        states = distributed_environment(4).start(seed, 42)
        assert [s['seed'] for s in states] == [42, 42, 42, 42]

        # workers should have different states
        assert all((s['torch_state'] != states[0]['torch_state']).any() for s in states[1:])
        assert all((s['numpy_state'] != states[0]['numpy_state']).any() for s in states[1:])
        assert all((s['random_state'] != states[0]['random_state']).any() for s in states[1:])

        # same seed should yield same states
        new_states = distributed_environment(4).start(seed, 42)
        assert [s['seed'] for s in new_states] == [42, 42, 42, 42]
        assert all((s1['torch_state'] == s2['torch_state']).all() for s1, s2 in zip(states, new_states))
        assert all((s1['numpy_state'] == s2['numpy_state']).all() for s1, s2 in zip(states, new_states))
        assert all((s1['random_state'] == s2['random_state']).all() for s1, s2 in zip(states, new_states))

        # different seed should yield different states
>       new_states = distributed_environment(4).start(seed, 11)

test/test_seed.py:84:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/conftest.py:63: in start
    ret = conn.recv()
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:250: in recv
    buf = self._recv_bytes()
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:414: in _recv_bytes
    buf = self._recv(4)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <multiprocessing.connection.Connection object at 0x7f1b971db640>
size = 4, read = <built-in function read>

    def _recv(self, size, read=_read):
        buf = io.BytesIO()
        handle = self._handle
        remaining = size
        while remaining > 0:
            chunk = read(handle, remaining)
            n = len(chunk)
            if n == 0:
                if remaining == size:
>                   raise EOFError
E                   EOFError

/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:383: EOFError
----------------------------- Captured stderr call -----------------------------
[E108 14:52:04.084461486 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
Process SpawnProcess-33:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 29, in _run
    torch.distributed.barrier()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4164, in barrier
    work.wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:525] Read error [10.1.0.114]:61849: Connection reset by peer
Process SpawnProcess-32:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
    torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
    func_return = func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
    default_pg, _ = _new_process_group_helper(
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
    backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
[E108 17:22:04.053240347 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
Process SpawnProcess-34:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
    torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
    func_return = func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
    default_pg, _ = _new_process_group_helper(
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
    backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
[E108 17:22:04.118509503 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
Process SpawnProcess-31:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
    torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
    func_return = func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
    default_pg, _ = _new_process_group_helper(
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
    backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
=============================== warnings summary ===============================
test/test_seed.py::TestSeed::test_single_worker_deterministic
  /home/runner/work/dmlcloud/dmlcloud/test/test_seed.py:23: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
    prev_torch_state = np.array(torch.get_rng_state())