Random test failure #47

Open · sehoffmann (Owner) opened this issue Jan 13, 2025 · 0 comments
Labels: bug (Something isn't working)

=================================== FAILURES ===================================
___________________ TestSeed.test_multi_worker_deterministic ___________________

self = <test_seed.TestSeed object at 0x7f1b971e14b0>
distributed_environment = <function DistributedEnvironment.bind.<locals>.init at 0x7f1b96fd0160>

    def test_multi_worker_deterministic(self, distributed_environment):
        states = distributed_environment(4).start(seed, 42)
        assert [s['seed'] for s in states] == [42, 42, 42, 42]
    
        # workers should have different states
        assert all((s['torch_state'] != states[0]['torch_state']).any() for s in states[1:])
        assert all((s['numpy_state'] != states[0]['numpy_state']).any() for s in states[1:])
        assert all((s['random_state'] != states[0]['random_state']).any() for s in states[1:])
    
        # same seed should yield same states
        new_states = distributed_environment(4).start(seed, 42)
        assert [s['seed'] for s in new_states] == [42, 42, 42, 42]
        assert all((s1['torch_state'] == s2['torch_state']).all() for s1, s2 in zip(states, new_states))
        assert all((s1['numpy_state'] == s2['numpy_state']).all() for s1, s2 in zip(states, new_states))
        assert all((s1['random_state'] == s2['random_state']).all() for s1, s2 in zip(states, new_states))
    
        # different seed should yield different states
>       new_states = distributed_environment(4).start(seed, 11)

test/test_seed.py:84: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
test/conftest.py:63: in start
    ret = conn.recv()
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:250: in recv
    buf = self._recv_bytes()
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:414: in _recv_bytes
    buf = self._recv(4)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <multiprocessing.connection.Connection object at 0x7f1b971db640>
size = 4, read = <built-in function read>

    def _recv(self, size, read=_read):
        buf = io.BytesIO()
        handle = self._handle
        remaining = size
        while remaining > 0:
            chunk = read(handle, remaining)
            n = len(chunk)
            if n == 0:
                if remaining == size:
>                   raise EOFError
E                   EOFError

/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:383: EOFError
----------------------------- Captured stderr call -----------------------------
[E108 14:52:04.084461486 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
Process SpawnProcess-33:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 29, in _run
    torch.distributed.barrier()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4164, in barrier
    work.wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:525] Read error [10.1.0.114]:61849: Connection reset by peer
Process SpawnProcess-32:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
    torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
    func_return = func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
    default_pg, _ = _new_process_group_helper(
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
    backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
[E108 17:22:04.053240347 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
Process SpawnProcess-34:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
    torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
    func_return = func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
    default_pg, _ = _new_process_group_helper(
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
    backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
[E108 17:22:04.118509503 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
Process SpawnProcess-31:
Traceback (most recent call last):
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
    torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
    return func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
    func_return = func(*args, **kwargs)
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
    default_pg, _ = _new_process_group_helper(
  File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
    backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
=============================== warnings summary ===============================
test/test_seed.py::TestSeed::test_single_worker_deterministic
  /home/runner/work/dmlcloud/dmlcloud/test/test_seed.py:23: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
    prev_torch_state = np.array(torch.get_rng_state())
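
The assertions themselves pass up to the third `distributed_environment(4).start(...)` call; the spawned workers then die with Gloo connectFullMesh failures during `init_process_group` (and a connection reset in the subsequent `barrier`), and the parent hits `EOFError` on `conn.recv()` in test/conftest.py. So this looks like transient TCP-rendezvous flakiness on the CI runner rather than a problem with the seeding logic. Two possible mitigations are sketched below. This is a hypothetical sketch, not code from this repository: it assumes the pytest-rerunfailures plugin for the flaky marker, the timeout value is arbitrary, and whether the process-group timeout fully covers the connectFullMesh phase on the runner is an assumption.

```python
# Hypothetical mitigation sketch; not code from this repository.
import datetime

import pytest
import torch.distributed as dist


# Option 1: rerun the distributed tests when a worker dies with a transient
# Gloo rendezvous error (assumes the pytest-rerunfailures plugin is installed).
@pytest.mark.flaky(reruns=2, reruns_delay=2)
class TestSeed:
    ...


# Option 2: give the gloo rendezvous more headroom in test/conftest.py by
# passing an explicit timeout to init_process_group (value is arbitrary).
def init_worker(store, rank, world_size):
    dist.init_process_group(
        backend='gloo',
        world_size=world_size,
        rank=rank,
        store=store,
        timeout=datetime.timedelta(minutes=10),
    )
```

Separately, the DeprecationWarning in the warnings summary comes from `np.array(torch.get_rng_state())` in test_seed.py; converting with `torch.get_rng_state().numpy()` sidesteps the `__array__` copy-keyword path and should silence it, though that warning is unrelated to this failure.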
sehoffmann added the bug (Something isn't working) label on Jan 13, 2025