From 3a45601f9ed73f0bed84a54e19a4ed3fb9e67835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steven!=20Ragnar=C3=B6k?= Date: Wed, 13 Nov 2019 09:06:21 -0500 Subject: [PATCH] Expressly destroy a node's objects before the node. (#456) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Expressly destroy a node's objects before the node. This seems to reduce hangs during test runs described in https://github.com/ros2/build_cop/issues/248. The handles corresponding to the destroyed objects *should* be getting destroyed explicitly when self.handle.destroy() is called below. It seems however that when running with Fast-RTPS it's possible to get into a state where multiple threads are waiting on futexes and none can move forward. The rclpy context of this apparent deadlock is while clearing a node's list of publishers or services (possibly others, although publishers and services were the only ones observed). I consider this patch to be a workaround rather than a fix. I think there may either be a race condition between the rcl/rmw layer and the rmw implementation layer which is being tripped by the haphazardness of Python's garbage collector or there is a logical problem with the handle destruction ordering in rclpy that only Fast-RTPS is sensitive to. * Don't pre-emptively remove items from Node lists. As pointed out by Shane, pop()ing each item from the list before passing it to the .destroy_ITEM() method prevents it from being destroyed as the individual methods first check that the item is present in the list then remove it before continuing to destroy it. Signed-off-by: Steven! Ragnarök Signed-off-by: AbhinavSingh --- rclpy/rclpy/node.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/rclpy/rclpy/node.py b/rclpy/rclpy/node.py index 46bb32d69..5c4ce8f2f 100644 --- a/rclpy/rclpy/node.py +++ b/rclpy/rclpy/node.py @@ -1482,12 +1482,20 @@ def destroy_node(self) -> bool: # It will be destroyed with other publishers below. self._parameter_event_publisher = None - self.__publishers.clear() - self.__subscriptions.clear() - self.__clients.clear() - self.__services.clear() - self.__timers.clear() - self.__guards.clear() + # Destroy dependent items eagerly to work around a possible hang + # https://github.com/ros2/build_cop/issues/248 + while self.__publishers: + self.destroy_publisher(self.__publishers[0]) + while self.__subscriptions: + self.destroy_subscription(self.__subscriptions[0]) + while self.__clients: + self.destroy_client(self.__clients[0]) + while self.__services: + self.destroy_service(self.__services[0]) + while self.__timers: + self.destroy_timer(self.__timers[0]) + while self.__guards: + self.destroy_guard_condition(self.__guards[0]) self.handle.destroy() self._wake_executor()