Add Debug Logs #471

Open
wants to merge 6 commits into base: master
smdebug/_version.py (1 addition, 1 deletion)
@@ -1 +1 @@
-__version__ = "1.0.8"
+__version__ = "1.0.8c0"
Contributor: Why?

smdebug/core/hook.py (2 additions, 0 deletions)
@@ -563,6 +563,7 @@ def _increment_step(self):
self._write_state()

self.step += 1
self.logger.debug(f"Smdebug: Increment Step: {self.step}")
self.mode_steps[self.mode] += 1
self.written_tensor_name_for_step.clear()

@@ -592,6 +593,7 @@ def should_save_tensor_or_collection(self, tensor_name: str, collection_name: st

def _write_state(self):
if self.state_store.is_checkpoint_updated():
self.logger.debug(f"smdebug: writing checkpoint")
current_state = dict()
current_state[TRAINING_RUN] = self.training_run
current_state[LATEST_GLOBAL_STEP_SAVED] = self.last_saved_step
smdebug/core/writer.py (2 additions, 0 deletions)
@@ -88,6 +88,7 @@ def __init__(
self.flush_secs = flush_secs
self.verbose = verbose
self.write_checksum = write_checksum
self.logger = get_logger()

self._proto_writer = None

@@ -155,6 +156,7 @@ def write_tensor(
tag = tname
tensor_proto = make_tensor_proto(nparray_data=value, tag=tag)
s = Summary(value=[Summary.Value(tag=tag, metadata=smd, tensor=tensor_proto)])
self.logger.debug(f"smdebug: writing tensor: {tname}")
Contributor: Shouldn't the logger itself handle prepending smdebug as an identifier to each logging message?
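One way to get that behaviour, sketched below on the assumption that the project's get_logger() is built on the standard logging module (the helper name get_prefixed_logger is hypothetical, not smdebug's actual API): let the handler's formatter inject the logger name into every record, so call sites never spell the prefix themselves.

import logging

def get_prefixed_logger(name: str = "smdebug") -> logging.Logger:
    # Hypothetical helper, not the smdebug implementation: the formatter
    # injects the logger name into every record, so individual debug()
    # calls don't have to prepend "smdebug:" by hand.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("[%(name)s] %(levelname)s %(message)s"))
        logger.addHandler(handler)
    return logger

logger = get_prefixed_logger()
logger.setLevel(logging.DEBUG)
logger.debug("writing tensor: %s", "dense_1/kernel")
# -> [smdebug] DEBUG writing tensor: dense_1/kernel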

if write_index:
self.proto_writer.write_summary_with_index(
s, self.step, tname, mode, mode_step, timestamp=timestamp
smdebug/tensorflow/keras.py (9 additions, 0 deletions)
@@ -166,6 +166,7 @@ def register_model(self, model):
# This function is called by the hook in the AWS TF codebase
# It attaches a hook to every layer of the model to capture
# layer values
self.logger.debug("Smdebug Register Model")
self.model = model
self._wrap_model_with_input_output_saver()
self.has_registered_model = True
@@ -553,6 +554,7 @@ def _save_model_inputs_and_outputs_helper(self, collection_key, tensors_to_save,
def save_smdebug_logs(self, logs):
if logs is None:
return
self.logger.debug(f"Length of logs shared to debugger: {len(logs)}")

for key in logs:
if SMDEBUG_PREFIX in key:
@@ -725,6 +727,7 @@ def _remove_fetches_and_callbacks(self, mode):

def _prepare_collections_for_tf2(self):
self._prepare_collections()
self.logger.debug("SMDEBUG: Preparing Collections")
Contributor: Case of "smdebug" is inconsistent. There should be some higher-level wrapper that manages prepending "smdebug", so the writer of the log isn't expected to remember to prepend it.
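A minimal sketch of such a wrapper, assuming the standard logging module (the class name SMDebugLogAdapter and the sample messages are illustrative, not the project's actual API): a LoggerAdapter that prepends the prefix once, in one place, with one casing.

import logging

class SMDebugLogAdapter(logging.LoggerAdapter):
    # Hypothetical wrapper: prepends a consistent "smdebug:" tag to every
    # message so call sites never choose the prefix or its casing.
    def process(self, msg, kwargs):
        return f"smdebug: {msg}", kwargs

logger = SMDebugLogAdapter(logging.getLogger("smdebug"), {})
logger.debug("Preparing Collections")  # logged as "smdebug: Preparing Collections"
logger.debug("Set mode: %s", "TRAIN")  # logged as "smdebug: Set mode: TRAIN"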

if self.has_default_hook_configuration():
# wrapping the model is only supported if the hook does not have the default hook configuration
self._unwrap_model_with_input_output_saver()
@@ -750,6 +753,7 @@ def _on_any_mode_begin(self, mode):

self.graph = tf.get_default_graph()
self.set_mode(mode)
self.logger.debug(f"SMDEBUG: Set mode: {mode}")

if self.prepared_collections is False and is_tf_version_2_3_x():
# Addresses ordering issues in TF 2.3.0
@@ -762,9 +766,11 @@
self.callable_cache.change_mode()

def on_train_begin(self, logs=None):
self.logger.debug("SMDEBUG: Entering On Train Begin")
self._on_any_mode_begin(ModeKeys.TRAIN)

def on_test_begin(self, logs=None):
self.logger.debug("SMDEBUG: Entering On Test Begin")
self._on_any_mode_begin(ModeKeys.EVAL)

def _on_any_mode_end(self, mode):
@@ -811,6 +817,7 @@ def on_predict_begin(self, logs=None):
def _wrap_model_with_input_output_saver(self):
if self.has_registered_model:
return
self.logger.debug("SMDEBUG: Attach Model Layers With Hooks")
for layer in self.model.layers:
layer._hooks = []
layer.call = get_layer_call_fn(layer)
@@ -822,10 +829,12 @@
def _unwrap_model_with_input_output_saver(self):
if self.has_registered_model is False:
return
self.logger.debug("SMDEBUG: Remove Hooks From Model Layers")
for layer in self.model.layers:
layer.call = layer.old_call

def _on_any_batch_begin(self, batch, mode, logs=None):
self.logger.debug(f"SMDEBUG: On Any Batch Begin: {mode}")
self.start = time.time()
if self._is_not_supported():
return