Skip to content

Commit

Permalink
Retry on broken pipe and refactor error handling (#154)
Browse files Browse the repository at this point in the history
  • Loading branch information
cjx10 authored Mar 14, 2024
1 parent 487a85b commit a1552b9
Showing 1 changed file with 50 additions and 39 deletions.
89 changes: 50 additions & 39 deletions experiment/builder_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,21 +322,58 @@ def get_coverage_local(

class CloudBuilderRunner(BuilderRunner):
"""Cloud BuilderRunner."""
_RETRYABLE_ERRORS = [
# As mentioned in pr #100.
'RESOURCE_EXHAUSTED',
# Temp workaround for issue #12.
'You do not currently have an active account selected',
# Workaround for issue #85.
'gcloud crashed (OSError): unexpected end of data',
]

def __init__(self, *args, experiment_name: str, experiment_bucket: str,
**kwargs):
self.experiment_name = experiment_name
self.experiment_bucket = experiment_bucket
super().__init__(*args, **kwargs)

@staticmethod
def _run_with_retry_control(target_path: str, *args, **kwargs) -> bool:
"""sp.run() with controllable retry and customized exponential backoff."""
# List of (error_str, exp_backoff_func).
retryable_errors = [
# As mentioned in pr #100.
('RESOURCE_EXHAUSTED', lambda x: 5 * 2**x + random.randint(50, 90)),
# As mentioned in pr #151.
('BrokenPipeError: [Errno 32] Broken pipe',
lambda x: 5 * 2**x + random.randint(1, 5)),
# Temp workaround for issue #12.
('You do not currently have an active account selected',
lambda x: 5 * 2**x),
# Workaround for issue #85.
('gcloud crashed (OSError): unexpected end of data', lambda x: 5 * 2**x
),
]

for attempt_id in range(1, CLOUD_EXP_MAX_ATTEMPT + 1):
try:
sp.run(*args, capture_output=True, check=True, **kwargs)
return True
except sp.CalledProcessError as e:
# Replace \n for single log entry on cloud.
stdout = e.stdout.decode('utf-8').replace('\n', '\t')
stderr = e.stderr.decode('utf-8').replace('\n', '\t')

delay = next((delay_f(attempt_id)
for err, delay_f in retryable_errors
if err in stdout + stderr), 0)

if not delay or attempt_id == CLOUD_EXP_MAX_ATTEMPT:
logging.error('Failed to evaluate %s on cloud, attempt %d:\n%s\n%s',
os.path.realpath(target_path), attempt_id, stdout,
stderr)
break

logging.warning(
'Failed to evaluate %s on cloud, attempt %d, retry in %ds:\n'
'%s\n%s', os.path.realpath(target_path), attempt_id, delay, stdout,
stderr)
time.sleep(delay)

return False

def build_and_run(self, generated_project: str, target_path: str,
iteration: int) -> tuple[BuildResult, Optional[RunResult]]:
build_result = BuildResult()
Expand All @@ -358,9 +395,9 @@ def build_and_run(self, generated_project: str, target_path: str,
coverage_name = f'{uid}.coverage'
coverage_path = f'gs://{self.experiment_bucket}/{coverage_name}'

for attempt_id in range(1, CLOUD_EXP_MAX_ATTEMPT + 1):
try:
sp.run([
if not self._run_with_retry_control(
os.path.realpath(target_path),
[
f'./{oss_fuzz_checkout.VENV_DIR}/bin/python3',
'infra/build/functions/target_experiment.py',
f'--project={generated_project}',
Expand All @@ -371,34 +408,8 @@ def build_and_run(self, generated_project: str, target_path: str,
f'--upload_coverage={coverage_path}',
f'--experiment_name={self.experiment_name}', '--'
] + self._libfuzzer_args(),
capture_output=True,
check=True,
cwd=oss_fuzz_checkout.OSS_FUZZ_DIR)
break
except sp.CalledProcessError as e:
# Replace \n for single log entry on cloud.
stdout = e.stdout.decode('utf-8').replace('\n', '\t')
stderr = e.stderr.decode('utf-8').replace('\n', '\t')

captured_error = next(
(err for err in self._RETRYABLE_ERRORS if err in stdout + stderr),
'')
if captured_error and attempt_id < CLOUD_EXP_MAX_ATTEMPT:
delay = 5 * 2**attempt_id
if captured_error == 'RESOURCE_EXHAUSTED':
# Add random jitter in case of exceeding request per minute quota.
delay += random.randint(50, 90)

logging.warning(
'Failed to evaluate %s on cloud, attempt %d:\n%s\n%s\n'
'Retry in %ds...', os.path.realpath(target_path), attempt_id,
stdout, stderr, delay)
time.sleep(delay)
else:
logging.error('Failed to evaluate %s on cloud, attempt %d:\n%s\n%s',
os.path.realpath(target_path), attempt_id, stdout,
stderr)
return build_result, None
cwd=oss_fuzz_checkout.OSS_FUZZ_DIR):
return build_result, None

print(f'Evaluated {os.path.realpath(target_path)} on cloud.')

Expand Down

0 comments on commit a1552b9

Please sign in to comment.