From 46b80996c3adfbf55835ae5d90b04d7e9aacbb0f Mon Sep 17 00:00:00 2001 From: Dongge Liu Date: Fri, 28 Jun 2024 15:04:48 +1000 Subject: [PATCH] Add `extern` only when a c project has c++ fuzz targets (#393) Almost all regressions in https://github.com/google/oss-fuzz-gen/pull/382#issuecomment-2190303945 are due to C projects using C++ fuzz targets. This PR resolves that. --- .../project_context/context_introspector.py | 6 ++--- data_prep/project_targets.py | 12 ++++++--- experiment/benchmark.py | 6 +++++ llm_toolkit/prompt_builder.py | 27 ++++++++++++++----- run_one_experiment.py | 3 ++- 5 files changed, 39 insertions(+), 15 deletions(-) diff --git a/data_prep/project_context/context_introspector.py b/data_prep/project_context/context_introspector.py index bcd5158b27..e25146e49f 100644 --- a/data_prep/project_context/context_introspector.py +++ b/data_prep/project_context/context_introspector.py @@ -18,12 +18,12 @@ def __init__(self, benchmark: benchmarklib.Benchmark): self._benchmark = benchmark def _get_embeddable_declaration(self) -> str: - """Retrieves declaration by language.""" + """Retrieves declaration by language. Attach extern C if needed.""" lang = self._benchmark.language.lower() sig = self._benchmark.function_signature + ';' - if lang == 'c': - return sig + if self._benchmark.needs_extern: + return 'extern "C" ' + sig if lang != 'c++': logging.warning('Unsupported decl - Lang: %s Project: %s', lang, diff --git a/data_prep/project_targets.py b/data_prep/project_targets.py index d71d5cca37..af49765a9b 100755 --- a/data_prep/project_targets.py +++ b/data_prep/project_targets.py @@ -168,17 +168,21 @@ def generate_data(project_name: str, def _remove_header_comments(code: str) -> str: """Removes comments and empty lines in the code.""" - # Remove single-line comments. - single_line_comment = re.compile(r'//.*?\n') - code = re.sub(single_line_comment, '\n', code) - # Remove multi-line comments. multi_line_comment = re.compile(r'/\*.*?\*/', re.DOTALL) code = re.sub(multi_line_comment, '', code) + # Remove single-line comments. + single_line_comment = re.compile(r'(?:^|\s+)//.*\n') + code = re.sub(single_line_comment, '\n', code) + # Remove empty lines. empty_line = re.compile(r'\n+\s*\n+') code = re.sub(empty_line, '\n', code) + + # Trim all newlines and spaces. + code.lstrip('\n ') + code.rstrip('\n ') return code diff --git a/experiment/benchmark.py b/experiment/benchmark.py index eb0d6ff806..5817f1fb94 100644 --- a/experiment/benchmark.py +++ b/experiment/benchmark.py @@ -174,6 +174,12 @@ def file_type(self) -> FileType: """Returns the file type of the benchmark.""" return get_file_type(self.target_path) + @property + def needs_extern(self) -> bool: + """Checks if it is C++ fuzz target for a C project, which needs `extern`.""" + return (self.file_type.value.lower() == 'c++' and + self.language.lower() == 'c') + def get_file_type(file_path: str) -> FileType: """Returns the file type based on the extension of |file_name|.""" diff --git a/llm_toolkit/prompt_builder.py b/llm_toolkit/prompt_builder.py index 657220dbee..f07a8765a6 100644 --- a/llm_toolkit/prompt_builder.py +++ b/llm_toolkit/prompt_builder.py @@ -92,7 +92,8 @@ def build(self, target_file_type: FileType, example_pair: list[list[str]], project_example_content: Optional[list[list[str]]] = None, - project_context_content: Optional[dict] = None) -> prompts.Prompt: + project_context_content: Optional[dict] = None, + needs_extern: bool = False) -> prompts.Prompt: """Builds a prompt.""" @abstractmethod @@ -141,10 +142,15 @@ def __init__(self, self.triager_problem_template_file = self._find_template( template_dir, 'triager_problem.txt') - def _format_priming(self, target_file_type: FileType) -> str: + def _format_priming(self, target_file_type: FileType, + needs_extern: bool) -> str: """Formats a priming based on the prompt template.""" priming = self._get_template(self.priming_template_file) priming = priming.replace('{LANGUAGE}', target_file_type.value) + if needs_extern: + priming += ('\nNote that some code may need to be wrapped with ' + 'extern "C" because the project under test is ' + 'written in C but the fuzz target is in C++.\n') if target_file_type == FileType.CPP: type_specific_priming = self._get_template(self.cpp_priming_filler_file) else: @@ -274,9 +280,10 @@ def build(self, target_file_type: FileType, example_pair: list[list[str]], project_example_content: Optional[list[list[str]]] = None, - project_context_content: Optional[dict] = None) -> prompts.Prompt: + project_context_content: Optional[dict] = None, + needs_extern: bool = False) -> prompts.Prompt: """Constructs a prompt using the templates in |self| and saves it.""" - priming = self._format_priming(target_file_type) + priming = self._format_priming(target_file_type, needs_extern) final_problem = self.format_problem(function_signature) final_problem += (f'You MUST call \n' f'{function_signature}\n' @@ -303,7 +310,11 @@ def _format_fixer_priming(self, benchmark: Benchmark) -> Tuple[str, int]: """Formats a priming for code fixer based on the template.""" with open(self.fixer_priming_template_file) as f: priming = f.read().strip() + '\n' - priming = priming.replace('{LANGUAGE}', benchmark.language) + priming = priming.replace('{LANGUAGE}', benchmark.file_type.value) + if benchmark.needs_extern: + priming += ('\nNote that some code may need to be wrapped with ' + 'extern "C" because the project under test is ' + 'written in C but the fuzz target is in C++.\n') priming_prompt = self._prompt.create_prompt_piece(priming, 'system') priming_weight = self._model.estimate_token_num(priming_prompt) # NOTE: We need to return the priming _as text_ and the weight. Otherwise, @@ -733,7 +744,8 @@ def build(self, target_file_type: FileType, example_pair: list[list[str]], project_example_content: Optional[list[list[str]]] = None, - project_context_content: Optional[dict] = None) -> prompts.Prompt: + project_context_content: Optional[dict] = None, + needs_extern: bool = False) -> prompts.Prompt: """Constructs a prompt using the templates in |self| and saves it. Ignore target_file_type, project_example_content and project_context_content parameters. @@ -817,7 +829,8 @@ def build(self, target_file_type: FileType, example_pair: list[list[str]], project_example_content: Optional[list[list[str]]] = None, - project_context_content: Optional[dict] = None) -> prompts.Prompt: + project_context_content: Optional[dict] = None, + needs_extern: bool = False) -> prompts.Prompt: """Constructs a prompt using the templates in |self| and saves it.""" with open(self.priming_template_file, 'r') as f: diff --git a/run_one_experiment.py b/run_one_experiment.py index 22bbdf35e5..8afbaecbbe 100644 --- a/run_one_experiment.py +++ b/run_one_experiment.py @@ -269,7 +269,8 @@ def run(benchmark: Benchmark, benchmark.file_type, example_pair, project_examples, - project_context_content=context_info) + project_context_content=context_info, + needs_extern=benchmark.needs_extern) prompt.save(work_dirs.prompt) if dry_run: