From 46b80996c3adfbf55835ae5d90b04d7e9aacbb0f Mon Sep 17 00:00:00 2001
From: Dongge Liu <donggeliu@google.com>
Date: Fri, 28 Jun 2024 15:04:48 +1000
Subject: [PATCH] Add `extern` only when a c project has c++ fuzz targets
 (#393)

Almost all regressions in
https://github.com/google/oss-fuzz-gen/pull/382#issuecomment-2190303945
are due to C projects using C++ fuzz targets.
This PR resolves that.
---
 .../project_context/context_introspector.py   |  6 ++---
 data_prep/project_targets.py                  | 12 ++++++---
 experiment/benchmark.py                       |  6 +++++
 llm_toolkit/prompt_builder.py                 | 27 ++++++++++++++-----
 run_one_experiment.py                         |  3 ++-
 5 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/data_prep/project_context/context_introspector.py b/data_prep/project_context/context_introspector.py
index bcd5158b27..e25146e49f 100644
--- a/data_prep/project_context/context_introspector.py
+++ b/data_prep/project_context/context_introspector.py
@@ -18,12 +18,12 @@ def __init__(self, benchmark: benchmarklib.Benchmark):
     self._benchmark = benchmark
 
   def _get_embeddable_declaration(self) -> str:
-    """Retrieves declaration by language."""
+    """Retrieves declaration by language.  Attach extern C if needed."""
     lang = self._benchmark.language.lower()
     sig = self._benchmark.function_signature + ';'
 
-    if lang == 'c':
-      return sig
+    if self._benchmark.needs_extern:
+      return 'extern "C" ' + sig
 
     if lang != 'c++':
       logging.warning('Unsupported decl - Lang: %s Project: %s', lang,
diff --git a/data_prep/project_targets.py b/data_prep/project_targets.py
index d71d5cca37..af49765a9b 100755
--- a/data_prep/project_targets.py
+++ b/data_prep/project_targets.py
@@ -168,17 +168,21 @@ def generate_data(project_name: str,
 
 def _remove_header_comments(code: str) -> str:
   """Removes comments and empty lines in the code."""
-  # Remove single-line comments.
-  single_line_comment = re.compile(r'//.*?\n')
-  code = re.sub(single_line_comment, '\n', code)
-
   # Remove multi-line comments.
   multi_line_comment = re.compile(r'/\*.*?\*/', re.DOTALL)
   code = re.sub(multi_line_comment, '', code)
 
+  # Remove single-line comments.
+  single_line_comment = re.compile(r'(?:^|\s+)//.*\n')
+  code = re.sub(single_line_comment, '\n', code)
+
   # Remove empty lines.
   empty_line = re.compile(r'\n+\s*\n+')
   code = re.sub(empty_line, '\n', code)
+
+  # Trim all newlines and spaces (str methods return a new string).
+  code = code.lstrip('\n ')
+  code = code.rstrip('\n ')
   return code
 
 
diff --git a/experiment/benchmark.py b/experiment/benchmark.py
index eb0d6ff806..5817f1fb94 100644
--- a/experiment/benchmark.py
+++ b/experiment/benchmark.py
@@ -174,6 +174,12 @@ def file_type(self) -> FileType:
     """Returns the file type of the benchmark."""
     return get_file_type(self.target_path)
 
+  @property
+  def needs_extern(self) -> bool:
+    """Returns whether a C project has a C++ fuzz target and needs `extern`."""
+    return (self.file_type.value.lower() == 'c++' and
+            self.language.lower() == 'c')
+
 
 def get_file_type(file_path: str) -> FileType:
   """Returns the file type based on the extension of |file_name|."""
diff --git a/llm_toolkit/prompt_builder.py b/llm_toolkit/prompt_builder.py
index 657220dbee..f07a8765a6 100644
--- a/llm_toolkit/prompt_builder.py
+++ b/llm_toolkit/prompt_builder.py
@@ -92,7 +92,8 @@ def build(self,
             target_file_type: FileType,
             example_pair: list[list[str]],
             project_example_content: Optional[list[list[str]]] = None,
-            project_context_content: Optional[dict] = None) -> prompts.Prompt:
+            project_context_content: Optional[dict] = None,
+            needs_extern: bool = False) -> prompts.Prompt:
     """Builds a prompt."""
 
   @abstractmethod
@@ -141,10 +142,15 @@ def __init__(self,
     self.triager_problem_template_file = self._find_template(
         template_dir, 'triager_problem.txt')
 
-  def _format_priming(self, target_file_type: FileType) -> str:
+  def _format_priming(self, target_file_type: FileType,
+                      needs_extern: bool) -> str:
     """Formats a priming based on the prompt template."""
     priming = self._get_template(self.priming_template_file)
     priming = priming.replace('{LANGUAGE}', target_file_type.value)
+    if needs_extern:
+      priming += ('\nNote that some code may need to be wrapped with '
+                  '<code>extern "C"</code> because the project under test is '
+                  'written in C but the fuzz target is in C++.\n')
     if target_file_type == FileType.CPP:
       type_specific_priming = self._get_template(self.cpp_priming_filler_file)
     else:
@@ -274,9 +280,10 @@ def build(self,
             target_file_type: FileType,
             example_pair: list[list[str]],
             project_example_content: Optional[list[list[str]]] = None,
-            project_context_content: Optional[dict] = None) -> prompts.Prompt:
+            project_context_content: Optional[dict] = None,
+            needs_extern: bool = False) -> prompts.Prompt:
     """Constructs a prompt using the templates in |self| and saves it."""
-    priming = self._format_priming(target_file_type)
+    priming = self._format_priming(target_file_type, needs_extern)
     final_problem = self.format_problem(function_signature)
     final_problem += (f'You MUST call <code>\n'
                       f'{function_signature}\n'
@@ -303,7 +310,11 @@ def _format_fixer_priming(self, benchmark: Benchmark) -> Tuple[str, int]:
     """Formats a priming for code fixer based on the template."""
     with open(self.fixer_priming_template_file) as f:
       priming = f.read().strip() + '\n'
-    priming = priming.replace('{LANGUAGE}', benchmark.language)
+    priming = priming.replace('{LANGUAGE}', benchmark.file_type.value)
+    if benchmark.needs_extern:
+      priming += ('\nNote that some code may need to be wrapped with '
+                  '<code>extern "C"</code> because the project under test is '
+                  'written in C but the fuzz target is in C++.\n')
     priming_prompt = self._prompt.create_prompt_piece(priming, 'system')
     priming_weight = self._model.estimate_token_num(priming_prompt)
     # NOTE: We need to return the priming _as text_ and the weight. Otherwise,
@@ -733,7 +744,8 @@ def build(self,
             target_file_type: FileType,
             example_pair: list[list[str]],
             project_example_content: Optional[list[list[str]]] = None,
-            project_context_content: Optional[dict] = None) -> prompts.Prompt:
+            project_context_content: Optional[dict] = None,
+            needs_extern: bool = False) -> prompts.Prompt:
     """Constructs a prompt using the templates in |self| and saves it.
        Ignore target_file_type, project_example_content
        and project_context_content parameters.
@@ -817,7 +829,8 @@ def build(self,
             target_file_type: FileType,
             example_pair: list[list[str]],
             project_example_content: Optional[list[list[str]]] = None,
-            project_context_content: Optional[dict] = None) -> prompts.Prompt:
+            project_context_content: Optional[dict] = None,
+            needs_extern: bool = False) -> prompts.Prompt:
     """Constructs a prompt using the templates in |self| and saves it."""
 
     with open(self.priming_template_file, 'r') as f:
diff --git a/run_one_experiment.py b/run_one_experiment.py
index 22bbdf35e5..8afbaecbbe 100644
--- a/run_one_experiment.py
+++ b/run_one_experiment.py
@@ -269,7 +269,8 @@ def run(benchmark: Benchmark,
                            benchmark.file_type,
                            example_pair,
                            project_examples,
-                           project_context_content=context_info)
+                           project_context_content=context_info,
+                           needs_extern=benchmark.needs_extern)
     prompt.save(work_dirs.prompt)
 
     if dry_run: