Skip to content

Commit

Permalink
Pass code block to fixer prompt, not the raw LLM response (#259) (#261)
Browse files Browse the repository at this point in the history
Passing the raw response can confuse the LLM when it contains more text than
just the code block, which occurred with `Gemini 1.5`.
Also, `Gemini 1.5`'s response structure seems to be different from that of
`code-bison-32k`.

This PR does three things:
1. Make the response parser compatible with both models.
2. Pass the code block from the response to the LLM fixer, not the raw response.
3. Add more instructions to the code-fixing prompt to avoid common mistakes.
  • Loading branch information
DonggeLiu authored May 10, 2024
1 parent 7dbfc82 commit f1a66c1
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 16 deletions.
10 changes: 5 additions & 5 deletions llm_toolkit/code_fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,8 +278,7 @@ def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark,
llm_fix_id: int, error_desc: Optional[str], errors: list[str],
fixer_model_name: str) -> None:
"""Reads and fixes |target_path| in place with LLM based on |error_log|."""
with open(target_path) as target_file:
raw_code = target_file.read()
fuzz_target_source_code = parser.parse_code(target_path)

_, target_ext = os.path.splitext(os.path.basename(target_path))
response_dir = f'{os.path.splitext(target_path)[0]}-F{llm_fix_id}'
Expand All @@ -288,7 +287,7 @@ def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark,

apply_llm_fix(ai_binary,
benchmark,
raw_code,
fuzz_target_source_code,
error_desc,
errors,
prompt_path,
Expand Down Expand Up @@ -328,7 +327,7 @@ def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark,

def apply_llm_fix(ai_binary: str,
benchmark: benchmarklib.Benchmark,
raw_code: str,
fuzz_target_source_code: str,
error_desc: Optional[str],
errors: list[str],
prompt_path: str,
Expand All @@ -344,7 +343,8 @@ def apply_llm_fix(ai_binary: str,
)

builder = prompt_builder.DefaultTemplateBuilder(fixer_model)
prompt = builder.build_fixer_prompt(benchmark, raw_code, error_desc, errors)
prompt = builder.build_fixer_prompt(benchmark, fuzz_target_source_code,
error_desc, errors)
prompt.save(prompt_path)

fixer_model.generate_code(prompt, response_dir)
Expand Down
37 changes: 27 additions & 10 deletions llm_toolkit/output_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,24 +44,41 @@ def parse_args() -> argparse.Namespace:
return args


def _parse_code_block_by_marker(lines: list[str], start_marker: str,
end_marker: str) -> list[str]:
"""Parses code block lines based on markers."""
block = []
in_block = False
contains_api = False

for line in lines:
if not in_block and start_marker in line.lower():
in_block = True # Start a code block.
if not contains_api:
block = [] # Ignore previous block because it does not contain API.
elif in_block and end_marker in line:
in_block = False # Finish a code block.
if contains_api:
break # Found fuzz target.
elif in_block:
block.append(line)
contains_api = contains_api or 'LLVMFuzzerTestOneInput' in line
return block if block else lines


def parse_code(response_path: str) -> str:
    """Parses the expected output from the |response_path|.

    Reads the file, keeps only the text before the first '</solution>' tag,
    extracts the code block (first by '```c' / '```' fences, then by
    '<code>' / '</code>' tags), and trims blank lines from both ends.

    Args:
        response_path: Path of the file holding the response text.

    Returns:
        The extracted code as a single string joined with newlines.
    """
    with open(response_path) as file:
        response = file.read()
    solution = response.split('</solution>')[0]

    lines = solution.splitlines()
    # The <code> tags must NOT be stripped beforehand: the second pass below
    # relies on them to locate the block.
    lines = _parse_code_block_by_marker(lines, '```c', '```')
    lines = _parse_code_block_by_marker(lines, '<code>', '</code>')

    # Remove leading and trailing empty lines.
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()

    return '\n'.join(lines)
Expand Down
5 changes: 4 additions & 1 deletion prompts/template_xml/fixer_priming.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ Given the following C++ fuzz harness and its build error message, fix the code t

If there is undeclared identifier or unknown type name error, fix it by finding and including the related libraries.

Note that some code may need to be wrapped with <code>extern "C"</code> as their source is C program.
Note that some code may need to be wrapped with <code>extern "C"</code> as their source is C program.

MUST RETURN THE FULL CODE, INCLUDING UNCHANGED PARTS.
EXTREMELY IMPORTANT: AVOID USING <code>goto</code>. If you have to write code using <code>goto</code>, you MUST MUST also declare all variables BEFORE the <code>goto</code>. Never introduce new variables after the <code>goto</code>.

0 comments on commit f1a66c1

Please sign in to comment.