Commit: Update

Elfsong committed Apr 15, 2024
1 parent 3c5cb6e commit 74f8e19
Showing 4 changed files with 238 additions and 18 deletions.
41 changes: 26 additions & 15 deletions bigcode_eval/tasks/custom_metrics/beyond_eval.py
@@ -300,16 +300,14 @@ def run_samples(samples, n_workers=4):
    return results


-def estimate_at_k(num_samples, num_correct, k):
-    """Estimates beyond@k of each problem and returns them in an array."""
-
-    def cf(n, k):
-        return math.gamma(n+1) / (math.gamma(k+1) * (math.gamma(n-k+1)))
+def estimate_pass_at_k(num_samples, num_correct, k):
+    """Estimates pass@k of each problem and returns them in an array."""

    def estimator(n: int, c: int, k: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        if n - c < k:
            return 1.0
-        return 1 - cf(n-c, k) / cf(n, k)
+        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
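The replacement computes the same quantity as the closed form but avoids the enormous intermediates of math.gamma, which overflow for large n: it expands 1 - comb(n - c, k) / comb(n, k) into a product of (1 - k/i) for i from n - c + 1 to n. A minimal sanity check (a sketch, assuming numpy and Python 3.8+ for math.comb):

import math
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Numerically stable 1 - comb(n - c, k) / comb(n, k)."""
    if n - c < k:
        return 1.0  # fewer than k failing samples: every k-subset contains a pass
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# Agrees with the exact combinatorial form to floating-point precision:
n, c, k = 500, 42, 5
exact = 1.0 - math.comb(n - c, k) / math.comb(n, k)
assert abs(pass_at_k(n, c, k) - exact) < 1e-12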
@@ -319,6 +317,17 @@ def estimator(n: int, c: int, k: int) -> float:

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

+def estimate_beyond_at_k(runtimes, k):
+    """Estimates beyond@k of each problem and returns them in an array."""
+
+    def estimator(runtimes: list, k: int) -> float:
+        """Averages the normalized runtime scores of the first k samples."""
+        return sum(runtimes[:k]) / len(runtimes[:k])
+
+    return np.array([estimator(r, k) for r in runtimes])
+
def compute_beyond_eval(generations_list, reference_list, timeout=30):
    sandbox = Sandbox()
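A minimal usage sketch of the new estimate_beyond_at_k (hypothetical scores: each inner list holds one problem's normalized runtime scores in [0, 1], one entry per generated sample, so beyond@k is the mean score of the first k samples):

runtimes = [
    [1.0, 0.8, 0.0, 0.9, 0.7],  # problem 1: first sample matches the fastest reference
    [0.0, 0.3, 0.5, 0.4, 0.3],  # problem 2: first sample failed, so it scores 0.0
]
print(estimate_beyond_at_k(runtimes, k=1))         # [1. 0.]
print(estimate_beyond_at_k(runtimes, k=5).mean())  # ~0.49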

@@ -353,7 +362,8 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
        max_runtime = max(runtimes)

        # Evaluate generated solutions
-        t_c, p_c, b_c = 0, 0, 0
+        t_c, p_c = 0, 0
+        b_l = list()
        difficulty = instance['difficulty']

        for index, solution in enumerate(generations):
@@ -373,29 +383,30 @@ def compute_beyond_eval(generations_list, reference_list, timeout=30):
            # Calculate Beyond
            if result['result'] == "passed":
                runtime = result['runtime']
-                runtime = min(runtime, max_runtime)
-                runtime = max(runtime, min_runtime)
-                b_c += (max_runtime - runtime) / (max_runtime - min_runtime)
                p_c += 1
+            else:
+                runtime = float('inf')
+
+            runtime = min(runtime, max_runtime)
+            runtime = max(runtime, min_runtime)
+            b_l += [(max_runtime - runtime) / (max_runtime - min_runtime)]

        scores[difficulty]['total_c'] += [t_c]
        scores[difficulty]['correct_c'] += [p_c]
-        scores[difficulty]['beyond_c'] += [b_c]
+        scores[difficulty]['beyond_c'] += [b_l]

        scores['Average']['total_c'] += [t_c]
        scores['Average']['correct_c'] += [p_c]
-        scores['Average']['beyond_c'] += [b_c]
+        scores['Average']['beyond_c'] += [b_l]

    results = dict()
    for difficulty in ['Easy', "Medium", "Hard", "Average"]:
        total = np.array(scores[difficulty]['total_c'])
        correct = np.array(scores[difficulty]['correct_c'])
-        beyond = np.array(scores[difficulty]['beyond_c'])
+        beyond = scores[difficulty]['beyond_c']

-        pass_at_k = {f"{difficulty}_pass@{k}": estimate_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
-        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_at_k(total, beyond, k).mean() for k in [1,3,5] if (total >= k).all()}
+        pass_at_k = {f"{difficulty}_pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in [1,3,5] if (total >= k).all()}
+        beyond_at_k = {f"{difficulty}_beyond@{k}": estimate_beyond_at_k(beyond, k).mean() for k in [1,3,5] if (total >= k).all()}

        results.update(pass_at_k)
        results.update(beyond_at_k)
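For reference, each per-sample score is a min-max normalization of runtime against the reference solutions: the fastest reference maps to 1.0, the slowest to 0.0, and a failed generation (runtime = inf, clamped to max_runtime) also scores 0.0. A small worked example with hypothetical runtimes:

min_runtime, max_runtime = 1.0, 5.0  # seconds, from the reference solutions

def beyond_score(runtime: float) -> float:
    runtime = max(min(runtime, max_runtime), min_runtime)  # clamp into [min, max]
    return (max_runtime - runtime) / (max_runtime - min_runtime)

print(beyond_score(1.0))           # 1.0 - as fast as the best reference
print(beyond_score(3.0))           # 0.5 - midway through the reference range
print(beyond_score(float('inf')))  # 0.0 - failed solutions clamp to max_runtime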
8 changes: 5 additions & 3 deletions main.py
@@ -266,7 +266,7 @@ def main():
    model_kwargs = {
        "revision": args.revision,
        "trust_remote_code": args.trust_remote_code,
-        "use_auth_token": args.use_auth_token,
+        "token": args.use_auth_token,
    }
    if args.load_in_8bit:
        print("Loading model in 8bit")
@@ -275,6 +275,8 @@ def main():
    elif args.load_in_4bit:
        print("Loading model in 4bit")
        model_kwargs["load_in_4bit"] = args.load_in_4bit
+        model_kwargs["torch_dtype"] = torch.float16
+        model_kwargs["bnb_4bit_compute_dtype"] = torch.float16
        model_kwargs["device_map"] = {"": accelerator.process_index}
    else:
        print(f"Loading model in {args.precision}")
@@ -322,7 +324,7 @@ def main():
        args.model,
        revision=args.revision,
        trust_remote_code=args.trust_remote_code,
-        use_auth_token=args.use_auth_token,
+        token=args.use_auth_token,
        padding_side="left",
    )
    else:
@@ -331,7 +333,7 @@ def main():
        args.model,
        revision=args.revision,
        trust_remote_code=args.trust_remote_code,
-        use_auth_token=args.use_auth_token,
+        token=args.use_auth_token,
        truncation_side="left",
        padding_side="right",
    )
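The use_auth_token argument was deprecated across the Hugging Face libraries in favor of token; the accepted values are unchanged (an access-token string, or True to use the locally cached login). For example, with an arbitrary gated model id:

from transformers import AutoTokenizer

# Deprecated spelling:
#   AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
# Current spelling, same behavior:
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase", token=True)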
22 changes: 22 additions & 0 deletions mercury.sh
@@ -0,0 +1,22 @@
accelerate launch --main_process_port 29501 main.py \
    --model deepseek-ai/deepseek-coder-6.7b-base \
    --load_in_4bit \
    --limit 256 \
    --max_length_generation 1024 \
    --tasks mercury \
    --n_samples 5 \
    --temperature 0.2 \
    --batch_size 6 \
    --allow_code_execution

accelerate launch --main_process_port 29502 main.py \
    --model deepseek-ai/deepseek-coder-6.7b-instruct \
    --load_in_4bit \
    --limit 256 \
    --max_length_generation 1024 \
    --tasks mercury \
    --n_samples 5 \
    --temperature 0.2 \
    --batch_size 6 \
    --allow_code_execution
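Note that --n_samples 5 is the minimum needed to report every cutoff: the harness only emits pass@k and beyond@k when each problem has at least k samples (the `(total >= k).all()` guard in beyond_eval.py above), so with k in {1, 3, 5} a smaller sample count silently drops the larger cutoffs.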

185 changes: 185 additions & 0 deletions playground.ipynb
@@ -0,0 +1,185 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "070e463eddd94abf8d5d2384fcd7674a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/749 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7122460fac9643cda39186565c42eeb4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b51157e2cc5f436cb49b972b4bd23ecc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/1.84M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cf661fb7a0224e68be5f5d1255b608c2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/411 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "06601ec5d98847809cd3d4d1f0af3fb5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/588 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "843d8e08d9a94b268c8f7c7df3030765",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.safetensors.index.json: 0%| | 0.00/37.6k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cefc9c5dbd1b4bc2b9a2a903192b33f8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading shards: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a5bedd3959124f1f8875178b13450682",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model-00001-of-00007.safetensors: 0%| | 0.00/9.85G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load model directly\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"codellama/CodeLlama-34b-hf\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"codellama/CodeLlama-34b-hf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"import transformers\n",
"import torch\n",
"\n",
"model = \"codellama/CodeLlama-34b-hf\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model)\n",
"pipeline = transformers.pipeline(\n",
" \"text-generation\",\n",
" model=model,\n",
" torch_dtype=torch.float16,\n",
" device_map=\"auto\",\n",
")\n",
"\n",
"sequences = pipeline(\n",
" 'import socket\\n\\ndef ping_exponential_backoff(host: str):',\n",
" do_sample=True,\n",
" top_k=10,\n",
" temperature=0.1,\n",
" top_p=0.95,\n",
" num_return_sequences=1,\n",
" eos_token_id=tokenizer.eos_token_id,\n",
" max_length=200,\n",
")\n",
"for seq in sequences:\n",
" print(f\"Result: {seq['generated_text']}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "workspace",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
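A caveat on the first notebook cell: with default arguments, from_pretrained loads CodeLlama-34b as float32 on CPU, roughly 4 bytes per parameter or ~136 GB of RAM. A lighter-weight load (a sketch, assuming GPUs with around 68 GB of combined memory for fp16):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "codellama/CodeLlama-34b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # halves the footprint vs. float32
    device_map="auto",          # shard across available GPUs
)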
