Add GPQA Diamond and fix evaluation deps (#196)

* Add GPQA Diamond * Add table * Fix README * Up * Fixes * Ignore logs * Fix * Pin deps * Fix GRPO * Add Llama 70B tables * Restore dp * Pin lighteval * Use bfloat16 * Tune table * Add note
2025-02-06 15:24:52 +01:00
commit cec57f3a55
@@ -26,7 +26,7 @@ evaluate:
 		fi \
 	),))
 	$(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \
-	MODEL_ARGS="pretrained=$(MODEL),dtype=float16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilisation=0.8" && \
+	MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilisation=0.8" && \
 	lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \
 		--custom-tasks src/open_r1/evaluate.py \
 		--use-chat-template \
@@ -50,23 +50,23 @@ To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/ge


 ```shell
-uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip
+uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip --link-mode=copy
 ```

 Next, install vLLM:

 ```shell
-uv pip install vllm>=0.7.0
+uv pip install vllm==0.7.1

 # For CUDA 12.1
-pip install vllm>=0.7.0 --extra-index-url https://download.pytorch.org/whl/cu121
+uv pip install vllm==0.7.1 --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --link-mode=copy
 export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] + '/nvidia/nvjitlink/lib')"):$LD_LIBRARY_PATH
 ```

 This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend:

 ```shell
-pip install -e ".[dev]"
+GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" --link-mode=copy
 ```

 Next, log into your Hugging Face and Weights and Biases accounts as follows:
@@ -141,30 +141,46 @@ We use `lighteval` to evaluate models, with custom tasks defined in `src/open_r1

 ```shell
 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
-MODEL_ARGS="pretrained=$MODEL,dtype=float16,max_model_length=32768,gpu_memory_utilisation=0.8"
-TASK=aime24
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8"
 OUTPUT_DIR=data/evals/$MODEL

+# AIME 2024
+TASK=aime24
+lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
+    --custom-tasks src/open_r1/evaluate.py \
+    --use-chat-template \
+    --output-dir $OUTPUT_DIR
+
+# MATH-500
+TASK=math_500
+lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
+    --custom-tasks src/open_r1/evaluate.py \
+    --use-chat-template \
+    --output-dir $OUTPUT_DIR
+
+# GPQA Diamond
+TASK=gpqa:diamond
 lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
    --custom-tasks src/open_r1/evaluate.py \
    --use-chat-template \
-    --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \
    --output-dir $OUTPUT_DIR 
 ```

+> [!IMPORTANT]
+> You must set `max_model_length=32768` in the `vllm` command to align with the `generation_size` we define per eval. Without this, `lighteval` will throw an error.
+
 To increase throughput across multiple GPUs, use _data parallel_ as follows:

 ```shell
 NUM_GPUS=8
 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
-MODEL_ARGS="pretrained=$MODEL,dtype=float16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
 TASK=aime24
 OUTPUT_DIR=data/evals/$MODEL

 lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
    --custom-tasks src/open_r1/evaluate.py \
    --use-chat-template \
-    --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \
    --output-dir $OUTPUT_DIR 
 ```

@@ -173,7 +189,7 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r
 ```shell
 NUM_GPUS=8
 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-MODEL_ARGS="pretrained=$MODEL,dtype=float16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
 TASK=aime24
 OUTPUT_DIR=data/evals/$MODEL

@@ -181,50 +197,97 @@ export VLLM_WORKER_MULTIPROC_METHOD=spawn
 lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
    --custom-tasks src/open_r1/evaluate.py \
    --use-chat-template \
-    --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \
    --output-dir $OUTPUT_DIR 
 ```

 You can also launch an evaluation with `make evaluate`, specifying the model, task, and optionally the parallelism technique and number of GPUs.

 To evaluate on a single GPU:
+
 ```shell
 make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24
 ```

 To use Data Parallelism:
+
 ```shell
 make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=data NUM_GPUS=8
 ```

 To use Tensor Parallelism:
+
 ```shell
 make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8
 ```
-## Reproducing Deepseek's evaluation results on MATH-500
-We are able to reproduce Deepseek's reported results on the MATH-500 Benchmark:
-| Model                      | MATH-500 (HF lighteval) | MATH-500 (DeepSeek Reported) |
-| :-------------------------- | :-------: | :----------------------------: |
-| DeepSeek-R1-Distill-Qwen-1.5B  |  81.6   |              83.9              |
-| DeepSeek-R1-Distill-Qwen-7B    |  91.8   |              92.8              |
-| DeepSeek-R1-Distill-Qwen-14B   |  94.2   |              93.9              |
-| DeepSeek-R1-Distill-Qwen-32B   |  95.0   |              94.3              |
-| DeepSeek-R1-Distill-Llama-8B   |  85.8   |              89.1              |
-| DeepSeek-R1-Distill-Llama-70B  |  93.4   |              94.5              |

+## Reproducing DeepSeek's evaluation results

+> [!NOTE]
+> The DeepSeek-R1 paper uses sampling with a temperature of 0.6, a top-p value of 0.95, and 64 responses per query to estimate `pass@1`. Below, we report the results from greedy decoding, which likely explains the small 1-3σ discrepancies between our results and theirs.
+
+### MATH-500
+
+We are able to reproduce DeepSeek's reported results on the MATH-500 benchmark within ~1-3 standard deviations:
+
+| Model                         | MATH-500 (🤗 LightEval) | MATH-500 (DeepSeek Reported) |
+|:------------------------------|:-----------------------:|:----------------------------:|
+| DeepSeek-R1-Distill-Qwen-1.5B |          81.2           |             83.9             |
+| DeepSeek-R1-Distill-Qwen-7B   |          91.8           |             92.8             |
+| DeepSeek-R1-Distill-Qwen-14B  |          94.2           |             93.9             |
+| DeepSeek-R1-Distill-Qwen-32B  |          95.0           |             94.3             |
+| DeepSeek-R1-Distill-Llama-8B  |          85.4           |             89.1             |
+| DeepSeek-R1-Distill-Llama-70B |          93.4           |             94.5             |

 To reproduce these results use the following command:
+
 ```shell
-sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B math_500
-sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-7B math_500
-sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-14B math_500
-sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-32B math_500 tp
-sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Llama-8B math_500
-sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Llama-70B math_500 tp
+NUM_GPUS=1 # Set to 8 for 32B and 70B models
+MODEL=deepseek-ai/{model_name}
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS"
+OUTPUT_DIR=data/evals/$MODEL
+
+lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \
+    --custom-tasks src/open_r1/evaluate.py \
+    --use-chat-template \
+    --output-dir $OUTPUT_DIR
 ```

+Alternatively, you can launch Slurm jobs as follows:

+```shell
+python scripts/run_benchmarks.py --model-id={model_id}  --benchmarks math_500
+```
+
+### GPQA Diamond
+
+We are able to reproduce DeepSeek's reported results on the GPQA Diamond benchmark within ~1-3 standard deviations:
+
+| Model                         | GPQA Diamond (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) |
+|:------------------------------|:---------------------------:|:--------------------------------:|
+| DeepSeek-R1-Distill-Qwen-1.5B |            33.3             |               33.8               |
+| DeepSeek-R1-Distill-Qwen-7B   |            48.4             |               49.1               |
+| DeepSeek-R1-Distill-Qwen-14B  |            55.6             |               59.1               |
+| DeepSeek-R1-Distill-Qwen-32B  |            58.6             |               62.1               |
+| DeepSeek-R1-Distill-Llama-8B  |            51.0             |               49.0               |
+| DeepSeek-R1-Distill-Llama-70B |            65.2             |               65.2               |
+
+To reproduce these results use the following command:
+
+```shell
+NUM_GPUS=1 # Set to 8 for 32B and 70B models
+MODEL=deepseek-ai/{model_name}
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS"
+OUTPUT_DIR=data/evals/$MODEL
+
+lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \
+    --custom-tasks src/open_r1/evaluate.py \
+    --use-chat-template \
+    --output-dir $OUTPUT_DIR
+```
+
+Alternatively, you can launch Slurm jobs as follows:
+
+```shell
+python scripts/run_benchmarks.py --model-id={model_id}  --benchmarks gpqa
+```

 ## Data generation

@@ -53,17 +53,17 @@ _deps = [
    "huggingface-hub[cli]>=0.19.2,<1.0",
    "isort>=5.12.0",
    "liger_kernel==0.5.2",
-    "lighteval @ git+https://github.com/huggingface/lighteval.git@0e462692436e1f0575bdb4c6ef63453ad9bde7d4#egg=lighteval[math]",
-    "math-verify>=0.3.3",  # Used for math verification in grpo
+    "lighteval @ git+https://github.com/huggingface/lighteval.git@86f62259f105ae164f655e0b91c92a823a742724#egg=lighteval[math]",
+    "math-verify==0.5.2",  # Used for math verification in grpo
    "packaging>=23.0",
    "parameterized>=0.9.0",
    "pytest",
    "safetensors>=0.3.3",
    "sentencepiece>=0.1.99",
-    "torch>=2.5.1",
+    "torch==2.5.1",
    "transformers @ git+https://github.com/huggingface/transformers.git@main",
    "trl @ git+https://github.com/huggingface/trl.git@main",
-    "vllm>=0.7.1",
+    "vllm==0.7.1",
    "wandb>=0.19.1",
 ]

@@ -1,75 +0,0 @@
-#!/bin/bash
-#SBATCH --ntasks-per-node=1
-#SBATCH --gres=gpu:8
-#SBATCH --partition=hopper-prod
-#SBATCH --output=./logs/evaluate/%x-%j.out
-#SBATCH --err=./logs/evaluate/%x-%j.err
-#SBATCH --requeue
-
-set -x -e
-source ~/.bashrc
-source openr1/bin/activate
-TASK_NAME=$1
-TASKS=$2
-MODEL_ID=$3
-MODEL_REVISION=$4
-# Optional args
-[ -z "$5"] && TENSOR_PARALLEL=False || TENSOR_PARALLEL=$5
-[ -z "$6"] && TRUST_REMOTE_CODE=False || TRUST_REMOTE_CODE=$6
-# $7 is reserved for system_prompt, see line 51
-NUM_GPUS=$(nvidia-smi -L | wc -l)
-
-# Set Whether to use tensor parallelism or data parallelism
-if [ "$TENSOR_PARALLEL" = "True" ]; then
-    # use TP to shard model across NUM_GPUS
-    export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
-else
-    MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
-fi
-
-LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard"
-MODEL_NAME=$(echo $MODEL_ID | sed 's/\//_/g') # replaces / with _
-DETAILS_REPO_ID="open-r1/details-$MODEL_NAME"
-OUTPUT_DIR="eval_results/$MODEL_ID/$MODEL_REVISION/$TASK_NAME"
-# We need this flag since we run this script from training jobs that use DeepSpeed and the env vars get progated which causes errors during evaluation
-ACCELERATE_USE_DEEPSPEED=false
-# Enable fast downloads
-HF_HUB_ENABLE_HF_TRANSFER=1
-
-echo "Running lighteval script ..."
-echo "Eval results will be saved to $OUTPUT_DIR"
-# Check if "custom" is a substring of TASKS
-if [[ $TASKS == *"custom"* ]]; then
-    echo "Custom task detected. Running custom task evaluation script ..."
-    lighteval vllm $MODEL_ARGS $TASKS \
-    --custom-tasks "src/open_r1/evaluate.py" \
-    --use-chat-template \
-    --output-dir $OUTPUT_DIR \
-    --save-details \
-    ${7:+--system-prompt "$7"}
-else
-    lighteval vllm $MODEL_ARGS $TASKS \
-    --use-chat-template \
-    --output-dir $OUTPUT_DIR \
-    --save-details \
-    ${7:+--system-prompt "$7"}
-fi
-
-OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \))
-for filepath in $OUTPUT_FILEPATHS; do
-    echo "Uploading $filepath to Hugging Face Hub..."
-    filename=$(basename -- "$filepath")
-    huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename
-done
-
-echo "Uploading details to Hugging Face Hub..."
-DETAILS_FILEPATHS=$(find $OUTPUT_DIR/details/ -type f \( -name "*.parquet" \))
-echo "DETAILS_FILEPATHS: $DETAILS_FILEPATHS"
-TIMESTAMP=$(date +"%Y-%m-%dT%H-%M-%S")
-python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub_repo_id $DETAILS_REPO_ID --config_name $MODEL_REVISION.$TASK_NAME.$TIMESTAMP
-    
-echo "Cleaning up ..."
-rm -rf $OUTPUT_DIR
-
-echo "Done!"
@@ -1,55 +1,75 @@
 #!/bin/bash
-#SBATCH --job-name=open-r1-evaluate
-#SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
-#SBATCH --exclusive
 #SBATCH --gres=gpu:8
-#SBATCH --partition=hopper-prod 
-#SBATCH --time=01:59:00
-#SBATCH --output=./logs/evaluate/%x-%j.out
-#SBATCH --err=./logs/evaluate/%x-%j.err
-
-# Usage: sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B aime24
+#SBATCH --partition=hopper-prod
+#SBATCH --output=./logs/%x-%j.out
+#SBATCH --err=./logs/%x-%j.err
+#SBATCH --requeue

 set -x -e
-
 source ~/.bashrc
 source openr1/bin/activate
-module load cuda/12.1
-echo "START TIME: $(date)"
-echo "PYTHON ENV: $(which python)"
+TASK_NAME=$1
+TASKS=$2
+MODEL_ID=$3
+MODEL_REVISION=$4
+# Optional args: $5 toggles tensor parallelism and $6 toggles trust_remote_code; both fall back to False when unset or empty
+TENSOR_PARALLEL="${5:-False}"
+TRUST_REMOTE_CODE="${6:-False}"
+# $7 is reserved for system_prompt, see the `lighteval vllm` invocations below
+NUM_GPUS=$(nvidia-smi -L | wc -l)

-
-NUM_GPUS=8
-MODEL=$1
-TASK=$2
-# Check if a third argument is passed, if it is tp then eval with tensor parallelism. Required for larger models
-if [ -n "$3" ] && [ "$3" == "tp" ]; then
-  MODEL_ARGS="pretrained=$MODEL,dtype=float16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
+# Set whether to use tensor parallelism or data parallelism
+if [ "$TENSOR_PARALLEL" = "True" ]; then
+    # use TP to shard model across NUM_GPUS
+    export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
 else
-  MODEL_ARGS="pretrained=$MODEL,dtype=float16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
+    MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8"
 fi
-OUTPUT_DIR=data/evals/$MODEL

+LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard"
+MODEL_NAME=$(echo $MODEL_ID | sed 's/\//_/g') # replaces / with _
+DETAILS_REPO_ID="open-r1/details-$MODEL_NAME"
+OUTPUT_DIR="eval_results/$MODEL_ID/$MODEL_REVISION/$TASK_NAME"
+# We need this flag since we run this script from training jobs that use DeepSpeed and the env vars get propagated, which causes errors during evaluation
+ACCELERATE_USE_DEEPSPEED=false
+# Enable fast downloads
+HF_HUB_ENABLE_HF_TRANSFER=1

-# force crashing on nccl issues like hanging broadcast
-export NCCL_ASYNC_ERROR_HANDLING=1
-# export NCCL_DEBUG=INFO
-# export NCCL_DEBUG_SUBSYS=COLL
-# export NCCL_SOCKET_NTHREADS=1
-# export NCCL_NSOCKS_PERTHREAD=1
-# export CUDA_LAUNCH_BLOCKING=1
-
-# Specific configuration optimized for the Hugging Face Compute Cluster
-# Be ye warned this may not work on other clusters!
-module load cuda/12.1
-
-lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
-    --custom-tasks src/open_r1/evaluate.py \
+echo "Running lighteval script ..."
+echo "Eval results will be saved to $OUTPUT_DIR"
+# Check if "custom" is a substring of TASKS
+if [[ $TASKS == *"custom"* ]]; then
+    echo "Custom task detected. Running custom task evaluation script ..."
+    lighteval vllm $MODEL_ARGS $TASKS \
+    --custom-tasks "src/open_r1/evaluate.py" \
    --use-chat-template \
-    --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \
+    --output-dir $OUTPUT_DIR \
    --save-details \
-    --output-dir $OUTPUT_DIR 
+    ${7:+--system-prompt "$7"}
+else
+    lighteval vllm $MODEL_ARGS $TASKS \
+    --use-chat-template \
+    --output-dir $OUTPUT_DIR \
+    --save-details \
+    ${7:+--system-prompt "$7"}
+fi

+OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \))
+for filepath in $OUTPUT_FILEPATHS; do
+    echo "Uploading $filepath to Hugging Face Hub..."
+    filename=$(basename -- "$filepath")
+    huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename
+done

-echo "END TIME: $(date)"
+echo "Uploading details to Hugging Face Hub..."
+DETAILS_FILEPATHS=$(find $OUTPUT_DIR/details/ -type f \( -name "*.parquet" \))
+echo "DETAILS_FILEPATHS: $DETAILS_FILEPATHS"
+TIMESTAMP=$(date +"%Y-%m-%dT%H-%M-%S")
+python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub_repo_id $DETAILS_REPO_ID --config_name $MODEL_REVISION.$TASK_NAME.$TIMESTAMP
+    
+echo "Cleaning up ..."
+rm -rf $OUTPUT_DIR
+
+echo "Done!"
@@ -14,8 +14,11 @@

 """Custom evaluation tasks for LightEval."""

+import random
+
 from lighteval.metrics.dynamic_metrics import (
    ExprExtractionConfig,
+    IndicesExtractionConfig,
    LatexExtractionConfig,
    multilingual_extractive_match_metric,
 )
@@ -44,6 +47,13 @@ expr_gold_metric = multilingual_extractive_match_metric(
    aggregation_function=max,
 )

+gpqa_metric = multilingual_extractive_match_metric(
+    language=Language.ENGLISH,
+    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+    precision=5,
+)
+

 def prompt_fn(line, task_name: str = None):
    """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically"""
@@ -64,6 +74,23 @@ def aime_prompt_fn(line, task_name: str = None):
    )


+def gpqa_prompt_fn(line, task_name: str = None):
+    """Prompt template adapted from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
+    gold_index = random.randint(0, 3)
+    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
+    choices.insert(gold_index, line["Correct Answer"])
+    query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
+    query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=["A", "B", "C", "D"],
+        gold_index=gold_index,
+        instruction=query,
+    )
+
+
 # Define tasks
 aime24 = LightevalTaskConfig(
    name="aime24",
@@ -93,11 +120,29 @@ math_500 = LightevalTaskConfig(
    metric=[latex_gold_metric],
    version=1,
 )
+gpqa_diamond = LightevalTaskConfig(
+    name="gpqa:diamond",
+    suite=["custom"],
+    prompt_function=gpqa_prompt_fn,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_diamond",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,  # needed for reasoning models like R1
+    metric=[gpqa_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=1,
+)
+

 # Add tasks to the table
 TASKS_TABLE = []
 TASKS_TABLE.append(aime24)
 TASKS_TABLE.append(math_500)
+TASKS_TABLE.append(gpqa_diamond)

 # MODULE LOGIC
 if __name__ == "__main__":
@@ -68,7 +68,7 @@ def accuracy_reward(completions, solution, **kwargs):
                            malformed_operators=False,
                            basic_latex=True,
                            equations=True,
-                            boxed=True,
+                            boxed="all",
                            units=True,
                        ),
                        # Ensures that boxed is tried first
@@ -48,6 +48,7 @@ LIGHTEVAL_TASKS = {}

 register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0)
 register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0)
+register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0)


 def get_lighteval_tasks():
@@ -74,7 +75,7 @@ def run_lighteval_job(
    cmd_args = [
        f"--gres=gpu:{num_gpus}",
        f"--job-name=or1_{benchmark}_{model_name.split('/')[-1]}_{model_revision}",
-        "slurm/eval_callback.slurm",
+        "slurm/evaluate.slurm",
        benchmark,
        f'"{task_list}"',
        model_name,