Skip to content

Commit

Permalink
AIME multiple runs (#48)
Browse files Browse the repository at this point in the history
Co-authored-by: Lingjiao Chen <[email protected]>
  • Loading branch information
lchen001 and Lingjiao Chen authored Dec 13, 2024
1 parent df03f9d commit fea1a07
Show file tree
Hide file tree
Showing 7 changed files with 294 additions and 56 deletions.
2 changes: 2 additions & 0 deletions eureka_ml_insights/data_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
SamplerTransform,
SequenceTransform,
ShuffleColumnsTransform,
MajorityVoteTransform,
TokenCounterTransform,
)

Expand Down Expand Up @@ -73,5 +74,6 @@
ShuffleColumnsTransform,
ColumnMatchMapTransform,
TokenCounterTransform,
MajorityVoteTransform,
NumpyEncoder,
]
31 changes: 29 additions & 2 deletions eureka_ml_insights/data_utils/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
class ShuffleColumnsTransform(MultiColumnTransform):
"""
For a set of columns, shuffles the values across each row of these columns.
Values will be shuffled differently for each row.
Values will be shuffled differently for each row.
This class is meant to be used in MCQ benchmarks to shuffle answer choices
across different letter options (e.g. shuffle what choice maps to 'A' vs 'B' vs 'C').
Expand Down Expand Up @@ -223,7 +223,7 @@ def _find_matching_column(self, row):
if row[col] == row[self.key_col]:
return col
return None # If no match is found (optional)

def validate(self, df: pd.DataFrame):
"""Check that all columns to be transformed are present actually in the data frame."""
extra_columns = set(self.columns + [self.key_col]) - set(df.columns)
Expand Down Expand Up @@ -360,3 +360,30 @@ def transform(self, df: pd.DataFrame, encoding="cl100k_base") -> pd.DataFrame:
token_count_column = f"{column}_token_count"
df[token_count_column] = token_count
return df


@dataclass
class MajorityVoteTransform:
    """Add a per-group majority-vote column computed from a model output column.

    Rows are grouped by ``id_col``; within each group the most frequent
    non-null value of ``model_output_col`` is written to ``majority_vote_col``
    on every row of that group.
    """

    model_output_col: str = "model_output"  # Default column name for model outputs
    id_col: str = "data_point_id"  # Default column name for IDs
    majority_vote_col: str = "majority_vote"  # Column that receives the majority vote

    @staticmethod
    def _majority(values: pd.Series):
        """Return the most frequent non-null value in ``values``, or ``pd.NA`` if there is none."""
        # Compute the mode a single time; the previous lambda evaluated it twice per group.
        modes = values.dropna().mode()
        if modes.empty:
            return pd.NA
        # ``Series.mode`` returns every tied value; the first entry is used as the tie-breaker.
        return modes.iloc[0]

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the dataframe by calculating the majority vote of model_output_col per id_col.
        NaN values in model_output_col are dropped before calculating the majority vote; a group
        with no remaining values gets pd.NA.
        Args:
            df (pd.DataFrame): Input dataframe containing model_output_col and id_col.
        Returns:
            pd.DataFrame: The same dataframe object, with majority_vote_col added (the input is
            modified in place and returned).
        """
        # Group by the ID column and broadcast each group's majority vote back onto its rows.
        grouped_outputs = df.groupby(self.id_col)[self.model_output_col]
        df[self.majority_vote_col] = grouped_outputs.transform(self._majority)

        return df
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
You are a genius math graduate student solving math problems from the AIME competition. Follow the following steps.

Step 1: Reflect on what the problem is asking.
Step 2: Think step by step and explain your reasoning in detail.
Step 3: Provide your final answer in the format: 'Final Answer: [numeric value]'. Dont box it, just provide the answer directly at the end.

{{prompt}}
18 changes: 17 additions & 1 deletion eureka_ml_insights/user_configs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
from .aime import AIME_PIPELINE
from .aime import (
AIME_PIPELINE,
AIME_PIPELINE5Run,
AIME_PIPELINE16Run,
AIME_PIPELINE32Run,
AIME_PIPELINE64Run,
AIME_PIPELINE256Run,
AIME_PIPELINE512Run,
AIME_PIPELINE1024Run,
)
from .dna import DNA_PIPELINE
from .drop import Drop_Experiment_Pipeline
from .flenqa import FlenQA_Experiment_Pipeline
Expand Down Expand Up @@ -110,4 +119,11 @@
IFEval_Nondeterminism,
Kitab_Nondeterminism,
AIME_PIPELINE,
AIME_PIPELINE5Run,
AIME_PIPELINE16Run,
AIME_PIPELINE32Run,
AIME_PIPELINE64Run,
AIME_PIPELINE256Run,
AIME_PIPELINE512Run,
AIME_PIPELINE1024Run,
]
208 changes: 193 additions & 15 deletions eureka_ml_insights/user_configs/aime.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,38 @@
import os
from typing import Any

from eureka_ml_insights.configs import (
AggregatorConfig,
DataProcessingConfig,
DataSetConfig,
EvalReportingConfig,
ExperimentConfig,
InferenceConfig,
MetricConfig,
ModelConfig,
PipelineConfig,
PromptProcessingConfig,
)
from eureka_ml_insights.core import DataProcessing, Inference, PromptProcessing
from eureka_ml_insights.core.eval_reporting import EvalReporting
from eureka_ml_insights.data_utils import (
AddColumn,
ColumnRename,
DataReader,
HFDataReader,
SamplerTransform,
MajorityVoteTransform,
MultiplyTransform,
SequenceTransform,
)
from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
from eureka_ml_insights.data_utils.data import DataLoader
from eureka_ml_insights.metrics.metrics_base import ExactMatch
from eureka_ml_insights.metrics.reports import CountAggregator

from eureka_ml_insights.configs import (
AggregatorConfig,
DataProcessingConfig,
DataSetConfig,
EvalReportingConfig,
InferenceConfig,
MetricConfig,
ModelConfig,
PipelineConfig,
PromptProcessingConfig,
from eureka_ml_insights.metrics.reports import (
BiLevelCountAggregator,
CountAggregator,
)
from eureka_ml_insights.configs import ExperimentConfig

# from eureka_ml_insights.data_utils.transform import MajorityVoteTransform


class AIME_PIPELINE(ExperimentConfig):
Expand Down Expand Up @@ -58,7 +63,7 @@ def configure_pipeline(
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_1a.jinja"
os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_1clean.jinja"
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
)
Expand All @@ -73,6 +78,7 @@ def configure_pipeline(
),
output_dir=os.path.join(self.log_dir, "inference_result"),
resume_from=resume_from,
max_concurrent=10,
)
# post process the response to extract the answer
self.data_post_processing = DataProcessingConfig(
Expand Down Expand Up @@ -124,13 +130,185 @@ def configure_pipeline(
output_dir=os.path.join(self.log_dir, "eval_report"),
)

# Aggregate the results by a majority vote
# First, let us perform majority_vote
self.data_post_processing_addmv = DataProcessingConfig(
component_type=DataProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": SequenceTransform(
[
ColumnRename(
name_mapping={
"model_output": "raw_output",
}
),
AddColumn("model_output"),
AIMEExtractAnswer("raw_output", "model_output"),
MajorityVoteTransform(id_col="ID"),
ColumnRename(
name_mapping={
"model_output": "model_output_onerun",
"majority_vote": "model_output",
}
),
]
),
},
),
output_dir=os.path.join(self.log_dir, "data_addmv_output"),
)
# Second, compute exact match
self.postevalprocess_comp = EvalReportingConfig(
component_type=EvalReporting,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.data_post_processing_addmv.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
metric_config=MetricConfig(ExactMatch),
aggregator_configs=[
AggregatorConfig(
BiLevelCountAggregator,
{
"column_names": [
"ExactMatch_result",
],
"first_groupby": "ID",
"filename_base": "MajorityVote",
"normalize": True,
},
),
],
output_dir=os.path.join(self.log_dir, "eval_report_majorityVote"),
)

# Configure the pipeline
return PipelineConfig(
[
self.data_processing_comp,
self.inference_comp,
self.data_post_processing,
self.evalreporting_comp,
self.data_post_processing_addmv,
self.postevalprocess_comp,
],
self.log_dir,
)


class _AIMERepeatedRunPipeline(AIME_PIPELINE):
    """Shared implementation for the AIME configs that run the benchmark multiple times.

    Concrete subclasses only set ``n_repeats``; the pipeline itself is built by
    ``AIME_PIPELINE.configure_pipeline`` and then extended with a ``MultiplyTransform``.
    """

    # Value passed as MultiplyTransform(n_repeats=...); must be set by each concrete subclass.
    n_repeats: int

    def configure_pipeline(
        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
    ) -> PipelineConfig:
        """Build the base AIME pipeline and append a MultiplyTransform to its data-processing step.

        Args:
            model_config: Model configuration forwarded to the parent pipeline.
            resume_from: Optional value forwarded to the parent pipeline unchanged.
            **kwargs: Accepted for signature compatibility.
        Returns:
            The PipelineConfig produced by the parent ``configure_pipeline``.
        """
        # NOTE(review): kwargs are accepted but not forwarded to the parent, as in the original code.
        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
        # data preprocessing: extend the transform sequence configured by the parent
        # (presumably this repeats every data point n_repeats times -- confirm in MultiplyTransform).
        transform_sequence = self.data_processing_comp.data_reader_config.init_args["transform"]
        transform_sequence.transforms.append(MultiplyTransform(n_repeats=self.n_repeats))
        return pipeline


class AIME_PIPELINE5Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 5 repeated times"""

    n_repeats = 5


class AIME_PIPELINE16Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 16 repeated times"""

    n_repeats = 16


class AIME_PIPELINE32Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 32 repeated times"""

    n_repeats = 32


class AIME_PIPELINE64Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 64 repeated times"""

    n_repeats = 64


# NOTE(review): unlike its siblings, this class does not appear in the
# user_configs/__init__.py import list shown in this commit -- confirm whether it should be exported.
class AIME_PIPELINE128Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 128 repeated times"""

    n_repeats = 128


class AIME_PIPELINE256Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 256 repeated times"""

    n_repeats = 256


class AIME_PIPELINE512Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 512 repeated times"""

    n_repeats = 512


class AIME_PIPELINE1024Run(_AIMERepeatedRunPipeline):
    """This class specifies the config for running AIME benchmark 1024 repeated times"""

    n_repeats = 1024
Loading

0 comments on commit fea1a07

Please sign in to comment.