Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

numeric match and topic tag #72

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions eureka_ml_insights/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
SpatialAndLayoutReasoningMetric,
)

from .aime_metrics import NumericMatch
__all__ = [
Metric,
ClassicMetric,
Expand All @@ -52,4 +53,5 @@
SumAggregator,
MMMUMetric,
MaxTokenF1ScoreMetric,
NumericMatch,
]
20 changes: 20 additions & 0 deletions eureka_ml_insights/metrics/aime_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from tqdm.auto import tqdm

from eureka_ml_insights.metrics.metrics_base import ClassicMetric

import numpy as np

class NumericMatch(ClassicMetric):
    """Metric that checks whether a predicted number matches the target number.

    Both values are converted with ``float`` and compared using an absolute
    tolerance of ``eps``.
    """

    # Largest absolute difference below which two values count as a match.
    eps = 1e-6

    def __evaluate__(self, answer_text, target_text, is_valid):
        """Compare one prediction against its target.

        Args:
            answer_text: Predicted value; anything ``float()`` accepts.
            target_text: Ground-truth value; anything ``float()`` accepts.
            is_valid: When falsy, the comparison is skipped entirely.

        Returns:
            ``"none"`` when ``is_valid`` is falsy or either value cannot be
            converted to a float, ``"correct"`` when the absolute difference
            is below ``eps``, and ``"incorrect"`` otherwise.
        """
        if not is_valid:
            return "none"
        try:
            diff = abs(float(target_text) - float(answer_text))
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures mean "no usable answer"; a bare
            # ``except`` would also swallow KeyboardInterrupt/SystemExit.
            return "none"
        return "correct" if diff < self.eps else "incorrect"
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
You are a genius math graduate student solving math problems from the AIME competition.

Provide your final answer in the format: 'Final Answer: [numeric value]'. Don't box it; just provide the answer directly at the end.

{{prompt}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
You are a genius math expert in understanding math questions. Please read the following math question, and then decide which math categories it falls into.

Your judgment should be one or more of the following:

arithmetic
algebra
counting
geometry
number theory
probability
other topics

Do not generate any text other than one or more of the above topics. For multiple topics, separate them with commas.

----------
Original question:
{{prompt}}
----------
Your judgment:
lchen001 marked this conversation as resolved.
Show resolved Hide resolved
4 changes: 4 additions & 0 deletions eureka_ml_insights/user_configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@
AIME_PIPELINE16Run,
AIME_PIPELINE32Run,
AIME_PIPELINE64Run,
AIME_PIPELINE128Run,
AIME_PIPELINE256Run,
AIME_PIPELINE512Run,
AIME_PIPELINE1024Run,
AIME_PIPELINE5Run,
AIME_PIPELINEDirect5Run,
AIME_PIPELINETag,
)
from .dna import DNA_PIPELINE
from .drop import Drop_Experiment_Pipeline
Expand Down
45 changes: 38 additions & 7 deletions eureka_ml_insights/user_configs/aime.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@
MajorityVoteTransform,
MultiplyTransform,
SequenceTransform,
SamplerTransform
)
from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
from eureka_ml_insights.data_utils.data import DataLoader
from eureka_ml_insights.metrics.metrics_base import ExactMatch
from eureka_ml_insights.metrics.aime_metrics import NumericMatch
from eureka_ml_insights.metrics.reports import (
BiLevelCountAggregator,
CountAggregator,
Expand Down Expand Up @@ -78,7 +79,7 @@ def configure_pipeline(
),
output_dir=os.path.join(self.log_dir, "inference_result"),
resume_from=resume_from,
max_concurrent=10,
max_concurrent=1,
)
# post process the response to extract the answer
self.data_post_processing = DataProcessingConfig(
Expand Down Expand Up @@ -114,16 +115,16 @@ def configure_pipeline(
"format": ".jsonl",
},
),
metric_config=MetricConfig(ExactMatch),
metric_config=MetricConfig(NumericMatch),
aggregator_configs=[
AggregatorConfig(
CountAggregator,
{
"column_names": [
"ExactMatch_result",
"NumericMatch_result",
],
"group_by": "Year",
"filename_base": "ExactMatch_GroupBy",
"filename_base": "NumericMatch_GroupBy",
},
),
],
Expand Down Expand Up @@ -171,13 +172,13 @@ def configure_pipeline(
"format": ".jsonl",
},
),
metric_config=MetricConfig(ExactMatch),
metric_config=MetricConfig(NumericMatch),
aggregator_configs=[
AggregatorConfig(
BiLevelCountAggregator,
{
"column_names": [
"ExactMatch_result",
"NumericMatch_result",
],
"first_groupby": "ID",
"filename_base": "MajorityVote",
Expand Down Expand Up @@ -215,6 +216,19 @@ def configure_pipeline(
)
return pipeline

class AIME_PIPELINEDirect5Run(AIME_PIPELINE5Run):
    """AIME config that reuses AIME_PIPELINE5Run with the direct prompt template.

    The pipeline built by the parent class is returned as is; only the prompt
    template path of the data processing component is replaced with
    ``Template_1direct.jinja``.
    """

    def configure_pipeline(
        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
    ) -> PipelineConfig:
        """Build the parent pipeline, then point it at the direct template."""
        parent_pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
        # Swap the prompt template used during data preprocessing.
        direct_template = os.path.join(
            os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_1direct.jinja"
        )
        self.data_processing_comp.prompt_template_path = direct_template
        return parent_pipeline


class AIME_PIPELINE16Run(AIME_PIPELINE):
"""This class specifies the config for running AIME benchmark 5 repeated times"""
Expand Down Expand Up @@ -312,3 +326,20 @@ def configure_pipeline(
MultiplyTransform(n_repeats=1024)
)
return pipeline


class AIME_PIPELINETag(AIME_PIPELINE):
    """This class specifies the config for running the AIME pipeline with the topic-tagging prompt template (Template_tag1.jinja)."""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update comment so it reflects the functionality of the class


def configure_pipeline(
    self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
) -> PipelineConfig:
    """Build the parent AIME pipeline and switch it to the tagging prompt template.

    Each query is tagged with one or more topics: arithmetic, algebra,
    counting, geometry, number theory, probability, and other topics.
    These follow the description on the Art of Problem Solving wiki:
    https://artofproblemsolving.com/wiki/index.php/American_Invitational_Mathematics_Examination
    In their own words, "The AIME tests mathematical problem solving with
    arithmetic, algebra, counting, geometry, number theory, and probability
    and other secondary school math topics".
    """
    parent_pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
    # Replace the prompt template used during data preprocessing.
    tag_template = os.path.join(
        os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_tag1.jinja"
    )
    self.data_processing_comp.prompt_template_path = tag_template
    # NOTE(review): the parent's pipeline is returned unchanged, so every
    # stage it defines still runs with the tagging prompt - confirm intended.
    return parent_pipeline
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the class inherits from the original AIME_PIPELINE, it will continue to run the rest of the AIME_PIPELINE, but with the tagging prompt. For example, this means that it will also try to extract an answer and generate the report. There are two options here: 1) do not inherit from AIME_PIPELINE, or 2) inherit from AIME_PIPELINE but return only the components you need in the pipeline. For example,

return PipelineConfig(
[
self.data_processing_comp,
self.inference_comp,
self.data_post_processing,
],
self.log_dir,
)

Case 2 also requires changing the answer extractor, as the marker is different here.

Loading