microsoft · lchen001 · Dec 17, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/eureka_ml_insights/metrics/__init__.py b/eureka_ml_insights/metrics/__init__.py
@@ -28,6 +28,7 @@
     SpatialAndLayoutReasoningMetric,
 )
 
+from .aime_metrics import NumericMatch
 __all__ = [
     Metric,
     ClassicMetric,
@@ -52,4 +53,5 @@
     SumAggregator,
     MMMUMetric,
     MaxTokenF1ScoreMetric,
+    NumericMatch,
 ]
diff --git a/eureka_ml_insights/metrics/aime_metrics.py b/eureka_ml_insights/metrics/aime_metrics.py
@@ -0,0 +1,20 @@
+from tqdm.auto import tqdm
+
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+import numpy as np
+
+class NumericMatch(ClassicMetric):
+    """This class checks for a numeric match."""
+    eps = 1e-6
+    def __evaluate__(self, answer_text, target_text, is_valid):
+        if not is_valid:
+            return "none"
+        try:
+            diff = np.abs(float(target_text)-float(answer_text))
+        except:
+            return "none"
+        if diff<self.eps:
+            return "correct"
+        else:
+            return "incorrect"
diff --git a/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja b/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
@@ -0,0 +1,19 @@
+You are a genius math expert in understanding math questions. Please read the following math question, and then decide which math category it falls into.
+
+Your judgment should be one of the following:
+
+arithmetic
+algebra
+counting
+geometry
+number theory
+probability
+other topics
+
+Do not generate any text other than one of the above topics.
+
+----------
+Original question:
+{{prompt}}
+----------
+Your judgment:
diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
@@ -7,6 +7,7 @@
     AIME_PIPELINE256Run,
     AIME_PIPELINE512Run,
     AIME_PIPELINE1024Run,
+    AIME_PIPELINETag,
 )
 from .dna import DNA_PIPELINE
 from .drop import Drop_Experiment_Pipeline

diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
@@ -27,6 +27,8 @@
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader
 from eureka_ml_insights.metrics.metrics_base import ExactMatch
+from eureka_ml_insights.metrics.aime_metrics import NumericMatch
+
 from eureka_ml_insights.metrics.reports import (
     BiLevelCountAggregator,
     CountAggregator,
@@ -114,16 +116,16 @@ def configure_pipeline(
                     "format": ".jsonl",
                 },
             ),
-            metric_config=MetricConfig(ExactMatch),
+            metric_config=MetricConfig(NumericMatch),
             aggregator_configs=[
                 AggregatorConfig(
                     CountAggregator,
                     {
                         "column_names": [
-                            "ExactMatch_result",
+                            "NumericMatch_result",
                         ],
                         "group_by": "Year",
-                        "filename_base": "ExactMatch_GroupBy",
+                        "filename_base": "NumericMatch_GroupBy",
                     },
                 ),
             ],
@@ -171,13 +173,13 @@ def configure_pipeline(
                     "format": ".jsonl",
                 },
             ),
-            metric_config=MetricConfig(ExactMatch),
+            metric_config=MetricConfig(NumericMatch),
             aggregator_configs=[
                 AggregatorConfig(
                     BiLevelCountAggregator,
                     {
                         "column_names": [
-                            "ExactMatch_result",
+                            "NumericMatch_result",
                         ],
                         "first_groupby": "ID",
                         "filename_base": "MajorityVote",
@@ -312,3 +314,24 @@ def configure_pipeline(
             MultiplyTransform(n_repeats=1024)
         )
         return pipeline
+
+
+class AIME_PIPELINETag(AIME_PIPELINE):
+    """This class specifies the config for running AIME benchmark 5 repeated times"""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        '''
+        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
+            SamplerTransform(random_seed=0,
+                             sample_count=10,
+                              )
+        )
+        '''
+        # data preprocessing
+        self.data_processing_comp.prompt_template_path=os.path.join(
+                os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_tag1.jinja"
+            )
+        return pipeline