microsoft · lchen001 · Dec 17, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/eureka_ml_insights/metrics/__init__.py b/eureka_ml_insights/metrics/__init__.py
@@ -28,6 +28,7 @@
     SpatialAndLayoutReasoningMetric,
 )
 
+from .aime_metrics import NumericMatch
 __all__ = [
     Metric,
     ClassicMetric,
@@ -52,4 +53,5 @@
     SumAggregator,
     MMMUMetric,
     MaxTokenF1ScoreMetric,
+    NumericMatch,
 ]
diff --git a/eureka_ml_insights/metrics/aime_metrics.py b/eureka_ml_insights/metrics/aime_metrics.py
@@ -0,0 +1,20 @@
+from tqdm.auto import tqdm
+
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+import numpy as np
+
+class NumericMatch(ClassicMetric):
+    """Compare the answer and the target as floating-point numbers.
+
+    Yields "correct" if they differ by less than ``eps``, "incorrect" otherwise, and
+    "none" when the row is not valid or either value cannot be converted to float."""
+    eps = 1e-6  # absolute tolerance on |target - answer|
+    def __evaluate__(self, answer_text, target_text, is_valid):
+        if not is_valid:
+            return "none"
+        try:
+            diff = np.abs(float(target_text) - float(answer_text))
+        except (TypeError, ValueError, OverflowError):  # conversion failures only; never hide KeyboardInterrupt/SystemExit
+            return "none"
+        return "correct" if diff < self.eps else "incorrect"
diff --git a/eureka_ml_insights/prompt_templates/aime_templates/Template_1direct.jinja b/eureka_ml_insights/prompt_templates/aime_templates/Template_1direct.jinja
@@ -0,0 +1,5 @@
+You are a genius math graduate student solving math problems from the AIME competition. 
+
+Provide your final answer in the format: 'Final Answer: [numeric value]'. Don't box it, just provide the answer directly at the end.
+
+{{prompt}}
diff --git a/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja b/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
@@ -0,0 +1,19 @@
+You are a genius math expert in understanding math questions. Please read the following math question, and then decide which math categories it falls into.
+
+Your judgment should be one or more of the following:
+
+arithmetic
+algebra
+counting
+geometry
+number theory
+probability
+other topics
+
+Do not generate any other text except one or more of the above topics. For multiple topics, separate them by commas.
+
+----------
+Original question:
+{{prompt}}
+----------
+Your judgment:
diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
@@ -4,9 +4,13 @@
     AIME_PIPELINE16Run,
     AIME_PIPELINE32Run,
     AIME_PIPELINE64Run,
+    AIME_PIPELINE128Run,
     AIME_PIPELINE256Run,
     AIME_PIPELINE512Run,
     AIME_PIPELINE1024Run,
+    AIME_PIPELINE5Run,
+    AIME_PIPELINEDirect5Run,
+    AIME_PIPELINETag,
 )
 from .dna import DNA_PIPELINE
 from .drop import Drop_Experiment_Pipeline

diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
@@ -23,10 +23,11 @@
     MajorityVoteTransform,
     MultiplyTransform,
     SequenceTransform,
+    SamplerTransform
 )
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader
-from eureka_ml_insights.metrics.metrics_base import ExactMatch
+from eureka_ml_insights.metrics.aime_metrics import NumericMatch
 from eureka_ml_insights.metrics.reports import (
     BiLevelCountAggregator,
     CountAggregator,
@@ -78,7 +79,7 @@ def configure_pipeline(
             ),
             output_dir=os.path.join(self.log_dir, "inference_result"),
             resume_from=resume_from,
-            max_concurrent=10,
+            max_concurrent=1,
         )
         # post process the response to extract the answer
         self.data_post_processing = DataProcessingConfig(
@@ -114,16 +115,16 @@ def configure_pipeline(
                     "format": ".jsonl",
                 },
             ),
-            metric_config=MetricConfig(ExactMatch),
+            metric_config=MetricConfig(NumericMatch),
             aggregator_configs=[
                 AggregatorConfig(
                     CountAggregator,
                     {
                         "column_names": [
-                            "ExactMatch_result",
+                            "NumericMatch_result",
                         ],
                         "group_by": "Year",
-                        "filename_base": "ExactMatch_GroupBy",
+                        "filename_base": "NumericMatch_GroupBy",
                     },
                 ),
             ],
@@ -171,13 +172,13 @@ def configure_pipeline(
                     "format": ".jsonl",
                 },
             ),
-            metric_config=MetricConfig(ExactMatch),
+            metric_config=MetricConfig(NumericMatch),
             aggregator_configs=[
                 AggregatorConfig(
                     BiLevelCountAggregator,
                     {
                         "column_names": [
-                            "ExactMatch_result",
+                            "NumericMatch_result",
                         ],
                         "first_groupby": "ID",
                         "filename_base": "MajorityVote",
@@ -215,6 +216,19 @@ def configure_pipeline(
         )
         return pipeline
 
+class AIME_PIPELINEDirect5Run(AIME_PIPELINE5Run):
+    """Variant of AIME_PIPELINE5Run that swaps in the direct-answer prompt template (Template_1direct.jinja)."""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        # Let the parent class build the pipeline, then override only the prompt template path.
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        config_dir = os.path.dirname(__file__)
+        template_path = os.path.join(config_dir, "../prompt_templates/aime_templates/Template_1direct.jinja")
+        self.data_processing_comp.prompt_template_path = template_path
+        return pipeline
+
 
 class AIME_PIPELINE16Run(AIME_PIPELINE):
     """This class specifies the config for running AIME benchmark 5 repeated times"""
@@ -312,3 +326,20 @@ def configure_pipeline(
             MultiplyTransform(n_repeats=1024)
         )
         return pipeline
+
+
+class AIME_PIPELINETag(AIME_PIPELINE):
+    """Variant of AIME_PIPELINE that swaps in the topic-tagging prompt template (Template_tag1.jinja)."""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        # Each query is tagged with one or more topics: arithmetic, algebra, counting, geometry, number theory,
+        # probability, and other topics. The list follows the AIME description on the Art of Problem Solving wiki:
+        # https://artofproblemsolving.com/wiki/index.php/American_Invitational_Mathematics_Examination
+        # In their own words, "The AIME tests mathematical problem solving with arithmetic, algebra, counting,
+        # geometry, number theory, and probability and other secondary school math topics".
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        template_rel = "../prompt_templates/aime_templates/Template_tag1.jinja"
+        self.data_processing_comp.prompt_template_path = os.path.join(os.path.dirname(__file__), template_rel)
+        return pipeline