Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding new benchmark - BA-Calendar #58

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
10 changes: 9 additions & 1 deletion eureka_ml_insights/configs/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@
"key_vault_url": None,
}

# Model configuration for the "gemini-exp-1114" model, served through
# GeminiModel and authenticated with the shared GEMINI_SECRET_KEY_PARAMS.
GEMINI_EXP_1114_PRO_CONFIG = ModelConfig(
GeminiModel,
{
"model_name": "gemini-exp-1114",
"secret_key_params": GEMINI_SECRET_KEY_PARAMS,
},
)

GEMINI_V15_PRO_CONFIG = ModelConfig(
GeminiModel,
{
Expand Down Expand Up @@ -198,4 +206,4 @@
},
"model_name": "Mistral-large-2407",
},
)
)
287 changes: 287 additions & 0 deletions eureka_ml_insights/metrics/ba_calendar_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
# This file was authored by BenchAgents authors and is being reused under the MIT license.
# All code in this file is directly copied from the original source repository.
# https://github.com/microsoft/benchagents

import ast
import json
import re
import numpy as np
from datetime import datetime, timedelta

import pandas as pd

from eureka_ml_insights.metrics.metrics_base import CompositeMetric

# Helper functions
# Expected answer shape: "<Weekday> H:MM-H:MM" (hours 0-23, optional leading zero).
_SOLUTION_PATTERN = r"^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) ([0-9]|[01]\d|2[0-3]):[0-5]\d-([0-9]|[01]\d|2[0-3]):[0-5]\d$"


def is_formatted(solution):
    """Return True when *solution* looks like "<Weekday> <start>-<end>".

    Args:
        solution: candidate answer string.

    Returns:
        bool: whether the string matches the weekday/time-range pattern.
    """
    matched = re.match(_SOLUTION_PATTERN, solution)
    return matched is not None

def generate_time_slots(start_time, end_time, granularity):
    """Split [start_time, end_time] into consecutive fixed-width slots.

    Args:
        start_time: datetime at which the first slot begins.
        end_time: datetime that no slot may extend past.
        granularity: accepted for interface compatibility; see the note below.

    Returns:
        list of (slot_start, slot_end) datetime tuples, in chronological order.
    """
    # NOTE(review): the caller-supplied granularity is ignored and every slot is
    # 5 minutes wide, exactly as before. Confirm whether this override is
    # intentional before honouring the argument.
    step = timedelta(minutes=5)
    slots = []
    slot_start = start_time
    slot_end = slot_start + step
    while slot_end <= end_time:
        slots.append((slot_start, slot_end))
        slot_start, slot_end = slot_end, slot_end + step
    return slots

def parse_time_block(time_block):
    """Parse an "HH:MM-HH:MM" string into its two endpoints.

    Args:
        time_block: string holding a start and an end time separated by '-'.

    Returns:
        tuple (start, end) of datetimes produced by strptime with "%H:%M".
    """
    time_format = "%H:%M"
    start_text, end_text = time_block.split('-')
    return datetime.strptime(start_text, time_format), datetime.strptime(end_text, time_format)

def filter_slots_by_duration(time_slots, duration):
    """Build candidate windows that accumulate at least *duration* minutes.

    For every starting slot, later slots are added in list order until their
    summed length reaches the requested duration; the window then spans from
    the starting slot's start to the last added slot's end. Starting slots
    that never reach the duration contribute nothing.

    Args:
        time_slots: sequence of (start, end) datetime pairs.
        duration: required length in minutes.

    Returns:
        list of (window_start, window_end) datetime tuples.
    """
    if len(time_slots) == 0:
        return []

    required = timedelta(minutes=duration)
    windows = []
    for first_idx, first_slot in enumerate(time_slots):
        covered = timedelta()
        for slot in time_slots[first_idx:]:
            covered += slot[1] - slot[0]
            if covered >= required:
                windows.append((first_slot[0], slot[1]))
                break
    return windows

def filter_slots_by_constraints(time_slots, constraints, day):
    """Keep only the slots that satisfy the scheduling constraints.

    Args:
        time_slots: iterable of (start, end) datetime pairs.
        constraints: mapping with the keys 'no_meetings_before',
            'no_meetings_after', 'no_meetings_on_weekends' and
            'no_meetings_during_specific_times'; falsy values disable a rule.
        day: weekday name the slots belong to.

    Returns:
        list of the slots that pass every active rule, in input order.
    """
    kept = []
    for slot in time_slots:
        slot_start, slot_end = slot

        earliest_hour = constraints['no_meetings_before']
        if earliest_hour and slot_start < datetime.strptime(f"{earliest_hour}:00", "%H:%M"):
            continue

        # NOTE(review): a slot ending exactly at the cut-off is rejected here
        # (>=); confirm this strictness is intended.
        latest_hour = constraints['no_meetings_after']
        if latest_hour and slot_end >= datetime.strptime(f"{latest_hour}:00", "%H:%M"):
            continue

        if constraints['no_meetings_on_weekends'] and day in ('Saturday', 'Sunday'):
            continue

        blocked_range = constraints['no_meetings_during_specific_times']
        if blocked_range:
            blocked_start, blocked_end = parse_time_block(blocked_range)
            # Reject any slot that overlaps the blocked interval.
            if slot_start < blocked_end and slot_end > blocked_start:
                continue

        kept.append(slot)
    return kept

class BACalendarMetric(CompositeMetric):
    """
    Composite metric that checks a scheduling response against each constraint.

    Every ``check_*_programmatic`` method returns a one-entry dict whose value
    is 1 (constraint satisfied), 0 (violated, or the response is not a
    well-formed time slot) or the string 'NA' (constraint not active for the
    instance).
    """

    # Day names treated as weekend days by the no-weekends check.
    _WEEKEND_DAYS = ('Saturday', 'Sunday')

    def __init__(self):
        super().__init__()
        # Literal answer the prompt asks for when no slot exists.
        self.no_solution_response = "No common time slot available"

    def __evaluate__(self, row):
        """Return the dict of per-constraint results for one data row."""
        return dict(self.run_programmatic_tests(row))

    def run_programmatic_tests(self, instance):
        """Run every programmatic check on ``instance['model_output']``.

        Returns:
            dict with one entry per check plus 'all_correct' (0 if any check
            returned 0, else 1) and 'fraction_passed' (mean of the integer
            check results; 'NA' entries are excluded).
        """
        solution = instance['model_output'].strip('"').strip('`').strip('\n')

        # Fix: the flag used to be set to 1 only when the answer was NOT
        # well-formed and was never set otherwise. It now reports 1 for a
        # well-formed answer and 0 for anything else.
        result = {'format_programmatic': int(is_formatted(solution))}

        checks = (
            self.check_availability_programmatic,
            self.check_meeting_duration_programmatic,
            self.check_buffer_time_programmatic,
            self.check_no_weekends_programmatic,
            self.check_time_restrictions_programmatic,
            self.check_specific_times_programmatic,
            self.check_priority_programmatic,
        )
        for check in checks:
            result.update(check(instance, solution))

        # 'NA' entries are strings, so the int filter drops them.
        scored = [value for value in result.values() if isinstance(value, int)]
        result['all_correct'] = int(all(value != 0 for value in scored))
        result['fraction_passed'] = np.mean(scored)
        return result

    def is_formatted(self, solution):
        """Return True when the constraint checks can be run on *solution*.

        That requires an actual time slot: the "no solution" answer and any
        string not matching the expected pattern both yield False.
        """
        return solution != self.no_solution_response and is_formatted(solution)

    @staticmethod
    def _load_availability(instance):
        """Decode the availability mapping stored as a quoted string in the metadata."""
        return json.loads(instance['metadata']['availability'].replace("'", '"'))

    @staticmethod
    def _all_participants_free(availability, day, start_time, end_time):
        """Return True if every participant has one block on *day* covering [start_time, end_time]."""
        for schedule in availability.values():
            if day not in schedule:
                return False
            blocks = (parse_time_block(block) for block in schedule[day])
            if not any(b_start <= start_time and b_end >= end_time for b_start, b_end in blocks):
                return False
        return True

    def check_availability_programmatic(self, instance, solution):
        """Check that the proposed slot lies inside an available block of every participant."""
        if not instance['constraints'].get('availability', True):
            return {'availability_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'availability_programmatic_check': 0}

        day, time_range = solution.split()
        start_time, end_time = parse_time_block(time_range)
        availability = self._load_availability(instance)
        all_available = self._all_participants_free(availability, day, start_time, end_time)
        return {'availability_programmatic_check': int(all_available)}

    def check_meeting_duration_programmatic(self, instance, solution):
        """Check that the proposed slot lasts exactly the requested number of minutes."""
        if not instance['constraints'].get('meeting_duration', True):
            return {'meeting_duration_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'meeting_duration_programmatic_check': 0}

        _, time_range = solution.split()
        start_time, end_time = parse_time_block(time_range)
        meeting_duration = (end_time - start_time).total_seconds() / 60
        expected_duration = instance['constraints']['meeting_duration']
        return {'meeting_duration_programmatic_check': int(meeting_duration == expected_duration)}

    def check_buffer_time_programmatic(self, instance, solution):
        """Check that every participant is also free for the buffer before and after the slot."""
        constraints = instance['constraints']
        if not constraints.get('buffer_time_before_and_after_meeting', True):
            return {'buffer_time_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'buffer_time_programmatic_check': 0}

        buffer_delta = timedelta(minutes=constraints['buffer_time_before_and_after_meeting'])
        day, time_range = solution.split()
        start_time, end_time = parse_time_block(time_range)
        availability = self._load_availability(instance)
        # The slot widened by the buffer on both sides must fit in one block.
        respected = self._all_participants_free(
            availability, day, start_time - buffer_delta, end_time + buffer_delta
        )
        return {'buffer_time_programmatic_check': int(respected)}

    def check_no_weekends_programmatic(self, instance, solution):
        """Check that the proposed day is not Saturday or Sunday."""
        if not instance['constraints'].get('no_meetings_on_weekends', True):
            return {'no_weekends_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'no_weekends_programmatic_check': 0}

        day, _ = solution.split()
        # Fix: datetime.strptime(day, '%A') discards the parsed weekday and
        # always returns 1900-01-01 (a Monday), so .weekday() was always 0 and
        # weekend answers passed. Compare the day name directly instead.
        return {'no_weekends_programmatic_check': int(day not in self._WEEKEND_DAYS)}

    def check_time_restrictions_programmatic(self, instance, solution):
        """Check the slot against the 'no_meetings_before' / 'no_meetings_after' hours."""
        constraints = instance['constraints']
        if not constraints.get('no_meetings_before', True) and not constraints.get('no_meetings_after', True):
            return {'time_restrictions_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'time_restrictions_programmatic_check': 0}

        _, time_range = solution.split()
        start_time, end_time = parse_time_block(time_range)

        no_meetings_before = constraints.get('no_meetings_before')
        if no_meetings_before:
            earliest = datetime.strptime(f"{no_meetings_before}:00", "%H:%M")
            if start_time < earliest:
                return {'time_restrictions_programmatic_check': 0}

        no_meetings_after = constraints.get('no_meetings_after')
        if no_meetings_after:
            latest = datetime.strptime(f"{no_meetings_after}:00", '%H:%M')
            if end_time > latest:
                return {'time_restrictions_programmatic_check': 0}

        return {'time_restrictions_programmatic_check': 1}

    def check_priority_programmatic(self, instance, solution):
        """Check that a high-priority meeting is placed at the earliest feasible slot.

        Days are visited in the order of ``params['days_of_week']``; the first
        day with a slot common to all participants defines the expected answer.
        """
        constraints = instance['constraints']
        if not constraints.get('high_priority_meeting', False):
            return {'priority_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'priority_programmatic_check': 0}

        params = instance['params']
        buffer_time = constraints['buffer_time_before_and_after_meeting'] or 0
        buffer_delta = timedelta(minutes=buffer_time)
        # Candidate windows include the buffer on both sides of the meeting.
        window_minutes = constraints['meeting_duration'] + 2 * buffer_time
        availability = self._load_availability(instance)

        solution_day, solution_range = solution.split()
        solution_start, solution_end = parse_time_block(solution_range)

        result = False
        for day in params['days_of_week']:  # update this post cleaning up data!
            common_windows = None
            for schedule in availability.values():
                if day not in schedule:
                    # Fix: a participant with no entry for this day is
                    # unavailable (as in the availability and buffer checks),
                    # so the day has no common window.
                    common_windows = set()
                    break
                participant_windows = []
                for block in schedule[day]:
                    block_start, block_end = parse_time_block(block)
                    windows = generate_time_slots(block_start, block_end, params['granularity'])
                    windows = filter_slots_by_duration(windows, window_minutes)
                    windows = filter_slots_by_constraints(windows, constraints, day=day)
                    participant_windows.extend(windows)
                if common_windows is None:
                    common_windows = set(participant_windows)
                else:
                    common_windows = common_windows.intersection(participant_windows)

            if common_windows:
                earliest_window = min(common_windows)
                expected_start = earliest_window[0] + buffer_delta
                expected_end = earliest_window[1] - buffer_delta
                # Fix: compare parsed times rather than a zero-padded string,
                # so an accepted answer such as "9:00" can match "09:00".
                result = (
                    solution_day == day
                    and solution_start == expected_start
                    and solution_end == expected_end
                )
                # Fix: stop at the first feasible day. Without this, later
                # days overwrote the result and the last feasible day was
                # treated as the earliest.
                break
        return {'priority_programmatic_check': int(result)}

    def check_specific_times_programmatic(self, instance, solution):
        """Check that the slot does not overlap the blocked time range."""
        if not instance['constraints'].get('no_meetings_during_specific_times', True):
            return {'specific_times_programmatic_check': 'NA'}
        if not self.is_formatted(solution):
            return {'specific_times_programmatic_check': 0}

        restricted_times = instance['constraints']['no_meetings_during_specific_times']
        restricted_start, restricted_end = parse_time_block(restricted_times)
        _, time_range = solution.split()
        start_time, end_time = parse_time_block(time_range)

        overlaps = start_time < restricted_end and end_time > restricted_start
        return {'specific_times_programmatic_check': 0 if overlaps else 1}
35 changes: 32 additions & 3 deletions eureka_ml_insights/metrics/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,43 @@ def _aggregate_grouped(self, data):
class AverageAggregator(NumericalAggregator):
    """Aggregator that stores the per-column mean, rounded to 3 decimals."""

    def _aggregate(self, data):
        """Store ``{column: mean}`` for every configured column.

        Empty input yields 0 for every column.
        """
        # Check for empty data first, so the mean is computed only once and
        # only on non-empty data.
        if len(data) == 0:
            averages = {col: 0 for col in self.column_names}
        else:
            averages = {col: data[col].mean().round(3) for col in self.column_names}
        self.aggregated_result = averages

    def _aggregate_grouped(self, data):
        """Store ``{column: {group: mean}}`` after grouping by ``self.group_by``.

        Empty input yields 0 for every column.
        """
        if len(data) == 0:
            averages = {col: 0 for col in self.column_names}
        else:
            gb = data.groupby(self.group_by)
            averages = {col: round(gb[col].mean(), 3).to_dict() for col in self.column_names}
        self.aggregated_result = averages

class NAFilteredAverageAggregator(AverageAggregator):
    """Average aggregator that drops rows whose value is the string "NA" before aggregating."""

    def __init__(self, column_name, output_dir, group_by=None, ignore_non_numeric=False, filename_base=None, **kwargs):
        """
        args:
            column_name: column name to filter and aggregate
            output_dir: str. directory to save the report
            group_by: str. or list of str. column(s) to group by before aggregating
            ignore_non_numeric: bool. if True ignore non-numeric values for average aggregator
            filename_base: str. optional base string to be used in the file name for the report. If not None, the report filename will concatenate the class name, datetime, and filename_base.
        """
        # Kept on the instance because aggregate() filters on this single column.
        self.column_name = column_name
        self.output_dir = output_dir
        self.group_by = group_by
        self.ignore_non_numeric = ignore_non_numeric
        self.filename_base = filename_base
        self.aggregated_result = None
        super().__init__([column_name], output_dir, group_by, ignore_non_numeric, filename_base, **kwargs)

    def aggregate(self, data):
        """Remove the "NA" rows of the target column, then delegate to the parent aggregate."""
        keep_mask = data[self.column_name] != "NA"
        super().aggregate(data[keep_mask].copy())


class AverageSTDDevAggregator(NumericalAggregator):

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
You are a scheduling assistant. Given the availability schedules of multiple participants and some additional constraints, your task is to find a common time slot.
Make sure you use the availability schedules to generate your response.
High priority meetings should be scheduled as early as possible.
Buffer time refers to the required remaining available time before and after a meeting. For example, if buffer time is 15 minutes, a meeting from 9:00-10:00 will require availability from 8:45-10:15.
Respond with "[day] [start_time]-[end_time]" or "No common time slot available"
Do not respond with any additional information or comments.
{{prompt}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
You are a scheduling assistant. Given the availability schedules of multiple participants and some additional constraints, your task is to find a common time slot.
Make sure you use the availability schedules to generate your response.
High priority meetings should be scheduled as early as possible.
Buffer time refers to the required remaining available time before and after a meeting. For example, if buffer time is 15 minutes, a meeting from 9:00-10:00 will require availability from 8:45-10:15.
Respond with "[day] [start_time]-[end_time]" or "No common time slot available"
{{prompt}}
vidhishanair marked this conversation as resolved.
Show resolved Hide resolved
3 changes: 3 additions & 0 deletions eureka_ml_insights/user_configs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .aime import AIME_PIPELINE
from .aime import (
AIME_PIPELINE,
AIME_PIPELINE5Run,
Expand All @@ -8,6 +9,7 @@
AIME_PIPELINE512Run,
AIME_PIPELINE1024Run,
)
from .ba_calendar import Calendar_Schedule_PIPELINE
from .dna import DNA_PIPELINE
from .drop import Drop_Experiment_Pipeline
from .flenqa import FlenQA_Experiment_Pipeline
Expand Down Expand Up @@ -112,6 +114,7 @@
KITAB_TWO_BOOK_CONSTRAINT_PIPELINE_WITH_CONTEXT,
GPT35_KITAB_ONE_BOOK_CONSTRAINT_PIPELINE,
DNA_PIPELINE,
Calendar_Schedule_PIPELINE,
ToxiGen_Discriminative_PIPELINE,
ToxiGen_Generative_PIPELINE,
Geo_Nondeterminism,
Expand Down
Loading
Loading